From 984fdba6a32e4e9819ebc06ca3acec6582ffd99f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:57 -0700 Subject: mm, compaction: use proper alloc_flags in __compaction_suitable() The __compaction_suitable() function checks the low watermark plus a compact_gap() gap to decide if there's enough free memory to perform compaction. This check uses direct compactor's alloc_flags, but that's wrong, since these flags are not applicable for freepage isolation. For example, alloc_flags may indicate access to memory reserves, making compaction proceed, and then fail watermark check during the isolation. A similar problem exists for ALLOC_CMA, which may be part of alloc_flags, but not during freepage isolation. In this case however it makes sense to use ALLOC_CMA both in __compaction_suitable() and __isolate_free_page(), since there's actually nothing preventing the freepage scanner to isolate from CMA pageblocks, with the assumption that a page that could be migrated once by compaction can be migrated also later by CMA allocation. Thus we should count pages in CMA pageblocks when considering compaction suitability and when isolating freepages. To sum up, this patch should remove some false positives from __compaction_suitable(), and allow compaction to proceed when free pages required for compaction reside in the CMA pageblocks. Link: http://lkml.kernel.org/r/20160810091226.6709-10-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c64ed3c..637b0e907df0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2491,7 +2491,7 @@ int __isolate_free_page(struct page *page, unsigned int order) if (!is_migrate_isolate(mt)) { /* Obey watermarks as if the page was being allocated */ watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); -- cgit v1.2.3 From 8348faf91f56371d4bada6fc5915e19580a15ffe Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:58:00 -0700 Subject: mm, compaction: require only min watermarks for non-costly orders The __compaction_suitable() function checks the low watermark plus a compact_gap() gap to decide if there's enough free memory to perform compaction. Then __isolate_free_page uses low watermark check to decide if particular free page can be isolated. In the latter case, using low watermark is needlessly pessimistic, as the free page isolations are only temporary. For __compaction_suitable() the higher watermark makes sense for high-order allocations where more freepages increase the chance of success, and we can typically fail with some order-0 fallback when the system is struggling to reach that watermark. But for low-order allocation, forming the page should not be that hard. So using low watermark here might just prevent compaction from even trying, and eventually lead to OOM killer even if we are above min watermarks. So after this patch, we use min watermark for non-costly orders in __compaction_suitable(), and for all orders in __isolate_free_page(). 
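For reference, the compact_gap() helper mentioned in both changelogs was introduced earlier in this series as roughly twice the requested allocation size; a minimal sketch of its assumed definition (a reader aid, not part of this patch) is:

	static inline unsigned long compact_gap(unsigned int order)
	{
		/*
		 * The free scanner may hold up to 1 << order pages on its list
		 * while isolating, so require twice that as headroom.
		 */
		return 2UL << order;
	}

So the checks discussed here compare free pages against the chosen watermark plus this 2 << order buffer.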
[vbabka@suse.cz: clarify __isolate_free_page() comment] Link: http://lkml.kernel.org/r/7ae4baec-4eca-e70b-2a69-94bea4fb19fa@suse.cz Link: http://lkml.kernel.org/r/20160810091226.6709-11-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++++- mm/page_alloc.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/compaction.c b/mm/compaction.c index 658c009d60cc..29f6c49dc9c2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1399,10 +1399,14 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * isolation. We however do use the direct compactor's classzone_idx to * skip over zones where lowmem reserves would prevent allocation even * if compaction succeeds. + * For costly orders, we require low watermark instead of min for + * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered * suitable migration targets */ - watermark = low_wmark_pages(zone) + compact_gap(order); + watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? + low_wmark_pages(zone) : min_wmark_pages(zone); + watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 637b0e907df0..c988d324e3f6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2489,8 +2489,13 @@ int __isolate_free_page(struct page *page, unsigned int order) mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); + /* + * Obey watermarks as if the page was being allocated. We can + * emulate a high-order watermark check with a raised order-0 + * watermark, because we already know our high-order page + * exists. + */ + watermark = min_wmark_pages(zone) + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; -- cgit v1.2.3 From e506b99696a296e9aba2e5f3bc5768aa7d8e2396 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 7 Oct 2016 16:58:06 -0700 Subject: mem-hotplug: fix node spanned pages when we have a movable node Commit 342332e6a925 ("mm/page_alloc.c: introduce kernelcore=mirror option") rewrote the calculation of node spanned pages. But when we have a movable node, the size of node spanned pages is double added. That's because we have an empty normal zone, the present pages is zero, but its spanned pages is not zero. e.g. 
Zone ranges: DMA [mem 0x0000000000001000-0x0000000000ffffff] DMA32 [mem 0x0000000001000000-0x00000000ffffffff] Normal [mem 0x0000000100000000-0x0000007c7fffffff] Movable zone start for each node Node 1: 0x0000001080000000 Node 2: 0x0000002080000000 Node 3: 0x0000003080000000 Node 4: 0x0000003c80000000 Node 5: 0x0000004c80000000 Node 6: 0x0000005c80000000 Early memory node ranges node 0: [mem 0x0000000000001000-0x000000000009ffff] node 0: [mem 0x0000000000100000-0x000000007552afff] node 0: [mem 0x000000007bd46000-0x000000007bd46fff] node 0: [mem 0x000000007bdcd000-0x000000007bffffff] node 0: [mem 0x0000000100000000-0x000000107fffffff] node 1: [mem 0x0000001080000000-0x000000207fffffff] node 2: [mem 0x0000002080000000-0x000000307fffffff] node 3: [mem 0x0000003080000000-0x0000003c7fffffff] node 4: [mem 0x0000003c80000000-0x0000004c7fffffff] node 5: [mem 0x0000004c80000000-0x0000005c7fffffff] node 6: [mem 0x0000005c80000000-0x0000006c7fffffff] node 7: [mem 0x0000006c80000000-0x0000007c7fffffff] node1: Normal, start=0x1080000, present=0x0, spanned=0x1000000 Movable, start=0x1080000, present=0x1000000, spanned=0x1000000 pgdat, start=0x1080000, present=0x1000000, spanned=0x2000000 After this patch, the problem is fixed. node1: Normal, start=0x0, present=0x0, spanned=0x0 Movable, start=0x1080000, present=0x1000000, spanned=0x1000000 pgdat, start=0x1080000, present=0x1000000, spanned=0x1000000 Link: http://lkml.kernel.org/r/57A325E8.6070100@huawei.com Signed-off-by: Xishi Qiu Cc: Taku Izumi Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: "Kirill A . Shutemov" Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 54 +++++++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 31 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c988d324e3f6..26246fdf45b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5004,15 +5004,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, break; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * If not mirrored_kernelcore and ZONE_MOVABLE exists, range - * from zone_movable_pfn[nid] to end of each node should be - * ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) - continue; - /* * Check given memblock attribute by firmware which can affect * kernel memory layout. If zone==ZONE_MOVABLE but memory is @@ -5456,6 +5447,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (!mirrored_kernelcore && + *zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5559,28 +5556,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa. 
*/ - if (zone_movable_pfn[nid]) { - if (mirrored_kernelcore) { - unsigned long start_pfn, end_pfn; - struct memblock_region *r; - - for_each_memblock(memory, r) { - start_pfn = clamp(memblock_region_memory_base_pfn(r), - zone_start_pfn, zone_end_pfn); - end_pfn = clamp(memblock_region_memory_end_pfn(r), - zone_start_pfn, zone_end_pfn); - - if (zone_type == ZONE_MOVABLE && - memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - - if (zone_type == ZONE_NORMAL && - !memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - } - } else { - if (zone_type == ZONE_NORMAL) - nr_absent += node_end_pfn - zone_movable_pfn[nid]; + if (mirrored_kernelcore && zone_movable_pfn[nid]) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; } } -- cgit v1.2.3 From e780149bcd4be171421535db0514fa9ff556cb87 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 7 Oct 2016 16:58:09 -0700 Subject: mm: fix set pageblock migratetype in deferred struct page init On x86_64 MAX_ORDER_NR_PAGES is usually 4M, and a pageblock is usually 2M, so we only set one pageblock's migratetype in deferred_free_range() if pfn is aligned to MAX_ORDER_NR_PAGES. That means it causes uninitialized migratetype blocks, you can see from "cat /proc/pagetypeinfo", almost half blocks are Unmovable. Also we missed freeing the last block in deferred_init_memmap(), it causes memory leak. Fixes: ac5d2539b238 ("mm: meminit: reduce number of times pageblocks are set during struct page init") Link: http://lkml.kernel.org/r/57A3260F.4050709@huawei.com Signed-off-by: Xishi Qiu Cc: Taku Izumi Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: "Kirill A . 
Shutemov" Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26246fdf45b5..0c34633720c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1393,15 +1393,18 @@ static void __init deferred_free_range(struct page *page, return; /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == MAX_ORDER_NR_PAGES && - (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, MAX_ORDER-1); + __free_pages_boot_core(page, pageblock_order); return; } - for (i = 0; i < nr_pages; i++, page++) + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_boot_core(page, 0); + } } /* Completion tracking for deferred_init_memmap() threads */ @@ -1469,9 +1472,9 @@ static int __init deferred_init_memmap(void *data) /* * Ensure pfn_valid is checked every - * MAX_ORDER_NR_PAGES for memory holes + * pageblock_nr_pages for memory holes */ - if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) { if (!pfn_valid(pfn)) { page = NULL; goto free_range; @@ -1484,7 +1487,7 @@ static int __init deferred_init_memmap(void *data) } /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { page++; } else { nr_pages += nr_to_free; @@ -1520,6 +1523,9 @@ free_range: free_base_page = NULL; free_base_pfn = nr_to_free = 0; } + /* Free the last block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, nr_to_free); first_init_pfn = max(end_pfn, first_init_pfn); } -- cgit v1.2.3 From acbc15a4b397f86d39416df143e30982b1da528b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:15 -0700 Subject: mm/debug_pagealloc.c: clean-up guard page handling code Patch series "Reduce memory waste by page extension user". This patchset tries to reduce memory waste by page extension user. First case is architecture supported debug_pagealloc. It doesn't requires additional memory if guard page isn't used. 8 bytes per page will be saved in this case. Second case is related to page owner feature. Until now, if page_ext users want to use it's own fields on page_ext, fields should be defined in struct page_ext by hard-coding. It has a following problem. struct page_ext { #ifdef CONFIG_A int a; #endif #ifdef CONFIG_B int b; #endif }; Assume that kernel is built with both CONFIG_A and CONFIG_B. Even if we enable feature A and doesn't enable feature B at runtime, each entry of struct page_ext takes two int rather than one int. It's undesirable waste so this patch tries to reduce it. By this patchset, we can save 20 bytes per page dedicated for page owner feature in some configurations. This patch (of 6): We can make code clean by moving decision condition for set_page_guard() into set_page_guard() itself. It will help code readability. There is no functional change. 
Link: http://lkml.kernel.org/r/1471315879-32294-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c34633720c0..e150ba9e1311 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -637,17 +637,20 @@ static int __init debug_guardpage_minorder_setup(char *buf) } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); -static inline void set_page_guard(struct zone *zone, struct page *page, +static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { struct page_ext *page_ext; if (!debug_guardpage_enabled()) - return; + return false; + + if (order >= debug_guardpage_minorder()) + return false; page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) - return; + return false; __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); @@ -655,6 +658,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page, set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -677,8 +682,8 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, } #else struct page_ext_operations debug_guardpage_ops = { NULL, }; -static inline void set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif @@ -1622,18 +1627,15 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && - debug_guardpage_enabled() && - high < debug_guardpage_minorder()) { - /* - * Mark as guard pages (or page), that will allow to - * merge back to allocator when buddy will be freed. - * Corresponding page table entries will not be touched, - * pages will stay not present in virtual address space - */ - set_page_guard(zone, &page[size], high, migratetype); + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + if (set_page_guard(zone, &page[size], high, migratetype)) continue; - } + list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); -- cgit v1.2.3 From f1c1e9f7b5b3ddce6b4f1986939ec87b27515086 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:18 -0700 Subject: mm/debug_pagealloc.c: don't allocate page_ext if we don't use guard page What debug_pagealloc does is just mapping/unmapping page table. Basically, it doesn't need additional memory space to memorize something. But, with guard page feature, it requires additional memory to distinguish if the page is for guard or not. 
Guard page is only used when debug_guardpage_minorder is non-zero so this patch removes additional memory allocation (page_ext) if debug_guardpage_minorder is zero. It saves memory if we just use debug_pagealloc and not guard page. Link: http://lkml.kernel.org/r/1471315879-32294-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Sergey Senozhatsky Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e150ba9e1311..06ea805d1b14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -607,6 +607,9 @@ static bool need_debug_guardpage(void) if (!debug_pagealloc_enabled()) return false; + if (!debug_guardpage_minorder()) + return false; + return true; } @@ -615,6 +618,9 @@ static void init_debug_guardpage(void) if (!debug_pagealloc_enabled()) return; + if (!debug_guardpage_minorder()) + return; + _debug_guardpage_enabled = true; } @@ -635,7 +641,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } -__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) -- cgit v1.2.3 From 980ac1672e7edaa927557a5186f1967cd45afcf5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:27 -0700 Subject: mm/page_ext: support extra space allocation by page_ext user Until now, if some page_ext users want to use it's own field on page_ext, it should be defined in struct page_ext by hard-coding. It has a problem that wastes memory in following situation. struct page_ext { #ifdef CONFIG_A int a; #endif #ifdef CONFIG_B int b; #endif }; Assume that kernel is built with both CONFIG_A and CONFIG_B. Even if we enable feature A and doesn't enable feature B at runtime, each entry of struct page_ext takes two int rather than one int. It's undesirable result so this patch tries to fix it. To solve above problem, this patch implements to support extra space allocation at runtime. When need() callback returns true, it's extra memory requirement is summed to entry size of page_ext. Also, offset for each user's extra memory space is returned. With this offset, user can use this extra space and there is no need to define needed field on page_ext by hard-coding. This patch only implements an infrastructure. Following patch will use it for page_owner which is only user having it's own fields on page_ext. 
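As an illustration of the interface this adds (a hypothetical client, not code from this patch), a page_ext user would declare its extra space and then locate its data through the returned offset:

	struct my_ext_data {
		unsigned long value;
	};

	static bool need_my_ext(void)
	{
		return my_feature_enabled;	/* hypothetical runtime switch */
	}

	struct page_ext_operations my_ext_ops = {
		.size = sizeof(struct my_ext_data),
		.need = need_my_ext,
	};

	static struct my_ext_data *get_my_ext(struct page_ext *page_ext)
	{
		return (void *)page_ext + my_ext_ops.offset;
	}

The .offset field is filled in by invoke_need_callbacks() when .need() returns true, so no per-user field has to be hard-coded into struct page_ext.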
Link: http://lkml.kernel.org/r/1471315879-32294-6-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 2 ++ mm/page_alloc.c | 2 +- mm/page_ext.c | 41 +++++++++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 03f2a3e7d76d..179bdc4a470c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -7,6 +7,8 @@ struct pglist_data; struct page_ext_operations { + size_t offset; + size_t size; bool (*need)(void); void (*init)(void); }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 06ea805d1b14..b0f133f2c655 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -687,7 +687,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops = { NULL, }; +struct page_ext_operations debug_guardpage_ops; static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, diff --git a/mm/page_ext.c b/mm/page_ext.c index 16292829c5c5..121dcffc4ec1 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -42,6 +42,11 @@ * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * + * When need callback returns true, page_ext checks if there is a request for + * extra memory through size in struct page_ext_operations. If it is non-zero, + * extra space is allocated for each page_ext entry and offset is returned to + * user through offset in struct page_ext_operations. + * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. 
In other words, lifetime @@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = { }; static unsigned long total_usage; +static unsigned long extra_mem; static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); + bool need = false; for (i = 0; i < entries; i++) { - if (page_ext_ops[i]->need && page_ext_ops[i]->need()) - return true; + if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { + page_ext_ops[i]->offset = sizeof(struct page_ext) + + extra_mem; + extra_mem += page_ext_ops[i]->size; + need = true; + } } - return false; + return need; } static void __init invoke_init_callbacks(void) @@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void) } } +static unsigned long get_entry_size(void) +{ + return sizeof(struct page_ext) + extra_mem; +} + +static inline struct page_ext *get_entry(void *base, unsigned long index) +{ + return base + get_entry_size() * index; +} + #if !defined(CONFIG_SPARSEMEM) @@ -121,7 +142,7 @@ struct page_ext *lookup_page_ext(struct page *page) #endif index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); - return base + index; + return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) @@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid) !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; - table_size = sizeof(struct page_ext) * nr_pages; + table_size = get_entry_size() * nr_pages; base = memblock_virt_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), @@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct page *page) if (!section->page_ext) return NULL; #endif - return section->page_ext + pfn; + return get_entry(section->page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) if (section->page_ext) return 0; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* @@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; - section->page_ext = base - pfn; + section->page_ext = (void *)base - get_entry_size() * pfn; total_usage += table_size; return 0; } @@ -262,7 +283,7 @@ static void free_page_ext(void *addr) struct page *page = virt_to_page(addr); size_t table_size; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); free_pages_exact(addr, table_size); @@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = ms->page_ext + pfn; + base = get_entry(ms->page_ext, pfn); free_page_ext(base); ms->page_ext = NULL; } -- cgit v1.2.3 From c9634cf012321243ee8e4ea0fb0709904cd58395 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 7 Oct 2016 16:59:12 -0700 Subject: mm: use zonelist name instead of using hardcoded index Use the existing enums instead of hardcoded index when looking at the zonelist. This makes it more readable. No functionality change by this patch. 
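For readers following the indices, the named zonelists used in this patch are defined approximately as follows in include/linux/mmzone.h at this point in history (shown here only as a reading aid):

	enum {
		ZONELIST_FALLBACK,	/* zonelist with fallback to other nodes */
	#ifdef CONFIG_NUMA
		ZONELIST_NOFALLBACK,	/* zonelist for __GFP_THISNODE, no fallback */
	#endif
		MAX_ZONELISTS
	};

so node_zonelists[0] and node_zonelists[1] in the old code correspond to ZONELIST_FALLBACK and ZONELIST_NOFALLBACK respectively.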
Link: http://lkml.kernel.org/r/1472227078-24852-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- mm/page_alloc.c | 8 ++++---- mm/vmscan.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2da72a5b6ecc..ad1c96ac313c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void) */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(node)->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); return z->zone ? z->zone->node : node; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0f133f2c655..f6a5a221496a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4574,7 +4574,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j); @@ -4590,7 +4590,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[1]; + zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; @@ -4611,7 +4611,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) struct zone *z; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; pos = 0; for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { for (j = 0; j < nr_nodes; j++) { @@ -4746,7 +4746,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); /* diff --git a/mm/vmscan.c b/mm/vmscan.c index d3715c1b2e36..744f926af442 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3036,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, */ nid = mem_cgroup_select_victim_node(memcg); - zonelist = NODE_DATA(nid)->node_zonelists; + zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, -- cgit v1.2.3 From f6f34b4387d9e18304451a131b35d7c4f27a0b5a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 7 Oct 2016 16:59:15 -0700 Subject: mm: introduce arch_reserved_kernel_pages() Currently arch specific code can reserve memory blocks but alloc_large_system_hash() may not take it into consideration when sizing the hashes. This can lead to bigger hash than required and lead to no available memory for other purposes. This is specifically true for systems with CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled. One approach to solve this problem would be to walk through the memblock regions and calculate the available memory and base the size of hash system on the available memory. The other approach would be to depend on the architecture to provide the number of pages that are reserved. 
This change provides hooks to allow the architecture to provide the required info. Link: http://lkml.kernel.org/r/1472476010-4709-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Srikar Dronamraju Suggested-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Hari Bathini Cc: Dave Hansen Cc: Balbir Singh Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ mm/page_alloc.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0a063b4e4456..046077b4209d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1924,6 +1924,9 @@ extern void show_mem(unsigned int flags); extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES +extern unsigned long arch_reserved_kernel_pages(void); +#endif extern __printf(3, 4) void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f6a5a221496a..e00f545c2398 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6940,6 +6940,17 @@ static int __init set_hashdist(char *str) __setup("hashdist=", set_hashdist); #endif +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES +/* + * Returns the number of pages that arch has reserved but + * is not known to alloc_large_system_hash(). + */ +static unsigned long __init arch_reserved_kernel_pages(void) +{ + return 0; +} +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -6964,6 +6975,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; + numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SHIFT < 20) -- cgit v1.2.3 From 3250845d0526407330592dd43b9f1354b6fe7a14 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:28 -0700 Subject: Revert "mm, oom: prevent premature OOM killer invocation for high order request" Patch series "reintroduce compaction feedback for OOM decisions". After several people reported OOM's for order-2 allocations in 4.7 due to Michal Hocko's OOM rework, he reverted the part that considered compaction feedback [1] in the decisions to retry reclaim/compaction. This was to provide a fix quickly for 4.8 rc and 4.7 stable series, while mmotm had an almost complete solution that instead improved compaction reliability. This series completes the mmotm solution and reintroduces the compaction feedback into OOM decisions. The first two patches restore the state of mmotm before the temporary solution was merged, the last patch should be the missing piece for reliability. The third patch restricts the hardened compaction to non-costly orders, since costly orders don't result in OOMs in the first place. [1] http://marc.info/?i=20160822093249.GA14916%40dhcp22.suse.cz%3E This patch (of 4): Commit 6b4e3181d7bd ("mm, oom: prevent premature OOM killer invocation for high order request") was intended as a quick fix of OOM regressions for 4.8 and stable 4.7.x kernels. For a better long-term solution, we still want to consider compaction feedback, which should be possible after some more improvements in the following patches. 
This reverts commit 6b4e3181d7bd5ca5ab6f45929e4a5ffa7ab4ab7f. Link: http://lkml.kernel.org/r/20160906135258.18335-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e00f545c2398..634806f55120 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3156,6 +3156,54 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } +static inline bool +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int compaction_retries) +{ + int max_retries = MAX_COMPACT_RETRIES; + + if (!order) + return false; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by insufficient priority + */ + if (compaction_failed(compact_result)) { + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; + return true; + } + return false; + } + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. + */ + if (compaction_withdrawn(compact_result)) + return compaction_zonelist_suitable(ac, order, alloc_flags); + + /* + * !costly requests are much more important than __GFP_REPEAT + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (compaction_retries <= max_retries) + return true; + + return false; +} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, @@ -3166,8 +3214,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } -#endif /* CONFIG_COMPACTION */ - static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, @@ -3194,6 +3240,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } return false; } +#endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ static int -- cgit v1.2.3 From d943649831aba0fcdda37a0e9e25b332a634cf5e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:31 -0700 Subject: mm, compaction: more reliably increase direct compaction priority During reclaim/compaction loop, compaction priority can be increased by the should_compact_retry() function, but the current code is not optimal. Priority is only increased when compaction_failed() is true, which means that compaction has scanned the whole zone. This may not happen even after multiple attempts with a lower priority due to parallel activity, so we might needlessly struggle on the lower priorities and possibly run out of compaction retry attempts in the process. 
After this patch we are guaranteed at least one attempt at the highest compaction priority even if we exhaust all retries at the lower priorities. Link: http://lkml.kernel.org/r/20160906135258.18335-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 634806f55120..a8703b592c39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3160,25 +3160,23 @@ static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, - int compaction_retries) + int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; if (!order) return false; + if (compaction_made_progress(compact_result)) + (*compaction_retries)++; + /* * compaction considers all the zone as desperately out of memory * so it doesn't really make much sense to retry except when the * failure could be caused by insufficient priority */ - if (compaction_failed(compact_result)) { - if (*compact_priority > MIN_COMPACT_PRIORITY) { - (*compact_priority)--; - return true; - } - return false; - } + if (compaction_failed(compact_result)) + goto check_priority; /* * make sure the compaction wasn't deferred or didn't bail out early @@ -3199,9 +3197,19 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (compaction_retries <= max_retries) + if (*compaction_retries <= max_retries) return true; + /* + * Make sure there are attempts at the highest priority if we exhausted + * all retries or failed at the lower priorities. + */ +check_priority: + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; + *compaction_retries = 0; + return true; + } return false; } #else @@ -3218,7 +3226,7 @@ static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, - int compaction_retries) + int *compaction_retries) { struct zone *zone; struct zoneref *z; @@ -3620,9 +3628,6 @@ retry: if (page) goto got_pg; - if (order && compaction_made_progress(compact_result)) - compaction_retries++; - /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) goto nopage; @@ -3657,7 +3662,7 @@ retry: if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, - compaction_retries)) + &compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ -- cgit v1.2.3 From c2033b00dbe856909fcaccf038e4e0d3dcfb85af Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:34 -0700 Subject: mm, compaction: restrict full priority to non-costly orders The new ultimate compaction priority disables some heuristics, which may result in excessive cost. This is fine for non-costly orders where we want to try hard before resulting for OOM, but might be disruptive for costly orders which do not trigger OOM and should generally have some fallback. Thus, we disable the full priority for costly orders. 
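Read together with the compact_priority enum in the hunk below (a lower numeric value means a higher priority), the effect can be summarized by this annotated fragment, a simplified reading aid rather than additional code:

	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
			MIN_COMPACT_COSTLY_PRIORITY :	/* floor: SYNC_LIGHT */
			MIN_COMPACT_PRIORITY;		/* floor: SYNC_FULL  */
	if (*compact_priority > min_priority) {
		(*compact_priority)--;			/* try harder next time */
		*compaction_retries = 0;
		return true;
	}
	return false;

so costly orders never escalate to the heuristic-disabling COMPACT_PRIO_SYNC_FULL mode.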
Suggested-by: Michal Hocko Link: http://lkml.kernel.org/r/20160906135258.18335-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 + mm/page_alloc.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 585d55cb0dc0..0d8415820fc3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -9,6 +9,7 @@ enum compact_priority { COMPACT_PRIO_SYNC_FULL, MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, + MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC, INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8703b592c39..891e3881a6e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3163,6 +3163,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; + int min_priority; if (!order) return false; @@ -3205,7 +3206,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * all retries or failed at the lower priorities. */ check_priority: - if (*compact_priority > MIN_COMPACT_PRIORITY) { + min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? + MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; return true; -- cgit v1.2.3 From 423b452e1553e3d19b632880bf2adf1f058ab267 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:40 -0700 Subject: mm, page_alloc: pull no_progress_loops update to should_reclaim_retry() The should_reclaim_retry() makes decisions based on no_progress_loops, so it makes sense to also update the counter there. It will be also consistent with should_compact_retry() and compaction_retries. No functional change. [hillf.zj@alibaba-inc.com: fix missing pointer dereferences] Link: http://lkml.kernel.org/r/20160926162025.21555-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 891e3881a6e0..bcfa647c1752 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3402,16 +3402,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) static inline bool should_reclaim_retry(gfp_t gfp_mask, unsigned order, struct alloc_context *ac, int alloc_flags, - bool did_some_progress, int no_progress_loops) + bool did_some_progress, int *no_progress_loops) { struct zone *zone; struct zoneref *z; + /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + *no_progress_loops = 0; + else + (*no_progress_loops)++; + /* * Make sure we converge to OOM if we cannot make any progress * several times in the row. 
*/ - if (no_progress_loops > MAX_RECLAIM_RETRIES) + if (*no_progress_loops > MAX_RECLAIM_RETRIES) return false; /* @@ -3426,7 +3436,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, unsigned long reclaimable; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP(no_progress_loops * available, + available -= DIV_ROUND_UP((*no_progress_loops) * available, MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -3642,18 +3652,8 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* - * Costly allocations might have made a progress but this doesn't mean - * their order will become available due to high fragmentation so - * always increment the no progress counter for them - */ - if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) - no_progress_loops = 0; - else - no_progress_loops++; - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, - did_some_progress > 0, no_progress_loops)) + did_some_progress > 0, &no_progress_loops)) goto retry; /* -- cgit v1.2.3 From 7877cdcc3893c1bd9a833b2f0398e7320794c6e6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:55 -0700 Subject: mm: consolidate warn_alloc_failed users warn_alloc_failed is currently used from the page and vmalloc allocators. This is a good reuse of the code except that vmalloc would appreciate a slightly different warning message. This is already handled by the fmt parameter except that "%s: page allocation failure: order:%u, mode:%#x(%pGg)" is printed anyway. This might be quite misleading because it might be a vmalloc failure which leads to the warning while the page allocator is not the culprit here. Fix this by always using the fmt string and only print the context that makes sense for the particular context (e.g. order makes only very little sense for the vmalloc context). Rename the function to not miss any user and also because a later patch will reuse it also for !failure cases. Link: http://lkml.kernel.org/r/20160929084407.7004-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Tetsuo Handa Cc: Johannes Weiner Cc: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 ++--- mm/page_alloc.c | 27 ++++++++++++--------------- mm/vmalloc.c | 14 ++++++-------- 3 files changed, 20 insertions(+), 26 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c8ed8a894c8..f7231411ad5a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1916,9 +1916,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid); extern unsigned long arch_reserved_kernel_pages(void); #endif -extern __printf(3, 4) -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, - const char *fmt, ...); +extern __printf(2, 3) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bcfa647c1752..5ab2e30a1006 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2979,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) 
{ unsigned int filter = SHOW_MEM_FILTER_NODES; + struct va_format vaf; + va_list args; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -2999,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - if (fmt) { - struct va_format vaf; - va_list args; + pr_warn("%s: ", current->comm); - va_start(args, fmt); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_cont("%pV", &vaf); + va_end(args); - vaf.fmt = fmt; - vaf.va = &args; + pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); - pr_warn("%pV", &vaf); - - va_end(args); - } - - pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", - current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -3680,7 +3676,8 @@ retry: } nopage: - warn_alloc_failed(gfp_mask, order, NULL); + warn_alloc(gfp_mask, + "page allocation failure: order:%u", order); got_pg: return page; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 80660a0f989b..f2481cb4e6b2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { - const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; @@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_pages(alloc_mask, order); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, alloc_mask, order); + page = alloc_pages_node(node, alloc_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc_failed(gfp_mask, order, - "vmalloc: allocation failure, allocated %ld of %ld bytes\n", + warn_alloc(gfp_mask, + "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); return NULL; @@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc_failed(gfp_mask, 0, - "vmalloc: allocation failure: %lu bytes\n", - real_size); + warn_alloc(gfp_mask, + "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } -- cgit v1.2.3 From 63f53dea0c9866e93802d50a230c460a024c44e5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:58 -0700 Subject: mm: warn about allocations which stall for too long Currently we do warn only about allocation failures but small allocations are basically nofail and they might loop in the page allocator for a long time. Especially when the reclaim cannot make any progress - e.g. GFP_NOFS cannot invoke the oom killer and rely on a different context to make a forward progress in case there is a lot memory used by filesystems. Give us at least a clue when something like this happens and warn about allocations which take more than 10s. Print the basic allocation context information along with the cumulative time spent in the allocation as well as the allocation stack. Repeat the warning after every 10 seconds so that we know that the problem is permanent rather than ephemeral. 
Link: http://lkml.kernel.org/r/20160929084407.7004-3-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vlastimil Babka Cc: Tetsuo Handa Cc: Johannes Weiner Cc: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ab2e30a1006..ca423cc20b59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3493,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; + unsigned long alloc_start = jiffies; + unsigned int stall_timeout = 10 * HZ; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3648,6 +3650,14 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, + "page alloction stalls for %ums, order:%u\n", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; + } + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; -- cgit v1.2.3 From 38addce8b600ca335dc86fa3d48c890f1c6fa1f4 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:41:19 +0200 Subject: gcc-plugins: Add latent_entropy plugin This adds a new gcc plugin named "latent_entropy". It is designed to extract as much possible uncertainty from a running system at boot time as possible, hoping to capitalize on any possible variation in CPU operation (due to runtime data differences, hardware differences, SMP ordering, thermal timing variation, cache behavior, etc). At the very least, this plugin is a much more comprehensive example for how to manipulate kernel code using the gcc plugin internals. The need for very-early boot entropy tends to be very architecture or system design specific, so this plugin is more suited for those sorts of special cases. The existing kernel RNG already attempts to extract entropy from reliable runtime variation, but this plugin takes the idea to a logical extreme by permuting a global variable based on any variation in code execution (e.g. a different value (and permutation function) is used to permute the global based on loop count, case statement, if/then/else branching, etc). To do this, the plugin starts by inserting a local variable in every marked function. The plugin then adds logic so that the value of this variable is modified by randomly chosen operations (add, xor and rol) and random values (gcc generates separate static values for each location at compile time and also injects the stack pointer at runtime). The resulting value depends on the control flow path (e.g., loops and branches taken). Before the function returns, the plugin mixes this local variable into the latent_entropy global variable. The value of this global variable is added to the kernel entropy pool in do_one_initcall() and _do_fork(), though it does not credit any bytes of entropy to the pool; the contents of the global are just used to mix the pool. Additionally, the plugin can pre-initialize arrays with build-time random contents, so that two different kernel builds running on identical hardware will not have the same starting values. 
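A minimal usage sketch for the new attribute (hypothetical examples; the kernel later wraps the raw attribute in a __latent_entropy macro):

	/* Instrumented function: its control flow perturbs the global pool. */
	static void __attribute__((latent_entropy)) early_setup_thing(void)
	{
		/* ordinary code; branches and loops here mix into local entropy */
	}

	/* Static integer array pre-initialized with build-time random values. */
	static unsigned long __attribute__((latent_entropy)) boot_noise[8];

Per the checks in the plugin below, such variables must be static, must not have an explicit initializer, and must be an integer, a fixed-length integer array, or a structure containing only integer fields.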
Signed-off-by: Emese Revfy [kees: expanded commit message and code comments] Signed-off-by: Kees Cook --- arch/Kconfig | 18 + arch/powerpc/kernel/Makefile | 5 + include/linux/random.h | 11 + init/main.c | 1 + kernel/fork.c | 1 + mm/page_alloc.c | 5 + scripts/Makefile.gcc-plugins | 9 +- scripts/gcc-plugins/latent_entropy_plugin.c | 640 ++++++++++++++++++++++++++++ 8 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 scripts/gcc-plugins/latent_entropy_plugin.c (limited to 'mm/page_alloc.c') diff --git a/arch/Kconfig b/arch/Kconfig index fd6e9712af81..ae80cd24e8f0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -383,6 +383,24 @@ config GCC_PLUGIN_SANCOV gcc-4.5 on). It is based on the commit "Add fuzzing coverage support" by Dmitry Vyukov . +config GCC_PLUGIN_LATENT_ENTROPY + bool "Generate some entropy during boot and runtime" + depends on GCC_PLUGINS + help + By saying Y here the kernel will instrument some kernel code to + extract some entropy from both original and artificially created + program state. This will help especially embedded systems where + there is little 'natural' source of entropy normally. The cost + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + + Note that entropy extracted this way is not cryptographically + secure! + + This plugin was ported from grsecurity/PaX. More information at: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + config HAVE_CC_STACKPROTECTOR bool help diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fe4c075bcf50..62df36c3f138 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif +CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) diff --git a/include/linux/random.h b/include/linux/random.h index 3d6e9815cd85..a59c74cdb1eb 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -18,6 +18,17 @@ struct random_ready_callback { }; extern void add_device_randomness(const void *, unsigned int); + +#if defined(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) && !defined(__CHECKER__) +static inline void add_latent_entropy(void) +{ + add_device_randomness((const void *)&latent_entropy, + sizeof(latent_entropy)); +} +#else +static inline void add_latent_entropy(void) {} +#endif + extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); extern void add_interrupt_randomness(int irq, int irq_flags); diff --git a/init/main.c b/init/main.c index a8a58e2794a5..2858be732f6d 100644 --- a/init/main.c +++ b/init/main.c @@ -789,6 +789,7 @@ int __init_or_module do_one_initcall(initcall_t fn) } WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + add_latent_entropy(); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..001b18473a07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1780,6 +1780,7 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + add_latent_entropy(); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c64ed3c..248851d1fc86 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -91,6 +91,11 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY +volatile u64 latent_entropy; +EXPORT_SYMBOL(latent_entropy); +#endif + /* * Array of node states. */ diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 61f0e6db909b..060d2cb373db 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -6,6 +6,12 @@ ifdef CONFIG_GCC_PLUGINS gcc-plugin-$(CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) += cyc_complexity_plugin.so + gcc-plugin-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += latent_entropy_plugin.so + gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += -DLATENT_ENTROPY_PLUGIN + ifdef CONFIG_PAX_LATENT_ENTROPY + DISABLE_LATENT_ENTROPY_PLUGIN += -fplugin-arg-latent_entropy_plugin-disable + endif + ifdef CONFIG_GCC_PLUGIN_SANCOV ifeq ($(CFLAGS_KCOV),) # It is needed because of the gcc-plugin.sh and gcc version checks. @@ -21,7 +27,8 @@ ifdef CONFIG_GCC_PLUGINS GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y)) - export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR SANCOV_PLUGIN + export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR + export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN ifneq ($(PLUGINCC),) # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication. diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c new file mode 100644 index 000000000000..ff1939b804ae --- /dev/null +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -0,0 +1,640 @@ +/* + * Copyright 2012-2016 by the PaX Team + * Copyright 2016 by Emese Revfy + * Licensed under the GPL v2 + * + * Note: the choice of the license means that the compilation process is + * NOT 'eligible' as defined by gcc's library exception to the GPL v3, + * but for the kernel it doesn't matter since it doesn't link against + * any of the gcc libraries + * + * This gcc plugin helps generate a little bit of entropy from program state, + * used throughout the uptime of the kernel. Here is an instrumentation example: + * + * before: + * void __latent_entropy test(int argc, char *argv[]) + * { + * if (argc <= 1) + * printf("%s: no command arguments :(\n", *argv); + * else + * printf("%s: %d command arguments!\n", *argv, args - 1); + * } + * + * after: + * void __latent_entropy test(int argc, char *argv[]) + * { + * // latent_entropy_execute() 1. + * unsigned long local_entropy; + * // init_local_entropy() 1. + * void *local_entropy_frameaddr; + * // init_local_entropy() 3. + * unsigned long tmp_latent_entropy; + * + * // init_local_entropy() 2. + * local_entropy_frameaddr = __builtin_frame_address(0); + * local_entropy = (unsigned long) local_entropy_frameaddr; + * + * // init_local_entropy() 4. + * tmp_latent_entropy = latent_entropy; + * // init_local_entropy() 5. + * local_entropy ^= tmp_latent_entropy; + * + * // latent_entropy_execute() 3. + * if (argc <= 1) { + * // perturb_local_entropy() + * local_entropy += 4623067384293424948; + * printf("%s: no command arguments :(\n", *argv); + * // perturb_local_entropy() + * } else { + * local_entropy ^= 3896280633962944730; + * printf("%s: %d command arguments!\n", *argv, args - 1); + * } + * + * // latent_entropy_execute() 4. 
+ * tmp_latent_entropy = rol(tmp_latent_entropy, local_entropy); + * latent_entropy = tmp_latent_entropy; + * } + * + * TODO: + * - add ipa pass to identify not explicitly marked candidate functions + * - mix in more program state (function arguments/return values, + * loop variables, etc) + * - more instrumentation control via attribute parameters + * + * BUGS: + * - none known + * + * Options: + * -fplugin-arg-latent_entropy_plugin-disable + * + * Attribute: __attribute__((latent_entropy)) + * The latent_entropy gcc attribute can be only on functions and variables. + * If it is on a function then the plugin will instrument it. If the attribute + * is on a variable then the plugin will initialize it with a random value. + * The variable must be an integer, an integer array type or a structure + * with integer fields. + */ + +#include "gcc-common.h" + +int plugin_is_GPL_compatible; + +static GTY(()) tree latent_entropy_decl; + +static struct plugin_info latent_entropy_plugin_info = { + .version = "201606141920vanilla", + .help = "disable\tturn off latent entropy instrumentation\n", +}; + +static unsigned HOST_WIDE_INT seed; +/* + * get_random_seed() (this is a GCC function) generates the seed. + * This is a simple random generator without any cryptographic security because + * the entropy doesn't come from here. + */ +static unsigned HOST_WIDE_INT get_random_const(void) +{ + unsigned int i; + unsigned HOST_WIDE_INT ret = 0; + + for (i = 0; i < 8 * sizeof(ret); i++) { + ret = (ret << 1) | (seed & 1); + seed >>= 1; + if (ret & 1) + seed ^= 0xD800000000000000ULL; + } + + return ret; +} + +static tree tree_get_random_const(tree type) +{ + unsigned long long mask; + + mask = 1ULL << (TREE_INT_CST_LOW(TYPE_SIZE(type)) - 1); + mask = 2 * (mask - 1) + 1; + + if (TYPE_UNSIGNED(type)) + return build_int_cstu(type, mask & get_random_const()); + return build_int_cst(type, mask & get_random_const()); +} + +static tree handle_latent_entropy_attribute(tree *node, tree name, + tree args __unused, + int flags __unused, + bool *no_add_attrs) +{ + tree type; +#if BUILDING_GCC_VERSION <= 4007 + VEC(constructor_elt, gc) *vals; +#else + vec *vals; +#endif + + switch (TREE_CODE(*node)) { + default: + *no_add_attrs = true; + error("%qE attribute only applies to functions and variables", + name); + break; + + case VAR_DECL: + if (DECL_INITIAL(*node)) { + *no_add_attrs = true; + error("variable %qD with %qE attribute must not be initialized", + *node, name); + break; + } + + if (!TREE_STATIC(*node)) { + *no_add_attrs = true; + error("variable %qD with %qE attribute must not be local", + *node, name); + break; + } + + type = TREE_TYPE(*node); + switch (TREE_CODE(type)) { + default: + *no_add_attrs = true; + error("variable %qD with %qE attribute must be an integer or a fixed length integer array type or a fixed sized structure with integer fields", + *node, name); + break; + + case RECORD_TYPE: { + tree fld, lst = TYPE_FIELDS(type); + unsigned int nelt = 0; + + for (fld = lst; fld; nelt++, fld = TREE_CHAIN(fld)) { + tree fieldtype; + + fieldtype = TREE_TYPE(fld); + if (TREE_CODE(fieldtype) == INTEGER_TYPE) + continue; + + *no_add_attrs = true; + error("structure variable %qD with %qE attribute has a non-integer field %qE", + *node, name, fld); + break; + } + + if (fld) + break; + +#if BUILDING_GCC_VERSION <= 4007 + vals = VEC_alloc(constructor_elt, gc, nelt); +#else + vec_alloc(vals, nelt); +#endif + + for (fld = lst; fld; fld = TREE_CHAIN(fld)) { + tree random_const, fld_t = TREE_TYPE(fld); + + random_const = 
tree_get_random_const(fld_t); + CONSTRUCTOR_APPEND_ELT(vals, fld, random_const); + } + + /* Initialize the fields with random constants */ + DECL_INITIAL(*node) = build_constructor(type, vals); + break; + } + + /* Initialize the variable with a random constant */ + case INTEGER_TYPE: + DECL_INITIAL(*node) = tree_get_random_const(type); + break; + + case ARRAY_TYPE: { + tree elt_type, array_size, elt_size; + unsigned int i, nelt; + HOST_WIDE_INT array_size_int, elt_size_int; + + elt_type = TREE_TYPE(type); + elt_size = TYPE_SIZE_UNIT(TREE_TYPE(type)); + array_size = TYPE_SIZE_UNIT(type); + + if (TREE_CODE(elt_type) != INTEGER_TYPE || !array_size + || TREE_CODE(array_size) != INTEGER_CST) { + *no_add_attrs = true; + error("array variable %qD with %qE attribute must be a fixed length integer array type", + *node, name); + break; + } + + array_size_int = TREE_INT_CST_LOW(array_size); + elt_size_int = TREE_INT_CST_LOW(elt_size); + nelt = array_size_int / elt_size_int; + +#if BUILDING_GCC_VERSION <= 4007 + vals = VEC_alloc(constructor_elt, gc, nelt); +#else + vec_alloc(vals, nelt); +#endif + + for (i = 0; i < nelt; i++) { + tree cst = size_int(i); + tree rand_cst = tree_get_random_const(elt_type); + + CONSTRUCTOR_APPEND_ELT(vals, cst, rand_cst); + } + + /* + * Initialize the elements of the array with random + * constants + */ + DECL_INITIAL(*node) = build_constructor(type, vals); + break; + } + } + break; + + case FUNCTION_DECL: + break; + } + + return NULL_TREE; +} + +static struct attribute_spec latent_entropy_attr = { + .name = "latent_entropy", + .min_length = 0, + .max_length = 0, + .decl_required = true, + .type_required = false, + .function_type_required = false, + .handler = handle_latent_entropy_attribute, +#if BUILDING_GCC_VERSION >= 4007 + .affects_type_identity = false +#endif +}; + +static void register_attributes(void *event_data __unused, void *data __unused) +{ + register_attribute(&latent_entropy_attr); +} + +static bool latent_entropy_gate(void) +{ + tree list; + + /* don't bother with noreturn functions for now */ + if (TREE_THIS_VOLATILE(current_function_decl)) + return false; + + /* gcc-4.5 doesn't discover some trivial noreturn functions */ + if (EDGE_COUNT(EXIT_BLOCK_PTR_FOR_FN(cfun)->preds) == 0) + return false; + + list = DECL_ATTRIBUTES(current_function_decl); + return lookup_attribute("latent_entropy", list) != NULL_TREE; +} + +static tree create_var(tree type, const char *name) +{ + tree var; + + var = create_tmp_var(type, name); + add_referenced_var(var); + mark_sym_for_renaming(var); + return var; +} + +/* + * Set up the next operation and its constant operand to use in the latent + * entropy PRNG. When RHS is specified, the request is for perturbing the + * local latent entropy variable, otherwise it is for perturbing the global + * latent entropy variable where the two operands are already given by the + * local and global latent entropy variables themselves. + * + * The operation is one of add/xor/rol when instrumenting the local entropy + * variable and one of add/xor when perturbing the global entropy variable. + * Rotation is not used for the latter case because it would transmit less + * entropy to the global variable than the other two operations. 
+ */ +static enum tree_code get_op(tree *rhs) +{ + static enum tree_code op; + unsigned HOST_WIDE_INT random_const; + + random_const = get_random_const(); + + switch (op) { + case BIT_XOR_EXPR: + op = PLUS_EXPR; + break; + + case PLUS_EXPR: + if (rhs) { + op = LROTATE_EXPR; + /* + * This code limits the value of random_const to + * the size of a wide int for the rotation + */ + random_const &= HOST_BITS_PER_WIDE_INT - 1; + break; + } + + case LROTATE_EXPR: + default: + op = BIT_XOR_EXPR; + break; + } + if (rhs) + *rhs = build_int_cstu(unsigned_intDI_type_node, random_const); + return op; +} + +static gimple create_assign(enum tree_code code, tree lhs, tree op1, + tree op2) +{ + return gimple_build_assign_with_ops(code, lhs, op1, op2); +} + +static void perturb_local_entropy(basic_block bb, tree local_entropy) +{ + gimple_stmt_iterator gsi; + gimple assign; + tree rhs; + enum tree_code op; + + op = get_op(&rhs); + assign = create_assign(op, local_entropy, local_entropy, rhs); + gsi = gsi_after_labels(bb); + gsi_insert_before(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static void __perturb_latent_entropy(gimple_stmt_iterator *gsi, + tree local_entropy) +{ + gimple assign; + tree temp; + enum tree_code op; + + /* 1. create temporary copy of latent_entropy */ + temp = create_var(unsigned_intDI_type_node, "tmp_latent_entropy"); + + /* 2. read... */ + add_referenced_var(latent_entropy_decl); + mark_sym_for_renaming(latent_entropy_decl); + assign = gimple_build_assign(temp, latent_entropy_decl); + gsi_insert_before(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 3. ...modify... */ + op = get_op(NULL); + assign = create_assign(op, temp, temp, local_entropy); + gsi_insert_after(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 4. ...write latent_entropy */ + assign = gimple_build_assign(latent_entropy_decl, temp); + gsi_insert_after(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static bool handle_tail_calls(basic_block bb, tree local_entropy) +{ + gimple_stmt_iterator gsi; + + for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) { + gcall *call; + gimple stmt = gsi_stmt(gsi); + + if (!is_gimple_call(stmt)) + continue; + + call = as_a_gcall(stmt); + if (!gimple_call_tail_p(call)) + continue; + + __perturb_latent_entropy(&gsi, local_entropy); + return true; + } + + return false; +} + +static void perturb_latent_entropy(tree local_entropy) +{ + edge_iterator ei; + edge e, last_bb_e; + basic_block last_bb; + + gcc_assert(single_pred_p(EXIT_BLOCK_PTR_FOR_FN(cfun))); + last_bb_e = single_pred_edge(EXIT_BLOCK_PTR_FOR_FN(cfun)); + + FOR_EACH_EDGE(e, ei, last_bb_e->src->preds) { + if (ENTRY_BLOCK_PTR_FOR_FN(cfun) == e->src) + continue; + if (EXIT_BLOCK_PTR_FOR_FN(cfun) == e->src) + continue; + + handle_tail_calls(e->src, local_entropy); + } + + last_bb = single_pred(EXIT_BLOCK_PTR_FOR_FN(cfun)); + if (!handle_tail_calls(last_bb, local_entropy)) { + gimple_stmt_iterator gsi = gsi_last_bb(last_bb); + + __perturb_latent_entropy(&gsi, local_entropy); + } +} + +static void init_local_entropy(basic_block bb, tree local_entropy) +{ + gimple assign, call; + tree frame_addr, rand_const, tmp, fndecl, udi_frame_addr; + enum tree_code op; + unsigned HOST_WIDE_INT rand_cst; + gimple_stmt_iterator gsi = gsi_after_labels(bb); + + /* 1. create local_entropy_frameaddr */ + frame_addr = create_var(ptr_type_node, "local_entropy_frameaddr"); + + /* 2. 
local_entropy_frameaddr = __builtin_frame_address() */ + fndecl = builtin_decl_implicit(BUILT_IN_FRAME_ADDRESS); + call = gimple_build_call(fndecl, 1, integer_zero_node); + gimple_call_set_lhs(call, frame_addr); + gsi_insert_before(&gsi, call, GSI_NEW_STMT); + update_stmt(call); + + udi_frame_addr = fold_convert(unsigned_intDI_type_node, frame_addr); + assign = gimple_build_assign(local_entropy, udi_frame_addr); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 3. create temporary copy of latent_entropy */ + tmp = create_var(unsigned_intDI_type_node, "tmp_latent_entropy"); + + /* 4. read the global entropy variable into local entropy */ + add_referenced_var(latent_entropy_decl); + mark_sym_for_renaming(latent_entropy_decl); + assign = gimple_build_assign(tmp, latent_entropy_decl); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 5. mix local_entropy_frameaddr into local entropy */ + assign = create_assign(BIT_XOR_EXPR, local_entropy, local_entropy, tmp); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + rand_cst = get_random_const(); + rand_const = build_int_cstu(unsigned_intDI_type_node, rand_cst); + op = get_op(NULL); + assign = create_assign(op, local_entropy, local_entropy, rand_const); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static bool create_latent_entropy_decl(void) +{ + varpool_node_ptr node; + + if (latent_entropy_decl != NULL_TREE) + return true; + + FOR_EACH_VARIABLE(node) { + tree name, var = NODE_DECL(node); + + if (DECL_NAME_LENGTH(var) < sizeof("latent_entropy") - 1) + continue; + + name = DECL_NAME(var); + if (strcmp(IDENTIFIER_POINTER(name), "latent_entropy")) + continue; + + latent_entropy_decl = var; + break; + } + + return latent_entropy_decl != NULL_TREE; +} + +static unsigned int latent_entropy_execute(void) +{ + basic_block bb; + tree local_entropy; + + if (!create_latent_entropy_decl()) + return 0; + + /* prepare for step 2 below */ + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + if (!single_pred_p(bb)) { + split_edge(single_succ_edge(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + } + + /* 1. create the local entropy variable */ + local_entropy = create_var(unsigned_intDI_type_node, "local_entropy"); + + /* 2. initialize the local entropy variable */ + init_local_entropy(bb, local_entropy); + + bb = bb->next_bb; + + /* + * 3. instrument each BB with an operation on the + * local entropy variable + */ + while (bb != EXIT_BLOCK_PTR_FOR_FN(cfun)) { + perturb_local_entropy(bb, local_entropy); + bb = bb->next_bb; + }; + + /* 4. 
mix local entropy into the global entropy variable */ + perturb_latent_entropy(local_entropy); + return 0; +} + +static void latent_entropy_start_unit(void *gcc_data __unused, + void *user_data __unused) +{ + tree type, id; + int quals; + + seed = get_random_seed(false); + + if (in_lto_p) + return; + + /* extern volatile u64 latent_entropy */ + gcc_assert(TYPE_PRECISION(long_long_unsigned_type_node) == 64); + quals = TYPE_QUALS(long_long_unsigned_type_node) | TYPE_QUAL_VOLATILE; + type = build_qualified_type(long_long_unsigned_type_node, quals); + id = get_identifier("latent_entropy"); + latent_entropy_decl = build_decl(UNKNOWN_LOCATION, VAR_DECL, id, type); + + TREE_STATIC(latent_entropy_decl) = 1; + TREE_PUBLIC(latent_entropy_decl) = 1; + TREE_USED(latent_entropy_decl) = 1; + DECL_PRESERVE_P(latent_entropy_decl) = 1; + TREE_THIS_VOLATILE(latent_entropy_decl) = 1; + DECL_EXTERNAL(latent_entropy_decl) = 1; + DECL_ARTIFICIAL(latent_entropy_decl) = 1; + lang_hooks.decls.pushdecl(latent_entropy_decl); +} + +#define PASS_NAME latent_entropy +#define PROPERTIES_REQUIRED PROP_gimple_leh | PROP_cfg +#define TODO_FLAGS_FINISH TODO_verify_ssa | TODO_verify_stmts | TODO_dump_func \ + | TODO_update_ssa +#include "gcc-generate-gimple-pass.h" + +int plugin_init(struct plugin_name_args *plugin_info, + struct plugin_gcc_version *version) +{ + bool enabled = true; + const char * const plugin_name = plugin_info->base_name; + const int argc = plugin_info->argc; + const struct plugin_argument * const argv = plugin_info->argv; + int i; + + struct register_pass_info latent_entropy_pass_info; + + latent_entropy_pass_info.pass = make_latent_entropy_pass(); + latent_entropy_pass_info.reference_pass_name = "optimized"; + latent_entropy_pass_info.ref_pass_instance_number = 1; + latent_entropy_pass_info.pos_op = PASS_POS_INSERT_BEFORE; + static const struct ggc_root_tab gt_ggc_r_gt_latent_entropy[] = { + { + .base = &latent_entropy_decl, + .nelt = 1, + .stride = sizeof(latent_entropy_decl), + .cb = >_ggc_mx_tree_node, + .pchw = >_pch_nx_tree_node + }, + LAST_GGC_ROOT_TAB + }; + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + for (i = 0; i < argc; ++i) { + if (!(strcmp(argv[i].key, "disable"))) { + enabled = false; + continue; + } + error(G_("unkown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key); + } + + register_callback(plugin_name, PLUGIN_INFO, NULL, + &latent_entropy_plugin_info); + if (enabled) { + register_callback(plugin_name, PLUGIN_START_UNIT, + &latent_entropy_start_unit, NULL); + register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, + NULL, (void *)>_ggc_r_gt_latent_entropy); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &latent_entropy_pass_info); + } + register_callback(plugin_name, PLUGIN_ATTRIBUTES, register_attributes, + NULL); + + return 0; +} -- cgit v1.2.3 From 0766f788eb727e2e330d55d30545db65bcf2623f Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:42:34 +0200 Subject: latent_entropy: Mark functions with __latent_entropy The __latent_entropy gcc attribute can be used only on functions and variables. If it is on a function then the plugin will instrument it for gathering control-flow entropy. If the attribute is on a variable then the plugin will initialize it with random contents. The variable must be an integer, an integer array type or a structure with integer fields. 
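[Editor's note] As a rough usage sketch (not taken from the kernel tree), this is how a translation unit opts in once the plugin is loaded. LATENT_ENTROPY_PLUGIN is the macro that scripts/Makefile.gcc-plugins defines alongside the -fplugin= flag; without it the attribute collapses to nothing, so the same code builds with a plain compiler. The names below are hypothetical examples.

#ifdef LATENT_ENTROPY_PLUGIN
#define __latent_entropy __attribute__((latent_entropy))
#else
#define __latent_entropy
#endif

/* marked variable: must be static, uninitialized, and an integer, a
 * fixed-size integer array, or a struct of integers; the plugin fills
 * it with build-time random contents */
static unsigned long boot_noise[4] __latent_entropy;

/* marked function: the plugin instruments its control flow */
static void __latent_entropy demo_handler(unsigned int n)
{
	while (n--)
		boot_noise[n % 4] ^= n;
}

int main(void)
{
	demo_handler(16);
	return (int)(boot_noise[0] & 1);
}

Instrumentation can be switched off again per build with the plugin's only option, -fplugin-arg-latent_entropy_plugin-disable, as listed in the plugin's help text.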
These specific functions have been selected because they are init functions (to help gather boot-time entropy), are called at unpredictable times, or they have variable loops, each of which provide some level of latent entropy. Signed-off-by: Emese Revfy [kees: expanded commit message] Signed-off-by: Kees Cook --- block/blk-softirq.c | 2 +- drivers/char/random.c | 4 ++-- fs/namespace.c | 1 + include/linux/compiler-gcc.h | 7 +++++++ include/linux/compiler.h | 4 ++++ include/linux/fdtable.h | 2 +- include/linux/genhd.h | 2 +- include/linux/init.h | 5 +++-- include/linux/random.h | 4 ++-- kernel/fork.c | 6 ++++-- kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 2 +- kernel/sched/fair.c | 2 +- kernel/softirq.c | 4 ++-- kernel/time/timer.c | 2 +- lib/irq_poll.c | 2 +- lib/random32.c | 2 +- mm/page_alloc.c | 2 +- net/core/dev.c | 4 ++-- 19 files changed, 37 insertions(+), 22 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 53b1737e978d..489eab825a8a 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -18,7 +18,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); * Softirq action handler - move entries to local list and loop over them * while passing them to the queue registered handler. */ -static void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { struct list_head *cpu_list, local_list; diff --git a/drivers/char/random.c b/drivers/char/random.c index 3efb3bf0ab83..7274ae89ddb3 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -479,8 +479,8 @@ static ssize_t _extract_entropy(struct entropy_store *r, void *buf, static void crng_reseed(struct crng_state *crng, struct entropy_store *r); static void push_to_pool(struct work_struct *work); -static __u32 input_pool_data[INPUT_POOL_WORDS]; -static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; +static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy; +static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy; static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..4a9568b81138 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2759,6 +2759,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) return new_ns; } +__latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 573c5a18908f..432f5c97e18f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -188,6 +188,13 @@ #endif /* GCC_VERSION >= 40300 */ #if GCC_VERSION >= 40500 + +#ifndef __CHECKER__ +#ifdef LATENT_ENTROPY_PLUGIN +#define __latent_entropy __attribute__((latent_entropy)) +#endif +#endif + /* * Mark a position in code as unreachable. This can be used to * suppress control flow warnings after asm blocks that transfer diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 668569844d37..ceaddaf76ff1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -406,6 +406,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s # define __attribute_const__ /* unimplemented */ #endif +#ifndef __latent_entropy +# define __latent_entropy +#endif + /* * Tell gcc if a function is cold. The compiler will assume any path * directly leading to the call is unlikely. 
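[Editor's note] The compiler-gcc.h/compiler.h pair above follows the kernel's usual pattern for optional attributes: define the real attribute only where it can work, and provide an empty fallback everywhere else. A condensed stand-alone approximation (guards simplified, not the real headers) shows why annotated declarations stay valid C when sparse runs or the plugin is absent:

/* only define the real attribute when gcc is compiling, sparse
 * (__CHECKER__) is not parsing the file, and the plugin is enabled */
#if defined(__GNUC__) && !defined(__CHECKER__) && defined(LATENT_ENTROPY_PLUGIN)
#define __latent_entropy __attribute__((latent_entropy))
#endif

/* fallback: every other build sees an empty macro */
#ifndef __latent_entropy
#define __latent_entropy
#endif

/* an annotated handler compiles unchanged either way */
static void __latent_entropy demo_softirq(void)
{
}

int main(void)
{
	demo_softirq();
	return 0;
}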
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 5295535b60c6..9852c7e33466 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -105,7 +105,7 @@ struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); -struct files_struct *dup_fd(struct files_struct *, int *); +struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 1dbf52f9c24b..e0341af6950e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -437,7 +437,7 @@ extern void disk_flush_events(struct gendisk *disk, unsigned int mask); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ -extern void add_disk_randomness(struct gendisk *disk); +extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) diff --git a/include/linux/init.h b/include/linux/init.h index 6935d02474aa..1e5c131d5c9a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -39,7 +39,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold notrace +#define __init __section(.init.text) __cold notrace __latent_entropy #define __initdata __section(.init.data) #define __initconst __constsection(.init.rodata) #define __exitdata __section(.exit.data) @@ -86,7 +86,8 @@ #define __exit __section(.exit.text) __exitused __cold notrace /* Used for MEMORY_HOTPLUG */ -#define __meminit __section(.meminit.text) __cold notrace +#define __meminit __section(.meminit.text) __cold notrace \ + __latent_entropy #define __meminitdata __section(.meminit.data) #define __meminitconst __constsection(.meminit.rodata) #define __memexit __section(.memexit.text) __exitused __cold notrace diff --git a/include/linux/random.h b/include/linux/random.h index a59c74cdb1eb..d80a4388a4fd 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -30,8 +30,8 @@ static inline void add_latent_entropy(void) {} #endif extern void add_input_randomness(unsigned int type, unsigned int code, - unsigned int value); -extern void add_interrupt_randomness(int irq, int irq_flags); + unsigned int value) __latent_entropy; +extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); diff --git a/kernel/fork.c b/kernel/fork.c index 001b18473a07..05393881ef39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,7 +404,8 @@ free_tsk: } #ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; @@ -1296,7 +1297,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. 
*/ -static struct task_struct *copy_process(unsigned long clone_flags, +static __latent_entropy struct task_struct *copy_process( + unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) false)); } -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..e5164deb51e1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..004996df2f10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8283,7 +8283,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..34033fd09c8c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -482,7 +482,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -518,7 +518,7 @@ static void tasklet_action(struct softirq_action *a) } } -static void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..2d47980a1bc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1633,7 +1633,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. 
*/ -static void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 836f7db4e548..63be7495dbb2 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; diff --git a/lib/random32.c b/lib/random32.c index 69ed593aab07..a30923573676 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void) } #endif -static DEFINE_PER_CPU(struct rnd_state, net_rand_state); +static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; /** * prandom_u32_state - seeded pseudo-random number generator. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 248851d1fc86..901121af4e54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -92,7 +92,7 @@ int _node_numa_mem_[MAX_NUMNODES]; #endif #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -volatile u64 latent_entropy; +volatile u64 latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); #endif diff --git a/net/core/dev.c b/net/core/dev.c index ea6312057a71..ee076c2791f9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3855,7 +3855,7 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); -static void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5187,7 +5187,7 @@ out_unlock: return work; } -static void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; -- cgit v1.2.3
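[Editor's note] With the commits above applied, the latent_entropy global in mm/page_alloc.c is both filled with build-time random contents and perturbed at run time by every __latent_entropy-marked function. As a closing illustration, here is a stand-alone model of get_random_const() from latent_entropy_plugin.c, the shift-register generator that produces the per-site constants; the starting seed below is an arbitrary demo value, whereas the plugin obtains it from gcc's get_random_seed().

#include <stdint.h>
#include <stdio.h>

static uint64_t seed = 0x123456789abcdef0ULL;	/* demo value; gcc supplies the real seed */

static uint64_t get_random_const(void)
{
	uint64_t ret = 0;
	unsigned int i;

	for (i = 0; i < 64; i++) {
		ret = (ret << 1) | (seed & 1);
		seed >>= 1;
		if (ret & 1)
			seed ^= 0xD800000000000000ULL;
	}
	return ret;
}

int main(void)
{
	int i;

	/* each call yields the next compile-time constant the plugin
	 * would embed at an instrumentation site */
	for (i = 0; i < 4; i++)
		printf("const %d: %016llx\n", i,
		       (unsigned long long)get_random_const());
	return 0;
}

As the plugin's own comment notes, this generator has no cryptographic strength; the entropy is meant to come from runtime variation in the instrumented code, not from these constants.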