summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorJames Morris <james.l.morris@oracle.com>2017-11-29 12:47:41 +1100
committerJames Morris <james.l.morris@oracle.com>2017-11-29 12:47:41 +1100
commitcf40a76e7d5874bb25f4404eecc58a2e033af885 (patch)
tree8fd81cbea03c87b3d41d7ae5b1d11eadd35d6ef5 /mm/page_alloc.c
parentab5348c9c23cd253f5902980d2d8fe067dc24c82 (diff)
parent4fbd8d194f06c8a3fd2af1ce560ddb31f7ec8323 (diff)
Merge tag 'v4.15-rc1' into next-seccomp
Linux 4.15-rc1
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c1011
1 files changed, 526 insertions, 485 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d30e914afb6..d4096f4a5c1f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
@@ -66,6 +65,8 @@
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/lockdep.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -81,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
+DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
+
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -288,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+/*
+ * Determine how many pages need to be initialized durig early boot
+ * (non-deferred initialization).
+ * The value of first_deferred_pfn will be set later, once non-deferred pages
+ * are initialized, but for now set it ULONG_MAX.
+ */
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
- unsigned long max_initialise;
- unsigned long reserved_lowmem;
+ phys_addr_t start_addr, end_addr;
+ unsigned long max_pgcnt;
+ unsigned long reserved;
/*
* Initialise at least 2G of a node but also take into account that
* two large system hashes that can take up 1GB for 0.25TB/node.
*/
- max_initialise = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
+ max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
/*
* Compensate the all the memblock reservations (e.g. crash kernel)
* from the initial estimation to make sure we will initialize enough
* memory to boot.
*/
- reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
- pgdat->node_start_pfn + max_initialise);
- max_initialise += reserved_lowmem;
+ start_addr = PFN_PHYS(pgdat->node_start_pfn);
+ end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
+ reserved = memblock_reserved_memory_within(start_addr, end_addr);
+ max_pgcnt += PHYS_PFN(reserved);
- pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
+ pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
@@ -336,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
if (zone_end < pgdat_end_pfn(pgdat))
return true;
(*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_size) &&
+ if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -1011,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
- kmemcheck_free_shadow(page, order);
/*
* Check tail pages before head page information is cleared to
@@ -1168,6 +1179,7 @@ static void free_one_page(struct zone *zone,
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1188,7 +1200,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn)
{
pg_data_t *pgdat;
int nid, zid;
@@ -1408,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(struct page *page,
- unsigned long pfn, int nr_pages)
+static void __init deferred_free_range(unsigned long pfn,
+ unsigned long nr_pages)
{
- int i;
+ struct page *page;
+ unsigned long i;
- if (!page)
+ if (!nr_pages)
return;
+ page = pfn_to_page(pfn);
+
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
@@ -1441,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)
complete(&pgdat_init_all_done_comp);
}
+/*
+ * Helper for deferred_init_range, free the given range, reset the counters, and
+ * return number of pages freed.
+ */
+static inline unsigned long __init __def_free(unsigned long *nr_free,
+ unsigned long *free_base_pfn,
+ struct page **page)
+{
+ unsigned long nr = *nr_free;
+
+ deferred_free_range(*free_base_pfn, nr);
+ *free_base_pfn = 0;
+ *nr_free = 0;
+ *page = NULL;
+
+ return nr;
+}
+
+static unsigned long __init deferred_init_range(int nid, int zid,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ unsigned long free_base_pfn = 0;
+ unsigned long nr_pages = 0;
+ unsigned long nr_free = 0;
+ struct page *page = NULL;
+ unsigned long pfn;
+
+ /*
+ * First we check if pfn is valid on architectures where it is possible
+ * to have holes within pageblock_nr_pages. On systems where it is not
+ * possible, this function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the
+ * validity of the head pfn.
+ *
+ * meminit_pfn_in_nid is checked on systems where pfns can interleave
+ * within a node: a pfn is between start and end of a node, but does not
+ * belong to this memory node.
+ *
+ * Finally, we minimize pfn page lookups and scheduler checks by
+ * performing it only once every pageblock_nr_pages.
+ *
+ * We do it in two loops: first we initialize struct page, than free to
+ * buddy allocator, becuse while we are freeing pages we can access
+ * pages that are ahead (computing buddy page in __free_one_page()).
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+ if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
+ if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ if (page && (pfn & nr_pgmask))
+ page++;
+ else
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zid, nid);
+ cond_resched();
+ }
+ }
+ }
+
+ page = NULL;
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (page && (pfn & nr_pgmask)) {
+ page++;
+ nr_free++;
+ } else {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ page = pfn_to_page(pfn);
+ free_base_pfn = pfn;
+ nr_free = 1;
+ cond_resched();
+ }
+ }
+ /* Free the last block of pages to allocator */
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+
+ return nr_pages;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
int nid = pgdat->node_id;
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long start = jiffies;
unsigned long nr_pages = 0;
- unsigned long walk_start, walk_end;
- int i, zid;
+ unsigned long spfn, epfn;
+ phys_addr_t spa, epa;
+ int zid;
struct zone *zone;
unsigned long first_init_pfn = pgdat->first_deferred_pfn;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ u64 i;
if (first_init_pfn == ULONG_MAX) {
pgdat_init_report_one_done();
@@ -1475,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
+ first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
- for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
- unsigned long pfn, end_pfn;
- struct page *page = NULL;
- struct page *free_base_page = NULL;
- unsigned long free_base_pfn = 0;
- int nr_to_free = 0;
-
- end_pfn = min(walk_end, zone_end_pfn(zone));
- pfn = first_init_pfn;
- if (pfn < walk_start)
- pfn = walk_start;
- if (pfn < zone->zone_start_pfn)
- pfn = zone->zone_start_pfn;
-
- for (; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
- goto free_range;
-
- /*
- * Ensure pfn_valid is checked every
- * pageblock_nr_pages for memory holes
- */
- if ((pfn & (pageblock_nr_pages - 1)) == 0) {
- if (!pfn_valid(pfn)) {
- page = NULL;
- goto free_range;
- }
- }
-
- if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
- page = NULL;
- goto free_range;
- }
-
- /* Minimise pfn page lookups and scheduler checks */
- if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
- page++;
- } else {
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page,
- free_base_pfn, nr_to_free);
- free_base_page = NULL;
- free_base_pfn = nr_to_free = 0;
-
- page = pfn_to_page(pfn);
- cond_resched();
- }
-
- if (page->flags) {
- VM_BUG_ON(page_zone(page) != zone);
- goto free_range;
- }
-
- __init_single_page(page, pfn, zid, nid);
- if (!free_base_page) {
- free_base_page = page;
- free_base_pfn = pfn;
- nr_to_free = 0;
- }
- nr_to_free++;
-
- /* Where possible, batch up pages for a single free */
- continue;
-free_range:
- /* Free the current block of pages to allocator */
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page, free_base_pfn,
- nr_to_free);
- free_base_page = NULL;
- free_base_pfn = nr_to_free = 0;
- }
- /* Free the last block of pages to allocator */
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
-
- first_init_pfn = max(end_pfn, first_init_pfn);
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ nr_pages += deferred_init_range(nid, zid, spfn, epfn);
}
/* Sanity check that the next zone really is unpopulated */
@@ -1584,6 +1618,10 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+ /* Discard memblock private memory */
+ memblock_discard();
+#endif
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -1786,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
-static inline
+static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
@@ -1830,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
};
#ifdef CONFIG_CMA
-static struct page *__rmqueue_cma_fallback(struct zone *zone,
+static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
@@ -2211,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
-static inline bool
+static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
@@ -2283,8 +2321,8 @@ do_steal:
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
+static __always_inline struct page *
+__rmqueue(struct zone *zone, unsigned int order, int migratetype)
{
struct page *page;
@@ -2309,7 +2347,7 @@ retry:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype, bool cold)
+ int migratetype)
{
int i, alloced = 0;
@@ -2323,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
continue;
/*
- * Split buddy pages returned by expand() are received here
- * in physical page order. The page is added to the callers and
- * list and the list head then moves forward. From the callers
- * perspective, the linked list is ordered by page number in
- * some conditions. This is useful for IO devices that can
- * merge IO requests if the physical pages are ordered
- * properly.
+ * Split buddy pages returned by expand() are received here in
+ * physical page order. The page is added to the tail of
+ * caller's list. From the callers perspective, the linked list
+ * is ordered by page number under some conditions. This is
+ * useful for IO devices that can forward direction from the
+ * head, thus also in the physical page order. This is useful
+ * for IO devices that can merge IO requests if the physical
+ * pages are ordered properly.
*/
- if (likely(!cold))
- list_add(&page->lru, list);
- else
- list_add_tail(&page->lru, list);
- list = &page->lru;
+ list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -2531,9 +2566,14 @@ void drain_all_pages(struct zone *zone)
#ifdef CONFIG_HIBERNATION
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
void mark_free_pages(struct zone *zone)
{
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
@@ -2548,6 +2588,11 @@ void mark_free_pages(struct zone *zone)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
if (page_zone(page) != zone)
continue;
@@ -2561,32 +2606,38 @@ void mark_free_pages(struct zone *zone)
unsigned long i;
pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++)
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
}
}
spin_unlock_irqrestore(&zone->lock, flags);
}
#endif /* CONFIG_PM */
-/*
- * Free a 0-order page
- * cold == true ? free a cold page : free a hot page
- */
-void free_hot_cold_page(struct page *page, bool cold)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
{
- struct zone *zone = page_zone(page);
- struct per_cpu_pages *pcp;
- unsigned long flags;
- unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!free_pcp_prepare(page))
- return;
+ return false;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
- local_irq_save(flags);
+ return true;
+}
+
+static void free_unref_page_commit(struct page *page, unsigned long pfn)
+{
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ int migratetype;
+
+ migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
/*
@@ -2599,38 +2650,62 @@ void free_hot_cold_page(struct page *page, bool cold)
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, pfn, 0, migratetype);
- goto out;
+ return;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- if (!cold)
- list_add(&page->lru, &pcp->lists[migratetype]);
- else
- list_add_tail(&page->lru, &pcp->lists[migratetype]);
+ list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
+}
-out:
+/*
+ * Free a 0-order page
+ */
+void free_unref_page(struct page *page)
+{
+ unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
+
+ if (!free_unref_page_prepare(page, pfn))
+ return;
+
+ local_irq_save(flags);
+ free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, bool cold)
+void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
+ unsigned long flags, pfn;
+ /* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
- trace_mm_page_free_batched(page, cold);
- free_hot_cold_page(page, cold);
+ pfn = page_to_pfn(page);
+ if (!free_unref_page_prepare(page, pfn))
+ list_del(&page->lru);
+ set_page_private(page, pfn);
}
+
+ local_irq_save(flags);
+ list_for_each_entry_safe(page, next, list, lru) {
+ unsigned long pfn = page_private(page);
+
+ set_page_private(page, 0);
+ trace_mm_page_free_batched(page);
+ free_unref_page_commit(page, pfn);
+ }
+ local_irq_restore(flags);
}
/*
@@ -2648,15 +2723,6 @@ void split_page(struct page *page, unsigned int order)
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
-#ifdef CONFIG_KMEMCHECK
- /*
- * Split shadow pages too, because free(page[0]) would
- * otherwise free the whole shadow.
- */
- if (kmemcheck_page_is_tracked(page))
- split_page(virt_to_page(page[0].shadow), order);
-#endif
-
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order);
@@ -2720,24 +2786,28 @@ int __isolate_free_page(struct page *page, unsigned int order)
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
- enum zone_stat_item local_stat = NUMA_LOCAL;
+ enum numa_stat_item local_stat = NUMA_LOCAL;
+
+ /* skip numa counters update if numa stats is disabled */
+ if (!static_branch_likely(&vm_numa_stat_key))
+ return;
if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
if (z->node == preferred_zone->node)
- __inc_zone_state(z, NUMA_HIT);
+ __inc_numa_state(z, NUMA_HIT);
else {
- __inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+ __inc_numa_state(z, NUMA_MISS);
+ __inc_numa_state(preferred_zone, NUMA_FOREIGN);
}
- __inc_zone_state(z, local_stat);
+ __inc_numa_state(z, local_stat);
#endif
}
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
- bool cold, struct per_cpu_pages *pcp,
+ struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
@@ -2746,16 +2816,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
- migratetype, cold);
+ migratetype);
if (unlikely(list_empty(list)))
return NULL;
}
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
-
+ page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
@@ -2770,14 +2836,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
{
struct per_cpu_pages *pcp;
struct list_head *list;
- bool cold = ((gfp_flags & __GFP_COLD) != 0);
struct page *page;
unsigned long flags;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
@@ -2930,7 +2995,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
{
long min = mark;
int o;
- const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2943,10 +3008,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
* the high-atomic reserves. This will over-estimate the size of the
* atomic reserve but it avoids a search.
*/
- if (likely(!alloc_harder))
+ if (likely(!alloc_harder)) {
free_pages -= z->nr_reserved_highatomic;
- else
- min -= min / 4;
+ } else {
+ /*
+ * OOM victims can try even harder than normal ALLOC_HARDER
+ * users on the grounds that it's definitely going to be in
+ * the exit path shortly and free memory. Any allocation it
+ * makes during the free path will be small and short-lived.
+ */
+ if (alloc_flags & ALLOC_OOM)
+ min -= min / 2;
+ else
+ min -= min / 4;
+ }
+
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
@@ -2974,9 +3050,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
if (!area->nr_free)
continue;
- if (alloc_harder)
- return true;
-
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
if (!list_empty(&area->free_list[mt]))
return true;
@@ -2988,6 +3061,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
}
#endif
+ if (alloc_harder &&
+ !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+ return true;
}
return false;
}
@@ -3184,7 +3260,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
- if (test_thread_flag(TIF_MEMDIE) ||
+ if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3203,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
- pr_warn("%s: ", current->comm);
-
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- pr_cont("%pV", &vaf);
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+ current->comm, &vaf, gfp_mask, &gfp_mask,
+ nodemask_pr_args(nodemask));
va_end(args);
- pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
- if (nodemask)
- pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
- else
- pr_cont("(null)\n");
-
cpuset_print_current_mems_allowed();
dump_stack();
@@ -3271,10 +3341,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
- * we're still under heavy pressure.
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
*/
- page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+ ~__GFP_DIRECT_RECLAIM, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -3490,6 +3563,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
}
#endif /* CONFIG_COMPACTION */
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+ STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* this guy won't enter reclaim */
+ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return false;
+
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return false;
+
+ return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3504,7 +3618,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3512,7 +3626,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
ac->nodemask);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
@@ -3603,21 +3717,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
return alloc_flags;
}
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
{
- if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ if (!tsk_is_oom_victim(tsk))
+ return false;
+
+ /*
+ * !MMU doesn't have oom reaper so give access to memory reserves
+ * only to the thread with TIF_MEMDIE set
+ */
+ if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
return false;
+ return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return 0;
if (gfp_mask & __GFP_MEMALLOC)
- return true;
+ return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- return true;
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- return true;
+ return ALLOC_NO_WATERMARKS;
+ if (!in_interrupt()) {
+ if (current->flags & PF_MEMALLOC)
+ return ALLOC_NO_WATERMARKS;
+ else if (oom_reserves_allowed(current))
+ return ALLOC_OOM;
+ }
- return false;
+ return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!__gfp_pfmemalloc_flags(gfp_mask);
}
/*
@@ -3767,9 +3906,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
- unsigned long alloc_start = jiffies;
- unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
+ int reserve_flags;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3875,15 +4013,16 @@ retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
- if (gfp_pfmemalloc_allowed(gfp_mask))
- alloc_flags = ALLOC_NO_WATERMARKS;
+ reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+ if (reserve_flags)
+ alloc_flags = reserve_flags;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
@@ -3898,14 +4037,6 @@ retry:
if (!can_direct_reclaim)
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
@@ -3960,8 +4091,8 @@ retry:
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) &&
- (alloc_flags == ALLOC_NO_WATERMARKS ||
+ if (tsk_is_oom_victim(current) &&
+ (alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -4041,7 +4172,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags |= ALLOC_CPUSET;
}
- lockdep_trace_alloc(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
@@ -4079,10 +4211,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+ gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
gfp_mask &= gfp_allowed_mask;
+ alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
@@ -4118,9 +4251,6 @@ out:
page = NULL;
}
- if (kmemcheck_enabled && page)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
@@ -4157,7 +4287,7 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_cold_page(page, false);
+ free_unref_page(page);
else
__free_pages_ok(page, order);
}
@@ -4215,7 +4345,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
unsigned int order = compound_order(page);
if (order == 0)
- free_hot_cold_page(page, false);
+ free_unref_page(page);
else
__free_pages_ok(page, order);
}
@@ -4443,7 +4573,7 @@ long si_mem_available(void)
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
- available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
@@ -4458,8 +4588,9 @@ long si_mem_available(void)
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+ wmark_low);
if (available < 0)
available = 0;
@@ -4471,7 +4602,7 @@ void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_page_state(NR_FREE_PAGES);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
@@ -4602,15 +4733,15 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_SLAB_RECLAIMABLE),
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state(NR_SLAB_RECLAIMABLE),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
- global_page_state(NR_PAGETABLE),
- global_page_state(NR_BOUNCE),
- global_page_state(NR_FREE_PAGES),
+ global_zone_page_state(NR_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_zone_page_state(NR_FREE_PAGES),
free_pcp,
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4772,18 +4903,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
*
* Add all populated zones of a node to the zonelist.
*/
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
+ int nr_zones = 0;
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (managed_zone(zone)) {
- zoneref_set_zone(zone,
- &zonelist->_zonerefs[nr_zones++]);
+ zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);
@@ -4791,52 +4921,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
return nr_zones;
}
-
-/*
- * zonelist_order:
- * 0 = automatic detection of better ordering.
- * 1 = order by ([node] distance, -zonetype)
- * 2 = order by (-zonetype, [node] distance)
- *
- * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- * the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT 0
-#define ZONELIST_ORDER_NODE 1
-#define ZONELIST_ORDER_ZONE 2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
#ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN 16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- * = "[dD]efault - default, automatic configuration.
- * = "[nN]ode - order by node locality, then by zone within node
- * = "[zZ]one - order by zone, then by locality within zone
- */
static int __parse_numa_zonelist_order(char *s)
{
- if (*s == 'd' || *s == 'D') {
- user_zonelist_order = ZONELIST_ORDER_DEFAULT;
- } else if (*s == 'n' || *s == 'N') {
- user_zonelist_order = ZONELIST_ORDER_NODE;
- } else if (*s == 'z' || *s == 'Z') {
- user_zonelist_order = ZONELIST_ORDER_ZONE;
- } else {
- pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
+ /*
+ * We used to support different zonlists modes but they turned
+ * out to be just not useful. Let's keep the warning in place
+ * if somebody still use the cmd line parameter so that we do
+ * not fail it silently
+ */
+ if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+ pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4844,19 +4940,15 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- int ret;
-
if (!s)
return 0;
- ret = __parse_numa_zonelist_order(s);
- if (ret == 0)
- strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
- return ret;
+ return __parse_numa_zonelist_order(s);
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
+char numa_zonelist_order[] = "Node";
+
/*
* sysctl handler for numa_zonelist_order
*/
@@ -4864,40 +4956,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
- char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ char *str;
int ret;
- static DEFINE_MUTEX(zl_order_mutex);
- mutex_lock(&zl_order_mutex);
- if (write) {
- if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
- ret = -EINVAL;
- goto out;
- }
- strcpy(saved_string, (char *)table->data);
- }
- ret = proc_dostring(table, write, buffer, length, ppos);
- if (ret)
- goto out;
- if (write) {
- int oldval = user_zonelist_order;
+ if (!write)
+ return proc_dostring(table, write, buffer, length, ppos);
+ str = memdup_user_nul(buffer, 16);
+ if (IS_ERR(str))
+ return PTR_ERR(str);
- ret = __parse_numa_zonelist_order((char *)table->data);
- if (ret) {
- /*
- * bogus value. restore saved string
- */
- strncpy((char *)table->data, saved_string,
- NUMA_ZONELIST_ORDER_LEN);
- user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
- }
-out:
- mutex_unlock(&zl_order_mutex);
+ ret = __parse_numa_zonelist_order(str);
+ kfree(str);
return ret;
}
@@ -4971,17 +5040,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+ unsigned nr_nodes)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int i;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
- ;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int nr_zones;
+
+ pg_data_t *node = NODE_DATA(node_order[i]);
+
+ nr_zones = build_zonerefs_node(node, zonerefs);
+ zonerefs += nr_zones;
+ }
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -4989,13 +5065,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -5004,79 +5081,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
-static int node_order[MAX_NUMNODES];
-
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
- int pos, j, node;
- int zone_type; /* needs to be signed */
- struct zone *z;
- struct zonelist *zonelist;
-
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- pos = 0;
- for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
- for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
- z = &NODE_DATA(node)->node_zones[zone_type];
- if (managed_zone(z)) {
- zoneref_set_zone(z,
- &zonelist->_zonerefs[pos++]);
- check_highest_zone(zone_type);
- }
- }
- }
- zonelist->_zonerefs[pos].zone = NULL;
- zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
- if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
- current_zonelist_order = default_zonelist_order();
- else
- current_zonelist_order = user_zonelist_order;
-}
static void build_zonelists(pg_data_t *pgdat)
{
- int i, node, load;
+ static int node_order[MAX_NUMNODES];
+ int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
- struct zonelist *zonelist;
- unsigned int order = current_zonelist_order;
-
- /* initialize zonelists */
- for (i = 0; i < MAX_ZONELISTS; i++) {
- zonelist = pgdat->node_zonelists + i;
- zonelist->_zonerefs[0].zone = NULL;
- zonelist->_zonerefs[0].zone_idx = 0;
- }
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5085,8 +5096,6 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- i = 0;
-
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
@@ -5097,19 +5106,12 @@ static void build_zonelists(pg_data_t *pgdat)
node_distance(local_node, prev_node))
node_load[node] = load;
+ node_order[nr_nodes++] = node;
prev_node = node;
load--;
- if (order == ZONELIST_ORDER_NODE)
- build_zonelists_in_node_order(pgdat, node);
- else
- node_order[i++] = node; /* remember order */
- }
-
- if (order == ZONELIST_ORDER_ZONE) {
- /* calculate node order -- i.e., DMA last! */
- build_zonelists_in_zone_order(pgdat, i);
}
+ build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
}
@@ -5135,21 +5137,17 @@ static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
-static void set_zonelist_order(void)
-{
- current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
/*
* Now we build the zonelist so that it contains the zones
@@ -5162,16 +5160,18 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
#endif /* CONFIG_NUMA */
@@ -5194,50 +5194,32 @@ static void build_zonelists(pg_data_t *pgdat)
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-static void setup_zone_pageset(struct zone *zone);
-
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
{
int nid;
- int cpu;
+ int __maybe_unused cpu;
pg_data_t *self = data;
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+ /*
+ * This node is hotadded and no memory is yet present. So just
+ * building zonelists is fine - no need to touch other nodes.
+ */
if (self && !node_online(self->node_id)) {
build_zonelists(self);
- }
-
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
-
- build_zonelists(pgdat);
- }
+ } else {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
- /*
- * Initialize the boot_pagesets that are going to be used
- * for bootstrapping processors. The real pagesets for
- * each zone will be allocated later when the per cpu
- * allocator is available.
- *
- * boot_pagesets are used also for bootstrapping offline
- * cpus if the system is already booted because the pagesets
- * are needed to initialize allocators on a specific cpu too.
- * F.e. the percpu allocator needs the page allocator which
- * needs the percpu allocator in order to allocate its pagesets
- * (a chicken-egg dilemma).
- */
- for_each_possible_cpu(cpu) {
- setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+ build_zonelists(pgdat);
+ }
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
@@ -5248,45 +5230,53 @@ static int __build_all_zonelists(void *data)
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
- if (cpu_online(cpu))
+ for_each_online_cpu(cpu)
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
- return 0;
+ spin_unlock(&lock);
}
static noinline void __init
build_all_zonelists_init(void)
{
+ int cpu;
+
__build_all_zonelists(NULL);
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu)
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}
/*
- * Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
*/
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
{
- set_zonelist_order();
-
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
-#ifdef CONFIG_MEMORY_HOTPLUG
- if (zone)
- setup_zone_pageset(zone);
-#endif
- /* we have to stop all cpus to guarantee there is no user
- of zonelist */
- stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
+ __build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -5302,9 +5292,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
@@ -5403,6 +5392,7 @@ not_early:
__init_single_page(page, pfn, zone, nid);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
} else {
__init_single_pfn(pfn, zone, nid);
}
@@ -5558,7 +5548,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
pageset_set_high_and_batch(zone, pcp);
}
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -6161,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
}
}
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
@@ -6170,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
if (!pgdat->node_spanned_pages)
return;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/* ia64 gets its own node_mem_map, before this, without bootmem */
@@ -6192,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
+ pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+ __func__, pgdat->node_id, (unsigned long)pgdat,
+ (unsigned long)pgdat->node_mem_map);
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
@@ -6204,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
+#else
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -6232,16 +6227,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
zones_size, zholes_size);
alloc_node_mem_map(pgdat);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
- nid, (unsigned long)pgdat,
- (unsigned long)pgdat->node_mem_map);
-#endif
reset_deferred_meminit(pgdat);
free_area_init_core(pgdat);
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Only struct pages that are backed by physical memory are zeroed and
+ * initialized by going through __init_single_page(). But, there are some
+ * struct pages which are reserved in memblock allocator and their fields
+ * may be accessed (for example page_to_pfn() on some configuration accesses
+ * flags). We must explicitly zero those struct pages.
+ */
+void __paginginit zero_resv_unavail(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ u64 i, pgcnt;
+
+ /*
+ * Loop through ranges that are reserved, but do not have reported
+ * physical memory backing.
+ */
+ pgcnt = 0;
+ for_each_resv_unavail_range(i, &start, &end) {
+ for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
+ mm_zero_struct_page(pfn_to_page(pfn));
+ pgcnt++;
+ }
+ }
+
+ /*
+ * Struct pages that do not have backing memory. This could be because
+ * firmware is using some of this memory, or for some other reasons.
+ * Once memblock is changed so such behaviour is not allowed: i.e.
+ * list of "reserved" memory must be a subset of list of "memory", then
+ * this code can be removed.
+ */
+ if (pgcnt)
+ pr_info("Reserved but unavailable: %lld pages", pgcnt);
+}
+#endif /* CONFIG_HAVE_MEMBLOCK */
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#if MAX_NUMNODES > 1
@@ -6665,6 +6693,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
+ zero_resv_unavail();
}
static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6828,6 +6857,7 @@ void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+ zero_resv_unavail();
}
static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7012,9 +7042,11 @@ static void __setup_per_zone_wmarks(void)
*/
void setup_per_zone_wmarks(void)
{
- mutex_lock(&zonelists_mutex);
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
__setup_per_zone_wmarks();
- mutex_unlock(&zonelists_mutex);
+ spin_unlock(&lock);
}
/*
@@ -7338,18 +7370,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
- /*
- * memblock allocator returns zeroed memory already, so HASH_ZERO is
- * currently not used when HASH_EARLY is specified.
- */
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
- if (flags & HASH_EARLY)
- table = memblock_virt_alloc_nopanic(size, 0);
- else if (hashdist)
+ if (flags & HASH_EARLY) {
+ if (flags & HASH_ZERO)
+ table = memblock_virt_alloc_nopanic(size, 0);
+ else
+ table = memblock_virt_alloc_raw(size, 0);
+ } else if (hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- else {
+ } else {
/*
* If bucketsize is not a power-of-two, we may free
* some pages at the end of hash table which
@@ -7386,10 +7417,10 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+ int migratetype,
bool skip_hwpoisoned_pages)
{
unsigned long pfn, iter, found;
- int mt;
/*
* For avoiding noise data, lru_add_drain_all() should be called
@@ -7397,8 +7428,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
*/
if (zone_idx(zone) == ZONE_MOVABLE)
return false;
- mt = get_pageblock_migratetype(page);
- if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
+
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark isolate
+ * CMA pageblocks even when they are not movable in fact so consider
+ * them movable here.
+ */
+ if (is_migrate_cma(migratetype) &&
+ is_migrate_cma(get_pageblock_migratetype(page)))
return false;
pfn = page_to_pfn(page);
@@ -7410,6 +7447,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
page = pfn_to_page(check);
+ if (PageReserved(page))
+ return true;
+
/*
* Hugepages are not in LRU lists, but they're movable.
* We need not scan over tail pages bacause we don't
@@ -7483,7 +7523,7 @@ bool is_pageblock_removable_nolock(struct page *page)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, true);
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
}
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
@@ -7579,6 +7619,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
+ .no_set_skip_hint = true,
.gfp_mask = current_gfp_context(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);
@@ -7666,7 +7707,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_info("%s: [%lx, %lx) PFNs busy\n",
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
goto done;