diff options
author | James Morris <james.l.morris@oracle.com> | 2017-11-29 12:47:41 +1100 |
---|---|---|
committer | James Morris <james.l.morris@oracle.com> | 2017-11-29 12:47:41 +1100 |
commit | cf40a76e7d5874bb25f4404eecc58a2e033af885 (patch) | |
tree | 8fd81cbea03c87b3d41d7ae5b1d11eadd35d6ef5 /mm/page_alloc.c | |
parent | ab5348c9c23cd253f5902980d2d8fe067dc24c82 (diff) | |
parent | 4fbd8d194f06c8a3fd2af1ce560ddb31f7ec8323 (diff) |
Merge tag 'v4.15-rc1' into next-seccomp
Linux 4.15-rc1
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 1011 |
1 files changed, 526 insertions, 485 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d30e914afb6..d4096f4a5c1f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -24,7 +24,6 @@ #include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/kasan.h> #include <linux/module.h> #include <linux/suspend.h> @@ -66,6 +65,8 @@ #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> +#include <linux/lockdep.h> +#include <linux/nmi.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -81,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); + #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. @@ -288,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. + */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -336,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -1011,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); /* * Check tail pages before head page information is cleared to @@ -1168,6 +1179,7 @@ static void free_one_page(struct zone *zone, static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1188,7 +1200,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void init_reserved_page(unsigned long pfn) +static void __meminit init_reserved_page(unsigned long pfn) { pg_data_t *pgdat; int nid, zid; @@ -1408,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone) } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __init deferred_free_range(struct page *page, - unsigned long pfn, int nr_pages) +static void __init deferred_free_range(unsigned long pfn, + unsigned long nr_pages) { - int i; + struct page *page; + unsigned long i; - if (!page) + if (!nr_pages) return; + page = pfn_to_page(pfn); + /* Free a large naturally-aligned chunk if possible */ if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { @@ -1441,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void) complete(&pgdat_init_all_done_comp); } +/* + * Helper for deferred_init_range, free the given range, reset the counters, and + * return number of pages freed. + */ +static inline unsigned long __init __def_free(unsigned long *nr_free, + unsigned long *free_base_pfn, + struct page **page) +{ + unsigned long nr = *nr_free; + + deferred_free_range(*free_base_pfn, nr); + *free_base_pfn = 0; + *nr_free = 0; + *page = NULL; + + return nr; +} + +static unsigned long __init deferred_init_range(int nid, int zid, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long free_base_pfn = 0; + unsigned long nr_pages = 0; + unsigned long nr_free = 0; + struct page *page = NULL; + unsigned long pfn; + + /* + * First we check if pfn is valid on architectures where it is possible + * to have holes within pageblock_nr_pages. On systems where it is not + * possible, this function is optimized out. + * + * Then, we check if a current large page is valid by only checking the + * validity of the head pfn. + * + * meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not + * belong to this memory node. + * + * Finally, we minimize pfn page lookups and scheduler checks by + * performing it only once every pageblock_nr_pages. + * + * We do it in two loops: first we initialize struct page, than free to + * buddy allocator, becuse while we are freeing pages we can access + * pages that are ahead (computing buddy page in __free_one_page()). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + if ((pfn & nr_pgmask) || pfn_valid(pfn)) { + if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + if (page && (pfn & nr_pgmask)) + page++; + else + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zid, nid); + cond_resched(); + } + } + } + + page = NULL; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (page && (pfn & nr_pgmask)) { + page++; + nr_free++; + } else { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + page = pfn_to_page(pfn); + free_base_pfn = pfn; + nr_free = 1; + cond_resched(); + } + } + /* Free the last block of pages to allocator */ + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; int nid = pgdat->node_id; - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long walk_start, walk_end; - int i, zid; + unsigned long spfn, epfn; + phys_addr_t spa, epa; + int zid; struct zone *zone; unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + u64 i; if (first_init_pfn == ULONG_MAX) { pgdat_init_report_one_done(); @@ -1475,83 +1580,12 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } + first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); - for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { - unsigned long pfn, end_pfn; - struct page *page = NULL; - struct page *free_base_page = NULL; - unsigned long free_base_pfn = 0; - int nr_to_free = 0; - - end_pfn = min(walk_end, zone_end_pfn(zone)); - pfn = first_init_pfn; - if (pfn < walk_start) - pfn = walk_start; - if (pfn < zone->zone_start_pfn) - pfn = zone->zone_start_pfn; - - for (; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - goto free_range; - - /* - * Ensure pfn_valid is checked every - * pageblock_nr_pages for memory holes - */ - if ((pfn & (pageblock_nr_pages - 1)) == 0) { - if (!pfn_valid(pfn)) { - page = NULL; - goto free_range; - } - } - - if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - page = NULL; - goto free_range; - } - - /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { - page++; - } else { - nr_pages += nr_to_free; - deferred_free_range(free_base_page, - free_base_pfn, nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - - page = pfn_to_page(pfn); - cond_resched(); - } - - if (page->flags) { - VM_BUG_ON(page_zone(page) != zone); - goto free_range; - } - - __init_single_page(page, pfn, zid, nid); - if (!free_base_page) { - free_base_page = page; - free_base_pfn = pfn; - nr_to_free = 0; - } - nr_to_free++; - - /* Where possible, batch up pages for a single free */ - continue; -free_range: - /* Free the current block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, - nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - } - /* Free the last block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, nr_to_free); - - first_init_pfn = max(end_pfn, first_init_pfn); + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + nr_pages += deferred_init_range(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ @@ -1584,6 +1618,10 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ + memblock_discard(); +#endif for_each_populated_zone(zone) set_zone_contiguous(zone); @@ -1786,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static inline +static __always_inline struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { @@ -1830,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { }; #ifdef CONFIG_CMA -static struct page *__rmqueue_cma_fallback(struct zone *zone, +static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, unsigned int order) { return __rmqueue_smallest(zone, order, MIGRATE_CMA); @@ -2211,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * deviation from the rest of this file, to make the for loop * condition simpler. */ -static inline bool +static __always_inline bool __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area *area; @@ -2283,8 +2321,8 @@ do_steal: * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) +static __always_inline struct page * +__rmqueue(struct zone *zone, unsigned int order, int migratetype) { struct page *page; @@ -2309,7 +2347,7 @@ retry: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, bool cold) + int migratetype) { int i, alloced = 0; @@ -2323,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, continue; /* - * Split buddy pages returned by expand() are received here - * in physical page order. The page is added to the callers and - * list and the list head then moves forward. From the callers - * perspective, the linked list is ordered by page number in - * some conditions. This is useful for IO devices that can - * merge IO requests if the physical pages are ordered - * properly. + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of + * caller's list. From the callers perspective, the linked list + * is ordered by page number under some conditions. This is + * useful for IO devices that can forward direction from the + * head, thus also in the physical page order. This is useful + * for IO devices that can merge IO requests if the physical + * pages are ordered properly. */ - if (likely(!cold)) - list_add(&page->lru, list); - else - list_add_tail(&page->lru, list); - list = &page->lru; + list_add_tail(&page->lru, list); alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -2531,9 +2566,14 @@ void drain_all_pages(struct zone *zone) #ifdef CONFIG_HIBERNATION +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. + */ +#define WD_PAGE_COUNT (128*1024) + void mark_free_pages(struct zone *zone) { - unsigned long pfn, max_zone_pfn; + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; unsigned long flags; unsigned int order, t; struct page *page; @@ -2548,6 +2588,11 @@ void mark_free_pages(struct zone *zone) if (pfn_valid(pfn)) { page = pfn_to_page(pfn); + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + if (page_zone(page) != zone) continue; @@ -2561,32 +2606,38 @@ void mark_free_pages(struct zone *zone) unsigned long i; pfn = page_to_pfn(page); - for (i = 0; i < (1UL << order); i++) + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } swsusp_set_page_free(pfn_to_page(pfn + i)); + } } } spin_unlock_irqrestore(&zone->lock, flags); } #endif /* CONFIG_PM */ -/* - * Free a 0-order page - * cold == true ? free a cold page : free a hot page - */ -void free_hot_cold_page(struct page *page, bool cold) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; - unsigned long flags; - unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pcp_prepare(page)) - return; + return false; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); + return true; +} + +static void free_unref_page_commit(struct page *page, unsigned long pfn) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + int migratetype; + + migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); /* @@ -2599,38 +2650,62 @@ void free_hot_cold_page(struct page *page, bool cold) if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(zone, page, pfn, 0, migratetype); - goto out; + return; } migratetype = MIGRATE_MOVABLE; } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (!cold) - list_add(&page->lru, &pcp->lists[migratetype]); - else - list_add_tail(&page->lru, &pcp->lists[migratetype]); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); pcp->count -= batch; } +} -out: +/* + * Free a 0-order page + */ +void free_unref_page(struct page *page) +{ + unsigned long flags; + unsigned long pfn = page_to_pfn(page); + + if (!free_unref_page_prepare(page, pfn)) + return; + + local_irq_save(flags); + free_unref_page_commit(page, pfn); local_irq_restore(flags); } /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, bool cold) +void free_unref_page_list(struct list_head *list) { struct page *page, *next; + unsigned long flags, pfn; + /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { - trace_mm_page_free_batched(page, cold); - free_hot_cold_page(page, cold); + pfn = page_to_pfn(page); + if (!free_unref_page_prepare(page, pfn)) + list_del(&page->lru); + set_page_private(page, pfn); } + + local_irq_save(flags); + list_for_each_entry_safe(page, next, list, lru) { + unsigned long pfn = page_private(page); + + set_page_private(page, 0); + trace_mm_page_free_batched(page); + free_unref_page_commit(page, pfn); + } + local_irq_restore(flags); } /* @@ -2648,15 +2723,6 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); -#ifdef CONFIG_KMEMCHECK - /* - * Split shadow pages too, because free(page[0]) would - * otherwise free the whole shadow. - */ - if (kmemcheck_page_is_tracked(page)) - split_page(virt_to_page(page[0].shadow), order); -#endif - for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); @@ -2720,24 +2786,28 @@ int __isolate_free_page(struct page *page, unsigned int order) static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA - enum zone_stat_item local_stat = NUMA_LOCAL; + enum numa_stat_item local_stat = NUMA_LOCAL; + + /* skip numa counters update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return; if (z->node != numa_node_id()) local_stat = NUMA_OTHER; if (z->node == preferred_zone->node) - __inc_zone_state(z, NUMA_HIT); + __inc_numa_state(z, NUMA_HIT); else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); + __inc_numa_state(z, NUMA_MISS); + __inc_numa_state(preferred_zone, NUMA_FOREIGN); } - __inc_zone_state(z, local_stat); + __inc_numa_state(z, local_stat); #endif } /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, - bool cold, struct per_cpu_pages *pcp, + struct per_cpu_pages *pcp, struct list_head *list) { struct page *page; @@ -2746,16 +2816,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, - migratetype, cold); + migratetype); if (unlikely(list_empty(list))) return NULL; } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; } while (check_new_pcp(page)); @@ -2770,14 +2836,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, { struct per_cpu_pages *pcp; struct list_head *list; - bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; unsigned long flags; local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); @@ -2930,7 +2995,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2943,10 +3008,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * the high-atomic reserves. This will over-estimate the size of the * atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!alloc_harder)) { free_pages -= z->nr_reserved_highatomic; - else - min -= min / 4; + } else { + /* + * OOM victims can try even harder than normal ALLOC_HARDER + * users on the grounds that it's definitely going to be in + * the exit path shortly and free memory. Any allocation it + * makes during the free path will be small and short-lived. + */ + if (alloc_flags & ALLOC_OOM) + min -= min / 2; + else + min -= min / 4; + } + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ @@ -2974,9 +3050,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -2988,6 +3061,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } @@ -3184,7 +3260,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) * of allowed nodes. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) - if (test_thread_flag(TIF_MEMDIE) || + if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -3203,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; - pr_warn("%s: ", current->comm); - va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_cont("%pV", &vaf); + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + current->comm, &vaf, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); va_end(args); - pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); - if (nodemask) - pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); - else - pr_cont("(null)\n"); - cpuset_print_current_mems_allowed(); dump_stack(); @@ -3271,10 +3341,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if - * we're still under heavy pressure. + * we're still under heavy pressure. But make sure that this reclaim + * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY + * allocation which will never fail due to oom_lock already held. */ - page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, - ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); + page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & + ~__GFP_DIRECT_RECLAIM, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); if (page) goto out; @@ -3490,6 +3563,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } #endif /* CONFIG_COMPACTION */ +#ifdef CONFIG_LOCKDEP +struct lockdep_map __fs_reclaim_map = + STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); + +static bool __need_fs_reclaim(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* this guy won't enter reclaim */ + if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return false; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return false; + + if (gfp_mask & __GFP_NOLOCKDEP) + return false; + + return true; +} + +void fs_reclaim_acquire(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_acquire(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_acquire); + +void fs_reclaim_release(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_release(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_release); +#endif + /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -3504,7 +3618,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(gfp_mask); + fs_reclaim_acquire(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3512,7 +3626,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -3603,21 +3717,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask) return alloc_flags; } -bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +static bool oom_reserves_allowed(struct task_struct *tsk) { - if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + if (!tsk_is_oom_victim(tsk)) + return false; + + /* + * !MMU doesn't have oom reaper so give access to memory reserves + * only to the thread with TIF_MEMDIE set + */ + if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) return false; + return true; +} + +/* + * Distinguish requests which really need access to full memory + * reserves from oom victims which can live with a portion of it + */ +static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) +{ + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return 0; if (gfp_mask & __GFP_MEMALLOC) - return true; + return ALLOC_NO_WATERMARKS; if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - return true; - if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - return true; + return ALLOC_NO_WATERMARKS; + if (!in_interrupt()) { + if (current->flags & PF_MEMALLOC) + return ALLOC_NO_WATERMARKS; + else if (oom_reserves_allowed(current)) + return ALLOC_OOM; + } - return false; + return 0; +} + +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!__gfp_pfmemalloc_flags(gfp_mask); } /* @@ -3767,9 +3906,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries; int no_progress_loops; - unsigned long alloc_start = jiffies; - unsigned int stall_timeout = 10 * HZ; unsigned int cpuset_mems_cookie; + int reserve_flags; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3875,15 +4013,16 @@ retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); - if (gfp_pfmemalloc_allowed(gfp_mask)) - alloc_flags = ALLOC_NO_WATERMARKS; + reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); + if (reserve_flags) + alloc_flags = reserve_flags; /* * Reset the zonelist iterators if memory policies can be ignored. * These allocations are high priority and system rather than user * orientated. */ - if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { + if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); @@ -3898,14 +4037,6 @@ retry: if (!can_direct_reclaim) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage; @@ -3960,8 +4091,8 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && - (alloc_flags == ALLOC_NO_WATERMARKS || + if (tsk_is_oom_victim(current) && + (alloc_flags == ALLOC_OOM || (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; @@ -4041,7 +4172,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - lockdep_trace_alloc(gfp_mask); + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); @@ -4079,10 +4211,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; + alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; @@ -4118,9 +4251,6 @@ out: page = NULL; } - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; @@ -4157,7 +4287,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -4215,7 +4345,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) unsigned int order = compound_order(page); if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -4443,7 +4573,7 @@ long si_mem_available(void) * Estimate the amount of memory available for userspace allocations, * without causing swapping. */ - available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will @@ -4458,8 +4588,9 @@ long si_mem_available(void) * Part of the reclaimable slab consists of items that are in use, * and cannot be freed. Cap this estimate at the low watermark. */ - available += global_page_state(NR_SLAB_RECLAIMABLE) - - min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + available += global_node_page_state(NR_SLAB_RECLAIMABLE) - + min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, + wmark_low); if (available < 0) available = 0; @@ -4471,7 +4602,7 @@ void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_page_state(NR_FREE_PAGES); + val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); @@ -4602,15 +4733,15 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), global_node_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_SLAB_RECLAIMABLE), - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_node_page_state(NR_SLAB_RECLAIMABLE), + global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE), - global_page_state(NR_FREE_PAGES), + global_zone_page_state(NR_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_zone_page_state(NR_FREE_PAGES), free_pcp, - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) @@ -4772,18 +4903,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * * Add all populated zones of a node to the zonelist. */ -static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones) +static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) { struct zone *zone; enum zone_type zone_type = MAX_NR_ZONES; + int nr_zones = 0; do { zone_type--; zone = pgdat->node_zones + zone_type; if (managed_zone(zone)) { - zoneref_set_zone(zone, - &zonelist->_zonerefs[nr_zones++]); + zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } } while (zone_type); @@ -4791,52 +4921,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, return nr_zones; } - -/* - * zonelist_order: - * 0 = automatic detection of better ordering. - * 1 = order by ([node] distance, -zonetype) - * 2 = order by (-zonetype, [node] distance) - * - * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create - * the same zonelist. So only NUMA can configure this param. - */ -#define ZONELIST_ORDER_DEFAULT 0 -#define ZONELIST_ORDER_NODE 1 -#define ZONELIST_ORDER_ZONE 2 - -/* zonelist order in the kernel. - * set_zonelist_order() will set this to NODE or ZONE. - */ -static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; -static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; - - #ifdef CONFIG_NUMA -/* The value user specified ....changed by config */ -static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; -/* string for sysctl */ -#define NUMA_ZONELIST_ORDER_LEN 16 -char numa_zonelist_order[16] = "default"; - -/* - * interface for configure zonelist ordering. - * command line option "numa_zonelist_order" - * = "[dD]efault - default, automatic configuration. - * = "[nN]ode - order by node locality, then by zone within node - * = "[zZ]one - order by zone, then by locality within zone - */ static int __parse_numa_zonelist_order(char *s) { - if (*s == 'd' || *s == 'D') { - user_zonelist_order = ZONELIST_ORDER_DEFAULT; - } else if (*s == 'n' || *s == 'N') { - user_zonelist_order = ZONELIST_ORDER_NODE; - } else if (*s == 'z' || *s == 'Z') { - user_zonelist_order = ZONELIST_ORDER_ZONE; - } else { - pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); + /* + * We used to support different zonlists modes but they turned + * out to be just not useful. Let's keep the warning in place + * if somebody still use the cmd line parameter so that we do + * not fail it silently + */ + if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { + pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4844,19 +4940,15 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - int ret; - if (!s) return 0; - ret = __parse_numa_zonelist_order(s); - if (ret == 0) - strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); - - return ret; + return __parse_numa_zonelist_order(s); } early_param("numa_zonelist_order", setup_numa_zonelist_order); +char numa_zonelist_order[] = "Node"; + /* * sysctl handler for numa_zonelist_order */ @@ -4864,40 +4956,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - char saved_string[NUMA_ZONELIST_ORDER_LEN]; + char *str; int ret; - static DEFINE_MUTEX(zl_order_mutex); - mutex_lock(&zl_order_mutex); - if (write) { - if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { - ret = -EINVAL; - goto out; - } - strcpy(saved_string, (char *)table->data); - } - ret = proc_dostring(table, write, buffer, length, ppos); - if (ret) - goto out; - if (write) { - int oldval = user_zonelist_order; + if (!write) + return proc_dostring(table, write, buffer, length, ppos); + str = memdup_user_nul(buffer, 16); + if (IS_ERR(str)) + return PTR_ERR(str); - ret = __parse_numa_zonelist_order((char *)table->data); - if (ret) { - /* - * bogus value. restore saved string - */ - strncpy((char *)table->data, saved_string, - NUMA_ZONELIST_ORDER_LEN); - user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } - } -out: - mutex_unlock(&zl_order_mutex); + ret = __parse_numa_zonelist_order(str); + kfree(str); return ret; } @@ -4971,17 +5040,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) * This results in maximum locality--normal zone overflows into local * DMA zone, if any--but risks exhausting DMA zone. */ -static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) +static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, + unsigned nr_nodes) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int i; - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) - ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + + for (i = 0; i < nr_nodes; i++) { + int nr_zones; + + pg_data_t *node = NODE_DATA(node_order[i]); + + nr_zones = build_zonerefs_node(node, zonerefs); + zonerefs += nr_zones; + } + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -4989,13 +5065,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) */ static void build_thisnode_zonelists(pg_data_t *pgdat) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; - zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -5004,79 +5081,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) * exhausted, but results in overflowing to remote node while memory * may still exist in local DMA zone. */ -static int node_order[MAX_NUMNODES]; - -static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) -{ - int pos, j, node; - int zone_type; /* needs to be signed */ - struct zone *z; - struct zonelist *zonelist; - - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - pos = 0; - for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { - for (j = 0; j < nr_nodes; j++) { - node = node_order[j]; - z = &NODE_DATA(node)->node_zones[zone_type]; - if (managed_zone(z)) { - zoneref_set_zone(z, - &zonelist->_zonerefs[pos++]); - check_highest_zone(zone_type); - } - } - } - zonelist->_zonerefs[pos].zone = NULL; - zonelist->_zonerefs[pos].zone_idx = 0; -} - -#if defined(CONFIG_64BIT) -/* - * Devices that require DMA32/DMA are relatively rare and do not justify a - * penalty to every machine in case the specialised case applies. Default - * to Node-ordering on 64-bit NUMA machines - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_NODE; -} -#else -/* - * On 32-bit, the Normal zone needs to be preserved for allocations accessible - * by the kernel. If processes running on node 0 deplete the low memory zone - * then reclaim will occur more frequency increasing stalls and potentially - * be easier to OOM if a large percentage of the zone is under writeback or - * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. - * Hence, default to zone ordering on 32-bit. - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_ZONE; -} -#endif /* CONFIG_64BIT */ - -static void set_zonelist_order(void) -{ - if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) - current_zonelist_order = default_zonelist_order(); - else - current_zonelist_order = user_zonelist_order; -} static void build_zonelists(pg_data_t *pgdat) { - int i, node, load; + static int node_order[MAX_NUMNODES]; + int node, load, nr_nodes = 0; nodemask_t used_mask; int local_node, prev_node; - struct zonelist *zonelist; - unsigned int order = current_zonelist_order; - - /* initialize zonelists */ - for (i = 0; i < MAX_ZONELISTS; i++) { - zonelist = pgdat->node_zonelists + i; - zonelist->_zonerefs[0].zone = NULL; - zonelist->_zonerefs[0].zone_idx = 0; - } /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; @@ -5085,8 +5096,6 @@ static void build_zonelists(pg_data_t *pgdat) nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - i = 0; - while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* * We don't want to pressure a particular node. @@ -5097,19 +5106,12 @@ static void build_zonelists(pg_data_t *pgdat) node_distance(local_node, prev_node)) node_load[node] = load; + node_order[nr_nodes++] = node; prev_node = node; load--; - if (order == ZONELIST_ORDER_NODE) - build_zonelists_in_node_order(pgdat, node); - else - node_order[i++] = node; /* remember order */ - } - - if (order == ZONELIST_ORDER_ZONE) { - /* calculate node order -- i.e., DMA last! */ - build_zonelists_in_zone_order(pgdat, i); } + build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); } @@ -5135,21 +5137,17 @@ static void setup_min_unmapped_ratio(void); static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ -static void set_zonelist_order(void) -{ - current_zonelist_order = ZONELIST_ORDER_ZONE; -} - static void build_zonelists(pg_data_t *pgdat) { int node, local_node; - enum zone_type j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; /* * Now we build the zonelist so that it contains the zones @@ -5162,16 +5160,18 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } #endif /* CONFIG_NUMA */ @@ -5194,50 +5194,32 @@ static void build_zonelists(pg_data_t *pgdat) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); -static void setup_zone_pageset(struct zone *zone); - -/* - * Global mutex to protect against size modification of zonelists - * as well as to serialize pageset setup for the new populated zone. - */ -DEFINE_MUTEX(zonelists_mutex); -/* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *data) +static void __build_all_zonelists(void *data) { int nid; - int cpu; + int __maybe_unused cpu; pg_data_t *self = data; + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif + /* + * This node is hotadded and no memory is yet present. So just + * building zonelists is fine - no need to touch other nodes. + */ if (self && !node_online(self->node_id)) { build_zonelists(self); - } - - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); - - build_zonelists(pgdat); - } + } else { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); - /* - * Initialize the boot_pagesets that are going to be used - * for bootstrapping processors. The real pagesets for - * each zone will be allocated later when the per cpu - * allocator is available. - * - * boot_pagesets are used also for bootstrapping offline - * cpus if the system is already booted because the pagesets - * are needed to initialize allocators on a specific cpu too. - * F.e. the percpu allocator needs the page allocator which - * needs the percpu allocator in order to allocate its pagesets - * (a chicken-egg dilemma). - */ - for_each_possible_cpu(cpu) { - setup_pageset(&per_cpu(boot_pageset, cpu), 0); + build_zonelists(pgdat); + } #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* @@ -5248,45 +5230,53 @@ static int __build_all_zonelists(void *data) * secondary cpus' numa_mem as they come on-line. During * node/memory hotplug, we'll fixup all on-line cpus. */ - if (cpu_online(cpu)) + for_each_online_cpu(cpu) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); #endif } - return 0; + spin_unlock(&lock); } static noinline void __init build_all_zonelists_init(void) { + int cpu; + __build_all_zonelists(NULL); + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } /* - * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. * - * __ref due to (1) call of __meminit annotated setup_zone_pageset - * [we're only called with non-NULL zone through __meminit paths] and - * (2) call of __init annotated helper build_all_zonelists_init + * __ref due to call of __init annotated helper build_all_zonelists_init * [protected by SYSTEM_BOOTING]. */ -void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) +void __ref build_all_zonelists(pg_data_t *pgdat) { - set_zonelist_order(); - if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { -#ifdef CONFIG_MEMORY_HOTPLUG - if (zone) - setup_zone_pageset(zone); -#endif - /* we have to stop all cpus to guarantee there is no user - of zonelist */ - stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); + __build_all_zonelists(pgdat); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -5302,9 +5292,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, - zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA @@ -5403,6 +5392,7 @@ not_early: __init_single_page(page, pfn, zone, nid); set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); } else { __init_single_pfn(pfn, zone, nid); } @@ -5558,7 +5548,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_high_and_batch(zone, pcp); } -static void __meminit setup_zone_pageset(struct zone *zone) +void __meminit setup_zone_pageset(struct zone *zone) { int cpu; zone->pageset = alloc_percpu(struct per_cpu_pageset); @@ -6161,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } +#ifdef CONFIG_FLAT_NODE_MEM_MAP static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; @@ -6170,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) if (!pgdat->node_spanned_pages) return; -#ifdef CONFIG_FLAT_NODE_MEM_MAP start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* ia64 gets its own node_mem_map, before this, without bootmem */ @@ -6192,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pgdat->node_id); pgdat->node_mem_map = map + offset; } + pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", + __func__, pgdat->node_id, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); #ifndef CONFIG_NEED_MULTIPLE_NODES /* * With no DISCONTIG, the global mem_map is just set as node 0's @@ -6204,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif -#endif /* CONFIG_FLAT_NODE_MEM_MAP */ } +#else +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) @@ -6232,16 +6227,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", - nid, (unsigned long)pgdat, - (unsigned long)pgdat->node_mem_map); -#endif reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } +#ifdef CONFIG_HAVE_MEMBLOCK +/* + * Only struct pages that are backed by physical memory are zeroed and + * initialized by going through __init_single_page(). But, there are some + * struct pages which are reserved in memblock allocator and their fields + * may be accessed (for example page_to_pfn() on some configuration accesses + * flags). We must explicitly zero those struct pages. + */ +void __paginginit zero_resv_unavail(void) +{ + phys_addr_t start, end; + unsigned long pfn; + u64 i, pgcnt; + + /* + * Loop through ranges that are reserved, but do not have reported + * physical memory backing. + */ + pgcnt = 0; + for_each_resv_unavail_range(i, &start, &end) { + for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + } + + /* + * Struct pages that do not have backing memory. This could be because + * firmware is using some of this memory, or for some other reasons. + * Once memblock is changed so such behaviour is not allowed: i.e. + * list of "reserved" memory must be a subset of list of "memory", then + * this code can be removed. + */ + if (pgcnt) + pr_info("Reserved but unavailable: %lld pages", pgcnt); +} +#endif /* CONFIG_HAVE_MEMBLOCK */ + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 @@ -6665,6 +6693,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + zero_resv_unavail(); } static int __init cmdline_parse_core(char *p, unsigned long *core) @@ -6828,6 +6857,7 @@ void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + zero_resv_unavail(); } static int page_alloc_cpu_dead(unsigned int cpu) @@ -7012,9 +7042,11 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { - mutex_lock(&zonelists_mutex); + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); __setup_per_zone_wmarks(); - mutex_unlock(&zonelists_mutex); + spin_unlock(&lock); } /* @@ -7338,18 +7370,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); - /* - * memblock allocator returns zeroed memory already, so HASH_ZERO is - * currently not used when HASH_EARLY is specified. - */ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; - if (flags & HASH_EARLY) - table = memblock_virt_alloc_nopanic(size, 0); - else if (hashdist) + if (flags & HASH_EARLY) { + if (flags & HASH_ZERO) + table = memblock_virt_alloc_nopanic(size, 0); + else + table = memblock_virt_alloc_raw(size, 0); + } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); - else { + } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which @@ -7386,10 +7417,10 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + int migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; - int mt; /* * For avoiding noise data, lru_add_drain_all() should be called @@ -7397,8 +7428,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, */ if (zone_idx(zone) == ZONE_MOVABLE) return false; - mt = get_pageblock_migratetype(page); - if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) + + /* + * CMA allocations (alloc_contig_range) really need to mark isolate + * CMA pageblocks even when they are not movable in fact so consider + * them movable here. + */ + if (is_migrate_cma(migratetype) && + is_migrate_cma(get_pageblock_migratetype(page))) return false; pfn = page_to_pfn(page); @@ -7410,6 +7447,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, page = pfn_to_page(check); + if (PageReserved(page)) + return true; + /* * Hugepages are not in LRU lists, but they're movable. * We need not scan over tail pages bacause we don't @@ -7483,7 +7523,7 @@ bool is_pageblock_removable_nolock(struct page *page) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, true); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); } #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) @@ -7579,6 +7619,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .no_set_skip_hint = true, .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); @@ -7666,7 +7707,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_info("%s: [%lx, %lx) PFNs busy\n", + pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", __func__, outer_start, end); ret = -EBUSY; goto done; |