diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 278 |
1 files changed, 165 insertions, 113 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07efbc3a8656..2302f250d6b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,7 @@ #include <linux/page_owner.h> #include <linux/kthread.h> #include <linux/memcontrol.h> +#include <linux/ftrace.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -291,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void reset_deferred_meminit(pg_data_t *pgdat) { + unsigned long max_initialise; + unsigned long reserved_lowmem; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate the all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, + pgdat->node_start_pfn + max_initialise); + max_initialise += reserved_lowmem; + + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -313,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - unsigned long max_initialise; - /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - (*nr_initialised)++; - if ((*nr_initialised > max_initialise) && + if ((*nr_initialised > pgdat->static_init_size) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -1090,14 +1102,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; bool isolated_pageblocks; spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1150,12 +1158,7 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); @@ -1698,10 +1701,10 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(bool poisoned) +static inline bool free_pages_prezeroed(void) { return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled() && poisoned; + page_poisoning_enabled(); } #ifdef CONFIG_DEBUG_VM @@ -1755,17 +1758,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { int i; - bool poisoned = true; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (poisoned) - poisoned &= page_is_poisoned(p); - } post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); @@ -1847,9 +1843,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1866,6 +1862,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1876,6 +1875,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1891,7 +1899,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1908,7 +1916,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1958,28 +1967,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type) + int start_type, bool whole_block) { unsigned int current_order = page_order(page); - int pages; + struct free_area *area; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ + if (is_migrate_highatomic(old_block_type)) + goto single_page; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return; + goto single_page; } - pages = move_freepages_block(zone, page, start_type); + /* We are not allowed to try stealing from the whole block */ + if (!whole_block) + goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + + /* moving whole block can fail due to zone boundary conditions */ + if (!free_pages) + goto single_page; + + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); + + return; + +single_page: + area = &zone->free_area[current_order]; + list_move(&page->lru, &area->free_list[start_type]); } /* @@ -2045,11 +2105,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2103,8 +2163,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. */ - if (get_pageblock_migratetype(page) == - MIGRATE_HIGHATOMIC) { + if (is_migrate_highatomic_page(page)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2127,7 +2186,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -2139,8 +2199,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. + */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2161,33 +2226,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) - steal_suitable_fallback(zone, page, start_migratetype); - - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2199,13 +2248,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -2502,7 +2552,7 @@ void free_hot_cold_page(struct page *page, bool cold) /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ @@ -2612,7 +2662,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -3110,8 +3160,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; pr_warn("%s: ", current->comm); @@ -3245,14 +3294,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned int noreclaim_flag; if (!order) return NULL; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3399,12 +3449,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, { struct reclaim_state reclaim_state; int progress; + unsigned int noreclaim_flag; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3414,7 +3465,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -3522,19 +3573,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) } /* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - -/* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. */ @@ -3579,13 +3623,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); @@ -3636,6 +3678,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3703,12 +3746,17 @@ retry_cpuset: /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3720,7 +3768,7 @@ retry_cpuset: * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3771,7 +3819,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3801,7 +3849,7 @@ retry: * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -3833,7 +3881,9 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE)) + if (test_thread_flag(TIF_MEMDIE) && + (alloc_flags == ALLOC_NO_WATERMARKS || + (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; /* Retry as long as the OOM killer is making progress */ @@ -3971,10 +4021,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. */ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -4247,7 +4299,8 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of counts pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: - * managed_pages - high_pages + * + * nr_free_zone_pages = managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { @@ -4509,7 +4562,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4532,8 +4584,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { @@ -6097,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; @@ -6119,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } @@ -7428,7 +7480,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); |