Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 757 |
1 file changed, 412 insertions(+), 345 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2302f250d6b1..77e4d3c5c57b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,8 @@
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
+#include <linux/lockdep.h>
+#include <linux/nmi.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -113,9 +115,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 #ifdef CONFIG_HIGHMEM
 	[N_HIGH_MEMORY] = { { [0] = 1UL } },
 #endif
-#ifdef CONFIG_MOVABLE_NODE
 	[N_MEMORY] = { { [0] = 1UL } },
-#endif
 	[N_CPU] = { { [0] = 1UL } },
 #endif	/* NUMA */
 };
@@ -511,7 +511,7 @@ static int page_is_consistent(struct zone *zone, struct page *page)
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
-static int bad_range(struct zone *zone, struct page *page)
+static int __maybe_unused bad_range(struct zone *zone, struct page *page)
 {
 	if (page_outside_zone_boundaries(zone, page))
 		return 1;
@@ -521,7 +521,7 @@ static int bad_range(struct zone *zone, struct page *page)
 	return 0;
 }
 #else
-static inline int bad_range(struct zone *zone, struct page *page)
+static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
 {
 	return 0;
 }
@@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn)
 {
 	pg_data_t *pgdat;
 	int nid, zid;
@@ -1297,8 +1297,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
 #endif
 
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
-					struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+		   struct mminit_pfnnid_cache *state)
 {
 	int nid;
 
@@ -1320,8 +1321,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 {
 	return true;
 }
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
-					struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+		   struct mminit_pfnnid_cache *state)
 {
 	return true;
 }
@@ -1365,7 +1367,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
 	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
 		return NULL;
 
-	start_page = pfn_to_page(start_pfn);
+	start_page = pfn_to_online_page(start_pfn);
+	if (!start_page)
+		return NULL;
 
 	if (page_zone(start_page) != zone)
 		return NULL;
@@ -1582,6 +1586,10 @@ void __init page_alloc_init_late(void)
 	/* Reinit limits that are based on free pages after the kernel is up */
 	files_maxfiles_init();
 #endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+	/* Discard memblock private memory */
+	memblock_discard();
+#endif
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
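The __pageblock_pfn_to_page() hunk above swaps a raw pfn_to_page() for pfn_to_online_page(), which returns NULL for PFNs whose memory section is offline. A minimal sketch of the resulting defensive pattern (the helper name is illustrative, not from the patch):

    /*
     * Illustrative helper (not from the patch): resolve a PFN to its
     * struct page only when the backing memory section is online, and
     * verify the page belongs to the expected zone before using it.
     */
    static struct page *pfn_to_page_in_zone(unsigned long pfn, struct zone *zone)
    {
    	struct page *page = pfn_to_online_page(pfn); /* NULL if offline */

    	if (!page || page_zone(page) != zone)
    		return NULL;
    	return page;
    }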
@@ -2204,19 +2212,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
  * list of requested migratetype, possibly along with other pages from the same
  * block, depending on fragmentation avoidance heuristics. Returns true if
  * fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
  */
 static inline bool
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 {
 	struct free_area *area;
-	unsigned int current_order;
+	int current_order;
 	struct page *page;
 	int fallback_mt;
 	bool can_steal;
 
-	/* Find the largest possible block of pages in the other list */
-	for (current_order = MAX_ORDER-1;
-				current_order >= order && current_order <= MAX_ORDER-1;
+	/*
+	 * Find the largest available free page in the other list. This roughly
+	 * approximates finding the pageblock with the most free pages, which
+	 * would be too costly to do exactly.
+	 */
+	for (current_order = MAX_ORDER - 1; current_order >= order;
 				--current_order) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
@@ -2224,19 +2239,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 		if (fallback_mt == -1)
 			continue;
 
-		page = list_first_entry(&area->free_list[fallback_mt],
-						struct page, lru);
+		/*
+		 * We cannot steal all free pages from the pageblock and the
+		 * requested migratetype is movable. In that case it's better to
+		 * steal and split the smallest available page instead of the
+		 * largest available page, because even if the next movable
+		 * allocation falls back into a different pageblock than this
+		 * one, it won't cause permanent fragmentation.
+		 */
+		if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+					&& current_order > order)
+			goto find_smallest;
 
-		steal_suitable_fallback(zone, page, start_migratetype,
-								can_steal);
+		goto do_steal;
+	}
 
-		trace_mm_page_alloc_extfrag(page, order, current_order,
-			start_migratetype, fallback_mt);
+	return false;
 
-		return true;
+find_smallest:
+	for (current_order = order; current_order < MAX_ORDER;
+							current_order++) {
+		area = &(zone->free_area[current_order]);
+		fallback_mt = find_suitable_fallback(area, current_order,
+				start_migratetype, false, &can_steal);
+		if (fallback_mt != -1)
+			break;
 	}
 
-	return false;
+	/*
+	 * This should not happen - we already found a suitable fallback
+	 * when looking for the largest page.
+	 */
+	VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+	page = list_first_entry(&area->free_list[fallback_mt],
+							struct page, lru);
+
+	steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+
+	trace_mm_page_alloc_extfrag(page, order, current_order,
+		start_migratetype, fallback_mt);
+
+	return true;
+
 }
 
 /*
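The new control flow searches top-down for the largest fallback page, but switches to a bottom-up search when only part of a pageblock can be stolen for a movable request. A stripped-down userspace sketch of that two-pass order selection (array indices stand in for free-list orders; all names are illustrative, not kernel API):

    #include <stdbool.h>

    #define MAX_ORDER 11

    /* free[o] != 0 means a suitable fallback page of order o exists */
    static int pick_fallback_order(const int free[MAX_ORDER], int order,
    			       bool whole_block, bool movable)
    {
    	int o;

    	/* Pass 1: prefer the largest available page (top-down). */
    	for (o = MAX_ORDER - 1; o >= order; o--) {
    		if (!free[o])
    			continue;
    		/* Partial steal for a movable request: split small instead. */
    		if (!whole_block && movable && o > order)
    			goto find_smallest;
    		return o;
    	}
    	return -1;

    find_smallest:
    	/* Pass 2: smallest page that still satisfies the request. */
    	for (o = order; o < MAX_ORDER; o++)
    		if (free[o])
    			return o;
    	return -1;	/* unreachable: pass 1 already found one */
    }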
@@ -2491,9 +2537,14 @@ void drain_all_pages(struct zone *zone)
 
 #ifdef CONFIG_HIBERNATION
 
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT	(128*1024)
+
 void mark_free_pages(struct zone *zone)
 {
-	unsigned long pfn, max_zone_pfn;
+	unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
 	unsigned long flags;
 	unsigned int order, t;
 	struct page *page;
@@ -2508,6 +2559,11 @@ void mark_free_pages(struct zone *zone)
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
 
+			if (!--page_count) {
+				touch_nmi_watchdog();
+				page_count = WD_PAGE_COUNT;
+			}
+
 			if (page_zone(page) != zone)
 				continue;
 
@@ -2521,8 +2577,13 @@ void mark_free_pages(struct zone *zone)
 			unsigned long i;
 
 			pfn = page_to_pfn(page);
-			for (i = 0; i < (1UL << order); i++)
+			for (i = 0; i < (1UL << order); i++) {
+				if (!--page_count) {
+					touch_nmi_watchdog();
+					page_count = WD_PAGE_COUNT;
+				}
 				swsusp_set_page_free(pfn_to_page(pfn + i));
+			}
 		}
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -2680,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order)
 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 {
 #ifdef CONFIG_NUMA
-	enum zone_stat_item local_stat = NUMA_LOCAL;
+	enum numa_stat_item local_stat = NUMA_LOCAL;
 
 	if (z->node != numa_node_id())
 		local_stat = NUMA_OTHER;
 
 	if (z->node == preferred_zone->node)
-		__inc_zone_state(z, NUMA_HIT);
+		__inc_numa_state(z, NUMA_HIT);
 	else {
-		__inc_zone_state(z, NUMA_MISS);
-		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
+		__inc_numa_state(z, NUMA_MISS);
+		__inc_numa_state(preferred_zone, NUMA_FOREIGN);
 	}
-	__inc_zone_state(z, local_stat);
+	__inc_numa_state(z, local_stat);
 #endif
 }
 
@@ -2890,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 {
 	long min = mark;
 	int o;
-	const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
 
 	/* free_pages may go negative - that's OK */
 	free_pages -= (1 << order) - 1;
@@ -2903,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 	 * the high-atomic reserves. This will over-estimate the size of the
 	 * atomic reserve but it avoids a search.
 	 */
-	if (likely(!alloc_harder))
+	if (likely(!alloc_harder)) {
 		free_pages -= z->nr_reserved_highatomic;
-	else
-		min -= min / 4;
+	} else {
+		/*
+		 * OOM victims can try even harder than normal ALLOC_HARDER
+		 * users on the grounds that it's definitely going to be in
+		 * the exit path shortly and free memory. Any allocation it
+		 * makes during the free path will be small and short-lived.
+		 */
+		if (alloc_flags & ALLOC_OOM)
+			min -= min / 2;
+		else
+			min -= min / 4;
+	}
 
 #ifdef CONFIG_CMA
 	/* If allocation can't use CMA areas don't use free CMA pages */
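The watermark discount above forms a small ladder: ordinary ALLOC_HARDER callers get the minimum watermark reduced by a quarter, while ALLOC_OOM victims get it halved, giving exiting tasks the deepest reserve access. A self-contained sketch of that arithmetic (the flag values are illustrative, not the kernel's):

    #define ALLOC_HARDER	0x1	/* illustrative values, not the kernel's */
    #define ALLOC_OOM	0x2

    /* Return the effective minimum watermark after reserve discounts. */
    static long effective_min(long min, unsigned int alloc_flags)
    {
    	if (alloc_flags & ALLOC_OOM)
    		return min - min / 2;	/* exiting OOM victim: deepest access */
    	if (alloc_flags & ALLOC_HARDER)
    		return min - min / 4;	/* atomic & co.: shallower access */
    	return min;			/* normal request: full watermark */
    }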
@@ -3144,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
 	 * of allowed nodes.
 	 */
 	if (!(gfp_mask & __GFP_NOMEMALLOC))
-		if (test_thread_flag(TIF_MEMDIE) ||
+		if (tsk_is_oom_victim(current) ||
 		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
 			filter &= ~SHOW_MEM_FILTER_NODES;
 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3231,10 +3303,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
 	 * here, this is only to catch a parallel oom killing, we must fail if
-	 * we're still under heavy pressure.
+	 * we're still under heavy pressure. But make sure that this reclaim
+	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+	 * allocation which will never fail due to oom_lock already held.
 	 */
-	page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
-					ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+				      ~__GFP_DIRECT_RECLAIM, order,
+				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
 	if (page)
 		goto out;
 
@@ -3244,6 +3319,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	/* The OOM killer will not help higher order allocs */
 	if (order > PAGE_ALLOC_COSTLY_ORDER)
 		goto out;
+	/*
+	 * We have already exhausted all our reclaim opportunities without any
+	 * success so it is time to admit defeat. We will skip the OOM killer
+	 * because it is very likely that the caller has a more reasonable
+	 * fallback than shooting a random task.
+	 */
+	if (gfp_mask & __GFP_RETRY_MAYFAIL)
+		goto out;
 	/* The OOM killer does not needlessly kill tasks for lowmem */
 	if (ac->high_zoneidx < ZONE_NORMAL)
 		goto out;
@@ -3373,7 +3456,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 	}
 
 	/*
-	 * !costly requests are much more important than __GFP_REPEAT
+	 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
 	 * costly ones because they are de facto nofail and invoke OOM
 	 * killer to move on while costly can fail and users are ready
 	 * to cope with that. 1/4 retries is rather arbitrary but we
@@ -3442,6 +3525,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags)
 }
 #endif /* CONFIG_COMPACTION */
 
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+	gfp_mask = current_gfp_context(gfp_mask);
+
+	/* no reclaim without waiting on it */
+	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+		return false;
+
+	/* this guy won't enter reclaim */
+	if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+		return false;
+
+	/* We're only interested in __GFP_FS allocations for now */
+	if (!(gfp_mask & __GFP_FS))
+		return false;
+
+	if (gfp_mask & __GFP_NOLOCKDEP)
+		return false;
+
+	return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+	if (__need_fs_reclaim(gfp_mask))
+		lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+	if (__need_fs_reclaim(gfp_mask))
+		lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
 /* Perform direct synchronous page reclaim */
 static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3456,7 +3580,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
 	noreclaim_flag = memalloc_noreclaim_save();
-	lockdep_set_current_reclaim_state(gfp_mask);
+	fs_reclaim_acquire(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
 
@@ -3464,7 +3588,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 					ac->nodemask);
 
 	current->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
+	fs_reclaim_release(gfp_mask);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	cond_resched();
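fs_reclaim_acquire/release teach lockdep the reclaim dependency: any lock taken inside reclaim (while the map is "held") must never be held around a __GFP_FS allocation. A hedged sketch of the deadlock pattern this catches — the filesystem side is illustrative pseudocode, and my_fs_lock is a hypothetical lock, not kernel API:

    /*
     * Illustrative only: a filesystem lock that the fs's shrinker also
     * takes. With the fs_reclaim map, lockdep reports the cycle
     *   my_fs_lock -> fs_reclaim -> my_fs_lock
     * at annotation time instead of the system deadlocking under memory
     * pressure.
     */
    static DEFINE_MUTEX(my_fs_lock);	/* hypothetical fs lock */

    static void fs_write_path(void)
    {
    	mutex_lock(&my_fs_lock);
    	/* BAD: GFP_KERNEL may recurse into reclaim -> our shrinker */
    	(void)kmalloc(64, GFP_KERNEL);	/* should be GFP_NOFS here */
    	mutex_unlock(&my_fs_lock);
    }

    static void fs_shrinker_path(void)
    {
    	/* called from reclaim, i.e. with fs_reclaim "held" by lockdep */
    	mutex_lock(&my_fs_lock);
    	/* ... free cached objects ... */
    	mutex_unlock(&my_fs_lock);
    }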
@@ -3555,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	return alloc_flags;
 }
 
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
 {
-	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+	if (!tsk_is_oom_victim(tsk))
+		return false;
+
+	/*
+	 * !MMU doesn't have oom reaper so give access to memory reserves
+	 * only to the thread with TIF_MEMDIE set
+	 */
+	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
 		return false;
 
+	return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+		return 0;
 	if (gfp_mask & __GFP_MEMALLOC)
-		return true;
+		return ALLOC_NO_WATERMARKS;
 	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
-		return true;
-	if (!in_interrupt() &&
-			((current->flags & PF_MEMALLOC) ||
-			 unlikely(test_thread_flag(TIF_MEMDIE))))
-		return true;
+		return ALLOC_NO_WATERMARKS;
+	if (!in_interrupt()) {
+		if (current->flags & PF_MEMALLOC)
+			return ALLOC_NO_WATERMARKS;
+		else if (oom_reserves_allowed(current))
+			return ALLOC_OOM;
+	}
 
-	return false;
+	return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return !!__gfp_pfmemalloc_flags(gfp_mask);
 }
 
 /*
@@ -3673,6 +3822,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	return false;
 }
 
+static inline bool
+check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
+{
+	/*
+	 * It's possible that cpuset's mems_allowed and the nodemask from
+	 * mempolicy don't intersect. This should be normally dealt with by
+	 * policy_nodemask(), but it's possible to race with cpuset update in
+	 * such a way the check therein was true, and then it became false
+	 * before we got our cpuset_mems_cookie here.
+	 * This assumes that for all allocations, ac->nodemask can come only
+	 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
+	 * when it does not intersect with the cpuset restrictions) or the
+	 * caller can deal with a violated nodemask.
+	 */
+	if (cpusets_enabled() && ac->nodemask &&
+			!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
+		ac->nodemask = NULL;
+		return true;
+	}
+
+	/*
+	 * When updating a task's mems_allowed or mempolicy nodemask, it is
+	 * possible to race with parallel threads in such a way that our
+	 * allocation can fail while the mask is being updated. If we are about
+	 * to fail, check if the cpuset changed during allocation and if so,
+	 * retry.
+	 */
+	if (read_mems_allowed_retry(cpuset_mems_cookie))
+		return true;
+
+	return false;
+}
+
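The __gfp_pfmemalloc_flags() hunk above replaces a boolean with a three-level reserve lattice: full access (ALLOC_NO_WATERMARKS), partial access for OOM victims (ALLOC_OOM), or none. A compact userspace model of the decision order, with booleans standing in for gfp bits and task state (all names illustrative):

    #include <stdbool.h>

    enum reserve_level { RESERVES_NONE = 0, RESERVES_OOM, RESERVES_ALL };

    /*
     * Model of the lattice: __GFP_NOMEMALLOC always wins, __GFP_MEMALLOC
     * and PF_MEMALLOC grant everything, and OOM victims get the partial
     * level, in that precedence order.
     */
    static enum reserve_level reserve_level(bool nomemalloc, bool memalloc,
    					bool pf_memalloc, bool oom_victim)
    {
    	if (nomemalloc)
    		return RESERVES_NONE;
    	if (memalloc || pf_memalloc)
    		return RESERVES_ALL;
    	if (oom_victim)
    		return RESERVES_OOM;
    	return RESERVES_NONE;
    }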
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
@@ -3689,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long alloc_start = jiffies;
 	unsigned int stall_timeout = 10 * HZ;
 	unsigned int cpuset_mems_cookie;
+	int reserve_flags;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3794,15 +3977,16 @@ retry:
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
 		wake_all_kswapds(order, ac);
 
-	if (gfp_pfmemalloc_allowed(gfp_mask))
-		alloc_flags = ALLOC_NO_WATERMARKS;
+	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+	if (reserve_flags)
+		alloc_flags = reserve_flags;
 
 	/*
 	 * Reset the zonelist iterators if memory policies can be ignored.
 	 * These allocations are high priority and system rather than user
 	 * orientated.
 	 */
-	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
 		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
 		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
 					ac->high_zoneidx, ac->nodemask);
@@ -3847,9 +4031,9 @@ retry:
 
 	/*
 	 * Do not retry costly high order allocations unless they are
-	 * __GFP_REPEAT
+	 * __GFP_RETRY_MAYFAIL
 	 */
-	if (costly_order && !(gfp_mask & __GFP_REPEAT))
+	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
 		goto nopage;
 
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3868,11 +4052,9 @@ retry:
 				&compaction_retries))
 		goto retry;
 
-	/*
-	 * It's possible we raced with cpuset update so the OOM would be
-	 * premature (see below the nopage: label for full explanation).
-	 */
-	if (read_mems_allowed_retry(cpuset_mems_cookie))
+
+	/* Deal with possible cpuset update races before we start OOM killing */
+	if (check_retry_cpuset(cpuset_mems_cookie, ac))
 		goto retry_cpuset;
 
 	/* Reclaim has failed us, start killing things */
@@ -3881,8 +4063,8 @@ retry:
 		goto got_pg;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE) &&
-	    (alloc_flags == ALLOC_NO_WATERMARKS ||
+	if (tsk_is_oom_victim(current) &&
+	    (alloc_flags == ALLOC_OOM ||
 	     (gfp_mask & __GFP_NOMEMALLOC)))
 		goto nopage;
 
@@ -3893,14 +4075,8 @@ retry:
 	}
 
 nopage:
-	/*
-	 * When updating a task's mems_allowed or mempolicy nodemask, it is
-	 * possible to race with parallel threads in such a way that our
-	 * allocation can fail while the mask is being updated. If we are about
-	 * to fail, check if the cpuset changed during allocation and if so,
-	 * retry.
-	 */
-	if (read_mems_allowed_retry(cpuset_mems_cookie))
+	/* Deal with possible cpuset update races before we fail */
+	if (check_retry_cpuset(cpuset_mems_cookie, ac))
 		goto retry_cpuset;
 
 	/*
@@ -3951,12 +4127,12 @@ got_pg:
 }
 
 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist, nodemask_t *nodemask,
+		int preferred_nid, nodemask_t *nodemask,
 		struct alloc_context *ac, gfp_t *alloc_mask,
 		unsigned int *alloc_flags)
 {
 	ac->high_zoneidx = gfp_zone(gfp_mask);
-	ac->zonelist = zonelist;
+	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
 	ac->nodemask = nodemask;
 	ac->migratetype = gfpflags_to_migratetype(gfp_mask);
 
@@ -3968,7 +4144,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 		*alloc_flags |= ALLOC_CPUSET;
 	}
 
-	lockdep_trace_alloc(gfp_mask);
+	fs_reclaim_acquire(gfp_mask);
+	fs_reclaim_release(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
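__GFP_RETRY_MAYFAIL (replacing __GFP_REPEAT) promises strong retry effort but lets a costly allocation fail rather than invoking the OOM killer, which suits callers with their own fallback. A hedged caller-side sketch of that pattern (the helper name and sizing are illustrative; the kvmalloc() family packages a similar fallback):

    /*
     * Try hard for physically contiguous memory, but fall back to
     * vmalloc() rather than triggering the OOM killer on failure.
     */
    static void *alloc_table(size_t size)
    {
    	void *p;

    	p = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
    	if (!p)
    		p = vmalloc(size);	/* our fallback; no task was shot */
    	return p;
    }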
@@ -4001,16 +4178,17 @@ static inline void finalise_ac(gfp_t gfp_mask,
  * This is the 'heart' of the zoned buddy allocator.
  */
 struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+							nodemask_t *nodemask)
 {
 	struct page *page;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
-	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = { };
 
 	gfp_mask &= gfp_allowed_mask;
-	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+	alloc_mask = gfp_mask;
+	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
 		return NULL;
 
 	finalise_ac(gfp_mask, order, &ac);
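With this interface change, callers pass a preferred node id instead of resolving a zonelist themselves. A minimal usage sketch of the new entry point (in practice most callers go through the alloc_pages()/alloc_pages_node() wrappers, which do exactly this):

    /* Allocate one page, preferring the current NUMA node, any nodemask. */
    static struct page *grab_local_page(void)
    {
    	return __alloc_pages_nodemask(GFP_KERNEL, 0, numa_node_id(), NULL);
    }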
@@ -4370,7 +4548,7 @@ long si_mem_available(void)
 	 * Estimate the amount of memory available for userspace allocations,
 	 * without causing swapping.
 	 */
-	available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+	available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
 
 	/*
 	 * Not all the page cache can be freed, otherwise the system will
@@ -4385,8 +4563,9 @@ long si_mem_available(void)
 	 * Part of the reclaimable slab consists of items that are in use,
 	 * and cannot be freed. Cap this estimate at the low watermark.
 	 */
-	available += global_page_state(NR_SLAB_RECLAIMABLE) -
-		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+	available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+		     min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+			 wmark_low);
 
 	if (available < 0)
 		available = 0;
@@ -4398,7 +4577,7 @@ void si_meminfo(struct sysinfo *val)
 {
 	val->totalram = totalram_pages;
 	val->sharedram = global_node_page_state(NR_SHMEM);
-	val->freeram = global_page_state(NR_FREE_PAGES);
+	val->freeram = global_zone_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
 	val->totalhigh = totalhigh_pages;
 	val->freehigh = nr_free_highpages();
@@ -4529,15 +4708,15 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		global_node_page_state(NR_FILE_DIRTY),
 		global_node_page_state(NR_WRITEBACK),
 		global_node_page_state(NR_UNSTABLE_NFS),
-		global_page_state(NR_SLAB_RECLAIMABLE),
-		global_page_state(NR_SLAB_UNRECLAIMABLE),
+		global_node_page_state(NR_SLAB_RECLAIMABLE),
+		global_node_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_node_page_state(NR_FILE_MAPPED),
 		global_node_page_state(NR_SHMEM),
-		global_page_state(NR_PAGETABLE),
-		global_page_state(NR_BOUNCE),
-		global_page_state(NR_FREE_PAGES),
+		global_zone_page_state(NR_PAGETABLE),
+		global_zone_page_state(NR_BOUNCE),
+		global_zone_page_state(NR_FREE_PAGES),
 		free_pcp,
-		global_page_state(NR_FREE_CMA_PAGES));
+		global_zone_page_state(NR_FREE_CMA_PAGES));
 
 	for_each_online_pgdat(pgdat) {
 		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4614,8 +4793,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			" present:%lukB"
 			" managed:%lukB"
 			" mlocked:%lukB"
-			" slab_reclaimable:%lukB"
-			" slab_unreclaimable:%lukB"
 			" kernel_stack:%lukB"
 			" pagetables:%lukB"
 			" bounce:%lukB"
@@ -4637,8 +4814,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(zone->present_pages),
 			K(zone->managed_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
-			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
-			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
 			zone_page_state(zone, NR_KERNEL_STACK_KB),
 			K(zone_page_state(zone, NR_PAGETABLE)),
 			K(zone_page_state(zone, NR_BOUNCE)),
@@ -4703,18 +4878,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
-				int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
 {
 	struct zone *zone;
 	enum zone_type zone_type = MAX_NR_ZONES;
+	int nr_zones = 0;
 
 	do {
 		zone_type--;
 		zone = pgdat->node_zones + zone_type;
 		if (managed_zone(zone)) {
-			zoneref_set_zone(zone,
-				&zonelist->_zonerefs[nr_zones++]);
+			zoneref_set_zone(zone, &zonerefs[nr_zones++]);
 			check_highest_zone(zone_type);
 		}
 	} while (zone_type);
@@ -4722,52 +4896,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
 	return nr_zones;
 }
 
-
-/*
- *  zonelist_order:
- *  0 = automatic detection of better ordering.
- *  1 = order by ([node] distance, -zonetype)
- *  2 = order by (-zonetype, [node] distance)
- *
- *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- *  the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT  0
-#define ZONELIST_ORDER_NODE     1
-#define ZONELIST_ORDER_ZONE     2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
 #ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN	16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- *	= "[dD]efault	- default, automatic configuration.
- *	= "[nN]ode	- order by node locality, then by zone within node
- *	= "[zZ]one	- order by zone, then by locality within zone
- */
 
 static int __parse_numa_zonelist_order(char *s)
 {
-	if (*s == 'd' || *s == 'D') {
-		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-	} else if (*s == 'n' || *s == 'N') {
-		user_zonelist_order = ZONELIST_ORDER_NODE;
-	} else if (*s == 'z' || *s == 'Z') {
-		user_zonelist_order = ZONELIST_ORDER_ZONE;
-	} else {
-		pr_warn("Ignoring invalid numa_zonelist_order value:  %s\n", s);
+	/*
+	 * We used to support different zonelist modes but they turned
+	 * out to be just not useful. Let's keep the warning in place
+	 * if somebody still uses the cmd line parameter so that we do
+	 * not fail it silently
+	 */
+	if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+		pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
 		return -EINVAL;
 	}
 	return 0;
 }
@@ -4775,19 +4915,15 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-	int ret;
-
 	if (!s)
 		return 0;
 
-	ret = __parse_numa_zonelist_order(s);
-	if (ret == 0)
-		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
-	return ret;
+	return __parse_numa_zonelist_order(s);
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
+char numa_zonelist_order[] = "Node";
+
 /*
  * sysctl handler for numa_zonelist_order
  */
@@ -4795,40 +4931,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
 		loff_t *ppos)
 {
-	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	char *str;
 	int ret;
-	static DEFINE_MUTEX(zl_order_mutex);
 
-	mutex_lock(&zl_order_mutex);
-	if (write) {
-		if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
-			ret = -EINVAL;
-			goto out;
-		}
-		strcpy(saved_string, (char *)table->data);
-	}
-	ret = proc_dostring(table, write, buffer, length, ppos);
-	if (ret)
-		goto out;
-	if (write) {
-		int oldval = user_zonelist_order;
+	if (!write)
+		return proc_dostring(table, write, buffer, length, ppos);
 
-		ret = __parse_numa_zonelist_order((char *)table->data);
-		if (ret) {
-			/*
-			 * bogus value.  restore saved string
-			 */
-			strncpy((char *)table->data, saved_string,
-				NUMA_ZONELIST_ORDER_LEN);
-			user_zonelist_order = oldval;
-		} else if (oldval != user_zonelist_order) {
-			mutex_lock(&zonelists_mutex);
-			build_all_zonelists(NULL, NULL);
-			mutex_unlock(&zonelists_mutex);
-		}
-	}
-out:
-	mutex_unlock(&zl_order_mutex);
+	str = memdup_user_nul(buffer, 16);
+	if (IS_ERR(str))
+		return PTR_ERR(str);
+
+	ret = __parse_numa_zonelist_order(str);
+	kfree(str);
 	return ret;
 }
 
@@ -4902,17 +5015,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
  * This results in maximum locality--normal zone overflows into local
 * DMA zone, if any--but risks exhausting DMA zone.
 */
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+		unsigned nr_nodes)
 {
-	int j;
-	struct zonelist *zonelist;
+	struct zoneref *zonerefs;
+	int i;
+
+	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+	for (i = 0; i < nr_nodes; i++) {
+		int nr_zones;
+
+		pg_data_t *node = NODE_DATA(node_order[i]);
 
-	zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
-		;
-	j = build_zonelists_node(NODE_DATA(node), zonelist, j);
-	zonelist->_zonerefs[j].zone = NULL;
-	zonelist->_zonerefs[j].zone_idx = 0;
+		nr_zones = build_zonerefs_node(node, zonerefs);
+		zonerefs += nr_zones;
+	}
+	zonerefs->zone = NULL;
+	zonerefs->zone_idx = 0;
 }
 
 /*
@@ -4920,13 +5040,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
  */
 static void build_thisnode_zonelists(pg_data_t *pgdat)
 {
-	int j;
-	struct zonelist *zonelist;
+	struct zoneref *zonerefs;
+	int nr_zones;
 
-	zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
-	j = build_zonelists_node(pgdat, zonelist, 0);
-	zonelist->_zonerefs[j].zone = NULL;
-	zonelist->_zonerefs[j].zone_idx = 0;
+	zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+	nr_zones = build_zonerefs_node(pgdat, zonerefs);
+	zonerefs += nr_zones;
+	zonerefs->zone = NULL;
+	zonerefs->zone_idx = 0;
 }
 
 /*
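Zonelists are now built as a flat, NULL-terminated array of zonerefs filled in one pass. A sketch of how a consumer walks such an array (simplified; the kernel's for_each_zone_zonelist() helpers are built on the same termination rule):

    /*
     * Walk a NULL-terminated zoneref array, visiting each zone in
     * fallback order. The terminating entry has ->zone == NULL and
     * ->zone_idx == 0, exactly as build_zonelists_in_node_order() above
     * writes it.
     */
    static void walk_zonerefs(struct zoneref *zonerefs)
    {
    	struct zoneref *z;

    	for (z = zonerefs; z->zone != NULL; z++)
    		pr_debug("zone %s (idx %d)\n", z->zone->name, z->zone_idx);
    }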
@@ -4935,79 +5056,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
  * exhausted, but results in overflowing to remote node while memory
  * may still exist in local DMA zone.
  */
-static int node_order[MAX_NUMNODES];
-
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
-	int pos, j, node;
-	int zone_type;		/* needs to be signed */
-	struct zone *z;
-	struct zonelist *zonelist;
-
-	zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-	pos = 0;
-	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
-		for (j = 0; j < nr_nodes; j++) {
-			node = node_order[j];
-			z = &NODE_DATA(node)->node_zones[zone_type];
-			if (managed_zone(z)) {
-				zoneref_set_zone(z,
-					&zonelist->_zonerefs[pos++]);
-				check_highest_zone(zone_type);
-			}
-		}
-	}
-	zonelist->_zonerefs[pos].zone = NULL;
-	zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
-	return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
-	return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
-	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
-		current_zonelist_order = default_zonelist_order();
-	else
-		current_zonelist_order = user_zonelist_order;
-}
 
 static void build_zonelists(pg_data_t *pgdat)
 {
-	int i, node, load;
+	static int node_order[MAX_NUMNODES];
+	int node, load, nr_nodes = 0;
 	nodemask_t used_mask;
 	int local_node, prev_node;
-	struct zonelist *zonelist;
-	unsigned int order = current_zonelist_order;
-
-	/* initialize zonelists */
-	for (i = 0; i < MAX_ZONELISTS; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->_zonerefs[0].zone = NULL;
-		zonelist->_zonerefs[0].zone_idx = 0;
-	}
 
 	/* NUMA-aware ordering of nodes */
 	local_node = pgdat->node_id;
@@ -5016,8 +5071,6 @@ static void build_zonelists(pg_data_t *pgdat)
 	nodes_clear(used_mask);
 
 	memset(node_order, 0, sizeof(node_order));
-	i = 0;
-
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		/*
 		 * We don't want to pressure a particular node.
@@ -5028,19 +5081,12 @@ static void build_zonelists(pg_data_t *pgdat)
 			    node_distance(local_node, prev_node))
 			node_load[node] = load;
 
+		node_order[nr_nodes++] = node;
 		prev_node = node;
 		load--;
-		if (order == ZONELIST_ORDER_NODE)
-			build_zonelists_in_node_order(pgdat, node);
-		else
-			node_order[i++] = node;	/* remember order */
-	}
-
-	if (order == ZONELIST_ORDER_ZONE) {
-		/* calculate node order -- i.e., DMA last! */
-		build_zonelists_in_zone_order(pgdat, i);
 	}
 
+	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
 	build_thisnode_zonelists(pgdat);
 }
 
@@ -5066,21 +5112,17 @@ static void setup_min_unmapped_ratio(void);
 static void setup_min_slab_ratio(void);
 #else	/* CONFIG_NUMA */
 
-static void set_zonelist_order(void)
-{
-	current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
-	enum zone_type j;
-	struct zonelist *zonelist;
+	struct zoneref *zonerefs;
+	int nr_zones;
 
 	local_node = pgdat->node_id;
 
-	zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-	j = build_zonelists_node(pgdat, zonelist, 0);
+	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+	nr_zones = build_zonerefs_node(pgdat, zonerefs);
+	zonerefs += nr_zones;
 
 	/*
 	 * Now we build the zonelist so that it contains the zones
@@ -5093,16 +5135,18 @@ static void build_zonelists(pg_data_t *pgdat)
 	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+		nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+		zonerefs += nr_zones;
 	}
 	for (node = 0; node < local_node; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+		nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+		zonerefs += nr_zones;
 	}
 
-	zonelist->_zonerefs[j].zone = NULL;
-	zonelist->_zonerefs[j].zone_idx = 0;
+	zonerefs->zone = NULL;
+	zonerefs->zone_idx = 0;
 }
 
 #endif	/* CONFIG_NUMA */
@@ -5124,50 +5168,33 @@ static void build_zonelists(pg_data_t *pgdat)
  */
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
-static void setup_zone_pageset(struct zone *zone);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
 {
 	int nid;
-	int cpu;
+	int __maybe_unused cpu;
 	pg_data_t *self = data;
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
 #endif
 
+	/*
+	 * This node is hotadded and no memory is yet present.   So just
+	 * building zonelists is fine - no need to touch other nodes.
	 */
 	if (self && !node_online(self->node_id)) {
 		build_zonelists(self);
-	}
-
-	for_each_online_node(nid) {
-		pg_data_t *pgdat = NODE_DATA(nid);
-
-		build_zonelists(pgdat);
-	}
+	} else {
+		for_each_online_node(nid) {
+			pg_data_t *pgdat = NODE_DATA(nid);
 
-	/*
-	 * Initialize the boot_pagesets that are going to be used
-	 * for bootstrapping processors. The real pagesets for
-	 * each zone will be allocated later when the per cpu
-	 * allocator is available.
-	 *
-	 * boot_pagesets are used also for bootstrapping offline
-	 * cpus if the system is already booted because the pagesets
-	 * are needed to initialize allocators on a specific cpu too.
-	 * F.e. the percpu allocator needs the page allocator which
-	 * needs the percpu allocator in order to allocate its pagesets
-	 * (a chicken-egg dilemma).
-	 */
-	for_each_possible_cpu(cpu) {
-		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+			build_zonelists(pgdat);
+		}
 
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 		/*
@@ -5178,45 +5205,53 @@ static int __build_all_zonelists(void *data)
 		 * secondary cpus' numa_mem as they come on-line.  During
 		 * node/memory hotplug, we'll fixup all on-line cpus.
 		 */
-		if (cpu_online(cpu))
+		for_each_online_cpu(cpu)
 			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
 #endif
 	}
 
-	return 0;
+	spin_unlock(&lock);
 }
 
 static noinline void __init
 build_all_zonelists_init(void)
 {
+	int cpu;
+
 	__build_all_zonelists(NULL);
+
+	/*
+	 * Initialize the boot_pagesets that are going to be used
+	 * for bootstrapping processors. The real pagesets for
+	 * each zone will be allocated later when the per cpu
+	 * allocator is available.
+	 *
+	 * boot_pagesets are used also for bootstrapping offline
+	 * cpus if the system is already booted because the pagesets
+	 * are needed to initialize allocators on a specific cpu too.
+	 * F.e. the percpu allocator needs the page allocator which
+	 * needs the percpu allocator in order to allocate its pagesets
+	 * (a chicken-egg dilemma).
+	 */
+	for_each_possible_cpu(cpu)
+		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
 	mminit_verify_zonelist();
 	cpuset_init_current_mems_allowed();
 }
 
 /*
- * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
 *
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
 * [protected by SYSTEM_BOOTING].
 */
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
 {
-	set_zonelist_order();
-
 	if (system_state == SYSTEM_BOOTING) {
 		build_all_zonelists_init();
 	} else {
-#ifdef CONFIG_MEMORY_HOTPLUG
-		if (zone)
-			setup_zone_pageset(zone);
-#endif
-		/* we have to stop all cpus to guarantee there is no user
-		   of zonelist */
-		stop_machine(__build_all_zonelists, pgdat, NULL);
+		__build_all_zonelists(pgdat);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
@@ -5232,9 +5267,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
 	else
 		page_group_by_mobility_disabled = 0;
 
-	pr_info("Built %i zonelists in %s order, mobility grouping %s.  Total pages: %ld\n",
+	pr_info("Built %i zonelists, mobility grouping %s.  Total pages: %ld\n",
 		nr_online_nodes,
-		zonelist_order_name[current_zonelist_order],
 		page_group_by_mobility_disabled ? "off" : "on",
 		vm_total_pages);
 #ifdef CONFIG_NUMA
@@ -5333,6 +5367,7 @@ not_early:
 
 			__init_single_page(page, pfn, zone, nid);
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+			cond_resched();
 		} else {
 			__init_single_pfn(pfn, zone, nid);
 		}
@@ -5488,7 +5523,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
 	pageset_set_high_and_batch(zone, pcp);
 }
 
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
 {
 	int cpu;
 	zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -5528,7 +5563,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
 					 zone_batchsize(zone));
 }
 
-int __meminit init_currently_empty_zone(struct zone *zone,
+void __meminit init_currently_empty_zone(struct zone *zone,
 					unsigned long zone_start_pfn,
 					unsigned long size)
 {
@@ -5546,8 +5581,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 	zone_init_free_lists(zone);
 	zone->initialized = 1;
-
-	return 0;
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6005,7 +6038,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
 	enum zone_type j;
 	int nid = pgdat->node_id;
-	int ret;
 
 	pgdat_resize_init(pgdat);
 #ifdef CONFIG_NUMA_BALANCING
@@ -6027,6 +6059,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 	spin_lock_init(&pgdat->lru_lock);
 	lruvec_init(node_lruvec(pgdat));
 
+	pgdat->per_cpu_nodestats = &boot_nodestats;
+
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, freesize, memmap_pages;
@@ -6087,8 +6121,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		set_pageblock_order();
 		setup_usemap(pgdat, zone, zone_start_pfn, size);
-		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
-		BUG_ON(ret);
+		init_currently_empty_zone(zone, zone_start_pfn, size);
 		memmap_init(size, nid, j, zone_start_pfn);
 	}
 }
@@ -6944,9 +6977,11 @@ static void __setup_per_zone_wmarks(void)
  */
 void setup_per_zone_wmarks(void)
 {
-	mutex_lock(&zonelists_mutex);
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 	__setup_per_zone_wmarks();
-	mutex_unlock(&zonelists_mutex);
+	spin_unlock(&lock);
 }
 
 /*
@@ -7182,6 +7217,21 @@ static unsigned long __init arch_reserved_kernel_pages(void)
 #endif
 
 /*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * a slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE	(64ul << 30)
+#define ADAPT_SCALE_SHIFT	2
+#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
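Concretely, the scale bump loop added below (in alloc_large_system_hash()) adds one to scale each time memory passes another quadrupling beyond 64G, halving the hash table's growth rate. A runnable standalone sketch of the same computation (PAGE_SHIFT of 12 assumed for the demo):

    #include <stdio.h>

    #define PAGE_SHIFT		12
    #define ADAPT_SCALE_BASE	(64UL << 30)	/* 64G, as above */
    #define ADAPT_SCALE_SHIFT	2		/* quadrupling steps */
    #define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)

    /* Extra scale (halvings of table growth) for a given page count. */
    static int adaptive_scale_bump(unsigned long numentries)
    {
    	unsigned long adapt;
    	int bump = 0;

    	for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
    	     adapt <<= ADAPT_SCALE_SHIFT)
    		bump++;
    	return bump;
    }

    int main(void)
    {
    	/* 64G of 4K pages -> no bump; 256G -> 1; 1T -> 2 */
    	printf("%d %d %d\n",
    	       adaptive_scale_bump(64UL << (30 - PAGE_SHIFT)),
    	       adaptive_scale_bump(256UL << (30 - PAGE_SHIFT)),
    	       adaptive_scale_bump(1UL << (40 - PAGE_SHIFT)));
    	return 0;
    }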
+/*
  * allocate a large system hash table from bootmem
  * - it is assumed that the hash table must contain an exact power-of-2
  *   quantity of entries
@@ -7200,6 +7250,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	unsigned long long max = high_limit;
 	unsigned long log2qty, size;
 	void *table = NULL;
+	gfp_t gfp_flags;
 
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
@@ -7211,6 +7262,16 @@ void *__init alloc_large_system_hash(const char *tablename,
 		if (PAGE_SHIFT < 20)
 			numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
 
+#if __BITS_PER_LONG > 32
+		if (!high_limit) {
+			unsigned long adapt;
+
+			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+			     adapt <<= ADAPT_SCALE_SHIFT)
+				scale++;
+		}
+#endif
+
 		/* limit to 1 bucket per 2^scale bytes of low memory */
 		if (scale > PAGE_SHIFT)
 			numentries >>= (scale - PAGE_SHIFT);
@@ -7244,12 +7305,17 @@ void *__init alloc_large_system_hash(const char *tablename,
 
 	log2qty = ilog2(numentries);
 
+	/*
+	 * memblock allocator returns zeroed memory already, so HASH_ZERO is
+	 * currently not used when HASH_EARLY is specified.
+	 */
+	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
 	do {
 		size = bucketsize << log2qty;
 		if (flags & HASH_EARLY)
 			table = memblock_virt_alloc_nopanic(size, 0);
 		else if (hashdist)
-			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+			table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
 		else {
 			/*
 			 * If bucketsize is not a power-of-two, we may free
@@ -7257,8 +7323,8 @@ void *__init alloc_large_system_hash(const char *tablename,
 			 * alloc_pages_exact() automatically does
 			 */
 			if (get_order(size) < MAX_ORDER) {
-				table = alloc_pages_exact(size, GFP_ATOMIC);
-				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+				table = alloc_pages_exact(size, gfp_flags);
+				kmemleak_alloc(table, size, 1, gfp_flags);
 			}
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
@@ -7567,7 +7633,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
 	/* Make sure the range is really isolated. */
 	if (test_pages_isolated(outer_start, end, false)) {
-		pr_info("%s: [%lx, %lx) PFNs busy\n",
+		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
 			__func__, outer_start, end);
 		ret = -EBUSY;
 		goto done;
@@ -7660,6 +7726,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 			break;
 	if (pfn == end_pfn)
 		return;
+	offline_mem_sections(pfn, end_pfn);
 	zone = page_zone(pfn_to_page(pfn));
 	spin_lock_irqsave(&zone->lock, flags);
 	pfn = start_pfn;