summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c757
1 files changed, 412 insertions, 345 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2302f250d6b1..77e4d3c5c57b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,8 @@
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/lockdep.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -113,9 +115,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
-#ifdef CONFIG_MOVABLE_NODE
[N_MEMORY] = { { [0] = 1UL } },
-#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
@@ -511,7 +511,7 @@ static int page_is_consistent(struct zone *zone, struct page *page)
/*
* Temporary debugging check for pages not lying within a given zone.
*/
-static int bad_range(struct zone *zone, struct page *page)
+static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
@@ -521,7 +521,7 @@ static int bad_range(struct zone *zone, struct page *page)
return 0;
}
#else
-static inline int bad_range(struct zone *zone, struct page *page)
+static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
return 0;
}
@@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn)
{
pg_data_t *pgdat;
int nid, zid;
@@ -1297,8 +1297,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
int nid;
@@ -1320,8 +1321,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
return true;
}
@@ -1365,7 +1367,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
return NULL;
- start_page = pfn_to_page(start_pfn);
+ start_page = pfn_to_online_page(start_pfn);
+ if (!start_page)
+ return NULL;
if (page_zone(start_page) != zone)
return NULL;
@@ -1582,6 +1586,10 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+ /* Discard memblock private memory */
+ memblock_discard();
+#endif
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -2204,19 +2212,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
*/
static inline bool
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
- unsigned int current_order;
+ int current_order;
struct page *page;
int fallback_mt;
bool can_steal;
- /* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1;
- current_order >= order && current_order <= MAX_ORDER-1;
+ /*
+ * Find the largest available free page in the other list. This roughly
+ * approximates finding the pageblock with the most free pages, which
+ * would be too costly to do exactly.
+ */
+ for (current_order = MAX_ORDER - 1; current_order >= order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2224,19 +2239,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
if (fallback_mt == -1)
continue;
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ /*
+ * We cannot steal all free pages from the pageblock and the
+ * requested migratetype is movable. In that case it's better to
+ * steal and split the smallest available page instead of the
+ * largest available page, because even if the next movable
+ * allocation falls back into a different pageblock than this
+ * one, it won't cause permanent fragmentation.
+ */
+ if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+ && current_order > order)
+ goto find_smallest;
- steal_suitable_fallback(zone, page, start_migratetype,
- can_steal);
+ goto do_steal;
+ }
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ return false;
- return true;
+find_smallest:
+ for (current_order = order; current_order < MAX_ORDER;
+ current_order++) {
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt != -1)
+ break;
}
- return false;
+ /*
+ * This should not happen - we already found a suitable fallback
+ * when looking for the largest page.
+ */
+ VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+ page = list_first_entry(&area->free_list[fallback_mt],
+ struct page, lru);
+
+ steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+
+ return true;
+
}
/*
@@ -2491,9 +2537,14 @@ void drain_all_pages(struct zone *zone)
#ifdef CONFIG_HIBERNATION
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
void mark_free_pages(struct zone *zone)
{
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
@@ -2508,6 +2559,11 @@ void mark_free_pages(struct zone *zone)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
if (page_zone(page) != zone)
continue;
@@ -2521,8 +2577,13 @@ void mark_free_pages(struct zone *zone)
unsigned long i;
pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++)
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
}
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -2680,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order)
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
- enum zone_stat_item local_stat = NUMA_LOCAL;
+ enum numa_stat_item local_stat = NUMA_LOCAL;
if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
if (z->node == preferred_zone->node)
- __inc_zone_state(z, NUMA_HIT);
+ __inc_numa_state(z, NUMA_HIT);
else {
- __inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+ __inc_numa_state(z, NUMA_MISS);
+ __inc_numa_state(preferred_zone, NUMA_FOREIGN);
}
- __inc_zone_state(z, local_stat);
+ __inc_numa_state(z, local_stat);
#endif
}
@@ -2890,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
{
long min = mark;
int o;
- const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2903,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
* the high-atomic reserves. This will over-estimate the size of the
* atomic reserve but it avoids a search.
*/
- if (likely(!alloc_harder))
+ if (likely(!alloc_harder)) {
free_pages -= z->nr_reserved_highatomic;
- else
- min -= min / 4;
+ } else {
+ /*
+ * OOM victims can try even harder than normal ALLOC_HARDER
+ * users on the grounds that it's definitely going to be in
+ * the exit path shortly and free memory. Any allocation it
+ * makes during the free path will be small and short-lived.
+ */
+ if (alloc_flags & ALLOC_OOM)
+ min -= min / 2;
+ else
+ min -= min / 4;
+ }
+
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
@@ -3144,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
- if (test_thread_flag(TIF_MEMDIE) ||
+ if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3231,10 +3303,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
- * we're still under heavy pressure.
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
*/
- page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+ ~__GFP_DIRECT_RECLAIM, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -3244,6 +3319,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /*
+ * We have already exhausted all our reclaim opportunities without any
+ * success so it is time to admit defeat. We will skip the OOM killer
+ * because it is very likely that the caller has a more reasonable
+ * fallback than shooting a random task.
+ */
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
@@ -3373,7 +3456,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
}
/*
- * !costly requests are much more important than __GFP_REPEAT
+ * !costly requests are much more important than __GFP_RETRY_MAYFAIL
* costly ones because they are de facto nofail and invoke OOM
* killer to move on while costly can fail and users are ready
* to cope with that. 1/4 retries is rather arbitrary but we
@@ -3442,6 +3525,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
}
#endif /* CONFIG_COMPACTION */
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+ STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* this guy won't enter reclaim */
+ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return false;
+
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return false;
+
+ return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3456,7 +3580,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3464,7 +3588,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
ac->nodemask);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
@@ -3555,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
return alloc_flags;
}
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
{
- if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ if (!tsk_is_oom_victim(tsk))
+ return false;
+
+ /*
+ * !MMU doesn't have oom reaper so give access to memory reserves
+ * only to the thread with TIF_MEMDIE set
+ */
+ if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
return false;
+ return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return 0;
if (gfp_mask & __GFP_MEMALLOC)
- return true;
+ return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- return true;
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- return true;
+ return ALLOC_NO_WATERMARKS;
+ if (!in_interrupt()) {
+ if (current->flags & PF_MEMALLOC)
+ return ALLOC_NO_WATERMARKS;
+ else if (oom_reserves_allowed(current))
+ return ALLOC_OOM;
+ }
- return false;
+ return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!__gfp_pfmemalloc_flags(gfp_mask);
}
/*
@@ -3673,6 +3822,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false;
}
+static inline bool
+check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
+{
+ /*
+ * It's possible that cpuset's mems_allowed and the nodemask from
+ * mempolicy don't intersect. This should be normally dealt with by
+ * policy_nodemask(), but it's possible to race with cpuset update in
+ * such a way the check therein was true, and then it became false
+ * before we got our cpuset_mems_cookie here.
+ * This assumes that for all allocations, ac->nodemask can come only
+ * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
+ * when it does not intersect with the cpuset restrictions) or the
+ * caller can deal with a violated nodemask.
+ */
+ if (cpusets_enabled() && ac->nodemask &&
+ !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
+ ac->nodemask = NULL;
+ return true;
+ }
+
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ return true;
+
+ return false;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -3689,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
+ int reserve_flags;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3794,15 +3977,16 @@ retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
- if (gfp_pfmemalloc_allowed(gfp_mask))
- alloc_flags = ALLOC_NO_WATERMARKS;
+ reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+ if (reserve_flags)
+ alloc_flags = reserve_flags;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
@@ -3847,9 +4031,9 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_REPEAT
+ * __GFP_RETRY_MAYFAIL
*/
- if (costly_order && !(gfp_mask & __GFP_REPEAT))
+ if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3868,11 +4052,9 @@ retry:
&compaction_retries))
goto retry;
- /*
- * It's possible we raced with cpuset update so the OOM would be
- * premature (see below the nopage: label for full explanation).
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+
+ /* Deal with possible cpuset update races before we start OOM killing */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
@@ -3881,8 +4063,8 @@ retry:
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) &&
- (alloc_flags == ALLOC_NO_WATERMARKS ||
+ if (tsk_is_oom_victim(current) &&
+ (alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -3893,14 +4075,8 @@ retry:
}
nopage:
- /*
- * When updating a task's mems_allowed or mempolicy nodemask, it is
- * possible to race with parallel threads in such a way that our
- * allocation can fail while the mask is being updated. If we are about
- * to fail, check if the cpuset changed during allocation and if so,
- * retry.
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+ /* Deal with possible cpuset update races before we fail */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
@@ -3951,12 +4127,12 @@ got_pg:
}
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask,
+ int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->high_zoneidx = gfp_zone(gfp_mask);
- ac->zonelist = zonelist;
+ ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
@@ -3968,7 +4144,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags |= ALLOC_CPUSET;
}
- lockdep_trace_alloc(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
@@ -4001,16 +4178,17 @@ static inline void finalise_ac(gfp_t gfp_mask,
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+ gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
gfp_mask &= gfp_allowed_mask;
- if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+ alloc_mask = gfp_mask;
+ if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
finalise_ac(gfp_mask, order, &ac);
@@ -4370,7 +4548,7 @@ long si_mem_available(void)
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
- available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
@@ -4385,8 +4563,9 @@ long si_mem_available(void)
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+ wmark_low);
if (available < 0)
available = 0;
@@ -4398,7 +4577,7 @@ void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_page_state(NR_FREE_PAGES);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
@@ -4529,15 +4708,15 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_SLAB_RECLAIMABLE),
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state(NR_SLAB_RECLAIMABLE),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
- global_page_state(NR_PAGETABLE),
- global_page_state(NR_BOUNCE),
- global_page_state(NR_FREE_PAGES),
+ global_zone_page_state(NR_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_zone_page_state(NR_FREE_PAGES),
free_pcp,
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4614,8 +4793,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " slab_reclaimable:%lukB"
- " slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
@@ -4637,8 +4814,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
- K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
@@ -4703,18 +4878,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
*
* Add all populated zones of a node to the zonelist.
*/
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
+ int nr_zones = 0;
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (managed_zone(zone)) {
- zoneref_set_zone(zone,
- &zonelist->_zonerefs[nr_zones++]);
+ zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);
@@ -4722,52 +4896,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
return nr_zones;
}
-
-/*
- * zonelist_order:
- * 0 = automatic detection of better ordering.
- * 1 = order by ([node] distance, -zonetype)
- * 2 = order by (-zonetype, [node] distance)
- *
- * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- * the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT 0
-#define ZONELIST_ORDER_NODE 1
-#define ZONELIST_ORDER_ZONE 2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
#ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN 16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- * = "[dD]efault - default, automatic configuration.
- * = "[nN]ode - order by node locality, then by zone within node
- * = "[zZ]one - order by zone, then by locality within zone
- */
static int __parse_numa_zonelist_order(char *s)
{
- if (*s == 'd' || *s == 'D') {
- user_zonelist_order = ZONELIST_ORDER_DEFAULT;
- } else if (*s == 'n' || *s == 'N') {
- user_zonelist_order = ZONELIST_ORDER_NODE;
- } else if (*s == 'z' || *s == 'Z') {
- user_zonelist_order = ZONELIST_ORDER_ZONE;
- } else {
- pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
+ /*
+ * We used to support different zonlists modes but they turned
+ * out to be just not useful. Let's keep the warning in place
+ * if somebody still use the cmd line parameter so that we do
+ * not fail it silently
+ */
+ if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+ pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4775,19 +4915,15 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- int ret;
-
if (!s)
return 0;
- ret = __parse_numa_zonelist_order(s);
- if (ret == 0)
- strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
- return ret;
+ return __parse_numa_zonelist_order(s);
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
+char numa_zonelist_order[] = "Node";
+
/*
* sysctl handler for numa_zonelist_order
*/
@@ -4795,40 +4931,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
- char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ char *str;
int ret;
- static DEFINE_MUTEX(zl_order_mutex);
- mutex_lock(&zl_order_mutex);
- if (write) {
- if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
- ret = -EINVAL;
- goto out;
- }
- strcpy(saved_string, (char *)table->data);
- }
- ret = proc_dostring(table, write, buffer, length, ppos);
- if (ret)
- goto out;
- if (write) {
- int oldval = user_zonelist_order;
+ if (!write)
+ return proc_dostring(table, write, buffer, length, ppos);
+ str = memdup_user_nul(buffer, 16);
+ if (IS_ERR(str))
+ return PTR_ERR(str);
- ret = __parse_numa_zonelist_order((char *)table->data);
- if (ret) {
- /*
- * bogus value. restore saved string
- */
- strncpy((char *)table->data, saved_string,
- NUMA_ZONELIST_ORDER_LEN);
- user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
- }
-out:
- mutex_unlock(&zl_order_mutex);
+ ret = __parse_numa_zonelist_order(str);
+ kfree(str);
return ret;
}
@@ -4902,17 +5015,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+ unsigned nr_nodes)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int i;
+
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int nr_zones;
+
+ pg_data_t *node = NODE_DATA(node_order[i]);
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
- ;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ nr_zones = build_zonerefs_node(node, zonerefs);
+ zonerefs += nr_zones;
+ }
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -4920,13 +5040,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -4935,79 +5056,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
-static int node_order[MAX_NUMNODES];
-
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
- int pos, j, node;
- int zone_type; /* needs to be signed */
- struct zone *z;
- struct zonelist *zonelist;
-
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- pos = 0;
- for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
- for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
- z = &NODE_DATA(node)->node_zones[zone_type];
- if (managed_zone(z)) {
- zoneref_set_zone(z,
- &zonelist->_zonerefs[pos++]);
- check_highest_zone(zone_type);
- }
- }
- }
- zonelist->_zonerefs[pos].zone = NULL;
- zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
- if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
- current_zonelist_order = default_zonelist_order();
- else
- current_zonelist_order = user_zonelist_order;
-}
static void build_zonelists(pg_data_t *pgdat)
{
- int i, node, load;
+ static int node_order[MAX_NUMNODES];
+ int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
- struct zonelist *zonelist;
- unsigned int order = current_zonelist_order;
-
- /* initialize zonelists */
- for (i = 0; i < MAX_ZONELISTS; i++) {
- zonelist = pgdat->node_zonelists + i;
- zonelist->_zonerefs[0].zone = NULL;
- zonelist->_zonerefs[0].zone_idx = 0;
- }
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5016,8 +5071,6 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- i = 0;
-
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
@@ -5028,19 +5081,12 @@ static void build_zonelists(pg_data_t *pgdat)
node_distance(local_node, prev_node))
node_load[node] = load;
+ node_order[nr_nodes++] = node;
prev_node = node;
load--;
- if (order == ZONELIST_ORDER_NODE)
- build_zonelists_in_node_order(pgdat, node);
- else
- node_order[i++] = node; /* remember order */
- }
-
- if (order == ZONELIST_ORDER_ZONE) {
- /* calculate node order -- i.e., DMA last! */
- build_zonelists_in_zone_order(pgdat, i);
}
+ build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
}
@@ -5066,21 +5112,17 @@ static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
-static void set_zonelist_order(void)
-{
- current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
/*
* Now we build the zonelist so that it contains the zones
@@ -5093,16 +5135,18 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
#endif /* CONFIG_NUMA */
@@ -5124,50 +5168,33 @@ static void build_zonelists(pg_data_t *pgdat)
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
-static void setup_zone_pageset(struct zone *zone);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
{
int nid;
- int cpu;
+ int __maybe_unused cpu;
pg_data_t *self = data;
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+ /*
+ * This node is hotadded and no memory is yet present. So just
+ * building zonelists is fine - no need to touch other nodes.
+ */
if (self && !node_online(self->node_id)) {
build_zonelists(self);
- }
-
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
-
- build_zonelists(pgdat);
- }
+ } else {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
- /*
- * Initialize the boot_pagesets that are going to be used
- * for bootstrapping processors. The real pagesets for
- * each zone will be allocated later when the per cpu
- * allocator is available.
- *
- * boot_pagesets are used also for bootstrapping offline
- * cpus if the system is already booted because the pagesets
- * are needed to initialize allocators on a specific cpu too.
- * F.e. the percpu allocator needs the page allocator which
- * needs the percpu allocator in order to allocate its pagesets
- * (a chicken-egg dilemma).
- */
- for_each_possible_cpu(cpu) {
- setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+ build_zonelists(pgdat);
+ }
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
@@ -5178,45 +5205,53 @@ static int __build_all_zonelists(void *data)
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
- if (cpu_online(cpu))
+ for_each_online_cpu(cpu)
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
- return 0;
+ spin_unlock(&lock);
}
static noinline void __init
build_all_zonelists_init(void)
{
+ int cpu;
+
__build_all_zonelists(NULL);
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu)
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}
/*
- * Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
*/
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
{
- set_zonelist_order();
-
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
-#ifdef CONFIG_MEMORY_HOTPLUG
- if (zone)
- setup_zone_pageset(zone);
-#endif
- /* we have to stop all cpus to guarantee there is no user
- of zonelist */
- stop_machine(__build_all_zonelists, pgdat, NULL);
+ __build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -5232,9 +5267,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
@@ -5333,6 +5367,7 @@ not_early:
__init_single_page(page, pfn, zone, nid);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
} else {
__init_single_pfn(pfn, zone, nid);
}
@@ -5488,7 +5523,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
pageset_set_high_and_batch(zone, pcp);
}
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -5528,7 +5563,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone_batchsize(zone));
}
-int __meminit init_currently_empty_zone(struct zone *zone,
+void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size)
{
@@ -5546,8 +5581,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
zone_init_free_lists(zone);
zone->initialized = 1;
-
- return 0;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6005,7 +6038,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- int ret;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6027,6 +6059,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
+ pgdat->per_cpu_nodestats = &boot_nodestats;
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
@@ -6087,8 +6121,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
- BUG_ON(ret);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -6944,9 +6977,11 @@ static void __setup_per_zone_wmarks(void)
*/
void setup_per_zone_wmarks(void)
{
- mutex_lock(&zonelists_mutex);
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
__setup_per_zone_wmarks();
- mutex_unlock(&zonelists_mutex);
+ spin_unlock(&lock);
}
/*
@@ -7182,6 +7217,21 @@ static unsigned long __init arch_reserved_kernel_pages(void)
#endif
/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
* quantity of entries
@@ -7200,6 +7250,7 @@ void *__init alloc_large_system_hash(const char *tablename,
unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
+ gfp_t gfp_flags;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -7211,6 +7262,16 @@ void *__init alloc_large_system_hash(const char *tablename,
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+#if __BITS_PER_LONG > 32
+ if (!high_limit) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+#endif
+
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
@@ -7244,12 +7305,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
+ /*
+ * memblock allocator returns zeroed memory already, so HASH_ZERO is
+ * currently not used when HASH_EARLY is specified.
+ */
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
- table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -7257,8 +7323,8 @@ void *__init alloc_large_system_hash(const char *tablename,
* alloc_pages_exact() automatically does
*/
if (get_order(size) < MAX_ORDER) {
- table = alloc_pages_exact(size, GFP_ATOMIC);
- kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
}
}
} while (!table && size > PAGE_SIZE && --log2qty);
@@ -7567,7 +7633,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_info("%s: [%lx, %lx) PFNs busy\n",
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
goto done;
@@ -7660,6 +7726,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
break;
if (pfn == end_pfn)
return;
+ offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
pfn = start_pfn;