Diffstat (limited to 'mm/compaction.c')
-rw-r--r-- | mm/compaction.c | 250
1 file changed, 156 insertions(+), 94 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index c8bcdea15f5f..0fb3b89b3967 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -229,6 +229,33 @@ static void reset_cached_positions(struct zone *zone)
 		pageblock_start_pfn(zone_end_pfn(zone) - 1);
 }
 
+#ifdef CONFIG_SPARSEMEM
+/*
+ * If the PFN falls into an offline section, return the start PFN of the
+ * next online section. If the PFN falls into an online section or if
+ * there is no next online section, return 0.
+ */
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+	unsigned long start_nr = pfn_to_section_nr(start_pfn);
+
+	if (online_section_nr(start_nr))
+		return 0;
+
+	while (++start_nr <= __highest_present_section_nr) {
+		if (online_section_nr(start_nr))
+			return section_nr_to_pfn(start_nr);
+	}
+
+	return 0;
+}
+#else
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+	return 0;
+}
+#endif
+
 /*
  * Compound pages of >= pageblock_order should consistently be skipped until
  * released. It is always pointless to compact pages of such order (if they are
@@ -392,18 +419,14 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  * Sets the pageblock skip bit if it was clear. Note that this is a hint as
  * locks are not required for read/writers. Returns true if it was already set.
  */
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
-					unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
 {
 	bool skip;
 
-	/* Do no update if skip hint is being ignored */
+	/* Do not update if skip hint is being ignored */
 	if (cc->ignore_skip_hint)
 		return false;
 
-	if (!pageblock_aligned(pfn))
-		return false;
-
 	skip = get_pageblock_skip(page);
 	if (!skip && !cc->no_set_skip_hint)
 		set_pageblock_skip(page);
@@ -440,9 +463,6 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (cc->no_set_skip_hint)
 		return;
 
-	if (!page)
-		return;
-
 	set_pageblock_skip(page);
 
 	/* Update where async and sync compaction should restart */
@@ -470,8 +490,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
 {
 }
 
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
-					unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
 {
 	return false;
 }
@@ -745,8 +764,9 @@ isolate_freepages_range(struct compact_control *cc,
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(pg_data_t *pgdat)
+static bool too_many_isolated(struct compact_control *cc)
 {
+	pg_data_t *pgdat = cc->zone->zone_pgdat;
 	bool too_many;
 	unsigned long active, inactive, isolated;
@@ -758,6 +778,17 @@ static bool too_many_isolated(pg_data_t *pgdat)
 	isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
 			node_page_state(pgdat, NR_ISOLATED_ANON);
 
+	/*
+	 * Allow GFP_NOFS to isolate past the limit set for regular
+	 * compaction runs. This prevents an ABBA deadlock when other
+	 * compactors have already isolated to the limit, but are
+	 * blocked on filesystem locks held by the GFP_NOFS thread.
+	 */
+	if (cc->gfp_mask & __GFP_FS) {
+		inactive >>= 3;
+		active >>= 3;
+	}
+
 	too_many = isolated > (inactive + active) / 2;
 	if (!too_many)
 		wake_throttle_isolated(pgdat);
@@ -806,7 +837,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 	 * list by either parallel reclaimers or compaction. If there are,
 	 * delay for some time until fewer pages are isolated
 	 */
-	while (unlikely(too_many_isolated(pgdat))) {
+	while (unlikely(too_many_isolated(cc))) {
 		/* stop isolation if there are still pages not migrated */
 		if (cc->nr_migratepages)
 			return -EAGAIN;
@@ -1074,11 +1105,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 			lruvec_memcg_debug(lruvec, page_folio(page));
 
-			/* Try get exclusive access under lock */
-			if (!skip_updated) {
+			/*
+			 * Try get exclusive access under lock. If marked for
+			 * skip, the scan is aborted unless the current context
+			 * is a rescan to reach the end of the pageblock.
+			 */
+			if (!skip_updated && valid_page) {
 				skip_updated = true;
-				if (test_and_set_skip(cc, page, low_pfn))
+				if (test_and_set_skip(cc, valid_page) &&
+				    !cc->finish_pageblock) {
 					goto isolate_abort;
+				}
 			}
 
 			/*
@@ -1191,7 +1228,7 @@ isolate_abort:
 	 * rescanned twice in a row.
 	 */
 	if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
-		if (valid_page && !skip_updated)
+		if (!cc->no_set_skip_hint && valid_page && !skip_updated)
 			set_pageblock_skip(valid_page);
 		update_cached_migrate(cc, low_pfn);
 	}
@@ -1379,7 +1416,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
 	isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
 
 	/* Skip this pageblock in the future as it's full or nearly full */
-	if (cc->nr_freepages < cc->nr_migratepages)
+	if (start_pfn == end_pfn)
 		set_pageblock_skip(page);
 
 	return;
@@ -1403,11 +1440,10 @@ static int next_search_order(struct compact_control *cc, int order)
 	return order;
 }
 
-static unsigned long
-fast_isolate_freepages(struct compact_control *cc)
+static void fast_isolate_freepages(struct compact_control *cc)
 {
 	unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
-	unsigned int nr_scanned = 0;
+	unsigned int nr_scanned = 0, total_isolated = 0;
 	unsigned long low_pfn, min_pfn, highest = 0;
 	unsigned long nr_isolated = 0;
 	unsigned long distance;
@@ -1417,7 +1453,7 @@ fast_isolate_freepages(struct compact_control *cc)
 
 	/* Full compaction passes in a negative order */
 	if (cc->order <= 0)
-		return cc->free_pfn;
+		return;
 
 	/*
 	 * If starting the scan, use a deeper search and use the highest
@@ -1506,6 +1542,7 @@ fast_isolate_freepages(struct compact_control *cc)
 				set_page_private(page, order);
 				nr_isolated = 1 << order;
 				nr_scanned += nr_isolated - 1;
+				total_isolated += nr_isolated;
 				cc->nr_freepages += nr_isolated;
 				list_add_tail(&page->lru, &cc->freepages);
 				count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1518,6 +1555,10 @@ fast_isolate_freepages(struct compact_control *cc)
 
 		spin_unlock_irqrestore(&cc->zone->lock, flags);
 
+		/* Skip fast search if enough freepages isolated */
+		if (cc->nr_freepages >= cc->nr_migratepages)
+			break;
+
 		/*
 		 * Smaller scan on next order so the total scan is related
 		 * to freelist_scan_limit.
@@ -1526,6 +1567,9 @@ fast_isolate_freepages(struct compact_control *cc)
 		limit = max(1U, limit >> 1);
 	}
 
+	trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn,
+						   nr_scanned, total_isolated);
+
 	if (!page) {
 		cc->fast_search_fail++;
 		if (scan_start) {
@@ -1556,11 +1600,10 @@ fast_isolate_freepages(struct compact_control *cc)
 	cc->total_free_scanned += nr_scanned;
 
 	if (!page)
-		return cc->free_pfn;
+		return;
 
 	low_pfn = page_to_pfn(page);
 	fast_isolate_around(cc, low_pfn);
-	return low_pfn;
 }
 
 /*
@@ -1684,11 +1727,10 @@ splitmap:
  * This is a migrate-callback that "allocates" freepages by taking pages
  * from the isolated freelists in the block we are migrating to.
  */
-static struct page *compaction_alloc(struct page *migratepage,
-					unsigned long data)
+static struct folio *compaction_alloc(struct folio *src, unsigned long data)
 {
 	struct compact_control *cc = (struct compact_control *)data;
-	struct page *freepage;
+	struct folio *dst;
 
 	if (list_empty(&cc->freepages)) {
 		isolate_freepages(cc);
@@ -1697,11 +1739,11 @@ static struct page *compaction_alloc(struct page *migratepage,
 			return NULL;
 	}
 
-	freepage = list_entry(cc->freepages.next, struct page, lru);
-	list_del(&freepage->lru);
+	dst = list_entry(cc->freepages.next, struct folio, lru);
+	list_del(&dst->lru);
 	cc->nr_freepages--;
 
-	return freepage;
+	return dst;
 }
 
 /*
@@ -1709,11 +1751,11 @@ static struct page *compaction_alloc(struct page *migratepage,
  * freelist.  All pages on the freelist are from the same zone, so there is no
  * special handling needed for NUMA.
  */
-static void compaction_free(struct page *page, unsigned long data)
+static void compaction_free(struct folio *dst, unsigned long data)
 {
 	struct compact_control *cc = (struct compact_control *)data;
 
-	list_add(&page->lru, &cc->freepages);
+	list_add(&dst->lru, &cc->freepages);
 	cc->nr_freepages++;
 }
@@ -1736,6 +1778,7 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE
  */
 static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
 static int sysctl_extfrag_threshold = 500;
+static int __read_mostly sysctl_compact_memory;
 
 static inline void
 update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1864,7 +1907,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
 					pfn = cc->zone->zone_start_pfn;
 				cc->fast_search_fail = 0;
 				found_block = true;
-				set_pageblock_skip(freepage);
 				break;
 			}
 		}
@@ -1940,8 +1982,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 
 		page = pageblock_pfn_to_page(block_start_pfn,
 						block_end_pfn, cc->zone);
-		if (!page)
+		if (!page) {
+			unsigned long next_pfn;
+
+			next_pfn = skip_offline_sections(block_start_pfn);
+			if (next_pfn)
+				block_end_pfn = min(next_pfn, cc->free_pfn);
 			continue;
+		}
 
 		/*
 		 * If isolation recently failed, do not retry. Only check the
@@ -2193,25 +2241,11 @@ static enum compact_result compact_finished(struct compact_control *cc)
 	return ret;
 }
 
-static enum compact_result __compaction_suitable(struct zone *zone, int order,
-					unsigned int alloc_flags,
-					int highest_zoneidx,
-					unsigned long wmark_target)
+static bool __compaction_suitable(struct zone *zone, int order,
+				  int highest_zoneidx,
+				  unsigned long wmark_target)
 {
 	unsigned long watermark;
-
-	if (is_via_compact_memory(order))
-		return COMPACT_CONTINUE;
-
-	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
-	/*
-	 * If watermarks for high-order allocation are already met, there
-	 * should be no need for compaction at all.
-	 */
-	if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
-								alloc_flags))
-		return COMPACT_SUCCESS;
-
 	/*
 	 * Watermarks for order-0 must be met for compaction to be able to
 	 * isolate free pages for migration targets. This means that the
@@ -2229,29 +2263,20 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
 				low_wmark_pages(zone) : min_wmark_pages(zone);
 	watermark += compact_gap(order);
-	if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
-						ALLOC_CMA, wmark_target))
-		return COMPACT_SKIPPED;
-
-	return COMPACT_CONTINUE;
+	return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+				   ALLOC_CMA, wmark_target);
 }
 
 /*
  * compaction_suitable: Is this suitable to run compaction on this zone now?
- * Returns
- *   COMPACT_SKIPPED  - If there are too few free pages for compaction
- *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
- *   COMPACT_CONTINUE - If compaction should run now
  */
-enum compact_result compaction_suitable(struct zone *zone, int order,
-					unsigned int alloc_flags,
-					int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
 {
-	enum compact_result ret;
-	int fragindex;
+	enum compact_result compact_result;
+	bool suitable;
 
-	ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
-				    zone_page_state(zone, NR_FREE_PAGES));
+	suitable = __compaction_suitable(zone, order, highest_zoneidx,
+					 zone_page_state(zone, NR_FREE_PAGES));
 	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
@@ -2268,17 +2293,24 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 	 * excessive compaction for costly orders, but it should not be at the
 	 * expense of system stability.
	 */
-	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
-		fragindex = fragmentation_index(zone, order);
-		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-			ret = COMPACT_NOT_SUITABLE_ZONE;
+	if (suitable) {
+		compact_result = COMPACT_CONTINUE;
+		if (order > PAGE_ALLOC_COSTLY_ORDER) {
+			int fragindex = fragmentation_index(zone, order);
+
+			if (fragindex >= 0 &&
+			    fragindex <= sysctl_extfrag_threshold) {
+				suitable = false;
+				compact_result = COMPACT_NOT_SUITABLE_ZONE;
+			}
+		}
+	} else {
+		compact_result = COMPACT_SKIPPED;
 	}
 
-	trace_mm_compaction_suitable(zone, order, ret);
-	if (ret == COMPACT_NOT_SUITABLE_ZONE)
-		ret = COMPACT_SKIPPED;
+	trace_mm_compaction_suitable(zone, order, compact_result);
 
-	return ret;
+	return suitable;
 }
 
 bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
@@ -2294,7 +2326,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
 					ac->highest_zoneidx, ac->nodemask) {
 		unsigned long available;
-		enum compact_result compact_result;
 
 		/*
 		 * Do not consider all the reclaimable memory because we do not
@@ -2304,9 +2335,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 		 */
 		available = zone_reclaimable_pages(zone) / order;
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
-		compact_result = __compaction_suitable(zone, order, alloc_flags,
-				ac->highest_zoneidx, available);
-		if (compact_result == COMPACT_CONTINUE)
+		if (__compaction_suitable(zone, order, ac->highest_zoneidx,
+					  available))
 			return true;
 	}
 
@@ -2336,11 +2366,22 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	INIT_LIST_HEAD(&cc->migratepages);
 
 	cc->migratetype = gfp_migratetype(cc->gfp_mask);
-	ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
-							cc->highest_zoneidx);
-	/* Compaction is likely to fail */
-	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
-		return ret;
+
+	if (!is_via_compact_memory(cc->order)) {
+		unsigned long watermark;
+
+		/* Allocation can already succeed, nothing to do */
+		watermark = wmark_pages(cc->zone,
+					cc->alloc_flags & ALLOC_WMARK_MASK);
+		if (zone_watermark_ok(cc->zone, cc->order, watermark,
+				      cc->highest_zoneidx, cc->alloc_flags))
+			return COMPACT_SUCCESS;
+
+		/* Compaction is likely to fail */
+		if (!compaction_suitable(cc->zone, cc->order,
+					 cc->highest_zoneidx))
+			return COMPACT_SKIPPED;
+	}
 
 	/*
 	 * Clear pageblock skip if there were failures recently and compaction
@@ -2456,7 +2497,8 @@ rescan:
 		}
 		/*
 		 * If an ASYNC or SYNC_LIGHT fails to migrate a page
-		 * within the current order-aligned block, scan the
+		 * within the current order-aligned block and
+		 * fast_find_migrateblock may be used then scan the
 		 * remainder of the pageblock. This will mark the
 		 * pageblock "skip" to avoid rescanning in the near
 		 * future. This will isolate more pages than necessary
@@ -2464,8 +2506,9 @@ rescan:
 		 * fast_find_migrateblock revisiting blocks that were
 		 * recently partially scanned.
 		 */
-		if (cc->direct_compaction && !cc->finish_pageblock &&
-		    (cc->mode < MIGRATE_SYNC)) {
+		if (!pageblock_aligned(cc->migrate_pfn) &&
+		    !cc->ignore_skip_hint && !cc->finish_pageblock &&
+		    (cc->mode < MIGRATE_SYNC)) {
 			cc->finish_pageblock = true;
 
 			/*
@@ -2780,6 +2823,15 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int
 static int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos)
 {
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
+
+	if (sysctl_compact_memory != 1)
+		return -EINVAL;
+
 	if (write)
 		compact_nodes();
 
@@ -2833,8 +2885,14 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
 		if (!populated_zone(zone))
 			continue;
 
-		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
-					highest_zoneidx) == COMPACT_CONTINUE)
+		/* Allocation can already succeed, check other zones */
+		if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
+				      min_wmark_pages(zone),
+				      highest_zoneidx, 0))
+			continue;
+
+		if (compaction_suitable(zone, pgdat->kcompactd_max_order,
+					highest_zoneidx))
 			return true;
 	}
 
@@ -2871,8 +2929,12 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		if (compaction_deferred(zone, cc.order))
 			continue;
 
-		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
-							COMPACT_CONTINUE)
+		/* Allocation can already succeed, nothing to do */
+		if (zone_watermark_ok(zone, cc.order,
+				      min_wmark_pages(zone), zoneid, 0))
+			continue;
+
+		if (!compaction_suitable(zone, cc.order, zoneid))
 			continue;
 
 		if (kthread_should_stop())
@@ -3021,7 +3083,7 @@ static int kcompactd(void *p)
  * This kcompactd start function will be called by init and node-hot-add.
  * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
  */
-void kcompactd_run(int nid)
+void __meminit kcompactd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 
@@ -3039,7 +3101,7 @@ void kcompactd_run(int nid)
  * Called by memory hotplug when all memory in a node is offlined. Caller must
  * be holding mem_hotplug_begin/done().
 */
-void kcompactd_stop(int nid)
+void __meminit kcompactd_stop(int nid)
 {
 	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
 
@@ -3095,7 +3157,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
 static struct ctl_table vm_compaction[] = {
 	{
 		.procname	= "compact_memory",
-		.data		= NULL,
+		.data		= &sysctl_compact_memory,
 		.maxlen		= sizeof(int),
 		.mode		= 0200,
 		.proc_handler	= sysctl_compaction_handler,
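The skip_offline_sections() helper added above consults only the section online state, so its control flow can be modeled in user space. Below is a minimal sketch of that logic, assuming a toy section size and a plain bool array standing in for the kernel's mem_section machinery; SECTION_SHIFT, NR_SECTIONS, and the section_online[] table are invented for illustration.

/*
 * User-space sketch of the skip_offline_sections() logic. The kernel
 * derives online state from the mem_section array; a bitmap of made-up
 * sections stands in for that here.
 */
#include <stdio.h>
#include <stdbool.h>

#define SECTION_SHIFT	15	/* stand-in: 32768 pages per section */
#define NR_SECTIONS	8	/* stand-in for __highest_present_section_nr + 1 */

static bool section_online[NR_SECTIONS] = {
	true, true, false, false, true, true, false, true
};

static unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> SECTION_SHIFT;
}

static unsigned long section_nr_to_pfn(unsigned long nr)
{
	return nr << SECTION_SHIFT;
}

/*
 * If @start_pfn falls into an offline section, return the start PFN of
 * the next online section; return 0 if the section is already online
 * or no further online section exists.
 */
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
	unsigned long start_nr = pfn_to_section_nr(start_pfn);

	if (start_nr >= NR_SECTIONS || section_online[start_nr])
		return 0;

	while (++start_nr < NR_SECTIONS) {
		if (section_online[start_nr])
			return section_nr_to_pfn(start_nr);
	}

	return 0;
}

int main(void)
{
	/* PFN in offline section 2 -> start PFN of online section 4 */
	printf("%lu\n", skip_offline_sections(2UL << SECTION_SHIFT));
	/* PFN in online section 0 -> 0 (no skip needed) */
	printf("%lu\n", skip_offline_sections(42));
	return 0;
}

This mirrors how isolate_migratepages() uses the return value: a nonzero result lets the migration scanner jump block_end_pfn forward instead of stepping pageblock by pageblock through an offline range.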
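The too_many_isolated() change is pure threshold arithmetic: __GFP_FS compactors are held to one eighth of the LRU-derived isolation budget, leaving headroom so a GFP_NOFS compactor (which may hold filesystem locks the others are waiting on) can still isolate pages. A minimal sketch of that arithmetic, with invented counter values and a stand-in __GFP_FS bit:

/*
 * Sketch of the scaled isolation limit. Counter values are made up;
 * only the comparison logic matches the patch above.
 */
#include <stdio.h>
#include <stdbool.h>

#define __GFP_FS 0x1u	/* stand-in for the kernel flag bit */

static bool too_many_isolated(unsigned long active, unsigned long inactive,
			      unsigned long isolated, unsigned int gfp_mask)
{
	/* Regular (__GFP_FS) compactors see only 1/8 of the LRU budget. */
	if (gfp_mask & __GFP_FS) {
		inactive >>= 3;
		active >>= 3;
	}

	return isolated > (inactive + active) / 2;
}

int main(void)
{
	/* 1500 isolated pages: over the scaled limit for __GFP_FS ... */
	printf("GFP_KERNEL-style: %d\n",
	       too_many_isolated(8000, 8000, 1500, __GFP_FS));
	/* ... but well under the full limit for GFP_NOFS. */
	printf("GFP_NOFS-style:   %d\n",
	       too_many_isolated(8000, 8000, 1500, 0));
	return 0;
}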
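After the refactor, compaction_suitable() returns a plain bool, and each caller (compact_zone(), kcompactd_node_suitable(), kcompactd_do_work()) first asks whether the allocation already succeeds at its watermark before asking whether compaction is worth running. A sketch of that caller pattern, with stub predicates standing in for zone_watermark_ok() and the real suitability test; the cutoffs are invented:

/*
 * Sketch of the post-refactor caller pattern: watermark check first
 * (COMPACT_SUCCESS), then the boolean suitability test (COMPACT_SKIPPED),
 * and only then COMPACT_CONTINUE.
 */
#include <stdio.h>
#include <stdbool.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_SUCCESS, COMPACT_CONTINUE };

/* Stub: pretend the high-order watermark is already met for order <= 1. */
static bool zone_watermark_ok(int order) { return order <= 1; }

/* Stub: pretend enough order-0 pages exist for migration targets. */
static bool compaction_suitable(int order) { return order <= 5; }

static enum compact_result should_compact(int order)
{
	/* Allocation can already succeed, nothing to do */
	if (zone_watermark_ok(order))
		return COMPACT_SUCCESS;

	/* Compaction is likely to fail */
	if (!compaction_suitable(order))
		return COMPACT_SKIPPED;

	return COMPACT_CONTINUE;
}

int main(void)
{
	for (int order = 0; order <= 6; order += 3)
		printf("order %d -> %d\n", order, should_compact(order));
	return 0;
}

The design point is that the "already succeeds" test depends on the caller's alloc_flags and watermark choice (direct compactors use their own, kcompactd uses the min watermark), so it no longer belongs inside the shared __compaction_suitable() helper.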
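The sysctl handler change gives /proc/sys/vm/compact_memory a backing variable: the write is parsed first via proc_dointvec(), any value other than 1 is rejected with -EINVAL, and only then are all nodes compacted. A user-space model of that flow, with the procfs plumbing reduced to an already-parsed integer argument:

/*
 * Model of the new sysctl_compaction_handler() behaviour. The write
 * path and table glue are replaced by a plain function taking the
 * parsed value; only the acceptance logic matches the patch.
 */
#include <stdio.h>
#include <errno.h>

static int sysctl_compact_memory;	/* backing store, as in the patch */

static void compact_nodes(void) { puts("compacting all nodes"); }

static int compaction_handler(int write, int value)
{
	sysctl_compact_memory = value;	/* proc_dointvec() equivalent */

	if (sysctl_compact_memory != 1)
		return -EINVAL;

	if (write)
		compact_nodes();

	return 0;
}

int main(void)
{
	printf("write 1 -> %d\n", compaction_handler(1, 1));	/* compacts */
	printf("write 0 -> %d\n", compaction_handler(1, 0));	/* -EINVAL */
	return 0;
}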