Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 2013
1 file changed, 1153 insertions(+), 860 deletions(-)
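As a point of reference for the set_pfnblock_flags_mask() hunk in the diff below: it updates one word of the pageblock bitmap with a read / compare-and-swap retry loop, so a writer racing on a neighbouring field in the same word cannot lose its update. What follows is a minimal userspace C11 sketch of that pattern only, not kernel code; the names (bitmap_word, set_flags_mask) and the bit offsets are illustrative, not kernel symbols.

	/*
	 * Userspace sketch of the lock-free read-modify-write loop used by
	 * set_pfnblock_flags_mask() in the patch: load the word, build the
	 * new value with (old & ~mask) | flags, and retry the CAS until no
	 * concurrent writer has changed the word underneath us.
	 */
	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic unsigned long bitmap_word;	/* stands in for bitmap[word_bitidx] */

	static void set_flags_mask(unsigned long flags, unsigned long mask,
				   unsigned long shift)
	{
		unsigned long old, new;

		mask <<= shift;
		flags <<= shift;

		old = atomic_load(&bitmap_word);
		do {
			new = (old & ~mask) | flags;
			/* on failure, 'old' is reloaded with the current value */
		} while (!atomic_compare_exchange_weak(&bitmap_word, &old, new));
	}

	int main(void)
	{
		/* set a 3-bit field at bit offset 4 to the value 2 */
		set_flags_mask(2, 0x7, 4);
		printf("word = %#lx\n", atomic_load(&bitmap_word));	/* prints 0x20 */
		return 0;
	}

The kernel version differs only in mechanics (it computes the shift from the pageblock bit index relative to BITS_PER_LONG and uses cmpxchg() directly), but the retry structure is the same.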
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1069efcc4d7..ca423cc20b59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -63,6 +63,7 @@ #include <linux/sched/rt.h> #include <linux/page_owner.h> #include <linux/kthread.h> +#include <linux/memcontrol.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -286,15 +287,9 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) - return true; - - return false; -} + int nid = early_pfn_to_nid(pfn); -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return true; return false; @@ -339,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - return false; -} - static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) @@ -352,6 +342,106 @@ static inline bool update_defer_init(pg_data_t *pgdat, } #endif +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct page *page, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return page_zone(page)->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); +} + +static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits 
that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -422,12 +512,6 @@ static void bad_page(struct page *page, const char *reason, static unsigned long nr_shown; static unsigned long nr_unshown; - /* Don't complain about poisoned pages */ - if (PageHWPoison(page)) { - page_mapcount_reset(page); /* remove PageBuddy */ - return; - } - /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -513,14 +597,7 @@ static int __init early_debug_pagealloc(char *buf) { if (!buf) return -EINVAL; - - if (strcmp(buf, "on") == 0) - _debug_pagealloc_enabled = true; - - if (strcmp(buf, "off") == 0) - _debug_pagealloc_enabled = false; - - return 0; + return kstrtobool(buf, &_debug_pagealloc_enabled); } early_param("debug_pagealloc", early_debug_pagealloc); @@ -530,6 +607,9 @@ static bool need_debug_guardpage(void) if (!debug_pagealloc_enabled()) return false; + if (!debug_guardpage_minorder()) + return false; + return true; } @@ -538,6 +618,9 @@ static void init_debug_guardpage(void) if (!debug_pagealloc_enabled()) return; + if (!debug_guardpage_minorder()) + return; + _debug_guardpage_enabled = true; } @@ -558,23 +641,31 @@ static int __init debug_guardpage_minorder_setup(char *buf) pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } -__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -static inline void set_page_guard(struct zone *zone, struct page *page, +static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { struct page_ext *page_ext; if (!debug_guardpage_enabled()) - return; + return false; + + if (order >= debug_guardpage_minorder()) + return false; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return false; + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); INIT_LIST_HEAD(&page->lru); set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -586,6 +677,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); set_page_private(page, 0); @@ -593,9 +687,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops = { NULL, }; -static inline void 
set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} +struct page_ext_operations debug_guardpage_ops; +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif @@ -784,17 +878,42 @@ out: zone->free_area[order].nr_free++; } -static inline int free_pages_check(struct page *page) +/* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed + * check if necessary. + */ +static inline bool page_expected_state(struct page *page, + unsigned long check_flags) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; + if (unlikely(atomic_read(&page->_mapcount) != -1)) + return false; + + if (unlikely((unsigned long)page->mapping | + page_ref_count(page) | +#ifdef CONFIG_MEMCG + (unsigned long)page->mem_cgroup | +#endif + (page->flags & check_flags))) + return false; + + return true; +} + +static void free_pages_check_bad(struct page *page) +{ + const char *bad_reason; + unsigned long bad_flags; + + bad_reason = NULL; + bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -803,16 +922,150 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; + bad_page(page, bad_reason, bad_flags); +} + +static inline int free_pages_check(struct page *page) +{ + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) + return 0; + + /* Something has gone sideways, find it */ + free_pages_check_bad(page); + return 1; +} + +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. 
+ */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + goto out; + } + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; + } + ret = 0; +out: + page->mapping = NULL; + clear_compound_head(page); + return ret; +} + +static __always_inline bool free_pages_prepare(struct page *page, + unsigned int order, bool check_free) +{ + int bad = 0; + + VM_BUG_ON_PAGE(PageTail(page), page); + + trace_mm_page_free(page, order); + kmemcheck_free_shadow(page, order); + + /* + * Check tail pages before head page information is cleared to + * avoid checking PageCompound for order-0 pages. + */ + if (unlikely(order)) { + bool compound = PageCompound(page); + int i; + + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + + if (compound) + ClearPageDoubleMap(page); + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); + if (unlikely(free_pages_check(page + i))) { + bad++; + continue; + } + (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + } + } + if (PageMappingFlags(page)) + page->mapping = NULL; + if (memcg_kmem_enabled() && PageKmemcg(page)) + memcg_kmem_uncharge(page, order); + if (check_free) + bad += free_pages_check(page); + if (bad) + return false; + page_cpupid_reset_last(page); - if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - return 0; + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + reset_page_owner(page, order); + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE << order); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } + arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); + kernel_map_pages(page, 1 << order, 0); + kasan_free_pages(page, order); + + return true; } +#ifdef CONFIG_DEBUG_VM +static inline bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, true); +} + +static inline bool bulkfree_pcp_prepare(struct page *page) +{ + return false; +} +#else +static bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, false); +} + +static bool bulkfree_pcp_prepare(struct page *page) +{ + return free_pages_check(page); +} +#endif /* CONFIG_DEBUG_VM */ + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -829,15 +1082,16 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int to_free = count; unsigned long nr_scanned; + bool isolated_pageblocks; spin_lock(&zone->lock); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + isolated_pageblocks = has_isolate_pageblock(zone); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - while (to_free) { + while (count) { struct page *page; struct list_head *list; @@ -857,7 +1111,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* This is the only non-empty list. Free them all. 
*/ if (batch_free == MIGRATE_PCPTYPES) - batch_free = to_free; + batch_free = count; do { int mt; /* migratetype of the to-be-freed page */ @@ -870,12 +1124,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ - if (unlikely(has_isolate_pageblock(zone))) + if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); + if (bulkfree_pcp_prepare(page)) + continue; + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - } while (--to_free && --batch_free && !list_empty(list)); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } @@ -887,9 +1144,9 @@ static void free_one_page(struct zone *zone, { unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { @@ -899,56 +1156,6 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } -static int free_tail_pages_check(struct page *head_page, struct page *page) -{ - int ret = 1; - - /* - * We rely page->lru.next never has bit 0 set, unless the page - * is PageTail(). Let's make sure that's true even for poisoned ->lru. - */ - BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - - if (!IS_ENABLED(CONFIG_DEBUG_VM)) { - ret = 0; - goto out; - } - switch (page - head_page) { - case 1: - /* the first tail page: ->mapping is compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); - goto out; - } - break; - case 2: - /* - * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. - */ - break; - default: - if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); - goto out; - } - break; - } - if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); - goto out; - } - if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); - goto out; - } - ret = 0; -out: - page->mapping = NULL; - clear_compound_head(page); - return ret; -} - static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { @@ -1003,7 +1210,7 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. 
*/ -void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -1022,51 +1229,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) } } -static bool free_pages_prepare(struct page *page, unsigned int order) -{ - bool compound = PageCompound(page); - int i, bad = 0; - - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - - trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); - kasan_free_pages(page, order); - - if (PageAnon(page)) - page->mapping = NULL; - bad += free_pages_check(page); - for (i = 1; i < (1 << order); i++) { - if (compound) - bad += free_tail_pages_check(page, page + i); - bad += free_pages_check(page + i); - } - if (bad) - return false; - - reset_page_owner(page, order); - - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), - PAGE_SIZE << order); - debug_check_no_obj_freed(page_address(page), - PAGE_SIZE << order); - } - arch_free_page(page, order); - kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 << order, 0); - - return true; -} - static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order)) + if (!free_pages_prepare(page, order, true)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -1076,8 +1245,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, - unsigned long pfn, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1110,7 +1278,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid < 0) - nid = 0; + nid = first_online_node; spin_unlock(&early_pfn_lock); return nid; @@ -1154,7 +1322,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, pfn, order); + return __free_pages_boot_core(page, order); } /* @@ -1236,15 +1404,18 @@ static void __init deferred_free_range(struct page *page, return; /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == MAX_ORDER_NR_PAGES && - (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pfn, MAX_ORDER-1); + __free_pages_boot_core(page, pageblock_order); return; } - for (i = 0; i < nr_pages; i++, page++, pfn++) - __free_pages_boot_core(page, pfn, 0); + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_boot_core(page, 0); + } } /* Completion tracking for deferred_init_memmap() threads */ @@ -1312,9 +1483,9 @@ static int __init deferred_init_memmap(void *data) /* * Ensure pfn_valid is checked every - * MAX_ORDER_NR_PAGES for memory holes + * pageblock_nr_pages for memory holes */ - if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) { if (!pfn_valid(pfn)) { page 
= NULL; goto free_range; @@ -1327,7 +1498,7 @@ static int __init deferred_init_memmap(void *data) } /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { page++; } else { nr_pages += nr_to_free; @@ -1363,6 +1534,9 @@ free_range: free_base_page = NULL; free_base_pfn = nr_to_free = 0; } + /* Free the last block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, nr_to_free); first_init_pfn = max(end_pfn, first_init_pfn); } @@ -1459,28 +1633,22 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && - debug_guardpage_enabled() && - high < debug_guardpage_minorder()) { - /* - * Mark as guard pages (or page), that will allow to - * merge back to allocator when buddy will be freed. - * Corresponding page table entries will not be touched, - * pages will stay not present in virtual address space - */ - set_page_guard(zone, &page[size], high, migratetype); + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + if (set_page_guard(zone, &page[size], high, migratetype)) continue; - } + list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); } } -/* - * This page is about to be returned from the page allocator - */ -static inline int check_new_page(struct page *page) +static void check_new_page_bad(struct page *page) { const char *bad_reason = NULL; unsigned long bad_flags = 0; @@ -1494,6 +1662,9 @@ static inline int check_new_page(struct page *page) if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; bad_flags = __PG_HWPOISON; + /* Don't complain about hwpoisoned pages */ + page_mapcount_reset(page); /* remove PageBuddy */ + return; } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; @@ -1503,11 +1674,20 @@ static inline int check_new_page(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; - } - return 0; + bad_page(page, bad_reason, bad_flags); +} + +/* + * This page is about to be returned from the page allocator + */ +static inline int check_new_page(struct page *page) +{ + if (likely(page_expected_state(page, + PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) + return 0; + + check_new_page_bad(page); + return 1; } static inline bool free_pages_prezeroed(bool poisoned) @@ -1516,20 +1696,43 @@ static inline bool free_pages_prezeroed(bool poisoned) page_poisoning_enabled() && poisoned; } -static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) +#ifdef CONFIG_DEBUG_VM +static bool check_pcp_refill(struct page *page) { - int i; - bool poisoned = true; + return false; +} +static bool check_new_pcp(struct page *page) +{ + return check_new_page(page); +} +#else +static bool check_pcp_refill(struct page *page) +{ + return check_new_page(page); +} +static bool check_new_pcp(struct page *page) +{ + return false; +} +#endif /* CONFIG_DEBUG_VM */ + +static bool check_new_pages(struct page *page, unsigned int order) +{ + int i; for (i 
= 0; i < (1 << order); i++) { struct page *p = page + i; + if (unlikely(check_new_page(p))) - return 1; - if (poisoned) - poisoned &= page_is_poisoned(p); + return true; } + return false; +} + +inline void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags) +{ set_page_private(page, 0); set_page_refcounted(page); @@ -1537,6 +1740,22 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kernel_map_pages(page, 1 << order, 1); kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); + set_page_owner(page, order, gfp_flags); +} + +static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags) +{ + int i; + bool poisoned = true; + + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + if (poisoned) + poisoned &= page_is_poisoned(p); + } + + post_alloc_hook(page, order, gfp_flags); if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) @@ -1545,8 +1764,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); - set_page_owner(page, order, gfp_flags); - /* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking @@ -1557,8 +1774,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_pfmemalloc(page); else clear_page_pfmemalloc(page); - - return 0; } /* @@ -1980,6 +2195,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; + if (unlikely(check_pcp_refill(page))) + continue; + /* * Split buddy pages returned by expand() are received here * in physical page order. The page is added to the callers and @@ -2157,6 +2375,10 @@ void mark_free_pages(struct zone *zone) for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } @@ -2187,7 +2409,7 @@ void free_hot_cold_page(struct page *page, bool cold) unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_pages_prepare(page, 0)) + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -2250,7 +2472,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; - gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -2264,12 +2485,9 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - gfp_mask = get_page_owner_gfp(page); - set_page_owner(page, 0, gfp_mask); - for (i = 1; i < (1 << order); i++) { + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - set_page_owner(page + i, 0, gfp_mask); - } + split_page_owner(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -2285,9 +2503,14 @@ int __isolate_free_page(struct page *page, unsigned int order) mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + /* + * Obey watermarks as if the page was being allocated. 
We can + * emulate a high-order watermark check with a raised order-0 + * watermark, because we already know our high-order page + * exists. + */ + watermark = min_wmark_pages(zone) + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -2298,9 +2521,10 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); - set_page_owner(page, order, __GFP_MOVABLE); - - /* Set the pageblock if the isolated page is at least a pageblock */ + /* + * Set the pageblock if the isolated page is at least half of a + * pageblock + */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -2316,30 +2540,34 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. + * Update NUMA hit/miss statistics * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. + * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. */ -int split_free_page(struct page *page) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + gfp_t flags) { - unsigned int order; - int nr_pages; - - order = page_order(page); +#ifdef CONFIG_NUMA + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; - nr_pages = __isolate_free_page(page, order); - if (!nr_pages) - return 0; + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); - return nr_pages; + if (z->node == local_nid) { + __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } +#endif } /* @@ -2348,7 +2576,8 @@ int split_free_page(struct page *page) static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int alloc_flags, int migratetype) + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) { unsigned long flags; struct page *page; @@ -2359,23 +2588,26 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, struct list_head *list; local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } + do { + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + goto failed; + } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); + if (cold) + page = list_last_entry(list, struct page, 
lru); + else + page = list_first_entry(list, struct page, lru); - list_del(&page->lru); - pcp->count--; + list_del(&page->lru); + pcp->count--; + + } while (check_new_pcp(page)); } else { /* * We most definitely don't want callers attempting to @@ -2384,14 +2616,16 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); spin_unlock(&zone->lock); if (!page) goto failed; @@ -2399,12 +2633,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); - if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && - !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) - set_bit(ZONE_FAIR_DEPLETED, &zone->flags); - - __count_zone_vm_events(PGALLOC, zone, 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -2500,13 +2729,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * one free page of a suitable size. Checking now avoids taking the zone lock * to check in the allocation paths if no pages are free. */ -static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags, - long free_pages) +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages) { long min = mark; int o; - const int alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2569,12 +2798,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, int alloc_flags) + int classzone_idx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } +static inline bool zone_watermark_fast(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, unsigned int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + long cma_pages = 0; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + /* + * Fast check for order-0 only. If this fails then the reserves + * need to be calculated. There is a corner case where the check + * passes but only the high-order atomic reserve are free. If + * the caller is !atomic then it'll uselessly search the free + * list. That corner case is then slower but it is harmless. 
+ */ + if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + return true; + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); +} + bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx) { @@ -2588,40 +2843,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return local_zone->node == zone->node; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return true; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } #endif /* CONFIG_NUMA */ -static void reset_alloc_batches(struct zone *preferred_zone) -{ - struct zone *zone = preferred_zone->zone_pgdat->node_zones; - - do { - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); - } while (zone++ != preferred_zone); -} - /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -2630,74 +2863,55 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zonelist *zonelist = ac->zonelist; - struct zoneref *z; - struct page *page = NULL; + struct zoneref *z = ac->preferred_zoneref; struct zone *zone; - int nr_fair_skipped = 0; - bool zonelist_rescan; - -zonelist_scan: - zonelist_rescan = false; + struct pglist_data *last_pgdat_dirty_limit = NULL; /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { + struct page *page; unsigned long mark; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* - * Distribute pages in proportion to the individual - * zone size to ensure fair page aging. The zone a - * page was allocated in should have no effect on the - * time the page has in memory before being reclaimed. - */ - if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(ac->preferred_zone, zone)) - break; - if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - nr_fair_skipped++; - continue; - } - } - /* * When allocating a page cache page for writing, we - * want to get it from a zone that is within its dirty - * limit, such that no single zone holds more than its + * want to get it from a node that is within its dirty + * limit, such that no single node holds more than its * proportional share of globally allowed dirty pages. - * The dirty limits take into account the zone's + * The dirty limits take into account the node's * lowmem reserves and high watermark so that kswapd * should be able to balance it without having to * write pages from its LRU list. * - * This may look like it could increase pressure on - * lower zones by failing allocations in higher zones - * before they are full. But the pages that do spill - * over are limited as the lower zones are protected - * by this very same mechanism. 
It should not become - * a practical burden to them. - * * XXX: For now, allow allocations to potentially - * exceed the per-zone dirty limit in the slowpath + * exceed the per-node dirty limit in the slowpath * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed - * zones are together not big enough to reach the + * nodes are together not big enough to reach the * global limit. The proper fix for these situations - * will require awareness of zones in the + * will require awareness of nodes in the * dirty-throttling and the flusher threads. */ - if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) - continue; + if (ac->spread_dirty_pages) { + if (last_pgdat_dirty_limit == zone->zone_pgdat) + continue; + + if (!node_dirty_ok(zone->zone_pgdat)) { + last_pgdat_dirty_limit = zone->zone_pgdat; + continue; + } + } mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) { + if (!zone_watermark_fast(zone, order, mark, + ac_classzone_idx(ac), alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2705,22 +2919,22 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(ac->preferred_zone, zone)) + if (node_reclaim_mode == 0 || + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; - ret = zone_reclaim(zone, gfp_mask, order); + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); switch (ret) { - case ZONE_RECLAIM_NOSCAN: + case NODE_RECLAIM_NOSCAN: /* did not scan */ continue; - case ZONE_RECLAIM_FULL: + case NODE_RECLAIM_FULL: /* scanned but unreclaimable */ continue; default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) + ac_classzone_idx(ac), alloc_flags)) goto try_this_zone; continue; @@ -2728,11 +2942,10 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(ac->preferred_zone, zone, order, + page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { - if (prep_new_page(page, order, gfp_mask, alloc_flags)) - goto try_this_zone; + prep_new_page(page, order, gfp_mask, alloc_flags); /* * If this is a high-order atomic allocation then check @@ -2745,27 +2958,6 @@ try_this_zone: } } - /* - * The first pass makes sure allocations are spread fairly within the - * local node. However, the local node might have free pages left - * after the fairness batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without fairness, and - * include remote zones now, before entering the slowpath and waking - * kswapd: prefer spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - alloc_flags &= ~ALLOC_FAIR; - if (nr_fair_skipped) { - zonelist_rescan = true; - reset_alloc_batches(ac->preferred_zone); - } - if (nr_online_nodes > 1) - zonelist_rescan = true; - } - - if (zonelist_rescan) - goto zonelist_scan; - return NULL; } @@ -2787,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) 
{ unsigned int filter = SHOW_MEM_FILTER_NODES; + struct va_format vaf; + va_list args; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -2807,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - if (fmt) { - struct va_format vaf; - va_list args; - - va_start(args, fmt); + pr_warn("%s: ", current->comm); - vaf.fmt = fmt; - vaf.va = &args; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_cont("%pV", &vaf); + va_end(args); - pr_warn("%pV", &vaf); - - va_end(args); - } + pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); - pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", - current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -2835,6 +3023,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct oom_control oc = { .zonelist = ac->zonelist, .nodemask = ac->nodemask, + .memcg = NULL, .gfp_mask = gfp_mask, .order = order, }; @@ -2872,22 +3061,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for IO-less reclaim */ - if (!(gfp_mask & __GFP_FS)) { - /* - * XXX: Page reclaim didn't yield anything, - * and the OOM killer can't be invoked, but - * keep looping as per tradition. - * - * But do not keep looping if oom_killer_disable() - * was already called, for the system is trying to - * enter a quiescent state during suspend. - */ - *did_some_progress = !oom_killer_disabled; - goto out; - } if (pm_suspended_storage()) goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; @@ -2913,34 +3098,31 @@ out: return page; } +/* + * Maximum number of compaction retries wit a progress before OOM + * killer is consider as the only way to move forward. 
+ */ +#define MAX_COMPACT_RETRIES 16 + #ifdef CONFIG_COMPACTION /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended_compaction, - bool *deferred_compaction) + unsigned int alloc_flags, const struct alloc_context *ac, + enum compact_priority prio, enum compact_result *compact_result) { - unsigned long compact_result; struct page *page; if (!order) return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - mode, contended_compaction); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + prio); current->flags &= ~PF_MEMALLOC; - switch (compact_result) { - case COMPACT_DEFERRED: - *deferred_compaction = true; - /* fall-through */ - case COMPACT_SKIPPED: + if (*compact_result <= COMPACT_INACTIVE) return NULL; - default: - break; - } /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -2948,8 +3130,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -2970,15 +3151,102 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } + +static inline bool +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int *compaction_retries) +{ + int max_retries = MAX_COMPACT_RETRIES; + int min_priority; + + if (!order) + return false; + + if (compaction_made_progress(compact_result)) + (*compaction_retries)++; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by insufficient priority + */ + if (compaction_failed(compact_result)) + goto check_priority; + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. + */ + if (compaction_withdrawn(compact_result)) + return compaction_zonelist_suitable(ac, order, alloc_flags); + + /* + * !costly requests are much more important than __GFP_REPEAT + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (*compaction_retries <= max_retries) + return true; + + /* + * Make sure there are attempts at the highest priority if we exhausted + * all retries or failed at the lower priorities. + */ +check_priority: + min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
+ MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { + (*compact_priority)--; + *compaction_retries = 0; + return true; + } + return false; +} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended_compaction, - bool *deferred_compaction) + unsigned int alloc_flags, const struct alloc_context *ac, + enum compact_priority prio, enum compact_result *compact_result) { + *compact_result = COMPACT_SKIPPED; return NULL; } + +static inline bool +should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int *compaction_retries) +{ + struct zone *zone; + struct zoneref *z; + + if (!order || order > PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * There are setups with compaction disabled which would prefer to loop + * inside the allocator rather than hit the oom killer prematurely. + * Let's give them a good hope and keep retrying while the order-0 + * watermarks are OK. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), + ac_classzone_idx(ac), alloc_flags)) + return true; + } + return false; +} #endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ @@ -3013,7 +3281,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; @@ -3024,8 +3292,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; retry: - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -3046,16 +3313,20 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; + pg_data_t *last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); + ac->high_zoneidx, ac->nodemask) { + if (last_pgdat != zone->zone_pgdat) + wakeup_kswapd(zone, order, ac->high_zoneidx); + last_pgdat = zone->zone_pgdat; + } } -static inline int +static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask) { - int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. 
*/ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -3083,16 +3354,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (gfp_mask & __GFP_MEMALLOC) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - alloc_flags |= ALLOC_NO_WATERMARKS; - } #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -3102,12 +3363,122 @@ gfp_to_alloc_flags(gfp_t gfp_mask) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { - return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return false; + + if (gfp_mask & __GFP_MEMALLOC) + return true; + if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + return true; + if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + return true; + + return false; } -static inline bool is_thp_gfp_mask(gfp_t gfp_mask) +/* + * Maximum number of reclaim retries without any progress before OOM killer + * is consider as the only way to move forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* + * Checks whether it makes sense to retry the reclaim to make a forward progress + * for the given allocation request. + * The reclaim feedback represented by did_some_progress (any progress during + * the last reclaim round) and no_progress_loops (number of reclaim rounds without + * any progress in a row) is considered as well as the reclaimable pages on the + * applicable zone list (with a backoff mechanism which is a function of + * no_progress_loops). + * + * Returns true if a retry is viable or false to enter the oom path. + */ +static inline bool +should_reclaim_retry(gfp_t gfp_mask, unsigned order, + struct alloc_context *ac, int alloc_flags, + bool did_some_progress, int *no_progress_loops) { - return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; + struct zone *zone; + struct zoneref *z; + + /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + *no_progress_loops = 0; + else + (*no_progress_loops)++; + + /* + * Make sure we converge to OOM if we cannot make any progress + * several times in the row. + */ + if (*no_progress_loops > MAX_RECLAIM_RETRIES) + return false; + + /* + * Keep reclaiming pages while there is a chance this will lead + * somewhere. If none of the target zones can satisfy our allocation + * request even if all reclaimable pages are considered then we are + * screwed and have to go OOM. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + unsigned long reclaimable; + + available = reclaimable = zone_reclaimable_pages(zone); + available -= DIV_ROUND_UP((*no_progress_loops) * available, + MAX_RECLAIM_RETRIES); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* + * Would the allocation succeed if we reclaimed the whole + * available? 
+ */ + if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), + ac_classzone_idx(ac), alloc_flags, available)) { + /* + * If we didn't make any progress and have a lot of + * dirty + writeback pages then we should wait for + * an IO to complete to slow down the reclaim and + * prevent from pre mature OOM + */ + if (!did_some_progress) { + unsigned long write_pending; + + write_pending = zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); + + if (2 * write_pending > reclaimable) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + return true; + } + } + + /* + * Memory allocation/reclaim might be called from a WQ + * context and the current implementation of the WQ + * concurrency control doesn't recognize that + * a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to + * do a short sleep here rather than calling + * cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + + return true; + } + } + + return false; } static inline struct page * @@ -3116,12 +3487,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; - int alloc_flags; - unsigned long pages_reclaimed = 0; + unsigned int alloc_flags; unsigned long did_some_progress; - enum migrate_mode migration_mode = MIGRATE_ASYNC; - bool deferred_compaction = false; - int contended_compaction = COMPACT_CONTENDED_NONE; + enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; + enum compact_result compact_result; + int compaction_retries = 0; + int no_progress_loops = 0; + unsigned long alloc_start = jiffies; + unsigned int stall_timeout = 10 * HZ; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3142,48 +3515,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry: - if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); - /* - * OK, we're below the kswapd watermark and have kicked background - * reclaim. Now things get more complex, so set up alloc_flags according - * to how we want to proceed. + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_kswapds(order, ac); + /* - * Find the true preferred zone if the allocation is unconstrained by - * cpusets. + * The adjusted alloc_flags might result in immediate success, so try + * that first */ - if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { - struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, NULL, &ac->preferred_zone); - ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); - } - - /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg; - /* Allocate without watermarks if the context allows */ - if (alloc_flags & ALLOC_NO_WATERMARKS) { + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. 
Don't try + * that for allocations that are allowed to ignore watermarks, as the + * ALLOC_NO_WATERMARKS attempt didn't yet happen. + */ + if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && + !gfp_pfmemalloc_allowed(gfp_mask)) { + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, + INIT_COMPACT_PRIORITY, + &compact_result); + if (page) + goto got_pg; + /* - * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds - * the allocation is high priority and these type of - * allocations are system rather than user orientated + * Checks for costly allocations with __GFP_NORETRY, which + * includes THP page fault allocations */ + if (gfp_mask & __GFP_NORETRY) { + /* + * If compaction is deferred for high-order allocations, + * it is because sync compaction recently failed. If + * this is the case and the caller requested a THP + * allocation, we do not want to heavily disrupt the + * system, so we fail the allocation instead of entering + * direct reclaim. + */ + if (compact_result == COMPACT_DEFERRED) + goto nopage; + + /* + * Looks like reclaim/compaction is worth trying, but + * sync compaction could be very expensive, so keep + * using async compaction. + */ + compact_priority = INIT_COMPACT_PRIORITY; + } + } + +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_kswapds(order, ac); + + if (gfp_pfmemalloc_allowed(gfp_mask)) + alloc_flags = ALLOC_NO_WATERMARKS; + + /* + * Reset the zonelist iterators if memory policies can be ignored. + * These allocations are high priority and system rather than user + * orientated. + */ + if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - if (page) - goto got_pg; + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); } + /* Attempt with potentially adjusted zonelist and alloc_flags */ + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + goto got_pg; + /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) { /* @@ -3213,74 +3626,53 @@ retry: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* - * Try direct compaction. The first pass is asynchronous. Subsequent - * attempts after direct reclaim are synchronous - */ + + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, + &did_some_progress); + if (page) + goto got_pg; + + /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, - migration_mode, - &contended_compaction, - &deferred_compaction); + compact_priority, &compact_result); if (page) goto got_pg; - /* Checks for THP-specific high-order allocations */ - if (is_thp_gfp_mask(gfp_mask)) { - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a THP allocation, we do not want - * to heavily disrupt the system, so we fail the allocation - * instead of entering direct reclaim. 
- */
- if (deferred_compaction)
- goto nopage;
+ /* Do not loop if specifically requested */
+ if (gfp_mask & __GFP_NORETRY)
+ goto nopage;
 
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
- goto nopage;
+ /*
+ * Do not retry costly high order allocations unless they are
+ * __GFP_REPEAT
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ goto nopage;
 
- /*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
- */
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
- goto nopage;
+ /* Make sure we know about allocations which stall for too long */
+ if (time_after(jiffies, alloc_start + stall_timeout)) {
+ warn_alloc(gfp_mask,
+ "page allocation stalls for %ums, order:%u\n",
+ jiffies_to_msecs(jiffies-alloc_start), order);
+ stall_timeout += 10 * HZ;
 }
 
+ if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+ did_some_progress > 0, &no_progress_loops))
+ goto retry;
+
 /*
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse.
+ * It doesn't make any sense to retry the compaction if order-0
+ * reclaim is not able to make any progress, because the current
+ * implementation of compaction depends on a sufficient amount
+ * of free memory (see __compaction_suitable)
 */
- if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_SYNC_LIGHT;
-
- /* Try direct reclaim and then allocating */
- page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
- &did_some_progress);
- if (page)
- goto got_pg;
-
- /* Do not loop if specifically requested */
- if (gfp_mask & __GFP_NORETRY)
- goto noretry;
-
- /* Keep reclaiming pages as long as there is reasonable progress */
- pages_reclaimed += did_some_progress;
- if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
- ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
- /* Wait for some write requests to complete then retry */
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+ if (did_some_progress > 0 &&
+ should_compact_retry(ac, order, alloc_flags,
+ compact_result, &compact_priority,
+ &compaction_retries))
 goto retry;
- }
 
 /* Reclaim has failed us, start killing things */
 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -3288,23 +3680,14 @@ retry:
 goto got_pg;
 
 /* Retry as long as the OOM killer is making progress */
- if (did_some_progress)
+ if (did_some_progress) {
+ no_progress_loops = 0;
 goto retry;
+ }
 
-noretry:
- /*
- * High-order allocations do not necessarily loop after
- * direct reclaim and reclaim/compaction depends on compaction
- * being called after reclaim so call directly if necessary
- */
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
- ac, migration_mode,
- &contended_compaction,
- &deferred_compaction);
- if (page)
- goto got_pg;
 nopage:
- warn_alloc_failed(gfp_mask, order, NULL);
+ warn_alloc(gfp_mask,
+ "page allocation failure: order:%u", order);
 got_pg:
 return page;
 }
@@ -3316,17 +3699,24 @@ struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct
zonelist *zonelist, nodemask_t *nodemask) { - struct zoneref *preferred_zoneref; - struct page *page = NULL; + struct page *page; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + unsigned int alloc_flags = ALLOC_WMARK_LOW; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), + .zonelist = zonelist, .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; + if (cpusets_enabled()) { + alloc_mask |= __GFP_HARDWALL; + alloc_flags |= ALLOC_CPUSET; + if (!ac.nodemask) + ac.nodemask = &cpuset_current_mems_allowed; + } + gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); @@ -3350,49 +3740,64 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - /* We set it here, as __alloc_pages_slowpath might have changed it */ - ac.zonelist = zonelist; - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); - /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask ? : &cpuset_current_mems_allowed, - &ac.preferred_zone); - if (!ac.preferred_zone) - goto out; - ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); + /* + * The preferred zone is used for statistics but crucially it is + * also used as the starting point for the zonelist iterator. It + * may get reset for allocations that ignore memory policies. + */ + ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, + ac.high_zoneidx, ac.nodemask); + if (!ac.preferred_zoneref) { + page = NULL; + goto no_zone; + } /* First allocation attempt */ - alloc_mask = gfp_mask|__GFP_HARDWALL; page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); - if (unlikely(!page)) { - /* - * Runtime PM, block IO and its error handling path - * can deadlock because I/O on the device might not - * complete. - */ - alloc_mask = memalloc_noio_flags(gfp_mask); - ac.spread_dirty_pages = false; - - page = __alloc_pages_slowpath(alloc_mask, order, &ac); - } + if (likely(page)) + goto out; - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); + /* + * Runtime PM, block IO and its error handling path can deadlock + * because I/O on the device might not complete. + */ + alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + /* + * Restore the original nodemask if it was potentially replaced with + * &cpuset_current_mems_allowed to optimize the fast-path attempt. + */ + if (cpusets_enabled()) + ac.nodemask = nodemask; + page = __alloc_pages_slowpath(alloc_mask, order, &ac); -out: +no_zone: /* * When updating a task's mems_allowed, it is possible to race with * parallel threads in such a way that an allocation can fail while * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. 
*/ - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) { + alloc_mask = gfp_mask; goto retry_cpuset; + } + +out: + if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && + unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + __free_pages(page, order); + page = NULL; + } + + if (kmemcheck_enabled && page) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; } @@ -3544,56 +3949,6 @@ void __free_page_frag(void *addr) } EXPORT_SYMBOL(__free_page_frag); -/* - * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is - * equivalent to alloc_pages. - * - * It should be used when the caller would like to use kmalloc, but since the - * allocation is large, it has to fall back to the page allocator. - */ -struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages(gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages_node(nid, gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -/* - * __free_kmem_pages and free_kmem_pages will free pages allocated with - * alloc_kmem_pages. - */ -void __free_kmem_pages(struct page *page, unsigned int order) -{ - memcg_kmem_uncharge(page, order); - __free_pages(page, order); -} - -void free_kmem_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) { - VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_kmem_pages(virt_to_page((void *)addr), order); - } -} - static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { @@ -3739,7 +4094,7 @@ long si_mem_available(void) int lru; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_page_state(NR_LRU_BASE + lru); + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += zone->watermark[WMARK_LOW]; @@ -3775,7 +4130,7 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = global_page_state(NR_SHMEM); + val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3790,20 +4145,29 @@ void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; - val->sharedram = node_page_state(nid, NR_SHMEM); - val->freeram = node_page_state(nid, NR_FREE_PAGES); + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; - val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); + for (zone_type = 0; zone_type < MAX_NR_ZONES; 
zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone->managed_pages; + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #else - val->totalhigh = 0; - val->freehigh = 0; + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #endif val->mem_unit = PAGE_SIZE; } @@ -3872,6 +4236,7 @@ void show_free_areas(unsigned int filter) unsigned long free_pcp = 0; int cpu; struct zone *zone; + pg_data_t *pgdat; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -3887,26 +4252,73 @@ void show_free_areas(unsigned int filter) " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", - global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_INACTIVE_ANON), - global_page_state(NR_ISOLATED_ANON), - global_page_state(NR_ACTIVE_FILE), - global_page_state(NR_INACTIVE_FILE), - global_page_state(NR_ISOLATED_FILE), - global_page_state(NR_UNEVICTABLE), - global_page_state(NR_FILE_DIRTY), - global_page_state(NR_WRITEBACK), - global_page_state(NR_UNSTABLE_NFS), + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state(NR_UNSTABLE_NFS), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), - global_page_state(NR_FILE_MAPPED), - global_page_state(NR_SHMEM), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); + for_each_online_pgdat(pgdat) { + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " unstable:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) + * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), +#endif + K(node_page_state(pgdat, NR_SHMEM)), + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + node_page_state(pgdat, NR_PAGES_SCANNED), + !pgdat_reclaimable(pgdat) ? 
"yes" : "no"); + } + for_each_populated_zone(zone) { int i; @@ -3928,61 +4340,41 @@ void show_free_areas(unsigned int filter) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" + " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " mapped:%lukB" - " shmem:%lukB" " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" - " unstable:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" - " writeback_tmp:%lukB" - " pages_scanned:%lu" - " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), - K(zone_page_state(zone, NR_ACTIVE_ANON)), - K(zone_page_state(zone, NR_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ACTIVE_FILE)), - K(zone_page_state(zone, NR_INACTIVE_FILE)), - K(zone_page_state(zone, NR_UNEVICTABLE)), - K(zone_page_state(zone, NR_ISOLATED_ANON)), - K(zone_page_state(zone, NR_ISOLATED_FILE)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_FILE_DIRTY)), - K(zone_page_state(zone, NR_WRITEBACK)), - K(zone_page_state(zone, NR_FILE_MAPPED)), - K(zone_page_state(zone, NR_SHMEM)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), - zone_page_state(zone, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, + zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), - K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES)), - K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - K(zone_page_state(zone, NR_PAGES_SCANNED)), - (!zone_reclaimable(zone) ? 
"yes" : "no") - ); + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(" %ld", zone->lowmem_reserve[i]); @@ -4024,7 +4416,7 @@ void show_free_areas(unsigned int filter) hugetlb_show_meminfo(); - printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } @@ -4049,7 +4441,7 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, do { zone_type--; zone = pgdat->node_zones + zone_type; - if (populated_zone(zone)) { + if (managed_zone(zone)) { zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); @@ -4244,7 +4636,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j); @@ -4260,7 +4652,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[1]; + zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; @@ -4281,13 +4673,13 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) struct zone *z; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; pos = 0; for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { for (j = 0; j < nr_nodes; j++) { node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; - if (populated_zone(z)) { + if (managed_zone(z)) { zoneref_set_zone(z, &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); @@ -4390,16 +4782,17 @@ static void build_zonelists(pg_data_t *pgdat) */ int local_memory_node(int node) { - struct zone *zone; + struct zoneref *z; - (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), - NULL, - &zone); - return zone->node; + NULL); + return z->zone->node; } #endif +static void setup_min_unmapped_ratio(void); +static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ static void set_zonelist_order(void) @@ -4415,7 +4808,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); /* @@ -4688,15 +5081,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * If not mirrored_kernelcore and ZONE_MOVABLE exists, range - * from zone_movable_pfn[nid] to end of each node should be - * ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) - continue; - - /* * Check given memblock attribute by firmware which can affect * kernel memory layout. If zone==ZONE_MOVABLE but memory is * mirrored, it's an overlapped memmap init. skip it. 
@@ -4904,13 +5288,18 @@ static void __meminit setup_zone_pageset(struct zone *zone) */ void __init setup_per_cpu_pageset(void) { + struct pglist_data *pgdat; struct zone *zone; for_each_populated_zone(zone) setup_zone_pageset(zone); + + for_each_online_pgdat(pgdat) + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); } -static noinline __init_refok +static noinline __ref int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -5134,6 +5523,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (!mirrored_kernelcore && + *zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5237,28 +5632,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa. */ - if (zone_movable_pfn[nid]) { - if (mirrored_kernelcore) { - unsigned long start_pfn, end_pfn; - struct memblock_region *r; - - for_each_memblock(memory, r) { - start_pfn = clamp(memblock_region_memory_base_pfn(r), - zone_start_pfn, zone_end_pfn); - end_pfn = clamp(memblock_region_memory_end_pfn(r), - zone_start_pfn, zone_end_pfn); - - if (zone_type == ZONE_MOVABLE && - memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - - if (zone_type == ZONE_NORMAL && - !memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - } - } else { - if (zone_type == ZONE_NORMAL) - nr_absent += node_end_pfn - zone_movable_pfn[nid]; + if (mirrored_kernelcore && zone_movable_pfn[nid]) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; } } @@ -5465,6 +5855,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kcompactd_wait); #endif pgdat_page_ext_init(pgdat); + spin_lock_init(&pgdat->lru_lock); + lruvec_init(node_lruvec(pgdat)); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -5514,21 +5906,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) - / 100; - zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; + zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); - spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); - zone->zone_pgdat = pgdat; zone_pcp_init(zone); - /* For bootup, initialized properly in watermark setup */ - mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); - - lruvec_init(&zone->lruvec); if (!size) continue; @@ -5540,7 +5924,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } -static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -5594,11 +5978,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + pgdat->per_cpu_nodestats = NULL; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, @@ -5980,15 +6365,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); - arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; - for (i = 1; i < MAX_NR_ZONES; i++) { + + start_pfn = find_min_pfn_with_active_regions(); + + for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - arch_zone_lowest_possible_pfn[i] = - arch_zone_highest_possible_pfn[i-1]; - arch_zone_highest_possible_pfn[i] = - max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + + end_pfn = max(max_zone_pfn[i], start_pfn); + arch_zone_lowest_possible_pfn[i] = start_pfn; + arch_zone_highest_possible_pfn[i] = end_pfn; + + start_pfn = end_pfn; } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; @@ -6252,6 +6640,9 @@ static void calculate_totalreserve_pages(void) enum zone_type i, j; for_each_online_pgdat(pgdat) { + + pgdat->totalreserve_pages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; @@ -6268,7 +6659,7 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; - zone->totalreserve_pages = max; + pgdat->totalreserve_pages += max; reserve_pages += max; } @@ -6369,10 +6760,6 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - spin_unlock_irqrestore(&zone->lock, flags); } @@ -6395,49 +6782,6 @@ void setup_per_zone_wmarks(void) } /* - * The inactive anon list should be small enough that the VM never has to - * do too much work, but large enough that each inactive page has a chance - * to be referenced again before it is swapped out. 
- * - * The inactive_anon ratio is the target ratio of ACTIVE_ANON to - * INACTIVE_ANON pages on this zone's LRU, maintained by the - * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of - * the anonymous pages are kept on the inactive list. - * - * total target max - * memory ratio inactive anon - * ------------------------------------- - * 10MB 1 5MB - * 100MB 1 50MB - * 1GB 3 250MB - * 10GB 10 0.9GB - * 100GB 31 3GB - * 1TB 101 10GB - * 10TB 320 32GB - */ -static void __meminit calculate_zone_inactive_ratio(struct zone *zone) -{ - unsigned int gb, ratio; - - /* Zone size in gigabytes */ - gb = zone->managed_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; - - zone->inactive_ratio = ratio; -} - -static void __meminit setup_per_zone_inactive_ratio(void) -{ - struct zone *zone; - - for_each_zone(zone) - calculate_zone_inactive_ratio(zone); -} - -/* * Initialise min_free_kbytes. * * For small machines we want it small (128k min). For large machines @@ -6482,7 +6826,12 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); - setup_per_zone_inactive_ratio(); + +#ifdef CONFIG_NUMA + setup_min_unmapped_ratio(); + setup_min_slab_ratio(); +#endif + return 0; } core_initcall(init_per_zone_wmark_min) @@ -6524,35 +6873,58 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA +static void setup_min_unmapped_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_unmapped_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * + sysctl_min_unmapped_ratio) / 100; +} + + int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; - for_each_zone(zone) - zone->min_unmapped_pages = (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; + setup_min_unmapped_ratio(); + return 0; } +static void setup_min_slab_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * + sysctl_min_slab_ratio) / 100; +} + int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; - for_each_zone(zone) - zone->min_slab_pages = (zone->managed_pages * - sysctl_min_slab_ratio) / 100; + setup_min_slab_ratio(); + return 0; } #endif @@ -6630,6 +7002,17 @@ static int __init set_hashdist(char *str) __setup("hashdist=", set_hashdist); #endif +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES +/* + * Returns the number of pages that arch has reserved but + * is not known to alloc_large_system_hash(). 
+ */ +static unsigned long __init arch_reserved_kernel_pages(void) +{ + return 0; +} +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -6654,6 +7037,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; + numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SHIFT < 20) @@ -6725,98 +7109,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct zone *zone, - unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; -#else - return zone->pageblock_flags; -#endif /* CONFIG_SPARSEMEM */ -} - -static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#else - pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#endif /* CONFIG_SPARSEMEM */ -} - -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - struct zone *zone; - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; - - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; -} - -/** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @flags: The flags to set - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest - * @mask: mask of bits that the caller is interested in - */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - struct zone *zone; - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long old_word, word; - - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); - - word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } -} - /* * This function checks whether pageblock includes unmovable pages or not. 
 * If @count is not zero, it is okay to include less @count unmovable pages
@@ -6864,7 +7156,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 * We can't use page_count without pin a page
 * because another CPU can free compound page.
 * This check already skips compound tails of THP
- * because their page->_count is zero at all time.
+ * because their page->_refcount is zero at all time.
 */
 if (!page_ref_count(page)) {
 if (PageBuddy(page))
@@ -7177,7 +7469,8 @@ void zone_pcp_reset(struct zone *zone)
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * All pages in the range must be isolated before calling this.
+ * All pages in the range must be in a single zone and isolated
+ * before calling this.
 */
void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
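
The comment fix in has_unmovable_pages() above tracks the rename of the page reference count field from page->_count to page->_refcount; the check itself already goes through the page_ref_count() accessor rather than reading the field directly, which is why only the comment needed updating. For illustration, a minimal sketch of that accessor as it is assumed to appear in include/linux/page_ref.h (not part of this diff, shown as a fragment):

static inline int page_ref_count(struct page *page)
{
	/* read the renamed field through the accessor, never directly */
	return atomic_read(&page->_refcount);
}

Keeping callers on the page_ref_* helpers is what lets the underlying field rename stay a comment-only change here.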