author    Linus Torvalds <torvalds@linux-foundation.org>  2022-08-10 11:18:00 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2022-08-10 11:18:00 -0700
commit    b1701d5e29eb0a102aa3393319b3e4eb1a19c6ea (patch)
tree      7bcb08dc82b47c81ac39b329fa3e5b41485cc054 /mm
parent    c235698355fa94df7073b51befda7d4be00a0e23 (diff)
parent    a9e9c93966afdaae74a6a7533552391646b93f2c (diff)
Merge tag 'mm-stable-2022-08-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull remaining MM updates from Andrew Morton:
 "Three patch series - two that perform cleanups and one feature:

   - hugetlb_vmemmap cleanups from Muchun Song

   - hardware poisoning support for 1GB hugepages, from Naoya Horiguchi

   - highmem documentation fixups from Fabio De Francesco"

* tag 'mm-stable-2022-08-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (23 commits)
  Documentation/mm: add details about kmap_local_page() and preemption
  highmem: delete a sentence from kmap_local_page() kdocs
  Documentation/mm: rrefer kmap_local_page() and avoid kmap()
  Documentation/mm: avoid invalid use of addresses from kmap_local_page()
  Documentation/mm: don't kmap*() pages which can't come from HIGHMEM
  highmem: specify that kmap_local_page() is callable from interrupts
  highmem: remove unneeded spaces in kmap_local_page() kdocs
  mm, hwpoison: enable memory error handling on 1GB hugepage
  mm, hwpoison: skip raw hwpoison page in freeing 1GB hugepage
  mm, hwpoison: make __page_handle_poison returns int
  mm, hwpoison: set PG_hwpoison for busy hugetlb pages
  mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage
  mm, hwpoison, hugetlb: support saving mechanism of raw error pages
  mm/hugetlb: make pud_huge() and follow_huge_pud() aware of non-present pud entry
  mm/hugetlb: check gigantic_page_runtime_supported() in return_unused_surplus_pages()
  mm: hugetlb_vmemmap: use PTRS_PER_PTE instead of PMD_SIZE / PAGE_SIZE
  mm: hugetlb_vmemmap: move code comments to vmemmap_dedup.rst
  mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability
  mm: hugetlb_vmemmap: replace early_param() with core_param()
  mm: hugetlb_vmemmap: move vmemmap code related to HugeTLB to hugetlb_vmemmap.c
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c          |  73
-rw-r--r--  mm/hugetlb_vmemmap.c  | 589
-rw-r--r--  mm/hugetlb_vmemmap.h  |  45
-rw-r--r--  mm/memory-failure.c   | 179
-rw-r--r--  mm/sparse-vmemmap.c   | 399
5 files changed, 684 insertions, 601 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f044962ad9df..0aee2f3ae15c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1535,7 +1535,14 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- if (hugetlb_vmemmap_alloc(h, page)) {
+ /*
+ * If we don't know which subpages are hwpoisoned, we can't free
+ * the hugepage, so it's leaked intentionally.
+ */
+ if (HPageRawHwpUnreliable(page))
+ return;
+
+ if (hugetlb_vmemmap_restore(h, page)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1547,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
return;
}
+ /*
+ * Move PageHWPoison flag from head page to the raw error pages,
+ * which makes any healthy subpages reusable.
+ */
+ if (unlikely(PageHWPoison(page)))
+ hugetlb_clear_page_hwpoison(page);
+
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1612,7 +1626,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
static inline void flush_free_hpage_work(struct hstate *h)
{
- if (hugetlb_optimize_vmemmap_pages(h))
+ if (hugetlb_vmemmap_optimizable(h))
flush_work(&free_hpage_work);
}
@@ -1734,7 +1748,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
- hugetlb_vmemmap_free(h, page);
+ hugetlb_vmemmap_optimize(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
@@ -2107,17 +2121,8 @@ retry:
* Attempt to allocate vmemmap here so that we can take
* appropriate action on failure.
*/
- rc = hugetlb_vmemmap_alloc(h, head);
+ rc = hugetlb_vmemmap_restore(h, head);
if (!rc) {
- /*
- * Move PageHWPoison flag from head page to the raw
- * error page, which makes any subpages rather than
- * the error page reusable.
- */
- if (PageHWPoison(head) && page != head) {
- SetPageHWPoison(page);
- ClearPageHWPoison(head);
- }
update_and_free_page(h, head, false);
} else {
spin_lock_irq(&hugetlb_lock);
@@ -2432,8 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h,
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
- /* Cannot return gigantic pages currently */
- if (hstate_is_gigantic(h))
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
goto out;
/*
@@ -3182,8 +3186,10 @@ static void __init report_hugepages(void)
char buf[32];
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
- pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+ pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
buf, h->free_huge_pages);
+ pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
+ hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
}
@@ -3421,7 +3427,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
remove_hugetlb_page_for_demote(h, page, false);
spin_unlock_irq(&hugetlb_lock);
- rc = hugetlb_vmemmap_alloc(h, page);
+ rc = hugetlb_vmemmap_restore(h, page);
if (rc) {
/* Allocation of vmemmap failed, we cannot demote the page */
spin_lock_irq(&hugetlb_lock);
@@ -4111,7 +4117,6 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
- hugetlb_vmemmap_init(h);
parsed_hstate = h;
}
@@ -6985,10 +6990,38 @@ struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
- if (flags & (FOLL_GET | FOLL_PIN))
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t pte;
+
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
- return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+retry:
+ ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
+ if (!pud_huge(*pud))
+ goto out;
+ pte = huge_ptep_get((pte_t *)pud);
+ if (pte_present(pte)) {
+ page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
+ } else {
+ if (is_hugetlb_entry_migration(pte)) {
+ spin_unlock(ptl);
+ __migration_entry_wait(mm, (pte_t *)pud, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
+ return page;
}
struct page * __weak
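
The rewritten follow_huge_pud() above uses a lock/recheck/retry shape: take the page-table lock, re-validate the entry under the lock, and if a transient state (a migration entry) is found, drop the lock, wait for the migration to settle, and start over. Below is a minimal userspace sketch of that shape, with a pthread mutex standing in for the page-table lock; the entry_state enum, lookup() and the condition variable are illustrative inventions, not kernel APIs.

#include <pthread.h>
#include <stdio.h>

enum entry_state { ENTRY_PRESENT, ENTRY_MIGRATING, ENTRY_ABSENT };

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t settled = PTHREAD_COND_INITIALIZER;
static enum entry_state state = ENTRY_PRESENT;

/* Sketch of the lock/recheck/retry flow used by follow_huge_pud(). */
static int lookup(void)
{
retry:
        pthread_mutex_lock(&ptl);
        switch (state) {
        case ENTRY_ABSENT:              /* entry went away: fail the lookup */
                pthread_mutex_unlock(&ptl);
                return -1;
        case ENTRY_MIGRATING:
                /* Transient state: wait (this releases the lock while
                 * sleeping), then retry; analogous to
                 * __migration_entry_wait() in the kernel. */
                pthread_cond_wait(&settled, &ptl);
                pthread_mutex_unlock(&ptl);
                goto retry;
        default:                        /* ENTRY_PRESENT: stable, success */
                pthread_mutex_unlock(&ptl);
                return 0;
        }
}

int main(void)
{
        /* Single-threaded demo: the entry is stable, so lookup() succeeds. */
        printf("lookup: %d\n", lookup());
        return 0;
}
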
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 1362feb3c6c9..20f414c0379f 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Optimize vmemmap pages associated with HugeTLB
+ * HugeTLB Vmemmap Optimization (HVO)
*
- * Copyright (c) 2020, Bytedance. All rights reserved.
+ * Copyright (c) 2020, ByteDance. All rights reserved.
*
* Author: Muchun Song <songmuchun@bytedance.com>
*
@@ -10,84 +10,443 @@
*/
#define pr_fmt(fmt) "HugeTLB: " fmt
-#include <linux/memory.h>
+#include <linux/pgtable.h>
+#include <linux/bootmem_info.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
-/*
- * There are a lot of struct page structures associated with each HugeTLB page.
- * For tail pages, the value of compound_head is the same. So we can reuse first
- * page of head page structures. We map the virtual addresses of all the pages
- * of tail page structures to the head page struct, and then free these page
- * frames. Therefore, we need to reserve one pages as vmemmap areas.
+/**
+ * struct vmemmap_remap_walk - walk vmemmap page table
+ *
+ * @remap_pte: called for each lowest-level entry (PTE).
+ * @nr_walked: the number of walked pte.
+ * @reuse_page: the page which is reused for the tail vmemmap pages.
+ * @reuse_addr: the virtual address of the @reuse_page page.
+ * @vmemmap_pages: the list head of the vmemmap pages that can be freed
+ * (when freeing) or that are remapped in (when restoring).
*/
-#define RESERVE_VMEMMAP_NR 1U
-#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-
-enum vmemmap_optimize_mode {
- VMEMMAP_OPTIMIZE_OFF,
- VMEMMAP_OPTIMIZE_ON,
+struct vmemmap_remap_walk {
+ void (*remap_pte)(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk);
+ unsigned long nr_walked;
+ struct page *reuse_page;
+ unsigned long reuse_addr;
+ struct list_head *vmemmap_pages;
};
-DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
- hugetlb_optimize_vmemmap_key);
-EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+ pmd_t __pmd;
+ int i;
+ unsigned long addr = start;
+ struct page *page = pmd_page(*pmd);
+ pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+
+ if (!pgtable)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+ for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ pgprot_t pgprot = PAGE_KERNEL;
+
+ entry = mk_pte(page + i, pgprot);
+ pte = pte_offset_kernel(&__pmd, addr);
+ set_pte_at(&init_mm, addr, pte, entry);
+ }
-static enum vmemmap_optimize_mode vmemmap_optimize_mode =
- IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pmd_leaf(*pmd))) {
+ /*
+ * Higher order allocations from buddy allocator must be able to
+ * be treated as independent small pages (as they can be freed
+ * individually).
+ */
+ if (!PageReserved(page))
+ split_page(page, get_order(PMD_SIZE));
+
+ /* Make pte visible before pmd. See comment in pmd_install(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+ } else {
+ pte_free_kernel(&init_mm, pgtable);
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
-static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
- if (vmemmap_optimize_mode == to)
- return;
+ int leaf;
- if (to == VMEMMAP_OPTIMIZE_OFF)
- static_branch_dec(&hugetlb_optimize_vmemmap_key);
- else
- static_branch_inc(&hugetlb_optimize_vmemmap_key);
- WRITE_ONCE(vmemmap_optimize_mode, to);
+ spin_lock(&init_mm.page_table_lock);
+ leaf = pmd_leaf(*pmd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ if (!leaf)
+ return 0;
+
+ return __split_vmemmap_huge_pmd(pmd, start);
+}
+
+static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+
+ /*
+ * The reuse_page is found 'first' in the table walk, before we start
+ * remapping (that is, before @walk->remap_pte is called).
+ */
+ if (!walk->reuse_page) {
+ walk->reuse_page = pte_page(*pte);
+ /*
+ * Because the reuse address is part of the range that we are
+ * walking, skip the reuse address range.
+ */
+ addr += PAGE_SIZE;
+ pte++;
+ walk->nr_walked++;
+ }
+
+ for (; addr != end; addr += PAGE_SIZE, pte++) {
+ walk->remap_pte(pte, addr, walk);
+ walk->nr_walked++;
+ }
}
-static int __init hugetlb_vmemmap_early_param(char *buf)
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
{
- bool enable;
- enum vmemmap_optimize_mode mode;
+ pmd_t *pmd;
+ unsigned long next;
- if (kstrtobool(buf, &enable))
- return -EINVAL;
+ pmd = pmd_offset(pud, addr);
+ do {
+ int ret;
- mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
- vmemmap_optimize_mode_switch(mode);
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+ if (ret)
+ return ret;
+
+ next = pmd_addr_end(addr, end);
+ vmemmap_pte_range(pmd, addr, next, walk);
+ } while (pmd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ int ret;
+
+ next = pud_addr_end(addr, end);
+ ret = vmemmap_pmd_range(pud, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pud++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ int ret;
+
+ next = p4d_addr_end(addr, end);
+ ret = vmemmap_pud_range(p4d, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (p4d++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ unsigned long addr = start;
+ unsigned long next;
+ pgd_t *pgd;
+
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+
+ pgd = pgd_offset_k(addr);
+ do {
+ int ret;
+
+ next = pgd_addr_end(addr, end);
+ ret = vmemmap_p4d_range(pgd, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pgd++, addr = next, addr != end);
+
+ /*
+ * We only change the mapping of the vmemmap virtual address range
+ * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
+ * belongs to the range.
+ */
+ flush_tlb_kernel_range(start + PAGE_SIZE, end);
return 0;
}
-early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
/*
- * Previously discarded vmemmap pages will be allocated and remapping
- * after this function returns zero.
+ * Free a vmemmap page. A vmemmap page can be allocated from the memblock
+ * allocator or buddy allocator. If the PG_reserved flag is set, it means
* that it was allocated from the memblock allocator; free it via
+ * free_bootmem_page(). Otherwise, use __free_page().
*/
-int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
+static inline void free_vmemmap_page(struct page *page)
+{
+ if (PageReserved(page))
+ free_bootmem_page(page);
+ else
+ __free_page(page);
+}
+
+/* Free a list of the vmemmap pages */
+static void free_vmemmap_page_list(struct list_head *list)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
+ free_vmemmap_page(page);
+ }
+}
+
+static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ /*
+ * Remap the tail pages as read-only to catch illegal write operation
+ * to the tail pages.
+ */
+ pgprot_t pgprot = PAGE_KERNEL_RO;
+ pte_t entry = mk_pte(walk->reuse_page, pgprot);
+ struct page *page = pte_page(*pte);
+
+ list_add_tail(&page->lru, walk->vmemmap_pages);
+ set_pte_at(&init_mm, addr, pte, entry);
+}
+
+/*
+ * How many struct page structs need to be reset. When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * must not be copied to the tail struct page structs, since the invalid
+ * values would be caught by free_tail_pages_check(). To avoid the
+ * "corrupted mapping in tail page" message, we need to reset at least 3
+ * struct page structs (one head and two tail struct page structs).
+ */
+#define NR_RESET_STRUCT_PAGE 3
+
+static inline void reset_struct_pages(struct page *start)
+{
+ int i;
+ struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+ for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
+ memcpy(start + i, from, sizeof(*from));
+}
+
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ pgprot_t pgprot = PAGE_KERNEL;
+ struct page *page;
+ void *to;
+
+ BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+ reset_struct_pages(to);
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+/**
+ * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
+ * to the page which @reuse is mapped to, then free the
+ * vmemmap pages which the range was mapped to.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_free(unsigned long start, unsigned long end,
+ unsigned long reuse)
+{
+ int ret;
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_remap_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /*
+ * In order to make the remapping routine most efficient for huge pages,
+ * the vmemmap page table walk follows these rules (see
+ * vmemmap_pte_range() for more details):
+ *
+ * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+ * should be contiguous.
+ * - The @reuse address is part of the range [@reuse, @end) that we are
+ * walking which is passed to vmemmap_remap_range().
+ * - The @reuse address is the first in the complete range.
+ *
+ * So we need to make sure that @start and @reuse meet the above rules.
+ */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ mmap_read_lock(&init_mm);
+ ret = vmemmap_remap_range(reuse, end, &walk);
+ if (ret && walk.nr_walked) {
+ end = reuse + walk.nr_walked * PAGE_SIZE;
+ /*
+ * vmemmap_pages contains pages from the previous
+ * vmemmap_remap_range call which failed. These
+ * are pages which were removed from the vmemmap.
+ * They will be restored in the following call.
+ */
+ walk = (struct vmemmap_remap_walk) {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ vmemmap_remap_range(reuse, end, &walk);
+ }
+ mmap_read_unlock(&init_mm);
+
+ free_vmemmap_page_list(&vmemmap_pages);
+
+ return ret;
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+ gfp_t gfp_mask, struct list_head *list)
+{
+ unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+ int nid = page_to_nid((struct page *)start);
+ struct page *page, *next;
+
+ while (nr_pages--) {
+ page = alloc_pages_node(nid, gfp_mask, 0);
+ if (!page)
+ goto out;
+ list_add_tail(&page->lru, list);
+ }
+
+ return 0;
+out:
+ list_for_each_entry_safe(page, next, list, lru)
+ __free_pages(page, 0);
+ return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap each page of the vmemmap virtual address
+ * range [@start, @end) to a page freshly allocated
+ * onto the @vmemmap_pages list.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ * @gfp_mask: GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+ unsigned long reuse, gfp_t gfp_mask)
+{
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /* See the comment in the vmemmap_remap_free(). */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ return -ENOMEM;
+
+ mmap_read_lock(&init_mm);
+ vmemmap_remap_range(reuse, end, &walk);
+ mmap_read_unlock(&init_mm);
+
+ return 0;
+}
+
+DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
+EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
+
+static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
+core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
+
+/**
+ * hugetlb_vmemmap_restore - restore previously optimized (by
+ * hugetlb_vmemmap_optimize()) vmemmap pages which
+ * will be reallocated and remapped.
+ * @h: struct hstate.
+ * @head: the head page whose vmemmap pages will be restored.
+ *
+ * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
+ * negative error code otherwise.
+ */
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
int ret;
- unsigned long vmemmap_addr = (unsigned long)head;
- unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
+ unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+ unsigned long vmemmap_reuse;
if (!HPageVmemmapOptimized(head))
return 0;
- vmemmap_addr += RESERVE_VMEMMAP_SIZE;
- vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
- vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
- vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
+ vmemmap_reuse = vmemmap_start;
+ vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
/*
- * The pages which the vmemmap virtual address range [@vmemmap_addr,
+ * The pages which the vmemmap virtual address range [@vmemmap_start,
* @vmemmap_end) are mapped to are freed to the buddy allocator, and
* the range is mapped to the page which @vmemmap_reuse is mapped to.
* When a HugeTLB page is freed to the buddy allocator, previously
* discarded vmemmap pages must be allocated and remapped.
*/
- ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
if (!ret) {
ClearHPageVmemmapOptimized(head);
@@ -97,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
return ret;
}
-static unsigned int vmemmap_optimizable_pages(struct hstate *h,
- struct page *head)
+/* Return true iff the HugeTLB page's vmemmap should and can be optimized. */
+static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
- if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
- return 0;
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return false;
+
+ if (!hugetlb_vmemmap_optimizable(h))
+ return false;
if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
pmd_t *pmdp, pmd;
@@ -144,118 +506,73 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h,
* +-------------------------------------------+
*/
if (PageVmemmapSelfHosted(vmemmap_page))
- return 0;
+ return false;
}
- return hugetlb_optimize_vmemmap_pages(h);
+ return true;
}
-void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
+/**
+ * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
+ * @h: struct hstate.
+ * @head: the head page whose vmemmap pages will be optimized.
+ *
+ * This function only tries to optimize @head's vmemmap pages and does not
+ * guarantee that the optimization will succeed after it returns. The caller
+ * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
+ * have been optimized.
+ */
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
- unsigned long vmemmap_addr = (unsigned long)head;
- unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
+ unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+ unsigned long vmemmap_reuse;
- vmemmap_pages = vmemmap_optimizable_pages(h, head);
- if (!vmemmap_pages)
+ if (!vmemmap_should_optimize(h, head))
return;
static_branch_inc(&hugetlb_optimize_vmemmap_key);
- vmemmap_addr += RESERVE_VMEMMAP_SIZE;
- vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
- vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
+ vmemmap_reuse = vmemmap_start;
+ vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
/*
- * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
+ * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
* to the page which @vmemmap_reuse is mapped to, then free the pages
- * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
+ * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
*/
- if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+ if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
static_branch_dec(&hugetlb_optimize_vmemmap_key);
else
SetHPageVmemmapOptimized(head);
}
-void __init hugetlb_vmemmap_init(struct hstate *h)
-{
- unsigned int nr_pages = pages_per_huge_page(h);
- unsigned int vmemmap_pages;
-
- /*
- * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
- * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP,
- * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
- */
- BUILD_BUG_ON(__NR_USED_SUBPAGE >=
- RESERVE_VMEMMAP_SIZE / sizeof(struct page));
-
- if (!is_power_of_2(sizeof(struct page))) {
- pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
- static_branch_disable(&hugetlb_optimize_vmemmap_key);
- return;
- }
-
- vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
- /*
- * The head page is not to be freed to buddy allocator, the other tail
- * pages will map to the head page, so they can be freed.
- *
- * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
- * on some architectures (e.g. aarch64). See Documentation/arm64/
- * hugetlbpage.rst for more details.
- */
- if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
- h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
-
- pr_info("can optimize %d vmemmap pages for %s\n",
- h->optimize_vmemmap_pages, h->name);
-}
-
-#ifdef CONFIG_PROC_SYSCTL
-static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length,
- loff_t *ppos)
-{
- int ret;
- enum vmemmap_optimize_mode mode;
- static DEFINE_MUTEX(sysctl_mutex);
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&sysctl_mutex);
- mode = vmemmap_optimize_mode;
- table->data = &mode;
- ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (write && !ret)
- vmemmap_optimize_mode_switch(mode);
- mutex_unlock(&sysctl_mutex);
-
- return ret;
-}
-
static struct ctl_table hugetlb_vmemmap_sysctls[] = {
{
.procname = "hugetlb_optimize_vmemmap",
- .maxlen = sizeof(enum vmemmap_optimize_mode),
+ .data = &vmemmap_optimize_enabled,
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = hugetlb_optimize_vmemmap_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .proc_handler = proc_dobool,
},
{ }
};
-static __init int hugetlb_vmemmap_sysctls_init(void)
+static int __init hugetlb_vmemmap_init(void)
{
- /*
- * If "struct page" crosses page boundaries, the vmemmap pages cannot
- * be optimized.
- */
- if (is_power_of_2(sizeof(struct page)))
- register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
-
+ /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
+ BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
+
+ if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
+ const struct hstate *h;
+
+ for_each_hstate(h) {
+ if (hugetlb_vmemmap_optimizable(h)) {
+ register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
+ break;
+ }
+ }
+ }
return 0;
}
-late_initcall(hugetlb_vmemmap_sysctls_init);
-#endif /* CONFIG_PROC_SYSCTL */
+late_initcall(hugetlb_vmemmap_init);
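
One detail of vmemmap_remap_free() above deserves a note: the pages already unmapped are kept on @vmemmap_pages, so when the walk fails partway the function runs a second walk with vmemmap_restore_pte over exactly the nr_walked range to put them back before reporting the error. The following is a standalone sketch of that walk-then-rollback shape; step(), undo() and the failure point are invented for illustration.

#include <stdio.h>

#define NR_SLOTS 8

static int step(int i)  { return i == 5 ? -1 : 0; }     /* fail at slot 5 */
static void undo(int i) { printf("restore slot %d\n", i); }

/* Walk all slots; on failure, undo exactly the slots already walked,
 * mirroring the second vmemmap_remap_range() call in vmemmap_remap_free(). */
static int remap_range(void)
{
        int nr_walked = 0, ret = 0, i;

        for (i = 0; i < NR_SLOTS; i++) {
                ret = step(i);
                if (ret)
                        break;
                nr_walked++;
        }
        if (ret && nr_walked) {
                for (i = 0; i < nr_walked; i++)
                        undo(i);
        }
        return ret;
}

int main(void)
{
        printf("remap_range: %d\n", remap_range());
        return 0;
}
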
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 109b0a53b6fe..25bd0e002431 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Optimize vmemmap pages associated with HugeTLB
+ * HugeTLB Vmemmap Optimization (HVO)
*
- * Copyright (c) 2020, Bytedance. All rights reserved.
+ * Copyright (c) 2020, ByteDance. All rights reserved.
*
* Author: Muchun Song <songmuchun@bytedance.com>
*/
@@ -11,35 +11,50 @@
#include <linux/hugetlb.h>
#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head);
-void hugetlb_vmemmap_free(struct hstate *h, struct page *head);
-void hugetlb_vmemmap_init(struct hstate *h);
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
/*
- * How many vmemmap pages associated with a HugeTLB page that can be
- * optimized and freed to the buddy allocator.
+ * Reserve one vmemmap page; all tail vmemmap addresses are mapped to it. See
+ * Documentation/vm/vmemmap_dedup.rst.
*/
-static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
+#define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE
+
+static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
- return h->optimize_vmemmap_pages;
+ return pages_per_huge_page(h) * sizeof(struct page);
+}
+
+/*
+ * Return the amount of vmemmap, in bytes, associated with a HugeTLB page
+ * that can be optimized and freed to the buddy allocator.
+ */
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
+{
+ int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+ if (!is_power_of_2(sizeof(struct page)))
+ return 0;
+ return size > 0 ? size : 0;
}
#else
-static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
+static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
return 0;
}
-static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
+static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
}
-static inline void hugetlb_vmemmap_init(struct hstate *h)
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
+ return 0;
}
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
-static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
+static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
{
- return 0;
+ return hugetlb_vmemmap_optimizable_size(h) != 0;
}
-#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
#endif /* _LINUX_HUGETLB_VMEMMAP_H */
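
The arithmetic in hugetlb_vmemmap_optimizable_size() is easy to check by hand. Assuming 4 KiB base pages and a 64-byte struct page (typical of x86_64 configurations, but an assumption here rather than anything this header guarantees), a 2 MiB hugepage has 512 struct pages occupying 32 KiB of vmemmap; one 4 KiB page is reserved and 28 KiB can be freed. For a 1 GiB hugepage, nearly all of its 16 MiB of vmemmap becomes freeable. A standalone sketch of the same computation:

#include <stdio.h>

/* Assumed constants, not taken from any particular kernel config. */
#define PAGE_SIZE        4096UL
#define STRUCT_PAGE_SIZE 64UL
#define RESERVE          PAGE_SIZE      /* one reused vmemmap page */

static unsigned long vmemmap_size(unsigned long huge_page_size)
{
        return (huge_page_size / PAGE_SIZE) * STRUCT_PAGE_SIZE;
}

int main(void)
{
        unsigned long sizes[] = { 2UL << 20, 1UL << 30 };  /* 2 MiB, 1 GiB */
        int i;

        for (i = 0; i < 2; i++) {
                unsigned long vs = vmemmap_size(sizes[i]);
                unsigned long freeable = vs > RESERVE ? vs - RESERVE : 0;

                printf("%4lu MiB hugepage: vmemmap %lu KiB, freeable %lu KiB\n",
                       sizes[i] >> 20, vs >> 10, freeable >> 10);
        }
        return 0;
}

With these constants the output is 32 KiB/28 KiB for 2 MiB pages and 16384 KiB/16380 KiB for 1 GiB pages, matching the size-minus-reserve formula above.
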
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a7a228ad04a..14439806b5ef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,7 +74,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
-static bool __page_handle_poison(struct page *page)
+/*
+ * Return values:
+ * 1: the page is dissolved (if needed) and taken off the buddy list,
+ * 0: the page is dissolved (if needed) but not taken off the buddy list,
+ * < 0: failed to dissolve.
+ */
+static int __page_handle_poison(struct page *page)
{
int ret;
@@ -84,7 +90,7 @@ static bool __page_handle_poison(struct page *page)
ret = take_page_off_buddy(page);
zone_pcp_enable(page_zone(page));
- return ret > 0;
+ return ret;
}
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
@@ -94,7 +100,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
* Doing this check for free pages is also fine since dissolve_free_huge_page
* returns 0 for non-hugetlb pages as well.
*/
- if (!__page_handle_poison(page))
+ if (__page_handle_poison(page) <= 0)
/*
* We could fail to take off the target page from buddy
* for example due to racy page allocation, but that's
@@ -762,7 +768,6 @@ static const char * const action_page_types[] = {
[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
[MF_MSG_HUGE] = "huge page",
[MF_MSG_FREE_HUGE] = "free huge page",
- [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
[MF_MSG_UNMAP_FAILED] = "unmapping failed page",
[MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
[MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
@@ -1078,7 +1083,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
unlock_page(hpage);
} else {
- res = MF_FAILED;
unlock_page(hpage);
/*
* migration entry prevents later access on error hugepage,
@@ -1086,9 +1090,11 @@ static int me_huge_page(struct page_state *ps, struct page *p)
* subpages.
*/
put_page(hpage);
- if (__page_handle_poison(p)) {
+ if (__page_handle_poison(p) >= 0) {
page_ref_inc(p);
res = MF_RECOVERED;
+ } else {
+ res = MF_FAILED;
}
}
@@ -1662,6 +1668,113 @@ unlock:
EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
#endif /* CONFIG_FS_DAX */
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Struct raw_hwp_page represents information about a "raw error page",
+ * forming a singly linked list originating from the ->private field of
+ * the SUBPAGE_INDEX_HWPOISON-th tail page.
+ */
+struct raw_hwp_page {
+ struct llist_node node;
+ struct page *page;
+};
+
+static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
+{
+ return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
+}
+
+static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+ struct llist_head *head;
+ struct llist_node *t, *tnode;
+ unsigned long count = 0;
+
+ head = raw_hwp_list_head(hpage);
+ llist_for_each_safe(tnode, t, head->first) {
+ struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+ if (move_flag)
+ SetPageHWPoison(p->page);
+ kfree(p);
+ count++;
+ }
+ llist_del_all(head);
+ return count;
+}
+
+static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
+{
+ struct llist_head *head;
+ struct raw_hwp_page *raw_hwp;
+ struct llist_node *t, *tnode;
+ int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
+
+ /*
+ * Once the hwpoisoned hugepage has lost reliable raw error info,
+ * there is little point in keeping precise info about additional
+ * errors, so skip recording further raw error info.
+ */
+ if (HPageRawHwpUnreliable(hpage))
+ return -EHWPOISON;
+ head = raw_hwp_list_head(hpage);
+ llist_for_each_safe(tnode, t, head->first) {
+ struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+ if (p->page == page)
+ return -EHWPOISON;
+ }
+
+ raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
+ if (raw_hwp) {
+ raw_hwp->page = page;
+ llist_add(&raw_hwp->node, head);
+ /* the first error event will be counted in action_result(). */
+ if (ret)
+ num_poisoned_pages_inc();
+ } else {
+ /*
+ * Failed to save raw error info. We no longer trace all
+ * hwpoisoned subpages, and we must refuse to free/dissolve
+ * this hwpoisoned hugepage.
+ */
+ SetHPageRawHwpUnreliable(hpage);
+ /*
+ * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
+ * used any more, so free it.
+ */
+ __free_raw_hwp_pages(hpage, false);
+ }
+ return ret;
+}
+
+static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+ /*
+ * HPageVmemmapOptimized hugepages can't be freed because struct
+ * pages for tail pages are required but they don't exist.
+ */
+ if (move_flag && HPageVmemmapOptimized(hpage))
+ return 0;
+
+ /*
+ * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
+ * definition.
+ */
+ if (HPageRawHwpUnreliable(hpage))
+ return 0;
+
+ return __free_raw_hwp_pages(hpage, move_flag);
+}
+
+void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+ if (HPageRawHwpUnreliable(hpage))
+ return;
+ ClearPageHWPoison(hpage);
+ free_raw_hwp_pages(hpage, true);
+}
+
/*
* Called from hugetlb code with hugetlb_lock held.
*
@@ -1693,10 +1806,11 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
count_increased = true;
} else {
ret = -EBUSY;
- goto out;
+ if (!(flags & MF_NO_RETRY))
+ goto out;
}
- if (TestSetPageHWPoison(head)) {
+ if (hugetlb_set_page_hwpoison(head, page)) {
ret = -EHWPOISON;
goto out;
}
@@ -1708,7 +1822,6 @@ out:
return ret;
}
-#ifdef CONFIG_HUGETLB_PAGE
/*
* Taking refcount of hugetlb pages needs extra care about race conditions
* with basic operations like hugepage allocation/free/demotion.
@@ -1721,7 +1834,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
struct page *p = pfn_to_page(pfn);
struct page *head;
unsigned long page_flags;
- bool retry = true;
*hugetlb = 1;
retry:
@@ -1737,8 +1849,8 @@ retry:
}
return res;
} else if (res == -EBUSY) {
- if (retry) {
- retry = false;
+ if (!(flags & MF_NO_RETRY)) {
+ flags |= MF_NO_RETRY;
goto retry;
}
action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
@@ -1749,7 +1861,7 @@ retry:
lock_page(head);
if (hwpoison_filter(p)) {
- ClearPageHWPoison(head);
+ hugetlb_clear_page_hwpoison(head);
res = -EOPNOTSUPP;
goto out;
}
@@ -1760,10 +1872,11 @@ retry:
*/
if (res == 0) {
unlock_page(head);
- res = MF_FAILED;
- if (__page_handle_poison(p)) {
+ if (__page_handle_poison(p) >= 0) {
page_ref_inc(p);
res = MF_RECOVERED;
+ } else {
+ res = MF_FAILED;
}
action_result(pfn, MF_MSG_FREE_HUGE, res);
return res == MF_RECOVERED ? 0 : -EBUSY;
@@ -1771,21 +1884,6 @@ retry:
page_flags = head->flags;
- /*
- * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
- * simply disable it. In order to make it work properly, we need
- * make sure that:
- * - conversion of a pud that maps an error hugetlb into hwpoison
- * entry properly works, and
- * - other mm code walking over page table is aware of pud-aligned
- * hwpoison entries.
- */
- if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
- action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
- res = -EBUSY;
- goto out;
- }
-
if (!hwpoison_user_mappings(p, pfn, flags, head)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
@@ -1804,6 +1902,10 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
return 0;
}
+static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
+{
+ return 0;
+}
#endif /* CONFIG_HUGETLB_PAGE */
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
@@ -2209,6 +2311,7 @@ int unpoison_memory(unsigned long pfn)
struct page *p;
int ret = -EBUSY;
int freeit = 0;
+ unsigned long count = 1;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -2256,6 +2359,13 @@ int unpoison_memory(unsigned long pfn)
ret = get_hwpoison_page(p, MF_UNPOISON);
if (!ret) {
+ if (PageHuge(p)) {
+ count = free_raw_hwp_pages(page, false);
+ if (count == 0) {
+ ret = -EBUSY;
+ goto unlock_mutex;
+ }
+ }
ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
} else if (ret < 0) {
if (ret == -EHWPOISON) {
@@ -2264,6 +2374,13 @@ int unpoison_memory(unsigned long pfn)
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
pfn, &unpoison_rs);
} else {
+ if (PageHuge(p)) {
+ count = free_raw_hwp_pages(page, false);
+ if (count == 0) {
+ ret = -EBUSY;
+ goto unlock_mutex;
+ }
+ }
freeit = !!TestClearPageHWPoison(p);
put_page(page);
@@ -2276,7 +2393,7 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
mutex_unlock(&mf_mutex);
if (!ret || freeit) {
- num_poisoned_pages_dec();
+ num_poisoned_pages_sub(count);
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), &unpoison_rs);
}
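
The raw error tracking added above — one node per poisoned subpage hanging off a designated tail page, with duplicate reports rejected and an "unreliable" flag set once a node cannot be allocated — can be modelled compactly in userspace. The sketch below is written under those assumptions; it uses plain pointers instead of llist, and it omits the kernel's freeing of the existing list when the unreliable flag gets set.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct raw_err {                /* stand-in for struct raw_hwp_page */
        struct raw_err *next;
        unsigned long subpage;  /* stand-in for a struct page pointer */
};

struct hugepage {
        struct raw_err *raw_list;
        bool hwpoison;          /* ~ PageHWPoison on the head page */
        bool raw_unreliable;    /* ~ HPageRawHwpUnreliable */
};

/* Returns 0 on the first poison event, -1 (~ -EHWPOISON) afterwards,
 * mirroring hugetlb_set_page_hwpoison()'s return convention. */
static int record_poison(struct hugepage *hp, unsigned long subpage)
{
        int ret = hp->hwpoison ? -1 : 0;
        struct raw_err *p, *e;

        hp->hwpoison = true;
        if (hp->raw_unreliable)
                return -1;
        for (p = hp->raw_list; p; p = p->next)
                if (p->subpage == subpage)      /* duplicate report */
                        return -1;

        e = malloc(sizeof(*e));
        if (!e) {               /* can't track precisely any more */
                hp->raw_unreliable = true;
                return ret;
        }
        e->subpage = subpage;
        e->next = hp->raw_list;
        hp->raw_list = e;
        return ret;
}

int main(void)
{
        struct hugepage hp = { 0 };
        int a = record_poison(&hp, 3);
        int b = record_poison(&hp, 7);
        int c = record_poison(&hp, 3);

        printf("%d %d %d\n", a, b, c);  /* prints: 0 -1 -1 */
        return 0;
}
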
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 5f0ed4717ed0..46ae542118c0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,408 +27,9 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
-#include <linux/pgtable.h>
-#include <linux/bootmem_info.h>
#include <asm/dma.h>
#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-/**
- * struct vmemmap_remap_walk - walk vmemmap page table
- *
- * @remap_pte: called for each lowest-level entry (PTE).
- * @nr_walked: the number of walked pte.
- * @reuse_page: the page which is reused for the tail vmemmap pages.
- * @reuse_addr: the virtual address of the @reuse_page page.
- * @vmemmap_pages: the list head of the vmemmap pages that can be freed
- * or is mapped from.
- */
-struct vmemmap_remap_walk {
- void (*remap_pte)(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk);
- unsigned long nr_walked;
- struct page *reuse_page;
- unsigned long reuse_addr;
- struct list_head *vmemmap_pages;
-};
-
-static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
- pmd_t __pmd;
- int i;
- unsigned long addr = start;
- struct page *page = pmd_page(*pmd);
- pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
-
- if (!pgtable)
- return -ENOMEM;
-
- pmd_populate_kernel(&init_mm, &__pmd, pgtable);
-
- for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
- pte_t entry, *pte;
- pgprot_t pgprot = PAGE_KERNEL;
-
- entry = mk_pte(page + i, pgprot);
- pte = pte_offset_kernel(&__pmd, addr);
- set_pte_at(&init_mm, addr, pte, entry);
- }
-
- spin_lock(&init_mm.page_table_lock);
- if (likely(pmd_leaf(*pmd))) {
- /*
- * Higher order allocations from buddy allocator must be able to
- * be treated as indepdenent small pages (as they can be freed
- * individually).
- */
- if (!PageReserved(page))
- split_page(page, get_order(PMD_SIZE));
-
- /* Make pte visible before pmd. See comment in pmd_install(). */
- smp_wmb();
- pmd_populate_kernel(&init_mm, pmd, pgtable);
- flush_tlb_kernel_range(start, start + PMD_SIZE);
- } else {
- pte_free_kernel(&init_mm, pgtable);
- }
- spin_unlock(&init_mm.page_table_lock);
-
- return 0;
-}
-
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
- int leaf;
-
- spin_lock(&init_mm.page_table_lock);
- leaf = pmd_leaf(*pmd);
- spin_unlock(&init_mm.page_table_lock);
-
- if (!leaf)
- return 0;
-
- return __split_vmemmap_huge_pmd(pmd, start);
-}
-
-static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
-{
- pte_t *pte = pte_offset_kernel(pmd, addr);
-
- /*
- * The reuse_page is found 'first' in table walk before we start
- * remapping (which is calling @walk->remap_pte).
- */
- if (!walk->reuse_page) {
- walk->reuse_page = pte_page(*pte);
- /*
- * Because the reuse address is part of the range that we are
- * walking, skip the reuse address range.
- */
- addr += PAGE_SIZE;
- pte++;
- walk->nr_walked++;
- }
-
- for (; addr != end; addr += PAGE_SIZE, pte++) {
- walk->remap_pte(pte, addr, walk);
- walk->nr_walked++;
- }
-}
-
-static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_offset(pud, addr);
- do {
- int ret;
-
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
- if (ret)
- return ret;
-
- next = pmd_addr_end(addr, end);
- vmemmap_pte_range(pmd, addr, next, walk);
- } while (pmd++, addr = next, addr != end);
-
- return 0;
-}
-
-static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_offset(p4d, addr);
- do {
- int ret;
-
- next = pud_addr_end(addr, end);
- ret = vmemmap_pmd_range(pud, addr, next, walk);
- if (ret)
- return ret;
- } while (pud++, addr = next, addr != end);
-
- return 0;
-}
-
-static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
-{
- p4d_t *p4d;
- unsigned long next;
-
- p4d = p4d_offset(pgd, addr);
- do {
- int ret;
-
- next = p4d_addr_end(addr, end);
- ret = vmemmap_pud_range(p4d, addr, next, walk);
- if (ret)
- return ret;
- } while (p4d++, addr = next, addr != end);
-
- return 0;
-}
-
-static int vmemmap_remap_range(unsigned long start, unsigned long end,
- struct vmemmap_remap_walk *walk)
-{
- unsigned long addr = start;
- unsigned long next;
- pgd_t *pgd;
-
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
-
- pgd = pgd_offset_k(addr);
- do {
- int ret;
-
- next = pgd_addr_end(addr, end);
- ret = vmemmap_p4d_range(pgd, addr, next, walk);
- if (ret)
- return ret;
- } while (pgd++, addr = next, addr != end);
-
- /*
- * We only change the mapping of the vmemmap virtual address range
- * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
- * belongs to the range.
- */
- flush_tlb_kernel_range(start + PAGE_SIZE, end);
-
- return 0;
-}
-
-/*
- * Free a vmemmap page. A vmemmap page can be allocated from the memblock
- * allocator or buddy allocator. If the PG_reserved flag is set, it means
- * that it allocated from the memblock allocator, just free it via the
- * free_bootmem_page(). Otherwise, use __free_page().
- */
-static inline void free_vmemmap_page(struct page *page)
-{
- if (PageReserved(page))
- free_bootmem_page(page);
- else
- __free_page(page);
-}
-
-/* Free a list of the vmemmap pages */
-static void free_vmemmap_page_list(struct list_head *list)
-{
- struct page *page, *next;
-
- list_for_each_entry_safe(page, next, list, lru) {
- list_del(&page->lru);
- free_vmemmap_page(page);
- }
-}
-
-static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk)
-{
- /*
- * Remap the tail pages as read-only to catch illegal write operation
- * to the tail pages.
- */
- pgprot_t pgprot = PAGE_KERNEL_RO;
- pte_t entry = mk_pte(walk->reuse_page, pgprot);
- struct page *page = pte_page(*pte);
-
- list_add_tail(&page->lru, walk->vmemmap_pages);
- set_pte_at(&init_mm, addr, pte, entry);
-}
-
-/*
- * How many struct page structs need to be reset. When we reuse the head
- * struct page, the special metadata (e.g. page->flags or page->mapping)
- * cannot copy to the tail struct page structs. The invalid value will be
- * checked in the free_tail_pages_check(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
- * structs.
- */
-#define NR_RESET_STRUCT_PAGE 3
-
-static inline void reset_struct_pages(struct page *start)
-{
- int i;
- struct page *from = start + NR_RESET_STRUCT_PAGE;
-
- for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
- memcpy(start + i, from, sizeof(*from));
-}
-
-static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk)
-{
- pgprot_t pgprot = PAGE_KERNEL;
- struct page *page;
- void *to;
-
- BUG_ON(pte_page(*pte) != walk->reuse_page);
-
- page = list_first_entry(walk->vmemmap_pages, struct page, lru);
- list_del(&page->lru);
- to = page_to_virt(page);
- copy_page(to, (void *)walk->reuse_addr);
- reset_struct_pages(to);
-
- set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
-}
-
-/**
- * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
- * to the page which @reuse is mapped to, then free vmemmap
- * which the range are mapped to.
- * @start: start address of the vmemmap virtual address range that we want
- * to remap.
- * @end: end address of the vmemmap virtual address range that we want to
- * remap.
- * @reuse: reuse address.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int vmemmap_remap_free(unsigned long start, unsigned long end,
- unsigned long reuse)
-{
- int ret;
- LIST_HEAD(vmemmap_pages);
- struct vmemmap_remap_walk walk = {
- .remap_pte = vmemmap_remap_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
- };
-
- /*
- * In order to make remapping routine most efficient for the huge pages,
- * the routine of vmemmap page table walking has the following rules
- * (see more details from the vmemmap_pte_range()):
- *
- * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
- * should be continuous.
- * - The @reuse address is part of the range [@reuse, @end) that we are
- * walking which is passed to vmemmap_remap_range().
- * - The @reuse address is the first in the complete range.
- *
- * So we need to make sure that @start and @reuse meet the above rules.
- */
- BUG_ON(start - reuse != PAGE_SIZE);
-
- mmap_read_lock(&init_mm);
- ret = vmemmap_remap_range(reuse, end, &walk);
- if (ret && walk.nr_walked) {
- end = reuse + walk.nr_walked * PAGE_SIZE;
- /*
- * vmemmap_pages contains pages from the previous
- * vmemmap_remap_range call which failed. These
- * are pages which were removed from the vmemmap.
- * They will be restored in the following call.
- */
- walk = (struct vmemmap_remap_walk) {
- .remap_pte = vmemmap_restore_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
- };
-
- vmemmap_remap_range(reuse, end, &walk);
- }
- mmap_read_unlock(&init_mm);
-
- free_vmemmap_page_list(&vmemmap_pages);
-
- return ret;
-}
-
-static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
- gfp_t gfp_mask, struct list_head *list)
-{
- unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
- int nid = page_to_nid((struct page *)start);
- struct page *page, *next;
-
- while (nr_pages--) {
- page = alloc_pages_node(nid, gfp_mask, 0);
- if (!page)
- goto out;
- list_add_tail(&page->lru, list);
- }
-
- return 0;
-out:
- list_for_each_entry_safe(page, next, list, lru)
- __free_pages(page, 0);
- return -ENOMEM;
-}
-
-/**
- * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
- * to the page which is from the @vmemmap_pages
- * respectively.
- * @start: start address of the vmemmap virtual address range that we want
- * to remap.
- * @end: end address of the vmemmap virtual address range that we want to
- * remap.
- * @reuse: reuse address.
- * @gfp_mask: GFP flag for allocating vmemmap pages.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int vmemmap_remap_alloc(unsigned long start, unsigned long end,
- unsigned long reuse, gfp_t gfp_mask)
-{
- LIST_HEAD(vmemmap_pages);
- struct vmemmap_remap_walk walk = {
- .remap_pte = vmemmap_restore_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
- };
-
- /* See the comment in the vmemmap_remap_free(). */
- BUG_ON(start - reuse != PAGE_SIZE);
-
- if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
- return -ENOMEM;
-
- mmap_read_lock(&init_mm);
- vmemmap_remap_range(reuse, end, &walk);
- mmap_read_unlock(&init_mm);
-
- return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
/*
* Allocate a block of memory to be used to back the virtual memory map