summaryrefslogtreecommitdiff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c291
1 files changed, 248 insertions, 43 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 418bf01a50ed..e5828875f7bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -32,6 +33,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
+#include <linux/userfaultfd_k.h>
#include "internal.h"
int hugepages_treat_as_movable;
@@ -1051,7 +1053,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages)
{
unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+ GFP_KERNEL);
}
static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1773,23 +1776,32 @@ free:
}
/*
- * When releasing a hugetlb pool reservation, any surplus pages that were
- * allocated to satisfy the reservation must be explicitly freed if they were
- * never used.
- * Called with hugetlb_lock held.
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ * in unused_resv_pages. This corresponds to the prior adjustments made
+ * to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ * the reservation. As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held. However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing. Do this by ensuring resv_huge_page always is greater than the
+ * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
- /* Uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
-
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
- return;
+ goto out;
+ /*
+ * Part (or even all) of the reservation could have been backed
+ * by pre-allocated pages. Only free surplus pages.
+ */
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
/*
@@ -1799,12 +1811,22 @@ static void return_unused_surplus_pages(struct hstate *h,
* when the nodes with surplus pages have no free pages.
* free_pool_huge_page() will balance the the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
+ *
+ * Note that we decrement resv_huge_pages as we free the pages. If
+ * we drop the lock, resv_huge_pages will still be sufficiently large
+ * to cover subsequent pages we may free.
*/
while (nr_pages--) {
+ h->resv_huge_pages--;
+ unused_resv_pages--;
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
- break;
+ goto out;
cond_resched_lock(&hugetlb_lock);
}
+
+out:
+ /* Fully uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
}
@@ -3122,7 +3144,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
-static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int hugetlb_vm_op_fault(struct vm_fault *vmf)
{
BUG();
return 0;
@@ -3286,6 +3308,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ /*
+ * This is a hugetlb vma, all the pte entries should point
+ * to huge page.
+ */
+ tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
@@ -3336,7 +3363,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
- tlb_remove_tlb_entry(tlb, ptep, address);
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@@ -3450,15 +3477,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, pte_t pte,
- struct page *pagecache_page, spinlock_t *ptl)
+ unsigned long address, pte_t *ptep,
+ struct page *pagecache_page, spinlock_t *ptl)
{
+ pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
@@ -3654,6 +3683,38 @@ retry:
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
+
+ /*
+ * Check for page in userfault range
+ */
+ if (userfaultfd_missing(vma)) {
+ u32 hash;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = address,
+ .flags = flags,
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * hugetlb_fault_mutex must be dropped before
+ * handling userfault. Reacquire after handling
+ * fault to make calling code simpler.
+ */
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
+ idx, address);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
@@ -3711,8 +3772,7 @@ retry:
vma_end_reservation(h, vma, address);
}
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, mm, ptep);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@@ -3733,7 +3793,7 @@ retry:
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3888,8 +3948,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep, entry,
- pagecache_page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep,
+ pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
@@ -3923,10 +3983,113 @@ out_mutex:
return ret;
}
+/*
+ * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
+ * modifications for huge pages.
+ */
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ struct hstate *h = hstate_vma(dst_vma);
+ pte_t _dst_pte;
+ spinlock_t *ptl;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page))
+ goto out;
+
+ ret = copy_huge_page_from_user(page,
+ (const void __user *) src_addr,
+ pages_per_huge_page(h), false);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ ret = -EFAULT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+ set_page_huge_active(page);
+
+ /*
+ * If shared, add to page cache
+ */
+ if (vm_shared) {
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+ ret = huge_add_to_page_cache(page, mapping, idx);
+ if (ret)
+ goto out_release_nounlock;
+ }
+
+ ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+ spin_lock(ptl);
+
+ ret = -EEXIST;
+ if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ goto out_release_unlock;
+
+ if (vm_shared) {
+ page_dup_rmap(page, true);
+ } else {
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+ }
+
+ _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
+ _dst_pte = pte_mkyoung(_dst_pte);
+
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
+ dst_vma->vm_flags & VM_WRITE);
+ hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ spin_unlock(ptl);
+ if (vm_shared)
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release_unlock:
+ spin_unlock(ptl);
+out_release_nounlock:
+ if (vm_shared)
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags)
+ long i, unsigned int flags, int *nonblocking)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -3989,16 +4152,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
((flags & FOLL_WRITE) &&
!huge_pte_write(huge_ptep_get(pte)))) {
int ret;
+ unsigned int fault_flags = 0;
if (pte)
spin_unlock(ptl);
- ret = hugetlb_fault(mm, vma, vaddr,
- (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
- if (!(ret & VM_FAULT_ERROR))
- continue;
-
- remainder = 0;
- break;
+ if (flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_RETRY_NOWAIT;
+ if (flags & FOLL_TRIED) {
+ VM_WARN_ON_ONCE(fault_flags &
+ FAULT_FLAG_ALLOW_RETRY);
+ fault_flags |= FAULT_FLAG_TRIED;
+ }
+ ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ remainder = 0;
+ break;
+ }
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ *nr_pages = 0;
+ /*
+ * VM_FAULT_RETRY must not return an
+ * error, it will return zero
+ * instead.
+ *
+ * No need to update "position" as the
+ * caller will not check it after
+ * *nr_pages is set to 0.
+ */
+ return i;
+ }
+ continue;
}
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
@@ -4027,6 +4217,11 @@ same_page:
spin_unlock(ptl);
}
*nr_pages = remainder;
+ /*
+ * setting position is actually required only if remainder is
+ * not zero but it's faster not to add a "if (remainder)"
+ * branch.
+ */
*position = vaddr;
return i ? i : -EFAULT;
@@ -4208,7 +4403,9 @@ int hugetlb_reserve_pages(struct inode *inode,
return 0;
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
- region_abort(resv_map, from, to);
+ /* Don't call region_abort if region_chg failed */
+ if (chg >= 0)
+ region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
@@ -4330,8 +4527,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!spte)
goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -4361,7 +4557,8 @@ out:
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
- pud_t *pud = pud_offset(pgd, *addr);
+ p4d_t *p4d = p4d_offset(pgd, *addr);
+ pud_t *pud = pud_offset(p4d, *addr);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
@@ -4392,11 +4589,13 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
- pud = pud_alloc(mm, pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = pud_alloc(mm, p4d, addr);
if (pud) {
if (sz == PUD_SIZE) {
pte = (pte_t *)pud;
@@ -4416,18 +4615,22 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd = NULL;
+ pmd_t *pmd;
pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (pud_present(*pud)) {
- if (pud_huge(*pud))
- return (pte_t *)pud;
- pmd = pmd_offset(pud, addr);
- }
- }
+ if (!pgd_present(*pgd))
+ return NULL;
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return NULL;
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return NULL;
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
return (pte_t *) pmd;
}
@@ -4450,6 +4653,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
struct page *page = NULL;
spinlock_t *ptl;
+ pte_t pte;
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
@@ -4459,12 +4663,13 @@ retry:
*/
if (!pmd_huge(*pmd))
goto out;
- if (pmd_present(*pmd)) {
+ pte = huge_ptep_get((pte_t *)pmd);
+ if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
- if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
+ if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
__migration_entry_wait(mm, (pte_t *)pmd, ptl);
goto retry;