summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile5
-rw-r--r--mm/backing-dev.c67
-rw-r--r--mm/cma.c89
-rw-r--r--mm/compaction.c16
-rw-r--r--mm/failslab.c2
-rw-r--r--mm/gup.c4
-rw-r--r--mm/hmm.c573
-rw-r--r--mm/huge_memory.c42
-rw-r--r--mm/hugetlb.c27
-rw-r--r--mm/internal.h5
-rw-r--r--mm/kasan/kasan.c72
-rw-r--r--mm/kmemleak.c12
-rw-r--r--mm/ksm.c36
-rw-r--r--mm/list_lru.c74
-rw-r--r--mm/memblock.c43
-rw-r--r--mm/memcontrol.c264
-rw-r--r--mm/memfd.c342
-rw-r--r--mm/memory-failure.c18
-rw-r--r--mm/memory.c37
-rw-r--r--mm/memory_hotplug.c55
-rw-r--r--mm/mempolicy.c40
-rw-r--r--mm/migrate.c366
-rw-r--r--mm/mincore.c12
-rw-r--r--mm/mmap.c14
-rw-r--r--mm/mprotect.c9
-rw-r--r--mm/nommu.c12
-rw-r--r--mm/oom_kill.c222
-rw-r--r--mm/page_alloc.c501
-rw-r--r--mm/page_idle.c12
-rw-r--r--mm/page_isolation.c21
-rw-r--r--mm/page_owner.c6
-rw-r--r--mm/page_poison.c2
-rw-r--r--mm/pagewalk.c3
-rw-r--r--mm/percpu-stats.c13
-rw-r--r--mm/rmap.c1
-rw-r--r--mm/shmem.c328
-rw-r--r--mm/slab.c24
-rw-r--r--mm/slab.h27
-rw-r--r--mm/slab_common.c96
-rw-r--r--mm/slub.c207
-rw-r--r--mm/sparse-vmemmap.c1
-rw-r--r--mm/sparse.c37
-rw-r--r--mm/swap.c1
-rw-r--r--mm/swap_slots.c4
-rw-r--r--mm/swap_state.c160
-rw-r--r--mm/swapfile.c158
-rw-r--r--mm/util.c19
-rw-r--r--mm/vmscan.c234
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/z3fold.c35
-rw-r--r--mm/zsmalloc.c58
51 files changed, 2769 insertions, 1638 deletions
diff --git a/mm/Makefile b/mm/Makefile
index e669f02c5a54..8716bdabe1e6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o vmacache.o swap_slots.o \
+ compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
@@ -55,7 +55,7 @@ ifdef CONFIG_MMU
endif
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d2984e9fcf08..2fc3f38e4c4f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -100,18 +100,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
return 0;
}
-
-static int bdi_debug_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, bdi_debug_stats_show, inode->i_private);
-}
-
-static const struct file_operations bdi_debug_stats_fops = {
- .open = bdi_debug_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
@@ -231,11 +220,46 @@ static ssize_t stable_pages_required_show(struct device *dev,
}
static DEVICE_ATTR_RO(stable_pages_required);
+static ssize_t strictlimit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int val;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ switch (val) {
+ case 0:
+ bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
+ break;
+ case 1:
+ bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return count;
+}
+static ssize_t strictlimit_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n",
+ !!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
+}
+static DEVICE_ATTR_RW(strictlimit);
+
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
&dev_attr_max_ratio.attr,
&dev_attr_stable_pages_required.attr,
+ &dev_attr_strictlimit.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
@@ -1031,23 +1055,18 @@ EXPORT_SYMBOL(congestion_wait);
/**
* wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @pgdat: A pgdat to check if it is heavily congested
* @sync: SYNC or ASYNC IO
* @timeout: timeout in jiffies
*
- * In the event of a congested backing_dev (any backing_dev) and the given
- * @pgdat has experienced recent congestion, this waits for up to @timeout
- * jiffies for either a BDI to exit congestion of the given @sync queue
- * or a write to complete.
- *
- * In the absence of pgdat congestion, cond_resched() is called to yield
- * the processor if necessary but otherwise does not sleep.
+ * In the event of a congested backing_dev (any backing_dev) this waits
+ * for up to @timeout jiffies for either a BDI to exit congestion of the
+ * given @sync queue or a write to complete.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
* returned. return_value == timeout implies the function did not sleep.
*/
-long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
+long wait_iff_congested(int sync, long timeout)
{
long ret;
unsigned long start = jiffies;
@@ -1055,12 +1074,10 @@ long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
wait_queue_head_t *wqh = &congestion_wqh[sync];
/*
- * If there is no congestion, or heavy congestion is not being
- * encountered in the current pgdat, yield if necessary instead
+ * If there is no congestion, yield if necessary instead
* of sleeping on the congestion queue
*/
- if (atomic_read(&nr_wb_congested[sync]) == 0 ||
- !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
+ if (atomic_read(&nr_wb_congested[sync]) == 0) {
cond_resched();
/* In case we scheduled, work out time remaining */
diff --git a/mm/cma.c b/mm/cma.c
index 0607729abf3b..aa40e6c7b042 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,9 +35,11 @@
#include <linux/cma.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/kmemleak.h>
#include <trace/events/cma.h>
#include "cma.h"
+#include "internal.h"
struct cma cma_areas[MAX_CMA_AREAS];
unsigned cma_area_count;
@@ -108,23 +110,25 @@ static int __init cma_activate_area(struct cma *cma)
if (!cma->bitmap)
return -ENOMEM;
- WARN_ON_ONCE(!pfn_valid(pfn));
- zone = page_zone(pfn_to_page(pfn));
-
do {
unsigned j;
base_pfn = pfn;
+ if (!pfn_valid(base_pfn))
+ goto err;
+
+ zone = page_zone(pfn_to_page(base_pfn));
for (j = pageblock_nr_pages; j; --j, pfn++) {
- WARN_ON_ONCE(!pfn_valid(pfn));
+ if (!pfn_valid(pfn))
+ goto err;
+
/*
- * alloc_contig_range requires the pfn range
- * specified to be in the same zone. Make this
- * simple by forcing the entire CMA resv range
- * to be in the same zone.
+ * In init_cma_reserved_pageblock(), present_pages
+ * is adjusted with assumption that all pages in
+ * the pageblock come from a single zone.
*/
if (page_zone(pfn_to_page(pfn)) != zone)
- goto not_in_zone;
+ goto err;
}
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
} while (--i);
@@ -138,7 +142,7 @@ static int __init cma_activate_area(struct cma *cma)
return 0;
-not_in_zone:
+err:
pr_err("CMA area %s could not be activated\n", cma->name);
kfree(cma->bitmap);
cma->count = 0;
@@ -148,6 +152,41 @@ not_in_zone:
static int __init cma_init_reserved_areas(void)
{
int i;
+ struct zone *zone;
+ pg_data_t *pgdat;
+
+ if (!cma_area_count)
+ return 0;
+
+ for_each_online_pgdat(pgdat) {
+ unsigned long start_pfn = UINT_MAX, end_pfn = 0;
+
+ zone = &pgdat->node_zones[ZONE_MOVABLE];
+
+ /*
+ * In this case, we cannot adjust the zone range
+ * since it is now maximum node span and we don't
+ * know original zone range.
+ */
+ if (populated_zone(zone))
+ continue;
+
+ for (i = 0; i < cma_area_count; i++) {
+ if (pfn_to_nid(cma_areas[i].base_pfn) !=
+ pgdat->node_id)
+ continue;
+
+ start_pfn = min(start_pfn, cma_areas[i].base_pfn);
+ end_pfn = max(end_pfn, cma_areas[i].base_pfn +
+ cma_areas[i].count);
+ }
+
+ if (!end_pfn)
+ continue;
+
+ zone->zone_start_pfn = start_pfn;
+ zone->spanned_pages = end_pfn - start_pfn;
+ }
for (i = 0; i < cma_area_count; i++) {
int ret = cma_activate_area(&cma_areas[i]);
@@ -156,15 +195,41 @@ static int __init cma_init_reserved_areas(void)
return ret;
}
+ /*
+ * Reserved pages for ZONE_MOVABLE are now activated and
+ * this would change ZONE_MOVABLE's managed page counter and
+ * the other zones' present counter. We need to re-calculate
+ * various zone information that depends on this initialization.
+ */
+ build_all_zonelists(NULL);
+ for_each_populated_zone(zone) {
+ if (zone_idx(zone) == ZONE_MOVABLE) {
+ zone_pcp_reset(zone);
+ setup_zone_pageset(zone);
+ } else
+ zone_pcp_update(zone);
+
+ set_zone_contiguous(zone);
+ }
+
+ /*
+ * We need to re-init per zone wmark by calling
+ * init_per_zone_wmark_min() but doesn't call here because it is
+ * registered on core_initcall and it will be called later than us.
+ */
+
return 0;
}
-core_initcall(cma_init_reserved_areas);
+pure_initcall(cma_init_reserved_areas);
/**
* cma_init_reserved_mem() - create custom contiguous area from reserved memory
* @base: Base address of the reserved area
* @size: Size of the reserved area (in bytes),
* @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @name: The name of the area. If this parameter is NULL, the name of
+ * the area will be set to "cmaN", where N is a running counter of
+ * used areas.
* @res_cma: Pointer to store the created cma region.
*
* This function creates custom contiguous area from already reserved memory.
@@ -227,6 +292,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
* @alignment: Alignment for the CMA area, should be power of 2 or zero
* @order_per_bit: Order of pages represented by one bit on bitmap.
* @fixed: hint about where to place the reserved area
+ * @name: The name of the area. See function cma_init_reserved_mem()
* @res_cma: Pointer to store the created cma region.
*
* This function reserves memory from early allocator. It should be
@@ -390,6 +456,7 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
* @cma: Contiguous memory region for which the allocation is performed.
* @count: Requested number of pages.
* @align: Requested alignment of pages (in PAGE_SIZE order).
+ * @gfp_mask: GFP mask to use during compaction
*
* This function allocates part of contiguous memory on specific
* contiguous memory area.
diff --git a/mm/compaction.c b/mm/compaction.c
index 2c8999d027ab..028b7210a669 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -576,6 +576,7 @@ isolate_fail:
/**
* isolate_freepages_range() - isolate free pages.
+ * @cc: Compaction control structure.
* @start_pfn: The first PFN to start isolating.
* @end_pfn: The one-past-last PFN.
*
@@ -1165,8 +1166,7 @@ static void isolate_freepages(struct compact_control *cc)
* from the isolated freelists in the block we are migrating to.
*/
static struct page *compaction_alloc(struct page *migratepage,
- unsigned long data,
- int **result)
+ unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
@@ -1450,14 +1450,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
- * ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
- ALLOC_CMA, wmark_target))
+ 0, wmark_target))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
@@ -1988,6 +1986,14 @@ static void kcompactd_do_work(pg_data_t *pgdat)
compaction_defer_reset(zone, cc.order, false);
} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
/*
+ * Buddy pages may become stranded on pcps that could
+ * otherwise coalesce on the zone's free area for
+ * order >= cc.order. This is ratelimited by the
+ * upcoming deferral.
+ */
+ drain_all_pages(zone);
+
+ /*
* We use sync migration mode here, so we defer like
* sync direct compaction does.
*/
diff --git a/mm/failslab.c b/mm/failslab.c
index 8087d976a809..1f2f248e3601 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -14,7 +14,7 @@ static struct {
.cache_filter = false,
};
-bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
+bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
/* No fault-injection for bootstrap cache */
if (unlikely(s == kmem_cache))
diff --git a/mm/gup.c b/mm/gup.c
index 751c7b5dbc36..2e2df7f3e92d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -532,7 +532,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
* reCOWed by userspace write).
*/
if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
- *flags |= FOLL_COW;
+ *flags |= FOLL_COW;
return 0;
}
@@ -1641,7 +1641,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
PMD_SHIFT, next, write, pages, nr))
return 0;
} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
+ return 0;
} while (pmdp++, addr = next, addr != end);
return 1;
diff --git a/mm/hmm.c b/mm/hmm.c
index 4aa554e76d06..de7b6bf77201 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -151,6 +151,32 @@ static void hmm_invalidate_range(struct hmm *hmm,
up_read(&hmm->mirrors_sem);
}
+static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct hmm_mirror *mirror;
+ struct hmm *hmm = mm->hmm;
+
+ down_write(&hmm->mirrors_sem);
+ mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
+ list);
+ while (mirror) {
+ list_del_init(&mirror->list);
+ if (mirror->ops->release) {
+ /*
+ * Drop mirrors_sem so callback can wait on any pending
+ * work that might itself trigger mmu_notifier callback
+ * and thus would deadlock with us.
+ */
+ up_write(&hmm->mirrors_sem);
+ mirror->ops->release(mirror);
+ down_write(&hmm->mirrors_sem);
+ }
+ mirror = list_first_entry_or_null(&hmm->mirrors,
+ struct hmm_mirror, list);
+ }
+ up_write(&hmm->mirrors_sem);
+}
+
static void hmm_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
@@ -176,6 +202,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
+ .release = hmm_release,
.invalidate_range_start = hmm_invalidate_range_start,
.invalidate_range_end = hmm_invalidate_range_end,
};
@@ -197,13 +224,24 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
+again:
mirror->hmm = hmm_register(mm);
if (!mirror->hmm)
return -ENOMEM;
down_write(&mirror->hmm->mirrors_sem);
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
+ if (mirror->hmm->mm == NULL) {
+ /*
+ * A racing hmm_mirror_unregister() is about to destroy the hmm
+ * struct. Try again to allocate a new one.
+ */
+ up_write(&mirror->hmm->mirrors_sem);
+ mirror->hmm = NULL;
+ goto again;
+ } else {
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
+ }
return 0;
}
@@ -218,11 +256,32 @@ EXPORT_SYMBOL(hmm_mirror_register);
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- struct hmm *hmm = mirror->hmm;
+ bool should_unregister = false;
+ struct mm_struct *mm;
+ struct hmm *hmm;
+ if (mirror->hmm == NULL)
+ return;
+
+ hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
- list_del(&mirror->list);
+ list_del_init(&mirror->list);
+ should_unregister = list_empty(&hmm->mirrors);
+ mirror->hmm = NULL;
+ mm = hmm->mm;
+ hmm->mm = NULL;
up_write(&hmm->mirrors_sem);
+
+ if (!should_unregister || mm == NULL)
+ return;
+
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
+
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+ kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
@@ -231,110 +290,275 @@ struct hmm_vma_walk {
unsigned long last;
bool fault;
bool block;
- bool write;
};
-static int hmm_vma_do_fault(struct mm_walk *walk,
- unsigned long addr,
- hmm_pfn_t *pfn)
+static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
+ bool write_fault, uint64_t *pfn)
{
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
int r;
flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
- flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+ flags |= write_fault ? FAULT_FLAG_WRITE : 0;
r = handle_mm_fault(vma, addr, flags);
if (r & VM_FAULT_RETRY)
return -EBUSY;
if (r & VM_FAULT_ERROR) {
- *pfn = HMM_PFN_ERROR;
+ *pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
return -EAGAIN;
}
-static void hmm_pfns_special(hmm_pfn_t *pfns,
- unsigned long addr,
- unsigned long end)
-{
- for (; addr < end; addr += PAGE_SIZE, pfns++)
- *pfns = HMM_PFN_SPECIAL;
-}
-
static int hmm_pfns_bad(unsigned long addr,
unsigned long end,
struct mm_walk *walk)
{
- struct hmm_range *range = walk->private;
- hmm_pfn_t *pfns = range->pfns;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ uint64_t *pfns = range->pfns;
unsigned long i;
i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
- pfns[i] = HMM_PFN_ERROR;
+ pfns[i] = range->values[HMM_PFN_ERROR];
return 0;
}
-static void hmm_pfns_clear(hmm_pfn_t *pfns,
- unsigned long addr,
- unsigned long end)
-{
- for (; addr < end; addr += PAGE_SIZE, pfns++)
- *pfns = 0;
-}
-
-static int hmm_vma_walk_hole(unsigned long addr,
- unsigned long end,
- struct mm_walk *walk)
+/*
+ * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @fault: should we fault or not ?
+ * @write_fault: write fault ?
+ * @walk: mm_walk structure
+ * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ *
+ * This function will be called whenever pmd_none() or pte_none() returns true,
+ * or whenever there is no page directory covering the virtual address range.
+ */
+static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
+ bool fault, bool write_fault,
+ struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- hmm_pfn_t *pfns = range->pfns;
+ uint64_t *pfns = range->pfns;
unsigned long i;
hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++) {
- pfns[i] = HMM_PFN_EMPTY;
- if (hmm_vma_walk->fault) {
+ pfns[i] = range->values[HMM_PFN_NONE];
+ if (fault || write_fault) {
int ret;
- ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ ret = hmm_vma_do_fault(walk, addr, write_fault,
+ &pfns[i]);
if (ret != -EAGAIN)
return ret;
}
}
- return hmm_vma_walk->fault ? -EAGAIN : 0;
+ return (fault || write_fault) ? -EAGAIN : 0;
}
-static int hmm_vma_walk_clear(unsigned long addr,
- unsigned long end,
- struct mm_walk *walk)
+static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ uint64_t pfns, uint64_t cpu_flags,
+ bool *fault, bool *write_fault)
{
- struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- hmm_pfn_t *pfns = range->pfns;
+
+ *fault = *write_fault = false;
+ if (!hmm_vma_walk->fault)
+ return;
+
+ /* We aren't ask to do anything ... */
+ if (!(pfns & range->flags[HMM_PFN_VALID]))
+ return;
+ /* If this is device memory than only fault if explicitly requested */
+ if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
+ /* Do we fault on device memory ? */
+ if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
+ *write_fault = pfns & range->flags[HMM_PFN_WRITE];
+ *fault = true;
+ }
+ return;
+ }
+
+ /* If CPU page table is not valid then we need to fault */
+ *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
+ /* Need to write fault ? */
+ if ((pfns & range->flags[HMM_PFN_WRITE]) &&
+ !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
+ *write_fault = true;
+ *fault = true;
+ }
+}
+
+static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ const uint64_t *pfns, unsigned long npages,
+ uint64_t cpu_flags, bool *fault,
+ bool *write_fault)
+{
unsigned long i;
- hmm_vma_walk->last = addr;
+ if (!hmm_vma_walk->fault) {
+ *fault = *write_fault = false;
+ return;
+ }
+
+ for (i = 0; i < npages; ++i) {
+ hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
+ fault, write_fault);
+ if ((*fault) || (*write_fault))
+ return;
+ }
+}
+
+static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ bool fault, write_fault;
+ unsigned long i, npages;
+ uint64_t *pfns;
+
i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++) {
- pfns[i] = 0;
- if (hmm_vma_walk->fault) {
- int ret;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ 0, &fault, &write_fault);
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+}
- ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
- if (ret != -EAGAIN)
- return ret;
+static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
+{
+ if (pmd_protnone(pmd))
+ return 0;
+ return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
+static int hmm_vma_handle_pmd(struct mm_walk *walk,
+ unsigned long addr,
+ unsigned long end,
+ uint64_t *pfns,
+ pmd_t pmd)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long pfn, npages, i;
+ bool fault, write_fault;
+ uint64_t cpu_flags;
+
+ npages = (end - addr) >> PAGE_SHIFT;
+ cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
+ &fault, &write_fault);
+
+ if (pmd_protnone(pmd) || fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ pfn = pmd_pfn(pmd) + pte_index(addr);
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ hmm_vma_walk->last = end;
+ return 0;
+}
+
+static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
+{
+ if (pte_none(pte) || !pte_present(pte))
+ return 0;
+ return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
+static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, pmd_t *pmdp, pte_t *ptep,
+ uint64_t *pfn)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ bool fault, write_fault;
+ uint64_t cpu_flags;
+ pte_t pte = *ptep;
+ uint64_t orig_pfn = *pfn;
+
+ *pfn = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+
+ if (pte_none(pte)) {
+ if (fault || write_fault)
+ goto fault;
+ return 0;
+ }
+
+ if (!pte_present(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (!non_swap_entry(entry)) {
+ if (fault || write_fault)
+ goto fault;
+ return 0;
}
+
+ /*
+ * This is a special swap entry, ignore migration, use
+ * device and report anything else as error.
+ */
+ if (is_device_private_entry(entry)) {
+ cpu_flags = range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_DEVICE_PRIVATE];
+ cpu_flags |= is_write_device_private_entry(entry) ?
+ range->flags[HMM_PFN_WRITE] : 0;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault)
+ goto fault;
+ *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+ *pfn |= cpu_flags;
+ return 0;
+ }
+
+ if (is_migration_entry(entry)) {
+ if (fault || write_fault) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(vma->vm_mm,
+ pmdp, addr);
+ return -EAGAIN;
+ }
+ return 0;
+ }
+
+ /* Report error for everything else */
+ *pfn = range->values[HMM_PFN_ERROR];
+ return -EFAULT;
}
- return hmm_vma_walk->fault ? -EAGAIN : 0;
+ if (fault || write_fault)
+ goto fault;
+
+ *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ return 0;
+
+fault:
+ pte_unmap(ptep);
+ /* Fault any virtual address we were asked to fault */
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}
static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -344,26 +568,20 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- struct vm_area_struct *vma = walk->vma;
- hmm_pfn_t *pfns = range->pfns;
+ uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
- bool write_fault;
- hmm_pfn_t flag;
pte_t *ptep;
i = (addr - range->start) >> PAGE_SHIFT;
- flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
- write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;
again:
if (pmd_none(*pmdp))
return hmm_vma_walk_hole(start, end, walk);
- if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
+ if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
return hmm_pfns_bad(start, end, walk);
if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
- unsigned long pfn;
pmd_t pmd;
/*
@@ -379,17 +597,8 @@ again:
barrier();
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
- if (pmd_protnone(pmd))
- return hmm_vma_walk_clear(start, end, walk);
- if (write_fault && !pmd_write(pmd))
- return hmm_vma_walk_clear(start, end, walk);
-
- pfn = pmd_pfn(pmd) + pte_index(addr);
- flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
- return 0;
+ return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
}
if (pmd_bad(*pmdp))
@@ -397,79 +606,43 @@ again:
ptep = pte_offset_map(pmdp, addr);
for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
- pte_t pte = *ptep;
-
- pfns[i] = 0;
+ int r;
- if (pte_none(pte)) {
- pfns[i] = HMM_PFN_EMPTY;
- if (hmm_vma_walk->fault)
- goto fault;
- continue;
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
+ if (r) {
+ /* hmm_vma_handle_pte() did unmap pte directory */
+ hmm_vma_walk->last = addr;
+ return r;
}
-
- if (!pte_present(pte)) {
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (!non_swap_entry(entry)) {
- if (hmm_vma_walk->fault)
- goto fault;
- continue;
- }
-
- /*
- * This is a special swap entry, ignore migration, use
- * device and report anything else as error.
- */
- if (is_device_private_entry(entry)) {
- pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
- if (is_write_device_private_entry(entry)) {
- pfns[i] |= HMM_PFN_WRITE;
- } else if (write_fault)
- goto fault;
- pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
- pfns[i] |= flag;
- } else if (is_migration_entry(entry)) {
- if (hmm_vma_walk->fault) {
- pte_unmap(ptep);
- hmm_vma_walk->last = addr;
- migration_entry_wait(vma->vm_mm,
- pmdp, addr);
- return -EAGAIN;
- }
- continue;
- } else {
- /* Report error for everything else */
- pfns[i] = HMM_PFN_ERROR;
- }
- continue;
- }
-
- if (write_fault && !pte_write(pte))
- goto fault;
-
- pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
- pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
- continue;
-
-fault:
- pte_unmap(ptep);
- /* Fault all pages in range */
- return hmm_vma_walk_clear(start, end, walk);
}
pte_unmap(ptep - 1);
+ hmm_vma_walk->last = addr;
return 0;
}
+static void hmm_pfns_clear(struct hmm_range *range,
+ uint64_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = range->values[HMM_PFN_NONE];
+}
+
+static void hmm_pfns_special(struct hmm_range *range)
+{
+ unsigned long addr = range->start, i = 0;
+
+ for (; addr < range->end; addr += PAGE_SIZE, i++)
+ range->pfns[i] = range->values[HMM_PFN_SPECIAL];
+}
+
/*
* hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @vma: virtual memory area containing the virtual address range
- * @range: used to track snapshot validity
- * @start: range virtual start address (inclusive)
- * @end: range virtual end address (exclusive)
- * @entries: array of hmm_pfn_t: provided by the caller, filled in by function
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
+ * @range: range being snapshotted
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * vma permission, 0 success
*
* This snapshots the CPU page table for a range of virtual addresses. Snapshot
* validity is tracked by range struct. See hmm_vma_range_done() for further
@@ -482,26 +655,17 @@ fault:
* NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
* MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
*/
-int hmm_vma_get_pfns(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns)
+int hmm_vma_get_pfns(struct hmm_range *range)
{
+ struct vm_area_struct *vma = range->vma;
struct hmm_vma_walk hmm_vma_walk;
struct mm_walk mm_walk;
struct hmm *hmm;
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
- hmm_pfns_special(pfns, start, end);
- return -EINVAL;
- }
-
/* Sanity check, this really should not happen ! */
- if (start < vma->vm_start || start >= vma->vm_end)
+ if (range->start < vma->vm_start || range->start >= vma->vm_end)
return -EINVAL;
- if (end < vma->vm_start || end > vma->vm_end)
+ if (range->end < vma->vm_start || range->end > vma->vm_end)
return -EINVAL;
hmm = hmm_register(vma->vm_mm);
@@ -511,10 +675,24 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
if (!hmm->mmu_notifier.ops)
return -EINVAL;
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(range);
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it does
+ * not allow write access, either. Architecture that allow
+ * write without read access are not supported by HMM, because
+ * operations such has atomic access would not work.
+ */
+ hmm_pfns_clear(range, range->pfns, range->start, range->end);
+ return -EPERM;
+ }
+
/* Initialize range to track CPU page table update */
- range->start = start;
- range->pfns = pfns;
- range->end = end;
spin_lock(&hmm->lock);
range->valid = true;
list_add_rcu(&range->list, &hmm->ranges);
@@ -532,14 +710,13 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
mm_walk.pmd_entry = hmm_vma_walk_pmd;
mm_walk.pte_hole = hmm_vma_walk_hole;
- walk_page_range(start, end, &mm_walk);
+ walk_page_range(range->start, range->end, &mm_walk);
return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);
/*
* hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @vma: virtual memory area containing the virtual address range
* @range: range being tracked
* Returns: false if range data has been invalidated, true otherwise
*
@@ -559,10 +736,10 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
*
* There are two ways to use this :
* again:
- * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
* trans = device_build_page_table_update_transaction(pfns);
* device_page_table_lock();
- * if (!hmm_vma_range_done(vma, range)) {
+ * if (!hmm_vma_range_done(range)) {
* device_page_table_unlock();
* goto again;
* }
@@ -570,13 +747,13 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
* device_page_table_unlock();
*
* Or:
- * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
* device_page_table_lock();
- * hmm_vma_range_done(vma, range);
- * device_update_page_table(pfns);
+ * hmm_vma_range_done(range);
+ * device_update_page_table(range->pfns);
* device_page_table_unlock();
*/
-bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
+bool hmm_vma_range_done(struct hmm_range *range)
{
unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
struct hmm *hmm;
@@ -586,7 +763,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
return false;
}
- hmm = hmm_register(vma->vm_mm);
+ hmm = hmm_register(range->vma->vm_mm);
if (!hmm) {
memset(range->pfns, 0, sizeof(*range->pfns) * npages);
return false;
@@ -602,36 +779,34 @@ EXPORT_SYMBOL(hmm_vma_range_done);
/*
* hmm_vma_fault() - try to fault some address in a virtual address range
- * @vma: virtual memory area containing the virtual address range
- * @range: use to track pfns array content validity
- * @start: fault range virtual start address (inclusive)
- * @end: fault range virtual end address (exclusive)
- * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted
- * @write: is it a write fault
+ * @range: range being faulted
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
* Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
*
* This is similar to a regular CPU page fault except that it will not trigger
* any memory migration if the memory being faulted is not accessible by CPUs.
*
- * On error, for one virtual address in the range, the function will set the
- * hmm_pfn_t error flag for the corresponding pfn entry.
+ * On error, for one virtual address in the range, the function will mark the
+ * corresponding HMM pfn entry with an error flag.
*
* Expected use pattern:
* retry:
* down_read(&mm->mmap_sem);
* // Find vma and address device wants to fault, initialize hmm_pfn_t
* // array accordingly
- * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry);
+ * ret = hmm_vma_fault(range, write, block);
* switch (ret) {
* case -EAGAIN:
- * hmm_vma_range_done(vma, range);
+ * hmm_vma_range_done(range);
* // You might want to rate limit or yield to play nicely, you may
* // also commit any valid pfn in the array assuming that you are
* // getting true from hmm_vma_range_monitor_end()
* goto retry;
* case 0:
* break;
+ * case -ENOMEM:
+ * case -EINVAL:
+ * case -EPERM:
* default:
* // Handle error !
* up_read(&mm->mmap_sem)
@@ -639,7 +814,7 @@ EXPORT_SYMBOL(hmm_vma_range_done);
* }
* // Take device driver lock that serialize device page table update
* driver_lock_device_page_table_update();
- * hmm_vma_range_done(vma, range);
+ * hmm_vma_range_done(range);
* // Commit pfns we got from hmm_vma_fault()
* driver_unlock_device_page_table_update();
* up_read(&mm->mmap_sem)
@@ -649,51 +824,54 @@ EXPORT_SYMBOL(hmm_vma_range_done);
*
* YOU HAVE BEEN WARNED !
*/
-int hmm_vma_fault(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns,
- bool write,
- bool block)
+int hmm_vma_fault(struct hmm_range *range, bool block)
{
+ struct vm_area_struct *vma = range->vma;
+ unsigned long start = range->start;
struct hmm_vma_walk hmm_vma_walk;
struct mm_walk mm_walk;
struct hmm *hmm;
int ret;
/* Sanity check, this really should not happen ! */
- if (start < vma->vm_start || start >= vma->vm_end)
+ if (range->start < vma->vm_start || range->start >= vma->vm_end)
return -EINVAL;
- if (end < vma->vm_start || end > vma->vm_end)
+ if (range->end < vma->vm_start || range->end > vma->vm_end)
return -EINVAL;
hmm = hmm_register(vma->vm_mm);
if (!hmm) {
- hmm_pfns_clear(pfns, start, end);
+ hmm_pfns_clear(range, range->pfns, range->start, range->end);
return -ENOMEM;
}
/* Caller must have registered a mirror using hmm_mirror_register() */
if (!hmm->mmu_notifier.ops)
return -EINVAL;
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(range);
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it does
+ * not allow write access, either. Architecture that allow
+ * write without read access are not supported by HMM, because
+ * operations such has atomic access would not work.
+ */
+ hmm_pfns_clear(range, range->pfns, range->start, range->end);
+ return -EPERM;
+ }
+
/* Initialize range to track CPU page table update */
- range->start = start;
- range->pfns = pfns;
- range->end = end;
spin_lock(&hmm->lock);
range->valid = true;
list_add_rcu(&range->list, &hmm->ranges);
spin_unlock(&hmm->lock);
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
- hmm_pfns_special(pfns, start, end);
- return 0;
- }
-
hmm_vma_walk.fault = true;
- hmm_vma_walk.write = write;
hmm_vma_walk.block = block;
hmm_vma_walk.range = range;
mm_walk.private = &hmm_vma_walk;
@@ -708,7 +886,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
mm_walk.pte_hole = hmm_vma_walk_hole;
do {
- ret = walk_page_range(start, end, &mm_walk);
+ ret = walk_page_range(start, range->end, &mm_walk);
start = hmm_vma_walk.last;
} while (ret == -EAGAIN);
@@ -716,8 +894,9 @@ int hmm_vma_fault(struct vm_area_struct *vma,
unsigned long i;
i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
- hmm_vma_range_done(vma, range);
+ hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
+ range->end);
+ hmm_vma_range_done(range);
}
return ret;
}
@@ -836,13 +1015,6 @@ static void hmm_devmem_release(struct device *dev, void *data)
hmm_devmem_radix_release(resource);
}
-static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
-}
-
static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
resource_size_t key, align_start, align_size, align_end;
@@ -883,9 +1055,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
struct hmm_devmem *dup;
- rcu_read_lock();
- dup = hmm_devmem_find(key);
- rcu_read_unlock();
+ dup = radix_tree_lookup(&hmm_devmem_radix,
+ key >> PA_SECTION_SHIFT);
if (dup) {
dev_err(device, "%s: collides with mapping for %s\n",
__func__, dev_name(dup->device));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5a68730eebd6..f2eccca30535 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2356,26 +2356,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
struct page *page_tail = head + tail;
VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
- VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
- * tail_page->_refcount is zero and not changing from under us. But
- * get_page_unless_zero() may be running from under us on the
- * tail_page. If we used atomic_set() below instead of atomic_inc() or
- * atomic_add(), we would then run atomic_set() concurrently with
- * get_page_unless_zero(), and atomic_set() is implemented in C not
- * using locked ops. spin_unlock on x86 sometime uses locked ops
- * because of PPro errata 66, 92, so unless somebody can guarantee
- * atomic_set() here would be safe on all archs (and not only on x86),
- * it's safer to use atomic_inc()/atomic_add().
+ * Clone page flags before unfreezing refcount.
+ *
+ * After successful get_page_unless_zero() might follow flags change,
+ * for exmaple lock_page() which set PG_waiters.
*/
- if (PageAnon(head) && !PageSwapCache(head)) {
- page_ref_inc(page_tail);
- } else {
- /* Additional pin to radix tree */
- page_ref_add(page_tail, 2);
- }
-
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
((1L << PG_referenced) |
@@ -2388,14 +2375,21 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_unevictable) |
(1L << PG_dirty)));
- /*
- * After clearing PageTail the gup refcount can be released.
- * Page flags also must be visible before we make the page non-compound.
- */
+ /* Page flags must be visible before we make the page non-compound. */
smp_wmb();
+ /*
+ * Clear PageTail before unfreezing page refcount.
+ *
+ * After successful get_page_unless_zero() might follow put_page()
+ * which needs correct compound_head().
+ */
clear_compound_head(page_tail);
+ /* Finally unfreeze refcount. Additional reference from page cache. */
+ page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
+ PageSwapCache(head)));
+
if (page_is_young(head))
set_page_young(page_tail);
if (page_is_idle(head))
@@ -2408,6 +2402,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+
+ /*
+ * always add to the tail because some iterators expect new
+ * pages to show after the currently processed elements - e.g.
+ * migrate_pages
+ */
lru_add_page_tail(head, page_tail, lruvec, list);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 976bbc5646fe..218679138255 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -637,29 +637,22 @@ EXPORT_SYMBOL_GPL(linear_hugepage_index);
*/
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
- struct hstate *hstate;
-
- if (!is_vm_hugetlb_page(vma))
- return PAGE_SIZE;
-
- hstate = hstate_vma(vma);
-
- return 1UL << huge_page_shift(hstate);
+ if (vma->vm_ops && vma->vm_ops->pagesize)
+ return vma->vm_ops->pagesize(vma);
+ return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
/*
* Return the page size being used by the MMU to back a VMA. In the majority
* of cases, the page size used by the kernel matches the MMU size. On
- * architectures where it differs, an architecture-specific version of this
- * function is required.
+ * architectures where it differs, an architecture-specific 'strong'
+ * version of this symbol is required.
*/
-#ifndef vma_mmu_pagesize
-unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
return vma_kernel_pagesize(vma);
}
-#endif
/*
* Flags for MAP_PRIVATE reservations. These are stored in the bottom
@@ -3153,6 +3146,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
return 0;
}
+static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
+{
+ struct hstate *hstate = hstate_vma(vma);
+
+ return 1UL << huge_page_shift(hstate);
+}
+
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -3170,6 +3170,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
.split = hugetlb_vm_op_split,
+ .pagesize = hugetlb_vm_op_pagesize,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
diff --git a/mm/internal.h b/mm/internal.h
index e6bd35182dae..62d8c34e63d5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
+extern void set_zone_contiguous(struct zone *zone);
+extern void clear_zone_contiguous(struct zone *zone);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -495,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
enum ttu_flags;
struct tlbflush_unmap_batch;
@@ -538,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
}
void setup_zone_pageset(struct zone *zone);
+extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index e13d911251e7..135ce2838c89 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -323,9 +323,9 @@ void kasan_free_pages(struct page *page, unsigned int order)
* Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
* For larger allocations larger redzones are used.
*/
-static size_t optimal_redzone(size_t object_size)
+static unsigned int optimal_redzone(unsigned int object_size)
{
- int rz =
+ return
object_size <= 64 - 16 ? 16 :
object_size <= 128 - 32 ? 32 :
object_size <= 512 - 64 ? 64 :
@@ -333,14 +333,13 @@ static size_t optimal_redzone(size_t object_size)
object_size <= (1 << 14) - 256 ? 256 :
object_size <= (1 << 15) - 512 ? 512 :
object_size <= (1 << 16) - 1024 ? 1024 : 2048;
- return rz;
}
-void kasan_cache_create(struct kmem_cache *cache, size_t *size,
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
slab_flags_t *flags)
{
+ unsigned int orig_size = *size;
int redzone_adjust;
- int orig_size = *size;
/* Add alloc meta. */
cache->kasan_info.alloc_meta_offset = *size;
@@ -358,7 +357,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
if (redzone_adjust > 0)
*size += redzone_adjust;
- *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size +
+ *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
+ max(*size, cache->object_size +
optimal_redzone(cache->object_size)));
/*
@@ -382,7 +382,8 @@ void kasan_cache_shrink(struct kmem_cache *cache)
void kasan_cache_shutdown(struct kmem_cache *cache)
{
- quarantine_remove_cache(cache);
+ if (!__kmem_cache_empty(cache))
+ quarantine_remove_cache(cache);
}
size_t kasan_metadata_size(struct kmem_cache *cache)
@@ -791,6 +792,41 @@ DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+ * We can't use pud_large() or pud_huge(), the first one
+ * is arch-specific, the last one depend on HUGETLB_PAGE.
+ * So let's abuse pud_bad(), if bud is bad it's has to
+ * because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
static int __meminit kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -812,6 +848,14 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
case MEM_GOING_ONLINE: {
void *ret;
+ /*
+ * If shadow is mapped already than it must have been mapped
+ * during the boot. This could happen if we onlining previously
+ * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
shadow_end, GFP_KERNEL,
PAGE_KERNEL, VM_NO_GUARD,
@@ -823,8 +867,18 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
kmemleak_ignore(ret);
return NOTIFY_OK;
}
- case MEM_OFFLINE:
- vfree((void *)shadow_start);
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * Only hot-added memory have vm_area. Freeing shadow
+ * mapped during boot would be tricky, so we'll just
+ * have to keep it.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
}
return NOTIFY_OK;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 46c2290a08f1..9a085d525bbc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1187,6 +1187,11 @@ EXPORT_SYMBOL(kmemleak_no_scan);
/**
* kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical
* address argument
+ * @phys: physical address of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object.
+ * See kmemleak_alloc()
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
*/
void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
gfp_t gfp)
@@ -1199,6 +1204,9 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
/**
* kmemleak_free_part_phys - similar to kmemleak_free_part but taking a
* physical address argument
+ * @phys: physical address if the beginning or inside an object. This
+ * also represents the start of the range to be freed
+ * @size: size to be unregistered
*/
void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
@@ -1210,6 +1218,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys);
/**
* kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
* address argument
+ * @phys: physical address of the object
*/
void __ref kmemleak_not_leak_phys(phys_addr_t phys)
{
@@ -1221,6 +1230,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
/**
* kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
* address argument
+ * @phys: physical address of the object
*/
void __ref kmemleak_ignore_phys(phys_addr_t phys)
{
@@ -1963,7 +1973,7 @@ static void kmemleak_disable(void)
/*
* Allow boot-time kmemleak disabling (enabled by default).
*/
-static int kmemleak_boot_config(char *str)
+static int __init kmemleak_boot_config(char *str)
{
if (!str)
return -EINVAL;
diff --git a/mm/ksm.c b/mm/ksm.c
index adb5f991da8e..e8d6c6210b80 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1318,10 +1318,10 @@ bool is_page_sharing_candidate(struct stable_node *stable_node)
return __is_page_sharing_candidate(stable_node, 0);
}
-struct page *stable_node_dup(struct stable_node **_stable_node_dup,
- struct stable_node **_stable_node,
- struct rb_root *root,
- bool prune_stale_stable_nodes)
+static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
+ struct stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
{
struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
struct hlist_node *hlist_safe;
@@ -2082,8 +2082,22 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
tree_rmap_item =
unstable_tree_search_insert(rmap_item, page, &tree_page);
if (tree_rmap_item) {
+ bool split;
+
kpage = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
+ /*
+ * If both pages we tried to merge belong to the same compound
+ * page, then we actually ended up increasing the reference
+ * count of the same compound page twice, and split_huge_page
+ * failed.
+ * Here we set a flag if that happened, and we use it later to
+ * try split_huge_page again. Since we call put_page right
+ * afterwards, the reference count will be correct and
+ * split_huge_page should succeed.
+ */
+ split = PageTransCompound(page)
+ && compound_head(page) == compound_head(tree_page);
put_page(tree_page);
if (kpage) {
/*
@@ -2110,6 +2124,20 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
break_cow(tree_rmap_item);
break_cow(rmap_item);
}
+ } else if (split) {
+ /*
+ * We are here if we tried to merge two pages and
+ * failed because they both belonged to the same
+ * compound page. We will split the page now, but no
+ * merging will take place.
+ * We do not want to add the cost of a full lock; if
+ * the page is locked, it is better to skip it and
+ * perhaps try again later.
+ */
+ if (!trylock_page(page))
+ return;
+ split_huge_page(page);
+ unlock_page(page);
}
}
}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fd41e969ede5..d9c84c5bda1d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
+#include <linux/prefetch.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
@@ -52,14 +53,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru)
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
+ struct list_lru_memcg *memcg_lrus;
/*
- * The lock protects the array of per cgroup lists from relocation
- * (see memcg_update_list_lru_node).
+ * Either lock or RCU protects the array of per cgroup lists
+ * from relocation (see memcg_update_list_lru_node).
*/
- lockdep_assert_held(&nlru->lock);
- if (nlru->memcg_lrus && idx >= 0)
- return nlru->memcg_lrus->lru[idx];
-
+ memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
+ lockdep_is_held(&nlru->lock));
+ if (memcg_lrus && idx >= 0)
+ return memcg_lrus->lru[idx];
return &nlru->lru;
}
@@ -132,6 +134,12 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
+ /*
+ * Prefetch the neighboring list entries to reduce lock hold time.
+ */
+ prefetchw(item->prev);
+ prefetchw(item->next);
+
spin_lock(&nlru->lock);
if (!list_empty(item)) {
l = list_lru_from_kmem(nlru, item);
@@ -168,10 +176,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru,
struct list_lru_one *l;
unsigned long count;
- spin_lock(&nlru->lock);
+ rcu_read_lock();
l = list_lru_from_memcg_idx(nlru, memcg_idx);
count = l->nr_items;
- spin_unlock(&nlru->lock);
+ rcu_read_unlock();
return count;
}
@@ -324,24 +332,41 @@ fail:
static int memcg_init_list_lru_node(struct list_lru_node *nlru)
{
+ struct list_lru_memcg *memcg_lrus;
int size = memcg_nr_cache_ids;
- nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL);
- if (!nlru->memcg_lrus)
+ memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
+ size * sizeof(void *), GFP_KERNEL);
+ if (!memcg_lrus)
return -ENOMEM;
- if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
- kvfree(nlru->memcg_lrus);
+ if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
+ kvfree(memcg_lrus);
return -ENOMEM;
}
+ RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);
return 0;
}
static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
{
- __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
- kvfree(nlru->memcg_lrus);
+ struct list_lru_memcg *memcg_lrus;
+ /*
+ * This is called when shrinker has already been unregistered,
+ * and nobody can use it. So, there is no need to use kvfree_rcu().
+ */
+ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
+ __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
+ kvfree(memcg_lrus);
+}
+
+static void kvfree_rcu(struct rcu_head *head)
+{
+ struct list_lru_memcg *mlru;
+
+ mlru = container_of(head, struct list_lru_memcg, rcu);
+ kvfree(mlru);
}
static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -351,8 +376,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
BUG_ON(old_size > new_size);
- old = nlru->memcg_lrus;
- new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL);
+ old = rcu_dereference_protected(nlru->memcg_lrus,
+ lockdep_is_held(&list_lrus_mutex));
+ new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -361,29 +387,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
return -ENOMEM;
}
- memcpy(new, old, old_size * sizeof(void *));
+ memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
/*
- * The lock guarantees that we won't race with a reader
- * (see list_lru_from_memcg_idx).
+ * The locking below allows readers that hold nlru->lock avoid taking
+ * rcu_read_lock (see list_lru_from_memcg_idx).
*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
spin_lock_irq(&nlru->lock);
- nlru->memcg_lrus = new;
+ rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
- kvfree(old);
+ call_rcu(&old->rcu, kvfree_rcu);
return 0;
}
static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
int old_size, int new_size)
{
+ struct list_lru_memcg *memcg_lrus;
+
+ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus,
+ lockdep_is_held(&list_lrus_mutex));
/* do not bother shrinking the array back to the old size, because we
* cannot handle allocation failures here */
- __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+ __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
}
static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
diff --git a/mm/memblock.c b/mm/memblock.c
index 9307cbd87089..5108356ad8aa 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -17,6 +17,7 @@
#include <linux/poison.h>
#include <linux/pfn.h>
#include <linux/debugfs.h>
+#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
@@ -924,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : ULLONG_MAX;
+ r->base : (phys_addr_t)ULLONG_MAX;
/*
* if idx_b advanced past idx_a,
@@ -1040,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : ULLONG_MAX;
+ r->base : (phys_addr_t)ULLONG_MAX;
/*
* if idx_b advanced past idx_a,
* break out to advance idx_a
@@ -1345,7 +1346,7 @@ void * __init memblock_virt_alloc_try_nid_raw(
min_addr, max_addr, nid);
#ifdef CONFIG_DEBUG_VM
if (ptr && size > 0)
- memset(ptr, 0xff, size);
+ memset(ptr, PAGE_POISON_PATTERN, size);
#endif
return ptr;
}
@@ -1750,29 +1751,6 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
}
}
-extern unsigned long __init_memblock
-memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
-{
- struct memblock_region *rgn;
- unsigned long size = 0;
- int idx;
-
- for_each_memblock_type(idx, (&memblock.reserved), rgn) {
- phys_addr_t start, end;
-
- if (rgn->base + rgn->size < start_addr)
- continue;
- if (rgn->base > end_addr)
- continue;
-
- start = rgn->base;
- end = start + rgn->size;
- size += end - start;
- }
-
- return size;
-}
-
void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
@@ -1818,18 +1796,7 @@ static int memblock_debug_show(struct seq_file *m, void *private)
}
return 0;
}
-
-static int memblock_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, memblock_debug_show, inode->i_private);
-}
-
-static const struct file_operations memblock_debug_fops = {
- .open = memblock_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(memblock_debug);
static int __init memblock_init_debugfs(void)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9ec024b862ac..aa1e7455e313 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -888,7 +888,8 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
* value, the function breaks the iteration loop and returns the value.
* Otherwise, it will iterate over all tasks and return 0.
*
- * This function must not be called for the root memory cgroup.
+ * If memcg is the root memory cgroup, this function will iterate only
+ * over tasks belonging directly to the root memory cgroup.
*/
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
int (*fn)(struct task_struct *, void *), void *arg)
@@ -896,8 +897,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
-
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
struct task_struct *task;
@@ -906,7 +905,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
- if (ret) {
+ if (ret || memcg == root_mem_cgroup) {
mem_cgroup_iter_break(memcg, iter);
break;
}
@@ -2592,6 +2591,224 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
return ret;
}
+static long memcg_oom_badness(struct mem_cgroup *memcg,
+ const nodemask_t *nodemask,
+ unsigned long totalpages)
+{
+ long points = 0;
+ int nid;
+ pg_data_t *pgdat;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (nodemask && !node_isset(nid, *nodemask))
+ continue;
+
+ points += mem_cgroup_node_nr_lru_pages(memcg, nid,
+ LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
+
+ pgdat = NODE_DATA(nid);
+ points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
+ NR_SLAB_UNRECLAIMABLE);
+ }
+
+ points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
+ (PAGE_SIZE / 1024);
+ points += memcg_page_state(memcg, MEMCG_SOCK);
+ points += memcg_page_state(memcg, MEMCG_SWAP);
+
+ return points;
+}
+
+/*
+ * Checks if the given memcg is a valid OOM victim and returns a number,
+ * which means the folowing:
+ * -1: there are inflight OOM victim tasks, belonging to the memcg
+ * 0: memcg is not eligible, e.g. all belonging tasks are protected
+ * by oom_score_adj set to OOM_SCORE_ADJ_MIN
+ * >0: memcg is eligible, and the returned value is an estimation
+ * of the memory footprint
+ */
+static long oom_evaluate_memcg(struct mem_cgroup *memcg,
+ const nodemask_t *nodemask,
+ unsigned long totalpages)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+ int eligible = 0;
+
+ /*
+ * Root memory cgroup is a special case:
+ * we don't have necessary stats to evaluate it exactly as
+ * leaf memory cgroups, so we approximate it's oom_score
+ * by summing oom_score of all belonging tasks, which are
+ * owners of their mm structs.
+ *
+ * If there are inflight OOM victim tasks inside
+ * the root memcg, we return -1.
+ */
+ if (memcg == root_mem_cgroup) {
+ struct css_task_iter it;
+ struct task_struct *task;
+ long score = 0;
+
+ css_task_iter_start(&memcg->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ if (tsk_is_oom_victim(task) &&
+ !test_bit(MMF_OOM_SKIP,
+ &task->signal->oom_mm->flags)) {
+ score = -1;
+ break;
+ }
+
+ task_lock(task);
+ if (!task->mm || task->mm->owner != task) {
+ task_unlock(task);
+ continue;
+ }
+ task_unlock(task);
+
+ score += oom_badness(task, memcg, nodemask,
+ totalpages);
+ }
+ css_task_iter_end(&it);
+
+ return score;
+ }
+
+ /*
+ * Memcg is OOM eligible if there are OOM killable tasks inside.
+ *
+ * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN
+ * as unkillable.
+ *
+ * If there are inflight OOM victim tasks inside the memcg,
+ * we return -1.
+ */
+ css_task_iter_start(&memcg->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ if (!eligible &&
+ task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)
+ eligible = 1;
+
+ if (tsk_is_oom_victim(task) &&
+ !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+ eligible = -1;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ if (eligible <= 0)
+ return eligible;
+
+ return memcg_oom_badness(memcg, nodemask, totalpages);
+}
+
+static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
+{
+ struct mem_cgroup *iter, *group = NULL;
+ long group_score = 0;
+
+ oc->chosen_memcg = NULL;
+ oc->chosen_points = 0;
+
+ /*
+ * If OOM is memcg-wide, and the memcg has the oom_group flag set,
+ * all tasks belonging to the memcg should be killed.
+ * So, we mark the memcg as a victim.
+ */
+ if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) {
+ oc->chosen_memcg = oc->memcg;
+ css_get(&oc->chosen_memcg->css);
+ return;
+ }
+
+ /*
+ * The oom_score is calculated for leaf memory cgroups (including
+ * the root memcg).
+ * Non-leaf oom_group cgroups accumulating score of descendant
+ * leaf memory cgroups.
+ */
+ rcu_read_lock();
+ for_each_mem_cgroup_tree(iter, root) {
+ long score;
+
+ /*
+ * We don't consider non-leaf non-oom_group memory cgroups
+ * as OOM victims.
+ */
+ if (memcg_has_children(iter) && iter != root_mem_cgroup &&
+ !mem_cgroup_oom_group(iter))
+ continue;
+
+ /*
+ * If group is not set or we've ran out of the group's sub-tree,
+ * we should set group and reset group_score.
+ */
+ if (!group || group == root_mem_cgroup ||
+ !mem_cgroup_is_descendant(iter, group)) {
+ group = iter;
+ group_score = 0;
+ }
+
+ if (memcg_has_children(iter) && iter != root_mem_cgroup)
+ continue;
+
+ score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages);
+
+ /*
+ * Ignore empty and non-eligible memory cgroups.
+ */
+ if (score == 0)
+ continue;
+
+ /*
+ * If there are inflight OOM victims, we don't need
+ * to look further for new victims.
+ */
+ if (score == -1) {
+ oc->chosen_memcg = INFLIGHT_VICTIM;
+ mem_cgroup_iter_break(root, iter);
+ break;
+ }
+
+ group_score += score;
+
+ if (group_score > oc->chosen_points) {
+ oc->chosen_points = group_score;
+ oc->chosen_memcg = group;
+ }
+ }
+
+ if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM)
+ css_get(&oc->chosen_memcg->css);
+
+ rcu_read_unlock();
+}
+
+bool mem_cgroup_select_oom_victim(struct oom_control *oc)
+{
+ struct mem_cgroup *root;
+
+ if (mem_cgroup_disabled())
+ return false;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return false;
+
+ if (oc->memcg)
+ root = oc->memcg;
+ else
+ root = root_mem_cgroup;
+
+ select_victim_memcg(root, oc);
+
+ return oc->chosen_memcg;
+}
+
/*
* Reclaims as many pages from the given memcg as possible.
*
@@ -5187,6 +5404,39 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
return nbytes;
}
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ bool oom_group = memcg->oom_group;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return -ENOTSUPP;
+
+ seq_printf(m, "%d\n", oom_group);
+
+ return 0;
+}
+
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int oom_group;
+ int err;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return -ENOTSUPP;
+
+ err = kstrtoint(strstrip(buf), 0, &oom_group);
+ if (err)
+ return err;
+
+ memcg->oom_group = oom_group;
+
+ return nbytes;
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5307,6 +5557,12 @@ static struct cftype memory_files[] = {
.write = memory_max_write,
},
{
+ .name = "oom_group",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_group_show,
+ .write = memory_oom_group_write,
+ },
+ {
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, events_file),
diff --git a/mm/memfd.c b/mm/memfd.c
new file mode 100644
index 000000000000..c913fbfa094a
--- /dev/null
+++ b/mm/memfd.c
@@ -0,0 +1,342 @@
+/*
+ * memfd_create system call and file sealing support
+ *
+ * Code was originally included in shmem.c, and broken out to facilitate
+ * use by hugetlbfs as well as tmpfs.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/khugepaged.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
+#include <linux/memfd.h>
+#include <uapi/linux/memfd.h>
+
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on shmem.
+ */
+#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN 4 /* about 150ms max */
+
+static void shmem_tag_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+
+ lru_add_drain();
+ start = 0;
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ page = radix_tree_deref_slot(slot);
+ if (!page || radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ } else if (page_count(page) - page_mapcount(page) > 1) {
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_tag_set(&mapping->page_tree, iter.index,
+ SHMEM_TAG_PINNED);
+ spin_unlock_irq(&mapping->tree_lock);
+ }
+
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether it has an elevated ref-count. If so, we tag them and wait for
+ * them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
+static int shmem_wait_for_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+ int error, scan;
+
+ shmem_tag_pins(mapping);
+
+ error = 0;
+ for (scan = 0; scan <= LAST_SCAN; scan++) {
+ if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+ break;
+
+ if (!scan)
+ lru_add_drain_all();
+ else if (schedule_timeout_killable((HZ << scan) / 200))
+ scan = LAST_SCAN;
+
+ start = 0;
+ rcu_read_lock();
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+ start, SHMEM_TAG_PINNED) {
+
+ page = radix_tree_deref_slot(slot);
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ page = NULL;
+ }
+
+ if (page &&
+ page_count(page) - page_mapcount(page) != 1) {
+ if (scan < LAST_SCAN)
+ goto continue_resched;
+
+ /*
+ * On the last scan, we clean up all those tags
+ * we inserted; but make a note that we still
+ * found pages pinned.
+ */
+ error = -EBUSY;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_tag_clear(&mapping->page_tree,
+ iter.index, SHMEM_TAG_PINNED);
+ spin_unlock_irq(&mapping->tree_lock);
+continue_resched:
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return error;
+}
+
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+ if (shmem_file(file))
+ return &SHMEM_I(file_inode(file))->seals;
+
+ if (is_file_hugepages(file))
+ return &HUGETLBFS_I(file_inode(file))->seals;
+
+ return NULL;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE)
+
+static int memfd_add_seals(struct file *file, unsigned int seals)
+{
+ struct inode *inode = file_inode(file);
+ unsigned int *file_seals;
+ int error;
+
+ /*
+ * SEALING
+ * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
+ * but restrict access to a specific subset of file operations. Seals
+ * can only be added, but never removed. This way, mutually untrusted
+ * parties can share common memory regions with a well-defined policy.
+ * A malicious peer can thus never perform unwanted operations on a
+ * shared object.
+ *
+ * Seals are only supported on special tmpfs or hugetlbfs files and
+ * always affect the whole underlying inode. Once a seal is set, it
+ * may prevent some kinds of access to the file. Currently, the
+ * following seals are defined:
+ * SEAL_SEAL: Prevent further seals from being set on this file
+ * SEAL_SHRINK: Prevent the file from shrinking
+ * SEAL_GROW: Prevent the file from growing
+ * SEAL_WRITE: Prevent write access to the file
+ *
+ * As we don't require any trust relationship between two parties, we
+ * must prevent seals from being removed. Therefore, sealing a file
+ * only adds a given set of seals to the file, it never touches
+ * existing seals. Furthermore, the "setting seals"-operation can be
+ * sealed itself, which basically prevents any further seal from being
+ * added.
+ *
+ * Semantics of sealing are only defined on volatile files. Only
+ * anonymous tmpfs and hugetlbfs files support sealing. More
+ * importantly, seals are never written to disk. Therefore, there's
+ * no plan to support it on other file types.
+ */
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ if (seals & ~(unsigned int)F_ALL_SEALS)
+ return -EINVAL;
+
+ inode_lock(inode);
+
+ file_seals = memfd_file_seals_ptr(file);
+ if (!file_seals) {
+ error = -EINVAL;
+ goto unlock;
+ }
+
+ if (*file_seals & F_SEAL_SEAL) {
+ error = -EPERM;
+ goto unlock;
+ }
+
+ if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
+ error = mapping_deny_writable(file->f_mapping);
+ if (error)
+ goto unlock;
+
+ error = shmem_wait_for_pins(file->f_mapping);
+ if (error) {
+ mapping_allow_writable(file->f_mapping);
+ goto unlock;
+ }
+ }
+
+ *file_seals |= seals;
+ error = 0;
+
+unlock:
+ inode_unlock(inode);
+ return error;
+}
+
+static int memfd_get_seals(struct file *file)
+{
+ unsigned int *seals = memfd_file_seals_ptr(file);
+
+ return seals ? *seals : -EINVAL;
+}
+
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long error;
+
+ switch (cmd) {
+ case F_ADD_SEALS:
+ /* disallow upper 32bit */
+ if (arg > UINT_MAX)
+ return -EINVAL;
+
+ error = memfd_add_seals(file, arg);
+ break;
+ case F_GET_SEALS:
+ error = memfd_get_seals(file);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
+ int fd, error;
+ char *name;
+ long len;
+
+ if (!(flags & MFD_HUGETLB)) {
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+ } else {
+ /* Allow huge page size encoding in flags. */
+ if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+ return -EINVAL;
+ }
+
+ /* length includes terminating zero */
+ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+ if (len <= 0)
+ return -EFAULT;
+ if (len > MFD_NAME_MAX_LEN + 1)
+ return -EINVAL;
+
+ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, MFD_NAME_PREFIX);
+ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ /* terminating-zero may have changed after strnlen_user() returned */
+ if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ if (flags & MFD_HUGETLB) {
+ struct user_struct *user = NULL;
+
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_RDWR | O_LARGEFILE;
+
+ if (flags & MFD_ALLOW_SEALING) {
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
+ }
+
+ fd_install(fd, file);
+ kfree(name);
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_name:
+ kfree(name);
+ return error;
+}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8291b75f42c8..9d142b9b86dc 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -502,6 +502,7 @@ static const char * const action_page_types[] = {
[MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
[MF_MSG_HUGE] = "huge page",
[MF_MSG_FREE_HUGE] = "free huge page",
+ [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
[MF_MSG_UNMAP_FAILED] = "unmapping failed page",
[MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
[MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
@@ -1084,6 +1085,21 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
return 0;
}
+ /*
+ * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
+ * simply disable it. In order to make it work properly, we need
+ * make sure that:
+ * - conversion of a pud that maps an error hugetlb into hwpoison
+ * entry properly works, and
+ * - other mm code walking over page table is aware of pud-aligned
+ * hwpoison entries.
+ */
+ if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
+ action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
@@ -1471,7 +1487,7 @@ int unpoison_memory(unsigned long pfn)
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private, int **x)
+static struct page *new_page(struct page *p, unsigned long private)
{
int nid = page_to_nid(p);
diff --git a/mm/memory.c b/mm/memory.c
index aed37325d94e..a1f990e33e38 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2883,26 +2883,16 @@ EXPORT_SYMBOL(unmap_mapping_range);
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *swapcache = NULL;
+ struct page *page = NULL, *swapcache;
struct mem_cgroup *memcg;
- struct vma_swap_readahead swap_ra;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
int ret = 0;
- bool vma_readahead = swap_use_vma_readahead();
- if (vma_readahead) {
- page = swap_readahead_detect(vmf, &swap_ra);
- swapcache = page;
- }
-
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
- if (page)
- put_page(page);
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
- }
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
@@ -2928,19 +2918,17 @@ int do_swap_page(struct vm_fault *vmf)
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- if (!page) {
- page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
- vmf->address);
- swapcache = page;
- }
+ page = lookup_swap_cache(entry, vma, vmf->address);
+ swapcache = page;
if (!page) {
struct swap_info_struct *si = swp_swap_info(entry);
if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(si, entry) == 1) {
+ __swap_count(entry) == 1) {
/* skip swapcache */
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
if (page) {
__SetPageLocked(page);
__SetPageSwapBacked(page);
@@ -2949,12 +2937,8 @@ int do_swap_page(struct vm_fault *vmf)
swap_readpage(page, true);
}
} else {
- if (vma_readahead)
- page = do_swap_page_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
- else
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ vmf);
swapcache = page;
}
@@ -2982,7 +2966,6 @@ int do_swap_page(struct vm_fault *vmf)
*/
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- swapcache = page;
goto out_release;
}
@@ -3052,7 +3035,6 @@ int do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -3066,6 +3048,7 @@ int do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b2bd52ff7605..f74826cdceea 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -250,7 +250,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
struct vmem_altmap *altmap, bool want_memblock)
{
int ret;
- int i;
if (pfn_valid(phys_start_pfn))
return -EEXIST;
@@ -259,27 +258,10 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
if (ret < 0)
return ret;
- /*
- * Make all the pages reserved so that nobody will stumble over half
- * initialized state.
- * FIXME: We also have to associate it with a node because page_to_nid
- * relies on having page with the proper node.
- */
- for (i = 0; i < PAGES_PER_SECTION; i++) {
- unsigned long pfn = phys_start_pfn + i;
- struct page *page;
- if (!pfn_valid(pfn))
- continue;
-
- page = pfn_to_page(pfn);
- set_page_node(page, nid);
- SetPageReserved(page);
- }
-
if (!want_memblock)
return 0;
- return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
+ return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
}
/*
@@ -559,6 +541,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
* @zone: zone from which pages need to be removed
* @phys_start_pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
+ * @altmap: alternative device page map or %NULL if default memmap is used
*
* Generic helper function to remove section mappings and sysfs entries
* for the section of the memory we are removing. Caller needs to make
@@ -908,8 +891,15 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int nid;
int ret;
struct memory_notify arg;
+ struct memory_block *mem;
+
+ /*
+ * We can't use pfn_to_nid() because nid might be stored in struct page
+ * which is not yet initialized. Instead, we find nid from memory block.
+ */
+ mem = find_memory_block(__pfn_to_section(pfn));
+ nid = mem->nid;
- nid = pfn_to_nid(pfn);
/* associate pfn range with the zone */
zone = move_pfn_range(online_type, nid, pfn, nr_pages);
@@ -1055,6 +1045,7 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
/**
* try_online_node - online a node if offlined
+ * @nid: the node ID
*
* called by cpu_up() to online a node without onlined memory.
*/
@@ -1083,15 +1074,16 @@ out:
static int check_hotplug_memory_range(u64 start, u64 size)
{
- u64 start_pfn = PFN_DOWN(start);
+ unsigned long block_sz = memory_block_size_bytes();
+ u64 block_nr_pages = block_sz >> PAGE_SHIFT;
u64 nr_pages = size >> PAGE_SHIFT;
+ u64 start_pfn = PFN_DOWN(start);
- /* Memory range must be aligned with section */
- if ((start_pfn & ~PAGE_SECTION_MASK) ||
- (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
- pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
- (unsigned long long)start,
- (unsigned long long)size);
+ /* memory range must be block size aligned */
+ if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
+ !IS_ALIGNED(nr_pages, block_nr_pages)) {
+ pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
+ block_sz, start, size);
return -EINVAL;
}
@@ -1337,8 +1329,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private,
- int **result)
+static struct page *new_node_page(struct page *page, unsigned long private)
{
int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
@@ -1381,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (isolate_huge_page(page, &source))
move_pages -= 1 << compound_order(head);
continue;
- } else if (thp_migration_supported() && PageTransHuge(page))
+ } else if (PageTransHuge(page))
pfn = page_to_pfn(compound_head(page))
+ hpage_nr_pages(page) - 1;
@@ -1814,6 +1805,7 @@ static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
/**
* try_offline_node
+ * @nid: the node ID
*
* Offline a node if all memory sections and cpus of the node are removed.
*
@@ -1857,6 +1849,9 @@ EXPORT_SYMBOL(try_offline_node);
/**
* remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations before this call, as required by
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 01cbb7078d6c..9ac49ef17b4e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
goto out;
}
- if (!thp_migration_supported()) {
- get_page(page);
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- goto out;
- }
if (!queue_pages_required(page, qp)) {
ret = 1;
goto unlock;
@@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
-retry:
+
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -511,22 +502,6 @@ retry:
continue;
if (!queue_pages_required(page, qp))
continue;
- if (PageTransCompound(page) && !thp_migration_supported()) {
- get_page(page);
- pte_unmap_unlock(pte, ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- /* Failed to split -- skip. */
- if (ret) {
- pte = pte_offset_map_lock(walk->mm, pmd,
- addr, &ptl);
- continue;
- }
- goto retry;
- }
-
migrate_page_add(page, qp->pagelist, flags);
}
pte_unmap_unlock(pte - 1, ptl);
@@ -942,12 +917,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
}
}
-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+/* page allocation callback for NUMA node migration */
+struct page *alloc_new_node_page(struct page *page, unsigned long node)
{
if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
- else if (thp_migration_supported() && PageTransHuge(page)) {
+ else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_pages_node(node,
@@ -986,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, NULL, dest,
+ err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1107,7 +1083,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_page(struct page *page, unsigned long start, int **x)
+static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
unsigned long uninitialized_var(address);
@@ -1123,7 +1099,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
if (PageHuge(page)) {
return alloc_huge_page_vma(page_hstate(compound_head(page)),
vma, address);
- } else if (thp_migration_supported() && PageTransHuge(page)) {
+ } else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
@@ -1152,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_page(struct page *page, unsigned long start, int **x)
+static struct page *new_page(struct page *page, unsigned long start)
{
return NULL;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 003886606a22..f475fe9efc48 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1137,10 +1137,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
enum migrate_reason reason)
{
int rc = MIGRATEPAGE_SUCCESS;
- int *result = NULL;
struct page *newpage;
- newpage = get_new_page(page, private, &result);
+ if (!thp_migration_supported() && PageTransHuge(page))
+ return -ENOMEM;
+
+ newpage = get_new_page(page, private);
if (!newpage)
return -ENOMEM;
@@ -1161,14 +1163,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
- lock_page(page);
- rc = split_huge_page(page);
- unlock_page(page);
- if (rc)
- goto out;
- }
-
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
@@ -1231,12 +1225,6 @@ put_new:
put_page(newpage);
}
- if (result) {
- if (rc)
- *result = rc;
- else
- *result = page_to_nid(newpage);
- }
return rc;
}
@@ -1264,7 +1252,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
- int *result = NULL;
int page_was_mapped = 0;
struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
@@ -1281,7 +1268,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOSYS;
}
- new_hpage = get_new_page(hpage, private, &result);
+ new_hpage = get_new_page(hpage, private);
if (!new_hpage)
return -ENOMEM;
@@ -1345,12 +1332,6 @@ out:
else
putback_active_hugepage(new_hpage);
- if (result) {
- if (rc)
- *result = rc;
- else
- *result = page_to_nid(new_hpage);
- }
return rc;
}
@@ -1395,6 +1376,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
+retry:
cond_resched();
if (PageHuge(page))
@@ -1408,6 +1390,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
switch(rc) {
case -ENOMEM:
+ /*
+ * THP migration might be unsupported or the
+ * allocation could've failed so we should
+ * retry on the same page with the THP split
+ * to base pages.
+ *
+ * Head page is retried immediately and tail
+ * pages are added to the tail of the list so
+ * we encounter them after the rest of the list
+ * is processed.
+ */
+ if (PageTransHuge(page)) {
+ lock_page(page);
+ rc = split_huge_page_to_list(page, from);
+ unlock_page(page);
+ if (!rc) {
+ list_safe_reset_next(page, page2, lru);
+ goto retry;
+ }
+ }
nr_failed++;
goto out;
case -EAGAIN:
@@ -1444,141 +1446,101 @@ out:
}
#ifdef CONFIG_NUMA
-/*
- * Move a list of individual pages
- */
-struct page_to_node {
- unsigned long addr;
- struct page *page;
- int node;
- int status;
-};
-static struct page *new_page_node(struct page *p, unsigned long private,
- int **result)
+static int store_status(int __user *status, int start, int value, int nr)
{
- struct page_to_node *pm = (struct page_to_node *)private;
-
- while (pm->node != MAX_NUMNODES && pm->page != p)
- pm++;
+ while (nr-- > 0) {
+ if (put_user(value, status + start))
+ return -EFAULT;
+ start++;
+ }
- if (pm->node == MAX_NUMNODES)
- return NULL;
+ return 0;
+}
- *result = &pm->status;
+static int do_move_pages_to_node(struct mm_struct *mm,
+ struct list_head *pagelist, int node)
+{
+ int err;
- if (PageHuge(p))
- return alloc_huge_page_node(page_hstate(compound_head(p)),
- pm->node);
- else if (thp_migration_supported() && PageTransHuge(p)) {
- struct page *thp;
+ if (list_empty(pagelist))
+ return 0;
- thp = alloc_pages_node(pm->node,
- (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(pm->node,
- GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
+ err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
+ MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(pagelist);
+ return err;
}
/*
- * Move a set of pages as indicated in the pm array. The addr
- * field must be set to the virtual address of the page to be moved
- * and the node number must contain a valid target node.
- * The pm array ends with node = MAX_NUMNODES.
+ * Resolves the given address to a struct page, isolates it from the LRU and
+ * puts it to the given pagelist.
+ * Returns -errno if the page cannot be found/isolated or 0 when it has been
+ * queued or the page doesn't need to be migrated because it is already on
+ * the target node
*/
-static int do_move_page_to_node_array(struct mm_struct *mm,
- struct page_to_node *pm,
- int migrate_all)
+static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+ int node, struct list_head *pagelist, bool migrate_all)
{
+ struct vm_area_struct *vma;
+ struct page *page;
+ unsigned int follflags;
int err;
- struct page_to_node *pp;
- LIST_HEAD(pagelist);
down_read(&mm->mmap_sem);
+ err = -EFAULT;
+ vma = find_vma(mm, addr);
+ if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+ goto out;
- /*
- * Build a list of pages to migrate
- */
- for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
- struct vm_area_struct *vma;
- struct page *page;
- struct page *head;
- unsigned int follflags;
-
- err = -EFAULT;
- vma = find_vma(mm, pp->addr);
- if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
- goto set_status;
-
- /* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
- if (!thp_migration_supported())
- follflags |= FOLL_SPLIT;
- page = follow_page(vma, pp->addr, follflags);
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ follflags = FOLL_GET | FOLL_DUMP;
+ page = follow_page(vma, addr, follflags);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto set_status;
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto out;
- err = -ENOENT;
- if (!page)
- goto set_status;
+ err = -ENOENT;
+ if (!page)
+ goto out;
- err = page_to_nid(page);
+ err = 0;
+ if (page_to_nid(page) == node)
+ goto out_putpage;
- if (err == pp->node)
- /*
- * Node already in the right place
- */
- goto put_and_set;
+ err = -EACCES;
+ if (page_mapcount(page) > 1 && !migrate_all)
+ goto out_putpage;
- err = -EACCES;
- if (page_mapcount(page) > 1 &&
- !migrate_all)
- goto put_and_set;
-
- if (PageHuge(page)) {
- if (PageHead(page)) {
- isolate_huge_page(page, &pagelist);
- err = 0;
- pp->page = page;
- }
- goto put_and_set;
+ if (PageHuge(page)) {
+ if (PageHead(page)) {
+ isolate_huge_page(page, pagelist);
+ err = 0;
}
+ } else {
+ struct page *head;
- pp->page = compound_head(page);
head = compound_head(page);
err = isolate_lru_page(head);
- if (!err) {
- list_add_tail(&head->lru, &pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
- }
-put_and_set:
- /*
- * Either remove the duplicate refcount from
- * isolate_lru_page() or drop the page ref if it was
- * not isolated.
- */
- put_page(page);
-set_status:
- pp->status = err;
- }
-
- err = 0;
- if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_page_node, NULL,
- (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
if (err)
- putback_movable_pages(&pagelist);
- }
+ goto out_putpage;
+ err = 0;
+ list_add_tail(&head->lru, pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head),
+ hpage_nr_pages(head));
+ }
+out_putpage:
+ /*
+ * Either remove the duplicate refcount from
+ * isolate_lru_page() or drop the page ref if it was
+ * not isolated.
+ */
+ put_page(page);
+out:
up_read(&mm->mmap_sem);
return err;
}
@@ -1593,79 +1555,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
const int __user *nodes,
int __user *status, int flags)
{
- struct page_to_node *pm;
- unsigned long chunk_nr_pages;
- unsigned long chunk_start;
- int err;
-
- err = -ENOMEM;
- pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
- if (!pm)
- goto out;
+ int current_node = NUMA_NO_NODE;
+ LIST_HEAD(pagelist);
+ int start, i;
+ int err = 0, err1;
migrate_prep();
- /*
- * Store a chunk of page_to_node array in a page,
- * but keep the last one as a marker
- */
- chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
-
- for (chunk_start = 0;
- chunk_start < nr_pages;
- chunk_start += chunk_nr_pages) {
- int j;
+ for (i = start = 0; i < nr_pages; i++) {
+ const void __user *p;
+ unsigned long addr;
+ int node;
- if (chunk_start + chunk_nr_pages > nr_pages)
- chunk_nr_pages = nr_pages - chunk_start;
-
- /* fill the chunk pm with addrs and nodes from user-space */
- for (j = 0; j < chunk_nr_pages; j++) {
- const void __user *p;
- int node;
-
- err = -EFAULT;
- if (get_user(p, pages + j + chunk_start))
- goto out_pm;
- pm[j].addr = (unsigned long) p;
-
- if (get_user(node, nodes + j + chunk_start))
- goto out_pm;
-
- err = -ENODEV;
- if (node < 0 || node >= MAX_NUMNODES)
- goto out_pm;
-
- if (!node_state(node, N_MEMORY))
- goto out_pm;
-
- err = -EACCES;
- if (!node_isset(node, task_nodes))
- goto out_pm;
+ err = -EFAULT;
+ if (get_user(p, pages + i))
+ goto out_flush;
+ if (get_user(node, nodes + i))
+ goto out_flush;
+ addr = (unsigned long)p;
+
+ err = -ENODEV;
+ if (node < 0 || node >= MAX_NUMNODES)
+ goto out_flush;
+ if (!node_state(node, N_MEMORY))
+ goto out_flush;
- pm[j].node = node;
+ err = -EACCES;
+ if (!node_isset(node, task_nodes))
+ goto out_flush;
+
+ if (current_node == NUMA_NO_NODE) {
+ current_node = node;
+ start = i;
+ } else if (node != current_node) {
+ err = do_move_pages_to_node(mm, &pagelist, current_node);
+ if (err)
+ goto out;
+ err = store_status(status, start, current_node, i - start);
+ if (err)
+ goto out;
+ start = i;
+ current_node = node;
}
- /* End marker for this chunk */
- pm[chunk_nr_pages].node = MAX_NUMNODES;
-
- /* Migrate this chunk */
- err = do_move_page_to_node_array(mm, pm,
- flags & MPOL_MF_MOVE_ALL);
- if (err < 0)
- goto out_pm;
+ /*
+ * Errors in the page lookup or isolation are not fatal and we simply
+ * report them via status
+ */
+ err = add_page_for_migration(mm, addr, current_node,
+ &pagelist, flags & MPOL_MF_MOVE_ALL);
+ if (!err)
+ continue;
- /* Return status information */
- for (j = 0; j < chunk_nr_pages; j++)
- if (put_user(pm[j].status, status + j + chunk_start)) {
- err = -EFAULT;
- goto out_pm;
- }
- }
- err = 0;
+ err = store_status(status, i, err, 1);
+ if (err)
+ goto out_flush;
-out_pm:
- free_page((unsigned long)pm);
+ err = do_move_pages_to_node(mm, &pagelist, current_node);
+ if (err)
+ goto out;
+ if (i > start) {
+ err = store_status(status, start, current_node, i - start);
+ if (err)
+ goto out;
+ }
+ current_node = NUMA_NO_NODE;
+ }
+out_flush:
+ /* Make sure we do not overwrite the existing error */
+ err1 = do_move_pages_to_node(mm, &pagelist, current_node);
+ if (!err1)
+ err1 = store_status(status, start, current_node, i - start);
+ if (!err)
+ err = err1;
out:
return err;
}
@@ -1866,8 +1828,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
}
static struct page *alloc_misplaced_dst_page(struct page *page,
- unsigned long data,
- int **result)
+ unsigned long data)
{
int nid = (int) data;
struct page *newpage;
@@ -1987,6 +1948,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
/*
+ * Also do not migrate dirty pages as not all filesystems can move
+ * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
+ */
+ if (page_is_file_cache(page) && PageDirty(page))
+ goto out;
+
+ /*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe226e6..a66f2052c7b1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
*/
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
+ struct swap_info_struct *si;
+
+ /* Prevent swap device to being swapoff under us */
+ si = get_swap_device(swp);
+ if (si) {
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
+ put_swap_device(si);
+ } else
+ page = NULL;
}
} else
page = find_get_page(mapping, pgoff);
diff --git a/mm/mmap.c b/mm/mmap.c
index aa0dc8231c0d..f2154fc2548b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3191,13 +3191,15 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (rlimit(RLIMIT_DATA) == 0 &&
mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
return true;
- if (!ignore_rlimit_data) {
- pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n",
- current->comm, current->pid,
- (mm->data_vm + npages) << PAGE_SHIFT,
- rlimit(RLIMIT_DATA));
+
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
+ current->comm, current->pid,
+ (mm->data_vm + npages) << PAGE_SHIFT,
+ rlimit(RLIMIT_DATA),
+ ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");
+
+ if (!ignore_rlimit_data)
return false;
- }
}
return true;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c1d6af7455da..625608bc8962 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -27,6 +27,7 @@
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
+#include <linux/mm_inline.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
@@ -89,6 +90,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page_mapcount(page) != 1)
continue;
+ /*
+ * While migration can move some dirty pages,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (page_is_file_cache(page) && PageDirty(page))
+ continue;
+
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index 4f8720243ae7..13723736d38f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -457,18 +457,6 @@ void __weak vmalloc_sync_all(void)
{
}
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- *
- * Returns: NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created. If the kernel address space is not shared
- * between processes, it syncs the pagetable across all
- * processes.
- */
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f2e7dfb81eee..dfd370526909 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -185,6 +185,8 @@ static bool is_dump_unreclaim_slabs(void)
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
* @totalpages: total present RAM allowed for page allocation
+ * @memcg: task's memory controller, if constrained
+ * @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* The heuristic for determining which task to kill is made to be as simple and
* predictable as possible. The goal is to return the highest value for the
@@ -224,13 +226,6 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
task_unlock(p);
- /*
- * Root processes get 3% bonus, just like the __vm_enough_memory()
- * implementation used by LSMs.
- */
- if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- points -= (points * 3) / 100;
-
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
points += adj;
@@ -309,7 +304,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
return CONSTRAINT_NONE;
}
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
unsigned long points;
@@ -343,26 +338,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto next;
/* Prefer thread group leaders for display purposes */
- if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+ if (points == oc->chosen_points && thread_group_leader(oc->chosen_task))
goto next;
select:
- if (oc->chosen)
- put_task_struct(oc->chosen);
+ if (oc->chosen_task)
+ put_task_struct(oc->chosen_task);
get_task_struct(task);
- oc->chosen = task;
+ oc->chosen_task = task;
oc->chosen_points = points;
next:
return 0;
abort:
- if (oc->chosen)
- put_task_struct(oc->chosen);
- oc->chosen = (void *)-1UL;
+ if (oc->chosen_task)
+ put_task_struct(oc->chosen_task);
+ oc->chosen_task = INFLIGHT_VICTIM;
return 1;
}
/*
* Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ * 'points'. In case scan was aborted, oc->chosen_task is set to -1.
*/
static void select_bad_process(struct oom_control *oc)
{
@@ -595,7 +590,8 @@ static void oom_reap_task(struct task_struct *tsk)
while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
- if (attempts <= MAX_OOM_REAP_RETRIES)
+ if (attempts <= MAX_OOM_REAP_RETRIES ||
+ test_bit(MMF_OOM_SKIP, &mm->flags))
goto done;
@@ -834,67 +830,22 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *p;
struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
/*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just give it access to memory reserves
- * so it can die quickly
+ * __oom_kill_process() is used to kill all tasks belonging to
+ * the selected memory cgroup, so we should check that we're not
+ * trying to kill an unkillable task.
*/
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
+ if (is_global_init(victim) || (victim->flags & PF_KTHREAD) ||
+ victim->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ put_task_struct(victim);
return;
}
- task_unlock(p);
-
- if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
p = find_lock_task_mm(victim);
if (!p) {
@@ -969,6 +920,108 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
#undef K
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+ struct task_struct *p = oc->chosen_task;
+ unsigned int points = oc->chosen_points;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(p);
+ if (task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ wake_oom_reaper(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, p);
+
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ if (process_shares_mm(child, p->mm))
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, oc->totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+
+ __oom_kill_process(victim);
+}
+
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+ get_task_struct(task);
+ __oom_kill_process(task);
+ return 0;
+}
+
+static bool oom_kill_memcg_victim(struct oom_control *oc)
+{
+ if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM)
+ return oc->chosen_memcg;
+
+ /*
+ * If memory.oom_group is set, kill all tasks belonging to the sub-tree
+ * of the chosen memory cgroup, otherwise kill the task with the biggest
+ * memory footprint.
+ */
+ if (mem_cgroup_oom_group(oc->chosen_memcg)) {
+ mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member,
+ NULL);
+ /* We have one or more terminating processes at this point. */
+ oc->chosen_task = INFLIGHT_VICTIM;
+ } else {
+ oc->chosen_points = 0;
+ oc->chosen_task = NULL;
+ mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc);
+
+ if (oc->chosen_task == NULL ||
+ oc->chosen_task == INFLIGHT_VICTIM)
+ goto out;
+
+ __oom_kill_process(oc->chosen_task);
+ }
+
+out:
+ mem_cgroup_put(oc->chosen_memcg);
+ return oc->chosen_task;
+}
+
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
@@ -1021,6 +1074,7 @@ bool out_of_memory(struct oom_control *oc)
{
unsigned long freed = 0;
enum oom_constraint constraint = CONSTRAINT_NONE;
+ bool delay = false; /* if set, delay next allocation attempt */
if (oom_killer_disabled)
return false;
@@ -1065,27 +1119,37 @@ bool out_of_memory(struct oom_control *oc)
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oc->chosen = current;
+ oc->chosen_task = current;
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
return true;
}
+ if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
+ delay = true;
+ goto out;
+ }
+
select_bad_process(oc);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+ if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (oc->chosen && oc->chosen != (void *)-1UL) {
+ if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
"Memory cgroup out of memory");
- /*
- * Give the killed process a good chance to exit before trying
- * to allocate memory again.
- */
- schedule_timeout_killable(1);
+ delay = true;
}
- return !!oc->chosen;
+
+out:
+ /*
+ * Give the killed process a good chance to exit before trying
+ * to allocate memory again.
+ */
+ if (delay)
+ schedule_timeout_killable(1);
+
+ return !!oc->chosen_task;
}
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ea018263210..b4390db64da3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,17 +205,18 @@ static void __free_pages_ok(struct page *page, unsigned int order);
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
- 256,
+ [ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
- 256,
+ [ZONE_DMA32] = 256,
#endif
+ [ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
- 32,
+ [ZONE_HIGHMEM] = 0,
#endif
- 32,
+ [ZONE_MOVABLE] = 0,
};
EXPORT_SYMBOL(totalram_pages);
@@ -265,17 +266,19 @@ int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_scale_factor = 10;
-static unsigned long __meminitdata nr_kernel_pages;
-static unsigned long __meminitdata nr_all_pages;
-static unsigned long __meminitdata dma_reserve;
+static unsigned long nr_kernel_pages __meminitdata;
+static unsigned long nr_all_pages __meminitdata;
+static unsigned long dma_reserve __meminitdata;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-static unsigned long __initdata required_kernelcore;
-static unsigned long __initdata required_movablecore;
-static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-static bool mirrored_kernelcore;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long required_kernelcore __initdata;
+static unsigned long required_kernelcore_percent __initdata;
+static unsigned long required_movablecore __initdata;
+static unsigned long required_movablecore_percent __initdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -292,40 +295,6 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
-/*
- * Determine how many pages need to be initialized during early boot
- * (non-deferred initialization).
- * The value of first_deferred_pfn will be set later, once non-deferred pages
- * are initialized, but for now set it ULONG_MAX.
- */
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
- phys_addr_t start_addr, end_addr;
- unsigned long max_pgcnt;
- unsigned long reserved;
-
- /*
- * Initialise at least 2G of a node but also take into account that
- * two large system hashes that can take up 1GB for 0.25TB/node.
- */
- max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
-
- /*
- * Compensate the all the memblock reservations (e.g. crash kernel)
- * from the initial estimation to make sure we will initialize enough
- * memory to boot.
- */
- start_addr = PFN_PHYS(pgdat->node_start_pfn);
- end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
- reserved = memblock_reserved_memory_within(start_addr, end_addr);
- max_pgcnt += PHYS_PFN(reserved);
-
- pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
- pgdat->first_deferred_pfn = ULONG_MAX;
-}
-
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
@@ -361,10 +330,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
return true;
}
#else
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-}
-
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
@@ -1099,6 +1064,15 @@ static bool bulkfree_pcp_prepare(struct page *page)
}
#endif /* CONFIG_DEBUG_VM */
+static inline void prefetch_buddy(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
+ struct page *buddy = page + (buddy_pfn - pfn);
+
+ prefetch(buddy);
+}
+
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
@@ -1115,13 +1089,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
+ int prefetch_nr = 0;
bool isolated_pageblocks;
-
- spin_lock(&zone->lock);
- isolated_pageblocks = has_isolate_pageblock(zone);
+ struct page *page, *tmp;
+ LIST_HEAD(head);
while (count) {
- struct page *page;
struct list_head *list;
/*
@@ -1143,26 +1116,48 @@ static void free_pcppages_bulk(struct zone *zone, int count,
batch_free = count;
do {
- int mt; /* migratetype of the to-be-freed page */
-
page = list_last_entry(list, struct page, lru);
- /* must delete as __free_one_page list manipulates */
+ /* must delete to avoid corrupting pcp list */
list_del(&page->lru);
-
- mt = get_pcppage_migratetype(page);
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
+ pcp->count--;
if (bulkfree_pcp_prepare(page))
continue;
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
- trace_mm_page_pcpu_drain(page, 0, mt);
+ list_add_tail(&page->lru, &head);
+
+ /*
+ * We are going to put the page back to the global
+ * pool, prefetch its buddy to speed up later access
+ * under zone->lock. It is believed the overhead of
+ * an additional test and calculating buddy_pfn here
+ * can be offset by reduced memory latency later. To
+ * avoid excessive prefetching due to large count, only
+ * prefetch buddy for the first pcp->batch nr of pages.
+ */
+ if (prefetch_nr++ < pcp->batch)
+ prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}
+
+ spin_lock(&zone->lock);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+
+ /*
+ * Use safe version since after __free_one_page(),
+ * page->lru.next will not point to original list.
+ */
+ list_for_each_entry_safe(page, tmp, &head, lru) {
+ int mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
+
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ }
spin_unlock(&zone->lock);
}
@@ -1181,10 +1176,9 @@ static void free_one_page(struct zone *zone,
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid, bool zero)
+ unsigned long zone, int nid)
{
- if (zero)
- mm_zero_struct_page(page);
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1198,12 +1192,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
#endif
}
-static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
- int nid, bool zero)
-{
- return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
-}
-
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
@@ -1222,7 +1210,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
break;
}
- __init_single_pfn(pfn, zid, nid, true);
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
@@ -1506,7 +1494,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
- cond_resched();
+ touch_nmi_watchdog();
} else {
nr_free++;
}
@@ -1535,11 +1523,11 @@ static unsigned long __init deferred_init_pages(int nid, int zid,
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- cond_resched();
+ touch_nmi_watchdog();
} else {
page++;
}
- __init_single_page(page, pfn, zid, nid, true);
+ __init_single_page(page, pfn, zid, nid);
nr_pages++;
}
return (nr_pages);
@@ -1552,23 +1540,25 @@ static int __init deferred_init_memmap(void *data)
int nid = pgdat->node_id;
unsigned long start = jiffies;
unsigned long nr_pages = 0;
- unsigned long spfn, epfn;
+ unsigned long spfn, epfn, first_init_pfn, flags;
phys_addr_t spa, epa;
int zid;
struct zone *zone;
- unsigned long first_init_pfn = pgdat->first_deferred_pfn;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
u64 i;
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
+
+ pgdat_resize_lock(pgdat, &flags);
+ first_init_pfn = pgdat->first_deferred_pfn;
if (first_init_pfn == ULONG_MAX) {
+ pgdat_resize_unlock(pgdat, &flags);
pgdat_init_report_one_done();
return 0;
}
- /* Bind memory initialisation thread to a local node if possible */
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(current, cpumask);
-
/* Sanity check boundaries */
BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
@@ -1598,6 +1588,7 @@ static int __init deferred_init_memmap(void *data)
epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
deferred_free_pages(nid, zid, spfn, epfn);
}
+ pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
@@ -1608,6 +1599,117 @@ static int __init deferred_init_memmap(void *data)
pgdat_init_report_one_done();
return 0;
}
+
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ int zid = zone_idx(zone);
+ int nid = zone_to_nid(zone);
+ pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+ unsigned long nr_pages = 0;
+ unsigned long first_init_pfn, spfn, epfn, t, flags;
+ unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+ phys_addr_t spa, epa;
+ u64 i;
+
+ /* Only the last zone may have deferred pages */
+ if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+ return false;
+
+ pgdat_resize_lock(pgdat, &flags);
+
+ /*
+ * If deferred pages have been initialized while we were waiting for
+ * the lock, return true, as the zone was grown. The caller will retry
+ * this zone. We won't return to this function since the caller also
+ * has this static branch.
+ */
+ if (!static_branch_unlikely(&deferred_pages)) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return true;
+ }
+
+ /*
+ * If someone grew this zone while we were waiting for spinlock, return
+ * true, as there might be enough pages already.
+ */
+ if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return true;
+ }
+
+ first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
+
+ if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return false;
+ }
+
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+
+ while (spfn < epfn && nr_pages < nr_pages_needed) {
+ t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
+ first_deferred_pfn = min(t, epfn);
+ nr_pages += deferred_init_pages(nid, zid, spfn,
+ first_deferred_pfn);
+ spfn = first_deferred_pfn;
+ }
+
+ if (nr_pages >= nr_pages_needed)
+ break;
+ }
+
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
+ deferred_free_pages(nid, zid, spfn, epfn);
+
+ if (first_deferred_pfn == epfn)
+ break;
+ }
+ pgdat->first_deferred_pfn = first_deferred_pfn;
+ pgdat_resize_unlock(pgdat, &flags);
+
+ return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ return deferred_grow_zone(zone, order);
+}
+
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
@@ -1626,6 +1728,12 @@ void __init page_alloc_init_late(void)
/* Block until all are initialised */
wait_for_completion(&pgdat_init_all_done_comp);
+ /*
+ * We initialized the rest of the deferred pages. Permanently disable
+ * on-demand struct page initialization.
+ */
+ static_branch_disable(&deferred_pages);
+
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
@@ -1639,16 +1747,38 @@ void __init page_alloc_init_late(void)
}
#ifdef CONFIG_CMA
+static void __init adjust_present_page_count(struct page *page, long count)
+{
+ struct zone *zone = page_zone(page);
+
+ /* We don't need to hold a lock since it is boot-up process */
+ zone->present_pages += count;
+}
+
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
+ unsigned long pfn = page_to_pfn(page);
struct page *p = page;
+ int nid = page_to_nid(page);
+
+ /*
+ * ZONE_MOVABLE will steal present pages from other zones by
+ * changing page links so page_zone() is changed. Before that,
+ * we need to adjust previous zone's page count first.
+ */
+ adjust_present_page_count(page, -pageblock_nr_pages);
do {
__ClearPageReserved(p);
set_page_count(p, 0);
- } while (++p, --i);
+
+ /* Steal pages from other zones */
+ set_page_links(p, ZONE_MOVABLE, nid, pfn);
+ } while (++p, ++pfn, --i);
+
+ adjust_present_page_count(page, pageblock_nr_pages);
set_pageblock_migratetype(page, MIGRATE_CMA);
@@ -2418,10 +2548,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
local_irq_save(flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
- if (to_drain > 0) {
+ if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
- pcp->count -= to_drain;
- }
local_irq_restore(flags);
}
#endif
@@ -2443,10 +2571,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- if (pcp->count) {
+ if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
- pcp->count = 0;
- }
local_irq_restore(flags);
}
@@ -2670,7 +2796,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
- pcp->count -= batch;
}
}
@@ -2768,7 +2893,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* exists.
*/
watermark = min_wmark_pages(zone) + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3044,12 +3169,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
@@ -3076,10 +3195,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
#ifdef CONFIG_CMA
- if ((alloc_flags & ALLOC_CMA) &&
- !list_empty(&area->free_list[MIGRATE_CMA])) {
+ if (!list_empty(&area->free_list[MIGRATE_CMA]))
return true;
- }
#endif
if (alloc_harder &&
!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
@@ -3099,13 +3216,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, unsigned int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
- long cma_pages = 0;
-
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
/*
* Fast check for order-0 only. If this fails then the reserves
@@ -3114,7 +3224,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* the caller is !atomic then it'll uselessly search the free
* list. That corner case is then slower but it is harmless.
*/
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx])
return true;
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
@@ -3205,6 +3315,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
ac_classzone_idx(ac), alloc_flags)) {
int ret;
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /*
+ * Watermark failed for this zone, but see if we can
+ * grow this zone if it contains deferred pages.
+ */
+ if (static_branch_unlikely(&deferred_pages)) {
+ if (_deferred_grow_zone(zone, order))
+ goto try_this_zone;
+ }
+#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
@@ -3246,6 +3366,14 @@ try_this_zone:
reserve_highatomic_pageblock(page, zone, order);
return page;
+ } else {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /* Try again if zone has deferred pages */
+ if (static_branch_unlikely(&deferred_pages)) {
+ if (_deferred_grow_zone(zone, order))
+ goto try_this_zone;
+ }
+#endif
}
}
@@ -3685,16 +3813,18 @@ retry:
return page;
}
-static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
+static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+ const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
pg_data_t *last_pgdat = NULL;
+ enum zone_type high_zoneidx = ac->high_zoneidx;
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->high_zoneidx, ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+ ac->nodemask) {
if (last_pgdat != zone->zone_pgdat)
- wakeup_kswapd(zone, order, ac->high_zoneidx);
+ wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
last_pgdat = zone->zone_pgdat;
}
}
@@ -3730,10 +3860,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
-#ifdef CONFIG_CMA
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
return alloc_flags;
}
@@ -3973,7 +4099,7 @@ retry_cpuset:
goto nopage;
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
- wake_all_kswapds(order, ac);
+ wake_all_kswapds(order, gfp_mask, ac);
/*
* The adjusted alloc_flags might result in immediate success, so try
@@ -4031,7 +4157,7 @@ retry_cpuset:
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
- wake_all_kswapds(order, ac);
+ wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
@@ -4200,9 +4326,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
- *alloc_flags |= ALLOC_CMA;
-
return true;
}
@@ -4612,6 +4735,13 @@ long si_mem_available(void)
min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
wmark_low);
+ /*
+ * Part of the kernel memory, which can be released under memory
+ * pressure.
+ */
+ available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
+ PAGE_SHIFT;
+
if (available < 0)
available = 0;
return available;
@@ -5334,6 +5464,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
unsigned long nr_initialised = 0;
+ struct page *page;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
struct memblock_region *r = NULL, *tmp;
#endif
@@ -5386,6 +5517,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
#endif
not_early:
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zone, nid);
+ if (context == MEMMAP_HOTPLUG)
+ SetPageReserved(page);
+
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -5402,15 +5538,8 @@ not_early:
* because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
- struct page *page = pfn_to_page(pfn);
-
- __init_single_page(page, pfn, zone, nid,
- context != MEMMAP_HOTPLUG);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
- } else {
- __init_single_pfn(pfn, zone, nid,
- context != MEMMAP_HOTPLUG);
}
}
}
@@ -6079,6 +6208,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
+ unsigned long node_end_pfn = 0;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6106,9 +6236,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long movable_size = 0;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
+ if (zone_end_pfn(zone) > node_end_pfn)
+ node_end_pfn = zone_end_pfn(zone);
+
/*
* Adjust freesize so that it accounts for how much memory
@@ -6157,12 +6291,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone_seqlock_init(zone);
zone_pcp_init(zone);
- if (!size)
+ /*
+ * The size of the CMA area is unknown now so we need to
+ * prepare the memory for the usemap at maximum.
+ */
+ if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE &&
+ pgdat->node_spanned_pages) {
+ movable_size = node_end_pfn - pgdat->node_start_pfn;
+ }
+
+ if (!size && !movable_size)
continue;
set_pageblock_order();
- setup_usemap(pgdat, zone, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
+ if (movable_size) {
+ zone->zone_start_pfn = pgdat->node_start_pfn;
+ zone->spanned_pages = movable_size;
+ setup_usemap(pgdat, zone,
+ pgdat->node_start_pfn, movable_size);
+ init_currently_empty_zone(zone,
+ pgdat->node_start_pfn, movable_size);
+ } else {
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
+ }
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -6241,7 +6393,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
alloc_node_mem_map(pgdat);
- reset_deferred_meminit(pgdat);
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+ pgdat->node_spanned_pages);
+ pgdat->first_deferred_pfn = ULONG_MAX;
+#endif
free_area_init_core(pgdat);
}
@@ -6471,7 +6631,18 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
/*
- * If movablecore=nn[KMG] was specified, calculate what size of
+ * If kernelcore=nn% or movablecore=nn% was specified, calculate the
+ * amount of necessary memory.
+ */
+ if (required_kernelcore_percent)
+ required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
+ 10000UL;
+ if (required_movablecore_percent)
+ required_movablecore = (totalpages * 100 * required_movablecore_percent) /
+ 10000UL;
+
+ /*
+ * If movablecore= was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
@@ -6711,18 +6882,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
zero_resv_unavail();
}
-static int __init cmdline_parse_core(char *p, unsigned long *core)
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+ unsigned long *percent)
{
unsigned long long coremem;
+ char *endptr;
+
if (!p)
return -EINVAL;
- coremem = memparse(p, &p);
- *core = coremem >> PAGE_SHIFT;
+ /* Value may be a percentage of total memory, otherwise bytes */
+ coremem = simple_strtoull(p, &endptr, 0);
+ if (*endptr == '%') {
+ /* Paranoid check for percent values greater than 100 */
+ WARN_ON(coremem > 100);
- /* Paranoid check that UL is enough for the coremem value */
- WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+ *percent = coremem;
+ } else {
+ coremem = memparse(p, &p);
+ /* Paranoid check that UL is enough for the coremem value */
+ WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+ *core = coremem >> PAGE_SHIFT;
+ *percent = 0UL;
+ }
return 0;
}
@@ -6738,7 +6921,8 @@ static int __init cmdline_parse_kernelcore(char *p)
return 0;
}
- return cmdline_parse_core(p, &required_kernelcore);
+ return cmdline_parse_core(p, &required_kernelcore,
+ &required_kernelcore_percent);
}
/*
@@ -6747,7 +6931,8 @@ static int __init cmdline_parse_kernelcore(char *p)
*/
static int __init cmdline_parse_movablecore(char *p)
{
- return cmdline_parse_core(p, &required_movablecore);
+ return cmdline_parse_core(p, &required_movablecore,
+ &required_movablecore_percent);
}
early_param("kernelcore", cmdline_parse_kernelcore);
@@ -6971,13 +7156,15 @@ static void setup_per_zone_lowmem_reserve(void)
struct zone *lower_zone;
idx--;
-
- if (sysctl_lowmem_reserve_ratio[idx] < 1)
- sysctl_lowmem_reserve_ratio[idx] = 1;
-
lower_zone = pgdat->node_zones + idx;
- lower_zone->lowmem_reserve[j] = managed_pages /
- sysctl_lowmem_reserve_ratio[idx];
+
+ if (sysctl_lowmem_reserve_ratio[idx] < 1) {
+ sysctl_lowmem_reserve_ratio[idx] = 0;
+ lower_zone->lowmem_reserve[j] = 0;
+ } else {
+ lower_zone->lowmem_reserve[j] =
+ managed_pages / sysctl_lowmem_reserve_ratio[idx];
+ }
managed_pages += lower_zone->managed_pages;
}
}
@@ -7591,7 +7778,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- NULL, 0, cc->mode, MR_CMA);
+ NULL, 0, cc->mode, MR_CONTIG_RANGE);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
@@ -7611,11 +7798,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* @gfp_mask: GFP mask to use during compaction
*
* The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
- * aligned, however it's the caller's responsibility to guarantee that
- * we are the only thread that changes migrate type of pageblocks the
- * pages fall in.
+ * aligned. The PFN range must belong to a single zone.
*
- * The PFN range must belong to a single zone.
+ * The first thing this routine does is attempt to MIGRATE_ISOLATE all
+ * pageblocks in the range. Once isolated, the pageblocks should not
+ * be modified by others.
*
* Returns zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
@@ -7768,7 +7955,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 0a49374e6931..e412a63b2b74 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -65,11 +65,15 @@ static bool page_idle_clear_pte_refs_one(struct page *page,
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- referenced = ptep_clear_young_notify(vma, addr,
- pvmw.pte);
+ /*
+ * For PTE-mapped THP, one sub page is referenced,
+ * the whole THP is referenced.
+ */
+ if (ptep_clear_young_notify(vma, addr, pvmw.pte))
+ referenced = true;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- referenced = pmdp_clear_young_notify(vma, addr,
- pvmw.pmd);
+ if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
+ referenced = true;
} else {
/* unexpected pmd-mapped page? */
WARN_ON_ONCE(1);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 165ed8117bd1..43e085608846 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -28,6 +28,14 @@ static int set_migratetype_isolate(struct page *page, int migratetype,
spin_lock_irqsave(&zone->lock, flags);
+ /*
+ * We assume the caller intended to SET migrate type to isolate.
+ * If it is already set, then someone else must have raced and
+ * set it before us. Return -EBUSY
+ */
+ if (is_migrate_isolate_page(page))
+ goto out;
+
pfn = page_to_pfn(page);
arg.start_pfn = pfn;
arg.nr_pages = pageblock_nr_pages;
@@ -166,7 +174,15 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* future will not be allocated again.
*
* start_pfn/end_pfn must be aligned to pageblock_order.
- * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
+ * Return 0 on success and -EBUSY if any part of range cannot be isolated.
+ *
+ * There is no high level synchronization mechanism that prevents two threads
+ * from trying to isolate overlapping ranges. If this happens, one thread
+ * will notice pageblocks in the overlapping range already set to isolate.
+ * This happens in set_migratetype_isolate, and set_migratetype_isolate
+ * returns an error. We then clean up by restoring the migration type on
+ * pageblocks we may have modified and return -EBUSY to caller. This
+ * prevents two threads from simultaneously working on overlapping ranges.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, bool skip_hwpoisoned_pages)
@@ -293,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
return pfn < end_pfn ? -EBUSY : 0;
}
-struct page *alloc_migrate_target(struct page *page, unsigned long private,
- int **resultp)
+struct page *alloc_migrate_target(struct page *page, unsigned long private)
{
return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 7172e0a80e13..77d9e791ae8a 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -35,7 +35,7 @@ static depot_stack_handle_t early_handle;
static void init_early_allocated_pages(void);
-static int early_page_owner_param(char *buf)
+static int __init early_page_owner_param(char *buf)
{
if (!buf)
return -EINVAL;
@@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -541,7 +541,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
unsigned long block_end_pfn;
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
index e83fd44867de..aa2b3d34e8ea 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -9,7 +9,7 @@
static bool want_page_poisoning __read_mostly;
-static int early_page_poison_param(char *buf)
+static int __init early_page_poison_param(char *buf)
{
if (!buf)
return -EINVAL;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8d2da5dec1e0..c3084ff2569d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -258,6 +258,9 @@ static int __walk_page_range(unsigned long start, unsigned long end,
/**
* walk_page_range - walk page table with caller specific callbacks
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @walk: mm_walk structure defining the callbacks and the target address space
*
* Recursively walk the page table tree of the process represented by @walk->mm
* within the virtual address range [@start, @end). During walking, we can do
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 7a58460bfd27..063ff60ecd90 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -223,18 +223,7 @@ alloc_buffer:
return 0;
}
-
-static int percpu_stats_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, percpu_stats_show, NULL);
-}
-
-static const struct file_operations percpu_stats_fops = {
- .open = percpu_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(percpu_stats);
static int __init init_percpu_stats_debugfs(void)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index 144c66e688a9..9122787c4947 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1171,6 +1171,7 @@ void page_add_new_anon_rmap(struct page *page,
/**
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index b85919243399..a9b6d536b6f1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1422,9 +1422,12 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
{
struct vm_area_struct pvma;
struct page *page;
+ struct vm_fault vmf;
shmem_pseudo_vma_init(&pvma, info, index);
- page = swapin_readahead(swap, gfp, &pvma, 0);
+ vmf.vma = &pvma;
+ vmf.address = 0;
+ page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
return page;
@@ -2613,241 +2616,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
-/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
- * so reuse a tag which we firmly believe is never set or cleared on shmem.
- */
-#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
-#define LAST_SCAN 4 /* about 150ms max */
-
-static void shmem_tag_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
-
- lru_add_drain();
- start = 0;
- rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- page = radix_tree_deref_slot(slot);
- if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- } else if (page_count(page) - page_mapcount(page) > 1) {
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_set(&mapping->page_tree, iter.index,
- SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
- }
-
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
-}
-
-/*
- * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
- * via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
- * and see whether it has an elevated ref-count. If so, we tag them and wait for
- * them to be dropped.
- * The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
- */
-static int shmem_wait_for_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
- int error, scan;
-
- shmem_tag_pins(mapping);
-
- error = 0;
- for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
- break;
-
- if (!scan)
- lru_add_drain_all();
- else if (schedule_timeout_killable((HZ << scan) / 200))
- scan = LAST_SCAN;
-
- start = 0;
- rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
- start, SHMEM_TAG_PINNED) {
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- page = NULL;
- }
-
- if (page &&
- page_count(page) - page_mapcount(page) != 1) {
- if (scan < LAST_SCAN)
- goto continue_resched;
-
- /*
- * On the last scan, we clean up all those tags
- * we inserted; but make a note that we still
- * found pages pinned.
- */
- error = -EBUSY;
- }
-
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(&mapping->page_tree,
- iter.index, SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
-continue_resched:
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
- }
-
- return error;
-}
-
-static unsigned int *memfd_file_seals_ptr(struct file *file)
-{
- if (file->f_op == &shmem_file_operations)
- return &SHMEM_I(file_inode(file))->seals;
-
-#ifdef CONFIG_HUGETLBFS
- if (file->f_op == &hugetlbfs_file_operations)
- return &HUGETLBFS_I(file_inode(file))->seals;
-#endif
-
- return NULL;
-}
-
-#define F_ALL_SEALS (F_SEAL_SEAL | \
- F_SEAL_SHRINK | \
- F_SEAL_GROW | \
- F_SEAL_WRITE)
-
-static int memfd_add_seals(struct file *file, unsigned int seals)
-{
- struct inode *inode = file_inode(file);
- unsigned int *file_seals;
- int error;
-
- /*
- * SEALING
- * Sealing allows multiple parties to share a shmem-file but restrict
- * access to a specific subset of file operations. Seals can only be
- * added, but never removed. This way, mutually untrusted parties can
- * share common memory regions with a well-defined policy. A malicious
- * peer can thus never perform unwanted operations on a shared object.
- *
- * Seals are only supported on special shmem-files and always affect
- * the whole underlying inode. Once a seal is set, it may prevent some
- * kinds of access to the file. Currently, the following seals are
- * defined:
- * SEAL_SEAL: Prevent further seals from being set on this file
- * SEAL_SHRINK: Prevent the file from shrinking
- * SEAL_GROW: Prevent the file from growing
- * SEAL_WRITE: Prevent write access to the file
- *
- * As we don't require any trust relationship between two parties, we
- * must prevent seals from being removed. Therefore, sealing a file
- * only adds a given set of seals to the file, it never touches
- * existing seals. Furthermore, the "setting seals"-operation can be
- * sealed itself, which basically prevents any further seal from being
- * added.
- *
- * Semantics of sealing are only defined on volatile files. Only
- * anonymous shmem files support sealing. More importantly, seals are
- * never written to disk. Therefore, there's no plan to support it on
- * other file types.
- */
-
- if (!(file->f_mode & FMODE_WRITE))
- return -EPERM;
- if (seals & ~(unsigned int)F_ALL_SEALS)
- return -EINVAL;
-
- inode_lock(inode);
-
- file_seals = memfd_file_seals_ptr(file);
- if (!file_seals) {
- error = -EINVAL;
- goto unlock;
- }
-
- if (*file_seals & F_SEAL_SEAL) {
- error = -EPERM;
- goto unlock;
- }
-
- if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
- error = mapping_deny_writable(file->f_mapping);
- if (error)
- goto unlock;
-
- error = shmem_wait_for_pins(file->f_mapping);
- if (error) {
- mapping_allow_writable(file->f_mapping);
- goto unlock;
- }
- }
-
- *file_seals |= seals;
- error = 0;
-
-unlock:
- inode_unlock(inode);
- return error;
-}
-
-static int memfd_get_seals(struct file *file)
-{
- unsigned int *seals = memfd_file_seals_ptr(file);
-
- return seals ? *seals : -EINVAL;
-}
-
-long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- long error;
-
- switch (cmd) {
- case F_ADD_SEALS:
- /* disallow upper 32bit */
- if (arg > UINT_MAX)
- return -EINVAL;
-
- error = memfd_add_seals(file, arg);
- break;
- case F_GET_SEALS:
- error = memfd_get_seals(file);
- break;
- default:
- error = -EINVAL;
- break;
- }
-
- return error;
-}
-
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -3669,94 +3437,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
shmem_show_mpol(seq, sbinfo->mpol);
return 0;
}
-
-#define MFD_NAME_PREFIX "memfd:"
-#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
-#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
-
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
-{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
-
- if (!(flags & MFD_HUGETLB)) {
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
- return -EINVAL;
- } else {
- /* Allow huge page size encoding in flags. */
- if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
- (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
- return -EINVAL;
- }
-
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
- return -EINVAL;
-
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
- if (!name)
- return -ENOMEM;
-
- strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
- error = -EFAULT;
- goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
- goto err_name;
- }
-
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
-
- if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
-
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
- HUGETLB_ANONHUGE_INODE,
- (flags >> MFD_HUGE_SHIFT) &
- MFD_HUGE_MASK);
- } else
- file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
- }
- file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
- file->f_flags |= O_RDWR | O_LARGEFILE;
-
- if (flags & MFD_ALLOW_SEALING) {
- file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
- }
-
- fd_install(fd, file);
- kfree(name);
- return fd;
-
-err_fd:
- put_unused_fd(fd);
-err_name:
- kfree(name);
- return error;
-}
-
#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)
diff --git a/mm/slab.c b/mm/slab.c
index 9095c3945425..b59f2cdf28d1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1869,7 +1869,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
return 0;
}
-slab_flags_t kmem_cache_flags(unsigned long object_size,
+slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
@@ -1877,7 +1877,7 @@ slab_flags_t kmem_cache_flags(unsigned long object_size,
}
struct kmem_cache *
-__kmem_cache_alias(const char *name, size_t size, size_t align,
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
struct kmem_cache *cachep;
@@ -1994,7 +1994,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
- size_t size = cachep->size;
+ unsigned int size = cachep->size;
#if DEBUG
#if FORCED_DEBUG
@@ -2291,6 +2291,18 @@ out:
return nr_freed;
}
+bool __kmem_cache_empty(struct kmem_cache *s)
+{
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n)
+ if (!list_empty(&n->slabs_full) ||
+ !list_empty(&n->slabs_partial))
+ return false;
+ return true;
+}
+
int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0;
@@ -2675,11 +2687,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
if (n->colour_next >= cachep->colour)
n->colour_next = 0;
- offset = n->colour_next;
- if (offset >= cachep->colour)
- offset = 0;
-
- offset *= cachep->colour_off;
+ offset = n->colour_next * cachep->colour_off;
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
diff --git a/mm/slab.h b/mm/slab.h
index 51813236e773..68bdf498da3b 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -22,8 +22,8 @@ struct kmem_cache {
unsigned int size; /* The aligned/padded/added on size */
unsigned int align; /* Alignment as calculated */
slab_flags_t flags; /* Active flags on the slab */
- size_t useroffset; /* Usercopy region offset */
- size_t usersize; /* Usercopy region size */
+ unsigned int useroffset;/* Usercopy region offset */
+ unsigned int usersize; /* Usercopy region size */
const char *name; /* Slab name for sysfs */
int refcount; /* Use counter */
void (*ctor)(void *); /* Called on object slot creation */
@@ -77,7 +77,7 @@ extern struct kmem_cache *kmem_cache;
/* A table of kmalloc cache names and sizes */
extern const struct kmalloc_info_struct {
const char *name;
- unsigned long size;
+ unsigned int size;
} kmalloc_info[];
#ifndef CONFIG_SLOB
@@ -93,31 +93,31 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t);
/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
-extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
- slab_flags_t flags, size_t useroffset,
- size_t usersize);
+struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size,
+ slab_flags_t flags, unsigned int useroffset,
+ unsigned int usersize);
extern void create_boot_cache(struct kmem_cache *, const char *name,
- size_t size, slab_flags_t flags, size_t useroffset,
- size_t usersize);
+ unsigned int size, slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize);
int slab_unmergeable(struct kmem_cache *s);
-struct kmem_cache *find_mergeable(size_t size, size_t align,
+struct kmem_cache *find_mergeable(unsigned size, unsigned align,
slab_flags_t flags, const char *name, void (*ctor)(void *));
#ifndef CONFIG_SLOB
struct kmem_cache *
-__kmem_cache_alias(const char *name, size_t size, size_t align,
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *));
-slab_flags_t kmem_cache_flags(unsigned long object_size,
+slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *));
#else
static inline struct kmem_cache *
-__kmem_cache_alias(const char *name, size_t size, size_t align,
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{ return NULL; }
-static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
+static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
@@ -166,6 +166,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
SLAB_TEMPORARY | \
SLAB_ACCOUNT)
+bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 10f127b2de7c..98dcdc352062 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -10,6 +10,7 @@
#include <linux/poison.h>
#include <linux/interrupt.h>
#include <linux/memory.h>
+#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/cpu.h>
@@ -81,38 +82,19 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
EXPORT_SYMBOL(kmem_cache_size);
#ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(const char *name, size_t size)
+static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
- struct kmem_cache *s = NULL;
-
if (!name || in_interrupt() || size < sizeof(void *) ||
size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
- list_for_each_entry(s, &slab_caches, list) {
- char tmp;
- int res;
-
- /*
- * This happens when the module gets unloaded and doesn't
- * destroy its slab cache and no-one else reuses the vmalloc
- * area of the module. Print a warning.
- */
- res = probe_kernel_address(s->name, tmp);
- if (res) {
- pr_err("Slab cache with size %d has lost its name\n",
- s->object_size);
- continue;
- }
- }
-
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
return 0;
}
#else
-static inline int kmem_cache_sanity_check(const char *name, size_t size)
+static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
{
return 0;
}
@@ -279,8 +261,8 @@ static inline void memcg_unlink_cache(struct kmem_cache *s)
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
*/
-static unsigned long calculate_alignment(unsigned long flags,
- unsigned long align, unsigned long size)
+static unsigned int calculate_alignment(slab_flags_t flags,
+ unsigned int align, unsigned int size)
{
/*
* If the user wants hardware cache aligned objects then follow that
@@ -290,7 +272,7 @@ static unsigned long calculate_alignment(unsigned long flags,
* alignment though. If that is greater then use it.
*/
if (flags & SLAB_HWCACHE_ALIGN) {
- unsigned long ralign;
+ unsigned int ralign;
ralign = cache_line_size();
while (size <= ralign / 2)
@@ -330,7 +312,7 @@ int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-struct kmem_cache *find_mergeable(size_t size, size_t align,
+struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
slab_flags_t flags, const char *name, void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -378,9 +360,9 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
}
static struct kmem_cache *create_cache(const char *name,
- size_t object_size, size_t size, size_t align,
- slab_flags_t flags, size_t useroffset,
- size_t usersize, void (*ctor)(void *),
+ unsigned int object_size, unsigned int align,
+ slab_flags_t flags, unsigned int useroffset,
+ unsigned int usersize, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
@@ -395,8 +377,7 @@ static struct kmem_cache *create_cache(const char *name,
goto out;
s->name = name;
- s->object_size = object_size;
- s->size = size;
+ s->size = s->object_size = object_size;
s->align = align;
s->ctor = ctor;
s->useroffset = useroffset;
@@ -451,8 +432,10 @@ out_free_cache:
* as davem.
*/
struct kmem_cache *
-kmem_cache_create_usercopy(const char *name, size_t size, size_t align,
- slab_flags_t flags, size_t useroffset, size_t usersize,
+kmem_cache_create_usercopy(const char *name,
+ unsigned int size, unsigned int align,
+ slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *))
{
struct kmem_cache *s = NULL;
@@ -500,7 +483,7 @@ kmem_cache_create_usercopy(const char *name, size_t size, size_t align,
goto out_unlock;
}
- s = create_cache(cache_name, size, size,
+ s = create_cache(cache_name, size,
calculate_alignment(flags, align, size),
flags, useroffset, usersize, ctor, NULL, NULL);
if (IS_ERR(s)) {
@@ -531,7 +514,7 @@ out_unlock:
EXPORT_SYMBOL(kmem_cache_create_usercopy);
struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
+kmem_cache_create(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
@@ -647,7 +630,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
goto out_unlock;
s = create_cache(cache_name, root_cache->object_size,
- root_cache->size, root_cache->align,
+ root_cache->align,
root_cache->flags & CACHE_CREATE_MASK,
root_cache->useroffset, root_cache->usersize,
root_cache->ctor, memcg, root_cache);
@@ -916,8 +899,9 @@ bool slab_is_available(void)
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
-void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
- slab_flags_t flags, size_t useroffset, size_t usersize)
+void __init create_boot_cache(struct kmem_cache *s, const char *name,
+ unsigned int size, slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize)
{
int err;
@@ -932,15 +916,15 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
err = __kmem_cache_create(s, flags);
if (err)
- panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
+ panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
name, size, err);
s->refcount = -1; /* Exempt from merging for now */
}
-struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
- slab_flags_t flags, size_t useroffset,
- size_t usersize)
+struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ unsigned int size, slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
@@ -954,11 +938,11 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
return s;
}
-struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
+struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);
#ifdef CONFIG_ZONE_DMA
-struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
+struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_dma_caches);
#endif
@@ -968,7 +952,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches);
* of two cache sizes there. The size of larger slabs can be determined using
* fls.
*/
-static s8 size_index[24] = {
+static u8 size_index[24] __ro_after_init = {
3, /* 8 */
4, /* 16 */
5, /* 24 */
@@ -995,7 +979,7 @@ static s8 size_index[24] = {
2 /* 192 */
};
-static inline int size_index_elem(size_t bytes)
+static inline unsigned int size_index_elem(unsigned int bytes)
{
return (bytes - 1) / 8;
}
@@ -1006,7 +990,7 @@ static inline int size_index_elem(size_t bytes)
*/
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
- int index;
+ unsigned int index;
if (unlikely(size > KMALLOC_MAX_SIZE)) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
@@ -1064,13 +1048,13 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
*/
void __init setup_kmalloc_cache_index_table(void)
{
- int i;
+ unsigned int i;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
- int elem = size_index_elem(i);
+ unsigned int elem = size_index_elem(i);
if (elem >= ARRAY_SIZE(size_index))
break;
@@ -1137,9 +1121,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
struct kmem_cache *s = kmalloc_caches[i];
if (s) {
- int size = kmalloc_size(i);
+ unsigned int size = kmalloc_size(i);
char *n = kasprintf(GFP_NOWAIT,
- "dma-kmalloc-%d", size);
+ "dma-kmalloc-%u", size);
BUG_ON(!n);
kmalloc_dma_caches[i] = create_kmalloc_cache(n,
@@ -1182,10 +1166,10 @@ EXPORT_SYMBOL(kmalloc_order_trace);
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
static void freelist_randomize(struct rnd_state *state, unsigned int *list,
- size_t count)
+ unsigned int count)
{
- size_t i;
unsigned int rand;
+ unsigned int i;
for (i = 0; i < count; i++)
list[i] = i;
@@ -1532,3 +1516,11 @@ EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+
+int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
+{
+ if (__should_failslab(s, gfpflags))
+ return -ENOMEM;
+ return 0;
+}
+ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
diff --git a/mm/slub.c b/mm/slub.c
index e381728a3751..44aa7847324a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -311,18 +311,18 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
__p += (__s)->size, __idx++)
/* Determine object index from a given position */
-static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
+static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
{
return (p - addr) / s->size;
}
-static inline int order_objects(int order, unsigned long size, int reserved)
+static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved)
{
- return ((PAGE_SIZE << order) - reserved) / size;
+ return (((unsigned int)PAGE_SIZE << order) - reserved) / size;
}
-static inline struct kmem_cache_order_objects oo_make(int order,
- unsigned long size, int reserved)
+static inline struct kmem_cache_order_objects oo_make(unsigned int order,
+ unsigned int size, unsigned int reserved)
{
struct kmem_cache_order_objects x = {
(order << OO_SHIFT) + order_objects(order, size, reserved)
@@ -331,12 +331,12 @@ static inline struct kmem_cache_order_objects oo_make(int order,
return x;
}
-static inline int oo_order(struct kmem_cache_order_objects x)
+static inline unsigned int oo_order(struct kmem_cache_order_objects x)
{
return x.x >> OO_SHIFT;
}
-static inline int oo_objects(struct kmem_cache_order_objects x)
+static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
{
return x.x & OO_MASK;
}
@@ -466,7 +466,7 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
set_bit(slab_index(p, s, addr), map);
}
-static inline int size_from_object(struct kmem_cache *s)
+static inline unsigned int size_from_object(struct kmem_cache *s)
{
if (s->flags & SLAB_RED_ZONE)
return s->size - s->red_left_pad;
@@ -598,13 +598,13 @@ static void init_tracking(struct kmem_cache *s, void *object)
set_track(s, object, TRACK_ALLOC, 0UL);
}
-static void print_track(const char *s, struct track *t)
+static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
if (!t->addr)
return;
pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
- s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+ s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
{
int i;
@@ -619,11 +619,12 @@ static void print_track(const char *s, struct track *t)
static void print_tracking(struct kmem_cache *s, void *object)
{
+ unsigned long pr_time = jiffies;
if (!(s->flags & SLAB_STORE_USER))
return;
- print_track("Allocated", get_track(s, object, TRACK_ALLOC));
- print_track("Freed", get_track(s, object, TRACK_FREE));
+ print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
+ print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
}
static void print_page_info(struct page *page)
@@ -680,7 +681,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
print_section(KERN_ERR, "Object ", p,
- min_t(unsigned long, s->object_size, PAGE_SIZE));
+ min_t(unsigned int, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
@@ -1292,7 +1293,7 @@ out:
__setup("slub_debug", setup_slub_debug);
-slab_flags_t kmem_cache_flags(unsigned long object_size,
+slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
@@ -1325,7 +1326,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
-slab_flags_t kmem_cache_flags(unsigned long object_size,
+slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
@@ -1362,10 +1363,8 @@ static __always_inline void kfree_hook(void *x)
kasan_kfree_large(x, _RET_IP_);
}
-static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
{
- void *freeptr;
-
kmemleak_free_recursive(x, s->flags);
/*
@@ -1385,17 +1384,12 @@ static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x)
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
- freeptr = get_freepointer(s, x);
- /*
- * kasan_slab_free() may put x into memory quarantine, delaying its
- * reuse. In this case the object's freelist pointer is changed.
- */
- kasan_slab_free(s, x, _RET_IP_);
- return freeptr;
+ /* KASAN might put x into memory quarantine, delaying its reuse */
+ return kasan_slab_free(s, x, _RET_IP_);
}
-static inline void slab_free_freelist_hook(struct kmem_cache *s,
- void *head, void *tail)
+static inline bool slab_free_freelist_hook(struct kmem_cache *s,
+ void **head, void **tail)
{
/*
* Compiler cannot detect this function can be removed if slab_free_hook()
@@ -1406,13 +1400,33 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
defined(CONFIG_DEBUG_OBJECTS_FREE) || \
defined(CONFIG_KASAN)
- void *object = head;
- void *tail_obj = tail ? : head;
- void *freeptr;
+ void *object;
+ void *next = *head;
+ void *old_tail = *tail ? *tail : *head;
+
+ /* Head and tail of the reconstructed freelist */
+ *head = NULL;
+ *tail = NULL;
do {
- freeptr = slab_free_hook(s, object);
- } while ((object != tail_obj) && (object = freeptr));
+ object = next;
+ next = get_freepointer(s, object);
+ /* If object's reuse doesn't have to be delayed */
+ if (!slab_free_hook(s, object)) {
+ /* Move object to the new freelist */
+ set_freepointer(s, object, *head);
+ *head = object;
+ if (!*tail)
+ *tail = object;
+ }
+ } while (object != old_tail);
+
+ if (*head == *tail)
+ *tail = NULL;
+
+ return *head != NULL;
+#else
+ return true;
#endif
}
@@ -1435,7 +1449,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
struct page *page;
- int order = oo_order(oo);
+ unsigned int order = oo_order(oo);
if (node == NUMA_NO_NODE)
page = alloc_pages(flags, order);
@@ -1454,8 +1468,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
/* Pre-initialize the random sequence cache */
static int init_cache_random_seq(struct kmem_cache *s)
{
+ unsigned int count = oo_objects(s->oo);
int err;
- unsigned long i, count = oo_objects(s->oo);
/* Bailout if already initialised */
if (s->random_seq)
@@ -1470,6 +1484,8 @@ static int init_cache_random_seq(struct kmem_cache *s)
/* Transform to an offset on the set of pages */
if (s->random_seq) {
+ unsigned int i;
+
for (i = 0; i < count; i++)
s->random_seq[i] *= s->size;
}
@@ -1811,7 +1827,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
{
struct page *page, *page2;
void *object = NULL;
- int available = 0;
+ unsigned int available = 0;
int objects;
/*
@@ -2398,7 +2414,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
nid, gfpflags, &gfpflags);
- pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
+ pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
s->name, s->object_size, s->size, oo_order(s->oo),
oo_order(s->min));
@@ -2965,14 +2981,12 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
void *head, void *tail, int cnt,
unsigned long addr)
{
- slab_free_freelist_hook(s, head, tail);
/*
- * slab_free_freelist_hook() could have put the items into quarantine.
- * If so, no need to free them.
+ * With KASAN enabled slab_free_freelist_hook modifies the freelist
+ * to remove objects, whose reuse must be delayed.
*/
- if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU))
- return;
- do_slab_free(s, page, head, tail, cnt, addr);
+ if (slab_free_freelist_hook(s, &head, &tail))
+ do_slab_free(s, page, head, tail, cnt, addr);
}
#ifdef CONFIG_KASAN
@@ -3181,9 +3195,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
* and increases the number of allocations possible without having to
* take the list_lock.
*/
-static int slub_min_order;
-static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
-static int slub_min_objects;
+static unsigned int slub_min_order;
+static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static unsigned int slub_min_objects;
/*
* Calculate the order of allocation given an slab object size.
@@ -3210,20 +3224,21 @@ static int slub_min_objects;
* requested a higher mininum order then we start with that one instead of
* the smallest order which will fit the object.
*/
-static inline int slab_order(int size, int min_objects,
- int max_order, int fract_leftover, int reserved)
+static inline unsigned int slab_order(unsigned int size,
+ unsigned int min_objects, unsigned int max_order,
+ unsigned int fract_leftover, unsigned int reserved)
{
- int order;
- int rem;
- int min_order = slub_min_order;
+ unsigned int min_order = slub_min_order;
+ unsigned int order;
if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
return get_order(size * MAX_OBJS_PER_PAGE) - 1;
- for (order = max(min_order, get_order(min_objects * size + reserved));
+ for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
order <= max_order; order++) {
- unsigned long slab_size = PAGE_SIZE << order;
+ unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
+ unsigned int rem;
rem = (slab_size - reserved) % size;
@@ -3234,12 +3249,11 @@ static inline int slab_order(int size, int min_objects,
return order;
}
-static inline int calculate_order(int size, int reserved)
+static inline int calculate_order(unsigned int size, unsigned int reserved)
{
- int order;
- int min_objects;
- int fraction;
- int max_objects;
+ unsigned int order;
+ unsigned int min_objects;
+ unsigned int max_objects;
/*
* Attempt to find best configuration for a slab. This
@@ -3256,6 +3270,8 @@ static inline int calculate_order(int size, int reserved)
min_objects = min(min_objects, max_objects);
while (min_objects > 1) {
+ unsigned int fraction;
+
fraction = 16;
while (fraction >= 4) {
order = slab_order(size, min_objects,
@@ -3457,8 +3473,8 @@ static void set_cpu_partial(struct kmem_cache *s)
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
- size_t size = s->object_size;
- int order;
+ unsigned int size = s->object_size;
+ unsigned int order;
/*
* Round up object size to the next word boundary. We can only
@@ -3548,7 +3564,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
else
order = calculate_order(size, s->reserved);
- if (order < 0)
+ if ((int)order < 0)
return 0;
s->allocflags = 0;
@@ -3632,8 +3648,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
- panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)s->size, s->size,
+ panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n",
+ s->name, s->size, s->size,
oo_order(s->oo), s->offset, (unsigned long)flags);
return -EINVAL;
}
@@ -3691,6 +3707,17 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
discard_slab(s, page);
}
+bool __kmem_cache_empty(struct kmem_cache *s)
+{
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n)
+ if (n->nr_partial || slabs_node(s, node))
+ return false;
+ return true;
+}
+
/*
* Release all resources used by a slab cache.
*/
@@ -3716,7 +3743,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
static int __init setup_slub_min_order(char *str)
{
- get_option(&str, &slub_min_order);
+ get_option(&str, (int *)&slub_min_order);
return 1;
}
@@ -3725,8 +3752,8 @@ __setup("slub_min_order=", setup_slub_min_order);
static int __init setup_slub_max_order(char *str)
{
- get_option(&str, &slub_max_order);
- slub_max_order = min(slub_max_order, MAX_ORDER - 1);
+ get_option(&str, (int *)&slub_max_order);
+ slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
return 1;
}
@@ -3735,7 +3762,7 @@ __setup("slub_max_order=", setup_slub_max_order);
static int __init setup_slub_min_objects(char *str)
{
- get_option(&str, &slub_min_objects);
+ get_option(&str, (int *)&slub_min_objects);
return 1;
}
@@ -3824,7 +3851,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
bool to_user)
{
struct kmem_cache *s;
- unsigned long offset;
+ unsigned int offset;
size_t object_size;
/* Find object and usable object size. */
@@ -4230,7 +4257,7 @@ void __init kmem_cache_init(void)
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
slub_cpu_dead);
- pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n",
+ pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
@@ -4241,7 +4268,7 @@ void __init kmem_cache_init_late(void)
}
struct kmem_cache *
-__kmem_cache_alias(const char *name, size_t size, size_t align,
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
struct kmem_cache *s, *c;
@@ -4254,13 +4281,12 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
*/
- s->object_size = max(s->object_size, (int)size);
- s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ s->object_size = max(s->object_size, size);
+ s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
for_each_memcg_cache(c, s) {
c->object_size = s->object_size;
- c->inuse = max_t(int, c->inuse,
- ALIGN(size, sizeof(void *)));
+ c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
}
if (sysfs_slab_alias(s, name)) {
@@ -4889,35 +4915,35 @@ struct slab_attribute {
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->size);
+ return sprintf(buf, "%u\n", s->size);
}
SLAB_ATTR_RO(slab_size);
static ssize_t align_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->align);
+ return sprintf(buf, "%u\n", s->align);
}
SLAB_ATTR_RO(align);
static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->object_size);
+ return sprintf(buf, "%u\n", s->object_size);
}
SLAB_ATTR_RO(object_size);
static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", oo_objects(s->oo));
+ return sprintf(buf, "%u\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);
static ssize_t order_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- unsigned long order;
+ unsigned int order;
int err;
- err = kstrtoul(buf, 10, &order);
+ err = kstrtouint(buf, 10, &order);
if (err)
return err;
@@ -4930,7 +4956,7 @@ static ssize_t order_store(struct kmem_cache *s,
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", oo_order(s->oo));
+ return sprintf(buf, "%u\n", oo_order(s->oo));
}
SLAB_ATTR(order);
@@ -4962,10 +4988,10 @@ static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
size_t length)
{
- unsigned long objects;
+ unsigned int objects;
int err;
- err = kstrtoul(buf, 10, &objects);
+ err = kstrtouint(buf, 10, &objects);
if (err)
return err;
if (objects && !kmem_cache_has_cpu_partial(s))
@@ -5081,7 +5107,7 @@ SLAB_ATTR_RO(cache_dma);
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%zu\n", s->usersize);
+ return sprintf(buf, "%u\n", s->usersize);
}
SLAB_ATTR_RO(usersize);
@@ -5093,7 +5119,7 @@ SLAB_ATTR_RO(destroy_by_rcu);
static ssize_t reserved_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->reserved);
+ return sprintf(buf, "%u\n", s->reserved);
}
SLAB_ATTR_RO(reserved);
@@ -5288,21 +5314,22 @@ SLAB_ATTR(shrink);
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
+ return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10);
}
static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- unsigned long ratio;
+ unsigned int ratio;
int err;
- err = kstrtoul(buf, 10, &ratio);
+ err = kstrtouint(buf, 10, &ratio);
if (err)
return err;
+ if (ratio > 100)
+ return -ERANGE;
- if (ratio <= 100)
- s->remote_node_defrag_ratio = ratio * 10;
+ s->remote_node_defrag_ratio = ratio * 10;
return length;
}
@@ -5663,7 +5690,7 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'A';
if (p != name + 1)
*p++ = '-';
- p += sprintf(p, "%07d", s->size);
+ p += sprintf(p, "%07u", s->size);
BUG_ON(p > name + ID_STR_LENGTH - 1);
return name;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bd0276d5f66b..640e68f8324b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -303,7 +303,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
ms = __nr_to_section(pnum);
pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
__func__);
- ms->section_mem_map = 0;
}
if (vmemmap_buf_start) {
diff --git a/mm/sparse.c b/mm/sparse.c
index 58cab483e81b..4a58f8809542 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -202,6 +202,12 @@ static inline int next_present_section_nr(int section_nr)
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
+/*
+ * Record how many memory sections are marked as present
+ * during system bootup.
+ */
+static int __initdata nr_present_sections;
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -231,6 +237,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_IS_ONLINE;
section_mark_present(ms);
+ nr_present_sections++;
}
}
}
@@ -446,7 +453,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
ms = __nr_to_section(pnum);
pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
__func__);
- ms->section_mem_map = 0;
}
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -474,7 +480,6 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
__func__);
- ms->section_mem_map = 0;
return NULL;
}
#endif
@@ -486,10 +491,12 @@ void __weak __meminit vmemmap_populate_print_last(void)
/**
* alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap
* @map: usemap_map for pageblock flags or mmap_map for vmemmap
+ * @unit_size: size of map unit
*/
static void __init alloc_usemap_and_memmap(void (*alloc_func)
(void *, unsigned long, unsigned long,
- unsigned long, int), void *data)
+ unsigned long, int), void *data,
+ int data_unit_size)
{
unsigned long pnum;
unsigned long map_count;
@@ -566,7 +573,8 @@ void __init sparse_init(void)
if (!usemap_map)
panic("can not allocate usemap_map\n");
alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
- (void *)usemap_map);
+ (void *)usemap_map,
+ sizeof(usemap_map[0]));
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
@@ -574,21 +582,28 @@ void __init sparse_init(void)
if (!map_map)
panic("can not allocate map_map\n");
alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
- (void *)map_map);
+ (void *)map_map,
+ sizeof(map_map[0]));
#endif
for_each_present_section_nr(0, pnum) {
+ struct mem_section *ms;
+ ms = __nr_to_section(pnum);
usemap = usemap_map[pnum];
- if (!usemap)
+ if (!usemap) {
+ ms->section_mem_map = 0;
continue;
+ }
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
map = map_map[pnum];
#else
map = sparse_early_mem_map_alloc(pnum);
#endif
- if (!map)
+ if (!map) {
+ ms->section_mem_map = 0;
continue;
+ }
sparse_init_one_section(__nr_to_section(pnum), pnum, map,
usemap);
@@ -779,7 +794,13 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
goto out;
}
- memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Poison uninitialized struct pages in order to catch invalid flags
+ * combinations.
+ */
+ memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION);
+#endif
section_mark_present(ms);
diff --git a/mm/swap.c b/mm/swap.c
index eed846cfc8b8..26fc9b5f1b6c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -708,7 +708,6 @@ void lru_add_drain_all(void)
* release_pages - batched put_page()
* @pages: array of pages to release
* @nr: number of pages
- * @cold: whether the pages are cache cold
*
* Decrement the reference count on all the pages in @pages. If it
* fell to zero, remove the page from the LRU and free it.
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index bebc19292018..f2641894f440 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -34,8 +34,6 @@
#include <linux/mutex.h>
#include <linux/mm.h>
-#ifdef CONFIG_SWAP
-
static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
static bool swap_slot_cache_active;
bool swap_slot_cache_enabled;
@@ -356,5 +354,3 @@ repeat:
return entry;
}
-
-#endif /* CONFIG_SWAP */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 39ae7cfad90f..486f7c26386e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -38,7 +38,7 @@ static const struct address_space_operations swap_aops = {
struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
-bool swap_vma_readahead __read_mostly = true;
+static bool enable_vma_readahead __read_mostly = true;
#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
@@ -322,6 +322,11 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
release_pages(pagep, nr);
}
+static inline bool swap_use_vma_readahead(void)
+{
+ return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
+}
+
/*
* Lookup a swap entry in the swap cache. A found page will be returned
* unlocked and with its refcount incremented - we rely on the kernel
@@ -332,32 +337,48 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *page;
- unsigned long ra_info;
- int win, hits, readahead;
+ struct swap_info_struct *si;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ put_swap_device(si);
INC_CACHE_INFO(find_total);
if (page) {
+ bool vma_ra = swap_use_vma_readahead();
+ bool readahead;
+
INC_CACHE_INFO(find_success);
+ /*
+ * At the moment, we don't support PG_readahead for anon THP
+ * so let's bail out rather than confusing the readahead stat.
+ */
if (unlikely(PageTransCompound(page)))
return page;
+
readahead = TestClearPageReadahead(page);
- if (vma) {
- ra_info = GET_SWAP_RA_VAL(vma);
- win = SWAP_RA_WIN(ra_info);
- hits = SWAP_RA_HITS(ra_info);
+ if (vma && vma_ra) {
+ unsigned long ra_val;
+ int win, hits;
+
+ ra_val = GET_SWAP_RA_VAL(vma);
+ win = SWAP_RA_WIN(ra_val);
+ hits = SWAP_RA_HITS(ra_val);
if (readahead)
hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(addr, win, hits));
}
+
if (readahead) {
count_vm_event(SWAP_RA_HIT);
- if (!vma)
+ if (!vma || !vma_ra)
atomic_inc(&swapin_readahead_hits);
}
}
+
return page;
}
@@ -365,8 +386,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
+ struct page *found_page = NULL, *new_page = NULL;
+ struct swap_info_struct *si;
int err;
*new_page_allocated = false;
@@ -376,7 +397,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swapper_space, swp_offset(entry));
+ si = get_swap_device(entry);
+ if (!si)
+ break;
+ found_page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
+ put_swap_device(si);
if (found_page)
break;
@@ -533,11 +559,10 @@ static unsigned long swapin_nr_pages(unsigned long offset)
}
/**
- * swapin_readahead - swap in pages in hope we need them soon
+ * swap_cluster_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
* @gfp_mask: memory allocation flags
- * @vma: user vma this address belongs to
- * @addr: target address for mempolicy
+ * @vmf: fault information
*
* Returns the struct page for entry and addr, after queueing swapin.
*
@@ -549,10 +574,10 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
*
- * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
*/
-struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr)
+struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
{
struct page *page;
unsigned long entry_offset = swp_offset(entry);
@@ -562,6 +587,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
bool do_poll = true, page_allocated;
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
@@ -586,8 +613,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
continue;
if (page_allocated) {
swap_readpage(page, false);
- if (offset != entry_offset &&
- likely(!PageTransCompound(page))) {
+ if (offset != entry_offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
}
@@ -649,16 +675,15 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}
-struct page *swap_readahead_detect(struct vm_fault *vmf,
- struct vma_swap_readahead *swap_ra)
+static void swap_ra_info(struct vm_fault *vmf,
+ struct vma_swap_readahead *ra_info)
{
struct vm_area_struct *vma = vmf->vma;
- unsigned long swap_ra_info;
- struct page *page;
+ unsigned long ra_val;
swp_entry_t entry;
unsigned long faddr, pfn, fpfn;
unsigned long start, end;
- pte_t *pte;
+ pte_t *pte, *orig_pte;
unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
pte_t *tpte;
@@ -667,30 +692,32 @@ struct page *swap_readahead_detect(struct vm_fault *vmf,
max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
SWAP_RA_ORDER_CEILING);
if (max_win == 1) {
- swap_ra->win = 1;
- return NULL;
+ ra_info->win = 1;
+ return;
}
faddr = vmf->address;
- entry = pte_to_swp_entry(vmf->orig_pte);
- if ((unlikely(non_swap_entry(entry))))
- return NULL;
- page = lookup_swap_cache(entry, vma, faddr);
- if (page)
- return page;
+ orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
+ entry = pte_to_swp_entry(*pte);
+ if ((unlikely(non_swap_entry(entry)))) {
+ pte_unmap(orig_pte);
+ return;
+ }
fpfn = PFN_DOWN(faddr);
- swap_ra_info = GET_SWAP_RA_VAL(vma);
- pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
- prev_win = SWAP_RA_WIN(swap_ra_info);
- hits = SWAP_RA_HITS(swap_ra_info);
- swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
+ ra_val = GET_SWAP_RA_VAL(vma);
+ pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
+ prev_win = SWAP_RA_WIN(ra_val);
+ hits = SWAP_RA_HITS(ra_val);
+ ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
max_win, prev_win);
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
- if (win == 1)
- return NULL;
+ if (win == 1) {
+ pte_unmap(orig_pte);
+ return;
+ }
/* Copy the PTEs because the page table may be unmapped */
if (fpfn == pfn + 1)
@@ -703,23 +730,21 @@ struct page *swap_readahead_detect(struct vm_fault *vmf,
swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
&start, &end);
}
- swap_ra->nr_pte = end - start;
- swap_ra->offset = fpfn - start;
- pte = vmf->pte - swap_ra->offset;
+ ra_info->nr_pte = end - start;
+ ra_info->offset = fpfn - start;
+ pte -= ra_info->offset;
#ifdef CONFIG_64BIT
- swap_ra->ptes = pte;
+ ra_info->ptes = pte;
#else
- tpte = swap_ra->ptes;
+ tpte = ra_info->ptes;
for (pfn = start; pfn != end; pfn++)
*tpte++ = *pte++;
#endif
-
- return NULL;
+ pte_unmap(orig_pte);
}
-struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
- struct vm_fault *vmf,
- struct vma_swap_readahead *swap_ra)
+static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
{
struct blk_plug plug;
struct vm_area_struct *vma = vmf->vma;
@@ -728,12 +753,14 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
swp_entry_t entry;
unsigned int i;
bool page_allocated;
+ struct vma_swap_readahead ra_info = {0,};
- if (swap_ra->win == 1)
+ swap_ra_info(vmf, &ra_info);
+ if (ra_info.win == 1)
goto skip;
blk_start_plug(&plug);
- for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
+ for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
i++, pte++) {
pentry = *pte;
if (pte_none(pentry))
@@ -749,8 +776,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
continue;
if (page_allocated) {
swap_readpage(page, false);
- if (i != swap_ra->offset &&
- likely(!PageTransCompound(page))) {
+ if (i != ra_info.offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
}
@@ -761,23 +787,43 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
lru_add_drain();
skip:
return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- swap_ra->win == 1);
+ ra_info.win == 1);
+}
+
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * It's a main entry function for swap readahead. By the configuration,
+ * it will read ahead blocks by cluster-based(ie, physical disk based)
+ * or vma-based(ie, virtual address based on faulty address) readahead.
+ */
+struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
+{
+ return swap_use_vma_readahead() ?
+ swap_vma_readahead(entry, gfp_mask, vmf) :
+ swap_cluster_readahead(entry, gfp_mask, vmf);
}
#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
+ return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
- swap_vma_readahead = true;
+ enable_vma_readahead = true;
else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
- swap_vma_readahead = false;
+ enable_vma_readahead = false;
else
return -EINVAL;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c7a33717d079..0b3674b1409a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,6 +38,7 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/stop_machine.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -85,7 +86,7 @@ PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-struct plist_head *swap_avail_heads;
+static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -1107,6 +1108,64 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
return p;
}
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * preempt_disable() in get_swap_device() or after the
+ * preempt_enable() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long type, offset;
+
+ if (!entry.val)
+ goto out;
+ type = swp_type(entry);
+ if (type >= nr_swapfiles)
+ goto bad_nofile;
+ si = swap_info[type];
+
+ preempt_disable();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ preempt_enable();
+ return NULL;
+}
+
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1328,11 +1387,18 @@ int page_swapcount(struct page *page)
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1357,9 +1423,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1800,8 +1868,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1810,6 +1876,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
@@ -2451,9 +2519,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2478,7 +2546,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2497,6 +2569,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
add_to_avail_list(p);
}
+static int swap_onoff_stop(void *arg)
+{
+ return 0;
+}
+
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
@@ -2505,7 +2582,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are used
+ * between get/put_swap_device() only if SWP_VALID bit is set
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2514,7 +2601,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2617,6 +2705,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -3356,22 +3455,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
+ p = get_swap_device(entry);
+ if (!p)
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = swap_info[type];
offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
- goto out;
-
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3417,11 +3510,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3513,6 +3604,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3520,15 +3612,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3546,9 +3638,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3600,10 +3691,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*
diff --git a/mm/util.c b/mm/util.c
index c1250501364f..1fc4fa7576f7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -287,7 +287,7 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
}
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
@@ -515,6 +515,16 @@ struct address_space *page_mapping(struct page *page)
}
EXPORT_SYMBOL(page_mapping);
+/*
+ * For file cache pages, return the address_space, otherwise return NULL
+ */
+struct address_space *page_mapping_file(struct page *page)
+{
+ if (unlikely(PageSwapCache(page)))
+ return NULL;
+ return page_mapping(page);
+}
+
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
@@ -658,6 +668,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_node_page_state(NR_SLAB_RECLAIMABLE);
/*
+ * Part of the kernel memory, which can be released
+ * under memory pressure.
+ */
+ free += global_node_page_state(
+ NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
+
+ /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (free <= totalreserve_pages)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cd5dc3faaa57..7bf65da22ff0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -116,6 +116,15 @@ struct scan_control {
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
+
+ struct {
+ unsigned int dirty;
+ unsigned int unqueued_dirty;
+ unsigned int congested;
+ unsigned int writeback;
+ unsigned int immediate;
+ unsigned int file_taken;
+ } nr;
};
#ifdef ARCH_HAS_PREFETCH
@@ -190,6 +199,18 @@ static bool sane_reclaim(struct scan_control *sc)
#endif
return false;
}
+
+static void set_memcg_bit(enum pgdat_flags flag,
+ struct mem_cgroup *memcg)
+{
+ set_bit(flag, &memcg->flags);
+}
+
+static int test_memcg_bit(enum pgdat_flags flag,
+ struct mem_cgroup *memcg)
+{
+ return test_bit(flag, &memcg->flags);
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
@@ -200,6 +221,17 @@ static bool sane_reclaim(struct scan_control *sc)
{
return true;
}
+
+static inline void set_memcg_bit(enum pgdat_flags flag,
+ struct mem_cgroup *memcg)
+{
+}
+
+static inline int test_memcg_bit(enum pgdat_flags flag,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
#endif
/*
@@ -442,16 +474,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
- if (!down_read_trylock(&shrinker_rwsem)) {
- /*
- * If we would return 0, our callers would understand that we
- * have nothing else to shrink and give up trying. By returning
- * 1 we keep it going and assume we'll be able to shrink next
- * time.
- */
- freed = 1;
+ if (!down_read_trylock(&shrinker_rwsem))
goto out;
- }
list_for_each_entry(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
@@ -865,17 +889,6 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
-struct reclaim_stat {
- unsigned nr_dirty;
- unsigned nr_unqueued_dirty;
- unsigned nr_congested;
- unsigned nr_writeback;
- unsigned nr_immediate;
- unsigned nr_activate;
- unsigned nr_ref_keep;
- unsigned nr_unmap_fail;
-};
-
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -934,7 +947,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
/*
- * The number of dirty pages determines if a zone is marked
+ * The number of dirty pages determines if a node is marked
* reclaim_congested which affects wait_iff_congested. kswapd
* will stall and start writing pages if the tail of the LRU
* is all dirty unqueued pages.
@@ -1763,23 +1776,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
free_unref_page_list(&page_list);
/*
- * If reclaim is isolating dirty pages under writeback, it implies
- * that the long-lived page allocation rate is exceeding the page
- * laundering rate. Either the global limits are not being effective
- * at throttling processes due to the page distribution throughout
- * zones or there is heavy usage of a slow backing device. The
- * only option is to throttle from reclaim context which is not ideal
- * as there is no guarantee the dirtying process is throttled in the
- * same way balance_dirty_pages() manages.
- *
- * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
- * of pages under pages flagged for immediate reclaim and stall if any
- * are encountered in the nr_immediate check below.
- */
- if (stat.nr_writeback && stat.nr_writeback == nr_taken)
- set_bit(PGDAT_WRITEBACK, &pgdat->flags);
-
- /*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
* happen when memory pressure pushes dirty pages to the end of
@@ -1793,48 +1789,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (stat.nr_unqueued_dirty == nr_taken)
wakeup_flusher_threads(WB_REASON_VMSCAN);
- /*
- * Legacy memcg will stall in page writeback so avoid forcibly
- * stalling here.
- */
- if (sane_reclaim(sc)) {
- /*
- * Tag a zone as congested if all the dirty pages scanned were
- * backed by a congested BDI and wait_iff_congested will stall.
- */
- if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
- set_bit(PGDAT_CONGESTED, &pgdat->flags);
-
- /* Allow kswapd to start writing pages during reclaim. */
- if (stat.nr_unqueued_dirty == nr_taken)
- set_bit(PGDAT_DIRTY, &pgdat->flags);
-
- /*
- * If kswapd scans pages marked marked for immediate
- * reclaim and under writeback (nr_immediate), it implies
- * that pages are cycling through the LRU faster than
- * they are written so also forcibly stall.
- */
- if (stat.nr_immediate && current_may_throttle())
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
- /*
- * Stall direct reclaim for IO completions if underlying BDIs or zone
- * is congested. Allow kswapd to continue until it starts encountering
- * unqueued dirty pages or cycling through the LRU too quickly.
- */
- if (!sc->hibernation_mode && !current_is_kswapd() &&
- current_may_throttle())
- wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
+ sc->nr.dirty += stat.nr_dirty;
+ sc->nr.congested += stat.nr_congested;
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+ sc->nr.writeback += stat.nr_writeback;
+ sc->nr.immediate += stat.nr_immediate;
+ if (file)
+ sc->nr.file_taken += nr_taken;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
- nr_scanned, nr_reclaimed,
- stat.nr_dirty, stat.nr_writeback,
- stat.nr_congested, stat.nr_immediate,
- stat.nr_activate, stat.nr_ref_keep,
- stat.nr_unmap_fail,
- sc->priority, file);
+ nr_scanned, nr_reclaimed, &stat, sc->priority, file);
return nr_reclaimed;
}
@@ -2515,6 +2479,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return true;
}
+static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+{
+ return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
+ (memcg && test_memcg_bit(PGDAT_CONGESTED, memcg));
+}
+
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2530,6 +2500,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
+ memset(&sc->nr, 0, sizeof(sc->nr));
+
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
@@ -2595,6 +2567,68 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
+ if (current_is_kswapd()) {
+ /*
+ * If reclaim is isolating dirty pages under writeback,
+ * it implies that the long-lived page allocation rate
+ * is exceeding the page laundering rate. Either the
+ * global limits are not being effective at throttling
+ * processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing
+ * device. The only option is to throttle from reclaim
+ * context which is not ideal as there is no guarantee
+ * the dirtying process is throttled in the same way
+ * balance_dirty_pages() manages.
+ *
+ * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+ * count the number of pages under pages flagged for
+ * immediate reclaim and stall if any are encountered
+ * in the nr_immediate check below.
+ */
+ if (sc->nr.writeback &&
+ sc->nr.writeback == sc->nr.file_taken)
+ set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+ /*
+ * Tag a node as congested if all the dirty pages
+ * scanned were backed by a congested BDI and
+ * wait_iff_congested will stall.
+ */
+ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+ set_bit(PGDAT_CONGESTED, &pgdat->flags);
+
+ /* Allow kswapd to start writing pages during reclaim.*/
+ if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ set_bit(PGDAT_DIRTY, &pgdat->flags);
+
+ /*
+ * If kswapd scans pages marked marked for immediate
+ * reclaim and under writeback (nr_immediate), it
+ * implies that pages are cycling through the LRU
+ * faster than they are written so also forcibly stall.
+ */
+ if (sc->nr.immediate)
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
+
+ /*
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling in wait_iff_congested().
+ */
+ if (!global_reclaim(sc) && sane_reclaim(sc) &&
+ sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+ set_memcg_bit(PGDAT_CONGESTED, root);
+
+ /*
+ * Stall direct reclaim for IO completions if underlying BDIs
+ * and node is congested. Allow kswapd to continue until it
+ * starts encountering unqueued dirty pages or cycling through
+ * the LRU too quickly.
+ */
+ if (!sc->hibernation_mode && !current_is_kswapd() &&
+ current_may_throttle() && pgdat_memcg_congested(pgdat, root))
+ wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
@@ -3033,6 +3067,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
* the priority and make it zero.
*/
shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
+ clear_bit(PGDAT_CONGESTED, &memcg->flags);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -3078,6 +3113,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
memalloc_noreclaim_restore(noreclaim_flag);
+ clear_bit(PGDAT_CONGESTED, &memcg->flags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3547,16 +3583,21 @@ kswapd_try_sleep:
}
/*
- * A zone is low on free memory, so wake its kswapd task to service it.
+ * A zone is low on free memory or too fragmented for high-order memory. If
+ * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
+ * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
+ * has failed or is not needed, still wake up kcompactd if only compaction is
+ * needed.
*/
-void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+ enum zone_type classzone_idx)
{
pg_data_t *pgdat;
if (!managed_zone(zone))
return;
- if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
+ if (!cpuset_zone_allowed(zone, gfp_flags))
return;
pgdat = zone->zone_pgdat;
pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
@@ -3565,14 +3606,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- /* Hopeless node, leave it to direct reclaim */
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
- return;
-
- if (pgdat_balanced(pgdat, order, classzone_idx))
+ /* Hopeless node, leave it to direct reclaim if possible */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+ pgdat_balanced(pgdat, order, classzone_idx)) {
+ /*
+ * There may be plenty of free memory available, but it's too
+ * fragmented for high-order allocations. Wake up kcompactd
+ * and rely on compaction_suitable() to determine if it's
+ * needed. If it fails, it will defer subsequent attempts to
+ * ratelimit its work.
+ */
+ if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
+ wakeup_kcompactd(pgdat, order, classzone_idx);
return;
+ }
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+ gfp_flags);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3802,7 +3852,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
- * Free memory by calling shrink zone with increasing
+ * Free memory by calling shrink node with increasing
* priorities until we have enough memory freed.
*/
do {
@@ -3877,7 +3927,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
*/
int page_evictable(struct page *page)
{
- return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+ int ret;
+
+ /* Prevent address_space of inode and swap cache from being freed */
+ rcu_read_lock();
+ ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SHMEM
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 33581be705f0..536332e988b8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1161,6 +1161,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
+ "nr_indirectly_reclaimable",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
diff --git a/mm/z3fold.c b/mm/z3fold.c
index d589d318727f..f579ad4a8100 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -620,24 +620,27 @@ lookup:
bud = FIRST;
}
- spin_lock(&pool->stale_lock);
- zhdr = list_first_entry_or_null(&pool->stale,
- struct z3fold_header, buddy);
- /*
- * Before allocating a page, let's see if we can take one from the
- * stale pages list. cancel_work_sync() can sleep so we must make
- * sure it won't be called in case we're in atomic context.
- */
- if (zhdr && (can_sleep || !work_pending(&zhdr->work))) {
- list_del(&zhdr->buddy);
- spin_unlock(&pool->stale_lock);
- if (can_sleep)
+ page = NULL;
+ if (can_sleep) {
+ spin_lock(&pool->stale_lock);
+ zhdr = list_first_entry_or_null(&pool->stale,
+ struct z3fold_header, buddy);
+ /*
+ * Before allocating a page, let's see if we can take one from
+ * the stale pages list. cancel_work_sync() can sleep so we
+ * limit this case to the contexts where we can sleep
+ */
+ if (zhdr) {
+ list_del(&zhdr->buddy);
+ spin_unlock(&pool->stale_lock);
cancel_work_sync(&zhdr->work);
- page = virt_to_page(zhdr);
- } else {
- spin_unlock(&pool->stale_lock);
- page = alloc_page(gfp);
+ page = virt_to_page(zhdr);
+ } else {
+ spin_unlock(&pool->stale_lock);
+ }
}
+ if (!page)
+ page = alloc_page(gfp);
if (!page)
return -ENOMEM;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b7f61cd1c709..61cb05dc950c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -193,6 +193,7 @@ static struct vfsmount *zsmalloc_mnt;
* (see: fix_fullness_group())
*/
static const int fullness_threshold_frac = 4;
+static size_t huge_class_size;
struct size_class {
spinlock_t lock;
@@ -642,18 +643,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
return 0;
}
-
-static int zs_stats_size_open(struct inode *inode, struct file *file)
-{
- return single_open(file, zs_stats_size_show, inode->i_private);
-}
-
-static const struct file_operations zs_stat_size_ops = {
- .open = zs_stats_size_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(zs_stats_size);
static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
@@ -672,7 +662,7 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
pool->stat_dentry = entry;
entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
- pool->stat_dentry, pool, &zs_stat_size_ops);
+ pool->stat_dentry, pool, &zs_stats_size_fops);
if (!entry) {
pr_warn("%s: debugfs file entry <%s> creation failed\n",
name, "classes");
@@ -861,6 +851,7 @@ static struct page *get_next_page(struct page *page)
/**
* obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * @obj: the encoded object value
* @page: page object resides in zspage
* @obj_idx: object index
*/
@@ -1311,6 +1302,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
* zs_map_object - get address of allocated object from handle.
* @pool: pool from which the object was allocated
* @handle: handle returned from zs_malloc
+ * @mm: maping mode to use
*
* Before using an object allocated from zs_malloc, it must be mapped using
* this function. When done with the object, it must be unmapped using
@@ -1418,6 +1410,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
+/**
+ * zs_huge_class_size() - Returns the size (in bytes) of the first huge
+ * zsmalloc &size_class.
+ * @pool: zsmalloc pool to use
+ *
+ * The function returns the size of the first huge class - any object of equal
+ * or bigger size will be stored in zspage consisting of a single physical
+ * page.
+ *
+ * Context: Any context.
+ *
+ * Return: the size (in bytes) of the first huge zsmalloc &size_class.
+ */
+size_t zs_huge_class_size(struct zs_pool *pool)
+{
+ return huge_class_size;
+}
+EXPORT_SYMBOL_GPL(zs_huge_class_size);
+
static unsigned long obj_malloc(struct size_class *class,
struct zspage *zspage, unsigned long handle)
{
@@ -2375,6 +2386,27 @@ struct zs_pool *zs_create_pool(const char *name)
objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
/*
+ * We iterate from biggest down to smallest classes,
+ * so huge_class_size holds the size of the first huge
+ * class. Any object bigger than or equal to that will
+ * endup in the huge class.
+ */
+ if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
+ !huge_class_size) {
+ huge_class_size = size;
+ /*
+ * The object uses ZS_HANDLE_SIZE bytes to store the
+ * handle. We need to subtract it, because zs_malloc()
+ * unconditionally adds handle size before it performs
+ * size class search - so object may be smaller than
+ * huge class size, yet it still can end up in the huge
+ * class because it grows by ZS_HANDLE_SIZE extra bytes
+ * right before class lookup.
+ */
+ huge_class_size -= (ZS_HANDLE_SIZE - 1);
+ }
+
+ /*
* size_class is used for normal zsmalloc operation such
* as alloc/free for that size. Although it is natural that we
* have one size_class for each size, there is a chance that we