summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile2
-rw-r--r--mm/filemap.c11
-rw-r--r--mm/huge_memory.c17
-rw-r--r--mm/list_lru.c139
-rw-r--r--mm/memcontrol.c809
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c43
-rw-r--r--mm/oom_kill.c7
-rw-r--r--mm/swap.c44
-rw-r--r--mm/vmscan.c324
11 files changed, 742 insertions, 660 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 256bfd0f6007..e847f199c5c9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -245,7 +245,7 @@ config COMPACTION
config MIGRATION
bool "Page migration"
def_bool y
- depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
+ depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful in
@@ -522,7 +522,7 @@ config MEM_SOFT_DIRTY
config CMA
bool "Contiguous Memory Allocator"
- depends on HAVE_MEMBLOCK
+ depends on HAVE_MEMBLOCK && MMU
select MIGRATION
select MEMORY_ISOLATION
help
diff --git a/mm/Makefile b/mm/Makefile
index f00803386a67..305d10acd081 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o \
- interval_tree.o $(mmu-y)
+ interval_tree.o list_lru.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 1c707f37cb1a..2d0f6d842858 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1594,6 +1594,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
+ bool memcg_oom;
pgoff_t size;
int ret = 0;
@@ -1602,7 +1603,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/*
- * Do we have something in the page cache already?
+ * Do we have something in the page cache already? Either
+ * way, try readahead, but disable the memcg OOM killer for it
+ * as readahead is optional and no errors are propagated up
+ * the fault stack. The OOM killer is enabled while trying to
+ * instantiate the faulting page individually below.
*/
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1610,10 +1615,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
+ memcg_oom = mem_cgroup_toggle_oom(false);
do_async_mmap_readahead(vma, ra, file, page, offset);
+ mem_cgroup_toggle_oom(memcg_oom);
} else if (!page) {
/* No page in the page cache at all */
+ memcg_oom = mem_cgroup_toggle_oom(false);
do_sync_mmap_readahead(vma, ra, file, offset);
+ mem_cgroup_toggle_oom(memcg_oom);
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 243f4cc75777..d96d921064bd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -211,24 +211,29 @@ static void put_huge_zero_page(void)
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-static int shrink_huge_zero_page(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- if (!sc->nr_to_scan)
- /* we can free zero page only if last reference remains */
- return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
__free_page(zero_page);
+ return HPAGE_PMD_NR;
}
return 0;
}
static struct shrinker huge_zero_page_shrinker = {
- .shrink = shrink_huge_zero_page,
+ .count_objects = shrink_huge_zero_page_count,
+ .scan_objects = shrink_huge_zero_page_scan,
.seeks = DEFAULT_SEEKS,
};
diff --git a/mm/list_lru.c b/mm/list_lru.c
new file mode 100644
index 000000000000..72467914b856
--- /dev/null
+++ b/mm/list_lru.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
+ * Authors: David Chinner and Glauber Costa
+ *
+ * Generic LRU infrastructure
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/list_lru.h>
+#include <linux/slab.h>
+
+bool list_lru_add(struct list_lru *lru, struct list_head *item)
+{
+ int nid = page_to_nid(virt_to_page(item));
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ if (list_empty(item)) {
+ list_add_tail(item, &nlru->list);
+ if (nlru->nr_items++ == 0)
+ node_set(nid, lru->active_nodes);
+ spin_unlock(&nlru->lock);
+ return true;
+ }
+ spin_unlock(&nlru->lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_add);
+
+bool list_lru_del(struct list_lru *lru, struct list_head *item)
+{
+ int nid = page_to_nid(virt_to_page(item));
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ if (!list_empty(item)) {
+ list_del_init(item);
+ if (--nlru->nr_items == 0)
+ node_clear(nid, lru->active_nodes);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ spin_unlock(&nlru->lock);
+ return true;
+ }
+ spin_unlock(&nlru->lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_del);
+
+unsigned long
+list_lru_count_node(struct list_lru *lru, int nid)
+{
+ unsigned long count = 0;
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ count += nlru->nr_items;
+ spin_unlock(&nlru->lock);
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(list_lru_count_node);
+
+unsigned long
+list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
+ void *cb_arg, unsigned long *nr_to_walk)
+{
+
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_head *item, *n;
+ unsigned long isolated = 0;
+
+ spin_lock(&nlru->lock);
+restart:
+ list_for_each_safe(item, n, &nlru->list) {
+ enum lru_status ret;
+
+ /*
+ * decrement nr_to_walk first so that we don't livelock if we
+ * get stuck on large numbesr of LRU_RETRY items
+ */
+ if (--(*nr_to_walk) == 0)
+ break;
+
+ ret = isolate(item, &nlru->lock, cb_arg);
+ switch (ret) {
+ case LRU_REMOVED:
+ if (--nlru->nr_items == 0)
+ node_clear(nid, lru->active_nodes);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ isolated++;
+ break;
+ case LRU_ROTATE:
+ list_move_tail(item, &nlru->list);
+ break;
+ case LRU_SKIP:
+ break;
+ case LRU_RETRY:
+ /*
+ * The lru lock has been dropped, our list traversal is
+ * now invalid and so we have to restart from scratch.
+ */
+ goto restart;
+ default:
+ BUG();
+ }
+ }
+
+ spin_unlock(&nlru->lock);
+ return isolated;
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_node);
+
+int list_lru_init(struct list_lru *lru)
+{
+ int i;
+ size_t size = sizeof(*lru->node) * nr_node_ids;
+
+ lru->node = kzalloc(size, GFP_KERNEL);
+ if (!lru->node)
+ return -ENOMEM;
+
+ nodes_clear(lru->active_nodes);
+ for (i = 0; i < nr_node_ids; i++) {
+ spin_lock_init(&lru->node[i].lock);
+ INIT_LIST_HEAD(&lru->node[i].list);
+ lru->node[i].nr_items = 0;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(list_lru_init);
+
+void list_lru_destroy(struct list_lru *lru)
+{
+ kfree(lru->node);
+}
+EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5ca1dcf77ce9..0417eab39af9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,7 +39,6 @@
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -175,10 +174,6 @@ struct mem_cgroup_per_zone {
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
- struct rb_node tree_node; /* RB tree node */
- unsigned long long usage_in_excess;/* Set to the value by which */
- /* the soft limit is exceeded*/
- bool on_tree;
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
@@ -187,26 +182,6 @@ struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
-/*
- * Cgroups above their limits are maintained in a RB-Tree, independent of
- * their hierarchy representation
- */
-
-struct mem_cgroup_tree_per_zone {
- struct rb_root rb_root;
- spinlock_t lock;
-};
-
-struct mem_cgroup_tree_per_node {
- struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
-struct mem_cgroup_tree {
- struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
-};
-
-static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
u64 threshold;
@@ -280,6 +255,7 @@ struct mem_cgroup {
bool oom_lock;
atomic_t under_oom;
+ atomic_t oom_wakeups;
int swappiness;
/* OOM-Killer disable */
@@ -304,7 +280,7 @@ struct mem_cgroup {
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
- unsigned long move_charge_at_immigrate;
+ unsigned long move_charge_at_immigrate;
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
@@ -341,6 +317,22 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
+ /*
+ * Protects soft_contributed transitions.
+ * See mem_cgroup_update_soft_limit
+ */
+ spinlock_t soft_lock;
+
+ /*
+ * If true then this group has increased parents' children_in_excess
+ * when it got over the soft limit.
+ * When a group falls bellow the soft limit, parents' children_in_excess
+ * is decreased and soft_contributed changed to false.
+ */
+ bool soft_contributed;
+
+ /* Number of children that are in soft limit excess */
+ atomic_t children_in_excess;
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
@@ -444,7 +436,6 @@ static bool move_file(void)
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
-#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -671,164 +662,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
return mem_cgroup_zoneinfo(memcg, nid, zid);
}
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
-{
- return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_from_page(struct page *page)
-{
- int nid = page_to_nid(page);
- int zid = page_zonenum(page);
-
- return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz,
- unsigned long long new_usage_in_excess)
-{
- struct rb_node **p = &mctz->rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct mem_cgroup_per_zone *mz_node;
-
- if (mz->on_tree)
- return;
-
- mz->usage_in_excess = new_usage_in_excess;
- if (!mz->usage_in_excess)
- return;
- while (*p) {
- parent = *p;
- mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
- tree_node);
- if (mz->usage_in_excess < mz_node->usage_in_excess)
- p = &(*p)->rb_left;
- /*
- * We can't avoid mem cgroups that are over their soft
- * limit by the same amount
- */
- else if (mz->usage_in_excess >= mz_node->usage_in_excess)
- p = &(*p)->rb_right;
- }
- rb_link_node(&mz->tree_node, parent, p);
- rb_insert_color(&mz->tree_node, &mctz->rb_root);
- mz->on_tree = true;
-}
-
-static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
-{
- if (!mz->on_tree)
- return;
- rb_erase(&mz->tree_node, &mctz->rb_root);
- mz->on_tree = false;
-}
-
-static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
-{
- spin_lock(&mctz->lock);
- __mem_cgroup_remove_exceeded(memcg, mz, mctz);
- spin_unlock(&mctz->lock);
-}
-
-
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
-{
- unsigned long long excess;
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup_tree_per_zone *mctz;
- int nid = page_to_nid(page);
- int zid = page_zonenum(page);
- mctz = soft_limit_tree_from_page(page);
-
- /*
- * Necessary to update all ancestors when hierarchy is used.
- * because their event counter is not touched.
- */
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = mem_cgroup_zoneinfo(memcg, nid, zid);
- excess = res_counter_soft_limit_excess(&memcg->res);
- /*
- * We have to update the tree if mz is on RB-tree or
- * mem is over its softlimit.
- */
- if (excess || mz->on_tree) {
- spin_lock(&mctz->lock);
- /* if on-tree, remove it */
- if (mz->on_tree)
- __mem_cgroup_remove_exceeded(memcg, mz, mctz);
- /*
- * Insert again. mz->usage_in_excess will be updated.
- * If excess is 0, no tree ops.
- */
- __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
- spin_unlock(&mctz->lock);
- }
- }
-}
-
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
-{
- int node, zone;
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup_tree_per_zone *mctz;
-
- for_each_node(node) {
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = mem_cgroup_zoneinfo(memcg, node, zone);
- mctz = soft_limit_tree_node_zone(node, zone);
- mem_cgroup_remove_exceeded(memcg, mz, mctz);
- }
- }
-}
-
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
- struct rb_node *rightmost = NULL;
- struct mem_cgroup_per_zone *mz;
-
-retry:
- mz = NULL;
- rightmost = rb_last(&mctz->rb_root);
- if (!rightmost)
- goto done; /* Nothing to reclaim from */
-
- mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
- /*
- * Remove the node now but someone else can add it back,
- * we will to add it back at the end of reclaim to its correct
- * position in the tree.
- */
- __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
- if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
- !css_tryget(&mz->memcg->css))
- goto retry;
-done:
- return mz;
-}
-
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
- struct mem_cgroup_per_zone *mz;
-
- spin_lock(&mctz->lock);
- mz = __mem_cgroup_largest_soft_limit_node(mctz);
- spin_unlock(&mctz->lock);
- return mz;
-}
-
/*
* Implementation Note: reading percpu statistics for memcg.
*
@@ -1003,6 +836,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
}
/*
+ * Called from rate-limited memcg_check_events when enough
+ * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
+ * that all the parents up the hierarchy will be notified that this group
+ * is in excess or that it is not in excess anymore. mmecg->soft_contributed
+ * makes the transition a single action whenever the state flips from one to
+ * the other.
+ */
+static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
+{
+ unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
+ struct mem_cgroup *parent = memcg;
+ int delta = 0;
+
+ spin_lock(&memcg->soft_lock);
+ if (excess) {
+ if (!memcg->soft_contributed) {
+ delta = 1;
+ memcg->soft_contributed = true;
+ }
+ } else {
+ if (memcg->soft_contributed) {
+ delta = -1;
+ memcg->soft_contributed = false;
+ }
+ }
+
+ /*
+ * Necessary to update all ancestors when hierarchy is used
+ * because their event counter is not touched.
+ * We track children even outside the hierarchy for the root
+ * cgroup because tree walk starting at root should visit
+ * all cgroups and we want to prevent from pointless tree
+ * walk if no children is below the limit.
+ */
+ while (delta && (parent = parent_mem_cgroup(parent)))
+ atomic_add(delta, &parent->children_in_excess);
+ if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+ atomic_add(delta, &root_mem_cgroup->children_in_excess);
+ spin_unlock(&memcg->soft_lock);
+}
+
+/*
* Check events in order.
*
*/
@@ -1025,7 +900,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
- mem_cgroup_update_tree(memcg, page);
+ mem_cgroup_update_soft_limit(memcg);
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
@@ -1068,6 +943,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
+static enum mem_cgroup_filter_t
+mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
+ mem_cgroup_iter_filter cond)
+{
+ if (!cond)
+ return VISIT;
+ return cond(memcg, root);
+}
+
/*
* Returns a next (in a pre-order walk) alive memcg (with elevated css
* ref. count) or NULL if the whole root's subtree has been visited.
@@ -1075,7 +959,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
* helper function to be used by mem_cgroup_iter
*/
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
- struct mem_cgroup *last_visited)
+ struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
{
struct cgroup_subsys_state *prev_css, *next_css;
@@ -1093,11 +977,31 @@ skip_node:
if (next_css) {
struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
- if (css_tryget(&mem->css))
- return mem;
- else {
+ switch (mem_cgroup_filter(mem, root, cond)) {
+ case SKIP:
prev_css = next_css;
goto skip_node;
+ case SKIP_TREE:
+ if (mem == root)
+ return NULL;
+ /*
+ * css_rightmost_descendant is not an optimal way to
+ * skip through a subtree (especially for imbalanced
+ * trees leaning to right) but that's what we have right
+ * now. More effective solution would be traversing
+ * right-up for first non-NULL without calling
+ * css_next_descendant_pre afterwards.
+ */
+ prev_css = css_rightmost_descendant(next_css);
+ goto skip_node;
+ case VISIT:
+ if (css_tryget(&mem->css))
+ return mem;
+ else {
+ prev_css = next_css;
+ goto skip_node;
+ }
+ break;
}
}
@@ -1161,6 +1065,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
* @reclaim: cookie for shared reclaim walks, NULL for full walks
+ * @cond: filter for visited nodes, NULL for no filter
*
* Returns references to children of the hierarchy below @root, or
* @root itself, or %NULL after a full round-trip.
@@ -1173,9 +1078,10 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
* divide up the memcgs in the hierarchy among all concurrent
* reclaimers operating on the same zone and priority.
*/
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
struct mem_cgroup *prev,
- struct mem_cgroup_reclaim_cookie *reclaim)
+ struct mem_cgroup_reclaim_cookie *reclaim,
+ mem_cgroup_iter_filter cond)
{
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *last_visited = NULL;
@@ -1192,7 +1098,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
goto out_css_put;
- return root;
+ if (mem_cgroup_filter(root, root, cond) == VISIT)
+ return root;
+ return NULL;
}
rcu_read_lock();
@@ -1215,7 +1123,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
last_visited = mem_cgroup_iter_load(iter, root, &seq);
}
- memcg = __mem_cgroup_iter_next(root, last_visited);
+ memcg = __mem_cgroup_iter_next(root, last_visited, cond);
if (reclaim) {
mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1226,7 +1134,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
reclaim->generation = iter->generation;
}
- if (prev && !memcg)
+ /*
+ * We have finished the whole tree walk or no group has been
+ * visited because filter told us to skip the root node.
+ */
+ if (!memcg && (prev || (cond && !last_visited)))
goto out_unlock;
}
out_unlock:
@@ -1867,6 +1779,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
return total;
}
+#if MAX_NUMNODES > 1
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
@@ -1889,7 +1802,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
return false;
}
-#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
@@ -1957,115 +1869,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
return node;
}
-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
- int nid;
-
- /*
- * quick check...making use of scan_node.
- * We can skip unused nodes.
- */
- if (!nodes_empty(memcg->scan_nodes)) {
- for (nid = first_node(memcg->scan_nodes);
- nid < MAX_NUMNODES;
- nid = next_node(nid, memcg->scan_nodes)) {
-
- if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
- return true;
- }
- }
- /*
- * Check rest of nodes.
- */
- for_each_node_state(nid, N_MEMORY) {
- if (node_isset(nid, memcg->scan_nodes))
- continue;
- if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
- return true;
- }
- return false;
-}
-
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
- return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
#endif
-static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
- struct zone *zone,
- gfp_t gfp_mask,
- unsigned long *total_scanned)
-{
- struct mem_cgroup *victim = NULL;
- int total = 0;
- int loop = 0;
- unsigned long excess;
- unsigned long nr_scanned;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .zone = zone,
- .priority = 0,
- };
-
- excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
-
- while (1) {
- victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
- if (!victim) {
- loop++;
- if (loop >= 2) {
- /*
- * If we have not been able to reclaim
- * anything, it might because there are
- * no reclaimable pages under this hierarchy
- */
- if (!total)
- break;
- /*
- * We want to do more targeted reclaim.
- * excess >> 2 is not to excessive so as to
- * reclaim too much, nor too less that we keep
- * coming back to reclaim from this cgroup
- */
- if (total >= (excess >> 2) ||
- (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
- break;
- }
- continue;
- }
- if (!mem_cgroup_reclaimable(victim, false))
- continue;
- total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
- zone, &nr_scanned);
- *total_scanned += nr_scanned;
- if (!res_counter_soft_limit_excess(&root_memcg->res))
+/*
+ * A group is eligible for the soft limit reclaim under the given root
+ * hierarchy if
+ * a) it is over its soft limit
+ * b) any parent up the hierarchy is over its soft limit
+ *
+ * If the given group doesn't have any children over the limit then it
+ * doesn't make any sense to iterate its subtree.
+ */
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+ struct mem_cgroup *root)
+{
+ struct mem_cgroup *parent;
+
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ parent = memcg;
+
+ if (res_counter_soft_limit_excess(&memcg->res))
+ return VISIT;
+
+ /*
+ * If any parent up to the root in the hierarchy is over its soft limit
+ * then we have to obey and reclaim from this group as well.
+ */
+ while ((parent = parent_mem_cgroup(parent))) {
+ if (res_counter_soft_limit_excess(&parent->res))
+ return VISIT;
+ if (parent == root)
break;
}
- mem_cgroup_iter_break(root_memcg, victim);
- return total;
+
+ if (!atomic_read(&memcg->children_in_excess))
+ return SKIP_TREE;
+ return SKIP;
}
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
- * Has to be called with memcg_oom_lock
*/
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter, *failed = NULL;
+ spin_lock(&memcg_oom_lock);
+
for_each_mem_cgroup_tree(iter, memcg) {
if (iter->oom_lock) {
/*
@@ -2079,33 +1940,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
iter->oom_lock = true;
}
- if (!failed)
- return true;
-
- /*
- * OK, we failed to lock the whole subtree so we have to clean up
- * what we set up to the failing subtree
- */
- for_each_mem_cgroup_tree(iter, memcg) {
- if (iter == failed) {
- mem_cgroup_iter_break(memcg, iter);
- break;
+ if (failed) {
+ /*
+ * OK, we failed to lock the whole subtree so we have
+ * to clean up what we set up to the failing subtree
+ */
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter == failed) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ iter->oom_lock = false;
}
- iter->oom_lock = false;
}
- return false;
+
+ spin_unlock(&memcg_oom_lock);
+
+ return !failed;
}
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
- return 0;
+ spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2129,7 +1990,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
}
-static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
@@ -2159,6 +2019,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
+ atomic_inc(&memcg->oom_wakeups);
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
@@ -2170,56 +2031,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
}
/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
*/
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
- int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- struct oom_wait_info owait;
- bool locked, need_to_kill;
+ bool locked;
+ int wakeups;
- owait.memcg = memcg;
- owait.wait.flags = 0;
- owait.wait.func = memcg_oom_wake_function;
- owait.wait.private = current;
- INIT_LIST_HEAD(&owait.wait.task_list);
- need_to_kill = true;
- mem_cgroup_mark_under_oom(memcg);
+ if (!current->memcg_oom.may_oom)
+ return;
+
+ current->memcg_oom.in_memcg_oom = 1;
- /* At first, try to OOM lock hierarchy under memcg.*/
- spin_lock(&memcg_oom_lock);
- locked = mem_cgroup_oom_lock(memcg);
/*
- * Even if signal_pending(), we can't quit charge() loop without
- * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
- * under OOM is always welcomed, use TASK_KILLABLE here.
+ * As with any blocking lock, a contender needs to start
+ * listening for wakeups before attempting the trylock,
+ * otherwise it can miss the wakeup from the unlock and sleep
+ * indefinitely. This is just open-coded because our locking
+ * is so particular to memcg hierarchies.
*/
- prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- if (!locked || memcg->oom_kill_disable)
- need_to_kill = false;
+ wakeups = atomic_read(&memcg->oom_wakeups);
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
if (locked)
mem_cgroup_oom_notify(memcg);
- spin_unlock(&memcg_oom_lock);
- if (need_to_kill) {
- finish_wait(&memcg_oom_waitq, &owait.wait);
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
mem_cgroup_out_of_memory(memcg, mask, order);
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+ * sees the wakeups triggered by the OOM kill
+ * uncharges. Wake any sleepers explicitely.
+ */
+ memcg_oom_recover(memcg);
} else {
- schedule();
- finish_wait(&memcg_oom_waitq, &owait.wait);
+ /*
+ * A system call can just return -ENOMEM, but if this
+ * is a page fault and somebody else is handling the
+ * OOM already, we need to sleep on the OOM waitqueue
+ * for this memcg until the situation is resolved.
+ * Which can take some time because it might be
+ * handled by a userspace task.
+ *
+ * However, this is the charge context, which means
+ * that we may sit on a large call stack and hold
+ * various filesystem locks, the mmap_sem etc. and we
+ * don't want the OOM handler to deadlock on them
+ * while we sit here and wait. Store the current OOM
+ * context in the task_struct, then return -ENOMEM.
+ * At the end of the page fault handler, with the
+ * stack unwound, pagefault_out_of_memory() will check
+ * back with us by calling
+ * mem_cgroup_oom_synchronize(), possibly putting the
+ * task to sleep.
+ */
+ current->memcg_oom.oom_locked = locked;
+ current->memcg_oom.wakeups = wakeups;
+ css_get(&memcg->css);
+ current->memcg_oom.wait_on_memcg = memcg;
}
- spin_lock(&memcg_oom_lock);
- if (locked)
- mem_cgroup_oom_unlock(memcg);
- memcg_wakeup_oom(memcg);
- spin_unlock(&memcg_oom_lock);
+}
- mem_cgroup_unmark_under_oom(memcg);
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+ struct oom_wait_info owait;
+ struct mem_cgroup *memcg;
- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ /* OOM is global, do not handle */
+ if (!current->memcg_oom.in_memcg_oom)
return false;
- /* Give chance to dying process */
- schedule_timeout_uninterruptible(1);
+
+ /*
+ * We invoked the OOM killer but there is a chance that a kill
+ * did not free up any charges. Everybody else might already
+ * be sleeping, so restart the fault and keep the rampage
+ * going until some charges are released.
+ */
+ memcg = current->memcg_oom.wait_on_memcg;
+ if (!memcg)
+ goto out;
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ goto out_memcg;
+
+ owait.memcg = memcg;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.task_list);
+
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ /* Only sleep if we didn't miss any wakeups since OOM */
+ if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+ schedule();
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+ mem_cgroup_unmark_under_oom(memcg);
+ if (current->memcg_oom.oom_locked) {
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+ * sees the wakeups triggered by the OOM kill
+ * uncharges. Wake any sleepers explicitely.
+ */
+ memcg_oom_recover(memcg);
+ }
+ css_put(&memcg->css);
+ current->memcg_oom.wait_on_memcg = NULL;
+out:
+ current->memcg_oom.in_memcg_oom = 0;
return true;
}
@@ -2450,7 +2391,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
flush_work(&stock->work);
}
out:
- put_online_cpus();
+ put_online_cpus();
}
/*
@@ -2532,12 +2473,11 @@ enum {
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
- CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages, unsigned int min_pages,
- bool oom_check)
+ bool invoke_oom)
{
unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
@@ -2594,14 +2534,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (mem_cgroup_wait_acct_move(mem_over_limit))
return CHARGE_RETRY;
- /* If we don't need to call oom-killer at el, return immediately */
- if (!oom_check)
- return CHARGE_NOMEM;
- /* check OOM */
- if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
- return CHARGE_OOM_DIE;
+ if (invoke_oom)
+ mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
- return CHARGE_RETRY;
+ return CHARGE_NOMEM;
}
/*
@@ -2704,7 +2640,7 @@ again:
}
do {
- bool oom_check;
+ bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
if (fatal_signal_pending(current)) {
@@ -2712,14 +2648,8 @@ again:
goto bypass;
}
- oom_check = false;
- if (oom && !nr_oom_retries) {
- oom_check = true;
- nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- }
-
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
- oom_check);
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+ nr_pages, invoke_oom);
switch (ret) {
case CHARGE_OK:
break;
@@ -2732,16 +2662,12 @@ again:
css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom) {
+ if (!oom || invoke_oom) {
css_put(&memcg->css);
goto nomem;
}
- /* If oom, we never return -ENOMEM */
nr_oom_retries--;
break;
- case CHARGE_OOM_DIE: /* Killed by OOM Killer */
- css_put(&memcg->css);
- goto bypass;
}
} while (ret != CHARGE_OK);
@@ -2882,7 +2808,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
* is accessed after testing USED bit. To make pc->mem_cgroup visible
* before USED bit, we need memory barrier here.
* See mem_cgroup_add_lru_list(), etc.
- */
+ */
smp_wmb();
SetPageCgroupUsed(pc);
@@ -2905,9 +2831,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
unlock_page_cgroup(pc);
/*
- * "charge_statistics" updated event counter. Then, check it.
- * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
- * if they exceeds softlimit.
+ * "charge_statistics" updated event counter.
*/
memcg_check_events(memcg, page);
}
@@ -3626,9 +3550,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
* the page allocator. Therefore, the following sequence when backed by
* the SLUB allocator:
*
- * memcg_stop_kmem_account();
- * kmalloc(<large_number>)
- * memcg_resume_kmem_account();
+ * memcg_stop_kmem_account();
+ * kmalloc(<large_number>)
+ * memcg_resume_kmem_account();
*
* would effectively ignore the fact that we should skip accounting,
* since it will drive us directly to this function without passing
@@ -4657,7 +4581,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
- if (curusage >= oldusage)
+ if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
@@ -4678,7 +4602,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
int enlarge = 0;
/* see mem_cgroup_resize_res_limit */
- retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
@@ -4727,98 +4651,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret;
}
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
- gfp_t gfp_mask,
- unsigned long *total_scanned)
-{
- unsigned long nr_reclaimed = 0;
- struct mem_cgroup_per_zone *mz, *next_mz = NULL;
- unsigned long reclaimed;
- int loop = 0;
- struct mem_cgroup_tree_per_zone *mctz;
- unsigned long long excess;
- unsigned long nr_scanned;
-
- if (order > 0)
- return 0;
-
- mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
- /*
- * This loop can run a while, specially if mem_cgroup's continuously
- * keep exceeding their soft limit and putting the system under
- * pressure
- */
- do {
- if (next_mz)
- mz = next_mz;
- else
- mz = mem_cgroup_largest_soft_limit_node(mctz);
- if (!mz)
- break;
-
- nr_scanned = 0;
- reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
- gfp_mask, &nr_scanned);
- nr_reclaimed += reclaimed;
- *total_scanned += nr_scanned;
- spin_lock(&mctz->lock);
-
- /*
- * If we failed to reclaim anything from this memory cgroup
- * it is time to move on to the next cgroup
- */
- next_mz = NULL;
- if (!reclaimed) {
- do {
- /*
- * Loop until we find yet another one.
- *
- * By the time we get the soft_limit lock
- * again, someone might have aded the
- * group back on the RB tree. Iterate to
- * make sure we get a different mem.
- * mem_cgroup_largest_soft_limit_node returns
- * NULL if no other cgroup is present on
- * the tree
- */
- next_mz =
- __mem_cgroup_largest_soft_limit_node(mctz);
- if (next_mz == mz)
- css_put(&next_mz->memcg->css);
- else /* next_mz == NULL or other memcg */
- break;
- } while (1);
- }
- __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
- excess = res_counter_soft_limit_excess(&mz->memcg->res);
- /*
- * One school of thought says that we should not add
- * back the node to the tree if reclaim returns 0.
- * But our reclaim could return 0, simply because due
- * to priority we are exposing a smaller subset of
- * memory to reclaim from. Consider this as a longer
- * term TODO.
- */
- /* If excess == 0, no tree ops */
- __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
- spin_unlock(&mctz->lock);
- css_put(&mz->memcg->css);
- loop++;
- /*
- * Could not reclaim anything and there are no more
- * mem cgroups to try or we seem to be looping without
- * reclaiming anything.
- */
- if (!nr_reclaimed &&
- (next_mz == NULL ||
- loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
- break;
- } while (!nr_reclaimed);
- if (next_mz)
- css_put(&next_mz->memcg->css);
- return nr_reclaimed;
-}
-
/**
* mem_cgroup_force_empty_list - clears LRU of a group
* @memcg: group to clear
@@ -4990,18 +4822,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
unsigned int event)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- int ret;
if (mem_cgroup_is_root(memcg))
return -EINVAL;
- css_get(&memcg->css);
- ret = mem_cgroup_force_empty(memcg);
- css_put(&memcg->css);
-
- return ret;
+ return mem_cgroup_force_empty(memcg);
}
-
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5139,7 +4965,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
*/
mutex_lock(&memcg_create_mutex);
mutex_lock(&set_limit_mutex);
- if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+ if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
ret = -EBUSY;
goto out;
@@ -5149,7 +4975,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
ret = memcg_update_cache_sizes(memcg);
if (ret) {
- res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+ res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
goto out;
}
static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -6083,8 +5909,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
lruvec_init(&mz->lruvec);
- mz->usage_in_excess = 0;
- mz->on_tree = false;
mz->memcg = memcg;
}
memcg->nodeinfo[node] = pn;
@@ -6140,7 +5964,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
int node;
size_t size = memcg_size();
- mem_cgroup_remove_from_trees(memcg);
free_css_id(&mem_cgroup_subsys, &memcg->css);
for_each_node(node)
@@ -6177,29 +6000,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
- struct mem_cgroup_tree_per_node *rtpn;
- struct mem_cgroup_tree_per_zone *rtpz;
- int tmp, node, zone;
-
- for_each_node(node) {
- tmp = node;
- if (!node_state(node, N_NORMAL_MEMORY))
- tmp = -1;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
- BUG_ON(!rtpn);
-
- soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- rtpz = &rtpn->rb_tree_per_zone[zone];
- rtpz->rb_root = RB_ROOT;
- spin_lock_init(&rtpz->lock);
- }
- }
-}
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -6229,6 +6029,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
+ spin_lock_init(&memcg->soft_lock);
return &memcg->css;
@@ -6306,6 +6107,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
mem_cgroup_invalidate_reclaim_iterators(memcg);
mem_cgroup_reparent_charges(memcg);
+ if (memcg->soft_contributed) {
+ while ((memcg = parent_mem_cgroup(memcg)))
+ atomic_dec(&memcg->children_in_excess);
+
+ if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+ atomic_dec(&root_mem_cgroup->children_in_excess);
+ }
mem_cgroup_destroy_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
@@ -6980,7 +6788,6 @@ static int __init mem_cgroup_init(void)
{
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
enable_swap_cgroup();
- mem_cgroup_soft_limit_tree_init();
memcg_stock_init();
return 0;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fc6ce6c185be..a20477f62a0b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -248,10 +248,12 @@ void shake_page(struct page *p, int access)
*/
if (access) {
int nr;
+ int nid = page_to_nid(p);
do {
struct shrink_control shrink = {
.gfp_mask = GFP_KERNEL,
};
+ node_set(nid, shrink.nodes_to_scan);
nr = shrink_slab(&shrink, 1000, 1000);
if (page_count(p) == 1)
diff --git a/mm/memory.c b/mm/memory.c
index 380e4a66717a..ca0003947115 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3754,22 +3754,14 @@ unlock:
/*
* By the time we get here, we already hold the mm semaphore
*/
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- __set_current_state(TASK_RUNNING);
-
- count_vm_event(PGFAULT);
- mem_cgroup_count_vm_event(mm, PGFAULT);
-
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
-
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3853,6 +3845,37 @@ retry:
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ int ret;
+
+ __set_current_state(TASK_RUNNING);
+
+ count_vm_event(PGFAULT);
+ mem_cgroup_count_vm_event(mm, PGFAULT);
+
+ /* do counter updates before entering really critical section. */
+ check_sync_rss_stat(current);
+
+ /*
+ * Enable the memcg OOM handling for faults triggered in user
+ * space. Kernel faults are handled more gracefully.
+ */
+ if (flags & FAULT_FLAG_USER)
+ mem_cgroup_enable_oom();
+
+ ret = __handle_mm_fault(mm, vma, address, flags);
+
+ if (flags & FAULT_FLAG_USER)
+ mem_cgroup_disable_oom();
+
+ if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+ mem_cgroup_oom_synchronize();
+
+ return ret;
+}
+
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 98e75f2ac7bc..314e9d274381 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -678,9 +678,12 @@ out:
*/
void pagefault_out_of_memory(void)
{
- struct zonelist *zonelist = node_zonelist(first_online_node,
- GFP_KERNEL);
+ struct zonelist *zonelist;
+ if (mem_cgroup_oom_synchronize())
+ return;
+
+ zonelist = node_zonelist(first_online_node, GFP_KERNEL);
if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
out_of_memory(NULL, 0, 0, NULL, false);
clear_zonelist_oom(zonelist, GFP_KERNEL);
diff --git a/mm/swap.c b/mm/swap.c
index c899502d3e36..759c3caf44bd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -432,6 +432,11 @@ static void activate_page_drain(int cpu)
pagevec_lru_move_fn(pvec, __activate_page, NULL);
}
+static bool need_activate_page_drain(int cpu)
+{
+ return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
void activate_page(struct page *page)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu)
{
}
+static bool need_activate_page_drain(int cpu)
+{
+ return false;
+}
+
void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
lru_add_drain();
}
-/*
- * Returns 0 for success
- */
-int lru_add_drain_all(void)
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
+void lru_add_drain_all(void)
{
- return schedule_on_each_cpu(lru_add_drain_per_cpu);
+ static DEFINE_MUTEX(lock);
+ static struct cpumask has_work;
+ int cpu;
+
+ mutex_lock(&lock);
+ get_online_cpus();
+ cpumask_clear(&has_work);
+
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+ if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+ pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ need_activate_page_drain(cpu)) {
+ INIT_WORK(work, lru_add_drain_per_cpu);
+ schedule_work_on(cpu, work);
+ cpumask_set_cpu(cpu, &has_work);
+ }
+ }
+
+ for_each_cpu(cpu, &has_work)
+ flush_work(&per_cpu(lru_add_drain_work, cpu));
+
+ put_online_cpus();
+ mutex_unlock(&lock);
}
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 758540d3ca83..a3bf7fd8522c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+ return !mem_cgroup_disabled() &&
+ mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+ return false;
+}
#endif
static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -155,14 +167,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
}
/*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
*/
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
{
- atomic_long_set(&shrinker->nr_in_batch, 0);
+ size_t size = sizeof(*shrinker->nr_deferred);
+
+ /*
+ * If we only have one possible node in the system anyway, save
+ * ourselves the trouble and disable NUMA aware behavior. This way we
+ * will save memory and some small loop time later.
+ */
+ if (nr_node_ids == 1)
+ shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+ if (shrinker->flags & SHRINKER_NUMA_AWARE)
+ size *= nr_node_ids;
+
+ shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+ if (!shrinker->nr_deferred)
+ return -ENOMEM;
+
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
+ return 0;
}
EXPORT_SYMBOL(register_shrinker);
@@ -177,15 +206,102 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
- struct shrink_control *sc,
- unsigned long nr_to_scan)
-{
- sc->nr_to_scan = nr_to_scan;
- return (*shrinker->shrink)(shrinker, sc);
+#define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+ long max_pass;
+ long nr;
+ long new_nr;
+ int nid = shrinkctl->nid;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+
+ max_pass = shrinker->count_objects(shrinker, shrinkctl);
+ if (max_pass == 0)
+ return 0;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+ total_scan = nr;
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta *= max_pass;
+ do_div(delta, lru_pages + 1);
+ total_scan += delta;
+ if (total_scan < 0) {
+ printk(KERN_ERR
+ "shrink_slab: %pF negative objects to delete nr=%ld\n",
+ shrinker->scan_objects, total_scan);
+ total_scan = max_pass;
+ }
+
+ /*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
+
+ while (total_scan >= batch_size) {
+ unsigned long ret;
+
+ shrinkctl->nr_to_scan = batch_size;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+
+ count_vm_events(SLABS_SCANNED, batch_size);
+ total_scan -= batch_size;
+
+ cond_resched();
+ }
+
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ if (total_scan > 0)
+ new_nr = atomic_long_add_return(total_scan,
+ &shrinker->nr_deferred[nid]);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+ trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+ return freed;
}
-#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
*
@@ -205,115 +321,45 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(struct shrink_control *shrink,
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
- unsigned long ret = 0;
+ unsigned long freed = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
- /* Assume we'll be able to shrink next time */
- ret = 1;
+ /*
+ * If we would return 0, our callers would understand that we
+ * have nothing else to shrink and give up trying. By returning
+ * 1 we keep it going and assume we'll be able to shrink next
+ * time.
+ */
+ freed = 1;
goto out;
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- unsigned long long delta;
- long total_scan;
- long max_pass;
- int shrink_ret = 0;
- long nr;
- long new_nr;
- long batch_size = shrinker->batch ? shrinker->batch
- : SHRINK_BATCH;
-
- max_pass = do_shrinker_shrink(shrinker, shrink, 0);
- if (max_pass <= 0)
- continue;
-
- /*
- * copy the current shrinker scan count into a local variable
- * and zero it so that other concurrent shrinker invocations
- * don't also do this scanning work.
- */
- nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
- total_scan = nr;
- delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= max_pass;
- do_div(delta, lru_pages + 1);
- total_scan += delta;
- if (total_scan < 0) {
- printk(KERN_ERR "shrink_slab: %pF negative objects to "
- "delete nr=%ld\n",
- shrinker->shrink, total_scan);
- total_scan = max_pass;
- }
-
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * max_pass. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (delta < max_pass / 4)
- total_scan = min(total_scan, max_pass / 2);
-
- /*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
- */
- if (total_scan > max_pass * 2)
- total_scan = max_pass * 2;
-
- trace_mm_shrink_slab_start(shrinker, shrink, nr,
- nr_pages_scanned, lru_pages,
- max_pass, delta, total_scan);
-
- while (total_scan >= batch_size) {
- int nr_before;
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (!node_online(shrinkctl->nid))
+ continue;
- nr_before = do_shrinker_shrink(shrinker, shrink, 0);
- shrink_ret = do_shrinker_shrink(shrinker, shrink,
- batch_size);
- if (shrink_ret == -1)
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+ (shrinkctl->nid != 0))
break;
- if (shrink_ret < nr_before)
- ret += nr_before - shrink_ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
- cond_resched();
- }
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
- /*
- * move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates. If we exhausted the
- * scan, there is no need to do an update.
- */
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
- &shrinker->nr_in_batch);
- else
- new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
- trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+ }
}
up_read(&shrinker_rwsem);
out:
cond_resched();
- return ret;
+ return freed;
}
static inline int is_page_cache_freeable(struct page *page)
@@ -2111,9 +2157,11 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static int
+__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
{
unsigned long nr_reclaimed, nr_scanned;
+ int groups_scanned = 0;
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2121,15 +2169,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
.zone = zone,
.priority = sc->priority,
};
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
+ mem_cgroup_iter_filter filter = (soft_reclaim) ?
+ mem_cgroup_soft_reclaim_eligible : NULL;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
- do {
+ while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
struct lruvec *lruvec;
+ groups_scanned++;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
shrink_lruvec(lruvec, sc);
@@ -2149,8 +2199,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
mem_cgroup_iter_break(root, memcg);
break;
}
- memcg = mem_cgroup_iter(root, memcg, &reclaim);
- } while (memcg);
+ }
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
@@ -2158,6 +2207,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
+
+ return groups_scanned;
+}
+
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+ bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
+ unsigned long nr_scanned = sc->nr_scanned;
+ int scanned_groups;
+
+ scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
+ /*
+ * memcg iterator might race with other reclaimer or start from
+ * a incomplete tree walk so the tree walk in __shrink_zone
+ * might have missed groups that are above the soft limit. Try
+ * another loop to catch up with others. Do it just once to
+ * prevent from reclaim latencies when other reclaimers always
+ * preempt this one.
+ */
+ if (do_soft_reclaim && !scanned_groups)
+ __shrink_zone(zone, sc, do_soft_reclaim);
+
+ /*
+ * No group is over the soft limit or those that are do not have
+ * pages in the zone we are reclaiming so we have to reclaim everybody
+ */
+ if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
+ __shrink_zone(zone, sc, false);
+ return;
+ }
}
/* Returns true if compaction should go ahead for a high-order request */
@@ -2221,8 +2301,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- unsigned long nr_soft_reclaimed;
- unsigned long nr_soft_scanned;
bool aborted_reclaim = false;
/*
@@ -2262,18 +2340,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
continue;
}
}
- /*
- * This steals pages from memory cgroups over softlimit
- * and returns the number of reclaimed pages and
- * scanned pages. This works for global memory pressure
- * and balancing, not for a memcg's limit.
- */
- nr_soft_scanned = 0;
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- sc->order, sc->gfp_mask,
- &nr_soft_scanned);
- sc->nr_reclaimed += nr_soft_reclaimed;
- sc->nr_scanned += nr_soft_scanned;
/* need some check for avoid more shrink_zone() */
}
@@ -2354,12 +2420,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
+
+ nodes_clear(shrink->nodes_to_scan);
for_each_zone_zonelist(zone, z, zonelist,
gfp_zone(sc->gfp_mask)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone_reclaimable_pages(zone);
+ node_set(zone_to_nid(zone),
+ shrink->nodes_to_scan);
}
shrink_slab(shrink, sc->nr_scanned, lru_pages);
@@ -2816,6 +2886,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
return true;
shrink_zone(zone, sc);
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
@@ -2870,8 +2942,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
- unsigned long nr_soft_reclaimed;
- unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.priority = DEF_PRIORITY,
@@ -2986,15 +3056,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
sc.nr_scanned = 0;
- nr_soft_scanned = 0;
- /*
- * Call soft limit reclaim before calling shrink_zone.
- */
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- order, sc.gfp_mask,
- &nr_soft_scanned);
- sc.nr_reclaimed += nr_soft_reclaimed;
-
/*
* There should be no need to raise the scanning
* priority if enough pages are already being scanned
@@ -3524,10 +3585,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* number of slab pages and shake the slab until it is reduced
* by the same nr_pages that we used for reclaiming unmapped
* pages.
- *
- * Note that shrink_slab will free memory on all zones and may
- * take a long time.
*/
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
for (;;) {
unsigned long lru_pages = zone_reclaimable_pages(zone);