summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c450
1 files changed, 374 insertions, 76 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fb7440dd7ecc..fe4f123f1170 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
+#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
@@ -152,8 +153,13 @@ struct mem_cgroup_stat_cpu {
};
struct mem_cgroup_reclaim_iter {
- /* css_id of the last scanned hierarchy member */
- int position;
+ /*
+ * last scanned hierarchy member. Valid only if last_dead_count
+ * matches memcg->dead_count of the hierarchy root group.
+ */
+ struct mem_cgroup *last_visited;
+ unsigned long last_dead_count;
+
/* scan generation, increased every round-trip */
unsigned int generation;
};
@@ -256,6 +262,9 @@ struct mem_cgroup {
*/
struct res_counter res;
+ /* vmpressure notifications */
+ struct vmpressure vmpressure;
+
union {
/*
* the counter to account for mem+swap usage.
@@ -310,14 +319,31 @@ struct mem_cgroup {
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
- /* For oom notifier event fd */
- struct list_head oom_notify;
+ union {
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+ /*
+ * we can only trigger an oom event if the memcg is alive.
+ * so we will reuse this field to hook the memcg in the list
+ * of dead memcgs.
+ */
+ struct list_head dead;
+ };
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
+ union {
+ /*
+ * Should we move charges of a task when a task is moved into
+ * this mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+
+ /*
+ * We are no longer concerned about moving charges after memcg
+ * is dead. So we will fill this up with its name, to aid
+ * debugging.
+ */
+ char *memcg_name;
+ };
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
@@ -335,6 +361,7 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
+ atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
#endif
@@ -353,6 +380,7 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
+
/*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
@@ -369,6 +397,55 @@ static size_t memcg_size(void)
nr_node_ids * sizeof(struct mem_cgroup_per_node);
}
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static LIST_HEAD(dangling_memcgs);
+static DEFINE_MUTEX(dangling_memcgs_mutex);
+
+static inline void memcg_dangling_free(struct mem_cgroup *memcg)
+{
+ mutex_lock(&dangling_memcgs_mutex);
+ list_del(&memcg->dead);
+ mutex_unlock(&dangling_memcgs_mutex);
+ free_pages((unsigned long)memcg->memcg_name, 0);
+}
+
+static inline void memcg_dangling_add(struct mem_cgroup *memcg)
+{
+ /*
+ * cgroup.c will do page-sized allocations most of the time,
+ * so we'll just follow the pattern. Also, __get_free_pages
+ * is a better interface than kmalloc for us here, because
+ * we'd like this memory to be always billed to the root cgroup,
+ * not to the process removing the memcg. While kmalloc would
+ * require us to wrap it into memcg_stop/resume_kmem_account,
+ * with __get_free_pages we just don't pass the memcg flag.
+ */
+ memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0);
+
+ /*
+ * we will, in general, just ignore failures. No need to go crazy,
+ * being this just a debugging interface. It is nice to copy a memcg
+ * name over, but if we (unlikely) can't, just the address will do
+ */
+ if (!memcg->memcg_name)
+ goto add_list;
+
+ if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) {
+ free_pages((unsigned long)memcg->memcg_name, 0);
+ memcg->memcg_name = NULL;
+ }
+
+add_list:
+ INIT_LIST_HEAD(&memcg->dead);
+ mutex_lock(&dangling_memcgs_mutex);
+ list_add(&memcg->dead, &dangling_memcgs);
+ mutex_unlock(&dangling_memcgs_mutex);
+}
+#else
+static inline void memcg_dangling_free(struct mem_cgroup *memcg) {}
+static inline void memcg_dangling_add(struct mem_cgroup *memcg) {}
+#endif
+
/* internal only representation about the status of kmem accounting. */
enum {
KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -504,6 +581,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
return container_of(s, struct mem_cgroup, css);
}
+/* Some nice accessors for the vmpressure. */
+struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
+{
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ return &memcg->vmpressure;
+}
+
+struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+{
+ return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+}
+
+struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
+{
+ return &mem_cgroup_from_css(css)->vmpressure;
+}
+
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -1067,6 +1162,51 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
+/*
+ * Returns a next (in a pre-order walk) alive memcg (with elevated css
+ * ref. count) or NULL if the whole root's subtree has been visited.
+ *
+ * helper function to be used by mem_cgroup_iter
+ */
+static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
+ struct mem_cgroup *last_visited)
+{
+ struct cgroup *prev_cgroup, *next_cgroup;
+
+ /*
+ * Root is not visited by cgroup iterators so it needs an
+ * explicit visit.
+ */
+ if (!last_visited)
+ return root;
+
+ prev_cgroup = (last_visited == root) ? NULL
+ : last_visited->css.cgroup;
+skip_node:
+ next_cgroup = cgroup_next_descendant_pre(
+ prev_cgroup, root->css.cgroup);
+
+ /*
+ * Even if we found a group we have to make sure it is
+ * alive. css && !memcg means that the groups should be
+ * skipped and we should continue the tree walk.
+ * last_visited css is safe to use because it is
+ * protected by css_get and the tree walk is rcu safe.
+ */
+ if (next_cgroup) {
+ struct mem_cgroup *mem = mem_cgroup_from_cont(
+ next_cgroup);
+ if (css_tryget(&mem->css))
+ return mem;
+ else {
+ prev_cgroup = next_cgroup;
+ goto skip_node;
+ }
+ }
+
+ return NULL;
+}
+
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -1089,7 +1229,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup *memcg = NULL;
- int id = 0;
+ struct mem_cgroup *last_visited = NULL;
+ unsigned long uninitialized_var(dead_count);
if (mem_cgroup_disabled())
return NULL;
@@ -1098,20 +1239,17 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
root = root_mem_cgroup;
if (prev && !reclaim)
- id = css_id(&prev->css);
-
- if (prev && prev != root)
- css_put(&prev->css);
+ last_visited = prev;
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
- return NULL;
+ goto out_css_put;
return root;
}
+ rcu_read_lock();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
- struct cgroup_subsys_state *css;
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1120,31 +1258,60 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
- if (prev && reclaim->generation != iter->generation)
- return NULL;
- id = iter->position;
+ last_visited = iter->last_visited;
+ if (prev && reclaim->generation != iter->generation) {
+ iter->last_visited = NULL;
+ goto out_unlock;
+ }
+
+ /*
+ * If the dead_count mismatches, a destruction
+ * has happened or is happening concurrently.
+ * If the dead_count matches, a destruction
+ * might still happen concurrently, but since
+ * we checked under RCU, that destruction
+ * won't free the object until we release the
+ * RCU reader lock. Thus, the dead_count
+ * check verifies the pointer is still valid,
+ * css_tryget() verifies the cgroup pointed to
+ * is alive.
+ */
+ dead_count = atomic_read(&root->dead_count);
+ smp_rmb();
+ last_visited = iter->last_visited;
+ if (last_visited) {
+ if ((dead_count != iter->last_dead_count) ||
+ !css_tryget(&last_visited->css)) {
+ last_visited = NULL;
+ }
+ }
}
- rcu_read_lock();
- css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
- if (css) {
- if (css == &root->css || css_tryget(css))
- memcg = mem_cgroup_from_css(css);
- } else
- id = 0;
- rcu_read_unlock();
+ memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- iter->position = id;
- if (!css)
+ if (last_visited)
+ css_put(&last_visited->css);
+
+ iter->last_visited = memcg;
+ smp_wmb();
+ iter->last_dead_count = dead_count;
+
+ if (!memcg)
iter->generation++;
else if (!prev && memcg)
reclaim->generation = iter->generation;
}
- if (prev && !css)
- return NULL;
+ if (prev && !memcg)
+ goto out_unlock;
}
+out_unlock:
+ rcu_read_unlock();
+out_css_put:
+ if (prev && prev != root)
+ css_put(&prev->css);
+
return memcg;
}
@@ -1686,11 +1853,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct task_struct *chosen = NULL;
/*
- * If current has a pending SIGKILL, then automatically select it. The
- * goal is to allow it to allocate so that it may quickly exit and free
- * its memory.
+ * If current has a pending SIGKILL or is exiting, then automatically
+ * select it. The goal is to allow it to allocate so that it may
+ * quickly exit and free its memory.
*/
- if (fatal_signal_pending(current)) {
+ if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
set_thread_flag(TIF_MEMDIE);
return;
}
@@ -3114,12 +3281,12 @@ void memcg_release_cache(struct kmem_cache *s)
root = s->memcg_params->root_cache;
root->memcg_params->memcg_caches[id] = NULL;
- mem_cgroup_put(memcg);
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
+ mem_cgroup_put(memcg);
out:
kfree(s->memcg_params);
}
@@ -3383,7 +3550,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
- * Called with rcu_read_lock.
*/
static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
@@ -3391,12 +3557,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
struct create_work *cw;
cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
- if (cw == NULL)
- return;
-
- /* The corresponding put will be done in the workqueue. */
- if (!css_tryget(&memcg->css)) {
- kfree(cw);
+ if (cw == NULL) {
+ css_put(&memcg->css);
return;
}
@@ -3452,10 +3614,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
- rcu_read_unlock();
if (!memcg_can_account_kmem(memcg))
- return cachep;
+ goto out;
idx = memcg_cache_id(memcg);
@@ -3464,29 +3625,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
- if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
- /*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * kmem_cache_dup, this means no further allocation could happen
- * with the slab_mutex held.
- *
- * Also, because cache creation issue get_online_cpus(), this
- * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
- * that ends up reversed during cpu hotplug. (cpuset allocates
- * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
- * better to defer everything.
- */
- memcg_create_cache_enqueue(memcg, cachep);
- return cachep;
+ if (likely(cachep->memcg_params->memcg_caches[idx])) {
+ cachep = cachep->memcg_params->memcg_caches[idx];
+ goto out;
}
- return cachep->memcg_params->memcg_caches[idx];
+ /* The corresponding put will be done in the workqueue. */
+ if (!css_tryget(&memcg->css))
+ goto out;
+ rcu_read_unlock();
+
+ /*
+ * If we are in a safe context (can wait, and not in interrupt
+ * context), we could be be predictable and return right away.
+ * This would guarantee that the allocation being performed
+ * already belongs in the new cache.
+ *
+ * However, there are some clashes that can arrive from locking.
+ * For instance, because we acquire the slab_mutex while doing
+ * kmem_cache_dup, this means no further allocation could happen
+ * with the slab_mutex held.
+ *
+ * Also, because cache creation issue get_online_cpus(), this
+ * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+ * that ends up reversed during cpu hotplug. (cpuset allocates
+ * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+ * better to defer everything.
+ */
+ memcg_create_cache_enqueue(memcg, cachep);
+ return cachep;
+out:
+ rcu_read_unlock();
+ return cachep;
}
EXPORT_SYMBOL(__memcg_kmem_get_cache);
@@ -4948,9 +5118,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
- if (!do_swap_account && type == _MEMSWAP)
- return -EOPNOTSUPP;
-
switch (type) {
case _MEM:
if (name == RES_USAGE)
@@ -4975,6 +5142,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static void
+mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_SWAP
+ u64 kmem;
+ u64 memsw;
+
+ /*
+ * kmem will also propagate here, so we are only interested in the
+ * difference. See comment in mem_cgroup_reparent_charges for details.
+ *
+ * We could save this value for later consumption by kmem reports, but
+ * there is not a lot of problem if the figures differ slightly.
+ */
+ kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem;
+ seq_printf(m, "\t%llu swap bytes\n", memsw);
+#endif
+}
+
+
+static void
+mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+ struct tcp_memcontrol *tcp = &memcg->tcp_mem;
+ s64 tcp_socks;
+ u64 tcp_bytes;
+
+ tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated);
+ tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
+ seq_printf(m, "\t%llu tcp bytes", tcp_bytes);
+ /*
+ * if tcp_bytes == 0, tcp_socks != 0 is a bug. One more reason to print
+ * it!
+ */
+ if (tcp_bytes || tcp_socks)
+ seq_printf(m, ", in %lld sockets", tcp_socks);
+ seq_printf(m, "\n");
+
+#endif
+}
+
+static void
+mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ u64 kmem;
+ struct memcg_cache_params *params;
+
+ kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ seq_printf(m, "\t%llu kmem bytes", kmem);
+
+ /* list below may not be initialized, so not even try */
+ if (!kmem)
+ return;
+
+ seq_printf(m, " in caches");
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ struct kmem_cache *s = memcg_params_to_cache(params);
+
+ seq_printf(m, " %s", s->name);
+ }
+ mutex_unlock(&memcg->slab_caches_mutex);
+ seq_printf(m, "\n");
+#endif
+}
+
+/*
+ * After a memcg is destroyed, it may still be kept around in memory.
+ * Currently, the two main reasons for it are swap entries, and kernel memory.
+ * Because they will be freed assynchronously, they will pin the memcg structure
+ * and its resources until the last reference goes away.
+ *
+ * This root-only file will show information about which users
+ */
+static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct mem_cgroup *memcg;
+
+ mutex_lock(&dangling_memcgs_mutex);
+
+ list_for_each_entry(memcg, &dangling_memcgs, dead) {
+ if (memcg->memcg_name)
+ seq_printf(m, "%s:\n", memcg->memcg_name);
+ else
+ seq_printf(m, "%p (name lost):\n", memcg);
+
+ mem_cgroup_dangling_swap(memcg, m);
+ mem_cgroup_dangling_tcp(memcg, m);
+ mem_cgroup_dangling_kmem(memcg, m);
+ }
+
+ mutex_unlock(&dangling_memcgs_mutex);
+ return 0;
+}
+#endif
+
static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
{
int ret = -EINVAL;
@@ -5085,9 +5353,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
- if (!do_swap_account && type == _MEMSWAP)
- return -EOPNOTSUPP;
-
switch (name) {
case RES_LIMIT:
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -5164,9 +5429,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
- if (!do_swap_account && type == _MEMSWAP)
- return -EOPNOTSUPP;
-
switch (name) {
case RES_MAX_USAGE:
if (type == _MEM)
@@ -5745,7 +6007,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return ret;
return mem_cgroup_sockets_init(memcg, ss);
-};
+}
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
@@ -5840,6 +6102,11 @@ static struct cftype mem_cgroup_files[] = {
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
+ {
+ .name = "pressure_level",
+ .register_event = vmpressure_register_event,
+ .unregister_event = vmpressure_unregister_event,
+ },
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
@@ -5877,6 +6144,14 @@ static struct cftype mem_cgroup_files[] = {
},
#endif
#endif
+
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+ {
+ .name = "dangling_memcgs",
+ .read_seq_string = mem_cgroup_dangling_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+#endif
{ }, /* terminate */
};
@@ -6026,6 +6301,8 @@ static void free_work(struct work_struct *work)
struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, work_freeing);
+
+ memcg_dangling_free(memcg);
__mem_cgroup_free(memcg);
}
@@ -6121,6 +6398,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
+ vmpressure_init(&memcg->vmpressure);
return &memcg->css;
@@ -6186,10 +6464,29 @@ mem_cgroup_css_online(struct cgroup *cont)
return error;
}
+/*
+ * Announce all parents that a group from their hierarchy is gone.
+ */
+static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *parent = memcg;
+
+ while ((parent = parent_mem_cgroup(parent)))
+ atomic_inc(&parent->dead_count);
+
+ /*
+ * if the root memcg is not hierarchical we have to check it
+ * explicitely.
+ */
+ if (!root_mem_cgroup->use_hierarchy)
+ atomic_inc(&root_mem_cgroup->dead_count);
+}
+
static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ mem_cgroup_invalidate_reclaim_iterators(memcg);
mem_cgroup_reparent_charges(memcg);
mem_cgroup_destroy_all_caches(memcg);
}
@@ -6200,6 +6497,7 @@ static void mem_cgroup_css_free(struct cgroup *cont)
kmem_cgroup_destroy(memcg);
+ memcg_dangling_add(memcg);
mem_cgroup_put(memcg);
}