diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-02-14 15:35:28 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-02-14 15:35:28 +1100 |
commit | f600c997a1e930c3be312fdc4d1a67979d89173e (patch) | |
tree | 352f5d35d672887a5552730cb71af9d3f068fe19 /mm/memcontrol.c | |
parent | 065957b017e290977dc5527ae4e21cff9d3fb5ec (diff) | |
parent | 0ef183f7f162a62662f6002b03bcf3ca2903c937 (diff) |
Merge branch 'akpm/master'
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 656 |
1 files changed, 449 insertions, 207 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fbb60b103e64..c1ea9cf1e455 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = { "pgmajfault", }; +static const char * const mem_cgroup_lru_names[] = { + "inactive_anon", + "active_anon", + "inactive_file", + "active_file", + "unevictable", +}; + /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremated by the number of pages. This counter is used for @@ -172,7 +180,7 @@ struct mem_cgroup_per_node { }; struct mem_cgroup_lru_info { - struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; + struct mem_cgroup_per_node *nodeinfo[0]; }; /* @@ -276,17 +284,6 @@ struct mem_cgroup { */ struct res_counter kmem; /* - * Per cgroup active and inactive list, similar to the - * per zone LRU lists. - */ - struct mem_cgroup_lru_info info; - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; @@ -313,14 +310,31 @@ struct mem_cgroup { /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; - /* For oom notifier event fd */ - struct list_head oom_notify; + union { + /* For oom notifier event fd */ + struct list_head oom_notify; + /* + * we can only trigger an oom event if the memcg is alive. + * so we will reuse this field to hook the memcg in the list + * of dead memcgs. + */ + struct list_head dead; + }; + + union { + /* + * Should we move charges of a task when a task is moved into + * this mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; + /* + * We are no longer concerned about moving charges after memcg + * is dead. So we will fill this up with its name, to aid + * debugging. + */ + char *memcg_name; + }; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ @@ -349,8 +363,78 @@ struct mem_cgroup { /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif + + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + atomic_t numainfo_events; + atomic_t numainfo_updating; +#endif + /* + * Per cgroup active and inactive list, similar to the + * per zone LRU lists. + * + * WARNING: This has to be the last element of the struct. Don't + * add new fields after this point. + */ + struct mem_cgroup_lru_info info; }; +static size_t memcg_size(void) +{ + return sizeof(struct mem_cgroup) + + nr_node_ids * sizeof(struct mem_cgroup_per_node); +} + +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static LIST_HEAD(dangling_memcgs); +static DEFINE_MUTEX(dangling_memcgs_mutex); + +static inline void memcg_dangling_free(struct mem_cgroup *memcg) +{ + mutex_lock(&dangling_memcgs_mutex); + list_del(&memcg->dead); + mutex_unlock(&dangling_memcgs_mutex); + free_pages((unsigned long)memcg->memcg_name, 0); +} + +static inline void memcg_dangling_add(struct mem_cgroup *memcg) +{ + /* + * cgroup.c will do page-sized allocations most of the time, + * so we'll just follow the pattern. Also, __get_free_pages + * is a better interface than kmalloc for us here, because + * we'd like this memory to be always billed to the root cgroup, + * not to the process removing the memcg. While kmalloc would + * require us to wrap it into memcg_stop/resume_kmem_account, + * with __get_free_pages we just don't pass the memcg flag. + */ + memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0); + + /* + * we will, in general, just ignore failures. No need to go crazy, + * being this just a debugging interface. It is nice to copy a memcg + * name over, but if we (unlikely) can't, just the address will do + */ + if (!memcg->memcg_name) + goto add_list; + + if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) { + free_pages((unsigned long)memcg->memcg_name, 0); + memcg->memcg_name = NULL; + } + +add_list: + INIT_LIST_HEAD(&memcg->dead); + mutex_lock(&dangling_memcgs_mutex); + list_add(&memcg->dead, &dangling_memcgs); + mutex_unlock(&dangling_memcgs_mutex); +} +#else +static inline void memcg_dangling_free(struct mem_cgroup *memcg) {} +static inline void memcg_dangling_add(struct mem_cgroup *memcg) {} +#endif + /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ @@ -398,8 +482,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) /* Stuffs for move charges at task migration. */ /* - * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a - * left-shifted bitmap of these types. + * Types of charges to be moved. "move_charge_at_immitgrate" and + * "immigrate_flags" are treated as a left-shifted bitmap of these types. */ enum move_type { MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ @@ -412,6 +496,7 @@ static struct move_charge_struct { spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; + unsigned long immigrate_flags; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; @@ -424,14 +509,12 @@ static struct move_charge_struct { static bool move_anon(void) { - return test_bit(MOVE_CHARGE_TYPE_ANON, - &mc.to->move_charge_at_immigrate); + return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); } static bool move_file(void) { - return test_bit(MOVE_CHARGE_TYPE_FILE, - &mc.to->move_charge_at_immigrate); + return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); } /* @@ -471,6 +554,13 @@ enum res_type { #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) +/* + * The memcg_create_mutex will be held whenever a new cgroup is created. + * As a consequence, any change that needs to protect against new child cgroups + * appearing has to hold it as well. + */ +static DEFINE_MUTEX(memcg_create_mutex); + static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); @@ -627,6 +717,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { + VM_BUG_ON((unsigned)nid >= nr_node_ids); return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; } @@ -1371,17 +1462,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } -int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) -{ - unsigned long active; - unsigned long inactive; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); - - return (active > inactive); -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1524,8 +1604,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, spin_unlock_irqrestore(&memcg->move_lock, *flags); } +#define K(x) ((x) << (PAGE_SHIFT-10)) /** - * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. + * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. * @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * @@ -1543,8 +1624,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) */ static char memcg_name[PATH_MAX]; int ret; + struct mem_cgroup *iter; + unsigned int i; - if (!memcg || !p) + if (!p) return; rcu_read_lock(); @@ -1563,7 +1646,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) } rcu_read_unlock(); - printk(KERN_INFO "Task in %s killed", memcg_name); + pr_info("Task in %s killed", memcg_name); rcu_read_lock(); ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); @@ -1576,22 +1659,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Continues from above, so we don't need an KERN_ level */ - printk(KERN_CONT " as a result of limit of %s\n", memcg_name); + pr_cont(" as a result of limit of %s\n", memcg_name); done: - printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->res, RES_FAILCNT)); - printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " - "failcnt %llu\n", + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); - printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); + + for_each_mem_cgroup_tree(iter, memcg) { + pr_info("Memory cgroup stats"); + + rcu_read_lock(); + ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); + if (!ret) + pr_cont(" for %s", memcg_name); + rcu_read_unlock(); + pr_cont(":"); + + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + continue; + pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], + K(mem_cgroup_read_stat(iter, i))); + } + + for (i = 0; i < NR_LRU_LISTS; i++) + pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], + K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); + + pr_cont("\n"); + } } /* @@ -2256,6 +2362,17 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } +static void __init memcg_stock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct memcg_stock_pcp *stock = + &per_cpu(memcg_stock, cpu); + INIT_WORK(&stock->work, drain_local_stock); + } +} + /* * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. @@ -4391,8 +4508,8 @@ void mem_cgroup_print_bad_page(struct page *page) pc = lookup_page_cgroup_used(page); if (pc) { - printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", - pc, pc->flags, pc->mem_cgroup); + pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", + pc, pc->flags, pc->mem_cgroup); } } #endif @@ -4719,6 +4836,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) } /* + * This mainly exists for tests during the setting of set of use_hierarchy. + * Since this is the very setting we are changing, the current hierarchy value + * is meaningless + */ +static inline bool __memcg_has_children(struct mem_cgroup *memcg) +{ + struct cgroup *pos; + + /* bounce at first found */ + cgroup_for_each_child(pos, memcg->css.cgroup) + return true; + return false; +} + +/* + * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed + * to be already dead (as in mem_cgroup_force_empty, for instance). This is + * from mem_cgroup_count_children(), in the sense that we don't really care how + * many children we have; we only need to know if we have any. It also counts + * any memcg without hierarchy as infertile. + */ +static inline bool memcg_has_children(struct mem_cgroup *memcg) +{ + return memcg->use_hierarchy && __memcg_has_children(memcg); +} + +/* * Reclaims as many pages from the given memcg as possible and moves * the rest to the parent. * @@ -4788,7 +4932,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, if (parent) parent_memcg = mem_cgroup_from_cont(parent); - cgroup_lock(); + mutex_lock(&memcg_create_mutex); if (memcg->use_hierarchy == val) goto out; @@ -4803,7 +4947,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (list_empty(&cont->children)) + if (!__memcg_has_children(memcg)) memcg->use_hierarchy = val; else retval = -EBUSY; @@ -4811,7 +4955,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, retval = -EINVAL; out: - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return retval; } @@ -4892,12 +5036,111 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static void +mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_SWAP + u64 kmem; + u64 memsw; + + /* + * kmem will also propagate here, so we are only interested in the + * difference. See comment in mem_cgroup_reparent_charges for details. + * + * We could save this value for later consumption by kmem reports, but + * there is not a lot of problem if the figures differ slightly. + */ + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem; + seq_printf(m, "\t%llu swap bytes\n", memsw); +#endif +} + + +static void +mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m) +{ +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) + struct tcp_memcontrol *tcp = &memcg->tcp_mem; + s64 tcp_socks; + u64 tcp_bytes; + + tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated); + tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + seq_printf(m, "\t%llu tcp bytes", tcp_bytes); + /* + * if tcp_bytes == 0, tcp_socks != 0 is a bug. One more reason to print + * it! + */ + if (tcp_bytes || tcp_socks) + seq_printf(m, ", in %lld sockets", tcp_socks); + seq_printf(m, "\n"); + +#endif +} + +static void +mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_KMEM + u64 kmem; + struct memcg_cache_params *params; + + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + seq_printf(m, "\t%llu kmem bytes", kmem); + + /* list below may not be initialized, so not even try */ + if (!kmem) + return; + + seq_printf(m, " in caches"); + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + struct kmem_cache *s = memcg_params_to_cache(params); + + seq_printf(m, " %s", s->name); + } + mutex_unlock(&memcg->slab_caches_mutex); + seq_printf(m, "\n"); +#endif +} + +/* + * After a memcg is destroyed, it may still be kept around in memory. + * Currently, the two main reasons for it are swap entries, and kernel memory. + * Because they will be freed assynchronously, they will pin the memcg structure + * and its resources until the last reference goes away. + * + * This root-only file will show information about which users + */ +static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg; + + mutex_lock(&dangling_memcgs_mutex); + + list_for_each_entry(memcg, &dangling_memcgs, dead) { + if (memcg->memcg_name) + seq_printf(m, "%s:\n", memcg->memcg_name); + else + seq_printf(m, "%p (name lost):\n", memcg); + + mem_cgroup_dangling_swap(memcg, m); + mem_cgroup_dangling_tcp(memcg, m); + mem_cgroup_dangling_kmem(memcg, m); + } + + mutex_unlock(&dangling_memcgs_mutex); + return 0; +} +#endif + static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM - bool must_inc_static_branch = false; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -4910,18 +5153,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) * * After it first became limited, changes in the value of the limit are * of course permitted. - * - * Taking the cgroup_lock is really offensive, but it is so far the only - * way to guarantee that no children will appear. There are plenty of - * other offenders, and they should all go away. Fine grained locking - * is probably the way to go here. When we are fully hierarchical, we - * can also get rid of the use_hierarchy check. */ - cgroup_lock(); + mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { - if (cgroup_task_count(cont) || (memcg->use_hierarchy && - !list_empty(&cont->children))) { + if (cgroup_task_count(cont) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; } @@ -4933,7 +5169,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); goto out; } - must_inc_static_branch = true; + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + /* * kmem charges can outlive the cgroup. In the case of slab * pages, for instance, a page contain objects from various @@ -4945,28 +5187,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) ret = res_counter_set_limit(&memcg->kmem, val); out: mutex_unlock(&set_limit_mutex); - cgroup_unlock(); - - /* - * We are by now familiar with the fact that we can't inc the static - * branch inside cgroup_lock. See disarm functions for details. A - * worker here is overkill, but also wrong: After the limit is set, we - * must start accounting right away. Since this operation can't fail, - * we can safely defer it to here - no rollback will be needed. - * - * The boolean used to control this is also safe, because - * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be - * able to set it to true; - */ - if (must_inc_static_branch) { - static_key_slow_inc(&memcg_kmem_enabled_key); - /* - * setting the active bit after the inc will guarantee no one - * starts accounting before all call sites are patched - */ - memcg_kmem_set_active(memcg); - } - + mutex_unlock(&memcg_create_mutex); #endif return ret; } @@ -5148,15 +5369,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; + /* - * We check this value several times in both in can_attach() and - * attach(), so we need cgroup lock to prevent this value from being - * inconsistent. + * No kind of locking is needed in here, because ->can_attach() will + * check this value once in the beginning of the process, and then carry + * on with stale data. This means that changes to this value will only + * affect task migrations starting after the change. */ - cgroup_lock(); memcg->move_charge_at_immigrate = val; - cgroup_unlock(); - return 0; } #else @@ -5214,14 +5434,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, } #endif /* CONFIG_NUMA */ -static const char * const mem_cgroup_lru_names[] = { - "inactive_anon", - "active_anon", - "inactive_file", - "active_file", - "unevictable", -}; - static inline void mem_cgroup_lru_names_not_uptodate(void) { BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -5335,18 +5547,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, parent = mem_cgroup_from_cont(cgrp->parent); - cgroup_lock(); + mutex_lock(&memcg_create_mutex); /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) { - cgroup_unlock(); + if ((parent->use_hierarchy) || memcg_has_children(memcg)) { + mutex_unlock(&memcg_create_mutex); return -EINVAL; } memcg->swappiness = val; - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return 0; } @@ -5672,7 +5883,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, parent = mem_cgroup_from_cont(cgrp->parent); - cgroup_lock(); + mutex_lock(&memcg_create_mutex); /* oom-kill-disable is a flag for subhierarchy. */ if ((parent->use_hierarchy) || (memcg->use_hierarchy && !list_empty(&cgrp->children))) { @@ -5682,7 +5893,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return 0; } @@ -5797,33 +6008,6 @@ static struct cftype mem_cgroup_files[] = { .read_seq_string = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_SWAP - { - .name = "memsw.usage_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, - }, - { - .name = "memsw.max_usage_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, - }, - { - .name = "memsw.limit_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write_string = mem_cgroup_write, - .read = mem_cgroup_read, - }, - { - .name = "memsw.failcnt", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, - }, -#endif #ifdef CONFIG_MEMCG_KMEM { .name = "kmem.limit_in_bytes", @@ -5855,9 +6039,47 @@ static struct cftype mem_cgroup_files[] = { }, #endif #endif + +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY + { + .name = "dangling_memcgs", + .read_seq_string = mem_cgroup_dangling_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, +#endif { }, /* terminate */ }; +#ifdef CONFIG_MEMCG_SWAP +static struct cftype memsw_cgroup_files[] = { + { + .name = "memsw.usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), + .read = mem_cgroup_read, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, + }, + { + .name = "memsw.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "memsw.limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "memsw.failcnt", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { }, /* terminate */ +}; +#endif static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -5896,9 +6118,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - int size = sizeof(struct mem_cgroup); + size_t size = memcg_size(); - /* Can be very big if MAX_NUMNODES is very big */ + /* Can be very big if nr_node_ids is very big */ if (size < PAGE_SIZE) memcg = kzalloc(size, GFP_KERNEL); else @@ -5935,7 +6157,7 @@ out_free: static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - int size = sizeof(struct mem_cgroup); + size_t size = memcg_size(); mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); @@ -5974,6 +6196,8 @@ static void free_work(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, work_freeing); + + memcg_dangling_free(memcg); __mem_cgroup_free(memcg); } @@ -6017,19 +6241,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); -#ifdef CONFIG_MEMCG_SWAP -static void __init enable_swap_cgroup(void) -{ - if (!mem_cgroup_disabled() && really_do_swap_account) - do_swap_account = 1; -} -#else -static void __init enable_swap_cgroup(void) -{ -} -#endif - -static int mem_cgroup_soft_limit_tree_init(void) +static void __init mem_cgroup_soft_limit_tree_init(void) { struct mem_cgroup_tree_per_node *rtpn; struct mem_cgroup_tree_per_zone *rtpz; @@ -6040,8 +6252,7 @@ static int mem_cgroup_soft_limit_tree_init(void) if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); - if (!rtpn) - goto err_cleanup; + BUG_ON(!rtpn); soft_limit_tree.rb_tree_per_node[node] = rtpn; @@ -6051,23 +6262,12 @@ static int mem_cgroup_soft_limit_tree_init(void) spin_lock_init(&rtpz->lock); } } - return 0; - -err_cleanup: - for_each_node(node) { - if (!soft_limit_tree.rb_tree_per_node[node]) - break; - kfree(soft_limit_tree.rb_tree_per_node[node]); - soft_limit_tree.rb_tree_per_node[node] = NULL; - } - return 1; - } static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup *cont) { - struct mem_cgroup *memcg, *parent; + struct mem_cgroup *memcg; long error = -ENOMEM; int node; @@ -6081,24 +6281,44 @@ mem_cgroup_css_alloc(struct cgroup *cont) /* root ? */ if (cont->parent == NULL) { - int cpu; - enable_swap_cgroup(); - parent = NULL; - if (mem_cgroup_soft_limit_tree_init()) - goto free_out; root_mem_cgroup = memcg; - for_each_possible_cpu(cpu) { - struct memcg_stock_pcp *stock = - &per_cpu(memcg_stock, cpu); - INIT_WORK(&stock->work, drain_local_stock); - } - } else { - parent = mem_cgroup_from_cont(cont->parent); - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; + res_counter_init(&memcg->res, NULL); + res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); } - if (parent && parent->use_hierarchy) { + memcg->last_scanned_node = MAX_NUMNODES; + INIT_LIST_HEAD(&memcg->oom_notify); + atomic_set(&memcg->refcnt, 1); + memcg->move_charge_at_immigrate = 0; + mutex_init(&memcg->thresholds_lock); + spin_lock_init(&memcg->move_lock); + + return &memcg->css; + +free_out: + __mem_cgroup_free(memcg); + return ERR_PTR(error); +} + +static int +mem_cgroup_css_online(struct cgroup *cont) +{ + struct mem_cgroup *memcg, *parent; + int error = 0; + + if (!cont->parent) + return 0; + + mutex_lock(&memcg_create_mutex); + memcg = mem_cgroup_from_cont(cont); + parent = mem_cgroup_from_cont(cont->parent); + + memcg->use_hierarchy = parent->use_hierarchy; + memcg->oom_kill_disable = parent->oom_kill_disable; + memcg->swappiness = mem_cgroup_swappiness(parent); + + if (parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); res_counter_init(&memcg->kmem, &parent->kmem); @@ -6119,20 +6339,12 @@ mem_cgroup_css_alloc(struct cgroup *cont) * much sense so let cgroup subsystem know about this * unfortunate state in our controller. */ - if (parent && parent != root_mem_cgroup) + if (parent != root_mem_cgroup) mem_cgroup_subsys.broken_hierarchy = true; } - memcg->last_scanned_node = MAX_NUMNODES; - INIT_LIST_HEAD(&memcg->oom_notify); - - if (parent) - memcg->swappiness = mem_cgroup_swappiness(parent); - atomic_set(&memcg->refcnt, 1); - memcg->move_charge_at_immigrate = 0; - mutex_init(&memcg->thresholds_lock); - spin_lock_init(&memcg->move_lock); error = memcg_init_kmem(memcg, &mem_cgroup_subsys); + mutex_unlock(&memcg_create_mutex); if (error) { /* * We call put now because our (and parent's) refcnts @@ -6140,12 +6352,10 @@ mem_cgroup_css_alloc(struct cgroup *cont) * call __mem_cgroup_free, so return directly */ mem_cgroup_put(memcg); - return ERR_PTR(error); + if (parent->use_hierarchy) + mem_cgroup_put(parent); } - return &memcg->css; -free_out: - __mem_cgroup_free(memcg); - return ERR_PTR(error); + return error; } static void mem_cgroup_css_offline(struct cgroup *cont) @@ -6162,6 +6372,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) kmem_cgroup_destroy(memcg); + memcg_dangling_add(memcg); mem_cgroup_put(memcg); } @@ -6281,7 +6492,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(&swapper_space, ent.val); + page = find_get_page(swap_address_space(ent), ent.val); if (do_swap_account) entry->val = ent.val; @@ -6322,7 +6533,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) *entry = swap; - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif return page; @@ -6532,8 +6743,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + unsigned long move_charge_at_immigrate; - if (memcg->move_charge_at_immigrate) { + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_charge_at_immigrate = memcg->move_charge_at_immigrate; + if (move_charge_at_immigrate) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); @@ -6553,6 +6771,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, spin_lock(&mc.lock); mc.from = from; mc.to = memcg; + mc.immigrate_flags = move_charge_at_immigrate; spin_unlock(&mc.lock); /* We set mc.moving_task later */ @@ -6747,6 +6966,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", .subsys_id = mem_cgroup_subsys_id, .css_alloc = mem_cgroup_css_alloc, + .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, .can_attach = mem_cgroup_can_attach, @@ -6757,19 +6977,6 @@ struct cgroup_subsys mem_cgroup_subsys = { .use_id = 1, }; -/* - * The rest of init is performed during ->css_alloc() for root css which - * happens before initcalls. hotcpu_notifier() can't be done together as - * it would introduce circular locking by adding cgroup_lock -> cpu hotplug - * dependency. Do it from a subsys_initcall(). - */ -static int __init mem_cgroup_init(void) -{ - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); - return 0; -} -subsys_initcall(mem_cgroup_init); - #ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { @@ -6782,4 +6989,39 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static void __init memsw_file_init(void) +{ + WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); +} + +static void __init enable_swap_cgroup(void) +{ + if (!mem_cgroup_disabled() && really_do_swap_account) { + do_swap_account = 1; + memsw_file_init(); + } +} + +#else +static void __init enable_swap_cgroup(void) +{ +} #endif + +/* + * subsys_initcall() for memory controller. + * + * Some parts like hotcpu_notifier() have to be initialized from this context + * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically + * everything that doesn't depend on a specific mem_cgroup structure should + * be initialized from here. + */ +static int __init mem_cgroup_init(void) +{ + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); + enable_swap_cgroup(); + mem_cgroup_soft_limit_tree_init(); + memcg_stock_init(); + return 0; +} +subsys_initcall(mem_cgroup_init); |