From 30088ad815802f850f26114920ccf9effd4bc520 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:53:46 +0800 Subject: cgroup,rcu: convert call_rcu(free_css_set_rcu) to kfree_rcu() The rcu callback free_css_set_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_css_set_rcu). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 25c7eb52de1a..d5160a83fb35 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } -static void free_css_set_rcu(struct rcu_head *obj) -{ - struct css_set *cg = container_of(obj, struct css_set, rcu_head); - kfree(cg); -} - /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups @@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } write_unlock(&css_set_lock); - call_rcu(&cg->rcu_head, free_css_set_rcu); + kfree_rcu(cg, rcu_head); } /* -- cgit v1.2.3 From f2da1c40dc003939f616f27a103b2592f1424b07 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:55:16 +0800 Subject: cgroup,rcu: convert call_rcu(free_cgroup_rcu) to kfree_rcu() The rcu callback free_cgroup_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_cgroup_rcu). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5160a83fb35..20451ce7195f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -806,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) return ret; } -static void free_cgroup_rcu(struct rcu_head *obj) -{ - struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); - - kfree(cgrp); -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -850,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); - call_rcu(&cgrp->rcu_head, free_cgroup_rcu); + kfree_rcu(cgrp, rcu_head); } iput(inode); } -- cgit v1.2.3 From 025cea99db3fb110ebc8ede5ff647833fab9574f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:56:10 +0800 Subject: cgroup,rcu: convert call_rcu(__free_css_id_cb) to kfree_rcu() The rcu callback __free_css_id_cb() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(__free_css_id_cb). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 20451ce7195f..909a35510af5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4610,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, return ret; } -static void __free_css_id_cb(struct rcu_head *head) -{ - struct css_id *id; - - id = container_of(head, struct css_id, rcu_head); - kfree(id); -} - void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) { struct css_id *id = css->id; @@ -4632,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); spin_unlock(&ss->id_lock); - call_rcu(&id->rcu_head, __free_css_id_cb); + kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); -- cgit v1.2.3 From f780bdb7c1c73009cb57adcf99ef50027d80bf3c Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:19 -0700 Subject: cgroups: add per-thread subsystem callbacks Add cgroup subsystem callbacks for per-thread attachment in atomic contexts Add can_attach_task(), pre_attach(), and attach_task() as new callbacks for cgroups's subsystem interface. Unlike can_attach and attach, these are for per-thread operations, to be called potentially many times when attaching an entire threadgroup. Also, the old "bool threadgroup" interface is removed, as replaced by this. All subsystems are modified for the new interface - of note is cpuset, which requires from/to nodemasks for attach to be globally scoped (though per-cpuset would work too) to persist from its pre_attach to attach_task and attach. This is a pre-patch for cgroup-procs-writable.patch. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 30 ++++++++---- block/blk-cgroup.c | 18 +++----- include/linux/cgroup.h | 10 ++-- kernel/cgroup.c | 17 +++++-- kernel/cgroup_freezer.c | 26 ++++------- kernel/cpuset.c | 96 +++++++++++++++++++-------------------- kernel/sched.c | 38 ++-------------- mm/memcontrol.c | 18 +++----- security/device_cgroup.c | 3 +- 9 files changed, 114 insertions(+), 142 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index aedf1bd02fdd..b3bd3bdbe202 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -575,7 +575,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be called multiple times against a cgroup. int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *task, bool threadgroup) + struct task_struct *task) (cgroup_mutex held by caller) Called prior to moving a task into a cgroup; if the subsystem @@ -584,9 +584,14 @@ task is passed, then a successful result indicates that *any* unspecified task can be moved into the cgroup. Note that this isn't called on a fork. If this method returns 0 (success) then this should remain valid while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. If threadgroup is -true, then a successful result indicates that all threads in the given -thread's threadgroup can be moved together. +attach() or cancel_attach() will be called in future. + +int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk); +(cgroup_mutex held by caller) + +As can_attach, but for operations that must be run once per task to be +attached (possibly many when using cgroup_attach_proc). Called after +can_attach. void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *task, bool threadgroup) @@ -598,15 +603,24 @@ function, so that the subsystem can implement a rollback. If not, not necessary. This will be called only about subsystems whose can_attach() operation have succeeded. +void pre_attach(struct cgroup *cgrp); +(cgroup_mutex held by caller) + +For any non-per-thread attachment work that needs to happen before +attach_task. Needed by cpuset. + void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) + struct cgroup *old_cgrp, struct task_struct *task) (cgroup_mutex held by caller) Called after the task has been attached to the cgroup, to allow any post-attachment activity that requires memory allocations or blocking. -If threadgroup is true, the subsystem should take care of all threads -in the specified thread's threadgroup. Currently does not support any + +void attach_task(struct cgroup *cgrp, struct task_struct *tsk); +(cgroup_mutex held by caller) + +As attach, but for operations that must be run once per task to be attached, +like can_attach_task. Called before attach. Currently does not support any subsystem that might need the old_cgrp for every thread in the group. void fork(struct cgroup_subsy *ss, struct task_struct *task) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 07371cfdfae6..bcaf16ee6ad1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup); static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct task_struct *, bool); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup *, struct task_struct *, bool); +static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); +static void blkiocg_attach_task(struct cgroup *, struct task_struct *); static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); @@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, + .can_attach_task = blkiocg_can_attach_task, + .attach_task = blkiocg_attach_task, .destroy = blkiocg_destroy, .populate = blkiocg_populate, #ifdef CONFIG_BLK_CGROUP @@ -1616,9 +1614,7 @@ done: * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkiocg_can_attach(struct cgroup_subsys *subsys, - struct cgroup *cgroup, struct task_struct *tsk, - bool threadgroup) +static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; int ret = 0; @@ -1633,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys, return ret; } -static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct cgroup *prev, struct task_struct *tsk, - bool threadgroup) +static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5ac7ebc36dbb..1e6cde21fa3f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -467,12 +467,14 @@ struct cgroup_subsys { int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup); + struct task_struct *tsk); + int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk); void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup); + struct task_struct *tsk); + void (*pre_attach)(struct cgroup *cgrp); + void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk); void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *tsk, - bool threadgroup); + struct cgroup *old_cgrp, struct task_struct *tsk); void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a35510af5..38fb0ad1cb46 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1759,7 +1759,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk, false); + retval = ss->can_attach(ss, cgrp, tsk); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1771,6 +1771,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } + if (ss->can_attach_task) { + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + goto out; + } + } } task_lock(tsk); @@ -1805,8 +1812,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) write_unlock(&css_set_lock); for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + if (ss->attach_task) + ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk, false); + ss->attach(ss, cgrp, oldcgrp, tsk); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1829,7 +1840,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk, false); + ss->cancel_attach(ss, cgrp, tsk); } } return retval; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e7bebb7c6c38..e691818d7e45 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss, */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) + struct task_struct *task) { struct freezer *freezer; @@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state != CGROUP_THAWED) return -EBUSY; + return 0; +} + +static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ rcu_read_lock(); - if (__cgroup_freezing_or_frozen(task)) { + if (__cgroup_freezing_or_frozen(tsk)) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); - - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (__cgroup_freezing_or_frozen(c)) { - rcu_read_unlock(); - return -EBUSY; - } - } - rcu_read_unlock(); - } - return 0; } @@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = { .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, + .can_attach_task = freezer_can_attach_task, + .pre_attach = NULL, + .attach_task = NULL, .attach = NULL, .fork = freezer_fork, .exit = NULL, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2bb8c2e98fff..55b297d78adc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Protected by cgroup_lock */ -static cpumask_var_t cpus_attach; - /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk, bool threadgroup) + struct task_struct *tsk) { - int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk); - if (ret) - return ret; - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c); - if (ret) { - rcu_read_unlock(); - return ret; - } - } - rcu_read_unlock(); - } return 0; } -static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, - struct cpuset *cs) +static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) +{ + return security_task_setscheduler(task); +} + +/* + * Protected by cgroup_lock. The nodemasks must be stored globally because + * dynamically allocating them is not allowed in pre_attach, and they must + * persist among pre_attach, attach_task, and attach. + */ +static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_from; +static nodemask_t cpuset_attach_nodemask_to; + +/* Set-up work for before attaching each task. */ +static void cpuset_pre_attach(struct cgroup *cont) +{ + struct cpuset *cs = cgroup_cs(cont); + + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +} + +/* Per-thread attachment work. */ +static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) { int err; + struct cpuset *cs = cgroup_cs(cont); + /* * can_attach beforehand should guarantee that this doesn't fail. * TODO: have a better way to handle failure here @@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - cpuset_change_task_nodemask(tsk, to); + cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, tsk); - } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk, - bool threadgroup) + struct cgroup *oldcont, struct task_struct *tsk) { struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - static nodemask_t to; /* protected by cgroup_mutex */ - if (cs == &top_cpuset) { - cpumask_copy(cpus_attach, cpu_possible_mask); - } else { - guarantee_online_cpus(cs, cpus_attach); - } - guarantee_online_mems(cs, &to); - - /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); - } - rcu_read_unlock(); - } - - /* change mm; only needs to be done once even if threadgroup */ - to = cs->mems_allowed; + /* + * Change mm, possibly for multiple threads in a threadgroup. This is + * expensive and may sleep. + */ + cpuset_attach_nodemask_from = oldcs->mems_allowed; + cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); + cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + &cpuset_attach_nodemask_to); mmput(mm); } } @@ -1911,6 +1904,9 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, + .can_attach_task = cpuset_can_attach_task, + .pre_attach = cpuset_pre_attach, + .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, diff --git a/kernel/sched.c b/kernel/sched.c index 2d12893b8b0f..5e43e9dc65d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8764,42 +8764,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) return 0; } -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup) -{ - int retval = cpu_cgroup_can_attach_task(cgrp, tsk); - if (retval) - return retval; - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - retval = cpu_cgroup_can_attach_task(cgrp, c); - if (retval) { - rcu_read_unlock(); - return retval; - } - } - rcu_read_unlock(); - } - return 0; -} - static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk, - bool threadgroup) +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { sched_move_task(tsk); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - sched_move_task(c); - } - rcu_read_unlock(); - } } static void @@ -8887,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, + .can_attach_task = cpu_cgroup_can_attach_task, + .attach_task = cpu_cgroup_attach_task, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5fd3dcd3f2e..fc259926c170 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4953,8 +4953,7 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { int ret = 0; struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); @@ -4993,8 +4992,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { mem_cgroup_clear_mc(); } @@ -5112,8 +5110,7 @@ retry: static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { struct mm_struct *mm; @@ -5131,22 +5128,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { return 0; } static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } #endif diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 8d9c48f13774..cd1f779fa51d 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -62,8 +62,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; static int devcgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task, - bool threadgroup) + struct cgroup *new_cgroup, struct task_struct *task) { if (current != task && !capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3 From 74a1166dfe1135dcc168d35fa5261aa7e087011b Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:20 -0700 Subject: cgroups: make procs file writable Make procs file writable to move all threads by tgid at once. Add functionality that enables users to move all threads in a threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs' file. This current implementation makes use of a per-threadgroup rwsem that's taken for reading in the fork() path to prevent newly forking threads within the threadgroup from "escaping" while the move is in progress. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 9 +- kernel/cgroup.c | 439 ++++++++++++++++++++++++++++++++++---- 2 files changed, 401 insertions(+), 47 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index b3bd3bdbe202..8c4f3466c894 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -236,7 +236,8 @@ containing the following files describing that cgroup: - cgroup.procs: list of tgids in the cgroup. This list is not guaranteed to be sorted or free of duplicate tgids, and userspace should sort/uniquify the list if this property is required. - This is a read-only file, for now. + Writing a thread group id into this file moves all threads in that + group into this cgroup. - notify_on_release flag: run the release agent on exit? - release_agent: the path to use for release notifications (this file exists in the top cgroup only) @@ -430,6 +431,12 @@ You can attach the current shell task by echoing 0: # echo 0 > tasks +You can use the cgroup.procs file instead of the tasks file to move all +threads in a threadgroup at once. Echoing the pid of any task in a +threadgroup to cgroup.procs causes all tasks in that threadgroup to be +be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +in the writing task's threadgroup. + Note: Since every task is always a member of exactly one cgroup in each mounted hierarchy, to remove a task from its current cgroup you must move it into a new cgroup (possibly the root cgroup) by writing to the diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38fb0ad1cb46..5e6a9745f0e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1735,6 +1735,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +/* + * cgroup_task_migrate - move a task from one cgroup to another. + * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, bool guarantee) +{ + struct css_set *oldcg; + struct css_set *newcg; + + /* + * get old css_set. we need to take task_lock and refcount it, because + * an exiting task can change its css_set to init_css_set and drop its + * old one without taking cgroup_mutex. + */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + + /* locate or allocate a new css_set for this task. */ + if (guarantee) { + /* we know the css_set we want already exists. */ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + read_lock(&css_set_lock); + newcg = find_existing_css_set(oldcg, cgrp, template); + BUG_ON(!newcg); + get_css_set(newcg); + read_unlock(&css_set_lock); + } else { + might_sleep(); + /* find_css_set will give us newcg already referenced. */ + newcg = find_css_set(oldcg, cgrp); + if (!newcg) { + put_css_set(oldcg); + return -ENOMEM; + } + } + put_css_set(oldcg); + + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); + write_unlock(&css_set_lock); + + /* + * We just gained a reference on oldcg by taking it from the task. As + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. + */ + put_css_set(oldcg); + + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + return 0; +} + /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1745,11 +1815,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval = 0; + int retval; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; - struct css_set *cg; - struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; /* Nothing to do if the task is already in that cgroup */ @@ -1780,36 +1848,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - task_lock(tsk); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - /* - * Locate or allocate a new css_set for this task, - * based on its final set of cgroups - */ - newcg = find_css_set(cg, cgrp); - put_css_set(cg); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - retval = -ESRCH; + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); + if (retval) goto out; - } - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); - write_unlock(&css_set_lock); for_each_subsys(root, ss) { if (ss->pre_attach) @@ -1819,9 +1860,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) if (ss->attach) ss->attach(ss, cgrp, oldcgrp, tsk); } - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + synchronize_rcu(); - put_css_set(cg); /* * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -1871,49 +1911,356 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. May take task_lock of task + * cgroup_attach_proc works in two stages, the first of which prefetches all + * new css_sets needed (to make sure we have enough memory before committing + * to the move) and stores them in a list of entries of the following type. + * TODO: possible optimization: use css_set->rcu_head for chaining instead + */ +struct cg_list_entry { + struct css_set *cg; + struct list_head links; +}; + +static bool css_set_check_fetched(struct cgroup *cgrp, + struct task_struct *tsk, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + + read_lock(&css_set_lock); + newcg = find_existing_css_set(cg, cgrp, template); + if (newcg) + get_css_set(newcg); + read_unlock(&css_set_lock); + + /* doesn't exist at all? */ + if (!newcg) + return false; + /* see if it's already in the list */ + list_for_each_entry(cg_entry, newcg_list, links) { + if (cg_entry->cg == newcg) { + put_css_set(newcg); + return true; + } + } + + /* not found */ + put_css_set(newcg); + return false; +} + +/* + * Find the new css_set and store it in the list in preparation for moving the + * given task to the given cgroup. Returns 0 or -ENOMEM. + */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + + /* ensure a new css_set will exist for this thread */ + newcg = find_css_set(cg, cgrp); + if (!newcg) + return -ENOMEM; + /* add it to the list */ + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); + if (!cg_entry) { + put_css_set(newcg); + return -ENOMEM; + } + cg_entry->cg = newcg; + list_add(&cg_entry->links, newcg_list); + return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ + int retval, i, group_size; + struct cgroup_subsys *ss, *failed_ss = NULL; + bool cancel_failed_ss = false; + /* guaranteed to be initialized later, but the compiler needs this */ + struct cgroup *oldcgrp = NULL; + struct css_set *oldcg; + struct cgroupfs_root *root = cgrp->root; + /* threadgroup list cursor and array */ + struct task_struct *tsk; + struct task_struct **group; + /* + * we need to make sure we have css_sets for all the tasks we're + * going to move -before- we actually start moving them, so that in + * case we get an ENOMEM we can bail out before making any changes. + */ + struct list_head newcg_list; + struct cg_list_entry *cg_entry, *temp_nobe; + + /* + * step 0: in order to do expensive, possibly blocking operations for + * every thread, we cannot iterate the thread group list, since it needs + * rcu or tasklist locked. instead, build an array of all threads in the + * group - threadgroup_fork_lock prevents new threads from appearing, + * and if threads exit, this will just be an over-estimate. + */ + group_size = get_nr_threads(leader); + group = kmalloc(group_size * sizeof(*group), GFP_KERNEL); + if (!group) + return -ENOMEM; + + /* prevent changes to the threadgroup list while we take a snapshot. */ + rcu_read_lock(); + if (!thread_group_leader(leader)) { + /* + * a race with de_thread from another thread's exec() may strip + * us of our leadership, making while_each_thread unsafe to use + * on this task. if this happens, there is no choice but to + * throw this task away and try again (from cgroup_procs_write); + * this is "double-double-toil-and-trouble-check locking". + */ + rcu_read_unlock(); + retval = -EAGAIN; + goto out_free_group_list; + } + /* take a reference on each task in the group to go in the array. */ + tsk = leader; + i = 0; + do { + /* as per above, nr_threads may decrease, but not increase. */ + BUG_ON(i >= group_size); + get_task_struct(tsk); + group[i] = tsk; + i++; + } while_each_thread(leader, tsk); + /* remember the number of threads in the array for later. */ + group_size = i; + rcu_read_unlock(); + + /* + * step 1: check that we can legitimately attach to the cgroup. + */ + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, leader); + if (retval) { + failed_ss = ss; + goto out_cancel_attach; + } + } + /* a callback to be run on every thread in the threadgroup. */ + if (ss->can_attach_task) { + /* run on each task in the threadgroup. */ + for (i = 0; i < group_size; i++) { + retval = ss->can_attach_task(cgrp, group[i]); + if (retval) { + failed_ss = ss; + cancel_failed_ss = true; + goto out_cancel_attach; + } + } + } + } + + /* + * step 2: make sure css_sets exist for all threads to be migrated. + * we use find_css_set, which allocates a new one if necessary. + */ + INIT_LIST_HEAD(&newcg_list); + for (i = 0; i < group_size; i++) { + tsk = group[i]; + /* nothing to do if this task is already in the cgroup */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* get old css_set pointer */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + /* ignore this task if it's going away */ + task_unlock(tsk); + continue; + } + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + /* see if the new one for us is already in the list? */ + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + /* was already there, nothing to do. */ + put_css_set(oldcg); + } else { + /* we don't already have it. get new one. */ + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + put_css_set(oldcg); + if (retval) + goto out_list_teardown; + } + } + + /* + * step 3: now that we're guaranteed success wrt the css_sets, proceed + * to move all tasks to the new cgroup, calling ss->attach_task for each + * one along the way. there are no failure cases after here, so this is + * the commit point. + */ + for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + } + for (i = 0; i < group_size; i++) { + tsk = group[i]; + /* leave current thread as it is if it's already there */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + /* if the thread is PF_EXITING, it can just get skipped. */ + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + BUG_ON(retval != 0 && retval != -ESRCH); + } + /* nothing is sensitive to fork() after this point. */ + + /* + * step 4: do expensive, non-thread-specific subsystem callbacks. + * TODO: if ever a subsystem needs to know the oldcgrp for each task + * being moved, this call will need to be reworked to communicate that. + */ + for_each_subsys(root, ss) { + if (ss->attach) + ss->attach(ss, cgrp, oldcgrp, leader); + } + + /* + * step 5: success! and cleanup + */ + synchronize_rcu(); + cgroup_wakeup_rmdir_waiter(cgrp); + retval = 0; +out_list_teardown: + /* clean up the list of prefetched css_sets. */ + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { + list_del(&cg_entry->links); + put_css_set(cg_entry->cg); + kfree(cg_entry); + } +out_cancel_attach: + /* same deal as in cgroup_attach_task */ + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) { + if (cancel_failed_ss && ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + break; + } + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + } + } + /* clean up the array of referenced threads in the group. */ + for (i = 0; i < group_size; i++) + put_task_struct(group[i]); +out_free_group_list: + kfree(group); + return retval; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk || tsk->flags & PF_EXITING) { + if (!tsk) { rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + if (threadgroup) { + /* + * RCU protects this access, since tsk was found in the + * tid map. a race with de_thread may cause group_leader + * to stop being the leader, but cgroup_attach_proc will + * detect it later. + */ + tsk = tsk->group_leader; + } else if (tsk->flags & PF_EXITING) { + /* optimization for the single-task-only case */ + rcu_read_unlock(); + cgroup_unlock(); return -ESRCH; } + /* + * even if we're attaching all tasks in the thread group, we + * only need to check permissions on one of them. + */ tcred = __task_cred(tsk); if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); + cgroup_unlock(); return -EACCES; } get_task_struct(tsk); rcu_read_unlock(); } else { - tsk = current; + if (threadgroup) + tsk = current->group_leader; + else + tsk = current; get_task_struct(tsk); } - ret = cgroup_attach_task(cgrp, tsk); + if (threadgroup) { + threadgroup_fork_write_lock(tsk); + ret = cgroup_attach_proc(cgrp, tsk); + threadgroup_fork_write_unlock(tsk); + } else { + ret = cgroup_attach_task(cgrp, tsk); + } put_task_struct(tsk); + cgroup_unlock(); return ret; } static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +{ + return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { int ret; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - ret = attach_task_by_pid(cgrp, pid); - cgroup_unlock(); + do { + /* + * attach_proc fails with -EAGAIN if threadgroup leadership + * changes in the middle of the operation, in which case we need + * to find the task_struct for the new leader and start over. + */ + ret = attach_task_by_pid(cgrp, tgid, true); + } while (ret == -EAGAIN); return ret; } @@ -3270,9 +3617,9 @@ static struct cftype files[] = { { .name = CGROUP_FILE_GENERIC_PREFIX "procs", .open = cgroup_procs_open, - /* .write_u64 = cgroup_procs_write, TODO */ + .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, - .mode = S_IRUGO, + .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", -- cgit v1.2.3 From d846687d7f84e45f23ecf3846dbb43312a1206dd Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:21 -0700 Subject: cgroups: use flex_array in attach_proc Convert cgroup_attach_proc to use flex_array. The cgroup_attach_proc implementation requires a pre-allocated array to store task pointers to atomically move a thread-group, but asking for a monolithic array with kmalloc() may be unreliable for very large groups. Using flex_array provides the same functionality with less risk of failure. This is a post-patch for cgroup-procs-write.patch. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5e6a9745f0e7..00a884342d3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@ #include /* TODO: replace with more sophisticated array */ #include #include +#include /* used in cgroup_attach_proc */ #include @@ -1995,7 +1996,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; - struct task_struct **group; + struct flex_array *group; /* * we need to make sure we have css_sets for all the tasks we're * going to move -before- we actually start moving them, so that in @@ -2012,9 +2013,15 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * and if threads exit, this will just be an over-estimate. */ group_size = get_nr_threads(leader); - group = kmalloc(group_size * sizeof(*group), GFP_KERNEL); + /* flex_array supports very large thread-groups better than kmalloc. */ + group = flex_array_alloc(sizeof(struct task_struct *), group_size, + GFP_KERNEL); if (!group) return -ENOMEM; + /* pre-allocate to guarantee space while iterating in rcu read-side. */ + retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + if (retval) + goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ rcu_read_lock(); @@ -2037,7 +2044,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); get_task_struct(tsk); - group[i] = tsk; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ + retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ @@ -2059,7 +2071,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (ss->can_attach_task) { /* run on each task in the threadgroup. */ for (i = 0; i < group_size; i++) { - retval = ss->can_attach_task(cgrp, group[i]); + tsk = flex_array_get_ptr(group, i); + retval = ss->can_attach_task(cgrp, tsk); if (retval) { failed_ss = ss; cancel_failed_ss = true; @@ -2075,7 +2088,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { - tsk = group[i]; + tsk = flex_array_get_ptr(group, i); /* nothing to do if this task is already in the cgroup */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) @@ -2114,7 +2127,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) ss->pre_attach(cgrp); } for (i = 0; i < group_size; i++) { - tsk = group[i]; + tsk = flex_array_get_ptr(group, i); /* leave current thread as it is if it's already there */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) @@ -2167,10 +2180,12 @@ out_cancel_attach: } } /* clean up the array of referenced threads in the group. */ - for (i = 0; i < group_size; i++) - put_task_struct(group[i]); + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + put_task_struct(tsk); + } out_free_group_list: - kfree(group); + flex_array_free(group); return retval; } -- cgit v1.2.3 From a77aea92010acf54ad785047234418d5d68772e2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 26 May 2011 16:25:23 -0700 Subject: cgroup: remove the ns_cgroup The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier and leads to some problems: * cgroup creation is out-of-control * cgroup name can conflict when pids are looping * it is not possible to have a single process handling a lot of namespaces without falling in a exponential creation time * we may want to create a namespace without creating a cgroup The ns_cgroup was replaced by a compatibility flag 'clone_children', where a newly created cgroup will copy the parent cgroup values. The userspace has to manually create a cgroup and add a task to the 'tasks' file. This patch removes the ns_cgroup as suggested in the following thread: https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html The 'cgroup_clone' function is removed because it is no longer used. This is a userspace-visible change. Commit 45531757b45c ("cgroup: notify ns_cgroup deprecated") (merged into 2.6.27) caused the kernel to emit a printk warning users that the feature is planned for removal. Since that time we have heard from XXX users who were affected by this. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. Biederman Cc: Jamal Hadi Salim Reviewed-by: Li Zefan Acked-by: Paul Menage Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 2 +- arch/mips/configs/bcm47xx_defconfig | 1 - arch/mn10300/configs/asb2364_defconfig | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/sh/configs/apsh4ad0a_defconfig | 1 - arch/sh/configs/sdk7786_defconfig | 1 - arch/sh/configs/se7206_defconfig | 1 - arch/sh/configs/shx3_defconfig | 1 - arch/sh/configs/urquell_defconfig | 1 - arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - include/linux/cgroup.h | 3 - include/linux/cgroup_subsys.h | 6 -- include/linux/nsproxy.h | 9 --- init/Kconfig | 8 --- kernel/Makefile | 1 - kernel/cgroup.c | 116 -------------------------------- kernel/cpuset.c | 7 +- kernel/fork.c | 6 -- kernel/ns_cgroup.c | 118 --------------------------------- kernel/nsproxy.c | 4 -- 22 files changed, 4 insertions(+), 287 deletions(-) delete mode 100644 kernel/ns_cgroup.c (limited to 'kernel/cgroup.c') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 8c4f3466c894..0ed99f08f1f3 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -651,7 +651,7 @@ always handled well. void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) (cgroup_mutex held by caller) -Called at the end of cgroup_clone() to do any parameter +Called during cgroup_create() to do any parameter initialization which might be required before a task could attach. For example in cpusets, no task may attach before 'cpus' and 'mems' are set up. diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index 22fdf2f0cc23..ad15fb10322b 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -16,7 +16,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_TINY_RCU=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/mn10300/configs/asb2364_defconfig b/arch/mn10300/configs/asb2364_defconfig index 31d76261a3d5..fbb96ae3122a 100644 --- a/arch/mn10300/configs/asb2364_defconfig +++ b/arch/mn10300/configs/asb2364_defconfig @@ -8,7 +8,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 214208924a9c..04360f9b0109 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -10,7 +10,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 7de13865508c..c9f212b5f3de 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -15,7 +15,6 @@ CONFIG_AUDITSYSCALL=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 77ec0e7b8ddf..e7583484cc07 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -7,7 +7,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index c41650572d79..8a7dd7b59c5c 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -12,7 +12,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index a468ff227fc6..72c3fad7383f 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -8,7 +8,6 @@ CONFIG_RCU_TRACE=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig index 3f92d37c6374..6bb413036892 100644 --- a/arch/sh/configs/shx3_defconfig +++ b/arch/sh/configs/shx3_defconfig @@ -9,7 +9,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index 7b3daec6fefe..8bfa4d056d7a 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -9,7 +9,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 6f9872658dd2..2bf18059fbea 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index ee01a9d5d4f0..22a0dc8e51dd 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1e6cde21fa3f..ab4ac0ccb857 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -555,9 +555,6 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss, - char *nodename); - /* A cgroup_iter should be treated as an opaque object */ struct cgroup_iter { struct list_head *cg_link; diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index cdbfcb8780ec..ac663c18776c 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -19,12 +19,6 @@ SUBSYS(debug) /* */ -#ifdef CONFIG_CGROUP_NS -SUBSYS(ns) -#endif - -/* */ - #ifdef CONFIG_CGROUP_SCHED SUBSYS(cpu_cgroup) #endif diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 7b370c7cfeff..50d20aba57d3 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -81,13 +81,4 @@ static inline void get_nsproxy(struct nsproxy *ns) atomic_inc(&ns->count); } -#ifdef CONFIG_CGROUP_NS -int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid); -#else -static inline int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid) -{ - return 0; -} -#endif - #endif diff --git a/init/Kconfig b/init/Kconfig index 332aac649966..ebafac4231ee 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -589,14 +589,6 @@ config CGROUP_DEBUG Say N if unsure. -config CGROUP_NS - bool "Namespace cgroup subsystem" - help - Provides a simple namespace cgroup subsystem to - provide hierarchical naming of sets of namespaces, - for instance virtual servers and checkpoint/restart - jobs. - config CGROUP_FREEZER bool "Freezer cgroup subsystem" help diff --git a/kernel/Makefile b/kernel/Makefile index e9cf19155b46..2d64cfcc8b42 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o -obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 00a884342d3d..2731d115d725 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4629,122 +4629,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) put_css_set_taskexit(cg); } -/** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. - */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, - char *nodename) -{ - struct dentry *dentry; - int ret = 0; - struct cgroup *parent, *child; - struct inode *inode; - struct css_set *cg; - struct cgroupfs_root *root; - struct cgroup_subsys *ss; - - /* We shouldn't be called by an unregistered subsystem */ - BUG_ON(!subsys->active); - - /* First figure out what hierarchy and cgroup we're dealing - * with, and pin them so we can drop cgroup_mutex */ - mutex_lock(&cgroup_mutex); - again: - root = subsys->root; - if (root == &rootnode) { - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&root->sb->s_active)) { - /* We race with the final deactivate_super() */ - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Keep the cgroup alive */ - task_lock(tsk); - parent = task_cgroup(tsk, subsys->subsys_id); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - - mutex_unlock(&cgroup_mutex); - - /* Now do the VFS work to create a cgroup */ - inode = parent->dentry->d_inode; - - /* Hold the parent directory mutex across this operation to - * stop anyone else deleting the new cgroup */ - mutex_lock(&inode->i_mutex); - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); - if (IS_ERR(dentry)) { - printk(KERN_INFO - "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, - PTR_ERR(dentry)); - ret = PTR_ERR(dentry); - goto out_release; - } - - /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, 0755); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { - printk(KERN_INFO - "Failed to create cgroup %s: %d\n", nodename, - ret); - goto out_release; - } - - /* The cgroup now exists. Retake cgroup_mutex and check - * that we're still in the same state that we thought we - * were. */ - mutex_lock(&cgroup_mutex); - if ((root != subsys->root) || - (parent != task_cgroup(tsk, subsys->subsys_id))) { - /* Aargh, we raced ... */ - mutex_unlock(&inode->i_mutex); - put_css_set(cg); - - deactivate_super(root->sb); - /* The cgroup is still accessible in the VFS, but - * we're not going to try to rmdir() it at this - * point. */ - printk(KERN_INFO - "Race in cgroup_clone() - leaking cgroup %s\n", - nodename); - goto again; - } - - /* do any required auto-setup */ - for_each_subsys(root, ss) { - if (ss->post_clone) - ss->post_clone(ss, child); - } - - /* All seems fine. Finish by moving the task into the new cgroup */ - ret = cgroup_attach_task(child, tsk); - mutex_unlock(&cgroup_mutex); - - out_release: - mutex_unlock(&inode->i_mutex); - - mutex_lock(&cgroup_mutex); - put_css_set(cg); - mutex_unlock(&cgroup_mutex); - deactivate_super(root->sb); - return ret; -} - /** * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 55b297d78adc..1ceeb049c827 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1802,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) } /* - * post_clone() is called at the end of cgroup_clone(). - * 'cgroup' was just created automatically as a result of - * a cgroup_clone(), and the current task is about to - * be moved into 'cgroup'. + * post_clone() is called during cgroup_create() when the + * clone_children mount argument was specified. The cgroup + * can not yet have any tasks. * * Currently we refuse to set up the cgroup - thereby * refusing the task to be entered, and as a result refusing diff --git a/kernel/fork.c b/kernel/fork.c index 1fa9d940e301..1f84099ecce6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1229,12 +1229,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; - if (current->nsproxy != p->nsproxy) { - retval = ns_cgroup_clone(p, pid); - if (retval) - goto bad_fork_free_pid; - } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2c98ad94ba0e..000000000000 --- a/kernel/ns_cgroup.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * ns_cgroup.c - namespace cgroup subsystem - * - * Copyright 2006, 2007 IBM Corp - */ - -#include -#include -#include -#include -#include -#include - -struct ns_cgroup { - struct cgroup_subsys_state css; -}; - -struct cgroup_subsys ns_subsys; - -static inline struct ns_cgroup *cgroup_to_ns( - struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), - struct ns_cgroup, css); -} - -int ns_cgroup_clone(struct task_struct *task, struct pid *pid) -{ - char name[PROC_NUMBUF]; - - snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); - return cgroup_clone(task, &ns_subsys, name); -} - -/* - * Rules: - * 1. you can only enter a cgroup which is a descendant of your current - * cgroup - * 2. you can only place another process into a cgroup if - * a. you have CAP_SYS_ADMIN - * b. your cgroup is an ancestor of task's destination cgroup - * (hence either you are in the same cgroup as task, or in an - * ancestor cgroup thereof) - */ -static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) -{ - if (current != task) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!cgroup_is_descendant(new_cgroup, current)) - return -EPERM; - } - - if (!cgroup_is_descendant(new_cgroup, task)) - return -EPERM; - - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (!cgroup_is_descendant(new_cgroup, c)) { - rcu_read_unlock(); - return -EPERM; - } - } - rcu_read_unlock(); - } - - return 0; -} - -/* - * Rules: you can only create a cgroup if - * 1. you are capable(CAP_SYS_ADMIN) - * 2. the target cgroup is a descendant of your own cgroup - */ -static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup, current)) - return ERR_PTR(-EPERM); - if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { - printk("ns_cgroup can't be created with parent " - "'clone_children' set.\n"); - return ERR_PTR(-EINVAL); - } - - printk_once("ns_cgroup deprecated: consider using the " - "'clone_children' flag without the ns_cgroup.\n"); - - ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); - if (!ns_cgroup) - return ERR_PTR(-ENOMEM); - return &ns_cgroup->css; -} - -static void ns_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - ns_cgroup = cgroup_to_ns(cgroup); - kfree(ns_cgroup); -} - -struct cgroup_subsys ns_subsys = { - .name = "ns", - .can_attach = ns_can_attach, - .create = ns_create, - .destroy = ns_destroy, - .subsys_id = ns_subsys_id, -}; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5424e37673ed..d6a00f3de15d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -201,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, goto out; } - err = ns_cgroup_clone(current, task_pid(current)); - if (err) - put_nsproxy(*new_nsp); - out: return err; } -- cgit v1.2.3