From cdcc136ffd264849a943acb42c36ffe9b458f811 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 25 Jul 2009 16:47:45 +0200
Subject: locking, sched, cgroups: Annotate release_list_lock as raw

The release_list_lock can be taken in atomic context and therefore
cannot be preempted on -rt - annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 kernel/cgroup.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95d..453100a4159d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list)
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
 static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
@@ -4014,11 +4014,11 @@ again:
 	finish_wait(&cgroup_rmdir_waitq, &wait);
 	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

-	spin_lock(&release_list_lock);
+	raw_spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
-	spin_unlock(&release_list_lock);
+	raw_spin_unlock(&release_list_lock);

 	cgroup_lock_hierarchy(cgrp->root);
 	/* delete this cgroup from parent->children */
@@ -4671,13 +4671,13 @@ static void check_for_release(struct cgroup *cgrp)
 		 * already queued for a userspace notification, queue
 		 * it now */
 		int need_schedule_work = 0;
-		spin_lock(&release_list_lock);
+		raw_spin_lock(&release_list_lock);
 		if (!cgroup_is_removed(cgrp) &&
 		    list_empty(&cgrp->release_list)) {
 			list_add(&cgrp->release_list, &release_list);
 			need_schedule_work = 1;
 		}
-		spin_unlock(&release_list_lock);
+		raw_spin_unlock(&release_list_lock);
 		if (need_schedule_work)
 			schedule_work(&release_agent_work);
 	}
@@ -4729,7 +4729,7 @@ static void cgroup_release_agent(struct work_struct *work)
 {
 	BUG_ON(work != &release_agent_work);
 	mutex_lock(&cgroup_mutex);
-	spin_lock(&release_list_lock);
+	raw_spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
 		int i;
@@ -4738,7 +4738,7 @@ static void cgroup_release_agent(struct work_struct *work)
 				    struct cgroup,
 				    release_list);
 		list_del_init(&cgrp->release_list);
-		spin_unlock(&release_list_lock);
+		raw_spin_unlock(&release_list_lock);
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
@@ -4768,9 +4768,9 @@ static void cgroup_release_agent(struct work_struct *work)
  continue_free:
 		kfree(pathbuf);
 		kfree(agentbuf);
-		spin_lock(&release_list_lock);
+		raw_spin_lock(&release_list_lock);
 	}
-	spin_unlock(&release_list_lock);
+	raw_spin_unlock(&release_list_lock);
 	mutex_unlock(&cgroup_mutex);
 }
--
cgit v1.2.3
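The change above follows the standard PREEMPT_RT pattern: a lock that must be taken in atomic context is declared with DEFINE_RAW_SPINLOCK() and used through the raw_spin_lock() family, so it stays a true spinning lock even where -rt turns ordinary spinlocks into sleeping locks. The sketch below is illustrative only; the list and helper names are made up and do not come from the patch.

#include <linux/list.h>
#include <linux/spinlock.h>

/* illustrative stand-ins for release_list / release_list_lock */
static LIST_HEAD(example_release_list);
static DEFINE_RAW_SPINLOCK(example_release_lock);

static void example_queue_for_release(struct list_head *entry)
{
        /* raw_spin_lock() never sleeps, so this is safe in atomic context */
        raw_spin_lock(&example_release_lock);
        if (list_empty(entry))  /* entry must have been set up with INIT_LIST_HEAD() */
                list_add(entry, &example_release_list);
        raw_spin_unlock(&example_release_lock);
}
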
From 33ef6b6984403a688189317ef46bb3caab3b70e0 Mon Sep 17 00:00:00 2001
From: Ben Blum
Date: Wed, 2 Nov 2011 13:38:05 -0700
Subject: cgroups: more safe tasklist locking in cgroup_attach_proc

Fix unstable tasklist locking in cgroup_attach_proc.

According to this thread - https://lkml.org/lkml/2011/7/27/243 - RCU is
not sufficient to guarantee the tasklist is stable w.r.t. de_thread and
exit.  Taking tasklist_lock for reading, instead of rcu_read_lock,
ensures proper exclusion.

Signed-off-by: Ben Blum
Acked-by: Paul Menage
Cc: Oleg Nesterov
Cc: Frederic Weisbecker
Cc: "Paul E. McKenney"
Cc: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 453100a4159d..64b0e73402df 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		goto out_free_group_list;

 	/* prevent changes to the threadgroup list while we take a snapshot. */
-	rcu_read_lock();
+	read_lock(&tasklist_lock);
 	if (!thread_group_leader(leader)) {
 		/*
 		 * a race with de_thread from another thread's exec() may strip
@@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		 * throw this task away and try again (from cgroup_procs_write);
 		 * this is "double-double-toil-and-trouble-check locking".
 		 */
-		rcu_read_unlock();
+		read_unlock(&tasklist_lock);
 		retval = -EAGAIN;
 		goto out_free_group_list;
 	}
@@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	} while_each_thread(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
-	rcu_read_unlock();
+	read_unlock(&tasklist_lock);

 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
--
cgit v1.2.3
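The point of the patch above is that rcu_read_lock() only keeps the task structures from being freed; it does not keep thread-group membership stable against de_thread() and exit. Holding read_lock(&tasklist_lock) for the duration of the walk does. A rough sketch of that snapshot pattern follows; the helper name is hypothetical and the while_each_thread() iterator is the one available in the kernels these patches target.

#include <linux/sched.h>

/* hypothetical helper: count the threads in leader's thread group */
static int example_count_threads(struct task_struct *leader)
{
        struct task_struct *tsk = leader;
        int count = 0;

        read_lock(&tasklist_lock);
        if (!thread_group_leader(leader)) {
                /* raced with exec(); the caller should retry with the new leader */
                read_unlock(&tasklist_lock);
                return -EAGAIN;
        }
        do {
                count++;
        } while_each_thread(leader, tsk);
        read_unlock(&tasklist_lock);

        return count;
}
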
From 77ceab8ea590d7dc6c8f055ce43dfebd74428107 Mon Sep 17 00:00:00 2001
From: Ben Blum
Date: Wed, 2 Nov 2011 13:38:07 -0700
Subject: cgroups: don't attach task to subsystem if migration failed

If a task has exited to the point it has called cgroup_exit() already,
then we can't migrate it to another cgroup anymore.

This can happen when we are attaching a task to a new cgroup between the
call to ->can_attach_task() on subsystems and the migration that is
eventually tried in cgroup_task_migrate().

In this case cgroup_task_migrate() returns -ESRCH and we don't want to
attach the task to the subsystems because the attachment to the new
cgroup itself failed.

Fix this by only calling ->attach_task() on the subsystems if the cgroup
migration succeeded.

Reported-by: Oleg Nesterov
Signed-off-by: Ben Blum
Acked-by: Paul Menage
Cc: Li Zefan
Cc: Tejun Heo
Signed-off-by: Frederic Weisbecker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/cgroup.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 64b0e73402df..8386b21224ef 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		oldcgrp = task_cgroup_from_root(tsk, root);
 		if (cgrp == oldcgrp)
 			continue;
-		/* attach each task to each subsystem */
-		for_each_subsys(root, ss) {
-			if (ss->attach_task)
-				ss->attach_task(cgrp, tsk);
-		}
 		/* if the thread is PF_EXITING, it can just get skipped. */
 		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
-		BUG_ON(retval != 0 && retval != -ESRCH);
+		if (retval == 0) {
+			/* attach each task to each subsystem */
+			for_each_subsys(root, ss) {
+				if (ss->attach_task)
+					ss->attach_task(cgrp, tsk);
+			}
+		} else {
+			BUG_ON(retval != -ESRCH);
+		}
 	}
 	/* nothing is sensitive to fork() after this point. */

--
cgit v1.2.3
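The fix above encodes a general ordering rule for this kind of two-step operation: perform the state change first, and run the per-subsystem side effects only if it succeeded, treating "the task already exited" (-ESRCH) as the one benign failure. A simplified sketch of that control flow; every name here is hypothetical, and the stubs merely stand in for cgroup_task_migrate() and ss->attach_task().

#include <linux/bug.h>
#include <linux/errno.h>

struct example_task;
struct example_cgroup;

/* stub for the migration step; -ESRCH would mean the task already exited */
static int example_migrate(struct example_cgroup *dst, struct example_task *tsk)
{
        return 0;       /* pretend the migration succeeded */
}

/* stub for the per-subsystem callback (ss->attach_task() in the real code) */
static void example_attach_task(struct example_cgroup *dst, struct example_task *tsk)
{
}

static void example_attach_one(struct example_cgroup *dst, struct example_task *tsk)
{
        int retval = example_migrate(dst, tsk);

        if (retval == 0) {
                /* notify subsystems only once the move really happened */
                example_attach_task(dst, tsk);
        } else {
                /* the task raced with exit; any other error would be a bug */
                BUG_ON(retval != -ESRCH);
        }
}
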
From c1e2ee2dc436574880758b3836fc96935b774c32 Mon Sep 17 00:00:00 2001
From: Andrew Bresticker
Date: Wed, 2 Nov 2011 13:40:29 -0700
Subject: memcg: replace ss->id_lock with a rwlock

While back-porting Johannes Weiner's patch "mm: memcg-aware global
reclaim" for an internal effort, we noticed a significant performance
regression during page-reclaim heavy workloads due to high contention
of the ss->id_lock.  This lock protects idr map, and serializes calls
to idr_get_next() in css_get_next() (which is used during the memcg
hierarchy walk).

Since idr_get_next() is just doing a look up, we need only serialize it
with respect to idr_remove()/idr_get_new().  By making the ss->id_lock
a rwlock, contention is greatly reduced and performance improves.

Tested: cat a 256m file from a ramdisk in a 128m container 50 times on
each core (one file + container per core) in parallel on a NUMA machine.
Result is the time for the test to complete in 1 of the containers.
Both kernels included Johannes' memcg-aware global reclaim patches.

Before rwlock patch: 1710.778s
After rwlock patch: 152.227s

Signed-off-by: Andrew Bresticker
Cc: Paul Menage
Cc: Li Zefan
Acked-by: KAMEZAWA Hiroyuki
Cc: Ying Han
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/cgroup.h |  2 +-
 kernel/cgroup.c        | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index da7e4bc34e8c..1b7f9d525013 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -516,7 +516,7 @@ struct cgroup_subsys {
 	struct list_head sibling;
 	/* used when use_id == true */
 	struct idr idr;
-	spinlock_t id_lock;
+	rwlock_t id_lock;

 	/* should be defined only by modular subsystems */
 	struct module *module;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8386b21224ef..d9d5648f3cdc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4883,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 	rcu_assign_pointer(id->css, NULL);
 	rcu_assign_pointer(css->id, NULL);

-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4911,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 		error = -ENOMEM;
 		goto err_out;
 	}
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	/* Don't use 0. allocates an ID of 1-65535 */
 	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);

 	/* Returns error when there are no free spaces for new ID.*/
 	if (error) {
@@ -4929,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 	return newid;
 remove_idr:
 	error = -ENOSPC;
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	idr_remove(&ss->idr, myid);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 err_out:
 	kfree(newid);
 	return ERR_PTR(error);
@@ -4943,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
 	struct css_id *newid;

-	spin_lock_init(&ss->id_lock);
+	rwlock_init(&ss->id_lock);
 	idr_init(&ss->idr);

 	newid = get_new_cssid(ss, 0);
@@ -5038,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		 * scan next entry from bitmap(tree), tmpid is updated after
 		 * idr_get_next().
 		 */
-		spin_lock(&ss->id_lock);
+		read_lock(&ss->id_lock);
 		tmp = idr_get_next(&ss->idr, &tmpid);
-		spin_unlock(&ss->id_lock);
+		read_unlock(&ss->id_lock);

 		if (!tmp)
 			break;
--
cgit v1.2.3
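The conversion above is the classic reader/writer split: css_get_next() only looks entries up, so concurrent hierarchy walkers can share the lock with read_lock(), while callers that modify the idr (idr_get_new_above(), idr_remove()) serialize through write_lock(). A minimal sketch of the same pattern with illustrative names, not the cgroup code itself:

#include <linux/idr.h>
#include <linux/spinlock.h>

/* illustrative stand-ins for ss->idr and ss->id_lock */
static DEFINE_IDR(example_idr);
static DEFINE_RWLOCK(example_id_lock);

/* lookups only need exclusion against insert/remove, so readers can share */
static void *example_get_next(int *nextid)
{
        void *entry;

        read_lock(&example_id_lock);
        entry = idr_get_next(&example_idr, nextid);
        read_unlock(&example_id_lock);

        return entry;
}

/* removal modifies the tree and must hold the lock exclusively */
static void example_remove(int id)
{
        write_lock(&example_id_lock);
        idr_remove(&example_idr, id);
        write_unlock(&example_id_lock);
}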