From 7e9abd89cbdf9b73d327d8173343abce9022609b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:46:54 -0700 Subject: cgroup: use read lock to guard find_existing_css_set() The function does not modify anything (except the temporary css template), so it's sufficient to hold read lock. Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15ac0e1e4f4d..f50edadfdd86 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -415,11 +415,11 @@ static struct css_set *find_css_set( /* First see if we already have a cgroup group that matches * the desired set */ - write_lock(&css_set_lock); + read_lock(&css_set_lock); res = find_existing_css_set(oldcg, cgrp, template); if (res) get_css_set(res); - write_unlock(&css_set_lock); + read_unlock(&css_set_lock); if (res) return res; -- cgit v1.2.3 From 71cbb949d17d4d776abd547135feb7f3282405c8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 25 Jul 2008 01:46:55 -0700 Subject: cgroup: list_for_each cleanup -------------------------- while() { list_entry(); ... } -------------------------- is equivalent to following code. -------------------------- list_for_each_entry(){ ... } -------------------------- later can review easily more. this patch is just clean up. it doesn't have any behavor change. Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f50edadfdd86..6836a9063634 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,17 +241,20 @@ static int use_task_css_set_links; */ static void unlink_css_set(struct css_set *cg) { + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + write_lock(&css_set_lock); hlist_del(&cg->hlist); css_set_count--; - while (!list_empty(&cg->cg_links)) { - struct cg_cgroup_link *link; - link = list_entry(cg->cg_links.next, - struct cg_cgroup_link, cg_link_list); + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); kfree(link); } + write_unlock(&css_set_lock); } @@ -363,15 +366,14 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - while (!list_empty(tmp)) { - link = list_entry(tmp->next, - struct cg_cgroup_link, - cgrp_link_list); + list_for_each_entry_safe(link, saved_link, tmp, + cgrp_link_list) { list_del(&link->cgrp_link_list); kfree(link); } @@ -384,11 +386,10 @@ static int allocate_cg_links(int count, struct list_head *tmp) static void free_cg_links(struct list_head *tmp) { - while (!list_empty(tmp)) { - struct cg_cgroup_link *link; - link = list_entry(tmp->next, - struct cg_cgroup_link, - cgrp_link_list); + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { list_del(&link->cgrp_link_list); kfree(link); } @@ -1093,6 +1094,8 @@ static void cgroup_kill_sb(struct super_block *sb) { struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; int ret; + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; BUG_ON(!root); @@ -1112,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) { * root cgroup */ write_lock(&css_set_lock); - while (!list_empty(&cgrp->css_sets)) { - struct cg_cgroup_link *link; - link = list_entry(cgrp->css_sets.next, - struct cg_cgroup_link, cgrp_link_list); + + list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, + cgrp_link_list) { list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); kfree(link); @@ -1756,15 +1758,11 @@ int cgroup_add_files(struct cgroup *cgrp, int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; - struct list_head *l; + struct cg_cgroup_link *link; read_lock(&css_set_lock); - l = cgrp->css_sets.next; - while (l != &cgrp->css_sets) { - struct cg_cgroup_link *link = - list_entry(l, struct cg_cgroup_link, cgrp_link_list); + list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { count += atomic_read(&link->cg->ref.refcount); - l = l->next; } read_unlock(&css_set_lock); return count; -- cgit v1.2.3 From 8947f9d5b361ce927be6d5c11fed57905b7a4100 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:46:56 -0700 Subject: cgroups: annotate two variables with __read_mostly - need_forkexit_callback will be read only after system boot. - use_task_css_set_links will be read only after it's set. And these 2 variables are checked when a new process is forked. Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6836a9063634..70d083c6fb6b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -118,7 +118,7 @@ static int root_count; * extra work in the fork/exit path if none of the subsystems need to * be called. */ -static int need_forkexit_callback; +static int need_forkexit_callback __read_mostly; static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ @@ -220,7 +220,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ -static int use_task_css_set_links; +static int use_task_css_set_links __read_mostly; /* When we create or destroy a css_set, the operation simply * takes/releases a reference count on all the cgroups referenced -- cgit v1.2.3 From db3b14978abc02041046ed8353f0899cb58ffffc Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:46:58 -0700 Subject: cgroup files: add write_string cgroup control file method This patch adds a write_string() method for cgroups control files. The semantics are that a buffer is copied from userspace to kernelspace and the handler function invoked on that buffer. The buffer is guaranteed to be nul-terminated, and no longer than max_write_len (defaulting to 64 bytes if unspecified). Later patches will convert existing raw file write handlers in control group subsystems to use this method. Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Acked-by: Balbir Singh Acked-by: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 14 ++++++++++++++ kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 88a734edccbc..f5379455bb59 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -205,6 +205,13 @@ struct cftype { * subsystem, followed by a period */ char name[MAX_CFTYPE_NAME]; int private; + + /* + * If non-zero, defines the maximum length of string that can + * be passed to write_string; defaults to 64 + */ + size_t max_write_len; + int (*open)(struct inode *inode, struct file *file); ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -248,6 +255,13 @@ struct cftype { */ int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); + /* + * write_string() is passed a nul-terminated kernelspace + * buffer of maximum length determined by max_write_len. + * Returns 0 or -ve error code. + */ + int (*write_string)(struct cgroup *cgrp, struct cftype *cft, + const char *buffer); /* * trigger() callback can be used to get some kick from the * userspace, when the actual string written is not important diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 70d083c6fb6b..3a99cc2df860 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1363,6 +1363,39 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return retval; } +static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + char local_buffer[64]; + int retval = 0; + size_t max_bytes = cft->max_write_len; + char *buffer = local_buffer; + + if (!max_bytes) + max_bytes = sizeof(local_buffer) - 1; + if (nbytes >= max_bytes) + return -E2BIG; + /* Allocate a dynamic buffer if we need one */ + if (nbytes >= sizeof(local_buffer)) { + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (buffer == NULL) + return -ENOMEM; + } + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + strstrip(buffer); + retval = cft->write_string(cgrp, cft, buffer); + if (!retval) + retval = nbytes; + if (buffer != local_buffer) + kfree(buffer); + return retval; +} + static ssize_t cgroup_common_file_write(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -1440,6 +1473,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_string) + return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); if (cft->trigger) { int ret = cft->trigger(cgrp, (unsigned int)cft->private); return ret ? ret : nbytes; -- cgit v1.2.3 From e788e066c651b1bbf4a927dc95395c1aa13be436 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:46:59 -0700 Subject: cgroup files: move the release_agent file to use typed handlers Adds cgroup_release_agent_write() and cgroup_release_agent_show() methods to handle writing/reading the path to a cgroup hierarchy's release agent. As a result, cgroup_common_file_read() is now unnecessary. As part of the change, a previously-tolerated race in cgroup_release_agent() is avoided by copying the current release_agent_path prior to calling call_usermode_helper(). Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Acked-by: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 + kernel/cgroup.c | 125 ++++++++++++++++++++++--------------------------- 2 files changed, 59 insertions(+), 68 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f5379455bb59..e78377a91a74 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -295,6 +295,8 @@ int cgroup_add_files(struct cgroup *cgrp, int cgroup_is_removed(const struct cgroup *cgrp); +int cgroup_lock_live_group(struct cgroup *cgrp); + int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3a99cc2df860..0120b5d67a73 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -89,11 +89,7 @@ struct cgroupfs_root { /* Hierarchy-specific flags */ unsigned long flags; - /* The path to use for release notifications. No locking - * between setting and use - so if userspace updates this - * while child cgroups exist, you could miss a - * notification. We ensure that it's always a valid - * NUL-terminated string */ + /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; }; @@ -1329,6 +1325,45 @@ enum cgroup_filetype { FILE_RELEASE_AGENT, }; +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * Returns true (with lock held) on success, or false (with no lock + * held) on failure. + */ +int cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} + +static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + strcpy(cgrp->root->release_agent_path, buffer); + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + seq_puts(seq, cgrp->root->release_agent_path); + seq_putc(seq, '\n'); + mutex_unlock(&cgroup_mutex); + return 0; +} + static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, @@ -1443,10 +1478,6 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, else clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); break; - case FILE_RELEASE_AGENT: - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - strcpy(cgrp->root->release_agent_path, buffer); - break; default: retval = -EINVAL; goto out2; @@ -1506,49 +1537,6 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_common_file_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) -{ - enum cgroup_filetype type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_KERNEL))) - return -ENOMEM; - - s = page; - - switch (type) { - case FILE_RELEASE_AGENT: - { - struct cgroupfs_root *root; - size_t n; - mutex_lock(&cgroup_mutex); - root = cgrp->root; - n = strnlen(root->release_agent_path, - sizeof(root->release_agent_path)); - n = min(n, (size_t) PAGE_SIZE); - strncpy(s, root->release_agent_path, n); - mutex_unlock(&cgroup_mutex); - s += n; - break; - } - default: - retval = -EINVAL; - goto out; - } - *s++ = '\n'; - - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; -} - static ssize_t cgroup_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { @@ -1606,6 +1594,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file) static struct file_operations cgroup_seqfile_operations = { .read = seq_read, + .write = cgroup_file_write, .llseek = seq_lseek, .release = cgroup_seqfile_release, }; @@ -2283,8 +2272,9 @@ static struct cftype files[] = { static struct cftype cft_release_agent = { .name = "release_agent", - .read = cgroup_common_file_read, - .write = cgroup_common_file_write, + .read_seq_string = cgroup_release_agent_show, + .write_string = cgroup_release_agent_write, + .max_write_len = PATH_MAX, .private = FILE_RELEASE_AGENT, }; @@ -3111,27 +3101,24 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf; + char *pathbuf = NULL, *agentbuf = NULL; struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!pathbuf) { - spin_lock(&release_list_lock); - continue; - } - - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { - kfree(pathbuf); - spin_lock(&release_list_lock); - continue; - } + if (!pathbuf) + goto continue_free; + if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + goto continue_free; + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!agentbuf) + goto continue_free; i = 0; - argv[i++] = cgrp->root->release_agent_path; - argv[i++] = (char *)pathbuf; + argv[i++] = agentbuf; + argv[i++] = pathbuf; argv[i] = NULL; i = 0; @@ -3145,8 +3132,10 @@ static void cgroup_release_agent(struct work_struct *work) * be a slow process */ mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - kfree(pathbuf); mutex_lock(&cgroup_mutex); + continue_free: + kfree(pathbuf); + kfree(agentbuf); spin_lock(&release_list_lock); } spin_unlock(&release_list_lock); -- cgit v1.2.3 From 84eea842886ac35020be6043e04748ed22014359 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:00 -0700 Subject: cgroups: misc cleanups to write_string patchset This patch contains cleanups suggested by reviewers for the recent write_string() patchset: - pair cgroup_lock_live_group() with cgroup_unlock() in cgroup.c for clarity, rather than directly unlocking cgroup_mutex. - make the return type of cgroup_lock_live_group() a bool - use a #define'd constant for the local buffer size in read/write functions Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Acked-by: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e78377a91a74..cc59d3a21d87 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -21,11 +21,13 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; +struct cgroup; extern int cgroup_init_early(void); extern int cgroup_init(void); extern void cgroup_init_smp(void); extern void cgroup_lock(void); +extern bool cgroup_lock_live_group(struct cgroup *cgrp); extern void cgroup_unlock(void); extern void cgroup_fork(struct task_struct *p); extern void cgroup_fork_callbacks(struct task_struct *p); @@ -295,8 +297,6 @@ int cgroup_add_files(struct cgroup *cgrp, int cgroup_is_removed(const struct cgroup *cgrp); -int cgroup_lock_live_group(struct cgroup *cgrp); - int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0120b5d67a73..a14122ecaa5e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1329,10 +1329,10 @@ enum cgroup_filetype { * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness * - * Returns true (with lock held) on success, or false (with no lock - * held) on failure. + * On success, returns true; the lock should be later released with + * cgroup_unlock(). On failure returns false with no lock held. */ -int cgroup_lock_live_group(struct cgroup *cgrp) +bool cgroup_lock_live_group(struct cgroup *cgrp) { mutex_lock(&cgroup_mutex); if (cgroup_is_removed(cgrp)) { @@ -1349,7 +1349,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, if (!cgroup_lock_live_group(cgrp)) return -ENODEV; strcpy(cgrp->root->release_agent_path, buffer); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return 0; } @@ -1360,16 +1360,19 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return 0; } +/* A buffer size big enough for numbers or short strings */ +#define CGROUP_LOCAL_BUFFER_SIZE 64 + static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - char buffer[64]; + char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; char *end; @@ -1403,7 +1406,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - char local_buffer[64]; + char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; size_t max_bytes = cft->max_write_len; char *buffer = local_buffer; @@ -1518,7 +1521,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, char __user *buf, size_t nbytes, loff_t *ppos) { - char tmp[64]; + char tmp[CGROUP_LOCAL_BUFFER_SIZE]; u64 val = cft->read_u64(cgrp, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); @@ -1530,7 +1533,7 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, char __user *buf, size_t nbytes, loff_t *ppos) { - char tmp[64]; + char tmp[CGROUP_LOCAL_BUFFER_SIZE]; s64 val = cft->read_s64(cgrp, cft); int len = sprintf(tmp, "%lld\n", (long long) val); -- cgit v1.2.3 From 6379c106152388f7ea45d6dda63edda0e9181fc8 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:01 -0700 Subject: cgroup files: move notify_on_release file to separate write handler This patch moves the write handler for the cgroups notify_on_release file into a separate handler. This handler requires no cgroups locking since it relies on atomic bitops for synchronization. Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a14122ecaa5e..d597d3015786 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1474,13 +1474,6 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, case FILE_TASKLIST: retval = attach_task_by_pid(cgrp, buffer); break; - case FILE_NOTIFY_ON_RELEASE: - clear_bit(CGRP_RELEASABLE, &cgrp->flags); - if (simple_strtoul(buffer, NULL, 10) != 0) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - break; default: retval = -EINVAL; goto out2; @@ -2252,6 +2245,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, return notify_on_release(cgrp); } +static int cgroup_write_notify_on_release(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + clear_bit(CGRP_RELEASABLE, &cgrp->flags); + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + return 0; +} + /* * for the common functions, 'private' gives the type of file */ @@ -2268,7 +2273,7 @@ static struct cftype files[] = { { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, - .write = cgroup_common_file_write, + .write_u64 = cgroup_write_notify_on_release, .private = FILE_NOTIFY_ON_RELEASE, }, }; -- cgit v1.2.3 From af351026aafc8da16518a02b41c66d3e0c1cdef4 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:01 -0700 Subject: cgroup files: turn attach_task_by_pid directly into a cgroup write handler This patch changes attach_task_by_pid() to take a u64 rather than a string; as a result it can be called directly as a control groups write_u64 handler, and cgroup_common_file_write() can be removed. Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 80 ++++++++++----------------------------------------------- 1 file changed, 14 insertions(+), 66 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d597d3015786..86b71e714e13 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -504,10 +504,6 @@ static struct css_set *find_css_set( * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * - * The cgroup_common_file_write handler for operations that modify - * the cgroup hierarchy holds cgroup_mutex across the entire operation, - * single threading all such cgroup modifications across the system. - * * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't * (usually) take cgroup_mutex. These are the two most performance * critical pieces of code here. The exception occurs on cgroup_exit(), @@ -1279,18 +1275,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with - * cgroup_mutex, may take task_lock of task + * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex + * held. May take task_lock of task */ -static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { - pid_t pid; struct task_struct *tsk; int ret; - if (sscanf(pidbuf, "%d", &pid) != 1) - return -EIO; - if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); @@ -1316,6 +1308,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) return ret; } +static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +{ + int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + ret = attach_task_by_pid(cgrp, pid); + cgroup_unlock(); + return ret; +} + /* The various types of files and directories in a cgroup file system */ enum cgroup_filetype { FILE_ROOT, @@ -1434,60 +1436,6 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, return retval; } -static ssize_t cgroup_common_file_write(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - enum cgroup_filetype type = cft->private; - char *buffer; - int retval = 0; - - if (nbytes >= PATH_MAX) - return -E2BIG; - - /* +1 for nul-terminator */ - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - - if (copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out1; - } - buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); /* strip -just- trailing whitespace */ - - mutex_lock(&cgroup_mutex); - - /* - * This was already checked for in cgroup_file_write(), but - * check again now we're holding cgroup_mutex. - */ - if (cgroup_is_removed(cgrp)) { - retval = -ENODEV; - goto out2; - } - - switch (type) { - case FILE_TASKLIST: - retval = attach_task_by_pid(cgrp, buffer); - break; - default: - retval = -EINVAL; - goto out2; - } - - if (retval == 0) - retval = nbytes; -out2: - mutex_unlock(&cgroup_mutex); -out1: - kfree(buffer); - return retval; -} - static ssize_t cgroup_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { @@ -2265,7 +2213,7 @@ static struct cftype files[] = { .name = "tasks", .open = cgroup_tasks_open, .read = cgroup_tasks_read, - .write = cgroup_common_file_write, + .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, }, -- cgit v1.2.3 From e885dcde75685e09f23cffae1f6d5169c105b8a0 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Fri, 25 Jul 2008 01:47:06 -0700 Subject: cgroup_clone: use pid of newly created task for new cgroup cgroup_clone creates a new cgroup with the pid of the task. This works correctly for unshare, but for clone cgroup_clone is called from copy_namespaces inside copy_process, which happens before the new pid is created. As a result, the new cgroup was created with current's pid. This patch: 1. Moves the call inside copy_process to after the new pid is created 2. Passes the struct pid into ns_cgroup_clone (as it is not yet attached to the task) 3. Passes a name from ns_cgroup_clone() into cgroup_clone() so as to keep cgroup_clone() itself simpler 4. Uses pid_vnr() to get the process id value, so that the pid used to name the new cgroup is always the pid as it would be known to the task which did the cloning or unsharing. I think that is the most intuitive thing to do. This way, task t1 does clone(CLONE_NEWPID) to get t2, which does clone(CLONE_NEWPID) to get t3, then the cgroup for t3 will be named for the pid by which t2 knows t3. (Thanks to Dan Smith for finding the main bug) Changelog: June 11: Incorporate Paul Menage's feedback: don't pass NULL to ns_cgroup_clone from unshare, and reduce patch size by using 'nodename' in cgroup_clone. June 10: Original version [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge Hallyn Acked-by: Paul Menage Tested-by: Dan Smith Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 3 ++- include/linux/nsproxy.h | 7 +++++-- kernel/cgroup.c | 7 +++---- kernel/fork.c | 6 ++++++ kernel/ns_cgroup.c | 8 ++++++-- kernel/nsproxy.c | 8 +------- 6 files changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cc59d3a21d87..c98dd7cb7076 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -364,7 +364,8 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss); +int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss, + char *nodename); /* A cgroup_iter should be treated as an opaque object */ struct cgroup_iter { diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 0e66b57631fc..c8a768e59640 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -82,9 +82,12 @@ static inline void get_nsproxy(struct nsproxy *ns) } #ifdef CONFIG_CGROUP_NS -int ns_cgroup_clone(struct task_struct *tsk); +int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid); #else -static inline int ns_cgroup_clone(struct task_struct *tsk) { return 0; } +static inline int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid) +{ + return 0; +} #endif #endif diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86b71e714e13..66ec9fd21e0c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2848,16 +2848,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) * cgroup_clone - clone the cgroup the given subsystem is attached to * @tsk: the task to be moved * @subsys: the given subsystem + * @nodename: the name for the new cgroup * * Duplicate the current cgroup in the hierarchy that the given * subsystem is attached to, and move this task into the new * child. */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) +int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, + char *nodename) { struct dentry *dentry; int ret = 0; - char nodename[MAX_CGROUP_TYPE_NAMELEN]; struct cgroup *parent, *child; struct inode *inode; struct css_set *cg; @@ -2882,8 +2883,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); - snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid); - /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); diff --git a/kernel/fork.c b/kernel/fork.c index 5a5d6fef341d..228f80c9155a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1107,6 +1107,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; + if (current->nsproxy != p->nsproxy) { + retval = ns_cgroup_clone(p, pid); + if (retval) + goto bad_fork_free_pid; + } + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 48d7ed6fc3a4..43c2111cd54d 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns( struct ns_cgroup, css); } -int ns_cgroup_clone(struct task_struct *task) +int ns_cgroup_clone(struct task_struct *task, struct pid *pid) { - return cgroup_clone(task, &ns_subsys); + char name[PROC_NUMBUF]; + + snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); + return cgroup_clone(task, &ns_subsys, name); } /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index adc785146a1c..21575fc46d05 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - err = ns_cgroup_clone(tsk); - if (err) { - put_nsproxy(new_ns); - goto out; - } - tsk->nsproxy = new_ns; out: @@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, goto out; } - err = ns_cgroup_clone(current); + err = ns_cgroup_clone(current, task_pid(current)); if (err) put_nsproxy(*new_nsp); -- cgit v1.2.3 From 96930a6365c99c160138a395566e360b27348b8f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:21 -0700 Subject: make cgroup_seqfile_release() static cgroup_seqfile_release() can become static. Signed-off-by: Adrian Bunk Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66ec9fd21e0c..89bd6fb7894f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1529,7 +1529,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return cft->read_seq_string(state->cgroup, cft, m); } -int cgroup_seqfile_release(struct inode *inode, struct file *file) +static int cgroup_seqfile_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); -- cgit v1.2.3 From 3f8206d496e9e9495afb1d4e70d29712b4d403c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 03:46:43 -0400 Subject: [PATCH] get rid of indirect users of namei.h fs.h needs path.h, not namei.h; nfs_fs.h doesn't need it at all. Several places in the tree needed direct include. Signed-off-by: Al Viro --- fs/nfsd/nfsctl.c | 1 + fs/ubifs/file.c | 1 + include/linux/fs.h | 2 +- include/linux/nfs_fs.h | 1 - kernel/cgroup.c | 1 + 5 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1955a2702e60..c53e65f8f3a2 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 005a3b854d96..8565e586e533 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -53,6 +53,7 @@ #include "ubifs.h" #include +#include static int read_block(struct inode *inode, void *addr, unsigned int block, struct ubifs_data_node *dn) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7676fa1c20ae..8252b045e624 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -279,7 +279,7 @@ extern int dir_notify_enable; #include #include #include -#include +#include #include #include #include diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f08f9ca602af..78a5922a2f11 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 89bd6fb7894f..657f8f8d93a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -45,6 +45,7 @@ #include #include #include +#include #include -- cgit v1.2.3 From 5a3eb9f6b7c598529f832b8baa6458ab1cbab2c6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:18 -0700 Subject: cgroup: fix possible memory leak There's a leak if copy_from_user() returns failure. Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 657f8f8d93a5..28debe4e1488 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1424,14 +1424,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, if (buffer == NULL) return -ENOMEM; } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out; + } buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); retval = cft->write_string(cgrp, cft, buffer); if (!retval) retval = nbytes; +out: if (buffer != local_buffer) kfree(buffer); return retval; -- cgit v1.2.3 From 36553434f475a84b653e25e74490ee8df43b86d5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:19 -0700 Subject: cgroup: remove duplicate code in allocate_cg_link() - just call free_cg_links() in allocate_cg_links() - the list will get initialized in allocate_cg_links(), so don't init it twice Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 28debe4e1488..249a517591b8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set( return NULL; } +static void free_cg_links(struct list_head *tmp) +{ + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { + list_del(&link->cgrp_link_list); + kfree(link); + } +} + /* * allocate_cg_links() allocates "count" cg_cgroup_link structures * and chains them on tmp through their cgrp_link_list fields. Returns 0 on @@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - list_for_each_entry_safe(link, saved_link, tmp, - cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } + free_cg_links(tmp); return -ENOMEM; } list_add(&link->cgrp_link_list, tmp); @@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *root; struct list_head tmp_cg_links; - INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); -- cgit v1.2.3 From 55b6fd0162ace1e0f1b52c8c092565c115127ef6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:20 -0700 Subject: cgroup: uninline cgroup_has_css_refs() It's not small enough, and has 2 call sites. text data bss dec hex filename 12813 1676 4832 19321 4b79 cgroup.o.orig 12775 1676 4832 19283 4b53 cgroup.o Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 249a517591b8..13932abde159 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2368,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static inline int cgroup_has_css_refs(struct cgroup *cgrp) +static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the -- cgit v1.2.3