author    Stephen Rothwell <sfr@canb.auug.org.au>  2014-02-24 16:11:26 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au>  2014-02-24 16:11:26 +1100
commit    42c7a760815c6be522f71e86ff128276d1eb9db9 (patch)
tree      21dc9fb7c385661b67d2c029b1743349439772bc
parent    45b5809ae550134afa67b8513c4c4e8803b6697f (diff)
parent    3c0d1ccae0e559f32eaed36f23603b894e971dbe (diff)
Merge remote-tracking branch 'cgroup/for-next'
-rw-r--r--  arch/sparc/kernel/leon_pci_grpci2.c  1
-rw-r--r--  arch/sparc/kernel/sun4m_irq.c  2
-rw-r--r--  block/blk-cgroup.c  11
-rw-r--r--  block/blk-cgroup.h  14
-rw-r--r--  block/blk-throttle.c  4
-rw-r--r--  block/cfq-iosched.c  3
-rw-r--r--  fs/bio.c  2
-rw-r--r--  fs/kernfs/dir.c  1
-rw-r--r--  include/linux/cgroup.h  229
-rw-r--r--  include/linux/cgroup_subsys.h  30
-rw-r--r--  include/linux/hugetlb_cgroup.h  2
-rw-r--r--  include/linux/memcontrol.h  2
-rw-r--r--  include/net/cls_cgroup.h  2
-rw-r--r--  include/net/netprio_cgroup.h  17
-rw-r--r--  init/Kconfig  1
-rw-r--r--  kernel/cgroup.c  2777
-rw-r--r--  kernel/cgroup_freezer.c  10
-rw-r--r--  kernel/cpuset.c  236
-rw-r--r--  kernel/events/core.c  25
-rw-r--r--  kernel/sched/core.c  10
-rw-r--r--  kernel/sched/cpuacct.c  6
-rw-r--r--  kernel/sched/debug.c  3
-rw-r--r--  mm/hugetlb_cgroup.c  9
-rw-r--r--  mm/memcontrol.c  106
-rw-r--r--  mm/memory-failure.c  8
-rw-r--r--  net/Kconfig  2
-rw-r--r--  net/core/netclassid_cgroup.c  15
-rw-r--r--  net/core/netprio_cgroup.c  39
-rw-r--r--  net/ipv4/tcp_memcontrol.c  2
-rw-r--r--  security/device_cgroup.c  8
30 files changed, 1240 insertions(+), 2337 deletions(-)
diff --git a/arch/sparc/kernel/leon_pci_grpci2.c b/arch/sparc/kernel/leon_pci_grpci2.c
index 5f0402aab7fb..24d6a4446349 100644
--- a/arch/sparc/kernel/leon_pci_grpci2.c
+++ b/arch/sparc/kernel/leon_pci_grpci2.c
@@ -8,6 +8,7 @@
#include <linux/of_device.h>
#include <linux/kernel.h>
#include <linux/pci.h>
+#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <asm/io.h>
diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c
index c5ade9d27a1d..8bb3b3fddea7 100644
--- a/arch/sparc/kernel/sun4m_irq.c
+++ b/arch/sparc/kernel/sun4m_irq.c
@@ -9,6 +9,8 @@
* Copyright (C) 1996 Dave Redman (djhr@tadpole.co.uk)
*/
+#include <linux/slab.h>
+
#include <asm/timer.h>
#include <asm/traps.h>
#include <asm/pgalloc.h>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b6e95b5e262f..e4a4145926f6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -894,7 +894,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
int ret = 0;
/* task_lock() is needed to avoid races with exit_io_context() */
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
task_lock(task);
ioc = task->io_context;
if (ioc && atomic_read(&ioc->nr_tasks) > 1)
@@ -906,17 +906,14 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
return ret;
}
-struct cgroup_subsys blkio_subsys = {
- .name = "blkio",
+struct cgroup_subsys blkio_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
- .subsys_id = blkio_subsys_id,
.base_cftypes = blkcg_files,
- .module = THIS_MODULE,
};
-EXPORT_SYMBOL_GPL(blkio_subsys);
+EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
/**
* blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1106,7 +1103,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
/* everything is in place, add intf files for the new policy */
if (pol->cftypes)
- WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
+ WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
ret = 0;
out_unlock:
mutex_unlock(&blkcg_pol_mutex);
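[Editor's note, not part of the patch: with the skip_css argument dropped from cgroup_taskset_for_each(), a controller's can_attach callback now simply walks every task being migrated. The callback below is a hypothetical sketch; only the macro's new two-argument form comes from this series.]

static int example_can_attach(struct cgroup_subsys_state *css,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* every task in the set is visited; there is no css filtering argument */
	cgroup_taskset_for_each(task, tset) {
		/* per-task admission checks would go here */
		if (task->flags & PF_EXITING)
			return -ESRCH;
	}
	return 0;
}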
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 86154eab9523..15a8d640de57 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -186,7 +186,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
static inline struct blkcg *task_blkcg(struct task_struct *tsk)
{
- return css_to_blkcg(task_css(tsk, blkio_subsys_id));
+ return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
}
static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -241,12 +241,16 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
*/
static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
{
- int ret;
+ char *p;
- ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
- if (ret)
+ p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+ if (!p) {
strncpy(buf, "<unavailable>", buflen);
- return ret;
+ return -ENAMETOOLONG;
+ }
+
+ memmove(buf, p, buf + buflen - p);
+ return 0;
}
/**
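[Editor's note, not part of the patch: cgroup_path() now follows the kernfs convention of writing the path right-aligned in the caller's buffer and returning a pointer to it, or NULL when it does not fit, which is why the new blkg_path() above memmoves the result to the start of the buffer and converts failure to -ENAMETOOLONG. A hypothetical caller could look like this.]

/* illustrative only: log the cgroup path of a blkcg_gq */
static void example_log_blkg_path(struct blkcg_gq *blkg)
{
	char buf[128];

	if (blkg_path(blkg, buf, sizeof(buf)))
		pr_warn("blkg path did not fit in %zu bytes\n", sizeof(buf));
	else
		pr_info("blkg path: %s\n", buf);
}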
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1474c3ab7e72..861c363e4129 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1425,28 +1425,24 @@ static struct cftype throtl_files[] = {
.private = offsetof(struct throtl_grp, bps[READ]),
.seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
- .max_write_len = 256,
},
{
.name = "throttle.write_bps_device",
.private = offsetof(struct throtl_grp, bps[WRITE]),
.seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
- .max_write_len = 256,
},
{
.name = "throttle.read_iops_device",
.private = offsetof(struct throtl_grp, iops[READ]),
.seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
- .max_write_len = 256,
},
{
.name = "throttle.write_iops_device",
.private = offsetof(struct throtl_grp, iops[WRITE]),
.seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
- .max_write_len = 256,
},
{
.name = "throttle.io_service_bytes",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 744833b630c6..461187943392 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1838,7 +1838,6 @@ static struct cftype cfq_blkcg_files[] = {
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
- .max_write_len = 256,
},
{
.name = "weight",
@@ -1853,7 +1852,6 @@ static struct cftype cfq_blkcg_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cfqg_print_weight_device,
.write_string = cfqg_set_weight_device,
- .max_write_len = 256,
},
{
.name = "weight",
@@ -1866,7 +1864,6 @@ static struct cftype cfq_blkcg_files[] = {
.name = "leaf_weight_device",
.seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
- .max_write_len = 256,
},
{
.name = "leaf_weight",
diff --git a/fs/bio.c b/fs/bio.c
index b2dd42ed9edd..b1bc722b89aa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1969,7 +1969,7 @@ int bio_associate_current(struct bio *bio)
/* associate blkcg if exists */
rcu_read_lock();
- css = task_css(current, blkio_subsys_id);
+ css = task_css(current, blkio_cgrp_id);
if (css && css_tryget(css))
bio->bi_css = css;
rcu_read_unlock();
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8245d3b34894..5a398c0c3d99 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -112,6 +112,7 @@ char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return p;
}
+EXPORT_SYMBOL_GPL(kernfs_path);
/**
* pr_cont_kernfs_name - pr_cont name of a kernfs_node
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9450f025fe0c..8c283a910b91 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,14 +14,13 @@
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
-#include <linux/prio_heap.h>
#include <linux/rwsem.h>
#include <linux/idr.h>
#include <linux/workqueue.h>
-#include <linux/xattr.h>
#include <linux/fs.h>
#include <linux/percpu-refcount.h>
#include <linux/seq_file.h>
+#include <linux/kernfs.h>
#ifdef CONFIG_CGROUPS
@@ -37,28 +36,13 @@ extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
-extern int cgroup_load_subsys(struct cgroup_subsys *ss);
-extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
extern int proc_cgroup_show(struct seq_file *, void *);
-/*
- * Define the enumeration of all cgroup subsystems.
- *
- * We define ids for builtin subsystems and then modular ones.
- */
-#define SUBSYS(_x) _x ## _subsys_id,
+/* define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _cgrp_id,
enum cgroup_subsys_id {
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
#include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
- CGROUP_BUILTIN_SUBSYS_COUNT,
-
- __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1,
-
-#define IS_SUBSYS_ENABLED(option) IS_MODULE(option)
-#include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS
@@ -153,11 +137,6 @@ enum {
CGRP_SANE_BEHAVIOR,
};
-struct cgroup_name {
- struct rcu_head rcu_head;
- char name[];
-};
-
struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */
@@ -174,16 +153,17 @@ struct cgroup {
/* the number of attached css's */
int nr_css;
+ atomic_t refcnt;
+
/*
* We link our 'sibling' struct into our parent's 'children'.
* Our children link their 'sibling' into our 'children'.
*/
struct list_head sibling; /* my parent's children */
struct list_head children; /* my children */
- struct list_head files; /* my files */
struct cgroup *parent; /* my parent */
- struct dentry *dentry; /* cgroup fs entry, RCU protected */
+ struct kernfs_node *kn; /* cgroup kernfs entry */
/*
* Monotonically increasing unique serial number which defines a
@@ -193,19 +173,6 @@ struct cgroup {
*/
u64 serial_nr;
- /*
- * This is a copy of dentry->d_name, and it's needed because
- * we can't use dentry->d_name in cgroup_path().
- *
- * You must acquire rcu_read_lock() to access cgrp->name, and
- * the only place that can change it is rename(), which is
- * protected by parent dir's i_mutex.
- *
- * Normally you should use cgroup_name() wrapper rather than
- * access it directly.
- */
- struct cgroup_name __rcu *name;
-
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
@@ -237,9 +204,6 @@ struct cgroup {
/* For css percpu_ref killing and RCU-protected deletion */
struct rcu_head rcu_head;
struct work_struct destroy_work;
-
- /* directory xattrs */
- struct simple_xattrs xattrs;
};
#define MAX_CGROUP_ROOT_NAMELEN 64
@@ -262,8 +226,8 @@ enum {
*
* The followings are the behaviors currently affected this flag.
*
- * - Mount options "noprefix" and "clone_children" are disallowed.
- * Also, cgroupfs file cgroup.clone_children is not created.
+ * - Mount options "noprefix", "xattr", "clone_children",
+ * "release_agent" and "name" are disallowed.
*
* - When mounting an existing superblock, mount options should
* match.
@@ -281,6 +245,8 @@ enum {
* - "release_agent" and "notify_on_release" are removed.
* Replacement notification mechanism will be implemented.
*
+ * - "cgroup.clone_children" is removed.
+ *
* - cpuset: tasks will be kept in empty cpusets when hotplug happens
* and take masks of ancestors with non-empty cpus/mems, instead of
* being moved to an ancestor.
@@ -300,17 +266,15 @@ enum {
/* mount options live below bit 16 */
CGRP_ROOT_OPTION_MASK = (1 << 16) - 1,
-
- CGRP_ROOT_SUBSYS_BOUND = (1 << 16), /* subsystems finished binding */
};
/*
* A cgroupfs_root represents the root of a cgroup hierarchy, and may be
- * associated with a superblock to form an active hierarchy. This is
+ * associated with a kernfs_root to form an active hierarchy. This is
* internal to cgroup core. Don't access directly from controllers.
*/
struct cgroupfs_root {
- struct super_block *sb;
+ struct kernfs_root *kf_root;
/* The bitmask of subsystems attached to this hierarchy */
unsigned long subsys_mask;
@@ -318,11 +282,11 @@ struct cgroupfs_root {
/* Unique id for this hierarchy. */
int hierarchy_id;
- /* The root cgroup for this hierarchy */
+ /* The root cgroup. Root is destroyed on its release. */
struct cgroup top_cgroup;
- /* Tracks how many cgroups are currently defined in hierarchy.*/
- int number_of_cgroups;
+ /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
+ atomic_t nr_cgrps;
/* A list running through the active hierarchies */
struct list_head root_list;
@@ -372,10 +336,9 @@ struct css_set {
struct list_head cgrp_links;
/*
- * Set of subsystem states, one for each subsystem. This array
- * is immutable after creation apart from the init_css_set
- * during subsystem registration (at boot time) and modular subsystem
- * loading/unloading.
+ * Set of subsystem states, one for each subsystem. This array is
+ * immutable after creation apart from the init_css_set during
+ * subsystem registration (at boot time).
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
@@ -416,8 +379,9 @@ struct cftype {
umode_t mode;
/*
- * If non-zero, defines the maximum length of string that can
- * be passed to write_string; defaults to 64
+ * The maximum length of string, excluding trailing nul, that can
+ * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is
+ * assumed.
*/
size_t max_write_len;
@@ -425,10 +389,12 @@ struct cftype {
unsigned int flags;
/*
- * The subsys this file belongs to. Initialized automatically
- * during registration. NULL for cgroup core files.
+ * Fields used for internal bookkeeping. Initialized automatically
+ * during registration.
*/
- struct cgroup_subsys *ss;
+ struct cgroup_subsys *ss; /* NULL for cgroup core files */
+ struct list_head node; /* anchored at ss->cfts */
+ struct kernfs_ops *kf_ops;
/*
* read_u64() is a shortcut for the common case of returning a
@@ -475,36 +441,10 @@ struct cftype {
* kick type for multiplexing.
*/
int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
-};
-
-/*
- * cftype_sets describe cftypes belonging to a subsystem and are chained at
- * cgroup_subsys->cftsets. Each cftset points to an array of cftypes
- * terminated by zero length name.
- */
-struct cftype_set {
- struct list_head node; /* chained at subsys->cftsets */
- struct cftype *cfts;
-};
-
-/*
- * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't
- * access directly.
- */
-struct cfent {
- struct list_head node;
- struct dentry *dentry;
- struct cftype *type;
- struct cgroup_subsys_state *css;
-
- /* file xattrs */
- struct simple_xattrs xattrs;
-};
-/* seq_file->private points to the following, only ->priv is public */
-struct cgroup_open_file {
- struct cfent *cfe;
- void *priv;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lock_class_key lockdep_key;
+#endif
};
/*
@@ -516,34 +456,79 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
}
-/* Caller should hold rcu_read_lock() */
-static inline const char *cgroup_name(const struct cgroup *cgrp)
+/* no synchronization, the result can only be used as a hint */
+static inline bool cgroup_has_tasks(struct cgroup *cgrp)
{
- return rcu_dereference(cgrp->name)->name;
+ return !list_empty(&cgrp->cset_links);
}
-static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+/* returns ino associated with a cgroup, 0 indicates unmounted root */
+static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
- struct cgroup_open_file *of = seq->private;
- return of->cfe->css;
+ if (cgrp->kn)
+ return cgrp->kn->ino;
+ else
+ return 0;
}
static inline struct cftype *seq_cft(struct seq_file *seq)
{
- struct cgroup_open_file *of = seq->private;
- return of->cfe->type;
+ struct kernfs_open_file *of = seq->private;
+
+ return of->kn->priv;
+}
+
+struct cgroup_subsys_state *seq_css(struct seq_file *seq);
+
+/*
+ * Name / path handling functions. All are thin wrappers around the kernfs
+ * counterparts and can be called under any context.
+ */
+
+static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
+{
+ /* dummy_top doesn't have a kn associated */
+ if (cgrp->kn)
+ return kernfs_name(cgrp->kn, buf, buflen);
+ else
+ return strlcpy(buf, "/", buflen);
}
+static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
+ size_t buflen)
+{
+ /* dummy_top doesn't have a kn associated */
+ if (cgrp->kn)
+ return kernfs_path(cgrp->kn, buf, buflen);
+ strlcpy(buf, "/", buflen);
+ return (buflen <= 2) ? NULL : buf;
+}
+
+static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
+{
+ /* dummy_top doesn't have a kn associated */
+ if (cgrp->kn)
+ pr_cont_kernfs_name(cgrp->kn);
+ else
+ pr_cont("/");
+}
+
+static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
+{
+ /* dummy_top doesn't have a kn associated */
+ if (cgrp->kn)
+ pr_cont_kernfs_path(cgrp->kn);
+ else
+ pr_cont("/");
+}
+
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-
-int cgroup_task_count(const struct cgroup *cgrp);
-
/*
* Control Group taskset, used to pass around set of tasks to cgroup_subsys
* methods.
@@ -551,22 +536,15 @@ int cgroup_task_count(const struct cgroup *cgrp);
struct cgroup_taskset;
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
-struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
- int subsys_id);
-int cgroup_taskset_size(struct cgroup_taskset *tset);
/**
* cgroup_taskset_for_each - iterate cgroup_taskset
* @task: the loop cursor
- * @skip_css: skip if task's css matches this, %NULL to iterate through all
* @tset: taskset to iterate
*/
-#define cgroup_taskset_for_each(task, skip_css, tset) \
+#define cgroup_taskset_for_each(task, tset) \
for ((task) = cgroup_taskset_first((tset)); (task); \
- (task) = cgroup_taskset_next((tset))) \
- if (!(skip_css) || \
- cgroup_taskset_cur_css((tset), \
- (skip_css)->ss->subsys_id) != (skip_css))
+ (task) = cgroup_taskset_next((tset)))
/*
* Control Group subsystem type.
@@ -591,7 +569,6 @@ struct cgroup_subsys {
struct task_struct *task);
void (*bind)(struct cgroup_subsys_state *root_css);
- int subsys_id;
int disabled;
int early_init;
@@ -610,27 +587,26 @@ struct cgroup_subsys {
bool broken_hierarchy;
bool warned_broken_hierarchy;
+ /* the following two fields are initialized automtically during boot */
+ int id;
#define MAX_CGROUP_TYPE_NAMELEN 32
const char *name;
/* link to parent, protected by cgroup_lock() */
struct cgroupfs_root *root;
- /* list of cftype_sets */
- struct list_head cftsets;
+ /*
+ * List of cftypes. Each entry is the first entry of an array
+ * terminated by zero length name.
+ */
+ struct list_head cfts;
- /* base cftypes, automatically [de]registered with subsys itself */
+ /* base cftypes, automatically registered with subsys itself */
struct cftype *base_cftypes;
- struct cftype_set base_cftset;
-
- /* should be defined only by modular subsystems */
- struct module *module;
};
-#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
+#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
#undef SUBSYS
/**
@@ -837,16 +813,11 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);
-int css_scan_tasks(struct cgroup_subsys_state *css,
- bool (*test)(struct task_struct *, void *),
- void (*process)(struct task_struct *, void *),
- void *data, struct ptr_heap *heap);
-
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
-struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
- struct cgroup_subsys *ss);
+struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss);
#else /* !CONFIG_CGROUPS */
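[Editor's note, not part of the patch: the name/path helpers declared above are thin kernfs wrappers that take a caller-supplied buffer and can be called from any context, instead of dereferencing an RCU-protected cgrp->name. The function below is a hypothetical sketch of their use.]

static void example_report_css(struct cgroup_subsys_state *css)
{
	char buf[128];	/* kept small; this lives on the stack */

	/* cgroup_path() returns NULL when the path does not fit in buf */
	if (cgroup_path(css->cgroup, buf, sizeof(buf)))
		pr_info("css belongs to cgroup %s\n", buf);
	else
		pr_info("css cgroup path longer than %zu bytes\n", sizeof(buf));
}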
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 7b99d717411d..768fe44e19f0 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,51 +3,51 @@
*
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/
-#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS)
+#if IS_ENABLED(CONFIG_CPUSETS)
SUBSYS(cpuset)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG)
+#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
SUBSYS(debug)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED)
-SUBSYS(cpu_cgroup)
+#if IS_ENABLED(CONFIG_CGROUP_SCHED)
+SUBSYS(cpu)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT)
+#if IS_ENABLED(CONFIG_CGROUP_CPUACCT)
SUBSYS(cpuacct)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_MEMCG)
-SUBSYS(mem_cgroup)
+#if IS_ENABLED(CONFIG_MEMCG)
+SUBSYS(memory)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
+#if IS_ENABLED(CONFIG_CGROUP_DEVICE)
SUBSYS(devices)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER)
+#if IS_ENABLED(CONFIG_CGROUP_FREEZER)
SUBSYS(freezer)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
SUBSYS(net_cls)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP)
+#if IS_ENABLED(CONFIG_BLK_CGROUP)
SUBSYS(blkio)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
-SUBSYS(perf)
+#if IS_ENABLED(CONFIG_CGROUP_PERF)
+SUBSYS(perf_event)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
SUBSYS(net_prio)
#endif
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB)
+#if IS_ENABLED(CONFIG_CGROUP_HUGETLB)
SUBSYS(hugetlb)
#endif
/*
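[Editor's note, not part of the patch: cgroup_subsys.h is an x-macro header, so with CONFIG_MEMCG enabled the SUBSYS() definitions quoted elsewhere in this series expand roughly as follows; other controllers are omitted for brevity.]

/* from include/linux/cgroup.h */
enum cgroup_subsys_id {
	/* ... */
	memory_cgrp_id,
	/* ... */
	CGROUP_SUBSYS_COUNT,
};
extern struct cgroup_subsys memory_cgrp_subsys;

/* from kernel/cgroup.c */
static struct cgroup_subsys *cgroup_subsys[] = {
	[memory_cgrp_id] = &memory_cgrp_subsys,
	/* ... */
};
static const char *cgroup_subsys_name[] = {
	[memory_cgrp_id] = "memory",
	/* ... */
};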
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 787bba3bf552..0129f89cf98d 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -49,7 +49,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
static inline bool hugetlb_cgroup_disabled(void)
{
- if (hugetlb_subsys.disabled)
+ if (hugetlb_cgrp_subsys.disabled)
return true;
return false;
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index abd0113b6620..eccfb4a4b379 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -162,7 +162,7 @@ extern int do_swap_account;
static inline bool mem_cgroup_disabled(void)
{
- if (mem_cgroup_subsys.disabled)
+ if (memory_cgrp_subsys.disabled)
return true;
return false;
}
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 9cf2d5ef38d9..c15d39456e14 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -34,7 +34,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
return 0;
rcu_read_lock();
- classid = container_of(task_css(p, net_cls_subsys_id),
+ classid = container_of(task_css(p, net_cls_cgrp_id),
struct cgroup_cls_state, css)->classid;
rcu_read_unlock();
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index dafc09f0fdbc..f2a9597ff53c 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -27,32 +27,17 @@ struct netprio_map {
void sock_update_netprioidx(struct sock *sk);
-#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO)
static inline u32 task_netprioidx(struct task_struct *p)
{
struct cgroup_subsys_state *css;
u32 idx;
rcu_read_lock();
- css = task_css(p, net_prio_subsys_id);
+ css = task_css(p, net_prio_cgrp_id);
idx = css->cgroup->id;
rcu_read_unlock();
return idx;
}
-#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO)
-static inline u32 task_netprioidx(struct task_struct *p)
-{
- struct cgroup_subsys_state *css;
- u32 idx = 0;
-
- rcu_read_lock();
- css = task_css(p, net_prio_subsys_id);
- if (css)
- idx = css->cgroup->id;
- rcu_read_unlock();
- return idx;
-}
-#endif
#else /* !CONFIG_CGROUP_NET_PRIO */
static inline u32 task_netprioidx(struct task_struct *p)
{
diff --git a/init/Kconfig b/init/Kconfig
index 009a797dd242..3f74784560a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -854,6 +854,7 @@ config NUMA_BALANCING
menuconfig CGROUPS
boolean "Control Group support"
+ select KERNFS
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 105f273b6f86..8ab800c7bac0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,21 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/backing-dev.h>
#include <linux/slab.h>
-#include <linux/magic.h>
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
-#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
-#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/atomic.h>
@@ -68,21 +66,21 @@
*/
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+
+/*
+ * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
+ * creation/removal and hierarchy changing operations including cgroup
+ * creation, removal, css association and controller rebinding. This outer
+ * lock is needed mainly to resolve the circular dependency between kernfs
+ * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
+ */
+static DEFINE_MUTEX(cgroup_tree_mutex);
+
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
- *
- * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
- * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
- * release_agent_path and so on. Modifying requires both cgroup_mutex and
- * cgroup_root_mutex. Readers can acquire either of the two. This is to
- * break the following locking order cycle.
- *
- * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
- * B. namespace_sem -> cgroup_mutex
- *
- * B happens only through cgroup_show_options() and using cgroup_root_mutex
- * breaks it.
*/
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
@@ -91,20 +89,17 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
static DEFINE_MUTEX(cgroup_mutex);
#endif
-static DEFINE_MUTEX(cgroup_root_mutex);
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
-#define cgroup_assert_mutex_or_rcu_locked() \
+#define cgroup_assert_mutexes_or_rcu_locked() \
rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_tree_mutex) || \
lockdep_is_held(&cgroup_mutex), \
- "cgroup_mutex or RCU read lock required");
-
-#ifdef CONFIG_LOCKDEP
-#define cgroup_assert_mutex_or_root_locked() \
- WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
- !lockdep_is_held(&cgroup_root_mutex)))
-#else
-#define cgroup_assert_mutex_or_root_locked() do { } while (0)
-#endif
+ "cgroup_[tree_]mutex or RCU read lock required");
/*
* cgroup destruction makes heavy use of work items and there can be a lot
@@ -120,17 +115,19 @@ static struct workqueue_struct *cgroup_destroy_wq;
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-/*
- * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated with the built in subsystems, and modular subsystems are
- * registered after that. The mutable section of this array is protected by
- * cgroup_mutex.
- */
-#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
+#undef SUBSYS
/*
* The dummy hierarchy, reserved for the subsystems that are otherwise
@@ -147,15 +144,9 @@ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
-/*
- * Hierarchy ID allocation and mapping. It follows the same exclusion
- * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
- * writes, either for reads.
- */
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
-static struct cgroup_name root_cgroup_name = { .name = "/" };
-
/*
* Assign a monotonically increasing serial number to cgroups. It
* guarantees cgroups with bigger numbers are newer than those with smaller
@@ -175,11 +166,13 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_base_files[];
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroupfs_root *root,
+ unsigned long added_mask, unsigned removed_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static int cgroup_file_release(struct inode *inode, struct file *file);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/**
@@ -197,8 +190,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
if (ss)
- return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
- lockdep_is_held(&cgroup_mutex));
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_tree_mutex) ||
+ lockdep_is_held(&cgroup_mutex));
else
return &cgrp->dummy_css;
}
@@ -209,6 +203,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
return test_bit(CGRP_DEAD, &cgrp->flags);
}
+struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = seq_cft(seq);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->dummy_css;
+}
+EXPORT_SYMBOL_GPL(seq_css);
+
/**
* cgroup_is_descendant - test ancestry
* @cgrp: the cgroup to be tested
@@ -227,7 +242,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
}
return false;
}
-EXPORT_SYMBOL_GPL(cgroup_is_descendant);
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -254,54 +268,23 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_tree_mutex) || \
lockdep_is_held(&cgroup_mutex)))) { } \
else
/**
- * for_each_subsys - iterate all loaded cgroup subsystems
+ * for_each_subsys - iterate all enabled cgroup subsystems
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- *
- * Iterates through all loaded subsystems. Should be called under
- * cgroup_mutex or cgroup_root_mutex.
*/
#define for_each_subsys(ss, ssid) \
- for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
- (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((ss) = cgroup_subsys[(ssid)])) { } \
- else
-
-/**
- * for_each_builtin_subsys - iterate all built-in cgroup subsystems
- * @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
- *
- * Bulit-in subsystems are always present and iteration itself doesn't
- * require any synchronization.
- */
-#define for_each_builtin_subsys(ss, i) \
- for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
- (((ss) = cgroup_subsys[i]) || true); (i)++)
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/* iterate across the active hierarchies */
#define for_each_active_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cfent *__d_cfe(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
- return __d_cfe(dentry)->type;
-}
-
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
@@ -358,11 +341,10 @@ static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link;
/*
- * css_set_lock protects the list of css_set objects, and the chain of
- * tasks off each css_set. Nests outside task->alloc_lock due to
- * css_task_iter_start().
+ * css_set_rwsem protects the list of css_set objects, and the chain of
+ * tasks off each css_set.
*/
-static DEFINE_RWLOCK(css_set_lock);
+static DECLARE_RWSEM(css_set_rwsem);
static int css_set_count;
/*
@@ -386,30 +368,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-/*
- * We don't maintain the lists running through each css_set to its task
- * until after the first call to css_task_iter_start(). This reduces the
- * fork()/exit() overhead for people who have cgroups compiled into their
- * kernel but not actually in use.
- */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cset, int taskexit)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
struct cgrp_cset_link *link, *tmp_link;
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cset->refcount, -1, 1))
- return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cset->refcount)) {
- write_unlock(&css_set_lock);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (!atomic_dec_and_test(&cset->refcount))
return;
- }
/* This css_set is dead. unlink it and release cgroup refcounts */
hash_del(&cset->hlist);
@@ -421,7 +387,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
list_del(&link->cset_link);
list_del(&link->cgrp_link);
- /* @cgrp can't go away while we're holding css_set_lock */
+ /* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +397,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
kfree(link);
}
- write_unlock(&css_set_lock);
kfree_rcu(cset, rcu_head);
}
+static void put_css_set(struct css_set *cset, bool taskexit)
+{
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset, taskexit);
+ up_write(&css_set_rwsem);
+}
+
/*
* refcounted get/put for css_set objects
*/
@@ -443,16 +423,6 @@ static inline void get_css_set(struct css_set *cset)
atomic_inc(&cset->refcount);
}
-static inline void put_css_set(struct css_set *cset)
-{
- __put_css_set(cset, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cset)
-{
- __put_css_set(cset, 1);
-}
-
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
@@ -652,11 +622,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
/* First see if we already have a cgroup group that matches
* the desired set */
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
if (cset)
return cset;
@@ -680,7 +650,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -698,14 +668,98 @@ static struct css_set *find_css_set(struct css_set *old_cset,
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
- write_unlock(&css_set_lock);
+ up_write(&css_set_rwsem);
return cset;
}
+static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+{
+ struct cgroup *top_cgrp = kf_root->kn->priv;
+
+ return top_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
+ GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroupfs_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroupfs_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroupfs_root *root)
+{
+ struct cgroup *cgrp = &root->top_cgroup;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ WARN_ON(rebind_subsystems(root, 0, root->subsys_mask));
+
+ /*
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
+ */
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex held.
+ * called with cgroup_mutex and css_set_rwsem held.
*/
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroupfs_root *root)
@@ -713,8 +767,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct css_set *cset;
struct cgroup *res = NULL;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- read_lock(&css_set_lock);
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
/*
* No need to lock the task - since we hold cgroup_mutex the
* task can't change groups, so the only thing that can happen
@@ -735,7 +790,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
}
}
}
- read_unlock(&css_set_lock);
+
BUG_ON(!res);
return res;
}
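[Editor's note, not part of the patch: a hedged sketch of the new locking rule at a call site. The wrapper function below is hypothetical, but the requirement to hold both cgroup_mutex and css_set_rwsem follows from the lockdep assertions added above.]

static struct cgroup *example_lookup_cgroup(struct task_struct *task,
					    struct cgroupfs_root *root)
{
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);
	down_read(&css_set_rwsem);	/* rwsem replaces the old css_set_lock rwlock */
	cgrp = task_cgroup_from_root(task, root);
	up_read(&css_set_rwsem);
	mutex_unlock(&cgroup_mutex);

	return cgrp;
}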
@@ -790,78 +845,71 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
-
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
-static const struct inode_operations cgroup_dir_inode_operations;
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
-static struct backing_dev_info cgroup_backing_dev_info = {
- .name = "cgroup",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
- struct inode *inode = new_inode(sb);
-
- if (inode) {
- inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
- }
- return inode;
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
}
-static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- struct cgroup_name *name;
+ umode_t mode = 0;
- name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
- if (!name)
- return NULL;
- strcpy(name->name, dentry->d_name.name);
- return name;
+ if (cft->mode)
+ return cft->mode;
+
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
+
+ if (cft->write_u64 || cft->write_s64 || cft->write_string ||
+ cft->trigger)
+ mode |= S_IWUSR;
+
+ return mode;
}
static void cgroup_free_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
- mutex_lock(&cgroup_mutex);
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
-
- /*
- * We get a ref to the parent's dentry, and put the ref when
- * this cgroup is being freed, so it's guaranteed that the
- * parent won't be destroyed before its children.
- */
- dput(cgrp->parent->dentry);
-
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup. This will free cgrp->root, if we are
- * holding the last reference to @sb.
- */
- deactivate_super(cgrp->root->sb);
-
+ atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
- simple_xattrs_free(&cgrp->xattrs);
-
- kfree(rcu_dereference_raw(cgrp->name));
- kfree(cgrp);
+ if (cgrp->parent) {
+ /*
+ * We get a ref to the parent, and put the ref when this
+ * cgroup is being freed, so it's guaranteed that the
+ * parent won't be destroyed before its children.
+ */
+ cgroup_put(cgrp->parent);
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is top cgroup's refcnt reaching zero, which
+ * indicates that the root should be released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
}
static void cgroup_free_rcu(struct rcu_head *head)
@@ -872,73 +920,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
-{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
-
- BUG_ON(!(cgroup_is_dead(cgrp)));
-
- /*
- * XXX: cgrp->id is only used to look up css's. As cgroup
- * and css's lifetimes will be decoupled, it should be made
- * per-subsystem and moved to css->id so that lookups are
- * successful until the target css is released.
- */
- mutex_lock(&cgroup_mutex);
- idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- mutex_unlock(&cgroup_mutex);
- cgrp->id = -1;
-
- call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
- } else {
- struct cfent *cfe = __d_cfe(dentry);
- struct cgroup *cgrp = dentry->d_parent->d_fsdata;
-
- WARN_ONCE(!list_empty(&cfe->node) &&
- cgrp != &cgrp->root->top_cgroup,
- "cfe still linked for %s\n", cfe->type->name);
- simple_xattrs_free(&cfe->xattrs);
- kfree(cfe);
- }
- iput(inode);
-}
-
-static void remove_dir(struct dentry *d)
+static void cgroup_get(struct cgroup *cgrp)
{
- struct dentry *parent = dget(d->d_parent);
-
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
+ atomic_inc(&cgrp->refcnt);
}
-static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_put(struct cgroup *cgrp)
{
- struct cfent *cfe;
-
- lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
- lockdep_assert_held(&cgroup_mutex);
+ if (!atomic_dec_and_test(&cgrp->refcnt))
+ return;
+ if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
+ return;
/*
- * If we're doing cleanup due to failure of cgroup_create(),
- * the corresponding @cfe may not exist.
+ * XXX: cgrp->id is only used to look up css's. As cgroup and
+ * css's lifetimes will be decoupled, it should be made
+ * per-subsystem and moved to css->id so that lookups are
+ * successful until the target css is released.
*/
- list_for_each_entry(cfe, &cgrp->files, node) {
- struct dentry *d = cfe->dentry;
+ mutex_lock(&cgroup_mutex);
+ idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ mutex_unlock(&cgroup_mutex);
+ cgrp->id = -1;
- if (cft && cfe->type != cft)
- continue;
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
+}
- dget(d);
- d_delete(d);
- simple_unlink(cgrp->dentry->d_inode, d);
- list_del_init(&cfe->node);
- dput(d);
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+ char name[CGROUP_FILE_NAME_MAX];
- break;
- }
+ lockdep_assert_held(&cgroup_tree_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
/**
@@ -952,81 +967,41 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
int i;
for_each_subsys(ss, i) {
- struct cftype_set *set;
+ struct cftype *cfts;
if (!test_bit(i, &subsys_mask))
continue;
- list_for_each_entry(set, &ss->cftsets, node)
- cgroup_addrm_files(cgrp, set->cfts, false);
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
}
}
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cgroup_d_remove_dir(struct dentry *dentry)
-{
- struct dentry *parent;
-
- parent = dentry->d_parent;
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
- remove_dir(dentry);
-}
-
-/*
- * Call with cgroup_mutex held. Drops reference counts on modules, including
- * any duplicate ones that parse_cgroupfs_options took. If this function
- * returns an error, no reference counts are touched.
- */
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long added_mask, unsigned removed_mask)
{
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_subsys *ss;
- unsigned long pinned = 0;
int i, ret;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
+ lockdep_assert_held(&cgroup_tree_mutex);
+ lockdep_assert_held(&cgroup_mutex);
/* Check that any added subsystems are currently free */
- for_each_subsys(ss, i) {
- if (!(added_mask & (1 << i)))
- continue;
-
- /* is the subsystem mounted elsewhere? */
- if (ss->root != &cgroup_dummy_root) {
- ret = -EBUSY;
- goto out_put;
- }
-
- /* pin the module */
- if (!try_module_get(ss->module)) {
- ret = -ENOENT;
- goto out_put;
- }
- pinned |= 1 << i;
- }
-
- /* subsys could be missing if unloaded between parsing and here */
- if (added_mask != pinned) {
- ret = -ENOENT;
- goto out_put;
- }
+ for_each_subsys(ss, i)
+ if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root)
+ return -EBUSY;
ret = cgroup_populate_dir(cgrp, added_mask);
if (ret)
- goto out_put;
+ return ret;
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
+ mutex_unlock(&cgroup_mutex);
cgroup_clear_dir(cgrp, removed_mask);
+ mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i) {
unsigned long bit = 1UL << i;
@@ -1059,35 +1034,21 @@ static int rebind_subsystems(struct cgroupfs_root *root,
RCU_INIT_POINTER(cgrp->subsys[i], NULL);
cgroup_subsys[i]->root = &cgroup_dummy_root;
-
- /* subsystem is now free - drop reference on module */
- module_put(ss->module);
root->subsys_mask &= ~bit;
}
}
- /*
- * Mark @root has finished binding subsystems. @root->subsys_mask
- * now matches the bound subsystems.
- */
- root->flags |= CGRP_ROOT_SUBSYS_BOUND;
-
+ kernfs_activate(cgrp->kn);
return 0;
-
-out_put:
- for_each_subsys(ss, i)
- if (pinned & (1 << i))
- module_put(ss->module);
- return ret;
}
-static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
{
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+ struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
int ssid;
- mutex_lock(&cgroup_root_mutex);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
@@ -1097,13 +1058,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_root_mutex);
return 0;
}
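[Editor's note, not part of the patch: release_agent_path is now covered by release_agent_path_lock, so a writer, which must already hold cgroup_mutex, would update it roughly as below, while readers may take either lock. The helper name is hypothetical.]

static void example_set_release_agent(struct cgroupfs_root *root,
				      const char *path)
{
	lockdep_assert_held(&cgroup_mutex);	/* writers need cgroup_mutex as well */

	spin_lock(&release_agent_path_lock);
	strlcpy(root->release_agent_path, path, sizeof(root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
}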
@@ -1115,9 +1079,6 @@ struct cgroup_sb_opts {
char *name;
/* User explicitly requested empty subsystem */
bool none;
-
- struct cgroupfs_root *new_root;
-
};
/*
@@ -1137,7 +1098,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
BUG_ON(!mutex_is_locked(&cgroup_mutex));
#ifdef CONFIG_CPUSETS
- mask = ~(1UL << cpuset_subsys_id);
+ mask = ~(1UL << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1242,13 +1203,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (opts->flags & CGRP_ROOT_NOPREFIX) {
- pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
- return -EINVAL;
- }
-
- if (opts->cpuset_clone_children) {
- pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+ if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+ opts->cpuset_clone_children || opts->release_agent ||
+ opts->name) {
+ pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
return -EINVAL;
}
}
@@ -1276,11 +1234,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return 0;
}
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
int ret = 0;
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
unsigned long added_mask, removed_mask;
@@ -1289,9 +1246,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
return -EINVAL;
}
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
+ mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1316,7 +1272,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
}
/* remounting is not allowed for populated hierarchies */
- if (root->number_of_cgroups > 1) {
+ if (!list_empty(&root->top_cgroup.children)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -1325,35 +1281,81 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
- if (opts.release_agent)
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
- mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
return ret;
}
-static const struct super_operations cgroup_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
-};
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
+
+static void cgroup_enable_task_cg_lists(void)
+{
+ struct task_struct *p, *g;
+
+ down_write(&css_set_rwsem);
+
+ if (use_task_css_set_links)
+ goto out_unlock;
+
+ use_task_css_set_links = true;
+
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ task_lock(p);
+
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING))
+ list_add(&p->cg_list, &task_css_set(p)->tasks);
+ spin_unlock_irq(&p->sighand->siglock);
+
+ task_unlock(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
+}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
+ atomic_set(&cgrp->refcnt, 1);
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
- INIT_LIST_HEAD(&cgrp->files);
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->dummy_css.cgroup = cgrp;
- simple_xattrs_init(&cgrp->xattrs);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1361,66 +1363,18 @@ static void init_cgroup_root(struct cgroupfs_root *root)
struct cgroup *cgrp = &root->top_cgroup;
INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
+ atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
- RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
}
-static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
-{
- int id;
-
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&cgroup_root_mutex);
-
- id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
- GFP_KERNEL);
- if (id < 0)
- return id;
-
- root->hierarchy_id = id;
- return 0;
-}
-
-static void cgroup_exit_root_id(struct cgroupfs_root *root)
-{
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&cgroup_root_mutex);
-
- if (root->hierarchy_id) {
- idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
- root->hierarchy_id = 0;
- }
-}
-
-static int cgroup_test_super(struct super_block *sb, void *data)
-{
- struct cgroup_sb_opts *opts = data;
- struct cgroupfs_root *root = sb->s_fs_info;
-
- /* If we asked for a name then it must match */
- if (opts->name && strcmp(opts->name, root->name))
- return 0;
-
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match
- */
- if ((opts->subsys_mask || opts->none)
- && (opts->subsys_mask != root->subsys_mask))
- return 0;
-
- return 1;
-}
-
static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
struct cgroupfs_root *root;
if (!opts->subsys_mask && !opts->none)
- return NULL;
+ return ERR_PTR(-EINVAL);
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
@@ -1428,15 +1382,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
init_cgroup_root(root);
- /*
- * We need to set @root->subsys_mask now so that @root can be
- * matched by cgroup_test_super() before it finishes
- * initialization; otherwise, competing mounts with the same
- * options may try to bind the same subsystems instead of waiting
- * for the first one leading to unexpected mount errors.
- * SUBSYS_BOUND will be set once actual binding is complete.
- */
- root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
@@ -1447,291 +1392,202 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
return root;
}
-static void cgroup_free_root(struct cgroupfs_root *root)
+static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask)
{
- if (root) {
- /* hierarhcy ID shoulid already have been released */
- WARN_ON_ONCE(root->hierarchy_id);
-
- idr_destroy(&root->cgroup_idr);
- kfree(root);
- }
-}
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->top_cgroup;
+ struct css_set *cset;
+ int i, ret;
-static int cgroup_set_super(struct super_block *sb, void *data)
-{
- int ret;
- struct cgroup_sb_opts *opts = data;
+ lockdep_assert_held(&cgroup_tree_mutex);
+ lockdep_assert_held(&cgroup_mutex);
- /* If we don't have a new root, we can't set up a new sb */
- if (!opts->new_root)
- return -EINVAL;
+ ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+ if (ret < 0)
+ goto out;
+ root_cgrp->id = ret;
- BUG_ON(!opts->subsys_mask && !opts->none);
+ /*
+ * We're accessing css_set_count without locking css_set_rwsem here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. The worst that can happen is that we
+	 * have some link structures left over.
+ */
+ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ if (ret)
+ goto out;
- ret = set_anon_super(sb, NULL);
+ /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
+ ret = cgroup_init_root_id(root, 2, 0);
if (ret)
- return ret;
+ goto out;
- sb->s_fs_info = opts->new_root;
- opts->new_root->sb = sb;
+ root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = root->kf_root->kn;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = CGROUP_SUPER_MAGIC;
- sb->s_op = &cgroup_ops;
+ ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (ret)
+ goto destroy_root;
- return 0;
-}
+ ret = rebind_subsystems(root, ss_mask, 0);
+ if (ret)
+ goto destroy_root;
-static int cgroup_get_rootdir(struct super_block *sb)
-{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- .d_delete = always_delete_dentry,
- };
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
- struct inode *inode =
- cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
+ /*
+ * Link the top cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ link_css_set(&tmp_links, cset, root_cgrp);
+ up_write(&css_set_rwsem);
- if (!inode)
- return -ENOMEM;
+ BUG_ON(!list_empty(&root_cgrp->children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &cgroup_dir_inode_operations;
- /* directories start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- sb->s_root = d_make_root(inode);
- if (!sb->s_root)
- return -ENOMEM;
- /* for everything else we want ->d_op set */
- sb->s_d_op = &cgroup_dops;
- return 0;
+ kernfs_activate(root_cgrp->kn);
+ ret = 0;
+ goto out;
+
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
}
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
- struct cgroup_sb_opts opts;
struct cgroupfs_root *root;
- int ret = 0;
- struct super_block *sb;
- struct cgroupfs_root *new_root;
- struct list_head tmp_links;
- struct inode *inode;
- const struct cred *cred;
-
- /* First find the desired set of subsystems */
- mutex_lock(&cgroup_mutex);
- ret = parse_cgroupfs_options(data, &opts);
- mutex_unlock(&cgroup_mutex);
- if (ret)
- goto out_err;
+ struct cgroup_sb_opts opts;
+ struct dentry *dentry;
+ int ret;
/*
- * Allocate a new cgroup root. We may not need it if we're
- * reusing an existing hierarchy.
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
*/
- new_root = cgroup_root_from_opts(&opts);
- if (IS_ERR(new_root)) {
- ret = PTR_ERR(new_root);
- goto out_err;
- }
- opts.new_root = new_root;
-
- /* Locate an existing or new sb for this hierarchy */
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- cgroup_free_root(opts.new_root);
- goto out_err;
- }
-
- root = sb->s_fs_info;
- BUG_ON(!root);
- if (root == opts.new_root) {
- /* We used the new root structure, so this is a new hierarchy */
- struct cgroup *root_cgrp = &root->top_cgroup;
- struct cgroupfs_root *existing_root;
- int i;
- struct css_set *cset;
-
- BUG_ON(sb->s_root != NULL);
-
- ret = cgroup_get_rootdir(sb);
- if (ret)
- goto drop_new_super;
- inode = sb->s_root->d_inode;
-
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
-
- ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
- if (ret < 0)
- goto unlock_drop;
- root_cgrp->id = ret;
-
- /* Check for name clashes with existing mounts */
- ret = -EBUSY;
- if (strlen(root->name))
- for_each_active_root(existing_root)
- if (!strcmp(existing_root->name, root->name))
- goto unlock_drop;
-
- /*
- * We're accessing css_set_count without locking
- * css_set_lock here, but that's OK - it can only be
- * increased by someone holding cgroup_lock, and
- * that's us. The worst that can happen is that we
- * have some link structures left over
- */
- ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
- if (ret)
- goto unlock_drop;
-
- /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
- ret = cgroup_init_root_id(root, 2, 0);
- if (ret)
- goto unlock_drop;
-
- sb->s_root->d_fsdata = root_cgrp;
- root_cgrp->dentry = sb->s_root;
-
- /*
- * We're inside get_sb() and will call lookup_one_len() to
- * create the root files, which doesn't work if SELinux is
- * in use. The following cred dancing somehow works around
- * it. See 2ce9738ba ("cgroupfs: use init_cred when
- * populating new cgroupfs mount") for more details.
- */
- cred = override_creds(&init_cred);
-
- ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
- if (ret)
- goto rm_base_files;
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+retry:
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
- ret = rebind_subsystems(root, root->subsys_mask, 0);
- if (ret)
- goto rm_base_files;
+ /* First find the desired set of subsystems */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
- revert_creds(cred);
+ /* look for a matching existing root */
+ for_each_active_root(root) {
+ bool name_match = false;
/*
- * There must be no failure case after here, since rebinding
- * takes care of subsystems' refcounts, which are explicitly
- * dropped in the failure exit path.
+ * If we asked for a name then it must match. Also, if
+	 * name matches but subsys_mask doesn't, we should fail.
+ * Remember whether name matched.
*/
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
- list_add(&root->root_list, &cgroup_roots);
- cgroup_root_count++;
-
- /* Link the top cgroup in this hierarchy into all
- * the css_set objects */
- write_lock(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
- link_css_set(&tmp_links, cset, root_cgrp);
- write_unlock(&css_set_lock);
-
- free_cgrp_cset_links(&tmp_links);
-
- BUG_ON(!list_empty(&root_cgrp->children));
- BUG_ON(root->number_of_cgroups != 1);
-
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
/*
- * We re-used an existing hierarchy - the new root (if
- * any) is not needed
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
*/
- cgroup_free_root(opts.new_root);
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
ret = -EINVAL;
- goto drop_new_super;
+ goto out_unlock;
} else {
pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
}
}
- }
- kfree(opts.release_agent);
- kfree(opts.name);
- return dget(sb->s_root);
-
- rm_base_files:
- free_cgrp_cset_links(&tmp_links);
- cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
- revert_creds(cred);
- unlock_drop:
- cgroup_exit_root_id(root);
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- drop_new_super:
- deactivate_locked_super(sb);
- out_err:
- kfree(opts.release_agent);
- kfree(opts.name);
- return ERR_PTR(ret);
-}
-
-static void cgroup_kill_sb(struct super_block *sb)
-{
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
- struct cgrp_cset_link *link, *tmp_link;
- int ret;
-
- BUG_ON(!root);
-
- BUG_ON(root->number_of_cgroups != 1);
- BUG_ON(!list_empty(&cgrp->children));
+ /*
+ * A root's lifetime is governed by its top cgroup. Zero
+	 * ref indicates that the root is being destroyed. Wait for
+ * destruction to complete so that the subsystems are free.
+	 * We could use a wait_queue for the wait, but this path is
+ * super cold. Let's just sleep for a bit and retry.
+ */
+ if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
+ kfree(opts.release_agent);
+ kfree(opts.name);
+ msleep(10);
+ goto retry;
+ }
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
+ ret = 0;
+ goto out_unlock;
+ }
- /* Rebind all subsystems back to the default hierarchy */
- if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
- ret = rebind_subsystems(root, 0, root->subsys_mask);
- /* Shouldn't be able to fail ... */
- BUG_ON(ret);
+ /* no such thing, create a new one */
+ root = cgroup_root_from_opts(&opts);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out_unlock;
}
- /*
- * Release all the links from cset_links to this hierarchy's
- * root cgroup
- */
- write_lock(&css_set_lock);
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
- list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
- list_del(&link->cset_link);
- list_del(&link->cgrp_link);
- kfree(link);
- }
- write_unlock(&css_set_lock);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
- }
+ kfree(opts.release_agent);
+ kfree(opts.name);
- cgroup_exit_root_id(root);
+ if (ret)
+ return ERR_PTR(ret);
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ dentry = kernfs_mount(fs_type, flags, root->kf_root);
+ if (IS_ERR(dentry))
+ cgroup_put(&root->top_cgroup);
+ return dentry;
+}
- simple_xattrs_free(&cgrp->xattrs);
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
- kill_litter_super(sb);
- cgroup_free_root(root);
+ cgroup_put(&root->top_cgroup);
+ kernfs_kill_sb(sb);
}
static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1599,6 @@ static struct file_system_type cgroup_fs_type = {
static struct kobject *cgroup_kobj;
/**
- * cgroup_path - generate the path of a cgroup
- * @cgrp: the cgroup in question
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
- *
- * We can't generate cgroup path using dentry->d_name, as accessing
- * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
- * inode's i_mutex, while on the other hand cgroup_path() can be called
- * with some irq-safe spinlocks held.
- */
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
-{
- int ret = -ENAMETOOLONG;
- char *start;
-
- if (!cgrp->parent) {
- if (strlcpy(buf, "/", buflen) >= buflen)
- return -ENAMETOOLONG;
- return 0;
- }
-
- start = buf + buflen - 1;
- *start = '\0';
-
- rcu_read_lock();
- do {
- const char *name = cgroup_name(cgrp);
- int len;
-
- len = strlen(name);
- if ((start -= len) < buf)
- goto out;
- memcpy(start, name, len);
-
- if (--start < buf)
- goto out;
- *start = '/';
-
- cgrp = cgrp->parent;
- } while (cgrp->parent);
- ret = 0;
- memmove(buf, start, buf + buflen - start);
-out:
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(cgroup_path);
-
-/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
* @buf: the buffer to write the path into
@@ -1804,31 +1609,32 @@ EXPORT_SYMBOL_GPL(cgroup_path);
* function grabs cgroup_mutex and shouldn't be used inside locks used by
* cgroup controller callbacks.
*
- * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
+ * Return value is the same as kernfs_path().
*/
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
struct cgroupfs_root *root;
struct cgroup *cgrp;
- int hierarchy_id = 1, ret = 0;
-
- if (buflen < 2)
- return -ENAMETOOLONG;
+ int hierarchy_id = 1;
+ char *path = NULL;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
if (root) {
cgrp = task_cgroup_from_root(task, root);
- ret = cgroup_path(cgrp, buf, buflen);
+ path = cgroup_path(cgrp, buf, buflen);
} else {
/* if no hierarchy exists, everyone is in "/" */
- memcpy(buf, "/", 2);
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
}
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
- return ret;
+ return path;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
@@ -1846,7 +1652,6 @@ struct cgroup_taskset {
struct flex_array *tc_array;
int tc_array_len;
int idx;
- struct cgroup *cur_cgrp;
};
/**
@@ -1861,11 +1666,9 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
tset->idx = 0;
return cgroup_taskset_next(tset);
} else {
- tset->cur_cgrp = tset->single.cgrp;
return tset->single.task;
}
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_first);
/**
* cgroup_taskset_next - iterate to the next task in taskset
@@ -1882,42 +1685,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
return NULL;
tc = flex_array_get(tset->tc_array, tset->idx++);
- tset->cur_cgrp = tc->cgrp;
return tc->task;
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_next);
/**
- * cgroup_taskset_cur_css - return the matching css for the current task
- * @tset: taskset of interest
- * @subsys_id: the ID of the target subsystem
- *
- * Return the css for the current (last returned) task of @tset for
- * subsystem specified by @subsys_id. This function must be preceded by
- * either cgroup_taskset_first() or cgroup_taskset_next().
- */
-struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
- int subsys_id)
-{
- return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
-
-/**
- * cgroup_taskset_size - return the number of tasks in taskset
- * @tset: taskset of interest
- */
-int cgroup_taskset_size(struct cgroup_taskset *tset)
-{
- return tset->tc_array ? tset->tc_array_len : 1;
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_size);
-
-
-/*
+/**
* cgroup_task_migrate - move a task from one cgroup to another.
+ * @old_cgrp: the cgroup @tsk is being migrated from
+ * @tsk: the task being migrated
+ * @new_cset: the new css_set @tsk is being attached to
*
- * Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
*/
static void cgroup_task_migrate(struct cgroup *old_cgrp,
struct task_struct *tsk,
@@ -1925,6 +1702,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
{
struct css_set *old_cset;
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
/*
* We are synchronized through threadgroup_lock() against PF_EXITING
* setting such that we can't race against cgroup_exit() changing the
@@ -1937,11 +1717,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
rcu_assign_pointer(tsk->cgroups, new_cset);
task_unlock(tsk);
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &new_cset->tasks);
- write_unlock(&css_set_lock);
+ list_move(&tsk->cg_list, &new_cset->tasks);
/*
* We just gained a reference on old_cset by taking it from the
@@ -1949,26 +1725,26 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
* we're safe to drop it here; it will be freed under RCU.
*/
set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
- put_css_set(old_cset);
+ put_css_set_locked(old_cset, false);
}
/**
* cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
* @cgrp: the cgroup to attach to
- * @tsk: the task or the leader of the threadgroup to be attached
+ * @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
* Call holding cgroup_mutex and the group_rwsem of the leader. Will take
* task_lock of @tsk or each thread in the threadgroup individually in turn.
*/
-static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
+static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
bool threadgroup)
{
- int retval, i, group_size;
+ int ret, i, group_size;
struct cgroupfs_root *root = cgrp->root;
struct cgroup_subsys_state *css, *failed_css = NULL;
/* threadgroup list cursor and array */
- struct task_struct *leader = tsk;
+ struct task_struct *task;
struct task_and_cgroup *tc;
struct flex_array *group;
struct cgroup_taskset tset = { };
@@ -1981,7 +1757,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* threads exit, this will just be an over-estimate.
*/
if (threadgroup)
- group_size = get_nr_threads(tsk);
+ group_size = get_nr_threads(leader);
else
group_size = 1;
/* flex_array supports very large thread-groups better than kmalloc. */
@@ -1989,8 +1765,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
- if (retval)
+ ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
+ if (ret)
goto out_free_group_list;
i = 0;
@@ -1999,18 +1775,20 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
+ down_read(&css_set_rwsem);
rcu_read_lock();
+ task = leader;
do {
struct task_and_cgroup ent;
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
goto next;
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
- ent.task = tsk;
- ent.cgrp = task_cgroup_from_root(tsk, root);
+ ent.task = task;
+ ent.cgrp = task_cgroup_from_root(task, root);
/* nothing to do if this task is already in the cgroup */
if (ent.cgrp == cgrp)
goto next;
@@ -2018,21 +1796,22 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* saying GFP_ATOMIC has no effect here because we did prealloc
* earlier, but it's good form to communicate our expectations.
*/
- retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
- BUG_ON(retval != 0);
+ ret = flex_array_put(group, i, &ent, GFP_ATOMIC);
+ BUG_ON(ret != 0);
i++;
next:
if (!threadgroup)
break;
- } while_each_thread(leader, tsk);
+ } while_each_thread(leader, task);
rcu_read_unlock();
+ up_read(&css_set_rwsem);
/* remember the number of threads in the array for later. */
group_size = i;
tset.tc_array = group;
tset.tc_array_len = group_size;
/* methods shouldn't be called if no task is actually migrating */
- retval = 0;
+ ret = 0;
if (!group_size)
goto out_free_group_list;
@@ -2041,8 +1820,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
*/
for_each_css(css, i, cgrp) {
if (css->ss->can_attach) {
- retval = css->ss->can_attach(css, &tset);
- if (retval) {
+ ret = css->ss->can_attach(css, &tset);
+ if (ret) {
failed_css = css;
goto out_cancel_attach;
}
@@ -2060,7 +1839,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
old_cset = task_css_set(tc->task);
tc->cset = find_css_set(old_cset, cgrp);
if (!tc->cset) {
- retval = -ENOMEM;
+ ret = -ENOMEM;
goto out_put_css_set_refs;
}
}
@@ -2070,10 +1849,12 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* proceed to move all tasks to the new cgroup. There are no
* failure cases after here, so this is the commit point.
*/
+ down_write(&css_set_rwsem);
for (i = 0; i < group_size; i++) {
tc = flex_array_get(group, i);
cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
}
+ up_write(&css_set_rwsem);
/* nothing is sensitive to fork() after this point. */
/*
@@ -2086,18 +1867,18 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/*
* step 5: success! and cleanup
*/
- retval = 0;
+ ret = 0;
out_put_css_set_refs:
- if (retval) {
+ if (ret) {
for (i = 0; i < group_size; i++) {
tc = flex_array_get(group, i);
if (!tc->cset)
break;
- put_css_set(tc->cset);
+ put_css_set(tc->cset, false);
}
}
out_cancel_attach:
- if (retval) {
+ if (ret) {
for_each_css(css, i, cgrp) {
if (css == failed_css)
break;
@@ -2107,7 +1888,7 @@ out_cancel_attach:
}
out_free_group_list:
flex_array_free(group);
- return retval;
+ return ret;
}
/*
@@ -2203,7 +1984,11 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
mutex_lock(&cgroup_mutex);
for_each_active_root(root) {
- struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
+ struct cgroup *from_cgrp;
+
+ down_read(&css_set_rwsem);
+ from_cgrp = task_cgroup_from_root(from, root);
+ up_read(&css_set_rwsem);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
@@ -2230,14 +2015,15 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css,
static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
struct cftype *cft, const char *buffer)
{
- BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
- if (strlen(buffer) >= PATH_MAX)
- return -EINVAL;
+ struct cgroupfs_root *root = css->cgroup->root;
+
+ BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
if (!cgroup_lock_live_group(css->cgroup))
return -ENODEV;
- mutex_lock(&cgroup_root_mutex);
- strcpy(css->cgroup->root->release_agent_path, buffer);
- mutex_unlock(&cgroup_root_mutex);
+ spin_lock(&release_agent_path_lock);
+ strlcpy(root->release_agent_path, buffer,
+ sizeof(root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -2262,32 +2048,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0;
}
-/* A buffer size big enough for numbers or short strings */
-#define CGROUP_LOCAL_BUFFER_SIZE 64
-
-static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
- size_t nbytes, loff_t *ppos)
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
{
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup_subsys_state *css = cfe->css;
- size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
- char *buf;
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of->kn->priv;
+ struct cgroup_subsys_state *css;
int ret;
- if (nbytes >= max_bytes)
- return -E2BIG;
-
- buf = kmalloc(nbytes + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, userbuf, nbytes)) {
- ret = -EFAULT;
- goto out_free;
- }
-
- buf[nbytes] = '\0';
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
if (cft->write_string) {
ret = cft->write_string(css, cft, strstrip(buf));
@@ -2306,53 +2083,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
} else {
ret = -EINVAL;
}
-out_free:
- kfree(buf);
+
return ret ?: nbytes;
}
-/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
-
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
- struct cftype *cft = seq_cft(seq);
-
- if (cft->seq_start) {
- return cft->seq_start(seq, ppos);
- } else {
- /*
- * The same behavior and code as single_open(). Returns
- * !NULL if pos is at the beginning; otherwise, NULL.
- */
- return NULL + !*ppos;
- }
+ return seq_cft(seq)->seq_start(seq, ppos);
}
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- struct cftype *cft = seq_cft(seq);
-
- if (cft->seq_next) {
- return cft->seq_next(seq, v, ppos);
- } else {
- /*
- * The same behavior and code as single_open(), always
- * terminate after the initial read.
- */
- ++*ppos;
- return NULL;
- }
+ return seq_cft(seq)->seq_next(seq, v, ppos);
}
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
- struct cftype *cft = seq_cft(seq);
-
- if (cft->seq_stop)
- cft->seq_stop(seq, v);
+ seq_cft(seq)->seq_stop(seq, v);
}
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2119,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
return 0;
}
-static struct seq_operations cgroup_seq_operations = {
- .start = cgroup_seqfile_start,
- .next = cgroup_seqfile_next,
- .stop = cgroup_seqfile_stop,
- .show = cgroup_seqfile_show,
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_show = cgroup_seqfile_show,
};
-static int cgroup_file_open(struct inode *inode, struct file *file)
-{
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
- struct cgroup_subsys_state *css;
- struct cgroup_open_file *of;
- int err;
-
- err = generic_file_open(inode, file);
- if (err)
- return err;
-
- /*
- * If the file belongs to a subsystem, pin the css. Will be
- * unpinned either on open failure or release. This ensures that
- * @css stays alive for all file operations.
- */
- rcu_read_lock();
- css = cgroup_css(cgrp, cft->ss);
- if (cft->ss && !css_tryget(css))
- css = NULL;
- rcu_read_unlock();
-
- if (!css)
- return -ENODEV;
-
- /*
- * @cfe->css is used by read/write/close to determine the
- * associated css. @file->private_data would be a better place but
- * that's already used by seqfile. Multiple accessors may use it
- * simultaneously which is okay as the association never changes.
- */
- WARN_ON_ONCE(cfe->css && cfe->css != css);
- cfe->css = css;
-
- of = __seq_open_private(file, &cgroup_seq_operations,
- sizeof(struct cgroup_open_file));
- if (of) {
- of->cfe = cfe;
- return 0;
- }
-
- if (css->ss)
- css_put(css);
- return -ENOMEM;
-}
-
-static int cgroup_file_release(struct inode *inode, struct file *file)
-{
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cgroup_subsys_state *css = cfe->css;
-
- if (css->ss)
- css_put(css);
- return seq_release_private(inode, file);
-}
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
+};
/*
* cgroup_rename - Only allow simple rename of directories in place.
*/
-static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
{
+ struct cgroup *cgrp = kn->priv;
int ret;
- struct cgroup_name *name, *old_name;
- struct cgroup *cgrp;
-
- /*
- * It's convinient to use parent dir's i_mutex to protected
- * cgrp->name.
- */
- lockdep_assert_held(&old_dir->i_mutex);
- if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
- if (new_dentry->d_inode)
- return -EEXIST;
- if (old_dir != new_dir)
+ if (kn->parent != new_parent)
return -EIO;
- cgrp = __d_cgrp(old_dentry);
-
/*
* This isn't a proper migration and its usefulness is very
* limited. Disallow if sane_behavior.
@@ -2469,218 +2155,29 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
if (cgroup_sane_behavior(cgrp))
return -EPERM;
- name = cgroup_alloc_name(new_dentry);
- if (!name)
- return -ENOMEM;
-
- ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
- if (ret) {
- kfree(name);
- return ret;
- }
-
- old_name = rcu_dereference_protected(cgrp->name, true);
- rcu_assign_pointer(cgrp->name, name);
-
- kfree_rcu(old_name, rcu_head);
- return 0;
-}
-
-static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
-{
- if (S_ISDIR(dentry->d_inode->i_mode))
- return &__d_cgrp(dentry)->xattrs;
- else
- return &__d_cfe(dentry)->xattrs;
-}
-
-static inline int xattr_enabled(struct dentry *dentry)
-{
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
- return root->flags & CGRP_ROOT_XATTR;
-}
-
-static bool is_valid_xattr(const char *name)
-{
- if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
- return true;
- return false;
-}
-
-static int cgroup_setxattr(struct dentry *dentry, const char *name,
- const void *val, size_t size, int flags)
-{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- if (!is_valid_xattr(name))
- return -EINVAL;
- return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
-}
-
-static int cgroup_removexattr(struct dentry *dentry, const char *name)
-{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- if (!is_valid_xattr(name))
- return -EINVAL;
- return simple_xattr_remove(__d_xattrs(dentry), name);
-}
-
-static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
- void *buf, size_t size)
-{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- if (!is_valid_xattr(name))
- return -EINVAL;
- return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
-}
-
-static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
-{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- return simple_xattr_list(__d_xattrs(dentry), buf, size);
-}
-
-static const struct file_operations cgroup_file_operations = {
- .read = seq_read,
- .write = cgroup_file_write,
- .llseek = generic_file_llseek,
- .open = cgroup_file_open,
- .release = cgroup_file_release,
-};
-
-static const struct inode_operations cgroup_file_inode_operations = {
- .setxattr = cgroup_setxattr,
- .getxattr = cgroup_getxattr,
- .listxattr = cgroup_listxattr,
- .removexattr = cgroup_removexattr,
-};
-
-static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = cgroup_mkdir,
- .rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
- .setxattr = cgroup_setxattr,
- .getxattr = cgroup_getxattr,
- .listxattr = cgroup_listxattr,
- .removexattr = cgroup_removexattr,
-};
-
-static int cgroup_create_file(struct dentry *dentry, umode_t mode,
- struct super_block *sb)
-{
- struct inode *inode;
-
- if (!dentry)
- return -ENOENT;
- if (dentry->d_inode)
- return -EEXIST;
-
- inode = cgroup_new_inode(mode, sb);
- if (!inode)
- return -ENOMEM;
-
- if (S_ISDIR(mode)) {
- inode->i_op = &cgroup_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
-
- /* start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- inc_nlink(dentry->d_parent->d_inode);
-
- /*
- * Control reaches here with cgroup_mutex held.
- * @inode->i_mutex should nest outside cgroup_mutex but we
- * want to populate it immediately without releasing
- * cgroup_mutex. As @inode isn't visible to anyone else
- * yet, trylock will always succeed without affecting
- * lockdep checks.
- */
- WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
- } else if (S_ISREG(mode)) {
- inode->i_size = 0;
- inode->i_fop = &cgroup_file_operations;
- inode->i_op = &cgroup_file_inode_operations;
- }
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
- return 0;
-}
-
-/**
- * cgroup_file_mode - deduce file mode of a control file
- * @cft: the control file in question
- *
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
- */
-static umode_t cgroup_file_mode(const struct cftype *cft)
-{
- umode_t mode = 0;
-
- if (cft->mode)
- return cft->mode;
-
- if (cft->read_u64 || cft->read_s64 || cft->seq_show)
- mode |= S_IRUGO;
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
- if (cft->write_u64 || cft->write_s64 || cft->write_string ||
- cft->trigger)
- mode |= S_IWUSR;
+ ret = kernfs_rename(kn, new_parent, new_name_str);
- return mode;
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
+ return ret;
}
static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
- struct dentry *dir = cgrp->dentry;
- struct cgroup *parent = __d_cgrp(dir);
- struct dentry *dentry;
- struct cfent *cfe;
- int error;
- umode_t mode;
- char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
-
- if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
- strcpy(name, cft->ss->name);
- strcat(name, ".");
- }
- strcat(name, cft->name);
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
- BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
-
- cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
- if (!cfe)
- return -ENOMEM;
-
- dentry = lookup_one_len(name, dir, strlen(name));
- if (IS_ERR(dentry)) {
- error = PTR_ERR(dentry);
- goto out;
- }
-
- cfe->type = (void *)cft;
- cfe->dentry = dentry;
- dentry->d_fsdata = cfe;
- simple_xattrs_init(&cfe->xattrs);
-
- mode = cgroup_file_mode(cft);
- error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
- if (!error) {
- list_add_tail(&cfe->node, &parent->files);
- cfe = NULL;
- }
- dput(dentry);
-out:
- kfree(cfe);
- return error;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ NULL, false, key);
+ return PTR_ERR_OR_ZERO(kn);
}
/**
@@ -2700,8 +2197,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
struct cftype *cft;
int ret;
- lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
- lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&cgroup_tree_mutex);
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
@@ -2726,44 +2222,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
return 0;
}
-static void cgroup_cfts_prepare(void)
- __acquires(&cgroup_mutex)
-{
- /*
- * Thanks to the entanglement with vfs inode locking, we can't walk
- * the existing cgroups under cgroup_mutex and create files.
- * Instead, we use css_for_each_descendant_pre() and drop RCU read
- * lock before calling cgroup_addrm_files().
- */
- mutex_lock(&cgroup_mutex);
-}
-
-static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
- __releases(&cgroup_mutex)
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->top_cgroup;
- struct super_block *sb = ss->root->sb;
- struct dentry *prev = NULL;
- struct inode *inode;
struct cgroup_subsys_state *css;
- u64 update_before;
int ret = 0;
- /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
- if (!cfts || ss->root == &cgroup_dummy_root ||
- !atomic_inc_not_zero(&sb->s_active)) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
+ lockdep_assert_held(&cgroup_tree_mutex);
- /*
- * All cgroups which are created after we drop cgroup_mutex will
- * have the updated set of files, so we only need to update the
- * cgroups created before the current @cgroup_serial_nr_next.
- */
- update_before = cgroup_serial_nr_next;
+ /* don't bother if @ss isn't attached */
+ if (ss->root == &cgroup_dummy_root)
+ return 0;
/* add/rm files for all cgroups created before */
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2243,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
if (cgroup_is_dead(cgrp))
continue;
- inode = cgrp->dentry->d_inode;
- dget(cgrp->dentry);
- dput(prev);
- prev = cgrp->dentry;
-
- mutex_unlock(&cgroup_mutex);
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
- ret = cgroup_addrm_files(cgrp, cfts, is_add);
- mutex_unlock(&inode->i_mutex);
+ ret = cgroup_addrm_files(cgrp, cfts, is_add);
if (ret)
break;
}
- mutex_unlock(&cgroup_mutex);
- dput(prev);
- deactivate_super(sb);
+
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
return ret;
}
-/**
- * cgroup_add_cftypes - add an array of cftypes to a subsystem
- * @ss: target cgroup subsystem
- * @cfts: zero-length name terminated array of cftypes
- *
- * Register @cfts to @ss. Files described by @cfts are created for all
- * existing cgroups to which @ss is attached and all future cgroups will
- * have them too. This function can be called anytime whether @ss is
- * attached or not.
- *
- * Returns 0 on successful registration, -errno on failure. Note that this
- * function currently returns 0 as long as @cfts registration is successful
- * even if some file creation attempts on existing cgroups fail.
- */
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+static void cgroup_exit_cftypes(struct cftype *cfts)
{
- struct cftype_set *set;
struct cftype *cft;
- int ret;
- set = kzalloc(sizeof(*set), GFP_KERNEL);
- if (!set)
- return -ENOMEM;
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
+ }
+}
+
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
- for (cft = cfts; cft->name[0] != '\0'; cft++)
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ cgroup_exit_cftypes(cfts);
+ return -ENOMEM;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
cft->ss = ss;
+ }
- cgroup_cfts_prepare();
- set->cfts = cfts;
- list_add_tail(&set->node, &ss->cftsets);
- ret = cgroup_cfts_commit(cfts, true);
- if (ret)
- cgroup_rm_cftypes(cfts);
- return ret;
+ return 0;
+}
+
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+ lockdep_assert_held(&cgroup_tree_mutex);
+
+ if (!cfts || !cfts[0].ss)
+ return -ENOENT;
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
+ return 0;
}
-EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
/**
* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2326,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
- struct cftype_set *set;
+ int ret;
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
+ mutex_lock(&cgroup_tree_mutex);
+ ret = cgroup_rm_cftypes_locked(cfts);
+ mutex_unlock(&cgroup_tree_mutex);
+ return ret;
+}
- cgroup_cfts_prepare();
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ int ret;
- list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
- if (set->cfts == cfts) {
- list_del(&set->node);
- kfree(set);
- cgroup_cfts_commit(cfts, false);
- return 0;
- }
- }
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cgroup_tree_mutex);
- cgroup_cfts_commit(NULL, false);
- return -ENOENT;
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
+ if (ret)
+ cgroup_rm_cftypes_locked(cfts);
+
+ mutex_unlock(&cgroup_tree_mutex);
+ return ret;
}
/**
@@ -2868,57 +2376,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
*
* Return the number of tasks in the cgroup.
*/
-int cgroup_task_count(const struct cgroup *cgrp)
+static int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
return count;
}
-/*
- * To reduce the fork() overhead for systems that are not actually using
- * their cgroups capability, we don't maintain the lists running through
- * each css_set to its tasks until we see the list actually used - in other
- * words after the first call to css_task_iter_start().
- */
-static void cgroup_enable_task_cg_lists(void)
-{
- struct task_struct *p, *g;
- write_lock(&css_set_lock);
- use_task_css_set_links = 1;
- /*
- * We need tasklist_lock because RCU is not safe against
- * while_each_thread(). Besides, a forking task that has passed
- * cgroup_post_fork() without seeing use_task_css_set_links = 1
- * is not guaranteed to have its child immediately visible in the
- * tasklist if we walk through it with RCU.
- */
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- task_lock(p);
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- * Do it while holding siglock so that we don't end up
- * racing against cgroup_exit().
- */
- spin_lock_irq(&p->sighand->siglock);
- if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
- list_add(&p->cg_list, &task_css_set(p)->tasks);
- spin_unlock_irq(&p->sighand->siglock);
-
- task_unlock(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
- write_unlock(&css_set_lock);
-}
-
/**
* css_next_child - find the next child of a given css
* @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2406,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
struct cgroup *cgrp = parent_css->cgroup;
struct cgroup *next;
- cgroup_assert_mutex_or_rcu_locked();
+ cgroup_assert_mutexes_or_rcu_locked();
/*
* @pos could already have been removed. Once a cgroup is removed,
@@ -2973,7 +2442,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
return cgroup_css(next, parent_css->ss);
}
-EXPORT_SYMBOL_GPL(css_next_child);
/**
* css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2463,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
{
struct cgroup_subsys_state *next;
- cgroup_assert_mutex_or_rcu_locked();
+ cgroup_assert_mutexes_or_rcu_locked();
/* if first iteration, visit @root */
if (!pos)
@@ -3016,7 +2484,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
return NULL;
}
-EXPORT_SYMBOL_GPL(css_next_descendant_pre);
/**
* css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2503,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last, *tmp;
- cgroup_assert_mutex_or_rcu_locked();
+ cgroup_assert_mutexes_or_rcu_locked();
do {
last = pos;
@@ -3048,7 +2515,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
return last;
}
-EXPORT_SYMBOL_GPL(css_rightmost_descendant);
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2550,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
{
struct cgroup_subsys_state *next;
- cgroup_assert_mutex_or_rcu_locked();
+ cgroup_assert_mutexes_or_rcu_locked();
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
@@ -3102,7 +2568,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
/* no sibling left, visit parent */
return css_parent(pos);
}
-EXPORT_SYMBOL_GPL(css_next_descendant_post);
/**
* css_advance_task_iter - advance a task itererator to the next css_set
@@ -3146,17 +2611,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
- __acquires(css_set_lock)
+ __acquires(css_set_rwsem)
{
- /*
- * The first time anyone tries to iterate across a css, we need to
- * enable the list linking each css_set to its tasks, and fix up
- * all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
it->origin_css = css;
it->cset_link = &css->cgroup->cset_links;
@@ -3204,180 +2664,9 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
* Finish task iteration started by css_task_iter_start().
*/
void css_task_iter_end(struct css_task_iter *it)
- __releases(css_set_lock)
-{
- read_unlock(&css_set_lock);
-}
-
-static inline int started_after_time(struct task_struct *t1,
- struct timespec *time,
- struct task_struct *t2)
-{
- int start_diff = timespec_compare(&t1->start_time, time);
- if (start_diff > 0) {
- return 1;
- } else if (start_diff < 0) {
- return 0;
- } else {
- /*
- * Arbitrarily, if two processes started at the same
- * time, we'll say that the lower pointer value
- * started first. Note that t2 may have exited by now
- * so this may not be a valid pointer any longer, but
- * that's fine - it still serves to distinguish
- * between two tasks started (effectively) simultaneously.
- */
- return t1 > t2;
- }
-}
-
-/*
- * This function is a callback from heap_insert() and is used to order
- * the heap.
- * In this case we order the heap in descending task start time.
- */
-static inline int started_after(void *p1, void *p2)
-{
- struct task_struct *t1 = p1;
- struct task_struct *t2 = p2;
- return started_after_time(t1, &t2->start_time, t2);
-}
-
-/**
- * css_scan_tasks - iterate though all the tasks in a css
- * @css: the css to iterate tasks of
- * @test: optional test callback
- * @process: process callback
- * @data: data passed to @test and @process
- * @heap: optional pre-allocated heap used for task iteration
- *
- * Iterate through all the tasks in @css, calling @test for each, and if it
- * returns %true, call @process for it also.
- *
- * @test may be NULL, meaning always true (select all tasks), which
- * effectively duplicates css_task_iter_{start,next,end}() but does not
- * lock css_set_lock for the call to @process.
- *
- * It is guaranteed that @process will act on every task that is a member
- * of @css for the duration of this call. This function may or may not
- * call @process for tasks that exit or move to a different css during the
- * call, or are forked or move into the css during the call.
- *
- * Note that @test may be called with locks held, and may in some
- * situations be called multiple times for the same task, so it should be
- * cheap.
- *
- * If @heap is non-NULL, a heap has been pre-allocated and will be used for
- * heap operations (and its "gt" member will be overwritten), else a
- * temporary heap will be used (allocation of which may cause this function
- * to fail).
- */
-int css_scan_tasks(struct cgroup_subsys_state *css,
- bool (*test)(struct task_struct *, void *),
- void (*process)(struct task_struct *, void *),
- void *data, struct ptr_heap *heap)
-{
- int retval, i;
- struct css_task_iter it;
- struct task_struct *p, *dropped;
- /* Never dereference latest_task, since it's not refcounted */
- struct task_struct *latest_task = NULL;
- struct ptr_heap tmp_heap;
- struct timespec latest_time = { 0, 0 };
-
- if (heap) {
- /* The caller supplied our heap and pre-allocated its memory */
- heap->gt = &started_after;
- } else {
- /* We need to allocate our own heap memory */
- heap = &tmp_heap;
- retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
- if (retval)
- /* cannot allocate the heap */
- return retval;
- }
-
- again:
- /*
- * Scan tasks in the css, using the @test callback to determine
- * which are of interest, and invoking @process callback on the
- * ones which need an update. Since we don't want to hold any
- * locks during the task updates, gather tasks to be processed in a
- * heap structure. The heap is sorted by descending task start
- * time. If the statically-sized heap fills up, we overflow tasks
- * that started later, and in future iterations only consider tasks
- * that started after the latest task in the previous pass. This
- * guarantees forward progress and that we don't miss any tasks.
- */
- heap->size = 0;
- css_task_iter_start(css, &it);
- while ((p = css_task_iter_next(&it))) {
- /*
- * Only affect tasks that qualify per the caller's callback,
- * if he provided one
- */
- if (test && !test(p, data))
- continue;
- /*
- * Only process tasks that started after the last task
- * we processed
- */
- if (!started_after_time(p, &latest_time, latest_task))
- continue;
- dropped = heap_insert(heap, p);
- if (dropped == NULL) {
- /*
- * The new task was inserted; the heap wasn't
- * previously full
- */
- get_task_struct(p);
- } else if (dropped != p) {
- /*
- * The new task was inserted, and pushed out a
- * different task
- */
- get_task_struct(p);
- put_task_struct(dropped);
- }
- /*
- * Else the new task was newer than anything already in
- * the heap and wasn't inserted
- */
- }
- css_task_iter_end(&it);
-
- if (heap->size) {
- for (i = 0; i < heap->size; i++) {
- struct task_struct *q = heap->ptrs[i];
- if (i == 0) {
- latest_time = q->start_time;
- latest_task = q;
- }
- /* Process the task per the caller's callback */
- process(q, data);
- put_task_struct(q);
- }
- /*
- * If we had to process any tasks at all, scan again
- * in case some of them were in the middle of forking
- * children that didn't get processed.
- * Not the most efficient way to do it, but it avoids
- * having to take callback_mutex in the fork path
- */
- goto again;
- }
- if (heap == &tmp_heap)
- heap_free(&tmp_heap);
- return 0;
-}
-
-static void cgroup_transfer_one_task(struct task_struct *task, void *data)
+ __releases(css_set_rwsem)
{
- struct cgroup *new_cgroup = data;
-
- mutex_lock(&cgroup_mutex);
- cgroup_attach_task(new_cgroup, task, false);
- mutex_unlock(&cgroup_mutex);
+ up_read(&css_set_rwsem);
}
/**
@@ -3387,8 +2676,26 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data)
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
- return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
- to, NULL);
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret = 0;
+
+ do {
+ css_task_iter_start(&from->dummy_css, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_attach_task(to, task, false);
+ mutex_unlock(&cgroup_mutex);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+
+ return ret;
}
/*
@@ -3687,21 +2994,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
- int ret = -EINVAL;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
struct css_task_iter it;
struct task_struct *tsk;
+	/* it should be a kernfs_node belonging to cgroupfs and be a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
/*
- * Validate dentry by checking the superblock operations,
- * and make sure it's a directory.
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
*/
- if (dentry->d_sb->s_op != &cgroup_ops ||
- !S_ISDIR(dentry->d_inode->i_mode))
- goto err;
-
- ret = 0;
- cgrp = dentry->d_fsdata;
+ rcu_read_lock();
+ cgrp = rcu_dereference(kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
css_task_iter_start(&cgrp->dummy_css, &it);
while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3043,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
css_task_iter_end(&it);
-err:
- return ret;
+ mutex_unlock(&cgroup_mutex);
+ return 0;
}
@@ -3745,7 +3062,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup_open_file *of = s->private;
+ struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +3117,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup_open_file *of = s->private;
+ struct kernfs_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
if (l)
@@ -3811,7 +3128,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup_open_file *of = s->private;
+ struct kernfs_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
@@ -3861,23 +3178,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
return 0;
}
-/*
- * When dput() is called asynchronously, if umount has been done and
- * then deactivate_super() in cgroup_free_fn() kills the superblock,
- * there's a small window that vfs will see the root dentry with non-zero
- * refcnt and trigger BUG().
- *
- * That's why we hold a reference before dput() and drop it right after.
- */
-static void cgroup_dput(struct cgroup *cgrp)
-{
- struct super_block *sb = cgrp->root->sb;
-
- atomic_inc(&sb->s_active);
- dput(cgrp->dentry);
- deactivate_super(sb);
-}
-
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -3944,7 +3244,7 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
- .max_write_len = PATH_MAX,
+ .max_write_len = PATH_MAX - 1,
},
{ } /* terminate */
};
@@ -3963,13 +3263,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
/* process cftsets of each subsystem */
for_each_subsys(ss, i) {
- struct cftype_set *set;
+ struct cftype *cfts;
if (!test_bit(i, &subsys_mask))
continue;
- list_for_each_entry(set, &ss->cftsets, node) {
- ret = cgroup_addrm_files(cgrp, set->cfts, true);
+ list_for_each_entry(cfts, &ss->cfts, node) {
+ ret = cgroup_addrm_files(cgrp, cfts, true);
if (ret < 0)
goto err;
}
@@ -4012,7 +3312,7 @@ static void css_free_work_fn(struct work_struct *work)
css_put(css->parent);
css->ss->css_free(css);
- cgroup_dput(cgrp);
+ cgroup_put(cgrp);
}
static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3320,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
struct cgroup_subsys_state *css =
container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
- /*
- * css holds an extra ref to @cgrp->dentry which is put on the last
- * css_put(). dput() requires process context which we don't have.
- */
INIT_WORK(&css->destroy_work, css_free_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
}
@@ -4033,7 +3329,7 @@ static void css_release(struct percpu_ref *ref)
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
- rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
+ rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL);
call_rcu(&css->rcu_head, css_free_rcu_fn);
}
@@ -4058,6 +3354,7 @@ static int online_css(struct cgroup_subsys_state *css)
struct cgroup_subsys *ss = css->ss;
int ret = 0;
+ lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
if (ss->css_online)
@@ -4065,7 +3362,7 @@ static int online_css(struct cgroup_subsys_state *css)
if (!ret) {
css->flags |= CSS_ONLINE;
css->cgroup->nr_css++;
- rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
}
return ret;
}
@@ -4075,6 +3372,7 @@ static void offline_css(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
+ lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3383,7 @@ static void offline_css(struct cgroup_subsys_state *css)
css->flags &= ~CSS_ONLINE;
css->cgroup->nr_css--;
- RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
}
/**
@@ -4103,7 +3401,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
struct cgroup_subsys_state *css;
int err;
- lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4116,7 +3413,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
init_css(css, ss, cgrp);
- err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
if (err)
goto err_free;
@@ -4124,7 +3421,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
if (err)
goto err_free;
- dget(cgrp->dentry);
+ cgroup_get(cgrp);
css_get(css->parent);
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
@@ -4144,35 +3441,27 @@ err_free:
return err;
}
-/*
+/**
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
- *
- * Must be called with the mutex on the parent inode held
+ * @name: name of the new cgroup
+ * @mode: mode to set on new cgroup
*/
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- umode_t mode)
+static long cgroup_create(struct cgroup *parent, const char *name,
+ umode_t mode)
{
struct cgroup *cgrp;
- struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
int ssid, err;
struct cgroup_subsys *ss;
- struct super_block *sb = root->sb;
+ struct kernfs_node *kn;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;
- name = cgroup_alloc_name(dentry);
- if (!name) {
- err = -ENOMEM;
- goto err_free_cgrp;
- }
- rcu_assign_pointer(cgrp->name, name);
+ mutex_lock(&cgroup_tree_mutex);
/*
* Only live parents can have children. Note that the liveliness
@@ -4183,7 +3472,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
- goto err_free_name;
+ goto err_unlock_tree;
}
/*
@@ -4196,18 +3485,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_unlock;
}
- /* Grab a reference on the superblock so the hierarchy doesn't
- * get deleted on unmount if there are child cgroups. This
- * can be done outside cgroup_mutex, since the sb can't
- * disappear while someone has an open control file on the
- * fs */
- atomic_inc(&sb->s_active);
-
init_cgroup_housekeeping(cgrp);
- dentry->d_fsdata = cgrp;
- cgrp->dentry = dentry;
-
cgrp->parent = parent;
cgrp->dummy_css.parent = &parent->dummy_css;
cgrp->root = parent->root;
@@ -4218,24 +3497,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ err = PTR_ERR(kn);
+ goto err_free_id;
+ }
+ cgrp->kn = kn;
+
/*
- * Create directory. cgroup_create_file() returns with the new
- * directory locked on success so that it can be populated without
- * dropping cgroup_mutex.
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
*/
- err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
- if (err < 0)
- goto err_free_id;
- lockdep_assert_held(&dentry->d_inode->i_mutex);
+ kernfs_get(kn);
cgrp->serial_nr = cgroup_serial_nr_next++;
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
- root->number_of_cgroups++;
-
- /* hold a ref to the parent's dentry */
- dget(parent->dentry);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get(parent);
/*
* @cgrp is now fully operational. If something fails after this
@@ -4256,36 +3537,35 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
}
}
+ kernfs_activate(kn);
+
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
return 0;
err_free_id:
idr_remove(&root->cgroup_idr, cgrp->id);
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
err_unlock:
mutex_unlock(&cgroup_mutex);
-err_free_name:
- kfree(rcu_dereference_raw(cgrp->name));
-err_free_cgrp:
+err_unlock_tree:
+ mutex_unlock(&cgroup_tree_mutex);
kfree(cgrp);
return err;
err_destroy:
cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
return err;
}
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
{
- struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+ struct cgroup *parent = parent_kn->priv;
- /* the vfs holds inode->i_mutex already */
- return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+ return cgroup_create(parent, name, mode);
}
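The kernfs side of cgroup_create() follows the lifecycle sketched below; the kernfs_* calls are the ones used above, while the wrapper and its mode are illustrative. Per the comment in cgroup_create(), the extra reference taken with kernfs_get() is dropped later in cgroup_free_fn() so that cgrp->kn stays accessible even after kernfs_remove().

/* sketch only */
static struct kernfs_node *example_make_dir(struct kernfs_node *parent,
					    const char *name, void *priv)
{
	struct kernfs_node *kn;

	kn = kernfs_create_dir(parent, name, 0755, priv);
	if (IS_ERR(kn))
		return kn;

	kernfs_get(kn);		/* extra ref, dropped by the owner later */
	kernfs_activate(kn);	/* make the node visible to userland */
	return kn;
}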
/*
@@ -4298,6 +3578,7 @@ static void css_killed_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
+ mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
/*
@@ -4315,6 +3596,7 @@ static void css_killed_work_fn(struct work_struct *work)
cgroup_destroy_css_killed(cgrp);
mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
/*
* Put the css refs from kill_css(). Each css holds an extra
@@ -4347,7 +3629,11 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
*/
static void kill_css(struct cgroup_subsys_state *css)
{
- cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
+ /*
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
+ */
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
/*
* Killing would put the base ref, but we need to keep it alive
@@ -4395,22 +3681,21 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
- struct dentry *d = cgrp->dentry;
- struct cgroup_subsys_state *css;
struct cgroup *child;
+ struct cgroup_subsys_state *css;
bool empty;
int ssid;
- lockdep_assert_held(&d->d_inode->i_mutex);
+ lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
/*
- * css_set_lock synchronizes access to ->cset_links and prevents
- * @cgrp from being removed while __put_css_set() is in progress.
+ * css_set_rwsem synchronizes access to ->cset_links and prevents
+ * @cgrp from being removed while put_css_set() is in progress.
*/
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
empty = list_empty(&cgrp->cset_links);
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
if (!empty)
return -EBUSY;
@@ -4433,10 +3718,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Initiate massacre of all css's. cgroup_destroy_css_killed()
* will be invoked to perform the rest of destruction once the
- * percpu refs of all css's are confirmed to be killed.
+ * percpu refs of all css's are confirmed to be killed. This
+ * involves removing the subsystem's files, so drop cgroup_mutex around it.
*/
+ mutex_unlock(&cgroup_mutex);
for_each_css(css, ssid, cgrp)
kill_css(css);
+ mutex_lock(&cgroup_mutex);
/*
* Mark @cgrp dead. This prevents further task migration and child
@@ -4462,14 +3750,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
if (!cgrp->nr_css)
cgroup_destroy_css_killed(cgrp);
+ /* remove @cgrp directory along with the base files */
+ mutex_unlock(&cgroup_mutex);
+
/*
- * Clear the base files and remove @cgrp directory. The removal
- * puts the base ref but we aren't quite done with @cgrp yet, so
- * hold onto it.
+ * There are two control paths which try to determine cgroup from
+ * dentry without going through kernfs - cgroupstats_build() and
+ * css_tryget_from_dir(). Those are supported by RCU protecting
+ * clearing of cgrp->kn->priv backpointer, which should happen
+ * after all files under it have been removed.
*/
- cgroup_addrm_files(cgrp, cgroup_base_files, false);
- dget(d);
- cgroup_d_remove_dir(d);
+ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+
+ mutex_lock(&cgroup_mutex);
return 0;
};
@@ -4486,59 +3780,69 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
static void cgroup_destroy_css_killed(struct cgroup *cgrp)
{
struct cgroup *parent = cgrp->parent;
- struct dentry *d = cgrp->dentry;
+ lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
/* delete this cgroup from parent->children */
list_del_rcu(&cgrp->sibling);
- dput(d);
+ cgroup_put(cgrp);
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
}
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+static int cgroup_rmdir(struct kernfs_node *kn)
{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = cgroup_destroy_locked(dentry->d_fsdata);
- mutex_unlock(&cgroup_mutex);
+ struct cgroup *cgrp = kn->priv;
+ int ret = 0;
- return ret;
-}
+ /*
+ * This is self-destruction but @kn can't be removed while this
+ * callback is in progress. Let's break active protection. Once
+ * the protection is broken, @cgrp can be destroyed at any point.
+ * Pin it so that it stays accessible.
+ */
+ cgroup_get(cgrp);
+ kernfs_break_active_protection(kn);
-static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
-{
- INIT_LIST_HEAD(&ss->cftsets);
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
/*
- * base_cftset is embedded in subsys itself, no need to worry about
- * deregistration.
+ * @cgrp might already have been destroyed while we're trying to
+ * grab the mutexes.
*/
- if (ss->base_cftypes) {
- struct cftype *cft;
+ if (!cgroup_is_dead(cgrp))
+ ret = cgroup_destroy_locked(cgrp);
- for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
- cft->ss = ss;
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
- ss->base_cftset.cfts = ss->base_cftypes;
- list_add_tail(&ss->base_cftset.node, &ss->cftsets);
- }
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
+ return ret;
}
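cgroup_rmdir() above is an instance of the generic kernfs self-removal pattern: pin the backing object, break active protection, take the subsystem locks, destroy, then unbreak and unpin. A hedged sketch with hypothetical my_obj_*() helpers and my_mutex standing in for the cgroup-specific pieces:

/* sketch only -- my_obj_*() and my_mutex are placeholders */
static int example_rmdir(struct kernfs_node *kn)
{
	struct my_obj *obj = kn->priv;
	int ret = 0;

	my_obj_get(obj);			/* may be freed once protection is broken */
	kernfs_break_active_protection(kn);

	mutex_lock(&my_mutex);
	if (!my_obj_is_dead(obj))
		ret = my_obj_destroy(obj);	/* ends up calling kernfs_remove(kn) */
	mutex_unlock(&my_mutex);

	kernfs_unbreak_active_protection(kn);
	my_obj_put(obj);
	return ret;
}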
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .remount_fs = cgroup_remount,
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
+
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
- /* init base cftset */
- cgroup_init_cftsets(ss);
+ INIT_LIST_HEAD(&ss->cfts);
/* Create the top cgroup state for this subsystem */
ss->root = &cgroup_dummy_root;
@@ -4551,7 +3855,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = css;
+ init_css_set.subsys[ss->id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
@@ -4563,184 +3867,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
BUG_ON(online_css(css));
mutex_unlock(&cgroup_mutex);
-
- /* this function shouldn't be used with modular subsystems, since they
- * need to register a subsys_id, among other things */
- BUG_ON(ss->module);
-}
-
-/**
- * cgroup_load_subsys: load and register a modular subsystem at runtime
- * @ss: the subsystem to load
- *
- * This function should be called in a modular subsystem's initcall. If the
- * subsystem is built as a module, it will be assigned a new subsys_id and set
- * up for use. If the subsystem is built-in anyway, work is delegated to the
- * simpler cgroup_init_subsys.
- */
-int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
-{
- struct cgroup_subsys_state *css;
- int i, ret;
- struct hlist_node *tmp;
- struct css_set *cset;
- unsigned long key;
-
- /* check name and function validity */
- if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
- ss->css_alloc == NULL || ss->css_free == NULL)
- return -EINVAL;
-
- /*
- * we don't support callbacks in modular subsystems. this check is
- * before the ss->module check for consistency; a subsystem that could
- * be a module should still have no callbacks even if the user isn't
- * compiling it as one.
- */
- if (ss->fork || ss->exit)
- return -EINVAL;
-
- /*
- * an optionally modular subsystem is built-in: we want to do nothing,
- * since cgroup_init_subsys will have already taken care of it.
- */
- if (ss->module == NULL) {
- /* a sanity check */
- BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
- return 0;
- }
-
- /* init base cftset */
- cgroup_init_cftsets(ss);
-
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
- cgroup_subsys[ss->subsys_id] = ss;
-
- /*
- * no ss->css_alloc seems to need anything important in the ss
- * struct, so this can happen first (i.e. before the dummy root
- * attachment).
- */
- css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
- if (IS_ERR(css)) {
- /* failure case - need to deassign the cgroup_subsys[] slot. */
- cgroup_subsys[ss->subsys_id] = NULL;
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- return PTR_ERR(css);
- }
-
- ss->root = &cgroup_dummy_root;
-
- /* our new subsystem will be attached to the dummy hierarchy. */
- init_css(css, ss, cgroup_dummy_top);
-
- /*
- * Now we need to entangle the css into the existing css_sets. unlike
- * in cgroup_init_subsys, there are now multiple css_sets, so each one
- * will need a new pointer to it; done by iterating the css_set_table.
- * furthermore, modifying the existing css_sets will corrupt the hash
- * table state, so each changed css_set will need its hash recomputed.
- * this is all done under the css_set_lock.
- */
- write_lock(&css_set_lock);
- hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
- /* skip entries that we already rehashed */
- if (cset->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hash_del(&cset->hlist);
- /* set new value */
- cset->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- key = css_set_hash(cset->subsys);
- hash_add(css_set_table, &cset->hlist, key);
- }
- write_unlock(&css_set_lock);
-
- ret = online_css(css);
- if (ret) {
- ss->css_free(css);
- goto err_unload;
- }
-
- /* success! */
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- return 0;
-
-err_unload:
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- /* @ss can't be mounted here as try_module_get() would fail */
- cgroup_unload_subsys(ss);
- return ret;
-}
-EXPORT_SYMBOL_GPL(cgroup_load_subsys);
-
-/**
- * cgroup_unload_subsys: unload a modular subsystem
- * @ss: the subsystem to unload
- *
- * This function should be called in a modular subsystem's exitcall. When this
- * function is invoked, the refcount on the subsystem's module will be 0, so
- * the subsystem will not be attached to any hierarchy.
- */
-void cgroup_unload_subsys(struct cgroup_subsys *ss)
-{
- struct cgrp_cset_link *link;
- struct cgroup_subsys_state *css;
-
- BUG_ON(ss->module == NULL);
-
- /*
- * we shouldn't be called if the subsystem is in use, and the use of
- * try_module_get() in rebind_subsystems() should ensure that it
- * doesn't start being used while we're killing it off.
- */
- BUG_ON(ss->root != &cgroup_dummy_root);
-
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
-
- css = cgroup_css(cgroup_dummy_top, ss);
- if (css)
- offline_css(css);
-
- /* deassign the subsys_id */
- cgroup_subsys[ss->subsys_id] = NULL;
-
- /*
- * disentangle the css from all css_sets attached to the dummy
- * top. as in loading, we need to pay our respects to the hashtable
- * gods.
- */
- write_lock(&css_set_lock);
- list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- unsigned long key;
-
- hash_del(&cset->hlist);
- cset->subsys[ss->subsys_id] = NULL;
- key = css_set_hash(cset->subsys);
- hash_add(css_set_table, &cset->hlist, key);
- }
- write_unlock(&css_set_lock);
-
- /*
- * remove subsystem's css from the cgroup_dummy_top and free it -
- * need to free before marking as null because ss->css_free needs
- * the cgrp->subsys pointer to find their state.
- */
- if (css)
- ss->css_free(css);
- RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
-
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
}
-EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
/**
* cgroup_init_early - cgroup initialization at system boot
@@ -4767,17 +3895,16 @@ int __init cgroup_init_early(void)
list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
- /* at bootup time, we don't worry about modular subsystems */
- for_each_builtin_subsys(ss, i) {
- BUG_ON(!ss->name);
- BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->css_alloc);
- BUG_ON(!ss->css_free);
- if (ss->subsys_id != i) {
- printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
- ss->name, ss->subsys_id);
- BUG();
- }
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
if (ss->early_init)
cgroup_init_subsys(ss);
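ss->id and ss->name are now filled in from cgroup_subsys_name[], which, like the cgroup_subsys[] pointer array itself, is presumably generated by re-including cgroup_subsys.h under different SUBSYS() definitions, roughly as sketched below (a sketch of the convention, not a quote from this patch):

/* sketch of the generated tables */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS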
@@ -4797,18 +3924,22 @@ int __init cgroup_init(void)
unsigned long key;
int i, err;
- err = bdi_init(&cgroup_backing_dev_info);
- if (err)
- return err;
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
- for_each_builtin_subsys(ss, i) {
+ for_each_subsys(ss, i) {
if (!ss->early_init)
cgroup_init_subsys(ss);
+
+ /*
+ * cftype registration needs kmalloc and can't be done
+ * during early_init. Register base cftypes separately.
+ */
+ if (ss->base_cftypes)
+ WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
}
/* allocate id for the dummy hierarchy */
mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
/* Add init_css_set to the hash table */
key = css_set_hash(init_css_set.subsys);
@@ -4820,28 +3951,20 @@ int __init cgroup_init(void)
0, 1, GFP_KERNEL);
BUG_ON(err < 0);
- mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
- if (!cgroup_kobj) {
- err = -ENOMEM;
- goto out;
- }
+ if (!cgroup_kobj)
+ return -ENOMEM;
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
kobject_put(cgroup_kobj);
- goto out;
+ return err;
}
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
-
-out:
- if (err)
- bdi_destroy(&cgroup_backing_dev_info);
-
- return err;
+ return 0;
}
static int __init cgroup_wq_init(void)
@@ -4886,12 +4009,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *path;
int retval;
struct cgroupfs_root *root;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -4904,6 +4027,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
retval = 0;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
for_each_active_root(root) {
struct cgroup_subsys *ss;
@@ -4919,14 +4043,17 @@ int proc_cgroup_show(struct seq_file *m, void *v)
root->name);
seq_putc(m, ':');
cgrp = task_cgroup_from_root(tsk, root);
- retval = cgroup_path(cgrp, buf, PAGE_SIZE);
- if (retval < 0)
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
goto out_unlock;
- seq_puts(m, buf);
+ }
+ seq_puts(m, path);
seq_putc(m, '\n');
}
out_unlock:
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
@@ -4952,7 +4079,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
- ss->root->number_of_cgroups, !ss->disabled);
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
mutex_unlock(&cgroup_mutex);
return 0;
@@ -5022,12 +4149,12 @@ void cgroup_post_fork(struct task_struct *child)
* lock on fork.
*/
if (use_task_css_set_links) {
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
task_lock(child);
if (list_empty(&child->cg_list))
list_add(&child->cg_list, &task_css_set(child)->tasks);
task_unlock(child);
- write_unlock(&css_set_lock);
+ up_write(&css_set_rwsem);
}
/*
@@ -5036,15 +4163,7 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
if (need_forkexit_callback) {
- /*
- * fork/exit callbacks are supported only for builtin
- * subsystems, and the builtin section of the subsys
- * array is immutable, so we don't need to lock the
- * subsys array here. On the other hand, modular section
- * of the array can be freed at module unload, so we
- * can't touch that.
- */
- for_each_builtin_subsys(ss, i)
+ for_each_subsys(ss, i)
if (ss->fork)
ss->fork(child);
}
@@ -5092,15 +4211,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
int i;
/*
- * Unlink from the css_set task list if necessary.
- * Optimistically check cg_list before taking
- * css_set_lock
+ * Unlink from the css_set task list if necessary. Optimistically
+ * check cg_list before taking css_set_rwsem.
*/
if (!list_empty(&tsk->cg_list)) {
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
if (!list_empty(&tsk->cg_list))
list_del_init(&tsk->cg_list);
- write_unlock(&css_set_lock);
+ up_write(&css_set_rwsem);
}
/* Reassign the task to the init_css_set. */
@@ -5109,11 +4227,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
if (run_callbacks && need_forkexit_callback) {
- /*
- * fork/exit callbacks are supported only for builtin
- * subsystems, see cgroup_post_fork() for details.
- */
- for_each_builtin_subsys(ss, i) {
+ /* see cgroup_post_fork() for details */
+ for_each_subsys(ss, i) {
if (ss->exit) {
struct cgroup_subsys_state *old_css = cset->subsys[i];
struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5124,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
task_unlock(tsk);
- put_css_set_taskexit(cset);
+ put_css_set(cset, true);
}
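css_set_lock (an rwlock) becomes css_set_rwsem throughout this patch: task-list walks take down_read()/up_read() and list updates take down_write()/up_write(). A sketch of a read-side walk over a cgroup's css_sets, mirroring cgroup_css_links_read() further down (the wrapper function is invented):

/* sketch only */
static int example_count_cgroup_tasks(struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;
	struct task_struct *task;
	int count = 0;

	down_read(&css_set_rwsem);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		list_for_each_entry(task, &link->cset->tasks, cg_list)
			count++;
	up_read(&css_set_rwsem);

	return count;
}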
static void check_for_release(struct cgroup *cgrp)
@@ -5181,16 +4296,17 @@ static void cgroup_release_agent(struct work_struct *work)
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
@@ -5198,7 +4314,7 @@ static void cgroup_release_agent(struct work_struct *work)
i = 0;
argv[i++] = agentbuf;
- argv[i++] = pathbuf;
+ argv[i++] = path;
argv[i] = NULL;
i = 0;
@@ -5232,11 +4348,7 @@ static int __init cgroup_disable(char *str)
if (!*token)
continue;
- /*
- * cgroup_disable, being at boot time, can't know about
- * module subsystems, so we don't worry about them.
- */
- for_each_builtin_subsys(ss, i) {
+ for_each_subsys(ss, i) {
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
@@ -5250,28 +4362,42 @@ static int __init cgroup_disable(char *str)
__setup("cgroup_disable=", cgroup_disable);
/**
- * css_from_dir - get corresponding css from the dentry of a cgroup dir
+ * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
* @dentry: directory dentry of interest
* @ss: subsystem of interest
*
- * Must be called under cgroup_mutex or RCU read lock. The caller is
- * responsible for pinning the returned css if it needs to be accessed
- * outside the critical section.
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
*/
-struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
- struct cgroup_subsys *ss)
+struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup_subsys_state *css = NULL;
struct cgroup *cgrp;
- cgroup_assert_mutex_or_rcu_locked();
-
/* is @dentry a cgroup dir? */
- if (!dentry->d_inode ||
- dentry->d_inode->i_op != &cgroup_dir_inode_operations)
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
return ERR_PTR(-EBADF);
- cgrp = __d_cgrp(dentry);
- return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
+ rcu_read_lock();
+
+ /*
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See cgroup_destroy_locked() for details.
+ */
+ cgrp = rcu_dereference(kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
+
+ if (!css || !css_tryget(css))
+ css = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+ return css;
}
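Usage sketch for the new helper: unlike css_from_dir(), the returned css is already pinned, so a caller simply pairs the lookup with css_put() (the wrapper function is invented). This is the simplification visible in perf_cgroup_connect() and memcg_write_event_control() below.

/* sketch only */
static int example_use_css(struct dentry *dentry, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	css = css_tryget_from_dir(dentry, ss);
	if (IS_ERR(css))
		return PTR_ERR(css);

	/* css is pinned here; no RCU or cgroup_mutex needed to use it */

	css_put(css);
	return 0;
}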
/**
@@ -5286,7 +4412,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
struct cgroup *cgrp;
- cgroup_assert_mutex_or_rcu_locked();
+ cgroup_assert_mutexes_or_rcu_locked();
cgrp = idr_find(&ss->root->cgroup_idr, id);
if (cgrp)
@@ -5338,23 +4464,30 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
struct cgrp_cset_link *link;
struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
- const char *name;
+ const char *name = "?";
+
+ if (c != cgroup_dummy_top) {
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ name = name_buf;
+ }
- if (c->dentry)
- name = c->dentry->d_name.name;
- else
- name = "?";
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name);
}
rcu_read_unlock();
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
+ kfree(name_buf);
return 0;
}
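With cgroup->name gone, cgroup_name() snapshots the name into a caller-supplied buffer as above, and printk paths can use pr_cont_cgroup_name()/pr_cont_cgroup_path() instead of building a string first. A small sketch, assuming cgroup_name() simply fills @buf (the wrapper is invented):

/* sketch only */
static void example_log_cgroup(struct cgroup *cgrp)
{
	char buf[NAME_MAX + 1];

	cgroup_name(cgrp, buf, sizeof(buf));
	pr_info("cgroup %s\n", buf);

	/* or, without a buffer, using continuation printks: */
	pr_info("cgroup ");
	pr_cont_cgroup_name(cgrp);
	pr_cont("\n");
}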
@@ -5364,7 +4497,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
@@ -5380,7 +4513,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
}
}
}
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
return 0;
}
@@ -5423,11 +4556,9 @@ static struct cftype debug_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
+struct cgroup_subsys debug_cgrp_subsys = {
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
- .subsys_id = debug_subsys_id,
.base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
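Taken together, a controller definition after this series reduces to the pattern below: the <name>_cgrp_subsys symbol is what ties the struct to its entry in cgroup_subsys.h, and .name, .subsys_id and .module are gone. Sketch with invented handlers, modelled on debug_cgrp_subsys above:

/* sketch of the new controller boilerplate */
static struct cftype example_files[] = {
	{
		.name = "example.state",
		.seq_show = example_state_show,		/* hypothetical handler */
	},
	{ }	/* terminate */
};

struct cgroup_subsys example_cgrp_subsys = {
	.css_alloc	= example_css_alloc,		/* hypothetical */
	.css_free	= example_css_free,		/* hypothetical */
	.base_cftypes	= example_files,
	/* .name and .id are assigned by cgroup_init_early() */
};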
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 6c3154e477f6..7201a637c405 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
static inline struct freezer *task_freezer(struct task_struct *task)
{
- return css_freezer(task_css(task, freezer_subsys_id));
+ return css_freezer(task_css(task, freezer_cgrp_id));
}
static struct freezer *parent_freezer(struct freezer *freezer)
@@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state)
return "THAWED";
};
-struct cgroup_subsys freezer_subsys;
-
static struct cgroup_subsys_state *
freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* current state before executing the following - !frozen tasks may
* be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
- cgroup_taskset_for_each(task, new_css, tset) {
+ cgroup_taskset_for_each(task, tset) {
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
@@ -473,13 +471,11 @@ static struct cftype files[] = {
{ } /* terminate */
};
-struct cgroup_subsys freezer_subsys = {
- .name = "freezer",
+struct cgroup_subsys freezer_cgrp_subsys = {
.css_alloc = freezer_css_alloc,
.css_online = freezer_css_online,
.css_offline = freezer_css_offline,
.css_free = freezer_css_free,
- .subsys_id = freezer_subsys_id,
.attach = freezer_attach,
.fork = freezer_fork,
.base_cftypes = files,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4410ac6a55f1..d8bec21d7a11 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
- return css_cs(task_css(task, cpuset_subsys_id));
+ return css_cs(task_css(task, cpuset_cgrp_id));
}
static inline struct cpuset *parent_cs(struct cpuset *cs)
@@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
+ if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
@@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
}
/**
- * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
- * @tsk: task to test
- * @data: cpuset to @tsk belongs to
- *
- * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
- * mask needs to be changed.
- *
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cpuset_mutex at this point.
- */
-static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
-{
- struct cpuset *cs = data;
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
-
- set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
-}
-
-/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
*
- * Called with cpuset_mutex held
- *
- * The css_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- *
- * No return value. It's guaranteed that css_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_cpumask(struct cpuset *cs)
{
- css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ css_task_iter_end(&it);
}
/*
* update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
* @root_cs: the root cpuset of the hierarchy
* @update_root: update root cpuset or not?
- * @heap: the heap used by css_scan_tasks()
*
* This will update cpumasks of tasks in @root_cs and all other empty cpusets
* which take on cpumask of @root_cs.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_cpumask_hier(struct cpuset *root_cs,
- bool update_root, struct ptr_heap *heap)
+static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
continue;
rcu_read_unlock();
- update_tasks_cpumask(cp, heap);
+ update_tasks_cpumask(cp);
rcu_read_lock();
css_put(&cp->css);
@@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- struct ptr_heap heap;
int retval;
int is_load_balanced;
@@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval)
- return retval;
-
is_load_balanced = is_sched_load_balance(trialcs);
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- update_tasks_cpumask_hier(cs, true, &heap);
-
- heap_free(&heap);
+ update_tasks_cpumask_hier(cs, true);
if (is_load_balanced)
rebuild_sched_domains_locked();
@@ -1052,53 +1026,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
task_unlock(tsk);
}
-struct cpuset_change_nodemask_arg {
- struct cpuset *cs;
- nodemask_t *newmems;
-};
-
-/*
- * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
- * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cpuset_mutex held.
- */
-static void cpuset_change_nodemask(struct task_struct *p, void *data)
-{
- struct cpuset_change_nodemask_arg *arg = data;
- struct cpuset *cs = arg->cs;
- struct mm_struct *mm;
- int migrate;
-
- cpuset_change_task_nodemask(p, arg->newmems);
-
- mm = get_task_mm(p);
- if (!mm)
- return;
-
- migrate = is_memory_migrate(cs);
-
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
- mmput(mm);
-}
-
static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
*
- * Called with cpuset_mutex held. No return value. It's guaranteed that
- * css_scan_tasks() always returns 0 if @heap != NULL.
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
- struct cpuset_change_nodemask_arg arg = { .cs = cs,
- .newmems = &newmems };
+ struct css_task_iter it;
+ struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
@@ -1114,7 +1057,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
/*
* All the tasks' nodemasks have been updated, update
@@ -1130,15 +1091,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
* update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
* @cs: the root cpuset of the hierarchy
* @update_root: update the root cpuset or not?
- * @heap: the heap used by css_scan_tasks()
*
* This will update nodemasks of tasks in @root_cs and all other empty cpusets
* which take on nodemask of @root_cs.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_nodemask_hier(struct cpuset *root_cs,
- bool update_root, struct ptr_heap *heap)
+static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -1159,7 +1118,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
continue;
rcu_read_unlock();
- update_tasks_nodemask(cp, heap);
+ update_tasks_nodemask(cp);
rcu_read_lock();
css_put(&cp->css);
@@ -1184,7 +1143,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
- struct ptr_heap heap;
/*
* top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
@@ -1223,17 +1181,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval < 0)
- goto done;
-
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask_hier(cs, true, &heap);
-
- heap_free(&heap);
+ update_tasks_nodemask_hier(cs, true);
done:
return retval;
}
@@ -1261,38 +1213,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
}
/**
- * cpuset_change_flag - make a task's spread flags the same as its cpuset's
- * @tsk: task to be updated
- * @data: cpuset to @tsk belongs to
- *
- * Called by css_scan_tasks() for each task in a cgroup.
- *
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cpuset_mutex at this point.
- */
-static void cpuset_change_flag(struct task_struct *tsk, void *data)
-{
- struct cpuset *cs = data;
-
- cpuset_update_task_spread_flag(cs, tsk);
-}
-
-/**
* update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed
- * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
- *
- * Called with cpuset_mutex held
- *
- * The css_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
*
- * No return value. It's guaranteed that css_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
*/
-static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_flags(struct cpuset *cs)
{
- css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
}
/*
@@ -1310,7 +1246,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
- struct ptr_heap heap;
int err;
trialcs = alloc_trial_cpuset(cs);
@@ -1326,10 +1261,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
- err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (err < 0)
- goto out;
-
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
@@ -1344,8 +1275,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
rebuild_sched_domains_locked();
if (spread_flag_changed)
- update_tasks_flags(cs, &heap);
- heap_free(&heap);
+ update_tasks_flags(cs);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1449,6 +1379,8 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
+static struct cpuset *cpuset_attach_old_cs;
+
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
@@ -1457,6 +1389,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
struct task_struct *task;
int ret;
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+
mutex_lock(&cpuset_mutex);
/*
@@ -1468,7 +1403,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
/*
* Kthreads which disallow setaffinity shouldn't be moved
* to a new cpuset; we don't want to change their cpu
@@ -1520,10 +1455,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
- struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
- cpuset_subsys_id);
struct cpuset *cs = css_cs(css);
- struct cpuset *oldcs = css_cs(oldcss);
+ struct cpuset *oldcs = cpuset_attach_old_cs;
struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
@@ -1537,7 +1470,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -2024,8 +1957,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
kfree(cs);
}
-struct cgroup_subsys cpuset_subsys = {
- .name = "cpuset",
+struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
.css_offline = cpuset_css_offline,
@@ -2033,7 +1965,6 @@ struct cgroup_subsys cpuset_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .subsys_id = cpuset_subsys_id,
.base_cftypes = files,
.early_init = 1,
};
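cgroup_taskset_for_each() also loses its @css argument in this series; a migration callback iterates just the taskset. A sketch of a ->can_attach() check in the new form (the PF_NO_SETAFFINITY test mirrors the cpuset restriction above; the function itself is invented):

/* sketch only */
static int example_can_attach(struct cgroup_subsys_state *css,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, tset) {
		if (task->flags & PF_NO_SETAFFINITY)
			return -EINVAL;
	}
	return 0;
}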
@@ -2090,10 +2021,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
parent = parent_cs(parent);
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- rcu_read_lock();
- printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
- cgroup_name(cs->css.cgroup));
- rcu_read_unlock();
+ printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
}
}
@@ -2141,7 +2071,7 @@ retry:
*/
if ((sane && cpumask_empty(cs->cpus_allowed)) ||
(!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
- update_tasks_cpumask(cs, NULL);
+ update_tasks_cpumask(cs);
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
@@ -2155,7 +2085,7 @@ retry:
*/
if ((sane && nodes_empty(cs->mems_allowed)) ||
(!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
- update_tasks_nodemask(cs, NULL);
+ update_tasks_nodemask(cs);
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
@@ -2217,7 +2147,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = new_mems;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, NULL);
+ update_tasks_nodemask(&top_cpuset);
}
mutex_unlock(&cpuset_mutex);
@@ -2621,19 +2551,17 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
/* Statically allocated to prevent using excess stack. */
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
-
struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
- rcu_read_lock();
spin_lock(&cpuset_buffer_lock);
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
- printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
+ printk(KERN_INFO "%s cpuset=", tsk->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
spin_unlock(&cpuset_buffer_lock);
- rcu_read_unlock();
}
/*
@@ -2683,12 +2611,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -2698,14 +2626,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
+ retval = -ENAMETOOLONG;
rcu_read_lock();
- css = task_css(tsk, cpuset_subsys_id);
- retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+ css = task_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
- if (retval < 0)
+ if (!p)
goto out_put_task;
- seq_puts(m, buf);
+ seq_puts(m, p);
seq_putc(m, '\n');
+ retval = 0;
out_put_task:
put_task_struct(tsk);
out_free:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 45e5543e2a1e..1bc932a8a263 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -361,7 +361,7 @@ struct perf_cgroup {
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
- return container_of(task_css(task, perf_subsys_id),
+ return container_of(task_css(task, perf_event_cgrp_id),
struct perf_cgroup, css);
}
@@ -389,11 +389,6 @@ perf_cgroup_match(struct perf_event *event)
event->cgrp->css.cgroup);
}
-static inline bool perf_tryget_cgroup(struct perf_event *event)
-{
- return css_tryget(&event->cgrp->css);
-}
-
static inline void perf_put_cgroup(struct perf_event *event)
{
css_put(&event->cgrp->css);
@@ -612,9 +607,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
- rcu_read_lock();
-
- css = css_from_dir(f.file->f_dentry, &perf_subsys);
+ css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
@@ -623,13 +616,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
- /* must be done before we fput() the file */
- if (!perf_tryget_cgroup(event)) {
- event->cgrp = NULL;
- ret = -ENOENT;
- goto out;
- }
-
/*
* all events in a group must monitor
* the same cgroup because a task belongs
@@ -640,7 +626,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
ret = -EINVAL;
}
out:
- rcu_read_unlock();
fdput(f);
return ret;
}
@@ -8055,7 +8040,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset)
+ cgroup_taskset_for_each(task, tset)
task_function_call(task, __perf_cgroup_move, task);
}
@@ -8074,9 +8059,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,
task_function_call(task, __perf_cgroup_move, task);
}
-struct cgroup_subsys perf_subsys = {
- .name = "perf_event",
- .subsys_id = perf_subsys_id,
+struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 633d0456868c..9684848f3bf2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7214,7 +7214,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id,
lockdep_is_held(&tsk->sighand->siglock)),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
@@ -7641,7 +7641,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
@@ -7659,7 +7659,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset)
+ cgroup_taskset_for_each(task, tset)
sched_move_task(task);
}
@@ -7998,8 +7998,7 @@ static struct cftype cpu_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys cpu_cgroup_subsys = {
- .name = "cpu",
+struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
@@ -8007,7 +8006,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .subsys_id = cpu_cgroup_subsys_id,
.base_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
- return css_ca(task_css(tsk, cpuacct_subsys_id));
+ return css_ca(task_css(tsk, cpuacct_cgrp_id));
}
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
rcu_read_unlock();
}
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
+struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
- .subsys_id = cpuacct_subsys_id,
.base_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f3344c31632a..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
- return group_path;
+ return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
}
#endif
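cgroup_path() changes convention in this patch: instead of an int length or -errno it returns a pointer to the generated path, or NULL when the buffer is too small, which is why task_group_path() can simply forward the result. Usage sketch matching how proc_cgroup_show() and proc_cpuset_show() handle it (the wrapper is invented):

/* sketch only */
static int example_show_path(struct seq_file *m, struct cgroup *cgrp)
{
	char *buf, *path;
	int ret = 0;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	path = cgroup_path(cgrp, buf, PATH_MAX);
	if (path)
		seq_printf(m, "%s\n", path);
	else
		ret = -ENAMETOOLONG;

	kfree(buf);
	return ret;
}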
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index cb00829bb466..b135853e68f3 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -30,7 +30,6 @@ struct hugetlb_cgroup {
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-struct cgroup_subsys hugetlb_subsys __read_mostly;
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline
@@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
- return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id));
+ return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -358,7 +357,7 @@ static void __init __hugetlb_cgroup_file_init(int idx)
cft = &h->cgroup_files[4];
memset(cft, 0, sizeof(*cft));
- WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
+ WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
return;
}
@@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
return;
}
-struct cgroup_subsys hugetlb_subsys = {
- .name = "hugetlb",
+struct cgroup_subsys hugetlb_cgrp_subsys = {
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
- .subsys_id = hugetlb_subsys_id,
};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53385cd4e6f0..d9c6ac1532e6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,8 +66,8 @@
#include <trace/events/vmscan.h>
-struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-EXPORT_SYMBOL(mem_cgroup_subsys);
+struct cgroup_subsys memory_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -538,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
- css = css_from_id(id - 1, &mem_cgroup_subsys);
+ css = css_from_id(id - 1, &memory_cgrp_subsys);
return mem_cgroup_from_css(css);
}
@@ -1072,7 +1072,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
if (unlikely(!p))
return NULL;
- return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
+ return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -1683,15 +1683,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- /*
- * protects memcg_name and makes sure that parallel ooms do not
- * interleave
- */
+ /* oom_info_lock ensures that parallel ooms do not interleave */
static DEFINE_SPINLOCK(oom_info_lock);
- struct cgroup *task_cgrp;
- struct cgroup *mem_cgrp;
- static char memcg_name[PATH_MAX];
- int ret;
struct mem_cgroup *iter;
unsigned int i;
@@ -1701,36 +1694,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
spin_lock(&oom_info_lock);
rcu_read_lock();
- mem_cgrp = memcg->css.cgroup;
- task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
-
- ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
- if (ret < 0) {
- /*
- * Unfortunately, we are unable to convert to a useful name
- * But we'll still print out the usage information
- */
- rcu_read_unlock();
- goto done;
- }
- rcu_read_unlock();
-
- pr_info("Task in %s killed", memcg_name);
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_info(" killed as a result of limit of ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_info("\n");
- rcu_read_lock();
- ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
- if (ret < 0) {
- rcu_read_unlock();
- goto done;
- }
rcu_read_unlock();
- /*
- * Continues from above, so we don't need an KERN_ level
- */
- pr_cont(" as a result of limit of %s\n", memcg_name);
-done:
-
pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
@@ -1745,13 +1716,8 @@ done:
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats");
-
- rcu_read_lock();
- ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
- if (!ret)
- pr_cont(" for %s", memcg_name);
- rcu_read_unlock();
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(iter->css.cgroup);
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3401,7 +3367,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *s)
{
struct kmem_cache *new = NULL;
- static char *tmp_name = NULL;
+ static char *tmp_path = NULL, *tmp_name = NULL;
static DEFINE_MUTEX(mutex); /* protects tmp_name */
BUG_ON(!memcg_can_account_kmem(memcg));
@@ -3413,18 +3379,20 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
* This static temporary buffer is used to prevent from
* pointless shortliving allocation.
*/
- if (!tmp_name) {
- tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!tmp_path || !tmp_name) {
+ if (!tmp_path)
+ tmp_path = kmalloc(PATH_MAX, GFP_KERNEL);
if (!tmp_name)
+ tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!tmp_path || !tmp_name)
goto out;
}
- rcu_read_lock();
- snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
- memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
- rcu_read_unlock();
+ cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1);
+ snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name,
+ memcg_cache_id(memcg), tmp_name);
- new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
+ new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align,
(s->flags & ~SLAB_PANIC), s->ctor, s);
if (new)
new->allocflags |= __GFP_KMEMCG;
@@ -4990,7 +4958,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
struct cgroup *cgrp = memcg->css.cgroup;
/* returns EBUSY if there is a task or if we come here twice. */
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+ if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
return -EBUSY;
/* we call try-to-free pages for make this cgroup empty */
@@ -5172,7 +5140,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+ if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
err = -EBUSY;
mutex_unlock(&memcg_create_mutex);
if (err)
@@ -6183,17 +6151,15 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
- rcu_read_lock();
-
+ cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
+ &memory_cgrp_subsys);
ret = -EINVAL;
- cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
- &mem_cgroup_subsys);
- if (cfile_css == css && css_tryget(css))
- ret = 0;
-
- rcu_read_unlock();
- if (ret)
+ if (IS_ERR(cfile_css))
+ goto out_put_cfile;
+ if (cfile_css != css) {
+ css_put(cfile_css);
goto out_put_cfile;
+ }
ret = event->register_event(memcg, event->eventfd, buffer);
if (ret)
@@ -6566,11 +6532,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
* unfortunate state in our controller.
*/
if (parent != root_mem_cgroup)
- mem_cgroup_subsys.broken_hierarchy = true;
+ memory_cgrp_subsys.broken_hierarchy = true;
}
mutex_unlock(&memcg_create_mutex);
- return memcg_init_kmem(memcg, &mem_cgroup_subsys);
+ return memcg_init_kmem(memcg, &memory_cgrp_subsys);
}
/*
@@ -7264,9 +7230,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
-struct cgroup_subsys mem_cgroup_subsys = {
- .name = "memory",
- .subsys_id = mem_cgroup_subsys_id,
+struct cgroup_subsys memory_cgrp_subsys = {
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
@@ -7292,7 +7256,7 @@ __setup("swapaccount=", enable_swap_account);
static void __init memsw_file_init(void)
{
- WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+ WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
}
static void __init enable_swap_cgroup(void)
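
Two helper patterns recur in the memcontrol changes above: printing a cgroup path directly into the console line with pr_cont_cgroup_path() instead of formatting it into a static PATH_MAX buffer, and copying just the last path component with the new buffer-taking cgroup_name(). A hedged illustration of both (dump_cgroup_ids is an illustrative helper, not from this patch):

#include <linux/cgroup.h>
#include <linux/printk.h>
#include <linux/limits.h>

static void dump_cgroup_ids(struct cgroup *cgrp)
{
	char name[NAME_MAX + 1];

	/* path is emitted directly; no shared buffer or extra locking */
	pr_info("cgroup path: ");
	pr_cont_cgroup_path(cgrp);
	pr_cont("\n");

	/* last component copied into a caller-supplied buffer */
	cgroup_name(cgrp, name, sizeof(name));
	pr_info("cgroup name: %s\n", name);
}
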
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2f2f34a4e77d..566f7af6527a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)
return -EINVAL;
css = mem_cgroup_css(mem);
- /* root_mem_cgroup has NULL dentries */
- if (!css->cgroup->dentry)
- return -EINVAL;
-
- ino = css->cgroup->dentry->d_inode->i_ino;
+ ino = cgroup_ino(css->cgroup);
css_put(css);
- if (ino != hwpoison_filter_memcg)
+ if (!ino || ino != hwpoison_filter_memcg)
return -EINVAL;
return 0;
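
cgroup_ino() above replaces the dentry dereference; it reports 0 for a cgroup without a kernfs node, which is why the filter now also rejects a zero inode. The same comparison in isolation, as a hedged sketch (memcg_ino_matches is an illustrative name, not from this patch):

#include <linux/cgroup.h>
#include <linux/types.h>

/* True when @cgrp's inode matches @filter_ino; a zero inode (cgroup not
 * visible in cgroupfs) never matches.
 */
static bool memcg_ino_matches(struct cgroup *cgrp, ino_t filter_ino)
{
	ino_t ino = cgroup_ino(cgrp);

	return ino && ino == filter_ino;
}
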
diff --git a/net/Kconfig b/net/Kconfig
index e411046a62e3..a83bc4cc45a4 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -239,7 +239,7 @@ config XPS
default y
config CGROUP_NET_PRIO
- tristate "Network priority cgroup"
+ bool "Network priority cgroup"
depends on CGROUPS
---help---
Cgroup subsystem for use in assigning processes to network priorities on
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 719efd541668..22931e1b99b4 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
- return css_cls_state(task_css(p, net_cls_subsys_id));
+ return css_cls_state(task_css(p, net_cls_cgrp_id));
}
EXPORT_SYMBOL_GPL(task_cls_state);
@@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css,
void *v = (void *)(unsigned long)cs->classid;
struct task_struct *p;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_classid, v);
task_unlock(p);
@@ -102,19 +102,10 @@ static struct cftype ss_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys net_cls_subsys = {
- .name = "net_cls",
+struct cgroup_subsys net_cls_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = cgrp_attach,
- .subsys_id = net_cls_subsys_id,
.base_cftypes = ss_files,
- .module = THIS_MODULE,
};
-
-static int __init init_netclassid_cgroup(void)
-{
- return cgroup_load_subsys(&net_cls_subsys);
-}
-__initcall(init_netclassid_cgroup);
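
The attach path above also shows the new iterator signature: cgroup_taskset_for_each() now takes only the task cursor and the taskset, since a taskset is always bound to a single css. A minimal sketch of an ->attach() callback in that style (foo_attach and the per-task counter are purely illustrative):

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/printk.h>

static void foo_attach(struct cgroup_subsys_state *css,
		       struct cgroup_taskset *tset)
{
	struct task_struct *p;
	unsigned int nr = 0;

	/* the css argument to the iterator is gone in this series */
	cgroup_taskset_for_each(p, tset)
		nr++;

	pr_debug("foo: attached %u tasks to css %p\n", nr, css);
}
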
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9043caedcd08..f9f3a40d3350 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css,
struct task_struct *p;
void *v = (void *)(unsigned long)css->cgroup->id;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
@@ -244,15 +244,12 @@ static struct cftype ss_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys net_prio_subsys = {
- .name = "net_prio",
+struct cgroup_subsys net_prio_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = net_prio_attach,
- .subsys_id = net_prio_subsys_id,
.base_cftypes = ss_files,
- .module = THIS_MODULE,
};
static int netprio_device_event(struct notifier_block *unused,
@@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {
static int __init init_cgroup_netprio(void)
{
- int ret;
-
- ret = cgroup_load_subsys(&net_prio_subsys);
- if (ret)
- goto out;
-
register_netdevice_notifier(&netprio_device_notifier);
-
-out:
- return ret;
-}
-
-static void __exit exit_cgroup_netprio(void)
-{
- struct netprio_map *old;
- struct net_device *dev;
-
- unregister_netdevice_notifier(&netprio_device_notifier);
-
- cgroup_unload_subsys(&net_prio_subsys);
-
- rtnl_lock();
- for_each_netdev(&init_net, dev) {
- old = rtnl_dereference(dev->priomap);
- RCU_INIT_POINTER(dev->priomap, NULL);
- if (old)
- kfree_rcu(old, rcu);
- }
- rtnl_unlock();
+ return 0;
}
-module_init(init_cgroup_netprio);
-module_exit(exit_cgroup_netprio);
+subsys_initcall(init_cgroup_netprio);
MODULE_LICENSE("GPL v2");
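
With cgroup_load_subsys()/cgroup_unload_subsys() gone, net_prio can no longer be modular (hence the Kconfig change from tristate to bool earlier in this diff); its initcall is reduced to registering the netdevice notifier, while the controller itself is picked up by the cgroup core from cgroup_subsys.h. The shape of such a built-in-only init, sketched with illustrative names:

#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

static int foo_device_event(struct notifier_block *unused,
			    unsigned long event, void *ptr)
{
	return NOTIFY_DONE;	/* illustrative: react to netdev events here */
}

static struct notifier_block foo_device_notifier = {
	.notifier_call = foo_device_event,
};

static int __init init_cgroup_foo(void)
{
	/* only auxiliary setup remains; no cgroup_load_subsys() call */
	register_netdevice_notifier(&foo_device_notifier);
	return 0;
}
subsys_initcall(init_cgroup_foo);
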
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7e522c558ba..20a0aca9131e 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = {
static int __init tcp_memcontrol_init(void)
{
- WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
+ WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
return 0;
}
__initcall(tcp_memcontrol_init);
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index d3b6d2cd3a06..7f88bcde7c61 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -58,11 +58,9 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
{
- return css_to_devcgroup(task_css(task, devices_subsys_id));
+ return css_to_devcgroup(task_css(task, devices_cgrp_id));
}
-struct cgroup_subsys devices_subsys;
-
/*
* called under devcgroup_mutex
*/
@@ -684,13 +682,11 @@ static struct cftype dev_cgroup_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys devices_subsys = {
- .name = "devices",
+struct cgroup_subsys devices_cgrp_subsys = {
.css_alloc = devcgroup_css_alloc,
.css_free = devcgroup_css_free,
.css_online = devcgroup_online,
.css_offline = devcgroup_offline,
- .subsys_id = devices_subsys_id,
.base_cftypes = dev_cgroup_files,
};