From 356d2e581032b686da0854c7f17de2027c872762 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:10 -0800 Subject: blkcg: fix minor bug in blkg_alloc() blkg_alloc() was mistakenly checking blkcg_policy_enabled() twice. The latter test should have been on whether pol->pd_init_fn() exists. This doesn't cause actual problems because both blkcg policies implement pol->pd_init_fn(). Fix it. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b8858fb0cafa..7ef747b7f056 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -114,7 +114,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, pd->blkg = blkg; /* invoke per-policy init */ - if (blkcg_policy_enabled(blkg->q, pol)) + if (pol->pd_init_fn) pol->pd_init_fn(blkg); } -- cgit v1.2.3 From 86cde6b62313dd221c2e5935db02de7234cb4164 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:10 -0800 Subject: blkcg: reorganize blkg_lookup_create() and friends Reorganize such that * __blkg_lookup() takes bool param @update_hint to determine whether to update hint. * __blkg_lookup_create() no longer performs lookup before trying to create. Renamed to blkg_create(). * blkg_lookup_create() now performs lookup and then invokes blkg_create() if lookup fails. * root_blkg creation in blkcg_activate_policy() updated accordingly. Note that blkcg_activate_policy() no longer updates lookup hint if root_blkg already exists. Except for the last lookup hint bit which is immaterial, this is pure reorganization and doesn't introduce any visible behavior change. This is to prepare for proper hierarchy support. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 75 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 23 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7ef747b7f056..201275467d8b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -126,7 +126,7 @@ err_free: } static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; @@ -135,14 +135,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, return blkg; /* - * Hint didn't match. Look up from the radix tree. Note that we - * may not be holding queue_lock and thus are not sure whether - * @blkg from blkg_tree has already been removed or not, so we - * can't update hint to the lookup result. Leave it to the caller. + * Hint didn't match. Look up from the radix tree. Note that the + * hint can only be updated under queue_lock as otherwise @blkg + * could have already been removed from blkg_tree. The caller is + * responsible for grabbing queue_lock if @update_hint. */ blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) + if (blkg && blkg->q == q) { + if (update_hint) { + lockdep_assert_held(q->queue_lock); + rcu_assign_pointer(blkcg->blkg_hint, blkg); + } return blkg; + } return NULL; } @@ -162,7 +167,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) if (unlikely(blk_queue_bypass(q))) return NULL; - return __blkg_lookup(blkcg, q); + return __blkg_lookup(blkcg, q, false); } EXPORT_SYMBOL_GPL(blkg_lookup); @@ -170,9 +175,9 @@ EXPORT_SYMBOL_GPL(blkg_lookup); * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. */ -static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, - struct blkcg_gq *new_blkg) +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, + struct request_queue *q, + struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int ret; @@ -180,13 +185,6 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); - /* lookup and update hint on success, see __blkg_lookup() for details */ - blkg = __blkg_lookup(blkcg, q); - if (blkg) { - rcu_assign_pointer(blkcg->blkg_hint, blkg); - goto out_free; - } - /* blkg holds a reference to blkcg */ if (!css_tryget(&blkcg->css)) { blkg = ERR_PTR(-EINVAL); @@ -223,16 +221,39 @@ out_free: return blkg; } +/** + * blkg_lookup_create - lookup blkg, try to create one if not there + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * create one. This function should be called under RCU read lock and + * @q->queue_lock. + * + * Returns pointer to the looked up or created blkg on success, ERR_PTR() + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not + * dead and bypassing, returns ERR_PTR(-EBUSY). + */ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q) { + struct blkcg_gq *blkg; + + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + /* * This could be the first entry point of blkcg implementation and * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); - return __blkg_lookup_create(blkcg, q, NULL); + + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + return blkg; + + return blkg_create(blkcg, q, NULL); } EXPORT_SYMBOL_GPL(blkg_lookup_create); @@ -777,7 +798,7 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { LIST_HEAD(pds); - struct blkcg_gq *blkg; + struct blkcg_gq *blkg, *new_blkg; struct blkg_policy_data *pd, *n; int cnt = 0, ret; bool preloaded; @@ -786,19 +807,27 @@ int blkcg_activate_policy(struct request_queue *q, return 0; /* preallocations for root blkg */ - blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!blkg) + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); blk_queue_bypass_start(q); - /* make sure the root blkg exists and count the existing blkgs */ + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code it. + */ spin_lock_irq(q->queue_lock); rcu_read_lock(); - blkg = __blkg_lookup_create(&blkcg_root, q, blkg); + blkg = __blkg_lookup(&blkcg_root, q, false); + if (blkg) + blkg_free(new_blkg); + else + blkg = blkg_create(&blkcg_root, q, new_blkg); rcu_read_unlock(); if (preloaded) -- cgit v1.2.3 From 93e6d5d8f5c909479623c6ab4427f038c6c3f63f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:10 -0800 Subject: blkcg: cosmetic updates to blkg_create() * Rename out_* labels to err_*. * Do ERR_PTR() conversion once in the error return path. This patch is cosmetic and to prepare for the hierarchy support. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 201275467d8b..18ae48083f4a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -187,16 +187,16 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget(&blkcg->css)) { - blkg = ERR_PTR(-EINVAL); - goto out_free; + ret = -EINVAL; + goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); if (unlikely(!new_blkg)) { - blkg = ERR_PTR(-ENOMEM); - goto out_put; + ret = -ENOMEM; + goto err_put_css; } } blkg = new_blkg; @@ -213,12 +213,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (!ret) return blkg; - blkg = ERR_PTR(ret); -out_put: +err_put_css: css_put(&blkcg->css); -out_free: +err_free_blkg: blkg_free(new_blkg); - return blkg; + return ERR_PTR(ret); } /** -- cgit v1.2.3 From 3c547865902e9fc30dc15941f326fd8039c6628d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:10 -0800 Subject: blkcg: make blkcg_gq's hierarchical Currently a child blkg (blkcg_gq) can be created even if its parent doesn't exist. ie. Given a blkg, it's not guaranteed that its ancestors will exist. This makes it difficult to implement proper hierarchy support for blkcg policies. Always create blkgs recursively and make a child blkg hold a reference to its parent. blkg->parent is added so that finding the parent is easy. blkcg_parent() is also added in the process. This change can be visible to userland. e.g. while issuing IO in a nested cgroup didn't affect the ancestors at all, now it will initialize all ancestor blkgs and zero stats for the request_queue will always appear on them. While this is userland visible, this shouldn't cause any functional difference. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 42 +++++++++++++++++++++++++++++++++++++----- block/blk-cgroup.h | 18 ++++++++++++++++++ 2 files changed, 55 insertions(+), 5 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 18ae48083f4a..942f344fdfa7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -201,7 +201,16 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, } blkg = new_blkg; - /* insert */ + /* link parent and insert */ + if (blkcg_parent(blkcg)) { + blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + if (WARN_ON_ONCE(!blkg->parent)) { + blkg = ERR_PTR(-EINVAL); + goto err_put_css; + } + blkg_get(blkg->parent); + } + spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { @@ -213,6 +222,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (!ret) return blkg; + /* @blkg failed fully initialized, use the usual release path */ + blkg_put(blkg); + return ERR_PTR(ret); + err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -226,8 +239,9 @@ err_free_blkg: * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to - * create one. This function should be called under RCU read lock and - * @q->queue_lock. + * create one. blkg creation is performed recursively from blkcg_root such + * that all non-root blkg's have access to the parent blkg. This function + * should be called under RCU read lock and @q->queue_lock. * * Returns pointer to the looked up or created blkg on success, ERR_PTR() * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not @@ -252,7 +266,23 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, if (blkg) return blkg; - return blkg_create(blkcg, q, NULL); + /* + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. + */ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent = blkcg_parent(blkcg); + + while (parent && !__blkg_lookup(parent, q, false)) { + pos = parent; + parent = blkcg_parent(parent); + } + + blkg = blkg_create(pos, q, NULL); + if (pos == blkcg || IS_ERR(blkg)) + return blkg; + } } EXPORT_SYMBOL_GPL(blkg_lookup_create); @@ -321,8 +351,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head) void __blkg_release(struct blkcg_gq *blkg) { - /* release the extra blkcg reference this blkg has been holding */ + /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); /* * A group is freed in rcu manner. But having an rcu lock does not diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 24597309e23d..b26ed58899fe 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -94,8 +94,13 @@ struct blkcg_gq { struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + /* request allocation list for this blkcg-q pair */ struct request_list rl; + /* reference count */ int refcnt; @@ -180,6 +185,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + struct cgroup *pcg = blkcg->css.cgroup->parent; + + return pcg ? cgroup_to_blkcg(pcg) : NULL; +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest -- cgit v1.2.3 From e71357e118bdd4057e3bc020b9d80fecdd08f588 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:10 -0800 Subject: cfq-iosched: add leaf_weight cfq blkcg is about to grow proper hierarchy handling, where a child blkg's weight would nest inside the parent's. This makes tasks in a blkg to compete against both tasks in the sibling blkgs and the tasks of child blkgs. We're gonna use the existing weight as the group weight which decides the blkg's weight against its siblings. This patch introduces a new weight - leaf_weight - which decides the weight of a blkg against the child blkgs. It's named leaf_weight because another way to look at it is that each internal blkg nodes have a hidden child leaf node which contains all its tasks and leaf_weight is the weight of the leaf node and handled the same as the weight of the child blkgs. This patch only adds leaf_weight fields and exposes it to userland. The new weight isn't actually used anywhere yet. Note that cfq-iosched currently offcially supports only single level hierarchy and root blkgs compete with the first level blkgs - ie. root weight is basically being used as leaf_weight. For root blkgs, the two weights are kept in sync for backward compatibility. v2: cfqd->root_group->leaf_weight initialization was missing from cfq_init_queue() causing divide by zero when !CONFIG_CFQ_GROUP_SCHED. Fix it. Reported by Fengguang. Signed-off-by: Tejun Heo Cc: Fengguang Wu --- block/blk-cgroup.c | 4 +- block/blk-cgroup.h | 1 + block/cfq-iosched.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 130 insertions(+), 9 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 942f344fdfa7..10e1df9da46e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -26,7 +26,8 @@ static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; +struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, + .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; EXPORT_SYMBOL_GPL(blkcg_root); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; @@ -710,6 +711,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; + blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ done: spin_lock_init(&blkcg->lock); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b26ed58899fe..24462258200e 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -54,6 +54,7 @@ struct blkcg { /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ + unsigned int cfq_leaf_weight; }; struct blkg_stat { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index bc076f45ca46..175218d66d67 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -223,10 +223,21 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; + + /* + * There are two weights - (internal) weight is the weight of this + * cfqg against the sibling cfqgs. leaf_weight is the wight of + * this cfqg against the child cfqgs. For the root cfqg, both + * weights are kept in sync for backward compatibility. + */ unsigned int weight; unsigned int new_weight; unsigned int dev_weight; + unsigned int leaf_weight; + unsigned int new_leaf_weight; + unsigned int dev_leaf_weight; + /* number of cfqq currently on this group */ int nr_cfqq; @@ -1182,10 +1193,16 @@ static void cfq_update_group_weight(struct cfq_group *cfqg) { BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; cfqg->new_weight = 0; } + + if (cfqg->new_leaf_weight) { + cfqg->leaf_weight = cfqg->new_leaf_weight; + cfqg->new_leaf_weight = 0; + } } static void @@ -1348,6 +1365,7 @@ static void cfq_pd_init(struct blkcg_gq *blkg) cfq_init_cfqg_base(cfqg); cfqg->weight = blkg->blkcg->cfq_weight; + cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; } /* @@ -1404,6 +1422,26 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, return 0; } +static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + + if (!cfqg->dev_leaf_weight) + return 0; + return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); +} + +static int cfqg_print_leaf_weight_device(struct cgroup *cgrp, + struct cftype *cft, + struct seq_file *sf) +{ + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, + false); + return 0; +} + static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -1411,8 +1449,16 @@ static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + seq_printf(sf, "%u\n", + cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + return 0; +} + +static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf, bool is_leaf_weight) { struct blkcg *blkcg = cgroup_to_blkcg(cgrp); struct blkg_conf_ctx ctx; @@ -1426,8 +1472,13 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, ret = -EINVAL; cfqg = blkg_to_cfqg(ctx.blkg); if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; + if (!is_leaf_weight) { + cfqg->dev_weight = ctx.v; + cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + } else { + cfqg->dev_leaf_weight = ctx.v; + cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + } ret = 0; } @@ -1435,7 +1486,20 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + return __cfqg_set_weight_device(cgrp, cft, buf, false); +} + +static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + return __cfqg_set_weight_device(cgrp, cft, buf, true); +} + +static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, + bool is_leaf_weight) { struct blkcg *blkcg = cgroup_to_blkcg(cgrp); struct blkcg_gq *blkg; @@ -1445,19 +1509,41 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) return -EINVAL; spin_lock_irq(&blkcg->lock); - blkcg->cfq_weight = (unsigned int)val; + + if (!is_leaf_weight) + blkcg->cfq_weight = val; + else + blkcg->cfq_leaf_weight = val; hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); - if (cfqg && !cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + if (!cfqg) + continue; + + if (!is_leaf_weight) { + if (!cfqg->dev_weight) + cfqg->new_weight = blkcg->cfq_weight; + } else { + if (!cfqg->dev_leaf_weight) + cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + } } spin_unlock_irq(&blkcg->lock); return 0; } +static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + return __cfq_set_weight(cgrp, cft, val, false); +} + +static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + return __cfq_set_weight(cgrp, cft, val, true); +} + static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -1518,6 +1604,37 @@ static struct cftype cfq_blkcg_files[] = { .read_seq_string = cfq_print_weight, .write_u64 = cfq_set_weight, }, + + /* on root, leaf_weight is mapped to weight */ + { + .name = "leaf_weight_device", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cfqg_print_weight_device, + .write_string = cfqg_set_weight_device, + .max_write_len = 256, + }, + { + .name = "leaf_weight", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cfq_print_weight, + .write_u64 = cfq_set_weight, + }, + + /* no such mapping necessary for !roots */ + { + .name = "leaf_weight_device", + .flags = CFTYPE_NOT_ON_ROOT, + .read_seq_string = cfqg_print_leaf_weight_device, + .write_string = cfqg_set_leaf_weight_device, + .max_write_len = 256, + }, + { + .name = "leaf_weight", + .flags = CFTYPE_NOT_ON_ROOT, + .read_seq_string = cfq_print_leaf_weight, + .write_u64 = cfq_set_leaf_weight, + }, + { .name = "time", .private = offsetof(struct cfq_group, stats.time), @@ -3992,6 +4109,7 @@ static int cfq_init_queue(struct request_queue *q) cfq_init_cfqg_base(cfqd->root_group); #endif cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.3 From b276a876a014c5fa58a16f247c0933f6c42112e3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:12 -0800 Subject: blkcg: add blkg_policy_data->plid Add pd->plid so that the policy a pd belongs to can be identified easily. This will be used to implement hierarchical blkg_[rw]stats. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 2 ++ block/blk-cgroup.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 10e1df9da46e..3a8de321d1f6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -113,6 +113,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->pd[i] = pd; pd->blkg = blkg; + pd->plid = i; /* invoke per-policy init */ if (pol->pd_init_fn) @@ -908,6 +909,7 @@ int blkcg_activate_policy(struct request_queue *q, blkg->pd[pol->plid] = pd; pd->blkg = blkg; + pd->plid = pol->plid; pol->pd_init_fn(blkg); spin_unlock(&blkg->blkcg->lock); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 24462258200e..40f5b9768aac 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -81,8 +81,9 @@ struct blkg_rwstat { * beginning and pd_size can't be smaller than pd. */ struct blkg_policy_data { - /* the blkg this per-policy data belongs to */ + /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; + int plid; /* used during policy activation */ struct list_head alloc_node; -- cgit v1.2.3 From f427d909648aa592c9588d0f66b5b457752a0cd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:12 -0800 Subject: blkcg: implement blkcg_policy->on/offline_pd_fn() and blkcg_gq->online Add two blkcg_policy methods, ->online_pd_fn() and ->offline_pd_fn(), which are invoked as the policy_data gets activated and deactivated while holding both blkcg and q locks. Also, add blkcg_gq->online bool, which is set and cleared as the blkcg_gq gets activated and deactivated. This flag also is toggled while holding both blkcg and q locks. These will be used to implement hierarchical stats. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 21 ++++++++++++++++++++- block/blk-cgroup.h | 7 +++++++ 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3a8de321d1f6..4d625d28e070 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -182,7 +182,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - int ret; + int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); @@ -218,7 +218,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &q->blkg_list); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_online_fn) + pol->pd_online_fn(blkg); + } } + blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) @@ -291,6 +299,7 @@ EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + int i; lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); @@ -299,6 +308,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) WARN_ON_ONCE(list_empty(&blkg->q_node)); WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_offline_fn) + pol->pd_offline_fn(blkg); + } + blkg->online = false; + radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); @@ -956,6 +973,8 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg); if (pol->pd_exit_fn) pol->pd_exit_fn(blkg); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 40f5b9768aac..678e89ed8ecb 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -106,12 +106,17 @@ struct blkcg_gq { /* reference count */ int refcnt; + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; }; typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); @@ -124,6 +129,8 @@ struct blkcg_policy { /* operations */ blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_exit_pd_fn *pd_exit_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; -- cgit v1.2.3 From b50da39f51139f81b3115d0f9d8632507f802755 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:12 -0800 Subject: blkcg: export __blkg_prfill_rwstat() Hierarchical stats for cfq-iosched will need __blkg_prfill_rwstat(). Export it. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4d625d28e070..3aec4cdc8968 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -552,6 +552,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); /** * blkg_prfill_stat - prfill callback for blkg_stat -- cgit v1.2.3 From 16b3de6652c7aef151f38726faf90f0dbc9e9c71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:12 -0800 Subject: blkcg: implement blkg_[rw]stat_recursive_sum() and blkg_[rw]stat_merge() Implement blkg_[rw]stat_recursive_sum() and blkg_[rw]stat_merge(). The former two collect the [rw]stats designated by the target policy data and offset from the pd's subtree. The latter two add one [rw]stat to another. Note that the recursive sum functions require the queue lock to be held on entry to make blkg online test reliable. This is necessary to properly handle stats of a dying blkg. These will be used to implement hierarchical stats. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-cgroup.h | 35 ++++++++++++++++++ 2 files changed, 142 insertions(+) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3aec4cdc8968..f9797b244eb3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -32,6 +32,26 @@ EXPORT_SYMBOL_GPL(blkcg_root); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_cgrp: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip + * subtree. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ + cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ + (p_blkg)->q, false))) + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -127,6 +147,17 @@ err_free: return NULL; } +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, bool update_hint) { @@ -585,6 +616,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +/** + * blkg_stat_recursive_sum - collect hierarchical blkg_stat + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * Collect the blkg_stat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. + */ +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +{ + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup *pos_cgrp; + u64 sum; + + lockdep_assert_held(pd->blkg->q->queue_lock); + + sum = blkg_stat_read((void *)pd + off); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_stat *stat = (void *)pos_pd + off; + + if (pos_blkg->online) + sum += blkg_stat_read(stat); + } + rcu_read_unlock(); + + return sum; +} +EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * Collect the blkg_rwstat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. + */ +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off) +{ + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup *pos_cgrp; + struct blkg_rwstat sum; + int i; + + lockdep_assert_held(pd->blkg->q->queue_lock); + + sum = blkg_rwstat_read((void *)pd + off); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_rwstat *rwstat = (void *)pos_pd + off; + struct blkg_rwstat tmp; + + if (!pos_blkg->online) + continue; + + tmp = blkg_rwstat_read(rwstat); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum.cnt[i] += tmp.cnt[i]; + } + rcu_read_unlock(); + + return sum; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); + /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 586c0ac3309a..f2b292925ccd 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -164,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; @@ -413,6 +417,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat) stat->cnt = 0; } +/** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +{ + blkg_stat_add(to, blkg_stat_read(from)); +} + /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat @@ -484,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); } +/** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. + */ +static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); +} + #else /* CONFIG_BLK_CGROUP */ struct cgroup; -- cgit v1.2.3 From 810ecfa765f8be20c8d9c468885b3403a2232d2f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:13 -0800 Subject: blkcg: make blkcg_print_blkgs() grab q locks instead of blkcg lock Instead of holding blkcg->lock while walking ->blkg_list and executing prfill(), RCU walk ->blkg_list and hold the blkg's queue lock while executing prfill(). This makes prfill() implementations easier as stats are mostly protected by queue lock. This will be used to implement hierarchical stats. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- block/blk-cgroup.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f9797b244eb3..87ea95d1f533 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -504,8 +504,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg) * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the - * policy data and @data. If @show_total is %true, the sum of the return - * values from @prfill is printed with "Total" label at the end. + * policy data and @data and the matching queue lock held. If @show_total + * is %true, the sum of the return values from @prfill is printed with + * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. @@ -520,11 +521,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, struct hlist_node *n; u64 total = 0; - spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(&blkcg->lock); + spin_unlock_irq(blkg->q->queue_lock); + } + rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); -- cgit v1.2.3