Diffstat (limited to 'libbcache/tier.c')
-rw-r--r-- | libbcache/tier.c | 243
1 file changed, 243 insertions, 0 deletions
diff --git a/libbcache/tier.c b/libbcache/tier.c
new file mode 100644
index 00000000..2b568e1f
--- /dev/null
+++ b/libbcache/tier.c
@@ -0,0 +1,243 @@
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "tier.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+struct tiering_state {
+	struct cache_group	*tier;
+	unsigned		tier_idx;
+	unsigned		sectors;
+	unsigned		stripe_size;
+	unsigned		dev_idx;
+	struct cache		*ca;
+};
+
+static bool tiering_pred(struct cache_set *c,
+			 struct tiering_state *s,
+			 struct bkey_s_c k)
+{
+	if (bkey_extent_is_data(k.k)) {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const struct bch_extent_ptr *ptr;
+		struct cache_member_rcu *mi;
+		unsigned replicas = 0;
+
+		/* Make sure we have room to add a new pointer: */
+		if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+		    BKEY_EXTENT_VAL_U64s_MAX)
+			return false;
+
+		mi = cache_member_info_get(c);
+		extent_for_each_ptr(e, ptr)
+			if (ptr->dev < mi->nr_in_set &&
+			    mi->m[ptr->dev].tier >= s->tier_idx)
+				replicas++;
+		cache_member_info_put();
+
+		return replicas < c->opts.data_replicas;
+	}
+
+	return false;
+}
+
+static void tier_put_device(struct tiering_state *s)
+{
+	if (s->ca)
+		percpu_ref_put(&s->ca->ref);
+	s->ca = NULL;
+}
+
+/**
+ * tier_next_device - move on to the next device in the tier, in round
+ * robin order
+ */
+static void tier_next_device(struct cache_set *c, struct tiering_state *s)
+{
+	if (!s->ca || s->sectors > s->stripe_size) {
+		tier_put_device(s);
+		s->sectors = 0;
+		s->dev_idx++;
+
+		spin_lock(&s->tier->lock);
+		if (s->dev_idx >= s->tier->nr_devices)
+			s->dev_idx = 0;
+
+		if (s->tier->nr_devices) {
+			s->ca = s->tier->d[s->dev_idx].dev;
+			percpu_ref_get(&s->ca->ref);
+		}
+		spin_unlock(&s->tier->lock);
+	}
+}
+
+static int issue_tiering_move(struct cache_set *c,
+			      struct tiering_state *s,
+			      struct moving_context *ctxt,
+			      struct bkey_s_c k)
+{
+	int ret;
+
+	ret = bch_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL);
+	if (!ret) {
+		trace_bcache_tiering_copy(k.k);
+		s->sectors += k.k->size;
+	} else {
+		trace_bcache_tiering_alloc_fail(c, k.k->size);
+	}
+
+	return ret;
+}
+
+/**
+ * read_tiering - move extents that don't yet have enough replicas on this
+ * tier to the tier's devices, in round robin order
+ */
+static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+{
+	struct moving_context ctxt;
+	struct tiering_state s;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	unsigned nr_devices = READ_ONCE(tier->nr_devices);
+	int ret;
+
+	if (!nr_devices)
+		return 0;
+
+	trace_bcache_tiering_start(c);
+
+	memset(&s, 0, sizeof(s));
+	s.tier		= tier;
+	s.tier_idx	= tier - c->cache_tiers;
+	s.stripe_size	= 2048; /* 1 mb for now */
+
+	bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+			   nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
+	bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+	while (!kthread_should_stop() &&
+	       !bch_move_ctxt_wait(&ctxt) &&
+	       (k = bch_btree_iter_peek(&iter)).k &&
+	       !btree_iter_err(k)) {
+		if (!tiering_pred(c, &s, k))
+			goto next;
+
+		tier_next_device(c, &s);
+		if (!s.ca)
+			break;
+
+		ret = issue_tiering_move(c, &s, &ctxt, k);
+		if (ret) {
+			bch_btree_iter_unlock(&iter);
+
+			/* memory allocation failure, wait for some IO to finish */
+			bch_move_ctxt_wait_for_io(&ctxt);
+			continue;
+		}
+next:
+		bch_btree_iter_advance_pos(&iter);
+		//bch_btree_iter_cond_resched(&iter);
+
+		/* unlock before calling moving_context_wait() */
+		bch_btree_iter_unlock(&iter);
+		cond_resched();
+	}
+
+	bch_btree_iter_unlock(&iter);
+	tier_put_device(&s);
+	bch_move_ctxt_exit(&ctxt);
+	trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
+
+	return ctxt.sectors_moved;
+}
+
+static int bch_tiering_thread(void *arg)
+{
+	struct cache_set *c = arg;
+	struct cache_group *tier = &c->cache_tiers[1];
+	struct io_clock *clock = &c->io_clock[WRITE];
+	struct cache *ca;
+	u64 tier_capacity, available_sectors;
+	unsigned long last;
+	unsigned i;
+
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		if (kthread_wait_freezable(c->tiering_enabled &&
+					   tier->nr_devices))
+			break;
+
+		while (1) {
+			struct cache_group *faster_tier;
+
+			last = atomic_long_read(&clock->now);
+
+			tier_capacity = available_sectors = 0;
+			rcu_read_lock();
+			for (faster_tier = c->cache_tiers;
+			     faster_tier != tier;
+			     faster_tier++) {
+				group_for_each_cache_rcu(ca, faster_tier, i) {
+					tier_capacity +=
+						(ca->mi.nbuckets -
+						 ca->mi.first_bucket) << ca->bucket_bits;
+					available_sectors +=
+						buckets_available_cache(ca) << ca->bucket_bits;
+				}
+			}
+			rcu_read_unlock();
+
+			if (available_sectors < (tier_capacity >> 1))
+				break;
+
+			bch_kthread_io_clock_wait(clock,
+						  last +
+						  available_sectors -
+						  (tier_capacity >> 1));
+			if (kthread_should_stop())
+				return 0;
+		}
+
+		read_tiering(c, tier);
+	}
+
+	return 0;
+}
+
+void bch_tiering_init_cache_set(struct cache_set *c)
+{
+	bch_pd_controller_init(&c->tiering_pd);
+}
+
+int bch_tiering_read_start(struct cache_set *c)
+{
+	struct task_struct *t;
+
+	t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	c->tiering_read = t;
+	wake_up_process(c->tiering_read);
+
+	return 0;
+}
+
+void bch_tiering_read_stop(struct cache_set *c)
+{
+	if (!IS_ERR_OR_NULL(c->tiering_read)) {
+		kthread_stop(c->tiering_read);
+		c->tiering_read = NULL;
+	}
+}