Diffstat (limited to 'libbcache/tier.c')
-rw-r--r--  libbcache/tier.c | 243
1 file changed, 243 insertions, 0 deletions
diff --git a/libbcache/tier.c b/libbcache/tier.c
new file mode 100644
index 00000000..2b568e1f
--- /dev/null
+++ b/libbcache/tier.c
@@ -0,0 +1,243 @@
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "tier.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
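+/*
+ * Per-scan state for tiering: tracks which device in the target tier we are
+ * currently writing to and how many sectors have been issued to it, so that
+ * moves are striped across the tier's devices.
+ */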
+struct tiering_state {
+ struct cache_group *tier;
+ unsigned tier_idx;
+ unsigned sectors;
+ unsigned stripe_size;
+ unsigned dev_idx;
+ struct cache *ca;
+};
+
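+/*
+ * Decide whether @k should be copied to this tier: true if it's a data
+ * extent with room for another pointer and fewer than data_replicas copies
+ * already live on this tier (or a slower one).
+ */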
+static bool tiering_pred(struct cache_set *c,
+ struct tiering_state *s,
+ struct bkey_s_c k)
+{
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_rcu *mi;
+ unsigned replicas = 0;
+
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+ BKEY_EXTENT_VAL_U64s_MAX)
+ return false;
+
+ mi = cache_member_info_get(c);
+ extent_for_each_ptr(e, ptr)
+ if (ptr->dev < mi->nr_in_set &&
+ mi->m[ptr->dev].tier >= s->tier_idx)
+ replicas++;
+ cache_member_info_put();
+
+ return replicas < c->opts.data_replicas;
+ }
+
+ return false;
+}
+
+static void tier_put_device(struct tiering_state *s)
+{
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ s->ca = NULL;
+}
+
+/**
+ * tier_next_device - advance to the next device in the tier once a full
+ * stripe's worth of sectors has been written to the current one
+ */
+static void tier_next_device(struct cache_set *c, struct tiering_state *s)
+{
+ if (!s->ca || s->sectors > s->stripe_size) {
+ tier_put_device(s);
+ s->sectors = 0;
+ s->dev_idx++;
+
+ spin_lock(&s->tier->lock);
+ if (s->dev_idx >= s->tier->nr_devices)
+ s->dev_idx = 0;
+
+ if (s->tier->nr_devices) {
+ s->ca = s->tier->d[s->dev_idx].dev;
+ percpu_ref_get(&s->ca->ref);
+ }
+ spin_unlock(&s->tier->lock);
+ }
+}
+
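+/* Issue a move that rewrites @k to the current device's tiering write point */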
+static int issue_tiering_move(struct cache_set *c,
+ struct tiering_state *s,
+ struct moving_context *ctxt,
+ struct bkey_s_c k)
+{
+ int ret;
+
+ ret = bch_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL);
+ if (!ret) {
+ trace_bcache_tiering_copy(k.k);
+ s->sectors += k.k->size;
+ } else {
+ trace_bcache_tiering_alloc_fail(c, k.k->size);
+ }
+
+ return ret;
+}
+
+/**
+ * read_tiering - walk the extent btree and copy any extents missing replicas
+ * on this tier, striping writes across the tier's devices in round robin
+ * order
+ */
+static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+{
+ struct moving_context ctxt;
+ struct tiering_state s;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned nr_devices = READ_ONCE(tier->nr_devices);
+ int ret;
+
+ if (!nr_devices)
+ return 0;
+
+ trace_bcache_tiering_start(c);
+
+ memset(&s, 0, sizeof(s));
+ s.tier = tier;
+ s.tier_idx = tier - c->cache_tiers;
+ s.stripe_size = 2048; /* 1 mb for now */
+
+ bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+ nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (!kthread_should_stop() &&
+ !bch_move_ctxt_wait(&ctxt) &&
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
+ if (!tiering_pred(c, &s, k))
+ goto next;
+
+ tier_next_device(c, &s);
+ if (!s.ca)
+ break;
+
+ ret = issue_tiering_move(c, &s, &ctxt, k);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+
+ /* memory allocation failure, wait for some IO to finish */
+ bch_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+next:
+ bch_btree_iter_advance_pos(&iter);
+ //bch_btree_iter_cond_resched(&iter);
+
+ /* unlock before calling bch_move_ctxt_wait() */
+ bch_btree_iter_unlock(&iter);
+ cond_resched();
+ }
+
+ bch_btree_iter_unlock(&iter);
+ tier_put_device(&s);
+ bch_move_ctxt_exit(&ctxt);
+ trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
+
+ return ctxt.sectors_moved;
+}
+
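+/*
+ * Background tiering thread: sleeps on the write IO clock until the faster
+ * tier(s) are more than half full, then migrates data down to this tier.
+ */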
+static int bch_tiering_thread(void *arg)
+{
+ struct cache_set *c = arg;
+ struct cache_group *tier = &c->cache_tiers[1];
+ struct io_clock *clock = &c->io_clock[WRITE];
+ struct cache *ca;
+ u64 tier_capacity, available_sectors;
+ unsigned long last;
+ unsigned i;
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->tiering_enabled &&
+ tier->nr_devices))
+ break;
+
+ while (1) {
+ struct cache_group *faster_tier;
+
+ last = atomic_long_read(&clock->now);
+
+ tier_capacity = available_sectors = 0;
+ rcu_read_lock();
+ for (faster_tier = c->cache_tiers;
+ faster_tier != tier;
+ faster_tier++) {
+ group_for_each_cache_rcu(ca, faster_tier, i) {
+ tier_capacity +=
+ (ca->mi.nbuckets -
+ ca->mi.first_bucket) << ca->bucket_bits;
+ available_sectors +=
+ buckets_available_cache(ca) << ca->bucket_bits;
+ }
+ }
+ rcu_read_unlock();
+
+ if (available_sectors < (tier_capacity >> 1))
+ break;
+
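+ /*
+ * Faster tiers are still more than half free: wait until enough
+ * more has been written that free space could have dropped to
+ * the halfway mark, then re-check.
+ */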
+ bch_kthread_io_clock_wait(clock,
+ last +
+ available_sectors -
+ (tier_capacity >> 1));
+ if (kthread_should_stop())
+ return 0;
+ }
+
+ read_tiering(c, tier);
+ }
+
+ return 0;
+}
+
+void bch_tiering_init_cache_set(struct cache_set *c)
+{
+ bch_pd_controller_init(&c->tiering_pd);
+}
+
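+/* Create and wake the cache set's tiering thread */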
+int bch_tiering_read_start(struct cache_set *c)
+{
+ struct task_struct *t;
+
+ t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ c->tiering_read = t;
+ wake_up_process(c->tiering_read);
+
+ return 0;
+}
+
+void bch_tiering_read_stop(struct cache_set *c)
+{
+ if (!IS_ERR_OR_NULL(c->tiering_read)) {
+ kthread_stop(c->tiering_read);
+ c->tiering_read = NULL;
+ }
+}