Diffstat (limited to 'libbcache/alloc.c')
-rw-r--r-- | libbcache/alloc.c | 1913 |
1 file changed, 0 insertions, 1913 deletions
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
deleted file mode 100644
index 2f892914..00000000
--- a/libbcache/alloc.c
+++ /dev/null
@@ -1,1913 +0,0 @@
-/*
- * Primary bucket allocation code
- *
- * Copyright 2012 Google, Inc.
- *
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- *
- * When we invalidate a bucket, we have to write its new gen to disk and wait
- * for that write to complete before we use it - otherwise after a crash we
- * could have pointers that appeared to be good but pointed to data that had
- * been overwritten.
- *
- * Since the gens and priorities are all stored contiguously on disk, we can
- * batch this up: We fill up the free_inc list with freshly invalidated buckets,
- * call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
- *
- * free_inc isn't the only freelist - if it was, we'd often have to sleep while
- * priorities and gens were being written before we could allocate. c->free is a
- * smaller freelist, and buckets on that list are always ready to be used.
- *
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
- * It's important to ensure that gens don't wrap around - with respect to
- * either the oldest gen in the btree or the gen on disk. This is quite
- * difficult to do in practice, but we explicitly guard against it anyways - if
- * a bucket is in danger of wrapping around we simply skip invalidating it that
- * time around, and we garbage collect or rewrite the priorities sooner than we
- * would have otherwise.
- *
- * bch_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- *
- * invalidate_buckets() drives all the processes described above. It's called
- * from bch_bucket_alloc() and a few other places that need to make sure free
- * buckets are ready.
- *
- * invalidate_buckets_(lru|fifo)() find buckets that are available to be
- * invalidated, and then invalidate them and stick them on the free_inc list -
- * in either lru or fifo order.
- */
-
-#include "bcache.h"
-#include "alloc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "super-io.h"
-
-#include <linux/blkdev.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rcupdate.h>
-#include <trace/events/bcache.h>
-
-static void __bch_bucket_free(struct bch_dev *, struct bucket *);
-static void bch_recalc_min_prio(struct bch_dev *, int);
-
-/* Allocation groups: */
-
-void bch_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca) {
-			grp->nr--;
-			memmove(&grp->d[i],
-				&grp->d[i + 1],
-				(grp->nr - i) * sizeof(grp->d[0]));
-			break;
-		}
-
-	spin_unlock(&grp->lock);
-}
-
-void bch_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca)
-			goto out;
-
-	BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
-
-	grp->d[grp->nr++].dev = ca;
-out:
-	spin_unlock(&grp->lock);
-}
-
-/* Ratelimiting/PD controllers */
-
-static void pd_controllers_update(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(to_delayed_work(work),
-					struct bch_fs,
-					pd_controllers_update);
-	struct bch_dev *ca;
-	unsigned i, iter;
-
-	/* All units are in bytes */
-	u64 faster_tiers_size = 0;
-	u64 faster_tiers_dirty = 0;
-
-	u64 fastest_tier_size = 0;
-	u64 fastest_tier_free = 0;
-	u64 copygc_can_free = 0;
-
-	rcu_read_lock();
-	for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
-		bch_pd_controller_update(&c->tiers[i].pd,
-				div_u64(faster_tiers_size *
					c->tiering_percent, 100),
-				faster_tiers_dirty,
-				-1);
-
-		spin_lock(&c->tiers[i].devs.lock);
-		group_for_each_dev(ca, &c->tiers[i].devs, iter) {
-			struct bch_dev_usage stats = bch_dev_usage_read(ca);
-			unsigned bucket_bits = ca->bucket_bits + 9;
-
-			u64 size = (ca->mi.nbuckets -
-				    ca->mi.first_bucket) << bucket_bits;
-			u64 dirty = stats.buckets_dirty << bucket_bits;
-			u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
-			/*
-			 * Bytes of internal fragmentation, which can be
-			 * reclaimed by copy GC
-			 */
-			s64 fragmented = ((stats.buckets_dirty +
-					   stats.buckets_cached) <<
-					  bucket_bits) -
-				((stats.sectors[S_DIRTY] +
-				  stats.sectors[S_CACHED]) << 9);
-
-			fragmented = max(0LL, fragmented);
-
-			bch_pd_controller_update(&ca->moving_gc_pd,
-						 free, fragmented, -1);
-
-			faster_tiers_size += size;
-			faster_tiers_dirty += dirty;
-
-			if (!c->fastest_tier ||
-			    c->fastest_tier == &c->tiers[i]) {
-				fastest_tier_size += size;
-				fastest_tier_free += free;
-			}
-
-			copygc_can_free += fragmented;
-		}
-		spin_unlock(&c->tiers[i].devs.lock);
-	}
-
-	rcu_read_unlock();
-
-	/*
-	 * Throttle foreground writes if tier 0 is running out of free buckets,
-	 * and either tiering or copygc can free up space.
-	 *
-	 * Target will be small if there isn't any work to do - we don't want to
-	 * throttle foreground writes if we currently have all the free space
-	 * we're ever going to have.
-	 *
-	 * Otherwise, if there's work to do, try to keep 20% of tier0 available
-	 * for foreground writes.
- */
-	if (c->fastest_tier)
-		copygc_can_free = U64_MAX;
-
-	bch_pd_controller_update(&c->foreground_write_pd,
-				 min(copygc_can_free,
-				     div_u64(fastest_tier_size *
-					     c->foreground_target_percent,
-					     100)),
-				 fastest_tier_free,
-				 -1);
-
-	schedule_delayed_work(&c->pd_controllers_update,
-			      c->pd_controllers_update_seconds * HZ);
-}
-
-/*
- * Bucket priorities/gens:
- *
- * For each bucket, we store on disk its
- *   8 bit gen
- *  16 bit priority
- *
- * See alloc.c for an explanation of the gen. The priority is used to implement
- * lru (and in the future other) cache replacement policies; for most purposes
- * it's just an opaque integer.
- *
- * The gens and the priorities don't have a whole lot to do with each other, and
- * it's actually the gens that must be written out at specific times - it's no
- * big deal if the priorities don't get written, if we lose them we just reuse
- * buckets in suboptimal order.
- *
- * On disk they're stored in a packed array, and in as many buckets are required
- * to fit them all. The buckets we use to store them form a list; the journal
- * header points to the first bucket, the first bucket points to the second
- * bucket, et cetera.
- *
- * This code is used by the allocation code; periodically (whenever it runs out
- * of buckets to allocate from) the allocation code will invalidate some
- * buckets, but it can't use those buckets until their new gens are safely on
- * disk.
- */
-
-static int prio_io(struct bch_dev *ca, uint64_t bucket, int op)
-{
-	bio_init(ca->bio_prio);
-	bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META);
-
-	ca->bio_prio->bi_max_vecs = bucket_pages(ca);
-	ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs;
-	ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
-	ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
-	ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
-	bch_bio_map(ca->bio_prio, ca->disk_buckets);
-
-	return submit_bio_wait(ca->bio_prio);
-}
-
-static struct nonce prio_nonce(struct prio_set *p)
-{
-	return (struct nonce) {{
-		[0] = 0,
-		[1] = p->nonce[0],
-		[2] = p->nonce[1],
-		[3] = p->nonce[2] ^ BCH_NONCE_PRIO,
-	}};
-}
-
-static int bch_prio_write(struct bch_dev *ca)
-{
-	struct bch_fs *c = ca->fs;
-	struct journal *j = &c->journal;
-	struct journal_res res = { 0 };
-	bool need_new_journal_entry;
-	int i, ret;
-
-	if (c->opts.nochanges)
-		return 0;
-
-	trace_bcache_prio_write_start(ca);
-
-	atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
-		     &ca->meta_sectors_written);
-
-	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
-		struct bucket *g;
-		struct prio_set *p = ca->disk_buckets;
-		struct bucket_disk *d = p->data;
-		struct bucket_disk *end = d + prios_per_bucket(ca);
-		size_t r;
-
-		for (r = i * prios_per_bucket(ca);
-		     r < ca->mi.nbuckets && d < end;
-		     r++, d++) {
-			g = ca->buckets + r;
-			d->read_prio = cpu_to_le16(g->read_prio);
-			d->write_prio = cpu_to_le16(g->write_prio);
-			d->gen = ca->buckets[r].mark.gen;
-		}
-
-		p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
-		p->magic = cpu_to_le64(pset_magic(c));
-		get_random_bytes(&p->nonce, sizeof(p->nonce));
-
-		spin_lock(&ca->prio_buckets_lock);
-		r = bch_bucket_alloc(ca, RESERVE_PRIO);
-		BUG_ON(!r);
-
-		/*
-		 * goes here before dropping prio_buckets_lock to guard against
-		 * it getting gc'd from under us
-		 */
-		ca->prio_buckets[i] = r;
-		bch_mark_metadata_bucket(ca, ca->buckets + r,
-					 BUCKET_PRIOS, false);
-		spin_unlock(&ca->prio_buckets_lock);
-
-		SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
-
-		bch_encrypt(c, PSET_CSUM_TYPE(p),
-			    prio_nonce(p),
p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - p->csum = bch_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - - ret = prio_io(ca, r, REQ_OP_WRITE); - if (bch_dev_fatal_io_err_on(ret, ca, - "prio write to bucket %zu", r) || - bch_meta_write_fault("prio")) - return ret; - } - - spin_lock(&j->lock); - j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]); - j->nr_prio_buckets = max_t(unsigned, - ca->dev_idx + 1, - j->nr_prio_buckets); - spin_unlock(&j->lock); - - do { - unsigned u64s = jset_u64s(0); - - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) - break; - - ret = bch_journal_res_get(j, &res, u64s, u64s); - if (ret) - return ret; - - need_new_journal_entry = j->buf[res.idx].nr_prio_buckets < - ca->dev_idx + 1; - bch_journal_res_put(j, &res); - - ret = bch_journal_flush_seq(j, res.seq); - if (ret) - return ret; - } while (need_new_journal_entry); - - /* - * Don't want the old priorities to get garbage collected until after we - * finish writing the new ones, and they're journalled - */ - - spin_lock(&ca->prio_buckets_lock); - - for (i = 0; i < prio_buckets(ca); i++) { - if (ca->prio_last_buckets[i]) - __bch_bucket_free(ca, - &ca->buckets[ca->prio_last_buckets[i]]); - - ca->prio_last_buckets[i] = ca->prio_buckets[i]; - } - - spin_unlock(&ca->prio_buckets_lock); - - trace_bcache_prio_write_end(ca); - return 0; -} - -int bch_prio_read(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; - struct bucket_mark new; - struct bch_csum csum; - unsigned bucket_nr = 0; - u64 bucket, expect, got; - size_t b; - int ret = 0; - - spin_lock(&c->journal.lock); - bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]); - spin_unlock(&c->journal.lock); - - /* - * If the device hasn't been used yet, there won't be a prio bucket ptr - */ - if (!bucket) - return 0; - - unfixable_fsck_err_on(bucket < ca->mi.first_bucket || - bucket >= ca->mi.nbuckets, c, - "bad prio bucket %llu", bucket); - - for (b = 0; b < ca->mi.nbuckets; b++, d++) { - if (d == end) { - ca->prio_last_buckets[bucket_nr] = bucket; - bucket_nr++; - - ret = prio_io(ca, bucket, REQ_OP_READ); - if (bch_dev_fatal_io_err_on(ret, ca, - "prior read from bucket %llu", - bucket) || - bch_meta_read_fault("prio")) - return -EIO; - - got = le64_to_cpu(p->magic); - expect = pset_magic(c); - unfixable_fsck_err_on(got != expect, c, - "bad magic (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket); - - unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c, - "prio bucket with unknown csum type %llu bucket %lluu", - PSET_CSUM_TYPE(p), bucket); - - csum = bch_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - unfixable_fsck_err_on(bch_crc_cmp(csum, p->csum), c, - "bad checksum reading prios from bucket %llu", - bucket); - - bch_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - bucket = le64_to_cpu(p->next_bucket); - d = p->data; - } - - ca->buckets[b].read_prio = le16_to_cpu(d->read_prio); - ca->buckets[b].write_prio = le16_to_cpu(d->write_prio); - - bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen); - } - - mutex_lock(&c->bucket_lock); - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - 
mutex_unlock(&c->bucket_lock); - - ret = 0; -fsck_err: - return ret; -} - -#define BUCKET_GC_GEN_MAX 96U - -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. - */ -static int wait_buckets_available(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = -1; - break; - } - - if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) { - if (c->gc_thread) { - trace_bcache_gc_cannot_inc_gens(ca->fs); - atomic_inc(&c->kick_gc); - wake_up_process(ca->fs->gc_thread); - } - - /* - * We are going to wait for GC to wake us up, even if - * bucket counters tell us enough buckets are available, - * because we are actually waiting for GC to rewrite - * nodes with stale pointers - */ - } else if (dev_buckets_available(ca) >= - fifo_free(&ca->free_inc)) - break; - - up_read(&ca->fs->gc_lock); - schedule(); - try_to_freeze(); - down_read(&ca->fs->gc_lock); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) -{ - if (expensive_debug_checks(ca->fs)) { - size_t iter; - long i; - unsigned j; - - for (iter = 0; iter < prio_buckets(ca) * 2; iter++) - BUG_ON(ca->prio_buckets[iter] == bucket); - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} - -/* Bucket heap / gen */ - -void bch_recalc_min_prio(struct bch_dev *ca, int rw) -{ - struct bch_fs *c = ca->fs; - struct prio_clock *clock = &c->prio_clock[rw]; - struct bucket *g; - u16 max_delta = 1; - unsigned i; - - lockdep_assert_held(&c->bucket_lock); - - /* Determine min prio for this particular cache */ - for_each_bucket(g, ca) - max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); - - ca->min_prio[rw] = clock->hand - max_delta; - - /* - * This may possibly increase the min prio for the whole cache, check - * that as well. 
- */ - max_delta = 1; - - for_each_member_device(ca, c, i) - max_delta = max(max_delta, - (u16) (clock->hand - ca->min_prio[rw])); - - clock->min_prio = clock->hand - max_delta; -} - -static void bch_rescale_prios(struct bch_fs *c, int rw) -{ - struct prio_clock *clock = &c->prio_clock[rw]; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_bcache_rescale_prios(c); - - for_each_member_device(ca, c, i) { - for_each_bucket(g, ca) - g->prio[rw] = clock->hand - - (clock->hand - g->prio[rw]) / 2; - - bch_recalc_min_prio(ca, rw); - } -} - -static void bch_inc_clock_hand(struct io_timer *timer) -{ - struct prio_clock *clock = container_of(timer, - struct prio_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, prio_clock[clock->rw]); - u64 capacity; - - mutex_lock(&c->bucket_lock); - - clock->hand++; - - /* if clock cannot be advanced more, rescale prio */ - if (clock->hand == (u16) (clock->min_prio - 1)) - bch_rescale_prios(c, clock->rw); - - mutex_unlock(&c->bucket_lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += capacity >> 10; - - bch_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch_prio_timer_init(struct bch_fs *c, int rw) -{ - struct prio_clock *clock = &c->prio_clock[rw]; - struct io_timer *timer = &clock->rescale; - - clock->rw = rw; - timer->fn = bch_inc_clock_hand; - timer->expire = c->capacity >> 10; -} - -/* - * Background allocation thread: scans for buckets to be invalidated, - * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. - */ - -static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g) -{ - return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX; -} - -static bool bch_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g) -{ - if (!is_available_bucket(READ_ONCE(g->mark))) - return false; - - if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1) - ca->inc_gen_needs_gc++; - - return can_inc_bucket_gen(ca, g); -} - -static void bch_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) -{ - spin_lock(&ca->freelist_lock); - - bch_invalidate_bucket(ca, g); - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; - - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - - spin_unlock(&ca->freelist_lock); -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - The difference between the bucket's current gen and oldest gen of any - * pointer into it, which gives us an indication of the cost of an eventual - * btree GC to rewrite nodes with stale pointers. 
- */ - -#define bucket_sort_key(g) \ -({ \ - unsigned long prio = g->read_prio - ca->min_prio[READ]; \ - prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - \ - ca->min_prio[READ]); \ - \ - (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\ -}) - -static void invalidate_buckets_lru(struct bch_dev *ca) -{ - struct bucket_heap_entry e; - struct bucket *g; - unsigned i; - - mutex_lock(&ca->heap_lock); - - ca->heap.used = 0; - - mutex_lock(&ca->fs->bucket_lock); - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for_each_bucket(g, ca) { - if (!bch_can_invalidate_bucket(ca, g)) - continue; - - bucket_heap_push(ca, g, bucket_sort_key(g)); - } - - /* Sort buckets by physical location on disk for better locality */ - for (i = 0; i < ca->heap.used; i++) { - struct bucket_heap_entry *e = &ca->heap.data[i]; - - e->val = e->g - ca->buckets; - } - - heap_resort(&ca->heap, bucket_max_cmp); - - /* - * If we run out of buckets to invalidate, bch_allocator_thread() will - * kick stuff and retry us - */ - while (!fifo_full(&ca->free_inc) && - heap_pop(&ca->heap, e, bucket_max_cmp)) { - BUG_ON(!bch_can_invalidate_bucket(ca, e.g)); - bch_invalidate_one_bucket(ca, e.g); - } - - mutex_unlock(&ca->fs->bucket_lock); - mutex_unlock(&ca->heap_lock); -} - -static void invalidate_buckets_fifo(struct bch_dev *ca) -{ - struct bucket *g; - size_t checked = 0; - - while (!fifo_full(&ca->free_inc)) { - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - g = ca->buckets + ca->fifo_last_bucket++; - - if (bch_can_invalidate_bucket(ca, g)) - bch_invalidate_one_bucket(ca, g); - - if (++checked >= ca->mi.nbuckets) - return; - } -} - -static void invalidate_buckets_random(struct bch_dev *ca) -{ - struct bucket *g; - size_t checked = 0; - - while (!fifo_full(&ca->free_inc)) { - size_t n = bch_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - g = ca->buckets + n; - - if (bch_can_invalidate_bucket(ca, g)) - bch_invalidate_one_bucket(ca, g); - - if (++checked >= ca->mi.nbuckets / 2) - return; - } -} - -static void invalidate_buckets(struct bch_dev *ca) -{ - ca->inc_gen_needs_gc = 0; - - switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(ca); - break; - case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(ca); - break; - case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(ca); - break; - } -} - -static bool __bch_allocator_push(struct bch_dev *ca, long bucket) -{ - if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_BTREE], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_NONE], bucket)) - goto success; - - return false; -success: - closure_wake_up(&ca->fs->freelist_wait); - return true; -} - -static bool bch_allocator_push(struct bch_dev *ca, long bucket) -{ - bool ret; - - spin_lock(&ca->freelist_lock); - ret = __bch_allocator_push(ca, bucket); - if (ret) - fifo_pop(&ca->free_inc, bucket); - spin_unlock(&ca->freelist_lock); - - return ret; -} - -static void bch_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - u16 last_seq_ondisk = c->journal.last_seq_ondisk; - struct bucket *g; - - for_each_bucket(g, 
ca) { - struct bucket_mark m = READ_ONCE(g->mark); - - if (is_available_bucket(m) && - !m.cached_sectors && - !m.had_metadata && - !bucket_needs_journal_commit(m, last_seq_ondisk)) { - spin_lock(&ca->freelist_lock); - - bch_mark_alloc_bucket(ca, g, true); - g->read_prio = c->prio_clock[READ].hand; - g->write_prio = c->prio_clock[WRITE].hand; - - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - - spin_unlock(&ca->freelist_lock); - - if (fifo_full(&ca->free_inc)) - break; - } - } -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by invalidate_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. - */ -static int bch_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - int ret; - - set_freezable(); - - bch_find_empty_buckets(c, ca); - - while (1) { - /* - * First, we pull buckets off of the free_inc list, possibly - * issue discards to them, then we add the bucket to a - * free list: - */ - - while (!fifo_empty(&ca->free_inc)) { - long bucket = fifo_peek(&ca->free_inc); - - /* - * Don't remove from free_inc until after it's added - * to freelist, so gc doesn't miss it while we've - * dropped bucket lock - */ - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (bch_allocator_push(ca, bucket)) - break; - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - goto out; - } - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - } - - down_read(&c->gc_lock); - - /* - * See if we have buckets we can reuse without invalidating them - * or forcing a journal commit: - */ - //bch_find_empty_buckets(c, ca); - - if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { - up_read(&c->gc_lock); - continue; - } - - /* We've run out of free buckets! */ - - while (!fifo_full(&ca->free_inc)) { - if (wait_buckets_available(ca)) { - up_read(&c->gc_lock); - goto out; - } - - /* - * Find some buckets that we can invalidate, either - * they're completely unused, or only contain clean data - * that's been written back to the backing device or - * another cache tier - */ - - invalidate_buckets(ca); - trace_bcache_alloc_batch(ca, fifo_used(&ca->free_inc), - ca->free_inc.size); - } - - up_read(&c->gc_lock); - - /* - * free_inc is full of newly-invalidated buckets, must write out - * prios and gens before they can be re-used - */ - ret = bch_prio_write(ca); - if (ret) { - /* - * Emergency read only - allocator thread has to - * shutdown. - * - * N.B. we better be going into RO mode, else - * allocations would hang indefinitely - whatever - * generated the error will have sent us into RO mode. 
- * - * Clear out the free_inc freelist so things are - * consistent-ish: - */ - spin_lock(&ca->freelist_lock); - while (!fifo_empty(&ca->free_inc)) { - long bucket; - - fifo_pop(&ca->free_inc, bucket); - bch_mark_free_bucket(ca, ca->buckets + bucket); - } - spin_unlock(&ca->freelist_lock); - goto out; - } - } -out: - /* - * Avoid a race with bch_usage_update() trying to wake us up after - * we've exited: - */ - synchronize_rcu(); - return 0; -} - -/* Allocation */ - -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -size_t bch_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve) -{ - struct bucket *g; - long r; - - spin_lock(&ca->freelist_lock); - if (fifo_pop(&ca->free[RESERVE_NONE], r) || - fifo_pop(&ca->free[reserve], r)) - goto out; - - spin_unlock(&ca->freelist_lock); - - trace_bcache_bucket_alloc_fail(ca, reserve); - return 0; -out: - verify_not_on_freelist(ca, r); - spin_unlock(&ca->freelist_lock); - - trace_bcache_bucket_alloc(ca, reserve); - - bch_wake_allocator(ca); - - g = ca->buckets + r; - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; - - return r; -} - -static void __bch_bucket_free(struct bch_dev *ca, struct bucket *g) -{ - bch_mark_free_bucket(ca, g); - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; -} - -enum bucket_alloc_ret { - ALLOC_SUCCESS, - NO_DEVICES, /* -EROFS */ - FREELIST_EMPTY, /* Allocator thread not keeping up */ -}; - -static void recalc_alloc_group_weights(struct bch_fs *c, - struct dev_group *devs) -{ - struct bch_dev *ca; - u64 available_buckets = 1; /* avoid a divide by zero... */ - unsigned i; - - for (i = 0; i < devs->nr; i++) { - ca = devs->d[i].dev; - - devs->d[i].weight = dev_buckets_free(ca); - available_buckets += devs->d[i].weight; - } - - for (i = 0; i < devs->nr; i++) { - const unsigned min_weight = U32_MAX >> 4; - const unsigned max_weight = U32_MAX; - - devs->d[i].weight = - min_weight + - div64_u64(devs->d[i].weight * - devs->nr * - (max_weight - min_weight), - available_buckets); - devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); - } -} - -static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c, - struct open_bucket *ob, - enum alloc_reserve reserve, - unsigned nr_replicas, - struct dev_group *devs, - long *devs_used) -{ - enum bucket_alloc_ret ret; - unsigned fail_idx = -1, i; - unsigned available = 0; - - BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); - - if (ob->nr_ptrs >= nr_replicas) - return ALLOC_SUCCESS; - - spin_lock(&devs->lock); - - for (i = 0; i < devs->nr; i++) - available += !test_bit(devs->d[i].dev->dev_idx, - devs_used); - - recalc_alloc_group_weights(c, devs); - - i = devs->cur_device; - - while (ob->nr_ptrs < nr_replicas) { - struct bch_dev *ca; - u64 bucket; - - if (!available) { - ret = NO_DEVICES; - goto err; - } - - i++; - i %= devs->nr; - - ret = FREELIST_EMPTY; - if (i == fail_idx) - goto err; - - ca = devs->d[i].dev; - - if (test_bit(ca->dev_idx, devs_used)) - continue; - - if (fail_idx == -1 && - get_random_int() > devs->d[i].weight) - continue; - - bucket = bch_bucket_alloc(ca, reserve); - if (!bucket) { - if (fail_idx == -1) - fail_idx = i; - continue; - } - - /* - * open_bucket_add_buckets expects new pointers at the head of - * the list: - */ - memmove(&ob->ptrs[1], - &ob->ptrs[0], - ob->nr_ptrs * sizeof(ob->ptrs[0])); - memmove(&ob->ptr_offset[1], - &ob->ptr_offset[0], - ob->nr_ptrs * 
sizeof(ob->ptr_offset[0])); - ob->nr_ptrs++; - ob->ptrs[0] = (struct bch_extent_ptr) { - .gen = ca->buckets[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), - .dev = ca->dev_idx, - }; - ob->ptr_offset[0] = 0; - - __set_bit(ca->dev_idx, devs_used); - available--; - devs->cur_device = i; - } - - ret = ALLOC_SUCCESS; -err: - EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); - spin_unlock(&devs->lock); - return ret; -} - -static enum bucket_alloc_ret __bch_bucket_alloc_set(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob, - unsigned nr_replicas, - enum alloc_reserve reserve, - long *devs_used) -{ - struct bch_tier *tier; - /* - * this should implement policy - for a given type of allocation, decide - * which devices to allocate from: - * - * XXX: switch off wp->type and do something more intelligent here - */ - if (wp->group) - return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - wp->group, devs_used); - - /* foreground writes: prefer fastest tier: */ - tier = READ_ONCE(c->fastest_tier); - if (tier) - bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &tier->devs, devs_used); - - return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &c->all_devs, devs_used); -} - -static int bch_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob, unsigned nr_replicas, - enum alloc_reserve reserve, long *devs_used, - struct closure *cl) -{ - bool waiting = false; - - while (1) { - switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs_used)) { - case ALLOC_SUCCESS: - if (waiting) - closure_wake_up(&c->freelist_wait); - - return 0; - - case NO_DEVICES: - if (waiting) - closure_wake_up(&c->freelist_wait); - return -EROFS; - - case FREELIST_EMPTY: - if (!cl || waiting) - trace_bcache_freelist_empty_fail(c, - reserve, cl); - - if (!cl) - return -ENOSPC; - - if (waiting) - return -EAGAIN; - - /* Retry allocation after adding ourself to waitlist: */ - closure_wait(&c->freelist_wait, cl); - waiting = true; - break; - default: - BUG(); - } - } -} - -/* Open buckets: */ - -/* - * Open buckets represent one or more buckets (on multiple devices) that are - * currently being allocated from. They serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. 
- */ - -static void __bch_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - const struct bch_extent_ptr *ptr; - - lockdep_assert_held(&c->open_buckets_lock); - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - - bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false); - } - - ob->nr_ptrs = 0; - - list_move(&ob->list, &c->open_buckets_free); - c->open_buckets_nr_free++; - closure_wake_up(&c->open_buckets_wait); -} - -void bch_open_bucket_put(struct bch_fs *c, struct open_bucket *b) -{ - if (atomic_dec_and_test(&b->pin)) { - spin_lock(&c->open_buckets_lock); - __bch_open_bucket_put(c, b); - spin_unlock(&c->open_buckets_lock); - } -} - -static struct open_bucket *bch_open_bucket_get(struct bch_fs *c, - unsigned nr_reserved, - struct closure *cl) -{ - struct open_bucket *ret; - - spin_lock(&c->open_buckets_lock); - - if (c->open_buckets_nr_free > nr_reserved) { - BUG_ON(list_empty(&c->open_buckets_free)); - ret = list_first_entry(&c->open_buckets_free, - struct open_bucket, list); - list_move(&ret->list, &c->open_buckets_open); - BUG_ON(ret->nr_ptrs); - - atomic_set(&ret->pin, 1); /* XXX */ - ret->has_full_ptrs = false; - - c->open_buckets_nr_free--; - trace_bcache_open_bucket_alloc(c, cl); - } else { - trace_bcache_open_bucket_alloc_fail(c, cl); - - if (cl) { - closure_wait(&c->open_buckets_wait, cl); - ret = ERR_PTR(-EAGAIN); - } else - ret = ERR_PTR(-ENOSPC); - } - - spin_unlock(&c->open_buckets_lock); - - return ret; -} - -static unsigned ob_ptr_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - struct bch_extent_ptr *ptr) -{ - struct bch_dev *ca = c->devs[ptr->dev]; - unsigned i = ptr - ob->ptrs; - unsigned bucket_size = ca->mi.bucket_size; - unsigned used = (ptr->offset & (bucket_size - 1)) + - ob->ptr_offset[i]; - - BUG_ON(used > bucket_size); - - return bucket_size - used; -} - -static unsigned open_bucket_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - unsigned nr_replicas) -{ - unsigned i, sectors_free = UINT_MAX; - - for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++) - sectors_free = min(sectors_free, - ob_ptr_sectors_free(c, ob, &ob->ptrs[i])); - - return sectors_free != UINT_MAX ? 
sectors_free : 0; -} - -static void open_bucket_copy_unused_ptrs(struct bch_fs *c, - struct open_bucket *new, - struct open_bucket *old) -{ - unsigned i; - - for (i = 0; i < old->nr_ptrs; i++) - if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) { - struct bch_extent_ptr tmp = old->ptrs[i]; - - tmp.offset += old->ptr_offset[i]; - new->ptrs[new->nr_ptrs] = tmp; - new->ptr_offset[new->nr_ptrs] = 0; - new->nr_ptrs++; - } -} - -static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) -{ -#ifdef CONFIG_BCACHE_DEBUG - const struct bch_extent_ptr *ptr; - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - - BUG_ON(ptr_stale(ca, ptr)); - } -#endif -} - -/* Sector allocator */ - -static struct open_bucket *lock_writepoint(struct bch_fs *c, - struct write_point *wp) -{ - struct open_bucket *ob; - - while ((ob = ACCESS_ONCE(wp->b))) { - mutex_lock(&ob->lock); - if (wp->b == ob) - break; - - mutex_unlock(&ob->lock); - } - - return ob; -} - -static int open_bucket_add_buckets(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; - unsigned i; - int ret; - - /* - * We might be allocating pointers to add to an existing extent - * (tiering/copygc/migration) - if so, some of the pointers in our - * existing open bucket might duplicate devices we already have. This is - * moderately annoying. - */ - - /* Short circuit all the fun stuff if posssible: */ - if (ob->nr_ptrs >= nr_replicas) - return 0; - - memset(devs_used, 0, sizeof(devs_used)); - - for (i = 0; i < ob->nr_ptrs; i++) - __set_bit(ob->ptrs[i].dev, devs_used); - - ret = bch_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs_used, cl); - - if (ret == -EROFS && - ob->nr_ptrs >= nr_replicas_required) - ret = 0; - - return ret; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -struct open_bucket *bch_alloc_sectors_start(struct bch_fs *c, - struct write_point *wp, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct open_bucket *ob; - unsigned open_buckets_reserved = wp == &c->btree_write_point - ? 0 : BTREE_NODE_RESERVE; - int ret; - - BUG_ON(!reserve); - BUG_ON(!nr_replicas); -retry: - ob = lock_writepoint(c, wp); - - /* - * If ob->sectors_free == 0, one or more of the buckets ob points to is - * full. We can't drop pointers from an open bucket - garbage collection - * still needs to find them; instead, we must allocate a new open bucket - * and copy any pointers to non-full buckets into the new open bucket. 
- */ - if (!ob || ob->has_full_ptrs) { - struct open_bucket *new_ob; - - new_ob = bch_open_bucket_get(c, open_buckets_reserved, cl); - if (IS_ERR(new_ob)) - return new_ob; - - mutex_lock(&new_ob->lock); - - /* - * We point the write point at the open_bucket before doing the - * allocation to avoid a race with shutdown: - */ - if (race_fault() || - cmpxchg(&wp->b, ob, new_ob) != ob) { - /* We raced: */ - mutex_unlock(&new_ob->lock); - bch_open_bucket_put(c, new_ob); - - if (ob) - mutex_unlock(&ob->lock); - goto retry; - } - - if (ob) { - open_bucket_copy_unused_ptrs(c, new_ob, ob); - mutex_unlock(&ob->lock); - bch_open_bucket_put(c, ob); - } - - ob = new_ob; - } - - ret = open_bucket_add_buckets(c, wp, ob, nr_replicas, - nr_replicas_required, - reserve, cl); - if (ret) { - mutex_unlock(&ob->lock); - return ERR_PTR(ret); - } - - ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); - - BUG_ON(!ob->sectors_free); - verify_not_stale(c, ob); - - return ob; -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, - unsigned nr_replicas, struct open_bucket *ob, - unsigned sectors) -{ - struct bch_extent_ptr tmp; - bool has_data = false; - unsigned i; - - /* - * We're keeping any existing pointer k has, and appending new pointers: - * __bch_write() will only write to the pointers we add here: - */ - - BUG_ON(sectors > ob->sectors_free); - - /* didn't use all the ptrs: */ - if (nr_replicas < ob->nr_ptrs) - has_data = true; - - for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) { - EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); - - tmp = ob->ptrs[i]; - tmp.cached = bkey_extent_is_cached(&e->k); - tmp.offset += ob->ptr_offset[i]; - extent_ptr_append(e, tmp); - - ob->ptr_offset[i] += sectors; - - this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors); - } -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch_alloc_sectors_done(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob) -{ - bool has_data = false; - unsigned i; - - for (i = 0; i < ob->nr_ptrs; i++) { - if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i])) - ob->has_full_ptrs = true; - else - has_data = true; - } - - if (likely(has_data)) - atomic_inc(&ob->pin); - else - BUG_ON(xchg(&wp->b, NULL) != ob); - - mutex_unlock(&ob->lock); -} - -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates k->size and k->offset (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, k->size indicates how many - * sectors were actually allocated. - * - * Return codes: - * - -EAGAIN: closure was added to waitlist - * - -ENOSPC: out of space and no closure provided - * - * @c - filesystem. - * @wp - write point to use for allocating sectors. - * @k - key to return the allocated space information. 
- * @cl - closure to wait for a bucket - */ -struct open_bucket *bch_alloc_sectors(struct bch_fs *c, - struct write_point *wp, - struct bkey_i_extent *e, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct open_bucket *ob; - - ob = bch_alloc_sectors_start(c, wp, nr_replicas, - nr_replicas_required, - reserve, cl); - if (IS_ERR_OR_NULL(ob)) - return ob; - - if (e->k.size > ob->sectors_free) - bch_key_resize(&e->k, ob->sectors_free); - - bch_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size); - - bch_alloc_sectors_done(c, wp, ob); - - return ob; -} - -/* Startup/shutdown (ro/rw): */ - -void bch_recalc_capacity(struct bch_fs *c) -{ - struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; - struct bch_dev *ca; - u64 total_capacity, capacity = 0, reserved_sectors = 0; - unsigned long ra_pages = 0; - unsigned i, j; - - for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = - blk_get_backing_dev_info(ca->disk_sb.bdev); - - ra_pages += bdi->ra_pages; - } - - c->bdi.ra_pages = ra_pages; - - /* Find fastest, slowest tiers with devices: */ - - for (tier = c->tiers; - tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { - if (!tier->devs.nr) - continue; - if (!fastest_tier) - fastest_tier = tier; - slowest_tier = tier; - } - - c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; - - c->promote_write_point.group = &fastest_tier->devs; - - if (!fastest_tier) - goto set_capacity; - - /* - * Capacity of the filesystem is the capacity of all the devices in the - * slowest (highest) tier - we don't include lower tier devices. - */ - spin_lock(&slowest_tier->devs.lock); - group_for_each_dev(ca, &slowest_tier->devs, i) { - size_t reserve = 0; - - /* - * We need to reserve buckets (from the number - * of currently available buckets) against - * foreground writes so that mainly copygc can - * make forward progress. - * - * We need enough to refill the various reserves - * from scratch - copygc will use its entire - * reserve all at once, then run against when - * its reserve is refilled (from the formerly - * available buckets). - * - * This reserve is just used when considering if - * allocations for foreground writes must wait - - * not -ENOSPC calculations. 
- */ - for (j = 0; j < RESERVE_NONE; j++) - reserve += ca->free[j].size; - - reserve += ca->free_inc.size; - - reserve += ARRAY_SIZE(c->write_points); - - if (ca->mi.tier) - reserve += 1; /* tiering write point */ - reserve += 1; /* btree write point */ - - reserved_sectors += reserve << ca->bucket_bits; - - capacity += (ca->mi.nbuckets - - ca->mi.first_bucket) << - ca->bucket_bits; - } - spin_unlock(&slowest_tier->devs.lock); -set_capacity: - total_capacity = capacity; - - capacity *= (100 - c->opts.gc_reserve_percent); - capacity = div64_u64(capacity, 100); - - BUG_ON(capacity + reserved_sectors > total_capacity); - - c->capacity = capacity; - - if (c->capacity) { - bch_io_timer_add(&c->io_clock[READ], - &c->prio_clock[READ].rescale); - bch_io_timer_add(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); - } else { - bch_io_timer_del(&c->io_clock[READ], - &c->prio_clock[READ].rescale); - bch_io_timer_del(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); - } - - /* Wake up case someone was waiting for buckets */ - closure_wake_up(&c->freelist_wait); -} - -static void bch_stop_write_point(struct bch_dev *ca, - struct write_point *wp) -{ - struct bch_fs *c = ca->fs; - struct open_bucket *ob; - struct bch_extent_ptr *ptr; - - ob = lock_writepoint(c, wp); - if (!ob) - return; - - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) - goto found; - - mutex_unlock(&ob->lock); - return; -found: - BUG_ON(xchg(&wp->b, NULL) != ob); - mutex_unlock(&ob->lock); - - /* Drop writepoint's ref: */ - bch_open_bucket_put(c, ob); -} - -static bool bch_dev_has_open_write_point(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_extent_ptr *ptr; - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) - if (atomic_read(&ob->pin)) { - mutex_lock(&ob->lock); - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) { - mutex_unlock(&ob->lock); - return true; - } - mutex_unlock(&ob->lock); - } - - return false; -} - -/* device goes ro: */ -void bch_dev_allocator_stop(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct task_struct *p; - struct closure cl; - unsigned i; - - closure_init_stack(&cl); - - /* First, remove device from allocation groups: */ - - bch_dev_group_remove(tier, ca); - bch_dev_group_remove(&c->all_devs, ca); - - bch_recalc_capacity(c); - - /* - * Stopping the allocator thread comes after removing from allocation - * groups, else pending allocations will hang: - */ - - p = ca->alloc_thread; - ca->alloc_thread = NULL; - smp_wmb(); - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid a race with bch_usage_update() - - * the allocator thread itself does a synchronize_rcu() on exit. - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - if (p) { - kthread_stop(p); - put_task_struct(p); - } - - /* Next, close write points that point to this device... 
*/ - - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch_stop_write_point(ca, &c->write_points[i]); - - bch_stop_write_point(ca, &ca->copygc_write_point); - bch_stop_write_point(ca, &c->promote_write_point); - bch_stop_write_point(ca, &ca->tiering_write_point); - bch_stop_write_point(ca, &c->migration_write_point); - bch_stop_write_point(ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch_open_bucket_put(c, a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - /* Avoid deadlocks.. */ - - closure_wake_up(&c->freelist_wait); - wake_up(&c->journal.wait); - - /* Now wait for any in flight writes: */ - - while (1) { - closure_wait(&c->open_buckets_wait, &cl); - - if (!bch_dev_has_open_write_point(ca)) { - closure_wake_up(&c->open_buckets_wait); - break; - } - - closure_sync(&cl); - } -} - -/* - * Startup the allocator thread for transition to RW mode: - */ -int bch_dev_allocator_start(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct bch_sb_field_journal *journal_buckets; - bool has_journal; - struct task_struct *k; - - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - k = kthread_create(bch_allocator_thread, ca, "bcache_allocator"); - if (IS_ERR(k)) - return 0; - - get_task_struct(k); - ca->alloc_thread = k; - - bch_dev_group_add(tier, ca); - bch_dev_group_add(&c->all_devs, ca); - - mutex_lock(&c->sb_lock); - journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); - has_journal = bch_nr_journal_buckets(journal_buckets) >= - BCH_JOURNAL_BUCKETS_MIN; - mutex_unlock(&c->sb_lock); - - if (has_journal) - bch_dev_group_add(&c->journal.devs, ca); - - bch_recalc_capacity(c); - - /* - * Don't wake up allocator thread until after adding device to - * allocator groups - otherwise, alloc thread could get a spurious - * -EROFS due to prio_write() -> journal_meta() not finding any devices: - */ - wake_up_process(k); - return 0; -} - -void bch_fs_allocator_init(struct bch_fs *c) -{ - unsigned i; - - INIT_LIST_HEAD(&c->open_buckets_open); - INIT_LIST_HEAD(&c->open_buckets_free); - spin_lock_init(&c->open_buckets_lock); - bch_prio_timer_init(c, READ); - bch_prio_timer_init(c, WRITE); - - /* open bucket 0 is a sentinal NULL: */ - mutex_init(&c->open_buckets[0].lock); - INIT_LIST_HEAD(&c->open_buckets[0].list); - - for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) { - mutex_init(&c->open_buckets[i].lock); - c->open_buckets_nr_free++; - list_add(&c->open_buckets[i].list, &c->open_buckets_free); - } - - spin_lock_init(&c->all_devs.lock); - - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) - spin_lock_init(&c->tiers[i].devs.lock); - - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - c->write_points[i].throttle = true; - - c->pd_controllers_update_seconds = 5; - INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); - - spin_lock_init(&c->foreground_write_pd_lock); - bch_pd_controller_init(&c->foreground_write_pd); - /* - * We do not want the write rate to have an effect on the computed - * rate, for two reasons: - * - * We do not call bch_ratelimit_delay() at all if the write rate - * exceeds 1GB/s. In this case, the PD controller will think we are - * not "keeping up" and not change the rate. 
- */
-	c->foreground_write_pd.backpressure = 0;
-	init_timer(&c->foreground_write_wakeup);
-
-	c->foreground_write_wakeup.data = (unsigned long) c;
-	c->foreground_write_wakeup.function = bch_wake_delayed_writes;
-}
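The deleted file's header comment describes the allocator's core invariant: a btree pointer is only considered valid while the 8-bit gen stored in the pointer matches the gen of the bucket it points into, so a bucket can be reused simply by bumping its gen. Below is a minimal standalone sketch of that rule; toy_bucket, toy_ptr and toy_ptr_stale() are hypothetical stand-ins, not the file's real struct bucket / struct bch_extent_ptr, which express the same idea via ptr_stale(), bucket gen marks and the BUCKET_GC_GEN_MAX bound.

/*
 * Illustrative sketch only - these types are hypothetical simplifications,
 * not part of the deleted libbcache/alloc.c.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_bucket {
	uint8_t gen;		/* bumped each time the bucket is invalidated and reused */
};

struct toy_ptr {
	uint64_t bucket;	/* index of the bucket the pointer lands in */
	uint8_t  gen;		/* copy of the bucket's gen taken at allocation time */
};

/*
 * A pointer is stale once the bucket's gen no longer matches the gen
 * recorded in the pointer, i.e. the bucket has been invalidated (and
 * possibly rewritten) since the pointer was created. The real code keeps
 * the distance between the two gens bounded (BUCKET_GC_GEN_MAX) so the
 * 8-bit counter cannot wrap far enough to alias an old pointer.
 */
static bool toy_ptr_stale(const struct toy_bucket *buckets,
			  const struct toy_ptr *ptr)
{
	return buckets[ptr->bucket].gen != ptr->gen;
}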