author    Kent Overstreet <kent.overstreet@gmail.com>  2017-01-08 00:13:18 -0900
committer Kent Overstreet <kent.overstreet@gmail.com>  2017-01-20 09:07:08 -0900
commit    b33fc8298f7e13226b9895abc57c9bfce5e3fa2d (patch)
tree      a3d2a5a909b6372f7777c1c5c18cef5f81d123a9 /libbcache
parent    7f4191a202ea4558ca2d5eb8a47daea33c9999c7 (diff)
bcache in userspace; userspace fsck
Diffstat (limited to 'libbcache')
-rw-r--r--  libbcache/acl.c | 225
-rw-r--r--  libbcache/acl.h | 56
-rw-r--r--  libbcache/alloc.c | 1861
-rw-r--r--  libbcache/alloc.h | 110
-rw-r--r--  libbcache/alloc_types.h | 102
-rw-r--r--  libbcache/bcache.h | 831
-rw-r--r--  libbcache/bkey.c | 1261
-rw-r--r--  libbcache/bkey.h | 596
-rw-r--r--  libbcache/bkey_methods.c | 117
-rw-r--r--  libbcache/bkey_methods.h | 80
-rw-r--r--  libbcache/blockdev.c | 824
-rw-r--r--  libbcache/blockdev.h | 99
-rw-r--r--  libbcache/blockdev_types.h | 123
-rw-r--r--  libbcache/bset.c | 1846
-rw-r--r--  libbcache/bset.h | 628
-rw-r--r--  libbcache/btree_cache.c | 701
-rw-r--r--  libbcache/btree_cache.h | 61
-rw-r--r--  libbcache/btree_gc.c | 898
-rw-r--r--  libbcache/btree_gc.h | 103
-rw-r--r--  libbcache/btree_io.c | 1674
-rw-r--r--  libbcache/btree_io.h | 73
-rw-r--r--  libbcache/btree_iter.c | 1150
-rw-r--r--  libbcache/btree_iter.h | 282
-rw-r--r--  libbcache/btree_locking.h | 119
-rw-r--r--  libbcache/btree_types.h | 322
-rw-r--r--  libbcache/btree_update.c | 2343
-rw-r--r--  libbcache/btree_update.h | 421
-rw-r--r--  libbcache/buckets.c | 755
-rw-r--r--  libbcache/buckets.h | 272
-rw-r--r--  libbcache/buckets_types.h | 99
-rw-r--r--  libbcache/chardev.c | 319
-rw-r--r--  libbcache/chardev.h | 7
-rw-r--r--  libbcache/checksum.c | 174
-rw-r--r--  libbcache/checksum.h | 24
-rw-r--r--  libbcache/clock.c | 161
-rw-r--r--  libbcache/clock.h | 23
-rw-r--r--  libbcache/clock_types.h | 34
-rw-r--r--  libbcache/closure.c | 210
-rw-r--r--  libbcache/closure.h | 387
-rw-r--r--  libbcache/compress.c | 458
-rw-r--r--  libbcache/compress.h | 14
-rw-r--r--  libbcache/debug.c | 513
-rw-r--r--  libbcache/debug.h | 65
-rw-r--r--  libbcache/dirent.c | 449
-rw-r--r--  libbcache/dirent.h | 32
-rw-r--r--  libbcache/error.c | 140
-rw-r--r--  libbcache/error.h | 238
-rw-r--r--  libbcache/extents.c | 2514
-rw-r--r--  libbcache/extents.h | 494
-rw-r--r--  libbcache/eytzinger.h | 196
-rw-r--r--  libbcache/fifo.h | 123
-rw-r--r--  libbcache/fs-gc.c | 475
-rw-r--r--  libbcache/fs-gc.h | 8
-rw-r--r--  libbcache/fs-io.c | 2457
-rw-r--r--  libbcache/fs-io.h | 96
-rw-r--r--  libbcache/fs.c | 1506
-rw-r--r--  libbcache/fs.h | 49
-rw-r--r--  libbcache/inode.c | 283
-rw-r--r--  libbcache/inode.h | 18
-rw-r--r--  libbcache/io.c | 1378
-rw-r--r--  libbcache/io.h | 90
-rw-r--r--  libbcache/io_types.h | 148
-rw-r--r--  libbcache/journal.c | 2585
-rw-r--r--  libbcache/journal.h | 387
-rw-r--r--  libbcache/journal_types.h | 240
-rw-r--r--  libbcache/keybuf.c | 195
-rw-r--r--  libbcache/keybuf.h | 16
-rw-r--r--  libbcache/keybuf_types.h | 33
-rw-r--r--  libbcache/keylist.c | 55
-rw-r--r--  libbcache/keylist.h | 62
-rw-r--r--  libbcache/keylist_types.h | 15
-rw-r--r--  libbcache/migrate.c | 369
-rw-r--r--  libbcache/migrate.h | 8
-rw-r--r--  libbcache/move.c | 388
-rw-r--r--  libbcache/move.h | 87
-rw-r--r--  libbcache/move_types.h | 4
-rw-r--r--  libbcache/movinggc.c | 298
-rw-r--r--  libbcache/movinggc.h | 30
-rw-r--r--  libbcache/notify.c | 133
-rw-r--r--  libbcache/notify.h | 22
-rw-r--r--  libbcache/opts.c | 179
-rw-r--r--  libbcache/opts.h | 100
-rw-r--r--  libbcache/request.c | 825
-rw-r--r--  libbcache/request.h | 16
-rw-r--r--  libbcache/siphash.c | 185
-rw-r--r--  libbcache/siphash.h | 86
-rw-r--r--  libbcache/six.c | 396
-rw-r--r--  libbcache/six.h | 136
-rw-r--r--  libbcache/stats.c | 219
-rw-r--r--  libbcache/stats.h | 52
-rw-r--r--  libbcache/stats_types.h | 56
-rw-r--r--  libbcache/str_hash.h | 352
-rw-r--r--  libbcache/super.c | 2503
-rw-r--r--  libbcache/super.h | 160
-rw-r--r--  libbcache/super_types.h | 11
-rw-r--r--  libbcache/sysfs.c | 1397
-rw-r--r--  libbcache/sysfs.h | 113
-rw-r--r--  libbcache/tier.c | 243
-rw-r--r--  libbcache/tier.h | 8
-rw-r--r--  libbcache/trace.c | 11
-rw-r--r--  libbcache/util.c | 418
-rw-r--r--  libbcache/util.h | 725
-rw-r--r--  libbcache/writeback.c | 657
-rw-r--r--  libbcache/writeback.h | 100
-rw-r--r--  libbcache/xattr.c | 379
-rw-r--r--  libbcache/xattr.h | 17
106 files changed, 45386 insertions, 0 deletions
diff --git a/libbcache/acl.c b/libbcache/acl.c
new file mode 100644
index 0000000..64d5616
--- /dev/null
+++ b/libbcache/acl.c
@@ -0,0 +1,225 @@
+#include "bcache.h"
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *bch_acl_from_disk(const void *value, size_t size)
+{
+ const char *end = (char *)value + size;
+ int n, count;
+ struct posix_acl *acl;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(bch_acl_header))
+ return ERR_PTR(-EINVAL);
+ if (((bch_acl_header *)value)->a_version !=
+ cpu_to_le32(BCH_ACL_VERSION))
+ return ERR_PTR(-EINVAL);
+ value = (char *)value + sizeof(bch_acl_header);
+ count = bch_acl_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ for (n = 0; n < count; n++) {
+ bch_acl_entry *entry =
+ (bch_acl_entry *)value;
+ if ((char *)value + sizeof(bch_acl_entry_short) > end)
+ goto fail;
+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+ switch (acl->a_entries[n].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ value = (char *)value +
+ sizeof(bch_acl_entry_short);
+ break;
+
+ case ACL_USER:
+ value = (char *)value + sizeof(bch_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ value = (char *)value + sizeof(bch_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ if (value != end)
+ goto fail;
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *bch_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+ bch_acl_header *ext_acl;
+ char *e;
+ size_t n;
+
+ *size = bch_acl_size(acl->a_count);
+ ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count *
+ sizeof(bch_acl_entry), GFP_KERNEL);
+ if (!ext_acl)
+ return ERR_PTR(-ENOMEM);
+ ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION);
+ e = (char *)ext_acl + sizeof(bch_acl_header);
+ for (n = 0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+ bch_acl_entry *entry = (bch_acl_entry *)e;
+
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
+ e += sizeof(bch_acl_entry);
+ break;
+
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ e += sizeof(bch_acl_entry_short);
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ return (char *)ext_acl;
+
+fail:
+ kfree(ext_acl);
+ return ERR_PTR(-EINVAL);
+}
+
+struct posix_acl *bch_get_acl(struct inode *inode, int type)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int name_index;
+ char *value = NULL;
+ struct posix_acl *acl;
+ int ret;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+ ret = bch_xattr_get(c, inode, "", NULL, 0, name_index);
+ if (ret > 0) {
+ value = kmalloc(ret, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ ret = bch_xattr_get(c, inode, "", value,
+ ret, name_index);
+ }
+ if (ret > 0)
+ acl = bch_acl_from_disk(value, ret);
+ else if (ret == -ENODATA || ret == -ENOSYS)
+ acl = NULL;
+ else
+ acl = ERR_PTR(ret);
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int ret;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (ret < 0)
+ return ret;
+ else {
+ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ if (ret == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = bch_acl_to_disk(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ ret = bch_xattr_set(c, inode, "", value, size, 0, name_index);
+
+ kfree(value);
+
+ if (ret == -ERANGE)
+ ret = -E2BIG;
+
+ if (!ret)
+ set_cached_acl(inode, type, acl);
+
+ return ret;
+}
diff --git a/libbcache/acl.h b/libbcache/acl.h
new file mode 100644
index 0000000..079e568
--- /dev/null
+++ b/libbcache/acl.h
@@ -0,0 +1,56 @@
+/*
+ File: fs/bch/acl.h
+
+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define BCH_ACL_VERSION 0x0001
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} bch_acl_entry;
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} bch_acl_entry_short;
+
+typedef struct {
+ __le32 a_version;
+} bch_acl_header;
+
+static inline size_t bch_acl_size(int count)
+{
+ if (count <= 4) {
+ return sizeof(bch_acl_header) +
+ count * sizeof(bch_acl_entry_short);
+ } else {
+ return sizeof(bch_acl_header) +
+ 4 * sizeof(bch_acl_entry_short) +
+ (count - 4) * sizeof(bch_acl_entry);
+ }
+}
+
+static inline int bch_acl_count(size_t size)
+{
+ ssize_t s;
+
+ size -= sizeof(bch_acl_header);
+ s = size - 4 * sizeof(bch_acl_entry_short);
+ if (s < 0) {
+ if (size % sizeof(bch_acl_entry_short))
+ return -1;
+ return size / sizeof(bch_acl_entry_short);
+ } else {
+ if (s % sizeof(bch_acl_entry))
+ return -1;
+ return s / sizeof(bch_acl_entry) + 4;
+ }
+}
+
+extern struct posix_acl *bch_get_acl(struct inode *, int);
+extern int bch_set_acl(struct inode *, struct posix_acl *, int);
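As bch_acl_to_disk() above shows, the four tag-only entry types (USER_OBJ, GROUP_OBJ, MASK, OTHER) are stored in the 4-byte short form and USER/GROUP entries in the 8-byte full form, after a 4-byte header; bch_acl_size()/bch_acl_count() assume at most four short entries. A worked example of the round trip, assuming the natural (padding-free) sizes of the typedefs above:

    bch_acl_size(6)   = 4 + 4*4 + (6 - 4)*8     = 36 bytes
    bch_acl_count(36) = (36 - 4 - 4*4) / 8 + 4  = 6 entries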
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
new file mode 100644
index 0000000..cff750c
--- /dev/null
+++ b/libbcache/alloc.c
@@ -0,0 +1,1861 @@
+/*
+ * Primary bucket allocation code
+ *
+ * Copyright 2012 Google, Inc.
+ *
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens are important but
+ * the code is named as if it's the priorities) are written in an arbitrary list
+ * of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write(), and when prio_write() finishes we pull buckets off the
+ * free_inc list and optionally discard them.
+ *
+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * It's important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyways - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * bch_bucket_alloc() allocates a single bucket from a specific cache.
+ *
+ * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * out of a cache set.
+ *
+ * invalidate_buckets() drives all the processes described above. It's called
+ * from bch_bucket_alloc() and a few other places that need to make sure free
+ * buckets are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
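A minimal sketch of the gen rule described above (the helper below is hypothetical and purely illustrative; the real check is ptr_stale(), used e.g. in verify_not_stale() later in this file): a pointer is only considered valid while its stored gen matches the bucket's current gen, so bumping the bucket's gen invalidates every outstanding pointer to it at once.

    /* Illustrative only -- not part of this file. */
    static inline bool example_ptr_valid(u8 bucket_gen, u8 ptr_gen)
    {
            return bucket_gen == ptr_gen;
    }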
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "super.h"
+
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
+static void __bch_bucket_free(struct cache *, struct bucket *);
+
+/* Allocation groups: */
+
+void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *ca)
+{
+ unsigned i;
+
+ spin_lock(&grp->lock);
+
+ for (i = 0; i < grp->nr_devices; i++)
+ if (rcu_access_pointer(grp->d[i].dev) == ca) {
+ grp->nr_devices--;
+ memmove(&grp->d[i],
+ &grp->d[i + 1],
+ (grp->nr_devices - i) * sizeof(grp->d[0]));
+ break;
+ }
+
+ spin_unlock(&grp->lock);
+}
+
+void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
+{
+ unsigned i;
+
+ spin_lock(&grp->lock);
+ for (i = 0; i < grp->nr_devices; i++)
+ if (rcu_access_pointer(grp->d[i].dev) == ca)
+ goto out;
+
+ BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
+
+ rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+out:
+ spin_unlock(&grp->lock);
+}
+
+/* Ratelimiting/PD controllers */
+
+static void pd_controllers_update(struct work_struct *work)
+{
+ struct cache_set *c = container_of(to_delayed_work(work),
+ struct cache_set,
+ pd_controllers_update);
+ struct cache *ca;
+ unsigned iter;
+ int i;
+
+ /* All units are in bytes */
+ u64 tier_size[CACHE_TIERS];
+ u64 tier_free[CACHE_TIERS];
+ u64 tier_dirty[CACHE_TIERS];
+ u64 tier0_can_free = 0;
+
+ memset(tier_size, 0, sizeof(tier_size));
+ memset(tier_free, 0, sizeof(tier_free));
+ memset(tier_dirty, 0, sizeof(tier_dirty));
+
+ rcu_read_lock();
+ for (i = CACHE_TIERS - 1; i >= 0; --i)
+ group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+ struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
+ unsigned bucket_bits = ca->bucket_bits + 9;
+
+ /*
+ * Bytes of internal fragmentation, which can be
+ * reclaimed by copy GC
+ */
+ s64 fragmented = ((stats.buckets_dirty +
+ stats.buckets_cached) <<
+ bucket_bits) -
+ ((stats.sectors_dirty +
+ stats.sectors_cached) << 9);
+
+ u64 dev_size = (ca->mi.nbuckets -
+ ca->mi.first_bucket) << bucket_bits;
+
+ u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
+
+ if (fragmented < 0)
+ fragmented = 0;
+
+ bch_pd_controller_update(&ca->moving_gc_pd,
+ free, fragmented, -1);
+
+ if (i == 0)
+ tier0_can_free += fragmented;
+
+ tier_size[i] += dev_size;
+ tier_free[i] += free;
+ tier_dirty[i] += stats.buckets_dirty << bucket_bits;
+ }
+ rcu_read_unlock();
+
+ if (tier_size[1]) {
+ u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
+
+ tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
+
+ bch_pd_controller_update(&c->tiering_pd,
+ target,
+ tier_dirty[0],
+ -1);
+ }
+
+ /*
+ * Throttle foreground writes if tier 0 is running out of free buckets,
+ * and either tiering or copygc can free up space (but don't take both
+ * into account).
+ *
+ * Target will be small if there isn't any work to do - we don't want to
+ * throttle foreground writes if we currently have all the free space
+ * we're ever going to have.
+ *
+ * Otherwise, if there's work to do, try to keep 20% of tier0 available
+ * for foreground writes.
+ */
+ bch_pd_controller_update(&c->foreground_write_pd,
+ min(tier0_can_free,
+ div_u64(tier_size[0] *
+ c->foreground_target_percent,
+ 100)),
+ tier_free[0],
+ -1);
+
+ schedule_delayed_work(&c->pd_controllers_update,
+ c->pd_controllers_update_seconds * HZ);
+}
+
+/*
+ * Bucket priorities/gens:
+ *
+ * For each bucket, we store on disk its
+ * 8 bit gen
+ * 16 bit priority
+ *
+ * See alloc.c for an explanation of the gen. The priority is used to implement
+ * lru (and in the future other) cache replacement policies; for most purposes
+ * it's just an opaque integer.
+ *
+ * The gens and the priorities don't have a whole lot to do with each other, and
+ * it's actually the gens that must be written out at specific times - it's no
+ * big deal if the priorities don't get written, if we lose them we just reuse
+ * buckets in suboptimal order.
+ *
+ * On disk they're stored in a packed array, in as many buckets as are required
+ * to fit them all. The buckets we use to store them form a list; the journal
+ * header points to the first bucket, the first bucket points to the second
+ * bucket, et cetera.
+ *
+ * This code is used by the allocation code; periodically (whenever it runs out
+ * of buckets to allocate from) the allocation code will invalidate some
+ * buckets, but it can't use those buckets until their new gens are safely on
+ * disk.
+ */
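The code below reads and writes these records through struct prio_set / struct bucket_disk, which are defined in bcache.h. Judging only by the fields used in bch_prio_write()/bch_prio_read() and the "8 bit gen, 16 bit priority" description above, the per-bucket entry has approximately this shape (an assumption for orientation, not the authoritative definition):

    /* Approximate shape only -- see bcache.h for the real definition. */
    struct example_bucket_disk {
            __le16  read_prio;
            __le16  write_prio;
            __u8    gen;
    } __attribute__((packed));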
+
+static int prio_io(struct cache *ca, uint64_t bucket, int op)
+{
+ bio_init(ca->bio_prio);
+ bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META);
+
+ ca->bio_prio->bi_max_vecs = bucket_pages(ca);
+ ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs;
+ ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
+ ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
+ ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
+ bch_bio_map(ca->bio_prio, ca->disk_buckets);
+
+ return submit_bio_wait(ca->bio_prio);
+}
+
+static int bch_prio_write(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct journal *j = &c->journal;
+ struct journal_res res = { 0 };
+ bool need_new_journal_entry;
+ int i, ret;
+
+ trace_bcache_prio_write_start(ca);
+
+ atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
+ &ca->meta_sectors_written);
+
+ for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+ struct bucket *g;
+ struct prio_set *p = ca->disk_buckets;
+ struct bucket_disk *d = p->data;
+ struct bucket_disk *end = d + prios_per_bucket(ca);
+ size_t r;
+
+ for (r = i * prios_per_bucket(ca);
+ r < ca->mi.nbuckets && d < end;
+ r++, d++) {
+ g = ca->buckets + r;
+ d->read_prio = cpu_to_le16(g->read_prio);
+ d->write_prio = cpu_to_le16(g->write_prio);
+ d->gen = ca->buckets[r].mark.gen;
+ }
+
+ p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
+ p->magic = cpu_to_le64(pset_magic(&c->disk_sb));
+
+ SET_PSET_CSUM_TYPE(p, c->opts.metadata_checksum);
+ p->csum = cpu_to_le64(bch_checksum(PSET_CSUM_TYPE(p),
+ &p->magic,
+ bucket_bytes(ca) - 8));
+
+ spin_lock(&ca->prio_buckets_lock);
+ r = bch_bucket_alloc(ca, RESERVE_PRIO);
+ BUG_ON(!r);
+
+ /*
+ * goes here before dropping prio_buckets_lock to guard against
+ * it getting gc'd from under us
+ */
+ ca->prio_buckets[i] = r;
+ bch_mark_metadata_bucket(ca, ca->buckets + r, false);
+ spin_unlock(&ca->prio_buckets_lock);
+
+ ret = prio_io(ca, r, REQ_OP_WRITE);
+ if (cache_fatal_io_err_on(ret, ca,
+ "prio write to bucket %zu", r) ||
+ bch_meta_write_fault("prio"))
+ return ret;
+ }
+
+ spin_lock(&j->lock);
+ j->prio_buckets[ca->sb.nr_this_dev] = cpu_to_le64(ca->prio_buckets[0]);
+ j->nr_prio_buckets = max_t(unsigned,
+ ca->sb.nr_this_dev + 1,
+ j->nr_prio_buckets);
+ spin_unlock(&j->lock);
+
+ do {
+ unsigned u64s = jset_u64s(0);
+
+ ret = bch_journal_res_get(j, &res, u64s, u64s);
+ if (ret)
+ return ret;
+
+ need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
+ ca->sb.nr_this_dev + 1;
+ bch_journal_res_put(j, &res);
+
+ ret = bch_journal_flush_seq(j, res.seq);
+ if (ret)
+ return ret;
+ } while (need_new_journal_entry);
+
+ /*
+ * Don't want the old priorities to get garbage collected until after we
+ * finish writing the new ones, and they're journalled
+ */
+
+ spin_lock(&ca->prio_buckets_lock);
+
+ for (i = 0; i < prio_buckets(ca); i++) {
+ if (ca->prio_last_buckets[i])
+ __bch_bucket_free(ca,
+ &ca->buckets[ca->prio_last_buckets[i]]);
+
+ ca->prio_last_buckets[i] = ca->prio_buckets[i];
+ }
+
+ spin_unlock(&ca->prio_buckets_lock);
+
+ trace_bcache_prio_write_end(ca);
+ return 0;
+}
+
+int bch_prio_read(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct prio_set *p = ca->disk_buckets;
+ struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
+ struct bucket_mark new;
+ unsigned bucket_nr = 0;
+ u64 bucket, expect, got;
+ size_t b;
+ int ret = 0;
+
+ spin_lock(&c->journal.lock);
+ bucket = le64_to_cpu(c->journal.prio_buckets[ca->sb.nr_this_dev]);
+ spin_unlock(&c->journal.lock);
+
+ /*
+ * If the device hasn't been used yet, there won't be a prio bucket ptr
+ */
+ if (!bucket)
+ return 0;
+
+ unfixable_fsck_err_on(bucket < ca->mi.first_bucket ||
+ bucket >= ca->mi.nbuckets, c,
+ "bad prio bucket %llu", bucket);
+
+ for (b = 0; b < ca->mi.nbuckets; b++, d++) {
+ if (d == end) {
+ ca->prio_last_buckets[bucket_nr] = bucket;
+ bucket_nr++;
+
+ ret = prio_io(ca, bucket, REQ_OP_READ);
+ if (cache_fatal_io_err_on(ret, ca,
+ "prior read from bucket %llu",
+ bucket) ||
+ bch_meta_read_fault("prio"))
+ return -EIO;
+
+ got = le64_to_cpu(p->magic);
+ expect = pset_magic(&c->disk_sb);
+ unfixable_fsck_err_on(got != expect, c,
+ "bad magic (got %llu expect %llu) while reading prios from bucket %llu",
+ got, expect, bucket);
+
+ got = le64_to_cpu(p->csum);
+ expect = bch_checksum(PSET_CSUM_TYPE(p),
+ &p->magic,
+ bucket_bytes(ca) - 8);
+ unfixable_fsck_err_on(got != expect, c,
+ "bad checksum (got %llu expect %llu) while reading prios from bucket %llu",
+ got, expect, bucket);
+
+ bucket = le64_to_cpu(p->next_bucket);
+ d = p->data;
+ }
+
+ ca->buckets[b].read_prio = le16_to_cpu(d->read_prio);
+ ca->buckets[b].write_prio = le16_to_cpu(d->write_prio);
+
+ bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen);
+ }
+fsck_err:
+ return 0;
+}
+
+#define BUCKET_GC_GEN_MAX 96U
+
+/**
+ * wait_buckets_available - wait on reclaimable buckets
+ *
+ * If there aren't enough available buckets to fill up free_inc, wait until
+ * there are.
+ */
+static int wait_buckets_available(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ int ret = 0;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop()) {
+ ret = -1;
+ break;
+ }
+
+ if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) {
+ if (c->gc_thread) {
+ trace_bcache_gc_cannot_inc_gens(ca->set);
+ atomic_inc(&c->kick_gc);
+ wake_up_process(ca->set->gc_thread);
+ }
+
+ /*
+ * We are going to wait for GC to wake us up, even if
+ * bucket counters tell us enough buckets are available,
+ * because we are actually waiting for GC to rewrite
+ * nodes with stale pointers
+ */
+ } else if (buckets_available_cache(ca) >=
+ fifo_free(&ca->free_inc))
+ break;
+
+ up_read(&ca->set->gc_lock);
+ schedule();
+ try_to_freeze();
+ down_read(&ca->set->gc_lock);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ return ret;
+}
+
+static void verify_not_on_freelist(struct cache *ca, size_t bucket)
+{
+ if (expensive_debug_checks(ca->set)) {
+ size_t iter;
+ long i;
+ unsigned j;
+
+ for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
+ BUG_ON(ca->prio_buckets[iter] == bucket);
+
+ for (j = 0; j < RESERVE_NR; j++)
+ fifo_for_each_entry(i, &ca->free[j], iter)
+ BUG_ON(i == bucket);
+ fifo_for_each_entry(i, &ca->free_inc, iter)
+ BUG_ON(i == bucket);
+ }
+}
+
+/* Bucket heap / gen */
+
+void bch_recalc_min_prio(struct cache *ca, int rw)
+{
+ struct cache_set *c = ca->set;
+ struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket *g;
+ u16 max_delta = 1;
+ unsigned i;
+
+ /* Determine min prio for this particular cache */
+ for_each_bucket(g, ca)
+ max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
+
+ ca->min_prio[rw] = clock->hand - max_delta;
+
+ /*
+ * This may possibly increase the min prio for the whole cache, check
+ * that as well.
+ */
+ max_delta = 1;
+
+ for_each_cache(ca, c, i)
+ max_delta = max(max_delta,
+ (u16) (clock->hand - ca->min_prio[rw]));
+
+ clock->min_prio = clock->hand - max_delta;
+}
+
+static void bch_rescale_prios(struct cache_set *c, int rw)
+{
+ struct prio_clock *clock = &c->prio_clock[rw];
+ struct cache *ca;
+ struct bucket *g;
+ unsigned i;
+
+ trace_bcache_rescale_prios(c);
+
+ for_each_cache(ca, c, i) {
+ for_each_bucket(g, ca)
+ g->prio[rw] = clock->hand -
+ (clock->hand - g->prio[rw]) / 2;
+
+ bch_recalc_min_prio(ca, rw);
+ }
+}
+
+static void bch_inc_clock_hand(struct io_timer *timer)
+{
+ struct prio_clock *clock = container_of(timer,
+ struct prio_clock, rescale);
+ struct cache_set *c = container_of(clock,
+ struct cache_set, prio_clock[clock->rw]);
+ u64 capacity;
+
+ mutex_lock(&c->bucket_lock);
+
+ clock->hand++;
+
+ /* if clock cannot be advanced more, rescale prio */
+ if (clock->hand == (u16) (clock->min_prio - 1))
+ bch_rescale_prios(c, clock->rw);
+
+ mutex_unlock(&c->bucket_lock);
+
+ capacity = READ_ONCE(c->capacity);
+
+ if (!capacity)
+ return;
+
+ /*
+ * we only increment when 0.1% of the cache_set has been read
+ * or written to - this determines how quickly the clock hand advances
+ *
+ * XXX: we shouldn't really be going off of the capacity of devices in
+ * RW mode (that will be 0 when we're RO, yet we can still service
+ * reads)
+ */
+ timer->expire += capacity >> 10;
+
+ bch_io_timer_add(&c->io_clock[clock->rw], timer);
+}
+
+static void bch_prio_timer_init(struct cache_set *c, int rw)
+{
+ struct prio_clock *clock = &c->prio_clock[rw];
+ struct io_timer *timer = &clock->rescale;
+
+ clock->rw = rw;
+ timer->fn = bch_inc_clock_hand;
+ timer->expire = c->capacity >> 10;
+}
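The capacity >> 10 above is where the "0.1%" in bch_inc_clock_hand()'s comment comes from: the rescale timer re-arms after roughly capacity/1024 sectors of IO. With purely illustrative numbers, a 1 TiB tier is 2^31 512-byte sectors, so:

    2^31 sectors >> 10 = 2^21 sectors ~= 1 GiB of IO per clock tick  (1/1024 ~= 0.1%)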
+
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
+
+static inline bool can_inc_bucket_gen(struct cache *ca, struct bucket *g)
+{
+ return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX;
+}
+
+static bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *g)
+{
+ if (!is_available_bucket(READ_ONCE(g->mark)))
+ return false;
+
+ if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1)
+ ca->inc_gen_needs_gc++;
+
+ return can_inc_bucket_gen(ca, g);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *g)
+{
+ spin_lock(&ca->freelist_lock);
+
+ bch_invalidate_bucket(ca, g);
+
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+
+ verify_not_on_freelist(ca, g - ca->buckets);
+ BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+
+ spin_unlock(&ca->freelist_lock);
+}
+
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_key()
+ * first.
+ *
+ *
+ * - We take into account the read prio of the bucket, which gives us an
+ * indication of how hot the data is -- we scale the prio so that the prio
+ * farthest from the clock is worth 1/8th of the closest.
+ *
+ * - The number of sectors of cached data in the bucket, which gives us an
+ * indication of the cost in cache misses this eviction will cause.
+ *
+ * - The difference between the bucket's current gen and oldest gen of any
+ * pointer into it, which gives us an indication of the cost of an eventual
+ * btree GC to rewrite nodes with stale pointers.
+ */
+
+#define bucket_sort_key(g) \
+({ \
+ unsigned long prio = g->read_prio - ca->min_prio[READ]; \
+ prio = (prio * 7) / (ca->set->prio_clock[READ].hand - \
+ ca->min_prio[READ]); \
+ \
+ (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\
+})
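A worked example of the key above, with hypothetical numbers: read hand = 1000, ca->min_prio[READ] = 600, and a bucket with read_prio = 900, 100 used sectors and a gc gen delta of 3:

    prio = (900 - 600) * 7 / (1000 - 600)  = 5
    key  = ((5 + 1) * 100) << 8 | 3        = 153603

Colder or emptier buckets produce smaller keys and, per the comment above, are reused first.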
+
+static void invalidate_buckets_lru(struct cache *ca)
+{
+ struct bucket_heap_entry e;
+ struct bucket *g;
+ unsigned i;
+
+ mutex_lock(&ca->heap_lock);
+
+ ca->heap.used = 0;
+
+ mutex_lock(&ca->set->bucket_lock);
+ bch_recalc_min_prio(ca, READ);
+ bch_recalc_min_prio(ca, WRITE);
+
+ /*
+ * Find buckets with lowest read priority, by building a maxheap sorted
+ * by read priority and repeatedly replacing the maximum element until
+ * all buckets have been visited.
+ */
+ for_each_bucket(g, ca) {
+ if (!bch_can_invalidate_bucket(ca, g))
+ continue;
+
+ bucket_heap_push(ca, g, bucket_sort_key(g));
+ }
+
+ /* Sort buckets by physical location on disk for better locality */
+ for (i = 0; i < ca->heap.used; i++) {
+ struct bucket_heap_entry *e = &ca->heap.data[i];
+
+ e->val = e->g - ca->buckets;
+ }
+
+ heap_resort(&ca->heap, bucket_max_cmp);
+
+ /*
+ * If we run out of buckets to invalidate, bch_allocator_thread() will
+ * kick stuff and retry us
+ */
+ while (!fifo_full(&ca->free_inc) &&
+ heap_pop(&ca->heap, e, bucket_max_cmp)) {
+ BUG_ON(!bch_can_invalidate_bucket(ca, e.g));
+ bch_invalidate_one_bucket(ca, e.g);
+ }
+
+ mutex_unlock(&ca->set->bucket_lock);
+ mutex_unlock(&ca->heap_lock);
+}
+
+static void invalidate_buckets_fifo(struct cache *ca)
+{
+ struct bucket *g;
+ size_t checked = 0;
+
+ while (!fifo_full(&ca->free_inc)) {
+ if (ca->fifo_last_bucket < ca->mi.first_bucket ||
+ ca->fifo_last_bucket >= ca->mi.nbuckets)
+ ca->fifo_last_bucket = ca->mi.first_bucket;
+
+ g = ca->buckets + ca->fifo_last_bucket++;
+
+ if (bch_can_invalidate_bucket(ca, g))
+ bch_invalidate_one_bucket(ca, g);
+
+ if (++checked >= ca->mi.nbuckets)
+ return;
+ }
+}
+
+static void invalidate_buckets_random(struct cache *ca)
+{
+ struct bucket *g;
+ size_t checked = 0;
+
+ while (!fifo_full(&ca->free_inc)) {
+ size_t n = bch_rand_range(ca->mi.nbuckets -
+ ca->mi.first_bucket) +
+ ca->mi.first_bucket;
+
+ g = ca->buckets + n;
+
+ if (bch_can_invalidate_bucket(ca, g))
+ bch_invalidate_one_bucket(ca, g);
+
+ if (++checked >= ca->mi.nbuckets / 2)
+ return;
+ }
+}
+
+static void invalidate_buckets(struct cache *ca)
+{
+ ca->inc_gen_needs_gc = 0;
+
+ switch (ca->mi.replacement) {
+ case CACHE_REPLACEMENT_LRU:
+ invalidate_buckets_lru(ca);
+ break;
+ case CACHE_REPLACEMENT_FIFO:
+ invalidate_buckets_fifo(ca);
+ break;
+ case CACHE_REPLACEMENT_RANDOM:
+ invalidate_buckets_random(ca);
+ break;
+ }
+}
+
+static bool __bch_allocator_push(struct cache *ca, long bucket)
+{
+ if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_NONE], bucket))
+ goto success;
+
+ return false;
+success:
+ closure_wake_up(&ca->set->freelist_wait);
+ return true;
+}
+
+static bool bch_allocator_push(struct cache *ca, long bucket)
+{
+ bool ret;
+
+ spin_lock(&ca->freelist_lock);
+ ret = __bch_allocator_push(ca, bucket);
+ if (ret)
+ fifo_pop(&ca->free_inc, bucket);
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca)
+{
+ u16 last_seq_ondisk = c->journal.last_seq_ondisk;
+ struct bucket *g;
+
+ for_each_bucket(g, ca) {
+ struct bucket_mark m = READ_ONCE(g->mark);
+
+ if (is_available_bucket(m) &&
+ !m.cached_sectors &&
+ !m.had_metadata &&
+ (!m.wait_on_journal ||
+ ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) {
+ spin_lock(&ca->freelist_lock);
+
+ bch_mark_alloc_bucket(ca, g, true);
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+
+ verify_not_on_freelist(ca, g - ca->buckets);
+ BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+
+ spin_unlock(&ca->freelist_lock);
+
+ if (fifo_full(&ca->free_inc))
+ break;
+ }
+ }
+}
+
+/**
+ * bch_allocator_thread - move buckets from free_inc to reserves
+ *
+ * The free_inc FIFO is populated by invalidate_buckets(), and
+ * the reserves are depleted by bucket allocation. When we run out
+ * of free_inc, try to invalidate some buckets and write out
+ * prios and gens.
+ */
+static int bch_allocator_thread(void *arg)
+{
+ struct cache *ca = arg;
+ struct cache_set *c = ca->set;
+ int ret;
+
+ set_freezable();
+
+ while (1) {
+ /*
+ * First, we pull buckets off of the free_inc list, possibly
+ * issue discards to them, then we add the bucket to a
+ * free list:
+ */
+
+ while (!fifo_empty(&ca->free_inc)) {
+ long bucket = fifo_peek(&ca->free_inc);
+
+ /*
+ * Don't remove from free_inc until after it's added
+ * to freelist, so gc doesn't miss it while we've
+ * dropped bucket lock
+ */
+
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca, bucket),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (bch_allocator_push(ca, bucket))
+ break;
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ goto out;
+ }
+ schedule();
+ try_to_freeze();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ }
+
+ down_read(&c->gc_lock);
+
+ /*
+ * See if we have buckets we can reuse without invalidating them
+ * or forcing a journal commit:
+ */
+ bch_find_empty_buckets(c, ca);
+
+ if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
+ up_read(&c->gc_lock);
+ continue;
+ }
+
+ /* We've run out of free buckets! */
+
+ while (!fifo_full(&ca->free_inc)) {
+ if (wait_buckets_available(ca)) {
+ up_read(&c->gc_lock);
+ goto out;
+ }
+
+ /*
+ * Find some buckets that we can invalidate, either
+ * they're completely unused, or only contain clean data
+ * that's been written back to the backing device or
+ * another cache tier
+ */
+
+ invalidate_buckets(ca);
+ trace_bcache_alloc_batch(ca, fifo_used(&ca->free_inc),
+ ca->free_inc.size);
+ }
+
+ up_read(&c->gc_lock);
+
+ /*
+ * free_inc is full of newly-invalidated buckets, must write out
+ * prios and gens before they can be re-used
+ */
+ ret = bch_prio_write(ca);
+ if (ret) {
+ /*
+ * Emergency read only - allocator thread has to
+ * shutdown.
+ *
+ * N.B. we better be going into RO mode, else
+ * allocations would hang indefinitely - whatever
+ * generated the error will have sent us into RO mode.
+ *
+ * Clear out the free_inc freelist so things are
+ * consistent-ish:
+ */
+ spin_lock(&ca->freelist_lock);
+ while (!fifo_empty(&ca->free_inc)) {
+ long bucket;
+
+ fifo_pop(&ca->free_inc, bucket);
+ bch_mark_free_bucket(ca, ca->buckets + bucket);
+ }
+ spin_unlock(&ca->freelist_lock);
+ goto out;
+ }
+ }
+out:
+ /*
+ * Avoid a race with bucket_stats_update() trying to wake us up after
+ * we've exited:
+ */
+ synchronize_rcu();
+ return 0;
+}
+
+/* Allocation */
+
+/**
+ * bch_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns index of bucket on success, 0 on failure
+ */
+static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
+{
+ struct bucket *g;
+ long r;
+
+ spin_lock(&ca->freelist_lock);
+ if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
+ fifo_pop(&ca->free[reserve], r))
+ goto out;
+
+ spin_unlock(&ca->freelist_lock);
+
+ trace_bcache_bucket_alloc_fail(ca, reserve);
+ return 0;
+out:
+ verify_not_on_freelist(ca, r);
+ spin_unlock(&ca->freelist_lock);
+
+ trace_bcache_bucket_alloc(ca, reserve);
+
+ bch_wake_allocator(ca);
+
+ g = ca->buckets + r;
+
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+
+ return r;
+}
+
+static void __bch_bucket_free(struct cache *ca, struct bucket *g)
+{
+ bch_mark_free_bucket(ca, g);
+
+ g->read_prio = ca->set->prio_clock[READ].hand;
+ g->write_prio = ca->set->prio_clock[WRITE].hand;
+}
+
+enum bucket_alloc_ret {
+ ALLOC_SUCCESS,
+ NO_DEVICES, /* -EROFS */
+ FREELIST_EMPTY, /* Allocator thread not keeping up */
+};
+
+static void recalc_alloc_group_weights(struct cache_set *c,
+ struct cache_group *devs)
+{
+ struct cache *ca;
+ u64 available_buckets = 1; /* avoid a divide by zero... */
+ unsigned i;
+
+ for (i = 0; i < devs->nr_devices; i++) {
+ ca = devs->d[i].dev;
+
+ devs->d[i].weight = buckets_free_cache(ca);
+ available_buckets += devs->d[i].weight;
+ }
+
+ for (i = 0; i < devs->nr_devices; i++) {
+ const unsigned min_weight = U32_MAX >> 4;
+ const unsigned max_weight = U32_MAX;
+
+ devs->d[i].weight =
+ min_weight +
+ div64_u64(devs->d[i].weight *
+ devs->nr_devices *
+ (max_weight - min_weight),
+ available_buckets);
+ devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
+ }
+}
+
+static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
+ struct open_bucket *ob,
+ enum alloc_reserve reserve,
+ unsigned nr_replicas,
+ struct cache_group *devs,
+ long *caches_used)
+{
+ enum bucket_alloc_ret ret;
+ unsigned fail_idx = -1, i;
+ unsigned available = 0;
+
+ BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
+
+ if (ob->nr_ptrs >= nr_replicas)
+ return ALLOC_SUCCESS;
+
+ rcu_read_lock();
+ spin_lock(&devs->lock);
+
+ for (i = 0; i < devs->nr_devices; i++)
+ available += !test_bit(devs->d[i].dev->sb.nr_this_dev,
+ caches_used);
+
+ recalc_alloc_group_weights(c, devs);
+
+ i = devs->cur_device;
+
+ while (ob->nr_ptrs < nr_replicas) {
+ struct cache *ca;
+ u64 bucket;
+
+ if (!available) {
+ ret = NO_DEVICES;
+ goto err;
+ }
+
+ i++;
+ i %= devs->nr_devices;
+
+ ret = FREELIST_EMPTY;
+ if (i == fail_idx)
+ goto err;
+
+ ca = devs->d[i].dev;
+
+ if (test_bit(ca->sb.nr_this_dev, caches_used))
+ continue;
+
+ if (fail_idx == -1 &&
+ get_random_int() > devs->d[i].weight)
+ continue;
+
+ bucket = bch_bucket_alloc(ca, reserve);
+ if (!bucket) {
+ if (fail_idx == -1)
+ fail_idx = i;
+ continue;
+ }
+
+ /*
+ * open_bucket_add_buckets expects new pointers at the head of
+ * the list:
+ */
+ memmove(&ob->ptrs[1],
+ &ob->ptrs[0],
+ ob->nr_ptrs * sizeof(ob->ptrs[0]));
+ memmove(&ob->ptr_offset[1],
+ &ob->ptr_offset[0],
+ ob->nr_ptrs * sizeof(ob->ptr_offset[0]));
+ ob->nr_ptrs++;
+ ob->ptrs[0] = (struct bch_extent_ptr) {
+ .gen = ca->buckets[bucket].mark.gen,
+ .offset = bucket_to_sector(ca, bucket),
+ .dev = ca->sb.nr_this_dev,
+ };
+ ob->ptr_offset[0] = 0;
+
+ __set_bit(ca->sb.nr_this_dev, caches_used);
+ available--;
+ devs->cur_device = i;
+ }
+
+ ret = ALLOC_SUCCESS;
+err:
+ EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
+ spin_unlock(&devs->lock);
+ rcu_read_unlock();
+ return ret;
+}
+
+static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
+ struct write_point *wp,
+ struct open_bucket *ob,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ long *caches_used)
+{
+ /*
+ * this should implement policy - for a given type of allocation, decide
+ * which devices to allocate from:
+ *
+ * XXX: switch off wp->type and do something more intelligent here
+ */
+
+ /* foreground writes: prefer tier 0: */
+ if (wp->group == &c->cache_all)
+ bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+ &c->cache_tiers[0], caches_used);
+
+ return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+ wp->group, caches_used);
+}
+
+static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
+ struct open_bucket *ob, unsigned nr_replicas,
+ enum alloc_reserve reserve, long *caches_used,
+ struct closure *cl)
+{
+ bool waiting = false;
+
+ while (1) {
+ switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ reserve, caches_used)) {
+ case ALLOC_SUCCESS:
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+
+ return 0;
+
+ case NO_DEVICES:
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+ return -EROFS;
+
+ case FREELIST_EMPTY:
+ if (!cl || waiting)
+ trace_bcache_freelist_empty_fail(c,
+ reserve, cl);
+
+ if (!cl)
+ return -ENOSPC;
+
+ if (waiting)
+ return -EAGAIN;
+
+ /* Retry allocation after adding ourself to waitlist: */
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ break;
+ default:
+ BUG();
+ }
+ }
+}
+
+/* Open buckets: */
+
+/*
+ * Open buckets represent one or more buckets (on multiple devices) that are
+ * currently being allocated from. They serve two purposes:
+ *
+ * - They track buckets that have been partially allocated, allowing for
+ * sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ * - They provide a reference to the buckets they own that mark and sweep GC
+ * can find, until the new allocation has a pointer to it inserted into the
+ * btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
+
+static void __bch_open_bucket_put(struct cache_set *c, struct open_bucket *ob)
+{
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ lockdep_assert_held(&c->open_buckets_lock);
+
+ rcu_read_lock();
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
+ rcu_read_unlock();
+
+ ob->nr_ptrs = 0;
+
+ list_move(&ob->list, &c->open_buckets_free);
+ c->open_buckets_nr_free++;
+ closure_wake_up(&c->open_buckets_wait);
+}
+
+void bch_open_bucket_put(struct cache_set *c, struct open_bucket *b)
+{
+ if (atomic_dec_and_test(&b->pin)) {
+ spin_lock(&c->open_buckets_lock);
+ __bch_open_bucket_put(c, b);
+ spin_unlock(&c->open_buckets_lock);
+ }
+}
+
+static struct open_bucket *bch_open_bucket_get(struct cache_set *c,
+ unsigned nr_reserved,
+ struct closure *cl)
+{
+ struct open_bucket *ret;
+
+ spin_lock(&c->open_buckets_lock);
+
+ if (c->open_buckets_nr_free > nr_reserved) {
+ BUG_ON(list_empty(&c->open_buckets_free));
+ ret = list_first_entry(&c->open_buckets_free,
+ struct open_bucket, list);
+ list_move(&ret->list, &c->open_buckets_open);
+ BUG_ON(ret->nr_ptrs);
+
+ atomic_set(&ret->pin, 1); /* XXX */
+ ret->has_full_ptrs = false;
+
+ c->open_buckets_nr_free--;
+ trace_bcache_open_bucket_alloc(c, cl);
+ } else {
+ trace_bcache_open_bucket_alloc_fail(c, cl);
+
+ if (cl) {
+ closure_wait(&c->open_buckets_wait, cl);
+ ret = ERR_PTR(-EAGAIN);
+ } else
+ ret = ERR_PTR(-ENOSPC);
+ }
+
+ spin_unlock(&c->open_buckets_lock);
+
+ return ret;
+}
+
+static unsigned ob_ptr_sectors_free(struct open_bucket *ob,
+ struct cache_member_rcu *mi,
+ struct bch_extent_ptr *ptr)
+{
+ unsigned i = ptr - ob->ptrs;
+ unsigned bucket_size = mi->m[ptr->dev].bucket_size;
+ unsigned used = (ptr->offset & (bucket_size - 1)) +
+ ob->ptr_offset[i];
+
+ BUG_ON(used > bucket_size);
+
+ return bucket_size - used;
+}
+
+static unsigned open_bucket_sectors_free(struct cache_set *c,
+ struct open_bucket *ob,
+ unsigned nr_replicas)
+{
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned i, sectors_free = UINT_MAX;
+
+ BUG_ON(nr_replicas > ob->nr_ptrs);
+
+ for (i = 0; i < nr_replicas; i++)
+ sectors_free = min(sectors_free,
+ ob_ptr_sectors_free(ob, mi, &ob->ptrs[i]));
+
+ cache_member_info_put();
+
+ return sectors_free != UINT_MAX ? sectors_free : 0;
+}
+
+static void open_bucket_copy_unused_ptrs(struct cache_set *c,
+ struct open_bucket *new,
+ struct open_bucket *old)
+{
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned i;
+
+ for (i = 0; i < old->nr_ptrs; i++)
+ if (ob_ptr_sectors_free(old, mi, &old->ptrs[i])) {
+ struct bch_extent_ptr tmp = old->ptrs[i];
+
+ tmp.offset += old->ptr_offset[i];
+ new->ptrs[new->nr_ptrs] = tmp;
+ new->ptr_offset[new->nr_ptrs] = 0;
+ new->nr_ptrs++;
+ }
+ cache_member_info_put();
+}
+
+static void verify_not_stale(struct cache_set *c, const struct open_bucket *ob)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ rcu_read_lock();
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ BUG_ON(ptr_stale(ca, ptr));
+ rcu_read_unlock();
+#endif
+}
+
+/* Sector allocator */
+
+static struct open_bucket *lock_writepoint(struct cache_set *c,
+ struct write_point *wp)
+{
+ struct open_bucket *ob;
+
+ while ((ob = ACCESS_ONCE(wp->b))) {
+ mutex_lock(&ob->lock);
+ if (wp->b == ob)
+ break;
+
+ mutex_unlock(&ob->lock);
+ }
+
+ return ob;
+}
+
+static int open_bucket_add_buckets(struct cache_set *c,
+ struct write_point *wp,
+ struct open_bucket *ob,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ struct closure *cl)
+{
+ long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)];
+ int i, dst;
+
+ /*
+ * We might be allocating pointers to add to an existing extent
+ * (tiering/copygc/migration) - if so, some of the pointers in our
+ * existing open bucket might duplicate devices we already have. This is
+ * moderately annoying.
+ */
+
+ /* Short circuit all the fun stuff if possible: */
+ if (ob->nr_ptrs >= nr_replicas)
+ return 0;
+
+ memset(caches_used, 0, sizeof(caches_used));
+
+ /*
+ * Shuffle pointers to devices we already have to the end:
+ * bch_bucket_alloc_set() will add new pointers to the start of @b, and
+ * bch_alloc_sectors_done() will add the first nr_replicas ptrs to @e:
+ */
+ for (i = dst = ob->nr_ptrs - 1; i >= 0; --i)
+ if (__test_and_set_bit(ob->ptrs[i].dev, caches_used)) {
+ if (i != dst) {
+ swap(ob->ptrs[i], ob->ptrs[dst]);
+ swap(ob->ptr_offset[i], ob->ptr_offset[dst]);
+ }
+ --dst;
+ nr_replicas++;
+ }
+
+ return bch_bucket_alloc_set(c, wp, ob, nr_replicas,
+ reserve, caches_used, cl);
+}
+
+/*
+ * Get us an open_bucket we can allocate from, return with it locked:
+ */
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
+ struct write_point *wp,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+ unsigned open_buckets_reserved = wp == &c->btree_write_point
+ ? 0 : BTREE_NODE_RESERVE;
+ int ret;
+
+ BUG_ON(!wp->group);
+ BUG_ON(!reserve);
+ BUG_ON(!nr_replicas);
+retry:
+ ob = lock_writepoint(c, wp);
+
+ /*
+ * If ob->sectors_free == 0, one or more of the buckets ob points to is
+ * full. We can't drop pointers from an open bucket - garbage collection
+ * still needs to find them; instead, we must allocate a new open bucket
+ * and copy any pointers to non-full buckets into the new open bucket.
+ */
+ if (!ob || ob->has_full_ptrs) {
+ struct open_bucket *new_ob;
+
+ new_ob = bch_open_bucket_get(c, open_buckets_reserved, cl);
+ if (IS_ERR(new_ob))
+ return new_ob;
+
+ mutex_lock(&new_ob->lock);
+
+ /*
+ * We point the write point at the open_bucket before doing the
+ * allocation to avoid a race with shutdown:
+ */
+ if (race_fault() ||
+ cmpxchg(&wp->b, ob, new_ob) != ob) {
+ /* We raced: */
+ mutex_unlock(&new_ob->lock);
+ bch_open_bucket_put(c, new_ob);
+
+ if (ob)
+ mutex_unlock(&ob->lock);
+ goto retry;
+ }
+
+ if (ob) {
+ open_bucket_copy_unused_ptrs(c, new_ob, ob);
+ mutex_unlock(&ob->lock);
+ bch_open_bucket_put(c, ob);
+ }
+
+ ob = new_ob;
+ }
+
+ ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
+ reserve, cl);
+ if (ret) {
+ mutex_unlock(&ob->lock);
+ return ERR_PTR(ret);
+ }
+
+ ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
+
+ BUG_ON(!ob->sectors_free);
+ verify_not_stale(c, ob);
+
+ return ob;
+}
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
+ unsigned nr_replicas, struct open_bucket *ob,
+ unsigned sectors)
+{
+ struct bch_extent_ptr tmp, *ptr;
+ struct cache *ca;
+ bool has_data = false;
+ unsigned i;
+
+ /*
+ * We're keeping any existing pointer k has, and appending new pointers:
+ * __bch_write() will only write to the pointers we add here:
+ */
+
+ /*
+ * XXX: don't add pointers to devices @e already has
+ */
+ BUG_ON(nr_replicas > ob->nr_ptrs);
+ BUG_ON(sectors > ob->sectors_free);
+
+ /* didn't use all the ptrs: */
+ if (nr_replicas < ob->nr_ptrs)
+ has_data = true;
+
+ for (i = 0; i < nr_replicas; i++) {
+ EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
+
+ tmp = ob->ptrs[i];
+ tmp.offset += ob->ptr_offset[i];
+ extent_ptr_append(e, tmp);
+
+ ob->ptr_offset[i] += sectors;
+ }
+
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ this_cpu_add(*ca->sectors_written, sectors);
+}
+
+/*
+ * Finish allocating from @ob: mark pointers whose buckets are now full, and
+ * either take an extra reference (@ob stays on @wp while it has space left) or
+ * detach @ob from @wp entirely
+ */
+void bch_alloc_sectors_done(struct cache_set *c, struct write_point *wp,
+ struct open_bucket *ob)
+{
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ bool has_data = false;
+ unsigned i;
+
+ for (i = 0; i < ob->nr_ptrs; i++) {
+ if (!ob_ptr_sectors_free(ob, mi, &ob->ptrs[i]))
+ ob->has_full_ptrs = true;
+ else
+ has_data = true;
+ }
+
+ cache_member_info_put();
+
+ if (likely(has_data))
+ atomic_inc(&ob->pin);
+ else
+ BUG_ON(xchg(&wp->b, NULL) != ob);
+
+ mutex_unlock(&ob->lock);
+}
+
+/*
+ * Allocates some space in the cache to write to, sets @k to point to the newly
+ * allocated space, and updates k->size and k->offset (to point to the
+ * end of the newly allocated space).
+ *
+ * May allocate fewer sectors than @sectors, k->size indicates how many
+ * sectors were actually allocated.
+ *
+ * Return codes:
+ * - -EAGAIN: closure was added to waitlist
+ * - -ENOSPC: out of space and no closure provided
+ *
+ * @c - cache set.
+ * @wp - write point to use for allocating sectors.
+ * @k - key to return the allocated space information.
+ * @cl - closure to wait for a bucket
+ */
+struct open_bucket *bch_alloc_sectors(struct cache_set *c,
+ struct write_point *wp,
+ struct bkey_i_extent *e,
+ unsigned nr_replicas,
+ enum alloc_reserve reserve,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+
+ ob = bch_alloc_sectors_start(c, wp, nr_replicas, reserve, cl);
+ if (IS_ERR_OR_NULL(ob))
+ return ob;
+
+ if (e->k.size > ob->sectors_free)
+ bch_key_resize(&e->k, ob->sectors_free);
+
+ bch_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);
+
+ bch_alloc_sectors_done(c, wp, ob);
+
+ return ob;
+}
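A minimal caller sketch of the contract spelled out in the open-bucket comment earlier: keep the open bucket reference across the data write and the btree update, and put it only once the new extent is reachable. Everything except the bch_* calls and RESERVE_NONE is hypothetical, and error handling is trimmed:

    static int example_write(struct cache_set *c, struct write_point *wp,
                             struct bkey_i_extent *e, struct closure *cl)
    {
            struct open_bucket *ob;

            ob = bch_alloc_sectors(c, wp, e, 1, RESERVE_NONE, cl);
            if (IS_ERR_OR_NULL(ob))
                    return ob ? PTR_ERR(ob) : -EROFS;

            /* ... write the data to the pointers just appended to @e ... */
            /* ... insert @e into the extents btree ... */

            bch_open_bucket_put(c, ob);     /* only after the index update */
            return 0;
    }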
+
+/* Startup/shutdown (ro/rw): */
+
+static void bch_recalc_capacity(struct cache_set *c)
+{
+ struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
+ struct cache *ca;
+ u64 total_capacity, capacity = 0, reserved_sectors = 0;
+ unsigned long ra_pages = 0;
+ unsigned i, j;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i) {
+ struct backing_dev_info *bdi =
+ blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ ra_pages += bdi->ra_pages;
+ }
+
+ c->bdi.ra_pages = ra_pages;
+
+ /*
+ * Capacity of the cache set is the capacity of all the devices in the
+ * slowest (highest) tier - we don't include lower tier devices.
+ */
+ for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1;
+ tier > c->cache_tiers && !tier->nr_devices;
+ --tier)
+ ;
+
+ group_for_each_cache_rcu(ca, tier, i) {
+ size_t reserve = 0;
+
+ /*
+ * We need to reserve buckets (from the number
+ * of currently available buckets) against
+ * foreground writes so that mainly copygc can
+ * make forward progress.
+ *
+ * We need enough to refill the various reserves
+ * from scratch - copygc will use its entire
+ * reserve all at once, then run again when
+ * its reserve is refilled (from the formerly
+ * available buckets).
+ *
+ * This reserve is just used when considering if
+ * allocations for foreground writes must wait -
+ * not -ENOSPC calculations.
+ */
+ for (j = 0; j < RESERVE_NONE; j++)
+ reserve += ca->free[j].size;
+
+ reserve += ca->free_inc.size;
+
+ reserve += ARRAY_SIZE(c->write_points);
+
+ if (ca->mi.tier)
+ reserve += 1; /* tiering write point */
+ reserve += 1; /* btree write point */
+
+ reserved_sectors += reserve << ca->bucket_bits;
+
+ capacity += (ca->mi.nbuckets -
+ ca->mi.first_bucket) <<
+ ca->bucket_bits;
+ }
+ rcu_read_unlock();
+
+ total_capacity = capacity;
+
+ capacity *= (100 - c->opts.gc_reserve_percent);
+ capacity = div64_u64(capacity, 100);
+
+ BUG_ON(capacity + reserved_sectors > total_capacity);
+
+ c->capacity = capacity;
+
+ if (c->capacity) {
+ bch_io_timer_add(&c->io_clock[READ],
+ &c->prio_clock[READ].rescale);
+ bch_io_timer_add(&c->io_clock[WRITE],
+ &c->prio_clock[WRITE].rescale);
+ } else {
+ bch_io_timer_del(&c->io_clock[READ],
+ &c->prio_clock[READ].rescale);
+ bch_io_timer_del(&c->io_clock[WRITE],
+ &c->prio_clock[WRITE].rescale);
+ }
+
+ /* Wake up in case someone was waiting for buckets */
+ closure_wake_up(&c->freelist_wait);
+}
+
+static void bch_stop_write_point(struct cache *ca,
+ struct write_point *wp)
+{
+ struct cache_set *c = ca->set;
+ struct open_bucket *ob;
+ struct bch_extent_ptr *ptr;
+
+ ob = lock_writepoint(c, wp);
+ if (!ob)
+ return;
+
+ for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
+ if (ptr->dev == ca->sb.nr_this_dev)
+ goto found;
+
+ mutex_unlock(&ob->lock);
+ return;
+found:
+ BUG_ON(xchg(&wp->b, NULL) != ob);
+ mutex_unlock(&ob->lock);
+
+ /* Drop writepoint's ref: */
+ bch_open_bucket_put(c, ob);
+}
+
+static bool bch_dev_has_open_write_point(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct bch_extent_ptr *ptr;
+ struct open_bucket *ob;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++)
+ if (atomic_read(&ob->pin)) {
+ mutex_lock(&ob->lock);
+ for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
+ if (ptr->dev == ca->sb.nr_this_dev) {
+ mutex_unlock(&ob->lock);
+ return true;
+ }
+ mutex_unlock(&ob->lock);
+ }
+
+ return false;
+}
+
+/* device goes ro: */
+void bch_cache_allocator_stop(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct task_struct *p;
+ struct closure cl;
+ unsigned i;
+
+ closure_init_stack(&cl);
+
+ /* First, remove device from allocation groups: */
+
+ bch_cache_group_remove_cache(tier, ca);
+ bch_cache_group_remove_cache(&c->cache_all, ca);
+
+ bch_recalc_capacity(c);
+
+ /*
+ * Stopping the allocator thread comes after removing from allocation
+ * groups, else pending allocations will hang:
+ */
+
+ p = ca->alloc_thread;
+ ca->alloc_thread = NULL;
+ smp_wmb();
+
+ /*
+ * We need an rcu barrier between setting ca->alloc_thread = NULL and
+ * the thread shutting down to avoid a race with bucket_stats_update() -
+ * the allocator thread itself does a synchronize_rcu() on exit.
+ *
+ * XXX: it would be better to have the rcu barrier be asynchronous
+ * instead of blocking us here
+ */
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+
+ /* Next, close write points that point to this device... */
+
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch_stop_write_point(ca, &c->write_points[i]);
+
+ bch_stop_write_point(ca, &ca->copygc_write_point);
+ bch_stop_write_point(ca, &c->promote_write_point);
+ bch_stop_write_point(ca, &ca->tiering_write_point);
+ bch_stop_write_point(ca, &c->migration_write_point);
+ bch_stop_write_point(ca, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch_open_bucket_put(c, a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ /* Avoid deadlocks.. */
+
+ closure_wake_up(&c->freelist_wait);
+ wake_up(&c->journal.wait);
+
+ /* Now wait for any in flight writes: */
+
+ while (1) {
+ closure_wait(&c->open_buckets_wait, &cl);
+
+ if (!bch_dev_has_open_write_point(ca)) {
+ closure_wake_up(&c->open_buckets_wait);
+ break;
+ }
+
+ closure_sync(&cl);
+ }
+}
+
+/*
+ * Startup the allocator thread for transition to RW mode:
+ */
+int bch_cache_allocator_start(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct task_struct *k;
+
+ /*
+ * allocator thread already started?
+ */
+ if (ca->alloc_thread)
+ return 0;
+
+ k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
+ if (IS_ERR(k))
+ return 0;
+
+ get_task_struct(k);
+ ca->alloc_thread = k;
+
+ bch_cache_group_add_cache(tier, ca);
+ bch_cache_group_add_cache(&c->cache_all, ca);
+
+ bch_recalc_capacity(c);
+
+ /*
+ * Don't wake up allocator thread until after adding device to
+ * allocator groups - otherwise, alloc thread could get a spurious
+ * -EROFS due to prio_write() -> journal_meta() not finding any devices:
+ */
+ wake_up_process(k);
+ return 0;
+}
+
+void bch_open_buckets_init(struct cache_set *c)
+{
+ unsigned i;
+
+ INIT_LIST_HEAD(&c->open_buckets_open);
+ INIT_LIST_HEAD(&c->open_buckets_free);
+ spin_lock_init(&c->open_buckets_lock);
+ bch_prio_timer_init(c, READ);
+ bch_prio_timer_init(c, WRITE);
+
+ /* open bucket 0 is a sentinel NULL: */
+ mutex_init(&c->open_buckets[0].lock);
+ INIT_LIST_HEAD(&c->open_buckets[0].list);
+
+ for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) {
+ mutex_init(&c->open_buckets[i].lock);
+ c->open_buckets_nr_free++;
+ list_add(&c->open_buckets[i].list, &c->open_buckets_free);
+ }
+
+ spin_lock_init(&c->cache_all.lock);
+
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
+ c->write_points[i].throttle = true;
+ c->write_points[i].group = &c->cache_tiers[0];
+ }
+
+ for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
+ spin_lock_init(&c->cache_tiers[i].lock);
+
+ c->promote_write_point.group = &c->cache_tiers[0];
+
+ c->migration_write_point.group = &c->cache_all;
+
+ c->btree_write_point.group = &c->cache_all;
+
+ c->pd_controllers_update_seconds = 5;
+ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
+
+ spin_lock_init(&c->foreground_write_pd_lock);
+ bch_pd_controller_init(&c->foreground_write_pd);
+ /*
+	 * We do not want the write rate to have an effect on the computed
+	 * rate, because we do not call bch_ratelimit_delay() at all if the
+	 * write rate exceeds 1GB/s; in that case, the PD controller will
+	 * think we are not "keeping up" and will not change the rate.
+ */
+ c->foreground_write_pd.backpressure = 0;
+ init_timer(&c->foreground_write_wakeup);
+
+ c->foreground_write_wakeup.data = (unsigned long) c;
+ c->foreground_write_wakeup.function = bch_wake_delayed_writes;
+}
diff --git a/libbcache/alloc.h b/libbcache/alloc.h
new file mode 100644
index 0000000..ac83e4f
--- /dev/null
+++ b/libbcache/alloc.h
@@ -0,0 +1,110 @@
+#ifndef _BCACHE_ALLOC_H
+#define _BCACHE_ALLOC_H
+
+#include "alloc_types.h"
+
+struct bkey;
+struct bucket;
+struct cache;
+struct cache_set;
+struct cache_group;
+
+static inline size_t prios_per_bucket(const struct cache *ca)
+{
+ return (bucket_bytes(ca) - sizeof(struct prio_set)) /
+ sizeof(struct bucket_disk);
+}
+
+static inline size_t prio_buckets(const struct cache *ca)
+{
+ return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca));
+}
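As a rough worked example of the arithmetic above (the struct sizes below are assumptions for illustration only, not the real on-disk sizes of struct prio_set or struct bucket_disk), a standalone userspace sketch:

#include <stdio.h>

int main(void)
{
	size_t bucket_bytes   = 1 << 20;  /* assume 1 MiB buckets */
	size_t prio_set_hdr   = 64;       /* assumed prio_set header size */
	size_t bucket_disk_sz = 8;        /* assumed per-bucket entry size */
	size_t nbuckets       = 1000000;

	size_t per  = (bucket_bytes - prio_set_hdr) / bucket_disk_sz;
	size_t need = (nbuckets + per - 1) / per;   /* DIV_ROUND_UP */

	/* prints: prios_per_bucket = 131064, prio_buckets = 8 */
	printf("prios_per_bucket = %zu, prio_buckets = %zu\n", per, need);
	return 0;
}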
+
+void bch_cache_group_remove_cache(struct cache_group *, struct cache *);
+void bch_cache_group_add_cache(struct cache_group *, struct cache *);
+
+int bch_prio_read(struct cache *);
+
+void bch_recalc_min_prio(struct cache *, int);
+
+void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
+
+struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
+ struct write_point *,
+ unsigned, enum alloc_reserve,
+ struct closure *);
+
+void bch_alloc_sectors_append_ptrs(struct cache_set *, struct bkey_i_extent *,
+ unsigned, struct open_bucket *, unsigned);
+void bch_alloc_sectors_done(struct cache_set *, struct write_point *,
+ struct open_bucket *);
+
+struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *,
+ struct bkey_i_extent *, unsigned,
+ enum alloc_reserve, struct closure *);
+
+static inline void bch_wake_allocator(struct cache *ca)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ if ((p = ACCESS_ONCE(ca->alloc_thread)))
+ wake_up_process(p);
+ rcu_read_unlock();
+}
+
+static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
+ unsigned *iter)
+{
+ struct cache *ret = NULL;
+
+ while (*iter < devs->nr_devices &&
+ !(ret = rcu_dereference(devs->d[*iter].dev)))
+ (*iter)++;
+
+ return ret;
+}
+
+#define group_for_each_cache_rcu(ca, devs, iter) \
+ for ((iter) = 0; \
+ ((ca) = cache_group_next_rcu((devs), &(iter))); \
+ (iter)++)
+
+static inline struct cache *cache_group_next(struct cache_group *devs,
+ unsigned *iter)
+{
+ struct cache *ret;
+
+ rcu_read_lock();
+ if ((ret = cache_group_next_rcu(devs, iter)))
+ percpu_ref_get(&ret->ref);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+#define group_for_each_cache(ca, devs, iter) \
+ for ((iter) = 0; \
+ (ca = cache_group_next(devs, &(iter))); \
+ percpu_ref_put(&ca->ref), (iter)++)
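A hedged usage sketch of the iterator above (tier_nr_devices() is a hypothetical helper, not part of the patch). group_for_each_cache() takes a percpu ref on each device for the duration of the loop body, so the body itself does not need rcu_read_lock():

/* Hypothetical helper - illustrative only: */
static unsigned tier_nr_devices(struct cache_group *tier)
{
	struct cache *ca;
	unsigned i, nr = 0;

	group_for_each_cache(ca, tier, i)
		nr++;

	return nr;
}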
+
+#define __open_bucket_next_online_device(_c, _ob, _ptr, _ca) \
+({ \
+ (_ca) = NULL; \
+ \
+ while ((_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs && \
+ !((_ca) = PTR_CACHE(_c, _ptr))) \
+ (_ptr)++; \
+ (_ca); \
+})
+
+#define open_bucket_for_each_online_device(_c, _ob, _ptr, _ca) \
+ for ((_ptr) = (_ob)->ptrs; \
+ ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\
+ (_ptr)++)
+
+void bch_cache_allocator_stop(struct cache *);
+int bch_cache_allocator_start(struct cache *);
+void bch_open_buckets_init(struct cache_set *);
+
+#endif /* _BCACHE_ALLOC_H */
diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h
new file mode 100644
index 0000000..337b6e4
--- /dev/null
+++ b/libbcache/alloc_types.h
@@ -0,0 +1,102 @@
+#ifndef _BCACHE_ALLOC_TYPES_H
+#define _BCACHE_ALLOC_TYPES_H
+
+#include <linux/mutex.h>
+
+#include "clock_types.h"
+
+/*
+ * There's two of these clocks, one for reads and one for writes:
+ *
+ * All fields protected by bucket_lock
+ */
+struct prio_clock {
+ /*
+ * "now" in (read/write) IO time - incremented whenever we do X amount
+ * of reads or writes.
+ *
+ * Goes with the bucket read/write prios: when we read or write to a
+ * bucket we reset the bucket's prio to the current hand; thus hand -
+ * prio = time since bucket was last read/written.
+ *
+ * The units are some amount (bytes/sectors) of data read/written, and
+ * the units can change on the fly if we need to rescale to fit
+ * everything in a u16 - your only guarantee is that the units are
+ * consistent.
+ */
+ u16 hand;
+ u16 min_prio;
+
+ int rw;
+
+ struct io_timer rescale;
+};
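For illustration, the "time since bucket was last read/written" described above is just the wrapping distance between the clock hand and a bucket's prio. bucket_io_age() is a hypothetical helper, shown only to make the hand/prio relationship concrete:

/* Hypothetical helper - illustrative only: */
static inline u16 bucket_io_age(const struct prio_clock *clock, u16 bucket_prio)
{
	/* u16 arithmetic wraps, so this stays meaningful across hand wraparound */
	return (u16) (clock->hand - bucket_prio);
}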
+
+/* There is one reserve for each type of btree, one for prios and gens
+ * and one for moving GC */
+enum alloc_reserve {
+ RESERVE_PRIO,
+ RESERVE_BTREE,
+ RESERVE_METADATA_LAST = RESERVE_BTREE,
+ RESERVE_MOVINGGC,
+
+ RESERVE_NONE,
+ RESERVE_NR,
+};
+
+static inline bool allocation_is_metadata(enum alloc_reserve id)
+{
+ return id <= RESERVE_METADATA_LAST;
+}
+
+struct cache_group {
+ spinlock_t lock;
+ unsigned nr_devices;
+ unsigned cur_device;
+ struct {
+ u64 weight;
+ struct cache *dev;
+ } d[MAX_CACHES_PER_SET];
+};
+
+/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
+#define OPEN_BUCKETS_COUNT 256
+
+#define WRITE_POINT_COUNT 16
+
+struct open_bucket {
+ struct list_head list;
+ struct mutex lock;
+ atomic_t pin;
+ bool has_full_ptrs;
+ /*
+ * recalculated every time we allocate from this open_bucket based on
+ * how many pointers we're actually going to use:
+ */
+ unsigned sectors_free;
+ unsigned nr_ptrs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+ unsigned ptr_offset[BCH_REPLICAS_MAX];
+};
+
+struct write_point {
+ struct open_bucket *b;
+
+ /*
+ * Throttle writes to this write point if tier 0 is full?
+ */
+ bool throttle;
+
+ /*
+ * If not NULL, cache group for tiering, promotion and moving GC -
+ * always allocates a single replica
+ */
+ struct cache_group *group;
+
+ /*
+ * Otherwise do a normal replicated bucket allocation that could come
+ * from any device in tier 0 (foreground write)
+ */
+};
+
+#endif /* _BCACHE_ALLOC_TYPES_H */
diff --git a/libbcache/bcache.h b/libbcache/bcache.h
new file mode 100644
index 0000000..9a43a69
--- /dev/null
+++ b/libbcache/bcache.h
@@ -0,0 +1,831 @@
+#ifndef _BCACHE_H
+#define _BCACHE_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like an md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices; the main abstraction is the cache set.
+ *
+ * Multiple cache devices are intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There are also flash-only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk; otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
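To make the generation scheme described above concrete, here is a minimal sketch of the staleness test it implies. gen_after() is written here purely for illustration; the real checks live in the extent/bucket code:

/*
 * Sketch: a pointer is stale once the bucket it points into has been
 * reused, i.e. the bucket's gen has been incremented past the gen
 * embedded in the pointer. Gens are 8 bits and wrap, hence the signed
 * difference.
 */
static inline bool gen_after(u8 bucket_gen, u8 ptr_gen)
{
	return (s8) (bucket_gen - ptr_gen) > 0;
}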
+
+#undef pr_fmt
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/bug.h>
+#include <linux/bcache.h>
+#include <linux/bio.h>
+#include <linux/kobject.h>
+#include <linux/lglock.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/percpu-refcount.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/rhashtable.h>
+#include <linux/rwsem.h>
+#include <linux/seqlock.h>
+#include <linux/shrinker.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "bset.h"
+#include "fifo.h"
+#include "util.h"
+#include "closure.h"
+#include "opts.h"
+
+#include <linux/dynamic_fault.h>
+
+#define cache_set_init_fault(name) \
+ dynamic_fault("bcache:cache_set_init:" name)
+#define bch_meta_read_fault(name) \
+ dynamic_fault("bcache:meta:read:" name)
+#define bch_meta_write_fault(name) \
+ dynamic_fault("bcache:meta:write:" name)
+
+#define bch_fmt(_c, fmt) \
+ "bcache (%s): " fmt "\n", ((_c)->name)
+
+#define bch_info(c, fmt, ...) \
+ printk(KERN_INFO bch_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_notice(c, fmt, ...) \
+ printk(KERN_NOTICE bch_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn(c, fmt, ...) \
+ printk(KERN_WARNING bch_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err(c, fmt, ...) \
+ printk(KERN_ERR bch_fmt(c, fmt), ##__VA_ARGS__)
+
+#define bch_verbose(c, fmt, ...) \
+do { \
+ if ((c)->opts.verbose_recovery) \
+ bch_info(c, fmt, ##__VA_ARGS__); \
+} while (0)
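The logging helpers above take a cache_set and printf-style arguments; a brief usage sketch (the wrapper and messages are made up):

/* Illustrative only: */
static void example_log(struct cache_set *c)
{
	bch_info(c, "starting mark and sweep gc");
	bch_verbose(c, "journal replay done");	/* only printed with opts.verbose_recovery */
}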
+
+/* Parameters that are useful for debugging, but should always be compiled in: */
+#define BCH_DEBUG_PARAMS_ALWAYS() \
+ BCH_DEBUG_PARAM(key_merging_disabled, \
+ "Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
+ "Causes mark and sweep to compact and rewrite every " \
+ "btree node it traverses") \
+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
+ "Disables rewriting of btree nodes during mark and sweep")\
+ BCH_DEBUG_PARAM(btree_gc_coalesce_disabled, \
+ "Disables coalescing of btree nodes") \
+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \
+ "Disables the shrinker callback for the btree node cache")
+
+/* Parameters that are only compiled in when CONFIG_BCACHE_DEBUG is set: */
+#define BCH_DEBUG_PARAMS_DEBUG() \
+ BCH_DEBUG_PARAM(expensive_debug_checks, \
+ "Enables various runtime debugging checks that " \
+ "significantly affect performance") \
+ BCH_DEBUG_PARAM(debug_check_bkeys, \
+ "Run bkey_debugcheck (primarily checking GC/allocation "\
+ "information) when iterating over keys") \
+ BCH_DEBUG_PARAM(version_stress_test, \
+ "Assigns random version numbers to newly written " \
+ "extents, to test overlapping extent cases") \
+ BCH_DEBUG_PARAM(verify_btree_ondisk, \
+ "Reread btree nodes at various points to verify the " \
+ "mergesort in the read path against modifications " \
+ "done in memory") \
+
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
+
+#ifdef CONFIG_BCACHE_DEBUG
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
+#else
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+#endif
+
+/* name, frequency_units, duration_units */
+#define BCH_TIME_STATS() \
+ BCH_TIME_STAT(mca_alloc, sec, us) \
+ BCH_TIME_STAT(mca_scan, sec, ms) \
+ BCH_TIME_STAT(btree_gc, sec, ms) \
+ BCH_TIME_STAT(btree_coalesce, sec, ms) \
+ BCH_TIME_STAT(btree_split, sec, us) \
+ BCH_TIME_STAT(btree_sort, ms, us) \
+ BCH_TIME_STAT(btree_read, ms, us) \
+ BCH_TIME_STAT(journal_write, us, us) \
+ BCH_TIME_STAT(journal_delay, ms, us) \
+ BCH_TIME_STAT(journal_blocked, sec, ms) \
+ BCH_TIME_STAT(journal_flush_seq, us, us)
+
+#include "alloc_types.h"
+#include "blockdev_types.h"
+#include "buckets_types.h"
+#include "clock_types.h"
+#include "io_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "keybuf_types.h"
+#include "move_types.h"
+#include "stats_types.h"
+#include "super_types.h"
+
+/* 256k, in sectors */
+#define BTREE_NODE_SIZE_MAX 512
+
+/*
+ * Number of nodes we might have to allocate in a worst case btree split
+ * operation - we split all the way up to the root, then allocate a new root.
+ */
+#define btree_reserve_required_nodes(depth) (((depth) + 1) * 2 + 1)
+
+/* Number of nodes btree coalesce will try to coalesce at once */
+#define GC_MERGE_NODES 4U
+
+/* Maximum number of nodes we might need to allocate atomically: */
+#define BTREE_RESERVE_MAX \
+ (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
+
+/* Size of the freelist we allocate btree nodes from: */
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2)
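Plugging in numbers makes the reserve sizing above concrete. This assumes BTREE_MAX_DEPTH is 4 (it is defined in the on-disk format headers, not shown in this file):

/*
 * btree_reserve_required_nodes(4) = (4 + 1) * 2 + 1       = 11
 * BTREE_RESERVE_MAX               = 11 + GC_MERGE_NODES   = 15
 * BTREE_NODE_RESERVE              = BTREE_RESERVE_MAX * 2 = 30
 */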
+
+struct btree;
+struct cache;
+
+enum gc_phase {
+ GC_PHASE_PENDING_DELETE = BTREE_ID_NR + 1,
+ GC_PHASE_DONE
+};
+
+struct gc_pos {
+ enum gc_phase phase;
+ struct bpos pos;
+ unsigned level;
+};
+
+struct cache_member_cpu {
+ u64 nbuckets; /* device size */
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u8 state;
+ u8 tier;
+ u8 replication_set;
+ u8 has_metadata;
+ u8 has_data;
+ u8 replacement;
+ u8 discard;
+ u8 valid;
+};
+
+struct cache_member_rcu {
+ struct rcu_head rcu;
+ unsigned nr_in_set;
+ struct cache_member_cpu m[];
+};
+
+/* cache->flags: */
+enum {
+ CACHE_DEV_REMOVING,
+ CACHE_DEV_FORCE_REMOVE,
+};
+
+struct cache {
+ struct percpu_ref ref;
+ struct rcu_head free_rcu;
+ struct work_struct free_work;
+ struct work_struct remove_work;
+ unsigned long flags;
+
+ struct cache_set *set;
+
+ struct cache_group self;
+
+ /*
+	 * Cached version of this device's member info from the superblock.
+	 * Committed by write_super().
+ */
+ struct {
+ u8 nr_this_dev;
+ } sb;
+ struct cache_member_cpu mi;
+
+ struct bcache_superblock disk_sb;
+
+ struct kobject kobj;
+
+ /* biosets used in cloned bios for replicas and moving_gc */
+ struct bio_set replica_set;
+
+ struct task_struct *alloc_thread;
+
+ struct prio_set *disk_buckets;
+
+ /*
+ * When allocating new buckets, prio_write() gets first dibs - since we
+	 * may not be able to allocate at all without writing priorities and gens.
+ * prio_last_buckets[] contains the last buckets we wrote priorities to
+ * (so gc can mark them as metadata).
+ */
+ u64 *prio_buckets;
+ u64 *prio_last_buckets;
+ spinlock_t prio_buckets_lock;
+ struct bio *bio_prio;
+
+ /*
+ * free: Buckets that are ready to be used
+ *
+ * free_inc: Incoming buckets - these are buckets that currently have
+ * cached data in them, and we can't reuse them until after we write
+ * their new gen to disk. After prio_write() finishes writing the new
+ * gens/prios, they'll be moved to the free list (and possibly discarded
+ * in the process)
+ */
+ DECLARE_FIFO(long, free)[RESERVE_NR];
+ DECLARE_FIFO(long, free_inc);
+ spinlock_t freelist_lock;
+
+ size_t fifo_last_bucket;
+
+ /* Allocation stuff: */
+
+ /* most out of date gen in the btree */
+ u8 *oldest_gens;
+ struct bucket *buckets;
+ unsigned short bucket_bits; /* ilog2(bucket_size) */
+
+ /* last calculated minimum prio */
+ u16 min_prio[2];
+
+ /*
+	 * Bucket bookkeeping. The first element is updated by GC, the
+ * second contains a saved copy of the stats from the beginning
+ * of GC.
+ */
+ struct bucket_stats_cache __percpu *bucket_stats_percpu;
+ struct bucket_stats_cache bucket_stats_cached;
+
+ atomic_long_t saturated_count;
+ size_t inc_gen_needs_gc;
+
+ struct mutex heap_lock;
+ DECLARE_HEAP(struct bucket_heap_entry, heap);
+
+ /* Moving GC: */
+ struct task_struct *moving_gc_read;
+
+ struct bch_pd_controller moving_gc_pd;
+
+ /* Tiering: */
+ struct write_point tiering_write_point;
+
+ struct write_point copygc_write_point;
+
+ struct journal_device journal;
+
+ struct work_struct io_error_work;
+
+ /* The rest of this all shows up in sysfs */
+#define IO_ERROR_SHIFT 20
+ atomic_t io_errors;
+ atomic_t io_count;
+
+ atomic64_t meta_sectors_written;
+ atomic64_t btree_sectors_written;
+ u64 __percpu *sectors_written;
+};
+
+/*
+ * Flag bits for what phase of startup/shutdown the cache set is at, how we're
+ * shutting down, etc.:
+ *
+ * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
+ * all the backing devices first (their cached data gets invalidated, and they
+ * won't automatically reattach).
+ *
+ * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
+ * we'll continue to run normally for a while with CACHE_SET_STOPPING set (i.e.
+ * flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
+ */
+enum {
+ /* Startup: */
+ CACHE_SET_INITIAL_GC_DONE,
+ CACHE_SET_RUNNING,
+
+ /* Shutdown: */
+ CACHE_SET_UNREGISTERING,
+ CACHE_SET_STOPPING,
+ CACHE_SET_RO,
+ CACHE_SET_RO_COMPLETE,
+ CACHE_SET_EMERGENCY_RO,
+ CACHE_SET_WRITE_DISABLE_COMPLETE,
+ CACHE_SET_GC_STOPPING,
+ CACHE_SET_GC_FAILURE,
+ CACHE_SET_BDEV_MOUNTED,
+ CACHE_SET_ERROR,
+ CACHE_SET_FSCK_FIXED_ERRORS,
+};
+
+struct btree_debug {
+ unsigned id;
+ struct dentry *btree;
+ struct dentry *btree_format;
+ struct dentry *failed;
+};
+
+struct cache_set {
+ struct closure cl;
+
+ struct list_head list;
+ struct kobject kobj;
+ struct kobject internal;
+ struct kobject opts_dir;
+ struct kobject time_stats;
+ struct completion *stop_completion;
+ unsigned long flags;
+
+ int minor;
+ struct device *chardev;
+ struct super_block *vfs_sb;
+ char name[40];
+
+ /* Counts outstanding writes, for clean transition to read-only */
+ struct percpu_ref writes;
+ struct work_struct read_only_work;
+
+ struct cache __rcu *cache[MAX_CACHES_PER_SET];
+
+ struct mutex mi_lock;
+ struct cache_member_rcu __rcu *members;
+ struct cache_member *disk_mi; /* protected by register_lock */
+
+ struct cache_set_opts opts;
+
+ /*
+ * Cached copy in native endianness:
+ * Set by cache_sb_to_cache_set:
+ */
+ struct {
+ u16 block_size;
+ u16 btree_node_size;
+
+ u8 nr_in_set;
+ u8 clean;
+
+ u8 meta_replicas_have;
+ u8 data_replicas_have;
+
+ u8 str_hash_type;
+ } sb;
+
+ struct cache_sb disk_sb;
+ unsigned short block_bits; /* ilog2(block_size) */
+
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
+
+ struct backing_dev_info bdi;
+
+ /* BTREE CACHE */
+ struct bio_set btree_read_bio;
+
+ struct btree_root btree_roots[BTREE_ID_NR];
+ struct mutex btree_root_lock;
+
+ bool btree_cache_table_init_done;
+ struct rhashtable btree_cache_table;
+
+ /*
+ * We never free a struct btree, except on shutdown - we just put it on
+ * the btree_cache_freed list and reuse it later. This simplifies the
+ * code, and it doesn't cost us much memory as the memory usage is
+ * dominated by buffers that hold the actual btree node data and those
+ * can be freed - and the number of struct btrees allocated is
+ * effectively bounded.
+ *
+ * btree_cache_freeable effectively is a small cache - we use it because
+ * high order page allocations can be rather expensive, and it's quite
+ * common to delete and allocate btree nodes in quick succession. It
+ * should never grow past ~2-3 nodes in practice.
+ */
+ struct mutex btree_cache_lock;
+ struct list_head btree_cache;
+ struct list_head btree_cache_freeable;
+ struct list_head btree_cache_freed;
+
+ /* Number of elements in btree_cache + btree_cache_freeable lists */
+ unsigned btree_cache_used;
+ unsigned btree_cache_reserve;
+ struct shrinker btree_cache_shrink;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
+ */
+ struct closure_waitlist mca_wait;
+ struct task_struct *btree_cache_alloc_lock;
+
+ mempool_t btree_reserve_pool;
+
+ /*
+ * Cache of allocated btree nodes - if we allocate a btree node and
+ * don't use it, if we free it that space can't be reused until going
+ * _all_ the way through the allocator (which exposes us to a livelock
+	 * when allocating btree reserves fails halfway through) - instead, we
+ * can stick them here:
+ */
+ struct btree_alloc {
+ struct open_bucket *ob;
+ BKEY_PADDED(k);
+ } btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ unsigned btree_reserve_cache_nr;
+ struct mutex btree_reserve_cache_lock;
+
+ mempool_t btree_interior_update_pool;
+ struct list_head btree_interior_update_list;
+ struct mutex btree_interior_update_lock;
+
+ struct workqueue_struct *wq;
+ /* copygc needs its own workqueue for index updates.. */
+ struct workqueue_struct *copygc_wq;
+
+ /* ALLOCATION */
+ struct bch_pd_controller foreground_write_pd;
+ struct delayed_work pd_controllers_update;
+ unsigned pd_controllers_update_seconds;
+ spinlock_t foreground_write_pd_lock;
+ struct bch_write_op *write_wait_head;
+ struct bch_write_op *write_wait_tail;
+
+ struct timer_list foreground_write_wakeup;
+
+ /*
+ * These contain all r/w devices - i.e. devices we can currently
+ * allocate from:
+ */
+ struct cache_group cache_all;
+ struct cache_group cache_tiers[CACHE_TIERS];
+
+ u64 capacity; /* sectors */
+
+ /*
+ * When capacity _decreases_ (due to a disk being removed), we
+ * increment capacity_gen - this invalidates outstanding reservations
+ * and forces them to be revalidated
+ */
+ u32 capacity_gen;
+
+ atomic64_t sectors_available;
+
+ struct bucket_stats_cache_set __percpu *bucket_stats_percpu;
+ struct bucket_stats_cache_set bucket_stats_cached;
+ struct lglock bucket_stats_lock;
+
+ struct mutex bucket_lock;
+
+ struct closure_waitlist freelist_wait;
+
+
+ /*
+ * When we invalidate buckets, we use both the priority and the amount
+ * of good data to determine which buckets to reuse first - to weight
+ * those together consistently we keep track of the smallest nonzero
+ * priority of any bucket.
+ */
+ struct prio_clock prio_clock[2];
+
+ struct io_clock io_clock[2];
+
+ /* SECTOR ALLOCATOR */
+ struct list_head open_buckets_open;
+ struct list_head open_buckets_free;
+ unsigned open_buckets_nr_free;
+ struct closure_waitlist open_buckets_wait;
+ spinlock_t open_buckets_lock;
+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+
+ struct write_point btree_write_point;
+
+ struct write_point write_points[WRITE_POINT_COUNT];
+ struct write_point promote_write_point;
+
+ /*
+ * This write point is used for migrating data off a device
+ * and can point to any other device.
+ * We can't use the normal write points because those will
+ * gang up n replicas, and for migration we want only one new
+ * replica.
+ */
+ struct write_point migration_write_point;
+
+ /* GARBAGE COLLECTION */
+ struct task_struct *gc_thread;
+ atomic_t kick_gc;
+
+ /*
+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
+ * has been marked by GC.
+ *
+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
+ *
+ * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
+ * currently running, and gc marks are currently valid
+ *
+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
+ * can read without a lock.
+ */
+ seqcount_t gc_pos_lock;
+ struct gc_pos gc_pos;
+
+ /*
+ * The allocation code needs gc_mark in struct bucket to be correct, but
+ * it's not while a gc is in progress.
+ */
+ struct rw_semaphore gc_lock;
+
+ /* IO PATH */
+ struct bio_set bio_read;
+ struct bio_set bio_read_split;
+ struct bio_set bio_write;
+ struct mutex bio_bounce_pages_lock;
+ mempool_t bio_bounce_pages;
+
+ mempool_t lz4_workspace_pool;
+ void *zlib_workspace;
+ struct mutex zlib_workspace_lock;
+ mempool_t compression_bounce[2];
+ struct bio_decompress_worker __percpu
+ *bio_decompress_worker;
+
+ /* For punting bio submissions to workqueue, io.c */
+ struct bio_list bio_submit_list;
+ struct work_struct bio_submit_work;
+ spinlock_t bio_submit_lock;
+
+ struct bio_list read_retry_list;
+ struct work_struct read_retry_work;
+ spinlock_t read_retry_lock;
+
+ /* FILESYSTEM */
+ wait_queue_head_t writeback_wait;
+ atomic_t writeback_pages;
+ unsigned writeback_pages_max;
+ atomic_long_t nr_inodes;
+
+ /* TIERING */
+ struct task_struct *tiering_read;
+ struct bch_pd_controller tiering_pd;
+
+ /* NOTIFICATIONS */
+ struct mutex uevent_lock;
+ struct kobj_uevent_env uevent_env;
+
+ /* DEBUG JUNK */
+ struct dentry *debug;
+ struct btree_debug btree_debug[BTREE_ID_NR];
+#ifdef CONFIG_BCACHE_DEBUG
+ struct btree *verify_data;
+ struct btree_node *verify_ondisk;
+ struct mutex verify_lock;
+#endif
+
+ u64 unused_inode_hint;
+
+ /*
+ * A btree node on disk could have too many bsets for an iterator to fit
+ * on the stack - have to dynamically allocate them
+ */
+ mempool_t fill_iter;
+
+ mempool_t btree_bounce_pool;
+
+ struct journal journal;
+
+ unsigned bucket_journal_seq;
+
+ /* CACHING OTHER BLOCK DEVICES */
+ mempool_t search;
+ struct radix_tree_root devices;
+ struct list_head cached_devs;
+ u64 cached_dev_sectors;
+ struct closure caching;
+
+#define CONGESTED_MAX 1024
+ unsigned congested_last_us;
+ atomic_t congested;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned congested_read_threshold_us;
+ unsigned congested_write_threshold_us;
+
+ struct cache_accounting accounting;
+ atomic_long_t cache_read_races;
+ atomic_long_t writeback_keys_done;
+ atomic_long_t writeback_keys_failed;
+
+ unsigned error_limit;
+ unsigned error_decay;
+
+ unsigned foreground_write_ratelimit_enabled:1;
+ unsigned copy_gc_enabled:1;
+ unsigned tiering_enabled:1;
+ unsigned tiering_percent;
+
+ /*
+ * foreground writes will be throttled when the number of free
+ * buckets is below this percentage
+ */
+ unsigned foreground_target_percent;
+
+#define BCH_DEBUG_PARAM(name, description) bool name;
+ BCH_DEBUG_PARAMS_ALL()
+#undef BCH_DEBUG_PARAM
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ struct time_stats name##_time;
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+};
+
+static inline unsigned bucket_pages(const struct cache *ca)
+{
+ return ca->mi.bucket_size / PAGE_SECTORS;
+}
+
+static inline unsigned bucket_bytes(const struct cache *ca)
+{
+ return ca->mi.bucket_size << 9;
+}
+
+static inline unsigned block_bytes(const struct cache_set *c)
+{
+ return c->sb.block_size << 9;
+}
+
+#endif /* _BCACHE_H */
diff --git a/libbcache/bkey.c b/libbcache/bkey.c
new file mode 100644
index 0000000..64d2c84
--- /dev/null
+++ b/libbcache/bkey.c
@@ -0,0 +1,1261 @@
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/kernel.h>
+
+#include "bkey.h"
+#include "bset.h"
+#include "util.h"
+
+const struct bkey_format bch_bkey_format_current = BKEY_FORMAT_CURRENT;
+
+struct bkey __bkey_unpack_key(const struct bkey_format *,
+ const struct bkey_packed *);
+
+void bch_to_binary(char *out, const u64 *p, unsigned nr_bits)
+{
+ unsigned bit = high_bit_offset, done = 0;
+
+ while (1) {
+ while (bit < 64) {
+ if (done && !(done % 8))
+ *out++ = ' ';
+ *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
+ bit++;
+ done++;
+ if (done == nr_bits) {
+ *out++ = '\0';
+ return;
+ }
+ }
+
+ p = next_word(p);
+ bit = 0;
+ }
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static void bch_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
+{
+ struct bkey tmp;
+
+ BUG_ON(bkeyp_val_u64s(format, packed) !=
+ bkey_val_u64s(unpacked));
+
+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
+
+ tmp = __bkey_unpack_key(format, packed);
+
+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
+ char buf1[160], buf2[160];
+ char buf3[160], buf4[160];
+
+ bch_bkey_to_text(buf1, sizeof(buf1), unpacked);
+ bch_bkey_to_text(buf2, sizeof(buf2), &tmp);
+ bch_to_binary(buf3, (void *) unpacked, 80);
+ bch_to_binary(buf4, high_word(format, packed), 80);
+
+ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+ format->key_u64s,
+ format->bits_per_field[0],
+ format->bits_per_field[1],
+ format->bits_per_field[2],
+ format->bits_per_field[3],
+ format->bits_per_field[4],
+ buf1, buf2, buf3, buf4);
+ }
+}
+
+#else
+static inline void bch_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format) {}
+#endif
+
+int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
+{
+ char *out = buf, *end = buf + size;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ p("u64s %u type %u %llu:%llu snap %u len %u ver %u",
+ k->u64s, k->type, k->p.inode, k->p.offset,
+ k->p.snapshot, k->size, k->version);
+
+ BUG_ON(bkey_packed(k));
+
+ switch (k->type) {
+ case KEY_TYPE_DELETED:
+ p(" deleted");
+ break;
+ case KEY_TYPE_DISCARD:
+ p(" discard");
+ break;
+ case KEY_TYPE_ERROR:
+ p(" error");
+ break;
+ case KEY_TYPE_COOKIE:
+ p(" cookie");
+ break;
+ }
+#undef p
+
+ return out - buf;
+}
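A short usage sketch of the formatter above (the surrounding function and message are hypothetical):

/* Illustrative only: */
static void example_print_key(struct cache_set *c, const struct bkey *k)
{
	char buf[160];

	bch_bkey_to_text(buf, sizeof(buf), k);
	bch_info(c, "key: %s", buf);
}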
+
+struct pack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct pack_state pack_state_init(const struct bkey_format *format,
+ struct bkey_packed *k)
+{
+ u64 *p = high_word(format, k);
+
+ return (struct pack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = 0,
+ .p = p,
+ };
+}
+
+__always_inline
+static void pack_state_finish(struct pack_state *state,
+ struct bkey_packed *k)
+{
+ EBUG_ON(state->p < k->_data);
+ EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+
+ *state->p = state->w;
+}
+
+struct unpack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ const u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(format, k);
+
+ return (struct unpack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = *p << high_bit_offset,
+ .p = p,
+ };
+}
+
+__always_inline
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (bits >= state->bits) {
+ v = state->w >> (64 - bits);
+ bits -= state->bits;
+
+ state->p = next_word(state->p);
+ state->w = *state->p;
+ state->bits = 64;
+ }
+
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+ v |= (state->w >> 1) >> (63 - bits);
+ state->w <<= bits;
+ state->bits -= bits;
+
+ return v + offset;
+}
+
+__always_inline
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (v < offset)
+ return false;
+
+ v -= offset;
+
+ if (fls64(v) > bits)
+ return false;
+
+ if (bits > state->bits) {
+ bits -= state->bits;
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+
+ return true;
+}
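A small worked example of the field encoding used by set_inc_field() and get_inc_field() (all numbers below are made up for illustration):

/*
 * format->field_offset[BKEY_FIELD_INODE]   == 4096
 * format->bits_per_field[BKEY_FIELD_INODE] == 8
 *
 * inode 4200: stored as 4200 - 4096 = 104 (fits in 8 bits)   -> packs
 * inode 4095: fails the v < offset check                     -> can't pack
 * inode 5000: 5000 - 4096 = 904, fls64() == 10 > 8 bits      -> can't pack
 */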
+
+/*
+ * Note: does NOT set out->format (we don't know what it should be here!)
+ *
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
+ * if k is packed bkey_start_pos(k) will successfully pack
+ */
+static bool bch_bkey_transform_key(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ struct pack_state out_s = pack_state_init(out_f, out);
+ struct unpack_state in_s = unpack_state_init(in_f, in);
+ unsigned i;
+
+ out->_data[0] = 0;
+
+ for (i = 0; i < BKEY_NR_FIELDS; i++)
+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
+ return false;
+
+ /* Can't happen because the val would be too big to unpack: */
+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
+ pack_state_finish(&out_s, out);
+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ return true;
+}
+
+bool bch_bkey_transform(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ if (!bch_bkey_transform_key(out_f, out, in_f, in))
+ return false;
+
+ memcpy_u64s((u64 *) out + out_f->key_u64s,
+ (u64 *) in + in_f->key_u64s,
+ (in->u64s - in_f->key_u64s));
+ return true;
+}
+
+struct bkey __bkey_unpack_key(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bkey out;
+
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+
+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
+ out.format = KEY_FORMAT_CURRENT;
+ out.needs_whiteout = in->needs_whiteout;
+ out.type = in->type;
+ out.pad[0] = 0;
+ out.p.inode = get_inc_field(&state, BKEY_FIELD_INODE);
+ out.p.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
+ out.p.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+ out.size = get_inc_field(&state, BKEY_FIELD_SIZE);
+ out.version = get_inc_field(&state, BKEY_FIELD_VERSION);
+
+ return out;
+}
+
+#ifndef HAVE_BCACHE_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bpos out;
+
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+
+ return out;
+}
+#endif
+
+/**
+ * bkey_pack_key -- pack just the key, not the value
+ */
+bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+ const struct bkey_format *format)
+{
+ struct pack_state state = pack_state_init(format, out);
+
+ EBUG_ON((void *) in == (void *) out);
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+ out->_data[0] = 0;
+
+ if (!set_inc_field(&state, BKEY_FIELD_INODE, in->p.inode) ||
+ !set_inc_field(&state, BKEY_FIELD_OFFSET, in->p.offset) ||
+ !set_inc_field(&state, BKEY_FIELD_SNAPSHOT, in->p.snapshot) ||
+ !set_inc_field(&state, BKEY_FIELD_SIZE, in->size) ||
+ !set_inc_field(&state, BKEY_FIELD_VERSION, in->version))
+ return false;
+
+ /*
+ * Extents - we have to guarantee that if an extent is packed, a trimmed
+ * version will also pack:
+ */
+	if (bkey_start_offset(in) <
+	    le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
+ return false;
+
+ pack_state_finish(&state, out);
+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ bch_bkey_pack_verify(out, in, format);
+ return true;
+}
+
+/*
+ * Alternate implementations using bch_bkey_transform_key() - unfortunately, too
+ * slow
+ */
+#if 0
+struct bkey __bkey_unpack_key(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct bkey out;
+ bool s;
+
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+ s = bch_bkey_transform_key(&bch_bkey_format_current, (void *) &out,
+ format, in);
+ EBUG_ON(!s);
+
+ out.format = KEY_FORMAT_CURRENT;
+
+ return out;
+}
+
+bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+ const struct bkey_format *format)
+{
+ EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+ if (!bch_bkey_transform_key(format, out,
+ &bch_bkey_format_current, (void *) in))
+ return false;
+
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+
+ bch_bkey_pack_verify(out, in, format);
+ return true;
+}
+#endif
+
+/**
+ * bkey_unpack -- unpack the key and the value
+ */
+void bkey_unpack(const struct btree *b, struct bkey_i *dst,
+ const struct bkey_packed *src)
+{
+ dst->k = bkey_unpack_key(b, src);
+
+ memcpy_u64s(&dst->v,
+ bkeyp_val(&b->format, src),
+ bkeyp_val_u64s(&b->format, src));
+}
+
+/**
+ * bkey_pack -- pack the key and the value
+ */
+bool bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
+ const struct bkey_format *format)
+{
+ struct bkey_packed tmp;
+
+ if (!bkey_pack_key(&tmp, &in->k, format))
+ return false;
+
+ memmove_u64s((u64 *) out + format->key_u64s,
+ &in->v,
+ bkey_val_u64s(&in->k));
+ memcpy_u64s(out, &tmp, format->key_u64s);
+
+ return true;
+}
+
+__always_inline
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+ bool ret = true;
+
+ EBUG_ON(v < offset);
+ v -= offset;
+
+ if (fls64(v) > bits) {
+ v = ~(~0ULL << bits);
+ ret = false;
+ }
+
+ if (bits > state->bits) {
+ bits -= state->bits;
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+
+ return ret;
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+static bool bkey_packed_successor(struct bkey_packed *out,
+ const struct btree *b,
+ struct bkey_packed k)
+{
+ const struct bkey_format *f = &b->format;
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned first_bit, offset;
+ u64 *p;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ if (!nr_key_bits)
+ return false;
+
+ *out = k;
+
+ first_bit = high_bit_offset + nr_key_bits - 1;
+ p = nth_word(high_word(f, out), first_bit >> 6);
+ offset = 63 - (first_bit & 63);
+
+ while (nr_key_bits) {
+ unsigned bits = min(64 - offset, nr_key_bits);
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if ((*p & mask) != mask) {
+ *p += 1ULL << offset;
+ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+ return true;
+ }
+
+ *p &= ~mask;
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ offset = 0;
+ }
+
+ return false;
+}
+#endif
+
+/*
+ * Returns a packed key that compares <= in
+ *
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
+ * able to compare against the keys in the auxiliary search tree - and it's
+ * legal to use a packed pos that isn't equivalent to the original pos,
+ * _provided_ it compares <= to the original pos.
+ */
+enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *out,
+ struct bpos in,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ struct pack_state state = pack_state_init(f, out);
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bpos orig = in;
+#endif
+ bool exact = true;
+
+ out->_data[0] = 0;
+
+ if (unlikely(in.snapshot <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
+ if (!in.offset-- &&
+ !in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.offset <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
+ if (!in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.inode <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
+ return BKEY_PACK_POS_FAIL;
+
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
+ exact = false;
+
+ pack_state_finish(&state, out);
+ out->u64s = f->key_u64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->type = KEY_TYPE_DELETED;
+
+#ifdef CONFIG_BCACHE_DEBUG
+ if (exact) {
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+ } else {
+ struct bkey_packed successor;
+
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+ bkey_cmp_left_packed(b, &successor, &orig) < 0);
+ }
+#endif
+
+ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
+}
+
+void bch_bkey_format_init(struct bkey_format_state *s)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
+ s->field_min[i] = U64_MAX;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
+ s->field_max[i] = 0;
+
+ /* Make sure we can store a size of 0: */
+ s->field_min[BKEY_FIELD_SIZE] = 0;
+}
+
+static void __bkey_format_add(struct bkey_format_state *s,
+ unsigned field, u64 v)
+{
+ s->field_min[field] = min(s->field_min[field], v);
+ s->field_max[field] = max(s->field_max[field], v);
+}
+
+/*
+ * Changes @format so that @k can be successfully packed with @format
+ */
+void bch_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+ __bkey_format_add(s, BKEY_FIELD_INODE, k->p.inode);
+ __bkey_format_add(s, BKEY_FIELD_OFFSET, k->p.offset);
+ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
+ __bkey_format_add(s, BKEY_FIELD_SNAPSHOT, k->p.snapshot);
+ __bkey_format_add(s, BKEY_FIELD_SIZE, k->size);
+ __bkey_format_add(s, BKEY_FIELD_VERSION, k->version);
+}
+
+void bch_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
+{
+ unsigned field = 0;
+
+ __bkey_format_add(s, field++, p.inode);
+ __bkey_format_add(s, field++, p.offset);
+ __bkey_format_add(s, field++, p.snapshot);
+}
+
+/*
+ * We don't want it to be possible for the packed format to represent fields
+ * bigger than a u64... that will cause confusion and issues (like with
+ * bkey_packed_successor())
+ */
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
+ unsigned bits, u64 offset)
+{
+ offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
+
+ f->bits_per_field[i] = bits;
+ f->field_offset[i] = cpu_to_le64(offset);
+}
+
+struct bkey_format bch_bkey_format_done(struct bkey_format_state *s)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+ struct bkey_format ret = {
+ .nr_fields = BKEY_NR_FIELDS,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
+ s->field_min[i] = min(s->field_min[i], s->field_max[i]);
+
+ set_format_field(&ret, i,
+ fls64(s->field_max[i] - s->field_min[i]),
+ s->field_min[i]);
+
+ bits += ret.bits_per_field[i];
+ }
+
+ ret.key_u64s = DIV_ROUND_UP(bits, 64);
+
+ /* if we have enough spare bits, round fields up to nearest byte */
+ bits = ret.key_u64s * 64 - bits;
+
+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
+ unsigned r = round_up(ret.bits_per_field[i], 8) -
+ ret.bits_per_field[i];
+
+ if (r <= bits) {
+ set_format_field(&ret, i,
+ ret.bits_per_field[i] + r,
+ le64_to_cpu(ret.field_offset[i]));
+ bits -= r;
+ }
+ }
+
+ EBUG_ON(bch_bkey_format_validate(&ret));
+ return ret;
+}
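A hedged sketch of how the format-building API above fits together; example_format() is a hypothetical caller, the real one lives in the btree node allocation path:

/* Hypothetical caller - illustrative only: */
static struct bkey_format example_format(const struct bkey *keys, unsigned nr)
{
	struct bkey_format_state s;
	unsigned i;

	bch_bkey_format_init(&s);

	for (i = 0; i < nr; i++)
		bch_bkey_format_add_key(&s, &keys[i]);

	return bch_bkey_format_done(&s);
}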
+
+const char *bch_bkey_format_validate(struct bkey_format *f)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+
+ if (f->nr_fields != BKEY_NR_FIELDS)
+ return "invalid format: incorrect number of fields";
+
+ for (i = 0; i < f->nr_fields; i++) {
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f->bits_per_field[i] > 64)
+ return "invalid format: field too large";
+
+ if (field_offset &&
+ (f->bits_per_field[i] == 64 ||
+ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
+ field_offset)))
+ return "invalid format: offset + bits overflow";
+
+ bits += f->bits_per_field[i];
+ }
+
+ if (f->key_u64s != DIV_ROUND_UP(bits, 64))
+ return "invalid format: incorrect key_u64s";
+
+ return NULL;
+}
+
+/*
+ * Most significant differing bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bkey_greatest_differing_bit(const struct btree *b,
+ const struct bkey_packed *l_k,
+ const struct bkey_packed *r_k)
+{
+ const u64 *l = high_word(&b->format, l_k);
+ const u64 *r = high_word(&b->format, r_k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned word_bits = 64 - high_bit_offset;
+ u64 l_v, r_v;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ /* for big endian, skip past header */
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (nr_key_bits) {
+ if (nr_key_bits < word_bits) {
+ l_v >>= word_bits - nr_key_bits;
+ r_v >>= word_bits - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= word_bits;
+ }
+
+ if (l_v != r_v)
+ return fls64(l_v ^ r_v) - 1 + nr_key_bits;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ word_bits = 64;
+ }
+
+ return 0;
+}
+
+/*
+ * First set bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bkey_ffs(const struct btree *b,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(&b->format, k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned ret = 0, offset;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ offset = nr_key_bits;
+ while (offset > 64) {
+ p = next_word(p);
+ offset -= 64;
+ }
+
+ offset = 64 - offset;
+
+ while (nr_key_bits) {
+ unsigned bits = nr_key_bits + offset < 64
+ ? nr_key_bits
+ : 64 - offset;
+
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if (*p & mask)
+ return ret + __ffs64(*p & mask) - offset;
+
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ ret += bits;
+ offset = 0;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ long d0, d1, d2, d3;
+ int cmp;
+
+	/* we shouldn't need asm for this, but gcc generates poor code for it: */
+
+ asm(".intel_syntax noprefix;"
+ "xor eax, eax;"
+ "xor edx, edx;"
+ "1:;"
+ "mov r8, [rdi];"
+ "mov r9, [rsi];"
+ "sub ecx, 64;"
+ "jl 2f;"
+
+ "cmp r8, r9;"
+ "jnz 3f;"
+
+ "lea rdi, [rdi - 8];"
+ "lea rsi, [rsi - 8];"
+ "jmp 1b;"
+
+ "2:;"
+ "not ecx;"
+ "shr r8, 1;"
+ "shr r9, 1;"
+ "shr r8, cl;"
+ "shr r9, cl;"
+ "cmp r8, r9;"
+
+ "3:\n"
+ "seta al;"
+ "setb dl;"
+ "sub eax, edx;"
+ ".att_syntax prefix;"
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+ : "0" (l), "1" (r), "3" (nr_key_bits)
+ : "r8", "r9", "cc", "memory");
+
+ return cmp;
+}
+
+#define I(_x) (*(out)++ = (_x))
+#define I1(i0) I(i0)
+#define I2(i0, i1) (I1(i0), I(i1))
+#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
+
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
+ enum bch_bkey_fields field,
+ unsigned dst_offset, unsigned dst_size,
+ bool *eax_zeroed)
+{
+ unsigned byte = format->key_u64s * sizeof(u64);
+ unsigned bits = format->bits_per_field[field];
+ u64 offset = format->field_offset[field];
+ unsigned i, bit_offset = 0;
+ unsigned shl, shr;
+
+ if (!bits && !offset) {
+ if (!*eax_zeroed) {
+ /* xor eax, eax */
+ I2(0x31, 0xc0);
+ }
+
+ *eax_zeroed = true;
+ goto set_field;
+ }
+
+ if (!bits) {
+ /* just return offset: */
+
+ switch (dst_size) {
+ case 8:
+ if (offset > S32_MAX) {
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+
+ I3(0xc7, 0x47, dst_offset + 4);
+ memcpy(out, (void *) &offset + 4, 4);
+ out += 4;
+ } else {
+ /* mov [rdi + dst_offset], offset */
+ /* sign extended */
+ I4(0x48, 0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+ }
+
+ for (i = 0; i <= field; i++)
+ bit_offset += format->bits_per_field[i];
+
+ byte -= DIV_ROUND_UP(bit_offset, 8);
+ bit_offset = round_up(bit_offset, 8) - bit_offset;
+
+ *eax_zeroed = false;
+
+ if (bit_offset == 0 && bits == 8) {
+ /* movzx eax, BYTE PTR [rsi + imm8] */
+ I4(0x0f, 0xb6, 0x46, byte);
+ } else if (bit_offset == 0 && bits == 16) {
+ /* movzx eax, WORD PTR [rsi + imm8] */
+ I4(0x0f, 0xb7, 0x46, byte);
+ } else if (bit_offset + bits <= 32) {
+ /* mov eax, [rsi + imm8] */
+ I3(0x8b, 0x46, byte);
+
+ if (bit_offset) {
+ /* shr eax, imm8 */
+ I3(0xc1, 0xe8, bit_offset);
+ }
+
+ if (bit_offset + bits < 32) {
+ unsigned mask = ~0U >> (32 - bits);
+
+ /* and eax, imm32 */
+ I1(0x25);
+ memcpy(out, &mask, 4);
+ out += 4;
+ }
+ } else if (bit_offset + bits <= 64) {
+ /* mov rax, [rsi + imm8] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ shl = 64 - bit_offset - bits;
+ shr = bit_offset + shl;
+
+ if (shl) {
+ /* shl rax, imm8 */
+ I4(0x48, 0xc1, 0xe0, shl);
+ }
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ } else {
+ /* mov rax, [rsi + byte] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ /* mov edx, [rsi + byte + 8] */
+ I3(0x8b, 0x56, byte + 8);
+
+ /* bits from next word: */
+ shr = bit_offset + bits - 64;
+ BUG_ON(shr > bit_offset);
+
+ /* shr rax, bit_offset */
+ I4(0x48, 0xc1, 0xe8, shr);
+
+ /* shl rdx, imm8 */
+ I4(0x48, 0xc1, 0xe2, 64 - shr);
+
+ /* or rax, rdx */
+ I3(0x48, 0x09, 0xd0);
+
+ shr = bit_offset - shr;
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ }
+
+ /* rax += offset: */
+ if (offset > S32_MAX) {
+ /* mov rdx, imm64 */
+ I2(0x48, 0xba);
+ memcpy(out, &offset, 8);
+ out += 8;
+ /* add %rdx, %rax */
+ I3(0x48, 0x01, 0xd0);
+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
+ /* add rax, imm32 */
+ I2(0x48, 0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ } else if (offset) {
+ /* add eax, imm32 */
+ I1(0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+set_field:
+ switch (dst_size) {
+ case 8:
+ /* mov [rdi + dst_offset], rax */
+ I4(0x48, 0x89, 0x47, dst_offset);
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], eax */
+ I3(0x89, 0x47, dst_offset);
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+}
+
+int bch_compile_bkey_format(const struct bkey_format *format, void *_out)
+{
+ bool eax_zeroed = false;
+ u8 *out = _out;
+
+ /*
+ * rdi: dst - unpacked key
+ * rsi: src - packed key
+ */
+
+ /* k->u64s, k->format, k->type */
+
+ /* mov eax, [rsi] */
+ I2(0x8b, 0x06);
+
+	/* add eax, imm32: BKEY_U64s - format->key_u64s, and fix up k->format */
+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
+
+ /* and eax, imm32: mask out k->pad: */
+ I5(0x25, 0xff, 0xff, 0xff, 0);
+
+ /* mov [rdi], eax */
+ I2(0x89, 0x07);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_INODE,
+ offsetof(struct bkey, p.inode), 8,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_OFFSET,
+ offsetof(struct bkey, p.offset), 8,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_SNAPSHOT,
+ offsetof(struct bkey, p.snapshot), 4,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_SIZE,
+ offsetof(struct bkey, size), 4,
+ &eax_zeroed);
+
+ out = compile_bkey_field(format, out, BKEY_FIELD_VERSION,
+ offsetof(struct bkey, version), 4,
+ &eax_zeroed);
+
+ /* retq */
+ I1(0xc3);
+
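+	/* size of the generated code, in bytes: */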
+ return (void *) out - _out;
+}
+
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ u64 l_v, r_v;
+
+ if (!nr_key_bits)
+ return 0;
+
+ /* for big endian, skip past header */
+ nr_key_bits += high_bit_offset;
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
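+	/*
+	 * Compare one 64-bit word at a time, starting from the word holding
+	 * the most significant key bits; the final (partial) word is shifted
+	 * down so only the remaining key bits take part:
+	 */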
+ while (1) {
+ if (nr_key_bits < 64) {
+ l_v >>= 64 - nr_key_bits;
+ r_v >>= 64 - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= 64;
+ }
+
+ if (l_v != r_v)
+ return l_v < r_v ? -1 : 1;
+
+ if (!nr_key_bits)
+ return 0;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ }
+}
+#endif
+
+/*
+ * Would like to use this if we can make __bkey_cmp_bits() fast enough; it
+ * would be a decent reduction in code size
+ */
+#if 0
+static int bkey_cmp_verify(const struct bkey *l, const struct bkey *r)
+{
+ if (l->p.inode != r->p.inode)
+ return l->p.inode < r->p.inode ? -1 : 1;
+
+ if (l->p.offset != r->p.offset)
+ return l->p.offset < r->p.offset ? -1 : 1;
+
+ if (l->p.snapshot != r->p.snapshot)
+ return l->p.snapshot < r->p.snapshot ? -1 : 1;
+
+ return 0;
+}
+
+int bkey_cmp(const struct bkey *l, const struct bkey *r)
+{
+ int ret;
+
+ EBUG_ON(bkey_packed(l) || bkey_packed(r));
+
+ ret = __bkey_cmp_bits((sizeof(l->inode) +
+ sizeof(l->offset) +
+ sizeof(l->snapshot)) * BITS_PER_BYTE,
+ __high_word(BKEY_U64s, l),
+ __high_word(BKEY_U64s, r));
+
+ BUG_ON(ret != bkey_cmp_verify(l, r));
+
+ return ret;
+}
+#endif
+
+__pure
+int __bkey_cmp_packed_format_checked(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ int ret;
+
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ ret = __bkey_cmp_bits(high_word(f, l),
+ high_word(f, r),
+ b->nr_key_bits);
+
+ EBUG_ON(ret != bkey_cmp(bkey_unpack_key_format_checked(b, l).p,
+ bkey_unpack_key_format_checked(b, r).p));
+ return ret;
+}
+
+__pure __flatten
+int __bkey_cmp_left_packed_format_checked(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+}
+
+__pure __flatten
+int __bkey_cmp_packed(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ int packed = bkey_lr_packed(l, r);
+
+ if (likely(packed == BKEY_PACKED_BOTH))
+ return __bkey_cmp_packed_format_checked(l, r, b);
+
+ switch (packed) {
+ case BKEY_PACKED_NONE:
+ return bkey_cmp(((struct bkey *) l)->p,
+ ((struct bkey *) r)->p);
+ case BKEY_PACKED_LEFT:
+ return __bkey_cmp_left_packed_format_checked(b,
+ (struct bkey_packed *) l,
+ &((struct bkey *) r)->p);
+ case BKEY_PACKED_RIGHT:
+ return -__bkey_cmp_left_packed_format_checked(b,
+ (struct bkey_packed *) r,
+ &((struct bkey *) l)->p);
+ default:
+ unreachable();
+ }
+}
+
+__pure __flatten
+int bkey_cmp_left_packed(const struct btree *b,
+ const struct bkey_packed *l, const struct bpos *r)
+{
+ const struct bkey *l_unpacked;
+
+ return unlikely(l_unpacked = packed_to_bkey_c(l))
+ ? bkey_cmp(l_unpacked->p, *r)
+ : __bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+void bch_bpos_swab(struct bpos *p)
+{
+ u8 *l = (u8 *) p;
+ u8 *h = ((u8 *) &p[1]) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+void bch_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
+{
+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch_bkey_format_current;
+ u8 *l = k->key_start;
+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+void bkey_pack_test(void)
+{
+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
+ struct bkey_packed p;
+
+ struct bkey_format test_format = {
+ .key_u64s = 2,
+ .nr_fields = 5,
+ .bits_per_field = {
+ 13,
+ 64,
+ },
+ };
+
+ struct unpack_state in_s =
+ unpack_state_init(&bch_bkey_format_current, (void *) &t);
+ struct pack_state out_s = pack_state_init(&test_format, &p);
+ unsigned i;
+
+ for (i = 0; i < out_s.format->nr_fields; i++) {
+ u64 a, v = get_inc_field(&in_s, i);
+
+ switch (i) {
+ case 0:
+ a = t.p.inode;
+ break;
+ case 1:
+ a = t.p.offset;
+ break;
+ case 2:
+ a = t.p.snapshot;
+ break;
+ case 3:
+ a = t.size;
+ break;
+ case 4:
+ a = t.version;
+ break;
+ default:
+ BUG();
+ }
+
+ if (a != v)
+ panic("got %llu actual %llu i %u\n", v, a, i);
+
+ if (!set_inc_field(&out_s, i, v))
+ panic("failed at %u\n", i);
+ }
+
+ BUG_ON(!bkey_pack_key(&p, &t, &test_format));
+}
+#endif
diff --git a/libbcache/bkey.h b/libbcache/bkey.h
new file mode 100644
index 0000000..3e29cdd
--- /dev/null
+++ b/libbcache/bkey.h
@@ -0,0 +1,596 @@
+#ifndef _BCACHE_BKEY_H
+#define _BCACHE_BKEY_H
+
+#include <linux/bug.h>
+#include <linux/bcache.h>
+
+#include "util.h"
+
+void bch_to_binary(char *, const u64 *, unsigned);
+int bch_bkey_to_text(char *, size_t, const struct bkey *);
+
+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_next(_k) \
+({ \
+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
+ !type_is(_k, struct bkey_i *) && \
+ !type_is(_k, struct bkey_packed *)); \
+ \
+ ((typeof(_k)) __bkey_idx(((struct bkey *) (_k)), \
+ ((struct bkey *) (_k))->u64s)); \
+})
+
+static inline unsigned bkey_val_u64s(const struct bkey *k)
+{
+ return k->u64s - BKEY_U64s;
+}
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ k->u64s = BKEY_U64s + val_u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+/*
+ * Mark a key as deleted without changing the size of the value (i.e. modifying
+ * keys in the btree in place)
+ */
+static inline void __set_bkey_deleted(struct bkey *k)
+{
+ k->type = KEY_TYPE_DELETED;
+}
+
+static inline void set_bkey_deleted(struct bkey *k)
+{
+ __set_bkey_deleted(k);
+ set_bkey_val_u64s(k, 0);
+}
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
+
+#define bkey_packed_typecheck(_k) \
+({ \
+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
+ !type_is(_k, struct bkey_packed *)); \
+ type_is(_k, struct bkey_packed *); \
+})
+
+enum bkey_lr_packed {
+ BKEY_PACKED_BOTH,
+ BKEY_PACKED_RIGHT,
+ BKEY_PACKED_LEFT,
+ BKEY_PACKED_NONE,
+};
+
+#define bkey_lr_packed_typecheck(_l, _r) \
+ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
+
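+/*
+ * Relies on the numeric values of the key format field lining up with
+ * enum bkey_lr_packed, so the two format bytes index the enum directly:
+ */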
+#define bkey_lr_packed(_l, _r) \
+ ((_l)->format + ((_r)->format << 1))
+
+#define bkey_copy(_dst, _src) \
+do { \
+ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
+ !type_is(_dst, struct bkey_packed *)); \
+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
+ !type_is(_src, struct bkey_packed *)); \
+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
+ (u64 *) (_dst) < (u64 *) (_src) + \
+ ((struct bkey *) (_src))->u64s); \
+ \
+ __memmove_u64s_down((_dst), (_src), \
+ ((struct bkey *) (_src))->u64s); \
+} while (0)
+
+struct btree;
+
+struct bkey_format_state {
+ u64 field_min[BKEY_NR_FIELDS];
+ u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch_bkey_format_init(struct bkey_format_state *);
+void bch_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
+void bch_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch_bkey_format_done(struct bkey_format_state *);
+const char *bch_bkey_format_validate(struct bkey_format *);
+
+__pure
+unsigned bkey_greatest_differing_bit(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+__pure
+unsigned bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bkey_cmp_packed_format_checked(const struct bkey_packed *,
+ const struct bkey_packed *,
+ const struct btree *);
+
+__pure
+int __bkey_cmp_left_packed_format_checked(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+__pure
+int __bkey_cmp_packed(const struct bkey_packed *,
+ const struct bkey_packed *,
+ const struct btree *);
+
+__pure
+int bkey_cmp_left_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+/*
+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to
+ * pass it by val... as much as I hate c++, const ref would be nice here:
+ */
+__pure __flatten
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
+ const struct bkey_packed *l,
+ struct bpos r)
+{
+ return bkey_cmp_left_packed(b, l, &r);
+}
+
+/*
+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
+ * skip dispatching on k->format:
+ */
+#define bkey_cmp_packed(_b, _l, _r) \
+({ \
+ int _cmp; \
+ \
+ switch (bkey_lr_packed_typecheck(_l, _r)) { \
+ case BKEY_PACKED_NONE: \
+ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \
+ ((struct bkey *) (_r))->p); \
+ break; \
+ case BKEY_PACKED_LEFT: \
+ _cmp = bkey_cmp_left_packed((_b), \
+ (struct bkey_packed *) (_l), \
+ &((struct bkey *) (_r))->p); \
+ break; \
+ case BKEY_PACKED_RIGHT: \
+ _cmp = -bkey_cmp_left_packed((_b), \
+ (struct bkey_packed *) (_r), \
+ &((struct bkey *) (_l))->p); \
+ break; \
+ case BKEY_PACKED_BOTH: \
+ _cmp = __bkey_cmp_packed((void *) (_l), \
+ (void *) (_r), (_b)); \
+ break; \
+ } \
+ _cmp; \
+})
+
+#if 1
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{
+ if (l.inode != r.inode)
+ return l.inode < r.inode ? -1 : 1;
+ if (l.offset != r.offset)
+ return l.offset < r.offset ? -1 : 1;
+ if (l.snapshot != r.snapshot)
+ return l.snapshot < r.snapshot ? -1 : 1;
+ return 0;
+}
+#else
+int bkey_cmp(struct bpos l, struct bpos r);
+#endif
+
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{
+ return bkey_cmp(l, r) < 0 ? l : r;
+}
+
+void bch_bpos_swab(struct bpos *);
+void bch_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+
+#ifdef CONFIG_BCACHE_DEBUG
+/* statement expressions confusing unlikely()? */
+#define bkey_packed(_k) \
+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
+ (_k)->format != KEY_FORMAT_CURRENT; })
+#else
+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
+#endif
+
+/*
+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse
+ */
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
+{
+ return (struct bkey_packed *) k;
+}
+
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
+{
+ return (const struct bkey_packed *) k;
+}
+
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (struct bkey_i *) k;
+}
+
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (const struct bkey *) k;
+}
+
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
+{
+ return format->bits_per_field[BKEY_FIELD_INODE] +
+ format->bits_per_field[BKEY_FIELD_OFFSET] +
+ format->bits_per_field[BKEY_FIELD_SNAPSHOT];
+}
+
+static inline struct bpos bkey_successor(struct bpos p)
+{
+ struct bpos ret = p;
+
+ if (!++ret.offset)
+ BUG_ON(!++ret.inode);
+
+ return ret;
+}
+
+static inline u64 bkey_start_offset(const struct bkey *k)
+{
+ return k->p.offset - k->size;
+}
+
+static inline struct bpos bkey_start_pos(const struct bkey *k)
+{
+ return (struct bpos) {
+ .inode = k->p.inode,
+ .offset = bkey_start_offset(k),
+ .snapshot = k->p.snapshot,
+ };
+}
+
+/* Packed helpers */
+
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+
+ EBUG_ON(k->u64s < ret);
+ return ret;
+}
+
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_key_u64s(format, k) * sizeof(u64);
+}
+
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return k->u64s - bkeyp_key_u64s(format, k);
+}
+
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_val_u64s(format, k) * sizeof(u64);
+}
+
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+ struct bkey_packed *k, unsigned val_u64s)
+{
+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
+}
+
+#define bkeyp_val(_format, _k) \
+ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+
+extern const struct bkey_format bch_bkey_format_current;
+
+bool bch_bkey_transform(const struct bkey_format *,
+ struct bkey_packed *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
+
+struct bkey __bkey_unpack_key(const struct bkey_format *,
+ const struct bkey_packed *);
+
+#ifndef HAVE_BCACHE_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
+ const struct bkey_packed *);
+#endif
+
+bool bkey_pack_key(struct bkey_packed *, const struct bkey *,
+ const struct bkey_format *);
+
+enum bkey_pack_pos_ret {
+ BKEY_PACK_POS_EXACT,
+ BKEY_PACK_POS_SMALLER,
+ BKEY_PACK_POS_FAIL,
+};
+
+enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
+ const struct btree *);
+
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
+ const struct btree *b)
+{
+ return bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
+}
+
+void bkey_unpack(const struct btree *, struct bkey_i *,
+ const struct bkey_packed *);
+bool bkey_pack(struct bkey_packed *, const struct bkey_i *,
+ const struct bkey_format *);
+
+static inline u64 bkey_field_max(const struct bkey_format *f,
+ enum bch_bkey_fields nr)
+{
+ return f->bits_per_field[nr] < 64
+ ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+ : U64_MAX;
+}
+
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHE_COMPILED_UNPACK 1
+
+int bch_compile_bkey_format(const struct bkey_format *, void *);
+
+#else
+
+static inline int bch_compile_bkey_format(const struct bkey_format *format,
+ void *out) { return 0; }
+
+#endif
+
+static inline void bkey_reassemble(struct bkey_i *dst,
+ struct bkey_s_c src)
+{
+ BUG_ON(bkey_packed(src.k));
+ dst->k = *src.k;
+ memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+}
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ _assert(k.k->type, nr); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ _assert(k.k->type, nr); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ _assert(k->k.type, nr); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bch_##name * \
+bkey_p_##name##_val(const struct bkey_format *f, \
+ struct bkey_packed *k) \
+{ \
+ return container_of(bkeyp_val(f, k), struct bch_##name, v); \
+} \
+ \
+static inline const struct bch_##name * \
+bkey_p_c_##name##_val(const struct bkey_format *f, \
+ const struct bkey_packed *k) \
+{ \
+ return container_of(bkeyp_val(f, k), struct bch_##name, v); \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = nr; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr)
+
+#define BKEY_VAL_ACCESSORS(name, _nr) \
+ static inline void __bch_##name##_assert(u8 type, u8 nr) \
+ { \
+ EBUG_ON(type != _nr); \
+ } \
+ \
+ __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
+
+BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE);
+
+static inline void __bch_extent_assert(u8 type, u8 nr)
+{
+ EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
+}
+
+__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch_extent_assert);
+
+BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
+BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
+
+BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
+
+BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
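+
+/*
+ * Each BKEY_VAL_ACCESSORS() invocation above generates the split value types
+ * (e.g. struct bkey_s_c_dirent) plus conversion helpers such as
+ * bkey_i_to_dirent(), bkey_s_c_to_dirent(), dirent_i_to_s_c() and
+ * bkey_dirent_init().
+ */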
+
+/* byte order helpers */
+
+#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
+#error edit for your odd byteorder.
+#endif
+
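+/*
+ * high_word() points at the u64 holding the most significant bits of a packed
+ * key; next_word()/prev_word() step towards less/more significant words, so
+ * key comparison code can walk packed keys without caring about byte order.
+ */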
+#ifdef __LITTLE_ENDIAN
+
+#define high_bit_offset 0
+#define __high_word(u64s, k) ((k)->_data + (u64s) - 1)
+#define nth_word(p, n) ((p) - (n))
+
+#else
+
+#define high_bit_offset KEY_PACKED_BITS_START
+#define __high_word(u64s, k) ((k)->_data)
+#define nth_word(p, n) ((p) + (n))
+
+#endif
+
+#define high_word(format, k) __high_word((format)->key_u64s, k)
+#define next_word(p) nth_word(p, 1)
+#define prev_word(p) nth_word(p, -1)
+
+#ifdef CONFIG_BCACHE_DEBUG
+void bkey_pack_test(void);
+#else
+static inline void bkey_pack_test(void) {}
+#endif
+
+#endif /* _BCACHE_BKEY_H */
diff --git a/libbcache/bkey_methods.c b/libbcache/bkey_methods.c
new file mode 100644
index 0000000..3bcd0e0
--- /dev/null
+++ b/libbcache/bkey_methods.c
@@ -0,0 +1,117 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "xattr.h"
+
+const struct bkey_ops *bch_bkey_ops[] = {
+ [BKEY_TYPE_EXTENTS] = &bch_bkey_extent_ops,
+ [BKEY_TYPE_INODES] = &bch_bkey_inode_ops,
+ [BKEY_TYPE_DIRENTS] = &bch_bkey_dirent_ops,
+ [BKEY_TYPE_XATTRS] = &bch_bkey_xattr_ops,
+ [BKEY_TYPE_BTREE] = &bch_bkey_btree_ops,
+};
+
+/* Returns string indicating reason for being invalid, or NULL if valid: */
+const char *bkey_invalid(struct cache_set *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+
+ if (k.k->u64s < BKEY_U64s)
+ return "u64s too small";
+
+ if (k.k->size &&
+ (bkey_deleted(k.k) || !ops->is_extents))
+ return "nonzero size field";
+
+ switch (k.k->type) {
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_DISCARD:
+ return NULL;
+
+ case KEY_TYPE_ERROR:
+ return bkey_val_bytes(k.k) != 0
+ ? "value size should be zero"
+ : NULL;
+
+ case KEY_TYPE_COOKIE:
+ return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
+ ? "incorrect value size"
+ : NULL;
+
+ default:
+ if (k.k->type < KEY_TYPE_GENERIC_NR)
+ return "invalid type";
+
+ return ops->key_invalid(c, k);
+ }
+}
+
+const char *btree_bkey_invalid(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+ return "key before start of btree node";
+
+ if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+ return "key past end of btree node";
+
+ if (k.k->p.snapshot)
+ return "nonzero snapshot";
+
+ return bkey_invalid(c, btree_node_type(b), k);
+}
+
+void bkey_debugcheck(struct cache_set *c, struct btree *b, struct bkey_s_c k)
+{
+ enum bkey_type type = btree_node_type(b);
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+ const char *invalid;
+
+ BUG_ON(!k.k->u64s);
+
+ invalid = btree_bkey_invalid(c, b, k);
+ if (invalid) {
+ char buf[160];
+
+ bch_bkey_val_to_text(c, type, buf, sizeof(buf), k);
+ cache_set_bug(c, "invalid bkey %s: %s", buf, invalid);
+ return;
+ }
+
+ if (k.k->type >= KEY_TYPE_GENERIC_NR &&
+ ops->key_debugcheck)
+ ops->key_debugcheck(c, b, k);
+}
+
+void bch_bkey_val_to_text(struct cache_set *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+ char *out = buf, *end = buf + size;
+
+ out += bch_bkey_to_text(out, end - out, k.k);
+
+ if (k.k->type >= KEY_TYPE_GENERIC_NR &&
+ ops->val_to_text) {
+ out += scnprintf(out, end - out, " -> ");
+ ops->val_to_text(c, out, end - out, k);
+ }
+}
+
+void bch_bkey_swab(enum bkey_type type,
+ const struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ const struct bkey_ops *ops = bch_bkey_ops[type];
+
+ bch_bkey_swab_key(f, k);
+
+ if (ops->swab)
+ ops->swab(f, k);
+}
diff --git a/libbcache/bkey_methods.h b/libbcache/bkey_methods.h
new file mode 100644
index 0000000..0e305eb
--- /dev/null
+++ b/libbcache/bkey_methods.h
@@ -0,0 +1,80 @@
+#ifndef _BCACHE_BKEY_METHODS_H
+#define _BCACHE_BKEY_METHODS_H
+
+#include "bkey.h"
+
+#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
+
+enum bkey_type {
+ DEFINE_BCH_BTREE_IDS()
+ BKEY_TYPE_BTREE,
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
+{
+ return level ? BKEY_TYPE_BTREE : id;
+}
+
+static inline bool btree_type_has_ptrs(enum bkey_type type)
+{
+ switch (type) {
+ case BKEY_TYPE_BTREE:
+ case BKEY_TYPE_EXTENTS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct cache_set;
+struct btree;
+struct bkey;
+
+enum merge_result {
+ BCH_MERGE_NOMERGE,
+
+ /*
+ * The keys were mergeable, but would have overflowed size - so instead
+ * l was changed to the maximum size, and both keys were modified:
+ */
+ BCH_MERGE_PARTIAL,
+ BCH_MERGE_MERGE,
+};
+
+typedef bool (*key_filter_fn)(struct cache_set *, struct btree *,
+ struct bkey_s);
+typedef enum merge_result (*key_merge_fn)(struct cache_set *,
+ struct btree *,
+ struct bkey_i *, struct bkey_i *);
+
+struct bkey_ops {
+ /* Returns reason for being invalid if invalid, else NULL: */
+ const char * (*key_invalid)(const struct cache_set *,
+ struct bkey_s_c);
+ void (*key_debugcheck)(struct cache_set *, struct btree *,
+ struct bkey_s_c);
+ void (*val_to_text)(struct cache_set *, char *,
+ size_t, struct bkey_s_c);
+ void (*swab)(const struct bkey_format *, struct bkey_packed *);
+ key_filter_fn key_normalize;
+ key_merge_fn key_merge;
+ bool is_extents;
+};
+
+const char *bkey_invalid(struct cache_set *, enum bkey_type, struct bkey_s_c);
+const char *btree_bkey_invalid(struct cache_set *, struct btree *,
+ struct bkey_s_c);
+
+void bkey_debugcheck(struct cache_set *, struct btree *, struct bkey_s_c);
+void bch_bkey_val_to_text(struct cache_set *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
+
+void bch_bkey_swab(enum bkey_type, const struct bkey_format *,
+ struct bkey_packed *);
+
+extern const struct bkey_ops *bch_bkey_ops[];
+
+#undef DEF_BTREE_ID
+
+#endif /* _BCACHE_BKEY_METHODS_H */
diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c
new file mode 100644
index 0000000..cd231f5
--- /dev/null
+++ b/libbcache/blockdev.c
@@ -0,0 +1,824 @@
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "btree_iter.h"
+#include "checksum.h"
+#include "error.h"
+#include "inode.h"
+#include "request.h"
+#include "super.h"
+#include "writeback.h"
+
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+static int bch_blockdev_major;
+static DEFINE_IDA(bch_blockdev_minor);
+static LIST_HEAD(uncached_devices);
+struct kmem_cache *bch_search_cache;
+
+static void write_bdev_super_endio(struct bio *bio)
+{
+ struct cached_dev *dc = bio->bi_private;
+ /* XXX: error checking */
+
+ closure_put(&dc->sb_write);
+}
+
+static void bch_write_bdev_super_unlock(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+
+ up(&dc->sb_write_mutex);
+}
+
+void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
+{
+ struct backingdev_sb *sb = dc->disk_sb.sb;
+ struct closure *cl = &dc->sb_write;
+ struct bio *bio = dc->disk_sb.bio;
+
+ down(&dc->sb_write_mutex);
+ closure_init(cl, parent);
+
+ bio_reset(bio);
+ bio->bi_end_io = write_bdev_super_endio;
+ bio->bi_private = dc;
+
+ closure_get(cl);
+
+ sb->csum = cpu_to_le64(__csum_set(sb, 0, BCH_CSUM_CRC64));
+ __write_super(dc->disk.c, (void *) &dc->disk_sb);
+
+ closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
+}
+
+bool bch_is_open_backing_dev(struct block_device *bdev)
+{
+ struct cache_set *c, *tc;
+ struct cached_dev *dc, *t;
+
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+ list_for_each_entry_safe(dc, t, &c->cached_devs, list)
+ if (dc->disk_sb.bdev == bdev)
+ return true;
+ list_for_each_entry_safe(dc, t, &uncached_devices, list)
+ if (dc->disk_sb.bdev == bdev)
+ return true;
+ return false;
+}
+
+static int open_dev(struct block_device *b, fmode_t mode)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+
+ if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
+ return -ENXIO;
+
+ closure_get(&d->cl);
+ return 0;
+}
+
+static void release_dev(struct gendisk *b, fmode_t mode)
+{
+ struct bcache_device *d = b->private_data;
+
+ closure_put(&d->cl);
+}
+
+static int ioctl_dev(struct block_device *b, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+
+ return d->ioctl(d, mode, cmd, arg);
+}
+
+static const struct block_device_operations bcache_ops = {
+ .open = open_dev,
+ .release = release_dev,
+ .ioctl = ioctl_dev,
+ .owner = THIS_MODULE,
+};
+
+void bch_blockdev_stop(struct bcache_device *d)
+{
+ if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
+ closure_queue(&d->cl);
+}
+
+static void bcache_device_unlink(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
+ sysfs_remove_link(&d->c->kobj, d->name);
+ sysfs_remove_link(&d->kobj, "cache");
+ }
+}
+
+static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
+ const char *name)
+{
+ snprintf(d->name, BCACHEDEVNAME_SIZE,
+ "%s%llu", name, bcache_dev_inum(d));
+
+ WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
+ sysfs_create_link(&c->kobj, &d->kobj, d->name),
+ "Couldn't create device <-> cache set symlinks");
+
+ clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
+}
+
+static void bcache_device_detach(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
+ mutex_lock(&d->inode_lock);
+ bch_inode_rm(d->c, bcache_dev_inum(d));
+ mutex_unlock(&d->inode_lock);
+ }
+
+ bcache_device_unlink(d);
+
+ radix_tree_delete(&d->c->devices, bcache_dev_inum(d));
+
+ closure_put(&d->c->caching);
+ d->c = NULL;
+}
+
+static int bcache_device_attach(struct bcache_device *d, struct cache_set *c)
+{
+ int ret;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d);
+ if (ret) {
+ pr_err("radix_tree_insert() error for inum %llu",
+ bcache_dev_inum(d));
+ return ret;
+ }
+
+ d->c = c;
+ closure_get(&c->caching);
+
+ return ret;
+}
+
+static void bcache_device_free(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ pr_info("%s stopped", d->disk->disk_name);
+
+ if (d->c)
+ bcache_device_detach(d);
+ if (d->disk && d->disk->flags & GENHD_FL_UP)
+ del_gendisk(d->disk);
+ if (d->disk && d->disk->queue)
+ blk_cleanup_queue(d->disk->queue);
+ if (d->disk) {
+ ida_simple_remove(&bch_blockdev_minor, d->disk->first_minor);
+ put_disk(d->disk);
+ }
+
+ bioset_exit(&d->bio_split);
+
+ closure_debug_destroy(&d->cl);
+}
+
+static int bcache_device_init(struct bcache_device *d, unsigned block_size,
+ sector_t sectors)
+{
+ struct request_queue *q;
+ int minor;
+
+ mutex_init(&d->inode_lock);
+
+ minor = ida_simple_get(&bch_blockdev_minor, 0, MINORMASK + 1, GFP_KERNEL);
+ if (minor < 0) {
+ pr_err("cannot allocate minor");
+ return minor;
+ }
+
+ if (!(d->disk = alloc_disk(1)) ||
+ bioset_init(&d->bio_split, 4, offsetof(struct bch_read_bio, bio))) {
+ pr_err("cannot allocate disk");
+ ida_simple_remove(&bch_blockdev_minor, minor);
+ return -ENOMEM;
+ }
+
+ set_capacity(d->disk, sectors);
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
+
+ d->disk->major = bch_blockdev_major;
+ d->disk->first_minor = minor;
+ d->disk->fops = &bcache_ops;
+ d->disk->private_data = d;
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q) {
+ pr_err("cannot allocate queue");
+ return -ENOMEM;
+ }
+
+ blk_queue_make_request(q, NULL);
+ d->disk->queue = q;
+ q->queuedata = d;
+ q->backing_dev_info.congested_data = d;
+ q->limits.max_hw_sectors = UINT_MAX;
+ q->limits.max_sectors = UINT_MAX;
+ q->limits.max_segment_size = UINT_MAX;
+ q->limits.max_segments = BIO_MAX_PAGES;
+ blk_queue_max_discard_sectors(q, UINT_MAX);
+ q->limits.discard_granularity = 512;
+ q->limits.io_min = block_size;
+ q->limits.logical_block_size = block_size;
+ q->limits.physical_block_size = block_size;
+ set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
+ clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
+ set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
+
+ blk_queue_write_cache(q, true, true);
+
+ return 0;
+}
+
+/* Cached device */
+
+static void calc_cached_dev_sectors(struct cache_set *c)
+{
+ u64 sectors = 0;
+ struct cached_dev *dc;
+
+ list_for_each_entry(dc, &c->cached_devs, list)
+ sectors += bdev_sectors(dc->disk_sb.bdev);
+
+ c->cached_dev_sectors = sectors;
+}
+
+void bch_cached_dev_run(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+ char buf[SB_LABEL_SIZE + 1];
+ char *env[] = {
+ "DRIVER=bcache",
+ kasprintf(GFP_KERNEL, "CACHED_UUID=%pU",
+ dc->disk_sb.sb->disk_uuid.b),
+ NULL,
+ NULL,
+ };
+
+ memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
+ buf[SB_LABEL_SIZE] = '\0';
+ env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
+
+ if (atomic_xchg(&dc->running, 1)) {
+ kfree(env[1]);
+ kfree(env[2]);
+ return;
+ }
+
+ if (!d->c &&
+ BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_NONE) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_STALE);
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ }
+
+ add_disk(d->disk);
+ bd_link_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
+ /* won't show up in the uevent file, use udevadm monitor -e instead
+ * only class / kset properties are persistent */
+ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
+ kfree(env[1]);
+ kfree(env[2]);
+
+ if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
+ pr_debug("error creating sysfs link");
+}
+
+static void cached_dev_detach_finish(struct work_struct *w)
+{
+ struct cached_dev *dc = container_of(w, struct cached_dev, detach);
+ char buf[BDEVNAME_SIZE];
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
+ BUG_ON(atomic_read(&dc->count));
+
+ mutex_lock(&bch_register_lock);
+
+ memset(&dc->disk_sb.sb->set_uuid, 0, 16);
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE);
+
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+
+ bcache_device_detach(&dc->disk);
+ list_move(&dc->list, &uncached_devices);
+
+ clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+ clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
+
+ mutex_unlock(&bch_register_lock);
+
+ pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf));
+
+ /* Drop ref we took in cached_dev_detach() */
+ closure_put(&dc->disk.cl);
+}
+
+void bch_cached_dev_detach(struct cached_dev *dc)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
+ return;
+
+ if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ return;
+
+ /*
+ * Block the device from being closed and freed until we're finished
+ * detaching
+ */
+ closure_get(&dc->disk.cl);
+
+ dc->writeback_pd.rate.rate = UINT_MAX;
+ bch_writeback_queue(dc);
+ cached_dev_put(dc);
+}
+
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+{
+ __le64 rtime = cpu_to_le64(ktime_get_seconds());
+ char buf[BDEVNAME_SIZE];
+ bool found;
+ int ret;
+
+ bdevname(dc->disk_sb.bdev, buf);
+
+ if (memcmp(&dc->disk_sb.sb->set_uuid,
+ &c->disk_sb.set_uuid,
+ sizeof(c->disk_sb.set_uuid)))
+ return -ENOENT;
+
+ if (dc->disk.c) {
+ pr_err("Can't attach %s: already attached", buf);
+ return -EINVAL;
+ }
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return 0;
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
+ pr_err("Can't attach %s: shutting down", buf);
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(dc->disk_sb.sb->block_size) < c->sb.block_size) {
+ /* Will die */
+ pr_err("Couldn't attach %s: block size less than set's block size",
+ buf);
+ return -EINVAL;
+ }
+
+ found = !bch_cached_dev_inode_find_by_uuid(c,
+ &dc->disk_sb.sb->disk_uuid,
+ &dc->disk.inode);
+
+ if (!found && BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
+ pr_err("Couldn't find uuid for %s in set", buf);
+ return -ENOENT;
+ }
+
+ if (found &&
+ (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE ||
+ BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE)) {
+ found = false;
+ bch_inode_rm(c, bcache_dev_inum(&dc->disk));
+ }
+
+ /* Deadlocks since we're called via sysfs...
+ sysfs_remove_file(&dc->kobj, &sysfs_attach);
+ */
+
+ if (!found) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ bkey_inode_blockdev_init(&dc->disk.inode.k_i);
+ dc->disk.inode.k.type = BCH_INODE_BLOCKDEV;
+ SET_CACHED_DEV(&dc->disk.inode.v, true);
+ dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid;
+ memcpy(dc->disk.inode.v.i_label,
+ dc->disk_sb.sb->label, SB_LABEL_SIZE);
+ dc->disk.inode.v.i_ctime = rtime;
+ dc->disk.inode.v.i_mtime = rtime;
+
+ ret = bch_inode_create(c, &dc->disk.inode.k_i,
+ 0, BLOCKDEV_INODE_MAX,
+ &c->unused_inode_hint);
+ if (ret) {
+ pr_err("Error %d, not caching %s", ret, buf);
+ return ret;
+ }
+
+ pr_info("attached inode %llu", bcache_dev_inum(&dc->disk));
+
+ dc->disk_sb.sb->set_uuid = c->disk_sb.set_uuid;
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
+
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ } else {
+ dc->disk.inode.v.i_mtime = rtime;
+ bch_inode_update(c, &dc->disk.inode.k_i, NULL);
+ }
+
+ /* Count dirty sectors before attaching */
+ if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY)
+ bch_sectors_dirty_init(dc, c);
+
+ ret = bcache_device_attach(&dc->disk, c);
+ if (ret)
+ return ret;
+
+ list_move(&dc->list, &c->cached_devs);
+ calc_cached_dev_sectors(c);
+
+ /*
+ * dc->c must be set before dc->count != 0 - paired with the mb in
+ * cached_dev_get()
+ */
+ smp_wmb();
+ atomic_set(&dc->count, 1);
+
+ if (bch_cached_dev_writeback_start(dc))
+ return -ENOMEM;
+
+ if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
+ atomic_set(&dc->has_dirty, 1);
+ atomic_inc(&dc->count);
+ }
+
+ bch_cached_dev_run(dc);
+ bcache_device_link(&dc->disk, c, "bdev");
+
+ pr_info("Caching %s as %s on set %pU",
+ bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name,
+ dc->disk.c->disk_sb.set_uuid.b);
+ return 0;
+}
+
+void bch_attach_backing_devs(struct cache_set *c)
+{
+ struct cached_dev *dc, *t;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ list_for_each_entry_safe(dc, t, &uncached_devices, list)
+ bch_cached_dev_attach(dc, c);
+}
+
+void bch_cached_dev_release(struct kobject *kobj)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ kfree(dc);
+ module_put(THIS_MODULE);
+}
+
+static void cached_dev_free(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+
+ bch_cached_dev_writeback_stop(dc);
+ bch_cached_dev_writeback_free(dc);
+
+ mutex_lock(&bch_register_lock);
+
+ if (atomic_read(&dc->running))
+ bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
+ bcache_device_free(&dc->disk);
+ list_del(&dc->list);
+
+ mutex_unlock(&bch_register_lock);
+
+ free_super((void *) &dc->disk_sb);
+
+ kobject_put(&dc->disk.kobj);
+}
+
+static void cached_dev_flush(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ struct bcache_device *d = &dc->disk;
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_unlink(d);
+ mutex_unlock(&bch_register_lock);
+
+ bch_cache_accounting_destroy(&dc->accounting);
+ kobject_del(&d->kobj);
+
+ continue_at(cl, cached_dev_free, system_wq);
+}
+
+static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+{
+ int ret;
+ struct io *io;
+ struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
+
+ dc->sequential_cutoff = 4 << 20;
+
+ for (io = dc->io; io < dc->io + RECENT_IO; io++) {
+ list_add(&io->lru, &dc->io_lru);
+ hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
+ }
+
+ dc->disk.stripe_size = q->limits.io_opt >> 9;
+
+ if (dc->disk.stripe_size)
+ dc->partial_stripes_expensive =
+ q->limits.raid_partial_stripes_expensive;
+
+ ret = bcache_device_init(&dc->disk, block_size,
+ dc->disk_sb.bdev->bd_part->nr_sects -
+ le64_to_cpu(dc->disk_sb.sb->data_offset));
+ if (ret)
+ return ret;
+
+ dc->disk.disk->queue->backing_dev_info.ra_pages =
+ max(dc->disk.disk->queue->backing_dev_info.ra_pages,
+ q->backing_dev_info.ra_pages);
+
+ bch_cached_dev_request_init(dc);
+ ret = bch_cached_dev_writeback_init(dc);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/* Cached device - bcache superblock */
+
+static const char *bdev_validate_super(struct backingdev_sb *sb)
+{
+ switch (le64_to_cpu(sb->version)) {
+ case BCACHE_SB_VERSION_BDEV:
+ sb->data_offset = cpu_to_le64(BDEV_DATA_START_DEFAULT);
+ break;
+ case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+ if (le64_to_cpu(sb->data_offset) < BDEV_DATA_START_DEFAULT)
+ return "Bad data offset";
+
+ break;
+ default:
+		return "Unsupported superblock version";
+ }
+
+ sb->last_mount = cpu_to_le32(get_seconds());
+
+ return NULL;
+}
+
+const char *bch_backing_dev_register(struct bcache_superblock *sb)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err;
+ struct cache_set *c;
+ struct cached_dev *dc;
+
+ dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+ if (!dc)
+ return "cannot allocate memory";
+
+ __module_get(THIS_MODULE);
+ INIT_LIST_HEAD(&dc->list);
+ closure_init(&dc->disk.cl, NULL);
+ set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+ kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
+ INIT_WORK(&dc->detach, cached_dev_detach_finish);
+ sema_init(&dc->sb_write_mutex, 1);
+ INIT_LIST_HEAD(&dc->io_lru);
+ spin_lock_init(&dc->io_lock);
+ bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
+
+ memcpy(&dc->disk_sb, sb, sizeof(*sb));
+ dc->disk_sb.bdev->bd_holder = dc;
+ memset(sb, 0, sizeof(*sb));
+
+ err = bdev_validate_super(dc->disk_sb.sb);
+ if (err)
+ goto err;
+
+ if (cached_dev_init(dc, le16_to_cpu(dc->disk_sb.sb->block_size) << 9))
+ goto err;
+
+ err = "error creating kobject";
+ if (kobject_add(&dc->disk.kobj,
+ &part_to_dev(dc->disk_sb.bdev->bd_part)->kobj,
+ "bcache"))
+ goto err;
+
+ err = "error accounting kobject";
+ if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
+ goto err;
+
+ pr_info("registered backing device %s",
+ bdevname(dc->disk_sb.bdev, name));
+
+ list_add(&dc->list, &uncached_devices);
+ list_for_each_entry(c, &bch_cache_sets, list)
+ bch_cached_dev_attach(dc, c);
+
+ if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE ||
+ BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE)
+ bch_cached_dev_run(dc);
+
+ return NULL;
+err:
+ bch_blockdev_stop(&dc->disk);
+ return err;
+}
+
+/* Flash only volumes */
+
+void bch_blockdev_volume_release(struct kobject *kobj)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ kfree(d);
+}
+
+static void blockdev_volume_free(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_free(d);
+ mutex_unlock(&bch_register_lock);
+ kobject_put(&d->kobj);
+}
+
+static void blockdev_volume_flush(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_unlink(d);
+ mutex_unlock(&bch_register_lock);
+ kobject_del(&d->kobj);
+ continue_at(cl, blockdev_volume_free, system_wq);
+}
+
+static int blockdev_volume_run(struct cache_set *c,
+ struct bkey_s_c_inode_blockdev inode)
+{
+ struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
+ GFP_KERNEL);
+ int ret = -ENOMEM;
+
+ if (!d)
+ return ret;
+
+ bkey_reassemble(&d->inode.k_i, inode.s_c);
+
+ closure_init(&d->cl, NULL);
+ set_closure_fn(&d->cl, blockdev_volume_flush, system_wq);
+
+ kobject_init(&d->kobj, &bch_blockdev_volume_ktype);
+
+ ret = bcache_device_init(d, block_bytes(c),
+ le64_to_cpu(inode.v->i_size) >> 9);
+ if (ret)
+ goto err;
+
+ ret = bcache_device_attach(d, c);
+ if (ret)
+ goto err;
+
+ bch_blockdev_volume_request_init(d);
+ add_disk(d->disk);
+
+ if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+ goto err;
+
+ bcache_device_link(d, c, "volume");
+
+ return 0;
+err:
+ kobject_put(&d->kobj);
+ return ret;
+}
+
+int bch_blockdev_volumes_start(struct cache_set *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_inode_blockdev inode;
+ int ret = 0;
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags))
+ return -EINVAL;
+
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
+ if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+ break;
+
+ if (k.k->type != BCH_INODE_BLOCKDEV)
+ continue;
+
+ inode = bkey_s_c_to_inode_blockdev(k);
+
+ ret = blockdev_volume_run(c, inode);
+ if (ret)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+int bch_blockdev_volume_create(struct cache_set *c, u64 size)
+{
+ __le64 rtime = cpu_to_le64(ktime_get_seconds());
+ struct bkey_i_inode_blockdev inode;
+ int ret;
+
+ bkey_inode_blockdev_init(&inode.k_i);
+ get_random_bytes(&inode.v.i_uuid, sizeof(inode.v.i_uuid));
+ inode.v.i_ctime = rtime;
+ inode.v.i_mtime = rtime;
+ inode.v.i_size = cpu_to_le64(size);
+
+ ret = bch_inode_create(c, &inode.k_i, 0, BLOCKDEV_INODE_MAX,
+ &c->unused_inode_hint);
+ if (ret) {
+ pr_err("Can't create volume: %d", ret);
+ return ret;
+ }
+
+ return blockdev_volume_run(c, inode_blockdev_i_to_s_c(&inode));
+}
+
+void bch_blockdevs_stop(struct cache_set *c)
+{
+ struct cached_dev *dc;
+ struct bcache_device *d;
+ struct radix_tree_iter iter;
+ void **slot;
+
+ mutex_lock(&bch_register_lock);
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
+ d = radix_tree_deref_slot(slot);
+
+ if (CACHED_DEV(&d->inode.v) &&
+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+ dc = container_of(d, struct cached_dev, disk);
+ bch_cached_dev_detach(dc);
+ } else {
+ bch_blockdev_stop(d);
+ }
+ }
+
+ rcu_read_unlock();
+ mutex_unlock(&bch_register_lock);
+}
+
+void bch_blockdev_exit(void)
+{
+ kmem_cache_destroy(bch_search_cache);
+
+ if (bch_blockdev_major >= 0)
+ unregister_blkdev(bch_blockdev_major, "bcache");
+}
+
+int __init bch_blockdev_init(void)
+{
+ bch_blockdev_major = register_blkdev(0, "bcache");
+ if (bch_blockdev_major < 0)
+ return bch_blockdev_major;
+
+ bch_search_cache = KMEM_CACHE(search, 0);
+ if (!bch_search_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/libbcache/blockdev.h b/libbcache/blockdev.h
new file mode 100644
index 0000000..0fc0ed1
--- /dev/null
+++ b/libbcache/blockdev.h
@@ -0,0 +1,99 @@
+#ifndef _BCACHE_BLOCKDEV_H
+#define _BCACHE_BLOCKDEV_H
+
+#include "blockdev_types.h"
+#include "io_types.h"
+
+void bch_write_bdev_super(struct cached_dev *, struct closure *);
+
+void bch_cached_dev_release(struct kobject *);
+void bch_blockdev_volume_release(struct kobject *);
+
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+void bch_attach_backing_devs(struct cache_set *);
+
+void bch_cached_dev_detach(struct cached_dev *);
+void bch_cached_dev_run(struct cached_dev *);
+void bch_blockdev_stop(struct bcache_device *);
+
+bool bch_is_open_backing_dev(struct block_device *);
+const char *bch_backing_dev_register(struct bcache_superblock *);
+
+int bch_blockdev_volume_create(struct cache_set *, u64);
+int bch_blockdev_volumes_start(struct cache_set *);
+
+void bch_blockdevs_stop(struct cache_set *);
+
+void bch_blockdev_exit(void);
+int bch_blockdev_init(void);
+
+static inline void cached_dev_put(struct cached_dev *dc)
+{
+ if (atomic_dec_and_test(&dc->count))
+ schedule_work(&dc->detach);
+}
+
+static inline bool cached_dev_get(struct cached_dev *dc)
+{
+ if (!atomic_inc_not_zero(&dc->count))
+ return false;
+
+ /* Paired with the mb in cached_dev_attach */
+ smp_mb__after_atomic();
+ return true;
+}
+
+static inline u64 bcache_dev_inum(struct bcache_device *d)
+{
+ return d->inode.k.p.inode;
+}
+
+static inline struct bcache_device *bch_dev_find(struct cache_set *c, u64 inode)
+{
+ return radix_tree_lookup(&c->devices, inode);
+}
+
+struct search {
+ /* Stack frame for bio_complete */
+ struct closure cl;
+
+ union {
+ struct bch_read_bio rbio;
+ struct bch_write_bio wbio;
+ };
+ /* Not modified */
+ struct bio *orig_bio;
+ struct bcache_device *d;
+
+ unsigned inode;
+ unsigned write:1;
+
+ /* Flags only used for reads */
+ unsigned recoverable:1;
+ unsigned read_dirty_data:1;
+ unsigned cache_miss:1;
+
+ /*
+ * For reads: bypass read from cache and insertion into cache
+ * For writes: discard key range from cache, sending the write to
+ * the backing device (if there is a backing device)
+ */
+ unsigned bypass:1;
+
+ unsigned long start_time;
+
+ /*
+ * Mostly only used for writes. For reads, we still make use of
+ * some trivial fields:
+ * - c
+ * - error
+ */
+ struct bch_write_op iop;
+};
+
+extern struct kmem_cache *bch_search_cache;
+
+extern struct kobj_type bch_cached_dev_ktype;
+extern struct kobj_type bch_blockdev_volume_ktype;
+
+#endif /* _BCACHE_BLOCKDEV_H */
diff --git a/libbcache/blockdev_types.h b/libbcache/blockdev_types.h
new file mode 100644
index 0000000..3254917
--- /dev/null
+++ b/libbcache/blockdev_types.h
@@ -0,0 +1,123 @@
+#ifndef _BCACHE_BLOCKDEV_TYPES_H
+#define _BCACHE_BLOCKDEV_TYPES_H
+
+#include "keybuf_types.h"
+#include "stats_types.h"
+#include "super_types.h"
+#include "util.h"
+
+struct bcache_device {
+ struct closure cl;
+
+ struct kobject kobj;
+
+ struct cache_set *c;
+
+ struct rb_node node;
+ struct bkey_i_inode_blockdev inode;
+ struct mutex inode_lock;
+
+#define BCACHEDEVNAME_SIZE 12
+ char name[BCACHEDEVNAME_SIZE];
+
+ struct gendisk *disk;
+
+ unsigned long flags;
+#define BCACHE_DEV_CLOSING 0
+#define BCACHE_DEV_DETACHING 1
+#define BCACHE_DEV_UNLINK_DONE 2
+
+ unsigned nr_stripes;
+ unsigned stripe_size;
+ atomic_t *stripe_sectors_dirty;
+ unsigned long *full_dirty_stripes;
+
+ struct bio_set bio_split;
+
+ unsigned data_csum:1;
+
+ int (*ioctl)(struct bcache_device *, fmode_t, unsigned, unsigned long);
+};
+
+struct io {
+ /* Used to track sequential IO so it can be skipped */
+ struct hlist_node hash;
+ struct list_head lru;
+
+ unsigned long last_io;
+ unsigned sequential;
+ sector_t last;
+};
+
+struct cached_dev {
+ struct list_head list;
+ struct bcache_device disk;
+
+ //struct backingdev_sb sb;
+
+ struct {
+ struct backingdev_sb *sb;
+ struct block_device *bdev;
+ struct bio *bio;
+ unsigned page_order;
+ } disk_sb;
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
+
+ /* Refcount on the cache set. Always nonzero when we're caching. */
+ atomic_t count;
+ struct work_struct detach;
+
+ /*
+ * Device might not be running if it's dirty and the cache set hasn't
+	 * shown up yet.
+ */
+ atomic_t running;
+
+ /*
+ * Writes take a shared lock from start to finish; scanning for dirty
+ * data to refill the rb tree requires an exclusive lock.
+ */
+ struct rw_semaphore writeback_lock;
+
+ /*
+ * Nonzero, and writeback has a refcount (d->count), iff there is dirty
+	 * data in the cache. Protected by writeback_lock; must have a
+	 * shared lock to set and an exclusive lock to clear.
+ */
+ atomic_t has_dirty;
+
+ /* for dynamic rate control of writeback */
+ struct bch_pd_controller writeback_pd;
+ struct delayed_work writeback_pd_update;
+ unsigned writeback_pd_update_seconds;
+
+ struct task_struct *writeback_thread;
+ struct keybuf writeback_keys;
+ mempool_t writeback_io_pool;
+ mempool_t writeback_page_pool;
+
+ /* For tracking sequential IO */
+#define RECENT_IO_BITS 7
+#define RECENT_IO (1 << RECENT_IO_BITS)
+ struct io io[RECENT_IO];
+ struct hlist_head io_hash[RECENT_IO + 1];
+ struct list_head io_lru;
+ spinlock_t io_lock;
+
+ struct cache_accounting accounting;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned sequential_cutoff;
+ unsigned readahead;
+
+ unsigned verify:1;
+ unsigned bypass_torture_test:1;
+
+ unsigned partial_stripes_expensive:1;
+ unsigned writeback_metadata:1;
+ unsigned writeback_running:1;
+ unsigned char writeback_percent;
+};
+
+#endif /* _BCACHE_BLOCKDEV_TYPES_H */
diff --git a/libbcache/bset.c b/libbcache/bset.c
new file mode 100644
index 0000000..3488095
--- /dev/null
+++ b/libbcache/bset.c
@@ -0,0 +1,1846 @@
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include "eytzinger.h"
+#include "util.h"
+#include "bset.h"
+
+#include <asm/unaligned.h>
+#include <linux/dynamic_fault.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+/* hack.. */
+#include "alloc_types.h"
+#include <trace/events/bcache.h>
+
+struct bset_tree *bch_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ if (k >= btree_bkey_first(b, t) &&
+ k < btree_bkey_last(b, t))
+ return t;
+
+ BUG();
+}
+
+/*
+ * There are never duplicate live keys in the btree - but including keys that
+ * have been flagged as deleted (and will be cleaned up later) we _will_ see
+ * duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
+ */
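+/*
+ * For example, two older versions of a key plus the current one at the same
+ * position sort as: deleted, deleted, live. A new insert of that key first
+ * marks the live version deleted, then goes in after all three.
+ */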
+
+void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
+{
+ struct bkey_packed *_k, *_n;
+ struct bkey k, n;
+ char buf[120];
+
+ if (!i->u64s)
+ return;
+
+ for (_k = i->start, k = bkey_unpack_key(b, _k);
+ _k < bset_bkey_last(i);
+ _k = _n, k = n) {
+ _n = bkey_next(_k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &k);
+ printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
+ _k->_data - i->_data, i->u64s, buf);
+
+ if (_n == bset_bkey_last(i))
+ continue;
+
+ n = bkey_unpack_key(b, _n);
+
+ if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
+ printk(KERN_ERR "Key skipped backwards\n");
+ continue;
+ }
+
+ /*
+		 * Weird check for duplicate non-extent keys: extents are
+		 * deleted iff they have 0 size, so a zero size key that isn't
+		 * deleted can't be an extent; equal positions then mean
+		 * duplicate keys:
+ */
+ if (((!k.size && !bkey_deleted(&k)) ||
+ (!n.size && !bkey_deleted(&n))) &&
+ !bkey_deleted(&k) &&
+ !bkey_cmp(n.p, k.p))
+ printk(KERN_ERR "Duplicate keys\n");
+ }
+}
+
+void bch_dump_btree_node(struct btree *b)
+{
+ struct bset_tree *t;
+
+ console_lock();
+ for_each_bset(b, t)
+ bch_dump_bset(b, bset(b, t), t - b->set);
+ console_unlock();
+}
+
+void bch_dump_btree_node_iter(struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct btree_node_iter_set *set;
+
+ printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
+
+ btree_node_iter_for_each(iter, set) {
+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch_bkey_to_bset(b, k);
+ struct bkey uk = bkey_unpack_key(b, k);
+ char buf[100];
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+ printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
+ k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
+ }
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static bool keys_out_of_order(struct btree *b,
+ const struct bkey_packed *prev,
+ const struct bkey_packed *next,
+ bool is_extents)
+{
+ struct bkey nextu = bkey_unpack_key(b, next);
+
+ return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
+ ((is_extents
+ ? !bkey_deleted(next)
+ : !bkey_deleted(prev)) &&
+ !bkey_cmp_packed(b, prev, next));
+}
+
+void __bch_verify_btree_nr_keys(struct btree *b)
+{
+ struct bset_tree *t;
+ struct bkey_packed *k;
+ struct btree_nr_keys nr = { 0 };
+
+ for_each_bset(b, t)
+ for (k = btree_bkey_first(b, t);
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k))
+ if (!bkey_whiteout(k))
+ btree_keys_account_key_add(&nr, t - b->set, k);
+
+ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
+}
+
+static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey_packed *k)
+{
+ const struct bkey_packed *n = bch_btree_node_iter_peek_all(iter, b);
+
+ bkey_unpack_key(b, k);
+
+ if (n &&
+ keys_out_of_order(b, k, n, iter->is_extents)) {
+ struct bkey ku = bkey_unpack_key(b, k);
+ struct bkey nu = bkey_unpack_key(b, n);
+ char buf1[80], buf2[80];
+
+ bch_dump_btree_node(b);
+ bch_bkey_to_text(buf1, sizeof(buf1), &ku);
+ bch_bkey_to_text(buf2, sizeof(buf2), &nu);
+ panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
+ }
+}
+
+void bch_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter_set *set;
+ struct bset_tree *t;
+ struct bkey_packed *k, *first;
+
+ BUG_ON(iter->used > MAX_BSETS);
+
+ if (!iter->used)
+ return;
+
+ btree_node_iter_for_each(iter, set) {
+ k = __btree_node_offset_to_key(b, set->k);
+ t = bch_bkey_to_bset(b, k);
+
+ BUG_ON(__btree_node_offset_to_key(b, set->end) !=
+ btree_bkey_last(b, t));
+
+ BUG_ON(set + 1 < iter->data + iter->used &&
+ btree_node_iter_cmp(iter, b, set[0], set[1]) > 0);
+ }
+
+ first = __btree_node_offset_to_key(b, iter->data[0].k);
+
+ for_each_bset(b, t)
+ if (bch_btree_node_iter_bset_pos(iter, b, t) ==
+ btree_bkey_last(b, t) &&
+ (k = bkey_prev_all(b, t, btree_bkey_last(b, t))))
+ BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
+ k, first) > 0);
+}
+
+void bch_verify_key_order(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where)
+{
+ struct bset_tree *t = bch_bkey_to_bset(b, where);
+ struct bkey_packed *k, *prev;
+ struct bkey uk, uw = bkey_unpack_key(b, where);
+
+ k = bkey_prev_all(b, t, where);
+ if (k &&
+ keys_out_of_order(b, k, where, iter->is_extents)) {
+ char buf1[100], buf2[100];
+
+ bch_dump_btree_node(b);
+ uk = bkey_unpack_key(b, k);
+ bch_bkey_to_text(buf1, sizeof(buf1), &uk);
+ bch_bkey_to_text(buf2, sizeof(buf2), &uw);
+ panic("out of order with prev:\n%s\n%s\n",
+ buf1, buf2);
+ }
+
+ k = bkey_next(where);
+ BUG_ON(k != btree_bkey_last(b, t) &&
+ keys_out_of_order(b, where, k, iter->is_extents));
+
+ for_each_bset(b, t) {
+		if (where >= btree_bkey_first(b, t) &&
+		    where < btree_bkey_last(b, t))
+ continue;
+
+ k = bch_btree_node_iter_bset_pos(iter, b, t);
+
+ if (k == btree_bkey_last(b, t))
+ k = bkey_prev_all(b, t, k);
+
+ while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
+ (prev = bkey_prev_all(b, t, k)))
+ k = prev;
+
+ for (;
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k)) {
+ uk = bkey_unpack_key(b, k);
+
+ if (iter->is_extents) {
+ BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
+ bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
+ } else {
+ BUG_ON(!bkey_cmp(uw.p, uk.p) &&
+ !bkey_deleted(&uk));
+ }
+
+ if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
+ break;
+ }
+ }
+}
+
+#else
+
+static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey_packed *k) {}
+
+#endif
+
+/* Auxiliary search trees */
+
+#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0)
+#define BFLOAT_FAILED_PREV (U8_MAX - 1)
+#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2)
+#define BFLOAT_FAILED (U8_MAX - 2)
+
+#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+
+struct bkey_float {
+ u8 exponent;
+ u8 key_offset;
+ union {
+ u32 mantissa32;
+ struct {
+ u16 mantissa16;
+ u16 _pad;
+ };
+ };
+} __packed;
+
+#define BFLOAT_32BIT_NR 32U
+
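+/*
+ * Layout sketch, as implied by the helper below: the first BFLOAT_32BIT_NR
+ * bkey_floats are stored with their full 32 bit mantissa (6 bytes each), the
+ * rest only use mantissa16 and are packed into 4 bytes each. Thus the byte
+ * offset of entry @idx is idx * 6 for idx < BFLOAT_32BIT_NR, and
+ * BFLOAT_32BIT_NR * 6 + (idx - BFLOAT_32BIT_NR) * 4 after that - which is
+ * what the branchless expression in bkey_float_byte_offset() computes.
+ */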
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+ int d = (idx - BFLOAT_32BIT_NR) << 1;
+
+ d &= ~(d >> 31);
+
+ return idx * 6 - d;
+}
+
+struct ro_aux_tree {
+ struct bkey_float _d[0];
+};
+
+struct rw_aux_tree {
+ u16 offset;
+ struct bpos k;
+};
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree *b)
+{
+ return PAGE_SIZE << b->page_order;
+}
+
+static inline size_t btree_keys_cachelines(struct btree *b)
+{
+ return btree_keys_bytes(b) / BSET_CACHELINE;
+}
+
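+/* 8 bytes of auxiliary search tree space per BSET_CACHELINE bytes of keys: */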
+static inline size_t btree_aux_data_bytes(struct btree *b)
+{
+ return btree_keys_cachelines(b) * 8;
+}
+
+static inline size_t btree_aux_data_u64s(struct btree *b)
+{
+ return btree_aux_data_bytes(b) / sizeof(u64);
+}
+
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{
+ BUG_ON(t->aux_data_offset == U16_MAX);
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ return t->aux_data_offset;
+ case BSET_RO_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
+ sizeof(u8) * t->size, 8);
+ case BSET_RW_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
+ default:
+ BUG();
+ }
+}
+
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return t == b->set
+ ? DIV_ROUND_UP(b->unpack_fn_len, 8)
+ : bset_aux_tree_buf_end(t - 1);
+}
+
+static void *__aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return b->aux_data + t->aux_data_offset * 8;
+}
+
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+static u8 *ro_aux_tree_prev(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
+}
+
+static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
+ unsigned idx)
+{
+ return (void *) b + bkey_float_byte_offset(idx);
+}
+
+static struct bkey_float *bkey_float(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned idx)
+{
+ return bkey_float_get(ro_aux_tree_base(b, t), idx);
+}
+
+static void bset_aux_tree_verify(struct btree *b)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ if (t->aux_data_offset == U16_MAX)
+ continue;
+
+ BUG_ON(t != b->set &&
+ t[-1].aux_data_offset == U16_MAX);
+
+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
+ }
+#endif
+}
+
+/* Memory allocation */
+
+void bch_btree_keys_free(struct btree *b)
+{
+ vfree(b->aux_data);
+ b->aux_data = NULL;
+}
+
+int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
+{
+ b->page_order = page_order;
+ b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
+ PAGE_KERNEL_EXEC);
+ if (!b->aux_data)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void bch_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+{
+ unsigned i;
+
+ b->nsets = 0;
+ memset(&b->nr, 0, sizeof(b->nr));
+#ifdef CONFIG_BCACHE_DEBUG
+ b->expensive_debug_checks = expensive_debug_checks;
+#endif
+ for (i = 0; i < MAX_BSETS; i++)
+ b->set[i].data_offset = U16_MAX;
+
+ bch_bset_set_no_aux_tree(b, b->set);
+}
+
+/* Binary tree stuff for auxiliary search trees */
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; eytzinger_to_inorder() gives us the cacheline, and
+ * then bkey_float->m gives us the offset within that cacheline, in units of 8
+ * bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
+ */
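+
+/*
+ * Roughly, for the read only tree the composition below works out to:
+ *
+ *	tree_to_bkey(b, t, j) ==
+ *		bset_cacheline(b, t, __eytzinger_to_inorder(j, t->size, t->extra))
+ *			+ bkey_float(b, t, j)->key_offset * 8
+ */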
+
+static inline void *bset_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline)
+{
+ return (void *) round_down((unsigned long) btree_bkey_first(b, t),
+ L1_CACHE_BYTES) +
+ cacheline * BSET_CACHELINE;
+}
+
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ unsigned offset)
+{
+ return bset_cacheline(b, t, cacheline) + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ const struct bkey_packed *k)
+{
+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
+}
+
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
+}
+
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
+
+ EBUG_ON(m > U8_MAX);
+ return m;
+}
+
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ return cacheline_to_bkey(b, t,
+ __eytzinger_to_inorder(j, t->size, t->extra),
+ bkey_float(b, t, j)->key_offset);
+}
+
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
+
+ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
+}
+
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table of key
+ * offsets and positions (struct rw_aux_tree).
+ */
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
+ struct bset_tree *t,
+ unsigned j)
+{
+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
+}
+
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
+ unsigned j, struct bkey_packed *k)
+{
+ BUG_ON(k >= btree_bkey_last(b, t));
+
+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
+ .offset = __btree_node_key_to_offset(b, k),
+ .k = bkey_unpack_pos(b, k),
+ };
+}
+
+static void bch_bset_verify_rw_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ struct bkey_packed *k = btree_bkey_first(b, t);
+ unsigned j = 0;
+
+ if (!btree_keys_expensive_checks(b))
+ return;
+
+ BUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ BUG_ON(t->size < 1);
+ BUG_ON(rw_aux_to_bkey(b, t, j) != k);
+
+ goto start;
+ while (1) {
+ if (rw_aux_to_bkey(b, t, j) == k) {
+ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+ bkey_unpack_pos(b, k)));
+start:
+ if (++j == t->size)
+ break;
+
+ BUG_ON(rw_aux_tree(b, t)[j].offset <=
+ rw_aux_tree(b, t)[j - 1].offset);
+ }
+
+ k = bkey_next(k);
+ BUG_ON(k >= btree_bkey_last(b, t));
+ }
+}
+
+/* returns idx of first entry >= offset: */
+static unsigned rw_aux_tree_bsearch(struct btree *b,
+ struct bset_tree *t,
+ unsigned offset)
+{
+ unsigned l = 0, r = t->size;
+
+ BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+ while (l < r) {
+ unsigned m = (l + r) >> 1;
+
+ if (rw_aux_tree(b, t)[m].offset < offset)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ BUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset < offset);
+ BUG_ON(l &&
+ rw_aux_tree(b, t)[l - 1].offset >= offset);
+
+ BUG_ON(l > r);
+ BUG_ON(l > t->size);
+
+ return l;
+}
+
+static inline unsigned bfloat_mantissa(const struct bkey_float *f,
+ unsigned idx)
+{
+ return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
+}
+
+static inline void bfloat_mantissa_set(struct bkey_float *f,
+ unsigned idx, unsigned mantissa)
+{
+ if (idx < BFLOAT_32BIT_NR)
+ f->mantissa32 = mantissa;
+ else
+ f->mantissa16 = mantissa;
+}
+
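+/*
+ * Extracts the mantissa bits for @k: f->exponent encodes where in the packed
+ * key the mantissa starts, as a bit offset - we load the u64 containing that
+ * position and shift so the wanted 16 or 32 bits end up at the bottom.
+ */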
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
+ const struct bkey_float *f,
+ unsigned idx)
+{
+ u64 v;
+
+ EBUG_ON(!bkey_packed(k));
+
+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
+
+ /*
+ * In little endian, we're shifting off low bits (and then the bits we
+ * want are at the low end), in big endian we're shifting off high bits
+ * (and then the bits we want are at the high end, so we shift them
+ * back down):
+ */
+#ifdef __LITTLE_ENDIAN
+ v >>= f->exponent & 7;
+#else
+ v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
+#endif
+ return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
+}
+
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
+{
+ struct bkey_float *f = bkey_float(b, t, j);
+ struct bkey_packed *m = tree_to_bkey(b, t, j);
+ struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
+ struct bkey_packed *l, *r;
+ unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
+ unsigned mantissa;
+ int shift, exponent;
+
+ EBUG_ON(bkey_next(p) != m);
+
+ if (is_power_of_2(j)) {
+ l = min_key;
+
+ if (!l->u64s) {
+ if (!bkey_pack_pos(l, b->data->min_key, b)) {
+ struct bkey_i tmp;
+
+ bkey_init(&tmp.k);
+ tmp.k.p = b->data->min_key;
+ bkey_copy(l, &tmp);
+ }
+ }
+ } else {
+ l = tree_to_prev_bkey(b, t, j >> ffs(j));
+
+ EBUG_ON(m < l);
+ }
+
+ if (is_power_of_2(j + 1)) {
+ r = max_key;
+
+ if (!r->u64s) {
+ if (!bkey_pack_pos(r, t->max_key, b)) {
+ struct bkey_i tmp;
+
+ bkey_init(&tmp.k);
+ tmp.k.p = t->max_key;
+ bkey_copy(r, &tmp);
+ }
+ }
+ } else {
+ r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+ EBUG_ON(m > r);
+ }
+
+ /*
+ * for failed bfloats, the lookup code falls back to comparing against
+ * the original key.
+ */
+
+ if (!bkey_packed(l) || !bkey_packed(r) ||
+ !bkey_packed(p) || !bkey_packed(m)) {
+ f->exponent = BFLOAT_FAILED_UNPACKED;
+ return;
+ }
+
+ /*
+ * The greatest differing bit of l and r is the first bit we must
+ * include in the bfloat mantissa we're creating in order to do
+ * comparisons - that bit always becomes the high bit of
+ * bfloat->mantissa, and thus the exponent we're calculating here is
+ * the position of what will become the low bit in bfloat->mantissa:
+ *
+ * Note that this may be negative - we may be running off the low end
+ * of the key: we handle this later:
+ */
+ exponent = (int) bkey_greatest_differing_bit(b, l, r) - (bits - 1);
+
+ /*
+ * Then we calculate the actual shift value, from the start of the key
+ * (k->_data), to get the key bits starting at exponent:
+ */
+#ifdef __LITTLE_ENDIAN
+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
+
+ EBUG_ON(shift + bits > b->format.key_u64s * 64);
+#else
+ shift = high_bit_offset +
+ b->nr_key_bits -
+ exponent -
+ bits;
+
+ EBUG_ON(shift < KEY_PACKED_BITS_START);
+#endif
+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+
+ f->exponent = shift;
+ mantissa = bkey_mantissa(m, f, j);
+
+ /*
+ * If we've got garbage bits, set them to all 1s - it's legal for the
+ * bfloat to compare larger than the original key, but not smaller:
+ */
+ if (exponent < 0)
+ mantissa |= ~(~0U << -exponent);
+
+ bfloat_mantissa_set(f, j, mantissa);
+
+ /*
+ * The bfloat must be able to tell its key apart from the previous key -
+ * if its key and the previous key don't differ in the required bits,
+ * flag as failed - unless the keys are actually equal, in which case
+ * we aren't required to return a specific one:
+ */
+ if (exponent > 0 &&
+ bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
+ bkey_cmp_packed(b, p, m)) {
+ f->exponent = BFLOAT_FAILED_PREV;
+ return;
+ }
+
+ /*
+ * f->mantissa must compare >= the original key - for transitivity with
+ * the comparison in bset_search_tree. If we're dropping set bits,
+ * increment it:
+ */
+ if (exponent > (int) bkey_ffs(b, m)) {
+ if (j < BFLOAT_32BIT_NR
+ ? f->mantissa32 == U32_MAX
+ : f->mantissa16 == U16_MAX)
+ f->exponent = BFLOAT_FAILED_OVERFLOW;
+
+ if (j < BFLOAT_32BIT_NR)
+ f->mantissa32++;
+ else
+ f->mantissa16++;
+ }
+}
+
+/* bytes remaining - only valid for last bset: */
+static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+ bset_aux_tree_verify(b);
+
+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
+}
+
+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+ unsigned bytes = __bset_tree_capacity(b, t);
+
+ if (bytes < 7 * BFLOAT_32BIT_NR)
+ return bytes / 7;
+
+ bytes -= 7 * BFLOAT_32BIT_NR;
+
+ return BFLOAT_32BIT_NR + bytes / 5;
+}
+
+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
+}
+
+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *k;
+
+ t->size = 1;
+ t->extra = BSET_RW_AUX_TREE_VAL;
+ rw_aux_tree(b, t)[0].offset =
+ __btree_node_key_to_offset(b, btree_bkey_first(b, t));
+
+ for (k = btree_bkey_first(b, t);
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k)) {
+ if (t->size == bset_rw_tree_capacity(b, t))
+ break;
+
+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
+ L1_CACHE_BYTES)
+ rw_aux_tree_set(b, t, t->size++, k);
+ }
+}
+
+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+ struct bkey_packed min_key, max_key;
+ unsigned j, cacheline = 1;
+
+ /* signal to make_bfloat() that they're uninitialized: */
+ min_key.u64s = max_key.u64s = 0;
+
+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
+ bset_ro_tree_capacity(b, t));
+retry:
+ if (t->size < 2) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ return;
+ }
+
+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+ /* First we figure out where the first key in each cacheline is */
+ eytzinger_for_each(j, t->size) {
+ while (bkey_to_cacheline(b, t, k) < cacheline)
+ prev = k, k = bkey_next(k);
+
+ if (k >= btree_bkey_last(b, t)) {
+ t->size--;
+ goto retry;
+ }
+
+ ro_aux_tree_prev(b, t)[j] = prev->u64s;
+ bkey_float(b, t, j)->key_offset =
+ bkey_to_cacheline_offset(b, t, cacheline++, k);
+
+ BUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+ BUG_ON(tree_to_bkey(b, t, j) != k);
+ }
+
+ while (bkey_next(k) != btree_bkey_last(b, t))
+ k = bkey_next(k);
+
+ t->max_key = bkey_unpack_pos(b, k);
+
+ /* Then we build the tree */
+ eytzinger_for_each(j, t->size)
+ make_bfloat(b, t, j, &min_key, &max_key);
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bset_tree *i;
+
+ for (i = b->set; i != t; i++)
+ BUG_ON(bset_has_rw_aux_tree(i));
+
+ bch_bset_set_no_aux_tree(b, t);
+
+ /* round up to next cacheline: */
+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
+ SMP_CACHE_BYTES / sizeof(u64));
+
+ bset_aux_tree_verify(b);
+}
+
+void bch_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
+ bool writeable)
+{
+ if (writeable
+ ? bset_has_rw_aux_tree(t)
+ : bset_has_ro_aux_tree(t))
+ return;
+
+ bset_alloc_tree(b, t);
+
+ if (!__bset_tree_capacity(b, t))
+ return;
+
+ if (writeable)
+ __build_rw_aux_tree(b, t);
+ else
+ __build_ro_aux_tree(b, t);
+
+ bset_aux_tree_verify(b);
+}
+
+void bch_bset_init_first(struct btree *b, struct bset *i)
+{
+ struct bset_tree *t;
+
+ BUG_ON(b->nsets);
+
+ memset(i, 0, sizeof(*i));
+ get_random_bytes(&i->seq, sizeof(i->seq));
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
+void bch_bset_init_next(struct btree *b, struct bset *i)
+{
+ struct bset_tree *t;
+
+ BUG_ON(b->nsets >= MAX_BSETS);
+
+ memset(i, 0, sizeof(*i));
+ i->seq = btree_bset_first(b)->seq;
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
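+/*
+ * Returns a key strictly before @k - the closest preceding lookup table or
+ * search tree entry (or the start of the bset) - but not necessarily the key
+ * immediately before @k: bkey_prev_all() walks forward from it to find that.
+ */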
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed *p;
+ unsigned offset;
+ int j;
+
+ EBUG_ON(k < btree_bkey_first(b, t) ||
+ k > btree_bkey_last(b, t));
+
+ if (k == btree_bkey_first(b, t))
+ return NULL;
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ p = btree_bkey_first(b, t);
+ break;
+ case BSET_RO_AUX_TREE:
+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
+
+ do {
+ p = j ? tree_to_bkey(b, t,
+ __inorder_to_eytzinger(j--,
+ t->size, t->extra))
+ : btree_bkey_first(b, t);
+ } while (p >= k);
+ break;
+ case BSET_RW_AUX_TREE:
+ offset = __btree_node_key_to_offset(b, k);
+ j = rw_aux_tree_bsearch(b, t, offset);
+ p = j ? rw_aux_to_bkey(b, t, j - 1)
+ : btree_bkey_first(b, t);
+ break;
+ }
+
+ return p;
+}
+
+struct bkey_packed *bkey_prev_all(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed *p;
+
+ p = __bkey_prev(b, t, k);
+ if (!p)
+ return NULL;
+
+ while (bkey_next(p) != k)
+ p = bkey_next(p);
+
+ return p;
+}
+
+struct bkey_packed *bkey_prev(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ while (1) {
+ struct bkey_packed *p, *i, *ret = NULL;
+
+ p = __bkey_prev(b, t, k);
+ if (!p)
+ return NULL;
+
+ for (i = p; i != k; i = bkey_next(i))
+ if (!bkey_deleted(i))
+ ret = i;
+
+ if (ret)
+ return ret;
+
+ k = p;
+ }
+}
+
+/* Insert */
+
+static void rw_aux_tree_fix_invalidated_key(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ unsigned offset = __btree_node_key_to_offset(b, k);
+ unsigned j = rw_aux_tree_bsearch(b, t, offset);
+
+ if (j < t->size &&
+ rw_aux_tree(b, t)[j].offset == offset)
+ rw_aux_tree_set(b, t, j, k);
+
+ bch_bset_verify_rw_aux_tree(b, t);
+}
+
+static void ro_aux_tree_fix_invalidated_key(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed min_key, max_key;
+ unsigned inorder, j;
+
+ BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ /* signal to make_bfloat() that they're uninitialized: */
+ min_key.u64s = max_key.u64s = 0;
+
+ if (bkey_next(k) == btree_bkey_last(b, t)) {
+ t->max_key = bkey_unpack_pos(b, k);
+
+ for (j = 1; j < t->size; j = j * 2 + 1)
+ make_bfloat(b, t, j, &min_key, &max_key);
+ }
+
+ inorder = bkey_to_cacheline(b, t, k);
+
+ if (inorder &&
+ inorder < t->size) {
+ j = __inorder_to_eytzinger(inorder, t->size, t->extra);
+
+ if (k == tree_to_bkey(b, t, j)) {
+ /* Fix the node this key corresponds to */
+ make_bfloat(b, t, j, &min_key, &max_key);
+
+ /* Children for which this key is the right boundary */
+ for (j = eytzinger_left_child(j);
+ j < t->size;
+ j = eytzinger_right_child(j))
+ make_bfloat(b, t, j, &min_key, &max_key);
+ }
+ }
+
+ if (inorder + 1 < t->size) {
+ j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra);
+
+ if (k == tree_to_prev_bkey(b, t, j)) {
+ make_bfloat(b, t, j, &min_key, &max_key);
+
+ /* Children for which this key is the left boundary */
+ for (j = eytzinger_right_child(j);
+ j < t->size;
+ j = eytzinger_left_child(j))
+ make_bfloat(b, t, j, &min_key, &max_key);
+ }
+ }
+}
+
+/**
+ * bch_bset_fix_invalidated_key() - given an existing key @k that has been
+ * modified, fix any auxiliary search tree by remaking all the nodes in the
+ * auxiliary search tree that @k corresponds to
+ */
+void bch_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ break;
+ case BSET_RO_AUX_TREE:
+ ro_aux_tree_fix_invalidated_key(b, t, k);
+ break;
+ case BSET_RW_AUX_TREE:
+ rw_aux_tree_fix_invalidated_key(b, t, k);
+ break;
+ }
+}
+
+static void bch_bset_fix_lookup_table(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *_where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ int shift = new_u64s - clobber_u64s;
+ unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+
+ BUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ l = rw_aux_tree_bsearch(b, t, where);
+
+	/* l is the first entry >= @where */
+
+ BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
+ BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
+
+ if (!l) /* never delete first entry */
+ l++;
+ else if (l < t->size &&
+ where < t->end_offset &&
+ rw_aux_tree(b, t)[l].offset == where)
+ rw_aux_tree_set(b, t, l++, _where);
+
+ /* l now > where */
+
+ for (j = l;
+ j < t->size &&
+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
+ j++)
+ ;
+
+ if (j < t->size &&
+ rw_aux_tree(b, t)[j].offset + shift ==
+ rw_aux_tree(b, t)[l - 1].offset)
+ j++;
+
+ memmove(&rw_aux_tree(b, t)[l],
+ &rw_aux_tree(b, t)[j],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[j]);
+ t->size -= j - l;
+
+ for (j = l; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
+
+ BUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset ==
+ rw_aux_tree(b, t)[l - 1].offset);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (l < t->size
+ ? rw_aux_tree(b, t)[l].offset
+ : t->end_offset) -
+ rw_aux_tree(b, t)[l - 1].offset >
+ L1_CACHE_BYTES / sizeof(u64)) {
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
+ struct bkey_packed *end = l < t->size
+ ? rw_aux_to_bkey(b, t, l)
+ : btree_bkey_last(b, t);
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[l + 1],
+ &rw_aux_tree(b, t)[l],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[l]);
+ t->size++;
+ rw_aux_tree_set(b, t, l, k);
+ break;
+ }
+ }
+ }
+
+ bch_bset_verify_rw_aux_tree(b, t);
+ bset_aux_tree_verify(b);
+}
+
+void bch_bset_insert(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where,
+ struct bkey_i *insert,
+ unsigned clobber_u64s)
+{
+ struct bkey_format *f = &b->format;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed packed, *src = bkey_to_packed(insert);
+
+ bch_bset_verify_rw_aux_tree(b, t);
+
+ if (bkey_pack_key(&packed, &insert->k, f))
+ src = &packed;
+
+ if (!bkey_whiteout(&insert->k))
+ btree_keys_account_key_add(&b->nr, t - b->set, src);
+
+ if (src->u64s != clobber_u64s) {
+ u64 *src_p = where->_data + clobber_u64s;
+ u64 *dst_p = where->_data + src->u64s;
+
+ BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+ (int) clobber_u64s - src->u64s);
+
+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
+ set_btree_bset_end(b, t);
+ }
+
+ memcpy_u64s(where, src,
+ bkeyp_key_u64s(f, src));
+ memcpy_u64s(bkeyp_val(f, where), &insert->v,
+ bkeyp_val_u64s(f, src));
+
+ bch_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+
+ bch_verify_key_order(b, iter, where);
+ bch_verify_btree_nr_keys(b);
+}
+
+void bch_bset_delete(struct btree *b,
+ struct bkey_packed *where,
+ unsigned clobber_u64s)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ u64 *src_p = where->_data + clobber_u64s;
+ u64 *dst_p = where->_data;
+
+ bch_bset_verify_rw_aux_tree(b, t);
+
+ BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+
+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
+ set_btree_bset_end(b, t);
+
+ bch_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
+}
+
+/* Lookup */
+
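+/*
+ * Returns the key for the last rw aux tree entry whose position is strictly
+ * less than @search (or the first entry, if none is), from which the caller
+ * continues with a linear forward scan:
+ */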
+__flatten
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
+ struct bset_tree *t,
+ struct bpos search,
+ const struct bkey_packed *packed_search)
+{
+ unsigned l = 0, r = t->size;
+
+ while (l + 1 != r) {
+ unsigned m = (l + r) >> 1;
+
+ if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
+ l = m;
+ else
+ r = m;
+ }
+
+ return rw_aux_to_bkey(b, t, l);
+}
+
+noinline
+static int bset_search_tree_slowpath(const struct btree *b,
+ struct bset_tree *t, struct bpos *search,
+ const struct bkey_packed *packed_search,
+ unsigned n)
+{
+ return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
+ packed_search, search) < 0;
+}
+
+__flatten
+static struct bkey_packed *bset_search_tree(const struct btree *b,
+ struct bset_tree *t,
+ struct bpos search,
+ const struct bkey_packed *packed_search)
+{
+ struct ro_aux_tree *base = ro_aux_tree_base(b, t);
+ struct bkey_float *f = bkey_float_get(base, 1);
+ void *p;
+ unsigned inorder, n = 1;
+
+	do {
+ if (likely(n << 4 < t->size)) {
+ p = bkey_float_get(base, n << 4);
+ prefetch(p);
+ } else if (n << 3 < t->size) {
+ inorder = __eytzinger_to_inorder(n, t->size, t->extra);
+ p = bset_cacheline(b, t, inorder);
+#ifdef CONFIG_X86_64
+ asm(".intel_syntax noprefix;"
+ "prefetcht0 [%0 - 127 + 64 * 0];"
+ "prefetcht0 [%0 - 127 + 64 * 1];"
+ "prefetcht0 [%0 - 127 + 64 * 2];"
+ "prefetcht0 [%0 - 127 + 64 * 3];"
+ ".att_syntax prefix;"
+ :
+ : "r" (p + 127));
+#else
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+ } else if (n >= t->size)
+ break;
+
+ f = bkey_float_get(base, n);
+
+ if (packed_search &&
+ likely(f->exponent < BFLOAT_FAILED))
+ n = n * 2 + (bfloat_mantissa(f, n) <
+ bkey_mantissa(packed_search, f, n));
+ else
+ n = n * 2 + bset_search_tree_slowpath(b, t,
+ &search, packed_search, n);
+ } while (n < t->size);
+
+ inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra);
+
+ /*
+ * n would have been the node we recursed to - the low bit tells us if
+ * we recursed left or recursed right.
+ */
+ if (n & 1) {
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
+ } else {
+ if (--inorder) {
+ n = eytzinger_prev(n >> 1, t->size);
+ f = bkey_float_get(base, n);
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
+ } else
+ return btree_bkey_first(b, t);
+ }
+}
+
+/*
+ * Returns the first key greater than or equal to @search
+ */
+__always_inline __flatten
+static struct bkey_packed *bch_bset_search(struct btree *b,
+ struct bset_tree *t,
+ struct bpos search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search,
+ bool strictly_greater)
+{
+ struct bkey_packed *m;
+
+ /*
+	 * First we search for a cacheline, then we do a linear search within
+	 * that cacheline.
+ *
+ * To search for the cacheline, there's three different possibilities:
+ * * The set is too small to have a search tree, so we just do a linear
+ * search over the whole set.
+ * * The set is the one we're currently inserting into; keeping a full
+ * auxiliary search tree up to date would be too expensive, so we
+ * use a much simpler lookup table to do a binary search -
+ * bset_search_write_set().
+ * * Or we use the auxiliary search tree we constructed earlier -
+ * bset_search_tree()
+ */
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ m = btree_bkey_first(b, t);
+ break;
+ case BSET_RW_AUX_TREE:
+ m = bset_search_write_set(b, t, search, lossy_packed_search);
+ break;
+ case BSET_RO_AUX_TREE:
+ /*
+ * Each node in the auxiliary search tree covers a certain range
+ * of bits, and keys above and below the set it covers might
+ * differ outside those bits - so we have to special case the
+ * start and end - handle that here:
+ */
+
+ if (bkey_cmp(search, t->max_key) > 0)
+ return btree_bkey_last(b, t);
+
+ m = bset_search_tree(b, t, search, lossy_packed_search);
+ break;
+ }
+
+ if (lossy_packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
+ m, strictly_greater))
+ m = bkey_next(m);
+
+ if (!packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
+ m = bkey_next(m);
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ struct bkey_packed *prev = bkey_prev_all(b, t, m);
+
+ BUG_ON(prev &&
+ btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
+ prev, strictly_greater));
+ }
+
+ return m;
+}
+
+/* Btree node iterator */
+
+void bch_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end) {
+ struct btree_node_iter_set *pos, n =
+ ((struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ });
+
+ btree_node_iter_for_each(iter, pos)
+ if (btree_node_iter_cmp(iter, b, n, *pos) <= 0)
+ break;
+
+ memmove(pos + 1, pos,
+ (void *) (iter->data + iter->used) - (void *) pos);
+ iter->used++;
+ *pos = n;
+ }
+}
+
+noinline __flatten __attribute__((cold))
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
+ struct btree *b, struct bpos search,
+ bool strictly_greater, bool is_extents)
+{
+ struct bset_tree *t;
+
+ trace_bkey_pack_pos_fail(search);
+
+ for_each_bset(b, t)
+ __bch_btree_node_iter_push(iter, b,
+ bch_bset_search(b, t, search, NULL, NULL,
+ strictly_greater),
+ btree_bkey_last(b, t));
+
+ bch_btree_node_iter_sort(iter, b);
+}
+
+/**
+ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * given position
+ *
+ * Main entry point to the lookup code for individual btree nodes:
+ *
+ * NOTE:
+ *
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
+ * keys. This doesn't matter for most code, but it does matter for lookups.
+ *
+ * Some adjacent keys with a string of equal keys:
+ * i j k k k k l m
+ *
+ * If you search for k, the lookup code isn't guaranteed to return you any
+ * specific k. The lookup code is conceptually doing a binary search and
+ * iterating backwards is very expensive so if the pivot happens to land at the
+ * last k that's what you'll get.
+ *
+ * This works out ok, but it's something to be aware of:
+ *
+ * - For non extents, we guarantee that the live key comes last - see
+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
+ * see will only be deleted keys you don't care about.
+ *
+ * - For extents, deleted keys sort last (see the comment at the top of this
+ * file). But when you're searching for extents, you actually want the first
+ * key strictly greater than your search key - an extent that compares equal
+ * to the search key is going to have 0 sectors after the search key.
+ *
+ * But this does mean that we can't just search for
+ * bkey_successor(start_of_range) to get the first extent that overlaps with
+ * the range we want - if we're unlucky and there's an extent that ends
+ * exactly where we searched, then there could be a deleted key at the same
+ * position and we'd get that when we search instead of the preceding extent
+ * we needed.
+ *
+ * So we've got to search for start_of_range, then after the lookup iterate
+ * past any extents that compare equal to the position we searched for.
+ */
+void bch_btree_node_iter_init(struct btree_node_iter *iter,
+ struct btree *b, struct bpos search,
+ bool strictly_greater, bool is_extents)
+{
+ struct bset_tree *t;
+ struct bkey_packed p, *packed_search = NULL;
+
+ EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
+ bset_aux_tree_verify(b);
+
+ __bch_btree_node_iter_init(iter, is_extents);
+
+ //if (bkey_cmp(search, b->curr_max_key) > 0)
+ // return;
+
+ switch (bkey_pack_pos_lossy(&p, search, b)) {
+ case BKEY_PACK_POS_EXACT:
+ packed_search = &p;
+ break;
+ case BKEY_PACK_POS_SMALLER:
+ packed_search = NULL;
+ break;
+ case BKEY_PACK_POS_FAIL:
+ btree_node_iter_init_pack_failed(iter, b, search,
+ strictly_greater, is_extents);
+ return;
+ }
+
+ for_each_bset(b, t)
+ __bch_btree_node_iter_push(iter, b,
+ bch_bset_search(b, t, search,
+ packed_search, &p,
+ strictly_greater),
+ btree_bkey_last(b, t));
+
+ bch_btree_node_iter_sort(iter, b);
+}
+
+void bch_btree_node_iter_init_from_start(struct btree_node_iter *iter,
+ struct btree *b,
+ bool is_extents)
+{
+ struct bset_tree *t;
+
+ __bch_btree_node_iter_init(iter, is_extents);
+
+ for_each_bset(b, t)
+ __bch_btree_node_iter_push(iter, b,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ bch_btree_node_iter_sort(iter, b);
+}
+
+struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t)
+{
+ struct btree_node_iter_set *set;
+
+ BUG_ON(iter->used > MAX_BSETS);
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset)
+ return __btree_node_offset_to_key(b, set->k);
+
+ return btree_bkey_last(b, t);
+}
+
+static inline void btree_node_iter_sift(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned start)
+{
+ unsigned i;
+
+ EBUG_ON(iter->used > MAX_BSETS);
+
+ for (i = start;
+ i + 1 < iter->used &&
+ btree_node_iter_cmp(iter, b, iter->data[i], iter->data[i + 1]) > 0;
+ i++)
+ swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void btree_node_iter_sort_two(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned first)
+{
+ if (btree_node_iter_cmp(iter, b,
+ iter->data[first],
+ iter->data[first + 1]) > 0)
+ swap(iter->data[first], iter->data[first + 1]);
+}
+
+void bch_btree_node_iter_sort(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ EBUG_ON(iter->used > 3);
+
+ /* unrolled bubble sort: */
+
+ if (iter->used > 2) {
+ btree_node_iter_sort_two(iter, b, 0);
+ btree_node_iter_sort_two(iter, b, 1);
+ }
+
+ if (iter->used > 1)
+ btree_node_iter_sort_two(iter, b, 0);
+}
+EXPORT_SYMBOL(bch_btree_node_iter_sort);
+
+/**
+ * bch_btree_node_iter_advance - advance @iter by one key
+ *
+ * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a
+ * bset might momentarily have out of order extents.
+ */
+void bch_btree_node_iter_advance(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k = bch_btree_node_iter_peek_all(iter, b);
+
+ iter->data->k += __bch_btree_node_iter_peek_all(iter, b)->u64s;
+
+ BUG_ON(iter->data->k > iter->data->end);
+
+ if (iter->data->k == iter->data->end) {
+ BUG_ON(iter->used == 0);
+ iter->data[0] = iter->data[--iter->used];
+ }
+
+ btree_node_iter_sift(iter, b, 0);
+
+ bch_btree_node_iter_next_check(iter, b, k);
+}
+
+/*
+ * Expensive:
+ */
+struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k, *prev = NULL;
+ struct btree_node_iter_set *set;
+ struct bset_tree *t;
+ struct bset_tree *prev_t;
+ unsigned end;
+
+ bch_btree_node_iter_verify(iter, b);
+
+ for_each_bset(b, t) {
+ k = bkey_prev_all(b, t,
+ bch_btree_node_iter_bset_pos(iter, b, t));
+ if (k &&
+ (!prev || __btree_node_iter_cmp(iter->is_extents, b,
+ k, prev) > 0)) {
+ prev = k;
+ prev_t = t;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ /*
+ * We're manually memmoving instead of just calling sort() to ensure the
+ * prev we picked ends up in slot 0 - sort won't necessarily put it
+ * there because of duplicate deleted keys:
+ */
+ end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t));
+ btree_node_iter_for_each(iter, set)
+ if (set->end == end) {
+ memmove(&iter->data[1],
+ &iter->data[0],
+ (void *) set - (void *) &iter->data[0]);
+ goto out;
+ }
+
+ memmove(&iter->data[1],
+ &iter->data[0],
+ (void *) &iter->data[iter->used] - (void *) &iter->data[0]);
+ iter->used++;
+out:
+ iter->data[0].k = __btree_node_key_to_offset(b, prev);
+ iter->data[0].end = end;
+ return prev;
+}
+
+struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k;
+
+ do {
+ k = bch_btree_node_iter_prev_all(iter, b);
+ } while (k && bkey_deleted(k));
+
+ return k;
+}
+
+struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey *u)
+{
+ struct bkey_packed *k = bch_btree_node_iter_peek(iter, b);
+
+ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
+}
+EXPORT_SYMBOL(bch_btree_node_iter_peek_unpack);
+
+/* Mergesort */
+
+void bch_btree_keys_stats(struct btree *b, struct bset_stats *stats)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ enum bset_aux_tree_type type = bset_aux_tree_type(t);
+ size_t j;
+
+ stats->sets[type].nr++;
+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
+ sizeof(u64);
+
+ if (bset_has_ro_aux_tree(t)) {
+ stats->floats += t->size - 1;
+
+ for (j = 1; j < t->size; j++)
+ switch (bkey_float(b, t, j)->exponent) {
+ case BFLOAT_FAILED_UNPACKED:
+ stats->failed_unpacked++;
+ break;
+ case BFLOAT_FAILED_PREV:
+ stats->failed_prev++;
+ break;
+ case BFLOAT_FAILED_OVERFLOW:
+ stats->failed_overflow++;
+ break;
+ }
+ }
+ }
+}
+
+int bch_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
+ char *buf, size_t size)
+{
+ struct bset_tree *t = bch_bkey_to_bset(b, k);
+ struct bkey_packed *l, *r, *p;
+ struct bkey uk, up;
+ char buf1[200], buf2[200];
+ unsigned j;
+
+ if (!size)
+ return 0;
+
+ if (!bset_has_ro_aux_tree(t))
+ goto out;
+
+ j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra);
+ if (j &&
+ j < t->size &&
+ k == tree_to_bkey(b, t, j))
+ switch (bkey_float(b, t, j)->exponent) {
+ case BFLOAT_FAILED_UNPACKED:
+ uk = bkey_unpack_key(b, k);
+ return scnprintf(buf, size,
+ " failed unpacked at depth %u\n"
+ "\t%llu:%llu\n",
+ ilog2(j),
+ uk.p.inode, uk.p.offset);
+ case BFLOAT_FAILED_PREV:
+ p = tree_to_prev_bkey(b, t, j);
+ l = is_power_of_2(j)
+ ? btree_bkey_first(b, t)
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
+ r = is_power_of_2(j + 1)
+ ? bkey_prev_all(b, t, btree_bkey_last(b, t))
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+ up = bkey_unpack_key(b, p);
+ uk = bkey_unpack_key(b, k);
+ bch_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
+ bch_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
+
+ return scnprintf(buf, size,
+ " failed prev at depth %u\n"
+ "\tkey starts at bit %u but first differing bit at %u\n"
+ "\t%llu:%llu\n"
+ "\t%llu:%llu\n"
+ "\t%s\n"
+ "\t%s\n",
+ ilog2(j),
+ bkey_greatest_differing_bit(b, l, r),
+ bkey_greatest_differing_bit(b, p, k),
+ uk.p.inode, uk.p.offset,
+ up.p.inode, up.p.offset,
+ buf1, buf2);
+ case BFLOAT_FAILED_OVERFLOW:
+ uk = bkey_unpack_key(b, k);
+ return scnprintf(buf, size,
+ " failed overflow at depth %u\n"
+ "\t%llu:%llu\n",
+ ilog2(j),
+ uk.p.inode, uk.p.offset);
+ }
+out:
+ *buf = '\0';
+ return 0;
+}
diff --git a/libbcache/bset.h b/libbcache/bset.h
new file mode 100644
index 0000000..f03e6b8
--- /dev/null
+++ b/libbcache/bset.h
@@ -0,0 +1,628 @@
+#ifndef _BCACHE_BSET_H
+#define _BCACHE_BSET_H
+
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "util.h" /* for time_stats */
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bkey_invalid and
+ * bkey_deleted().
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node, those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (eytzinger_to_inorder()) that takes the index of a node in a
+ * binary tree and returns what its index would be in an inorder traversal, so
+ * we only have to store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ higher than bit 50, we don't need to check anything higher
+ * than bit 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking. Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differ in the bits we need them to. If they don't, we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+struct btree_node_iter;
+struct btree_node_iter_set;
+
+enum bset_aux_tree_type {
+ BSET_NO_AUX_TREE,
+ BSET_RO_AUX_TREE,
+ BSET_RW_AUX_TREE,
+};
+
+#define BSET_TREE_NR_TYPES 3
+
+#define BSET_NO_AUX_TREE_VAL (U16_MAX)
+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
+
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+ switch (t->extra) {
+ case BSET_NO_AUX_TREE_VAL:
+ EBUG_ON(t->size);
+ return BSET_NO_AUX_TREE;
+ case BSET_RW_AUX_TREE_VAL:
+ EBUG_ON(!t->size);
+ return BSET_RW_AUX_TREE;
+ default:
+ EBUG_ON(!t->size);
+ return BSET_RO_AUX_TREE;
+ }
+}
+
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+#ifdef HAVE_BCACHE_COMPILED_UNPACK
+ {
+ compiled_unpack_fn unpack_fn = b->aux_data;
+ unpack_fn(&dst, src);
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ struct bkey dst2 = __bkey_unpack_key(&b->format, src);
+
+ BUG_ON(memcmp(&dst, &dst2, sizeof(dst)));
+ }
+ }
+#else
+ dst = __bkey_unpack_key(&b->format, src);
+#endif
+ return dst;
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_key_format_checked(b, src)
+ : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHE_COMPILED_UNPACK
+ return bkey_unpack_key_format_checked(b, src).p;
+#else
+ return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_pos_format_checked(b, src)
+ : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(struct btree *b,
+ const struct bkey_packed *k,
+ struct bkey *u)
+{
+ *u = bkey_unpack_key(b, k);
+
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(struct btree *b,
+ struct bkey_packed *k,
+ struct bkey *u)
+{
+ *u = bkey_unpack_key(b, k);
+
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
+#define for_each_bset(_b, _t) \
+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+extern bool bch_expensive_debug_checks;
+
+static inline bool btree_keys_expensive_checks(struct btree *b)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ return bch_expensive_debug_checks || *b->expensive_debug_checks;
+#else
+ return false;
+#endif
+}
+
+static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
+}
+
+static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
+}
+
+static inline void bch_bset_set_no_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ BUG_ON(t < b->set);
+
+ for (; t < b->set + ARRAY_SIZE(b->set); t++) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ t->aux_data_offset = U16_MAX;
+ }
+}
+
+static inline void btree_node_set_format(struct btree *b,
+ struct bkey_format f)
+{
+ int len;
+
+ b->format = f;
+ b->nr_key_bits = bkey_format_key_bits(&f);
+
+ len = bch_compile_bkey_format(&b->format, b->aux_data);
+ BUG_ON(len < 0 || len > U8_MAX);
+
+ b->unpack_fn_len = len;
+
+ bch_bset_set_no_aux_tree(b, b->set);
+}
+
+#define __set_bytes(_i, _u64s) (sizeof(*(_i)) + (_u64s) * sizeof(u64))
+#define set_bytes(_i) __set_bytes(_i, (_i)->u64s)
+
+#define __set_blocks(_i, _u64s, _block_bytes) \
+ DIV_ROUND_UP((size_t) __set_bytes((_i), (_u64s)), (_block_bytes))
+
+#define set_blocks(_i, _block_bytes) \
+ __set_blocks((_i), (_i)->u64s, (_block_bytes))
+
+static inline struct bset *bset_next_set(struct btree *b,
+ unsigned block_bytes)
+{
+ struct bset *i = btree_bset_last(b);
+
+ EBUG_ON(!is_power_of_2(block_bytes));
+
+ return ((void *) i) + round_up(set_bytes(i), block_bytes);
+}
+
+void bch_btree_keys_free(struct btree *);
+int bch_btree_keys_alloc(struct btree *, unsigned, gfp_t);
+void bch_btree_keys_init(struct btree *, bool *);
+
+void bch_bset_init_first(struct btree *, struct bset *);
+void bch_bset_init_next(struct btree *, struct bset *);
+void bch_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
+void bch_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
+ struct bkey_packed *);
+
+void bch_bset_insert(struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, struct bkey_i *, unsigned);
+void bch_bset_delete(struct btree *, struct bkey_packed *, unsigned);
+
+/* Bkey utility code */
+
+/* packed or unpacked */
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ struct bpos *r)
+{
+ EBUG_ON(r_packed && !bkey_packed(r_packed));
+
+ if (unlikely(!bkey_packed(l)))
+ return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+
+ if (likely(r_packed))
+ return __bkey_cmp_packed_format_checked(l, r_packed, b);
+
+ return __bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+/* Returns true if @k is after iterator position @pos */
+static inline bool btree_iter_pos_cmp(struct bpos pos, const struct bkey *k,
+ bool strictly_greater)
+{
+ int cmp = bkey_cmp(k->p, pos);
+
+ return cmp > 0 ||
+ (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
+ struct bpos *pos,
+ const struct bkey_packed *k,
+ bool strictly_greater)
+{
+ int cmp = bkey_cmp_left_packed(b, k, pos);
+
+ return cmp > 0 ||
+ (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
+ struct bpos pos,
+ const struct bkey_packed *pos_packed,
+ const struct bkey_packed *k,
+ bool strictly_greater)
+{
+ int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
+
+ return cmp > 0 ||
+ (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+}
+
+static inline struct bkey_packed *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+ return bkey_idx(i, idx);
+}
+
+struct bset_tree *bch_bkey_to_bset(struct btree *, struct bkey_packed *);
+struct bkey_packed *bkey_prev_all(struct btree *, struct bset_tree *,
+ struct bkey_packed *);
+struct bkey_packed *bkey_prev(struct btree *, struct bset_tree *,
+ struct bkey_packed *);
+
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_cmp(k->p, m->p) < 0;
+ int cmp2 = bkey_cmp(bkey_start_pos(k),
+ bkey_start_pos(m)) > 0;
+
+ return (cmp1 << 1) + cmp2;
+}
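+
+/*
+ * e.g. if k spans 5..10 (bkey_start_pos(k) == 5, k->p == 10) and m spans
+ * 0..20, then cmp1 and cmp2 are both 1 and the result is
+ * BCH_EXTENT_OVERLAP_MIDDLE.
+ */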
+
+/* Btree key iteration */
+
+struct btree_node_iter {
+ u8 is_extents;
+ u16 used;
+
+ struct btree_node_iter_set {
+ u16 k, end;
+ } data[MAX_BSETS];
+};
+
+static inline void __bch_btree_node_iter_init(struct btree_node_iter *iter,
+ bool is_extents)
+{
+ iter->used = 0;
+ iter->is_extents = is_extents;
+}
+
+void bch_btree_node_iter_push(struct btree_node_iter *, struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+void bch_btree_node_iter_init(struct btree_node_iter *, struct btree *,
+ struct bpos, bool, bool);
+void bch_btree_node_iter_init_from_start(struct btree_node_iter *,
+ struct btree *, bool);
+struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *,
+ struct btree *,
+ struct bset_tree *);
+
+void bch_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
+void bch_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
+
+#define btree_node_iter_for_each(_iter, _set) \
+ for (_set = (_iter)->data; \
+ _set < (_iter)->data + (_iter)->used; \
+ _set++)
+
+static inline bool bch_btree_node_iter_end(struct btree_node_iter *iter)
+{
+ return !iter->used;
+}
+
+static inline int __btree_node_iter_cmp(bool is_extents,
+ struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ /*
+ * For non-extents, when keys compare equal the deleted keys have to
+ * come first - so that bch_btree_node_iter_next_check() can detect
+ * duplicate non-deleted keys (and possibly other reasons?)
+ *
+ * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
+ * deleted keys have to sort last.
+ */
+ return bkey_cmp_packed(b, l, r) ?: is_extents
+ ? (int) bkey_deleted(l) - (int) bkey_deleted(r)
+ : (int) bkey_deleted(r) - (int) bkey_deleted(l);
+}
+
+static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
+ struct btree *b,
+ struct btree_node_iter_set l,
+ struct btree_node_iter_set r)
+{
+ return __btree_node_iter_cmp(iter->is_extents, b,
+ __btree_node_offset_to_key(b, l.k),
+ __btree_node_offset_to_key(b, r.k));
+}
+
+static inline void __bch_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end)
+ iter->data[iter->used++] = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ };
+}
+
+static inline struct bkey_packed *
+__bch_btree_node_iter_peek_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ return __btree_node_offset_to_key(b, iter->data->k);
+}
+
+static inline struct bkey_packed *
+bch_btree_node_iter_peek_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ return bch_btree_node_iter_end(iter)
+ ? NULL
+ : __bch_btree_node_iter_peek_all(iter, b);
+}
+
+static inline struct bkey_packed *
+bch_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *ret;
+
+ while ((ret = bch_btree_node_iter_peek_all(iter, b)) &&
+ bkey_deleted(ret))
+ bch_btree_node_iter_advance(iter, b);
+
+ return ret;
+}
+
+static inline struct bkey_packed *
+bch_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *ret = bch_btree_node_iter_peek_all(iter, b);
+
+ if (ret)
+ bch_btree_node_iter_advance(iter, b);
+
+ return ret;
+}
+
+struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *,
+ struct btree *);
+struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *,
+ struct btree *);
+
+/*
+ * Iterates over all _live_ keys - skipping deleted (and potentially
+ * overlapping) keys
+ */
+#define for_each_btree_node_key(b, k, iter, _is_extents) \
+ for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+ ((k) = bch_btree_node_iter_peek(iter, b)); \
+ bch_btree_node_iter_advance(iter, b))
+
+struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *,
+ struct btree *,
+ struct bkey *);
+
+#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
+ for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+ (k = bch_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
+ bch_btree_node_iter_advance(iter, b))
+
+/* Accounting: */
+
+static inline void btree_keys_account_key(struct btree_nr_keys *n,
+ unsigned bset,
+ struct bkey_packed *k,
+ int sign)
+{
+ n->live_u64s += k->u64s * sign;
+ n->bset_u64s[bset] += k->u64s * sign;
+
+ if (bkey_packed(k))
+ n->packed_keys += sign;
+ else
+ n->unpacked_keys += sign;
+}
+
+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, 1)
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, -1)
+
+struct bset_stats {
+ struct {
+ size_t nr, bytes;
+ } sets[BSET_TREE_NR_TYPES];
+
+ size_t floats;
+ size_t failed_unpacked;
+ size_t failed_prev;
+ size_t failed_overflow;
+};
+
+void bch_btree_keys_stats(struct btree *, struct bset_stats *);
+int bch_bkey_print_bfloat(struct btree *, struct bkey_packed *,
+ char *, size_t);
+
+/* Debug stuff */
+
+void bch_dump_bset(struct btree *, struct bset *, unsigned);
+void bch_dump_btree_node(struct btree *);
+void bch_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void __bch_verify_btree_nr_keys(struct btree *);
+void bch_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch_verify_key_order(struct btree *, struct btree_node_iter *,
+ struct bkey_packed *);
+
+#else
+
+static inline void __bch_verify_btree_nr_keys(struct btree *b) {}
+static inline void bch_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b) {}
+static inline void bch_verify_key_order(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where) {}
+#endif
+
+static inline void bch_verify_btree_nr_keys(struct btree *b)
+{
+ if (btree_keys_expensive_checks(b))
+ __bch_verify_btree_nr_keys(b);
+}
+
+#endif
diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c
new file mode 100644
index 0000000..0994190
--- /dev/null
+++ b/libbcache/btree_cache.c
@@ -0,0 +1,701 @@
+
+#include "bcache.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <trace/events/bcache.h>
+
+#define DEF_BTREE_ID(kwd, val, name) name,
+
+const char *bch_btree_id_names[BTREE_ID_NR] = {
+ DEFINE_BCH_BTREE_IDS()
+};
+
+#undef DEF_BTREE_ID
+
+void bch_recalc_btree_reserve(struct cache_set *c)
+{
+ unsigned i, reserve = 16;
+
+ if (!c->btree_roots[0].b)
+ reserve += 8;
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (c->btree_roots[i].b)
+ reserve += min_t(unsigned, 1,
+ c->btree_roots[i].b->level) * 8;
+
+ c->btree_cache_reserve = reserve;
+}
+
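+/*
+ * How many cached nodes the shrinker is allowed to free while still leaving
+ * c->btree_cache_reserve nodes in the cache (see bch_mca_scan() below):
+ */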
+#define mca_can_free(c) \
+ max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve)
+
+static void __mca_data_free(struct cache_set *c, struct btree *b)
+{
+ EBUG_ON(btree_node_write_in_flight(b));
+
+ free_pages((unsigned long) b->data, btree_page_order(c));
+ b->data = NULL;
+ bch_btree_keys_free(b);
+}
+
+static void mca_data_free(struct cache_set *c, struct btree *b)
+{
+ __mca_data_free(c, b);
+ c->btree_cache_used--;
+ list_move(&b->list, &c->btree_cache_freed);
+}
+
+#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
+
+static const struct rhashtable_params bch_btree_cache_params = {
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, key.v),
+ .key_len = sizeof(struct bch_extent_ptr),
+};
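+
+/*
+ * i.e. btree nodes are indexed in the hash table by the first extent pointer
+ * in their bkey - the same field PTR_HASH() reads, and the one
+ * mca_hash_remove() zeroes out to make future lookups fail.
+ */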
+
+static void mca_data_alloc(struct cache_set *c, struct btree *b, gfp_t gfp)
+{
+ unsigned order = ilog2(btree_pages(c));
+
+ b->data = (void *) __get_free_pages(gfp, order);
+ if (!b->data)
+ goto err;
+
+ if (bch_btree_keys_alloc(b, order, gfp))
+ goto err;
+
+ c->btree_cache_used++;
+ list_move(&b->list, &c->btree_cache_freeable);
+ return;
+err:
+ free_pages((unsigned long) b->data, order);
+ b->data = NULL;
+ list_move(&b->list, &c->btree_cache_freed);
+}
+
+static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp)
+{
+ struct btree *b = kzalloc(sizeof(struct btree), gfp);
+ if (!b)
+ return NULL;
+
+ six_lock_init(&b->lock);
+ INIT_LIST_HEAD(&b->list);
+ INIT_LIST_HEAD(&b->write_blocked);
+
+ mca_data_alloc(c, b, gfp);
+ return b->data ? b : NULL;
+}
+
+/* Btree in memory cache - hash table */
+
+void mca_hash_remove(struct cache_set *c, struct btree *b)
+{
+ BUG_ON(btree_node_dirty(b));
+
+ b->nsets = 0;
+
+ rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
+ bch_btree_cache_params);
+
+ /* Cause future lookups for this node to fail: */
+ bkey_i_to_extent(&b->key)->v._data[0] = 0;
+}
+
+int mca_hash_insert(struct cache_set *c, struct btree *b,
+ unsigned level, enum btree_id id)
+{
+ int ret;
+
+ b->level = level;
+ b->btree_id = id;
+
+ ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
+ bch_btree_cache_params);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->btree_cache_lock);
+ list_add(&b->list, &c->btree_cache);
+ mutex_unlock(&c->btree_cache_lock);
+
+ return 0;
+}
+
+__flatten
+static inline struct btree *mca_find(struct cache_set *c,
+ const struct bkey_i *k)
+{
+ return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k),
+ bch_btree_cache_params);
+}
+
+/*
+ * this version is for btree nodes that have already been freed (we're not
+ * reaping a real btree node)
+ */
+static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush)
+{
+ lockdep_assert_held(&c->btree_cache_lock);
+
+ if (!six_trylock_intent(&b->lock))
+ return -ENOMEM;
+
+ if (!six_trylock_write(&b->lock))
+ goto out_unlock_intent;
+
+ if (btree_node_write_error(b))
+ goto out_unlock;
+
+ if (!list_empty(&b->write_blocked))
+ goto out_unlock;
+
+ if (!flush &&
+ (btree_node_dirty(b) ||
+ btree_node_write_in_flight(b)))
+ goto out_unlock;
+
+ /*
+ * Using the underscore version because we don't want to compact bsets
+ * after the write, since this node is about to be evicted - unless
+ * btree verify mode is enabled, since it runs as part of the post-write
+ * cleanup:
+ */
+ if (btree_node_dirty(b)) {
+ if (verify_btree_ondisk(c))
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1);
+ else
+ __bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
+ }
+
+ /* wait for any in flight btree write */
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+
+ return 0;
+out_unlock:
+ six_unlock_write(&b->lock);
+out_unlock_intent:
+ six_unlock_intent(&b->lock);
+ return -ENOMEM;
+}
+
+static int mca_reap(struct cache_set *c, struct btree *b, bool flush)
+{
+ int ret = mca_reap_notrace(c, b, flush);
+
+ trace_bcache_mca_reap(c, b, ret);
+ return ret;
+}
+
+static unsigned long bch_mca_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct cache_set *c = container_of(shrink, struct cache_set,
+ btree_cache_shrink);
+ struct btree *b, *t;
+ unsigned long nr = sc->nr_to_scan;
+ unsigned long can_free;
+ unsigned long touched = 0;
+ unsigned long freed = 0;
+ unsigned i;
+
+ u64 start_time = local_clock();
+
+ if (btree_shrinker_disabled(c))
+ return SHRINK_STOP;
+
+ if (c->btree_cache_alloc_lock)
+ return SHRINK_STOP;
+
+ /* Return -1 if we can't do anything right now */
+ if (sc->gfp_mask & __GFP_IO)
+ mutex_lock(&c->btree_cache_lock);
+ else if (!mutex_trylock(&c->btree_cache_lock))
+ return -1;
+
+ /*
+ * It's _really_ critical that we don't free too many btree nodes - we
+ * have to always leave ourselves a reserve. The reserve is how we
+ * guarantee that allocating memory for a new btree node can always
+ * succeed, so that inserting keys into the btree can always succeed and
+ * IO can always make forward progress:
+ */
+ nr /= btree_pages(c);
+ can_free = mca_can_free(c);
+ nr = min_t(unsigned long, nr, can_free);
+
+ i = 0;
+ list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+ touched++;
+
+ if (freed >= nr)
+ break;
+
+ if (++i > 3 &&
+ !mca_reap_notrace(c, b, false)) {
+ mca_data_free(c, b);
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+ freed++;
+ }
+ }
+restart:
+ list_for_each_entry_safe(b, t, &c->btree_cache, list) {
+ touched++;
+
+ if (freed >= nr) {
+ /* Save position */
+ if (&t->list != &c->btree_cache)
+ list_move_tail(&c->btree_cache, &t->list);
+ break;
+ }
+
+ if (!btree_node_accessed(b) &&
+ !mca_reap(c, b, false)) {
+ /* can't call mca_hash_remove under btree_cache_lock */
+ freed++;
+ if (&t->list != &c->btree_cache)
+ list_move_tail(&c->btree_cache, &t->list);
+
+ mca_data_free(c, b);
+ mutex_unlock(&c->btree_cache_lock);
+
+ mca_hash_remove(c, b);
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+
+ if (freed >= nr)
+ goto out;
+
+ if (sc->gfp_mask & __GFP_IO)
+ mutex_lock(&c->btree_cache_lock);
+ else if (!mutex_trylock(&c->btree_cache_lock))
+ goto out;
+ goto restart;
+ } else
+ clear_btree_node_accessed(b);
+ }
+
+ mutex_unlock(&c->btree_cache_lock);
+out:
+ bch_time_stats_update(&c->mca_scan_time, start_time);
+
+ trace_bcache_mca_scan(c,
+ touched * btree_pages(c),
+ freed * btree_pages(c),
+ can_free * btree_pages(c),
+ sc->nr_to_scan);
+
+ return (unsigned long) freed * btree_pages(c);
+}
+
+static unsigned long bch_mca_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct cache_set *c = container_of(shrink, struct cache_set,
+ btree_cache_shrink);
+
+ if (btree_shrinker_disabled(c))
+ return 0;
+
+ if (c->btree_cache_alloc_lock)
+ return 0;
+
+ return mca_can_free(c) * btree_pages(c);
+}
+
+void bch_btree_cache_free(struct cache_set *c)
+{
+ struct btree *b;
+ unsigned i;
+
+ if (c->btree_cache_shrink.list.next)
+ unregister_shrinker(&c->btree_cache_shrink);
+
+ mutex_lock(&c->btree_cache_lock);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ if (c->verify_data)
+ list_move(&c->verify_data->list, &c->btree_cache);
+
+ free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c)));
+#endif
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (c->btree_roots[i].b)
+ list_add(&c->btree_roots[i].b->list, &c->btree_cache);
+
+ list_splice(&c->btree_cache_freeable,
+ &c->btree_cache);
+
+ while (!list_empty(&c->btree_cache)) {
+ b = list_first_entry(&c->btree_cache, struct btree, list);
+
+ if (btree_node_dirty(b))
+ bch_btree_complete_write(c, b, btree_current_write(b));
+ clear_btree_node_dirty(b);
+
+ mca_data_free(c, b);
+ }
+
+ while (!list_empty(&c->btree_cache_freed)) {
+ b = list_first_entry(&c->btree_cache_freed,
+ struct btree, list);
+ list_del(&b->list);
+ kfree(b);
+ }
+
+ mutex_unlock(&c->btree_cache_lock);
+
+ if (c->btree_cache_table_init_done)
+ rhashtable_destroy(&c->btree_cache_table);
+}
+
+int bch_btree_cache_alloc(struct cache_set *c)
+{
+ unsigned i;
+ int ret;
+
+ ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params);
+ if (ret)
+ return ret;
+
+ c->btree_cache_table_init_done = true;
+
+ bch_recalc_btree_reserve(c);
+
+ for (i = 0; i < c->btree_cache_reserve; i++)
+ if (!mca_bucket_alloc(c, GFP_KERNEL))
+ return -ENOMEM;
+
+ list_splice_init(&c->btree_cache,
+ &c->btree_cache_freeable);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ mutex_init(&c->verify_lock);
+
+ c->verify_ondisk = (void *)
+ __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c)));
+ if (!c->verify_ondisk)
+ return -ENOMEM;
+
+ c->verify_data = mca_bucket_alloc(c, GFP_KERNEL);
+ if (!c->verify_data)
+ return -ENOMEM;
+
+ list_del_init(&c->verify_data->list);
+#endif
+
+ c->btree_cache_shrink.count_objects = bch_mca_count;
+ c->btree_cache_shrink.scan_objects = bch_mca_scan;
+ c->btree_cache_shrink.seeks = 4;
+ c->btree_cache_shrink.batch = btree_pages(c) * 2;
+ register_shrinker(&c->btree_cache_shrink);
+
+ return 0;
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, which a
+ * cannibalize_bucket() will take. This means every time we unlock the root of
+ * the btree, we need to release this lock if we have it held.
+ */
+void mca_cannibalize_unlock(struct cache_set *c)
+{
+ if (c->btree_cache_alloc_lock == current) {
+ trace_bcache_mca_cannibalize_unlock(c);
+ c->btree_cache_alloc_lock = NULL;
+ closure_wake_up(&c->mca_wait);
+ }
+}
+
+int mca_cannibalize_lock(struct cache_set *c, struct closure *cl)
+{
+ struct task_struct *old;
+
+ old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ if (old == NULL || old == current)
+ goto success;
+
+ if (!cl) {
+ trace_bcache_mca_cannibalize_lock_fail(c);
+ return -ENOMEM;
+ }
+
+ closure_wait(&c->mca_wait, cl);
+
+ /* Try again, after adding ourselves to waitlist */
+ old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ if (old == NULL || old == current) {
+ /* We raced */
+ closure_wake_up(&c->mca_wait);
+ goto success;
+ }
+
+ trace_bcache_mca_cannibalize_lock_fail(c);
+ return -EAGAIN;
+
+success:
+ trace_bcache_mca_cannibalize_lock(c);
+ return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c)
+{
+ struct btree *b;
+
+ list_for_each_entry_reverse(b, &c->btree_cache, list)
+ if (!mca_reap(c, b, false))
+ return b;
+
+ while (1) {
+ list_for_each_entry_reverse(b, &c->btree_cache, list)
+ if (!mca_reap(c, b, true))
+ return b;
+
+ /*
+ * Rare case: all nodes were intent-locked.
+ * Just busy-wait.
+ */
+ WARN_ONCE(1, "btree cache cannibalize failed\n");
+ cond_resched();
+ }
+}
+
+struct btree *mca_alloc(struct cache_set *c)
+{
+ struct btree *b;
+ u64 start_time = local_clock();
+
+ mutex_lock(&c->btree_cache_lock);
+
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b, &c->btree_cache_freeable, list)
+ if (!mca_reap_notrace(c, b, false))
+ goto out_unlock;
+
+ /*
+ * We never free struct btree itself, just the memory that holds the on
+ * disk node. Check the freed list before allocating a new one:
+ */
+ list_for_each_entry(b, &c->btree_cache_freed, list)
+ if (!mca_reap_notrace(c, b, false)) {
+ mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
+ if (b->data)
+ goto out_unlock;
+
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+ goto err;
+ }
+
+ b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO);
+ if (!b)
+ goto err;
+
+ BUG_ON(!six_trylock_intent(&b->lock));
+ BUG_ON(!six_trylock_write(&b->lock));
+out_unlock:
+ BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key));
+ BUG_ON(btree_node_write_in_flight(b));
+
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache_lock);
+out:
+ b->flags = 0;
+ b->written = 0;
+ b->nsets = 0;
+ b->sib_u64s[0] = 0;
+ b->sib_u64s[1] = 0;
+ b->whiteout_u64s = 0;
+ b->uncompacted_whiteout_u64s = 0;
+ bch_btree_keys_init(b, &c->expensive_debug_checks);
+
+ bch_time_stats_update(&c->mca_alloc_time, start_time);
+
+ return b;
+err:
+ /* Try to cannibalize another cached btree node: */
+ if (c->btree_cache_alloc_lock == current) {
+ b = mca_cannibalize(c);
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache_lock);
+
+ mca_hash_remove(c, b);
+
+ trace_bcache_mca_cannibalize(c);
+ goto out;
+ }
+
+ mutex_unlock(&c->btree_cache_lock);
+ return ERR_PTR(-ENOMEM);
+}
+
+/* Slowpath, don't want it inlined into btree_iter_traverse() */
+static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter,
+ const struct bkey_i *k,
+ unsigned level,
+ enum six_lock_type lock_type)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b;
+
+ b = mca_alloc(c);
+ if (IS_ERR(b))
+ return b;
+
+ bkey_copy(&b->key, k);
+ if (mca_hash_insert(c, b, level, iter->btree_id)) {
+ /* raced with another fill: */
+
+ /* mark as unhashed... */
+ bkey_i_to_extent(&b->key)->v._data[0] = 0;
+
+ mutex_lock(&c->btree_cache_lock);
+ list_add(&b->list, &c->btree_cache_freeable);
+ mutex_unlock(&c->btree_cache_lock);
+
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+ return NULL;
+ }
+
+ /*
+ * If the btree node wasn't cached, we can't drop our lock on
+ * the parent until after it's added to the cache - because
+ * otherwise we could race with a btree_split() freeing the node
+ * we're trying to lock.
+ *
+ * But the deadlock described below doesn't exist in this case,
+ * so it's safe to not drop the parent lock until here:
+ */
+ if (btree_node_read_locked(iter, level + 1))
+ btree_node_unlock(iter, level + 1);
+
+ bch_btree_node_read(c, b);
+ six_unlock_write(&b->lock);
+
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->lock);
+
+ return b;
+}
+
+/**
+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ *
+ * The btree node will have either a read or an intent lock held, depending
+ * on @lock_type.
+ */
+struct btree *bch_btree_node_get(struct btree_iter *iter,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type)
+{
+ struct btree *b;
+ struct bset_tree *t;
+
+ BUG_ON(level >= BTREE_MAX_DEPTH);
+retry:
+ rcu_read_lock();
+ b = mca_find(iter->c, k);
+ rcu_read_unlock();
+
+ if (unlikely(!b)) {
+ /*
+ * We must have the parent locked to call bch_btree_node_fill(),
+ * else we could read in a btree node from disk that's been
+ * freed:
+ */
+ b = bch_btree_node_fill(iter, k, level, lock_type);
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b))
+ return b;
+ } else {
+ /*
+ * There's a potential deadlock with splits and insertions into
+ * interior nodes we have to avoid:
+ *
+ * The other thread might be holding an intent lock on the node
+ * we want, and they want to update its parent node so they're
+ * going to upgrade their intent lock on the parent node to a
+ * write lock.
+ *
+ * But if we're holding a read lock on the parent, and we're
+ * trying to get the intent lock they're holding, we deadlock.
+ *
+ * So to avoid this we drop the read locks on parent nodes when
+ * we're starting to take intent locks - and handle the race.
+ *
+ * The race is that they might be about to free the node we
+ * want, and dropping our read lock on the parent node lets them
+ * update the parent marking the node we want as freed, and then
+ * free it:
+ *
+ * To guard against this, btree nodes are evicted from the cache
+ * when they're freed - and PTR_HASH() is zeroed out, which we
+ * check for after we lock the node.
+ *
+ * Then, btree_node_relock() on the parent will fail - because
+ * the parent was modified, when the pointer to the node we want
+ * was removed - and we'll bail out:
+ */
+ if (btree_node_read_locked(iter, level + 1))
+ btree_node_unlock(iter, level + 1);
+
+ if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
+ return ERR_PTR(-EINTR);
+
+ if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
+ b->level != level ||
+ race_fault())) {
+ six_unlock_type(&b->lock, lock_type);
+ if (btree_node_relock(iter, level + 1))
+ goto retry;
+
+ return ERR_PTR(-EINTR);
+ }
+ }
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->lock, lock_type);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(!b->written);
+ EBUG_ON(b->btree_id != iter->btree_id ||
+ BSET_BTREE_LEVEL(&b->data->keys) != level ||
+ bkey_cmp(b->data->max_key, k->k.p));
+
+ return b;
+}
diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h
new file mode 100644
index 0000000..e745abb
--- /dev/null
+++ b/libbcache/btree_cache.h
@@ -0,0 +1,61 @@
+#ifndef _BCACHE_BTREE_CACHE_H
+#define _BCACHE_BTREE_CACHE_H
+
+#include "bcache.h"
+#include "btree_types.h"
+
+struct btree_iter;
+
+extern const char *bch_btree_id_names[BTREE_ID_NR];
+
+void bch_recalc_btree_reserve(struct cache_set *);
+
+void mca_hash_remove(struct cache_set *, struct btree *);
+int mca_hash_insert(struct cache_set *, struct btree *,
+ unsigned, enum btree_id);
+
+void mca_cannibalize_unlock(struct cache_set *);
+int mca_cannibalize_lock(struct cache_set *, struct closure *);
+
+struct btree *mca_alloc(struct cache_set *);
+
+struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
+ unsigned, enum six_lock_type);
+
+void bch_btree_cache_free(struct cache_set *);
+int bch_btree_cache_alloc(struct cache_set *);
+
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \
+ &(_c)->btree_cache_table), \
+ _iter = 0; _iter < (_tbl)->size; _iter++) \
+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
+
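+/* c->sb.btree_node_size is in 512 byte sectors: */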
+static inline size_t btree_bytes(struct cache_set *c)
+{
+ return c->sb.btree_node_size << 9;
+}
+
+static inline size_t btree_max_u64s(struct cache_set *c)
+{
+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+}
+
+static inline size_t btree_pages(struct cache_set *c)
+{
+ return c->sb.btree_node_size >> (PAGE_SHIFT - 9);
+}
+
+static inline size_t btree_page_order(struct cache_set *c)
+{
+ return ilog2(btree_pages(c));
+}
+
+static inline unsigned btree_blocks(struct cache_set *c)
+{
+ return c->sb.btree_node_size >> c->block_bits;
+}
+
+#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
+
+#endif /* _BCACHE_BTREE_CACHE_H */
diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c
new file mode 100644
index 0000000..8417187
--- /dev/null
+++ b/libbcache/btree_gc.c
@@ -0,0 +1,898 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright (C) 2014 Datera Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "writeback.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+struct range_checks {
+ struct range_level {
+ struct bpos min;
+ struct bpos max;
+ } l[BTREE_MAX_DEPTH];
+ unsigned depth;
+};
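+
+/*
+ * Per level record of the key range seen so far, used below to check that
+ * each node's min_key is the successor of the previous node's max_key at that
+ * level, and that interior nodes cover the same range as their children.
+ */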
+
+static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
+{
+ unsigned i;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ r->l[i].min = r->l[i].max = POS_MIN;
+ r->depth = depth;
+}
+
+static void btree_node_range_checks(struct cache_set *c, struct btree *b,
+ struct range_checks *r)
+{
+ struct range_level *l = &r->l[b->level];
+
+ struct bpos expected_min = bkey_cmp(l->min, l->max)
+ ? btree_type_successor(b->btree_id, l->max)
+ : l->max;
+
+ cache_set_inconsistent_on(bkey_cmp(b->data->min_key,
+ expected_min), c,
+ "btree node has incorrect min key: %llu:%llu != %llu:%llu",
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ expected_min.inode,
+ expected_min.offset);
+
+ l->max = b->data->max_key;
+
+ if (b->level > r->depth) {
+ l = &r->l[b->level - 1];
+
+ cache_set_inconsistent_on(bkey_cmp(b->data->min_key,
+ l->min), c,
+ "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ l->min.inode,
+ l->min.offset);
+
+ cache_set_inconsistent_on(bkey_cmp(b->data->max_key,
+ l->max), c,
+ "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
+ b->data->max_key.inode,
+ b->data->max_key.offset,
+ l->max.inode,
+ l->max.offset);
+
+ if (bkey_cmp(b->data->max_key, POS_MAX))
+ l->min = l->max =
+ btree_type_successor(b->btree_id,
+ b->data->max_key);
+ }
+}
+
+u8 bch_btree_key_recalc_oldest_gen(struct cache_set *c, struct bkey_s_c k)
+{
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ u8 max_stale = 0;
+
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+ rcu_read_lock();
+
+ extent_for_each_online_device(c, e, ptr, ca) {
+ size_t b = PTR_BUCKET_NR(ca, ptr);
+
+ if (__gen_after(ca->oldest_gens[b], ptr->gen))
+ ca->oldest_gens[b] = ptr->gen;
+
+ max_stale = max(max_stale, ptr_stale(ca, ptr));
+ }
+
+ rcu_read_unlock();
+ }
+
+ return max_stale;
+}
+
+/*
+ * For runtime mark and sweep:
+ */
+u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ switch (type) {
+ case BKEY_TYPE_BTREE:
+ bch_gc_mark_key(c, k, c->sb.btree_node_size, true);
+ return 0;
+ case BKEY_TYPE_EXTENTS:
+ bch_gc_mark_key(c, k, k.k->size, false);
+ return bch_btree_key_recalc_oldest_gen(c, k);
+ default:
+ BUG();
+ }
+}
+
+static u8 btree_mark_key(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ return __bch_btree_mark_key(c, btree_node_type(b), k);
+}
+
+static bool btree_gc_mark_node(struct cache_set *c, struct btree *b)
+{
+ if (btree_node_has_ptrs(b)) {
+ struct btree_node_iter iter;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+ u8 stale = 0;
+
+ for_each_btree_node_key_unpack(b, k, &iter,
+ btree_node_is_extents(b),
+ &unpacked) {
+ bkey_debugcheck(c, b, k);
+ stale = max(stale, btree_mark_key(c, b, k));
+ }
+
+ if (btree_gc_rewrite_disabled(c))
+ return false;
+
+ if (stale > 10)
+ return true;
+ }
+
+ if (btree_gc_always_rewrite(c))
+ return true;
+
+ return false;
+}
+
+static inline void __gc_pos_set(struct cache_set *c, struct gc_pos new_pos)
+{
+ write_seqcount_begin(&c->gc_pos_lock);
+ c->gc_pos = new_pos;
+ write_seqcount_end(&c->gc_pos_lock);
+}
+
+static inline void gc_pos_set(struct cache_set *c, struct gc_pos new_pos)
+{
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+ __gc_pos_set(c, new_pos);
+}
+
+static int bch_gc_btree(struct cache_set *c, enum btree_id btree_id)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ bool should_rewrite;
+ struct range_checks r;
+ unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
+ int ret;
+
+ /*
+ * if expensive_debug_checks is on, run range_checks on all leaf nodes:
+ */
+ if (expensive_debug_checks(c))
+ depth = 0;
+
+ btree_node_range_checks_init(&r, depth);
+
+ for_each_btree_node(&iter, c, btree_id, POS_MIN, depth, b) {
+ btree_node_range_checks(c, b, &r);
+
+ bch_verify_btree_nr_keys(b);
+
+ should_rewrite = btree_gc_mark_node(c, b);
+
+ gc_pos_set(c, gc_pos_btree_node(b));
+
+ if (should_rewrite)
+ bch_btree_node_rewrite(&iter, b, NULL);
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->btree_root_lock);
+
+ b = c->btree_roots[btree_id].b;
+ __bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
+ gc_pos_set(c, gc_pos_btree_root(b->btree_id));
+
+ mutex_unlock(&c->btree_root_lock);
+ return 0;
+}
+
+static void bch_mark_allocator_buckets(struct cache_set *c)
+{
+ struct cache *ca;
+ struct open_bucket *ob;
+ size_t i, j, iter;
+ unsigned ci;
+
+ for_each_cache(ca, c, ci) {
+ spin_lock(&ca->freelist_lock);
+
+ fifo_for_each_entry(i, &ca->free_inc, iter)
+ bch_mark_alloc_bucket(ca, &ca->buckets[i], true);
+
+ for (j = 0; j < RESERVE_NR; j++)
+ fifo_for_each_entry(i, &ca->free[j], iter)
+ bch_mark_alloc_bucket(ca, &ca->buckets[i], true);
+
+ spin_unlock(&ca->freelist_lock);
+ }
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ const struct bch_extent_ptr *ptr;
+
+ mutex_lock(&ob->lock);
+ rcu_read_lock();
+ open_bucket_for_each_online_device(c, ob, ptr, ca)
+ bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true);
+ rcu_read_unlock();
+ mutex_unlock(&ob->lock);
+ }
+}
+
+/*
+ * Mark non btree metadata - prios, journal
+ */
+static void bch_mark_metadata(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ for_each_cache(ca, c, i) {
+ unsigned j;
+ u64 *p;
+
+ for (j = 0; j < bch_nr_journal_buckets(ca->disk_sb.sb); j++)
+ bch_mark_metadata_bucket(ca,
+ &ca->buckets[journal_bucket(ca->disk_sb.sb, j)],
+ true);
+
+ spin_lock(&ca->prio_buckets_lock);
+
+ for (p = ca->prio_buckets;
+ p < ca->prio_buckets + prio_buckets(ca) * 2; p++)
+ bch_mark_metadata_bucket(ca, &ca->buckets[*p], true);
+
+ spin_unlock(&ca->prio_buckets_lock);
+ }
+}
+
+/* Also see bch_pending_btree_node_free_insert_done() */
+static void bch_mark_pending_btree_node_frees(struct cache_set *c)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct btree_interior_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
+
+ for_each_pending_btree_node_free(c, as, d)
+ if (d->index_update_done)
+ __bch_gc_mark_key(c, bkey_i_to_s_c(&d->key),
+ c->sb.btree_node_size, true,
+ &stats);
+ /*
+ * Don't apply stats - pending deletes aren't tracked in
+ * cache_set_stats:
+ */
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/**
+ * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
+ */
+void bch_gc(struct cache_set *c)
+{
+ struct cache *ca;
+ struct bucket *g;
+ struct bucket_mark new;
+ u64 start_time = local_clock();
+ unsigned i;
+ int cpu;
+
+ /*
+ * Walk _all_ references to buckets, and recompute them:
+ *
+ * Order matters here:
+ * - Concurrent GC relies on the fact that we have a total ordering for
+ * everything that GC walks - see gc_will_visit_node(),
+ * gc_will_visit_root()
+ *
+ * - also, references move around in the course of index updates and
+ * various other crap: everything needs to agree on the ordering
+ * references are allowed to move around in - e.g., we're allowed to
+ * start with a reference owned by an open_bucket (the allocator) and
+ * move it to the btree, but not the reverse.
+ *
+ * This is necessary to ensure that gc doesn't miss references that
+ * move around - if references move backwards in the ordering GC
+ * uses, GC could skip past them
+ */
+
+ if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
+ return;
+
+ trace_bcache_gc_start(c);
+
+ /*
+ * Do this before taking gc_lock - bch_disk_reservation_get() blocks on
+ * gc_lock if sectors_available goes to 0:
+ */
+ bch_recalc_sectors_available(c);
+
+ down_write(&c->gc_lock);
+
+ lg_global_lock(&c->bucket_stats_lock);
+
+ /*
+ * Indicates to buckets code that gc is now in progress - done under
+ * bucket_stats_lock to avoid racing with bch_mark_key():
+ */
+ __gc_pos_set(c, GC_POS_MIN);
+
+ /* Save a copy of the existing bucket stats while we recompute them: */
+ for_each_cache(ca, c, i) {
+ ca->bucket_stats_cached = __bch_bucket_stats_read_cache(ca);
+ for_each_possible_cpu(cpu) {
+ struct bucket_stats_cache *p =
+ per_cpu_ptr(ca->bucket_stats_percpu, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+ }
+
+ c->bucket_stats_cached = __bch_bucket_stats_read_cache_set(c);
+ for_each_possible_cpu(cpu) {
+ struct bucket_stats_cache_set *p =
+ per_cpu_ptr(c->bucket_stats_percpu, cpu);
+
+ memset(p->s, 0, sizeof(p->s));
+ p->persistent_reserved = 0;
+ }
+
+ lg_global_unlock(&c->bucket_stats_lock);
+
+ /* Clear bucket marks: */
+ for_each_cache(ca, c, i)
+ for_each_bucket(g, ca) {
+ bucket_cmpxchg(g, new, ({
+ new.owned_by_allocator = 0;
+ new.is_metadata = 0;
+ new.cached_sectors = 0;
+ new.dirty_sectors = 0;
+ }));
+ ca->oldest_gens[g - ca->buckets] = new.gen;
+ }
+
+ /* Walk allocator's references: */
+ bch_mark_allocator_buckets(c);
+
+ /* Walk btree: */
+ while (c->gc_pos.phase < (int) BTREE_ID_NR) {
+ int ret = c->btree_roots[c->gc_pos.phase].b
+ ? bch_gc_btree(c, (int) c->gc_pos.phase)
+ : 0;
+
+ if (ret) {
+ bch_err(c, "btree gc failed: %d", ret);
+ set_bit(CACHE_SET_GC_FAILURE, &c->flags);
+ up_write(&c->gc_lock);
+ return;
+ }
+
+ gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
+ }
+
+ bch_mark_metadata(c);
+ bch_mark_pending_btree_node_frees(c);
+ bch_writeback_recalc_oldest_gens(c);
+
+ for_each_cache(ca, c, i)
+ atomic_long_set(&ca->saturated_count, 0);
+
+ /* Indicates that gc is no longer in progress: */
+ gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+
+ up_write(&c->gc_lock);
+ trace_bcache_gc_end(c);
+ bch_time_stats_update(&c->btree_gc_time, start_time);
+
+ /*
+ * Wake up allocator in case it was waiting for buckets
+ * because of not being able to inc gens
+ */
+ for_each_cache(ca, c, i)
+ bch_wake_allocator(ca);
+}
+
+/* Btree coalescing */
+
+static void recalc_packed_keys(struct btree *b)
+{
+ struct bkey_packed *k;
+
+ memset(&b->nr, 0, sizeof(b->nr));
+
+ BUG_ON(b->nsets != 1);
+
+ for (k = btree_bkey_first(b, b->set);
+ k != btree_bkey_last(b, b->set);
+ k = bkey_next(k))
+ btree_keys_account_key_add(&b->nr, 0, k);
+}
+
+static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
+ struct btree_iter *iter)
+{
+ struct btree *parent = iter->nodes[old_nodes[0]->level + 1];
+ struct cache_set *c = iter->c;
+ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
+ unsigned blocks = btree_blocks(c) * 2 / 3;
+ struct btree *new_nodes[GC_MERGE_NODES];
+ struct btree_interior_update *as;
+ struct btree_reserve *res;
+ struct keylist keylist;
+ struct bkey_format_state format_state;
+ struct bkey_format new_format;
+
+ memset(new_nodes, 0, sizeof(new_nodes));
+ bch_keylist_init(&keylist, NULL, 0);
+
+ /* Count keys that are not deleted */
+ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
+ u64s += old_nodes[i]->nr.live_u64s;
+
+ nr_old_nodes = nr_new_nodes = i;
+
+ /* Check if all keys in @old_nodes could fit in one fewer node */
+ if (nr_old_nodes <= 1 ||
+ __set_blocks(old_nodes[0]->data,
+ DIV_ROUND_UP(u64s, nr_old_nodes - 1),
+ block_bytes(c)) > blocks)
+ return;
+
+ res = bch_btree_reserve_get(c, parent, nr_old_nodes,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE,
+ NULL);
+ if (IS_ERR(res)) {
+ trace_bcache_btree_gc_coalesce_fail(c,
+ BTREE_GC_COALESCE_FAIL_RESERVE_GET);
+ return;
+ }
+
+ if (bch_keylist_realloc(&keylist, NULL, 0,
+ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+ trace_bcache_btree_gc_coalesce_fail(c,
+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
+ goto out;
+ }
+
+ /* Find a format that all keys in @old_nodes can pack into */
+ bch_bkey_format_init(&format_state);
+
+ for (i = 0; i < nr_old_nodes; i++)
+ __bch_btree_calc_format(&format_state, old_nodes[i]);
+
+ new_format = bch_bkey_format_done(&format_state);
+
+ /* Check if repacking would make any nodes too big to fit */
+ for (i = 0; i < nr_old_nodes; i++)
+ if (!bch_btree_node_format_fits(c, old_nodes[i], &new_format)) {
+ trace_bcache_btree_gc_coalesce_fail(c,
+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
+ goto out;
+ }
+
+ trace_bcache_btree_gc_coalesce(c, parent, nr_old_nodes);
+
+ as = bch_btree_interior_update_alloc(c);
+
+ for (i = 0; i < nr_old_nodes; i++)
+ bch_btree_interior_update_will_free_node(c, as, old_nodes[i]);
+
+ /* Repack everything with @new_format and sort down to one bset */
+ for (i = 0; i < nr_old_nodes; i++)
+ new_nodes[i] = __btree_node_alloc_replacement(c, old_nodes[i],
+ new_format, res);
+
+ /*
+ * Conceptually we concatenate the nodes together and slice them
+ * up at different boundaries.
+ */
+ for (i = nr_new_nodes - 1; i > 0; --i) {
+ struct btree *n1 = new_nodes[i];
+ struct btree *n2 = new_nodes[i - 1];
+
+ struct bset *s1 = btree_bset_first(n1);
+ struct bset *s2 = btree_bset_first(n2);
+ struct bkey_packed *k, *last = NULL;
+
+ /* Calculate how many keys from @n2 we could fit inside @n1 */
+ u64s = 0;
+
+ for (k = s2->start;
+ k < bset_bkey_last(s2) &&
+ __set_blocks(n1->data, le16_to_cpu(s1->u64s) + u64s + k->u64s,
+ block_bytes(c)) <= blocks;
+ k = bkey_next(k)) {
+ last = k;
+ u64s += k->u64s;
+ }
+
+ if (u64s == le16_to_cpu(s2->u64s)) {
+ /* n2 fits entirely in n1 */
+ n1->key.k.p = n1->data->max_key = n2->data->max_key;
+
+ memcpy_u64s(bset_bkey_last(s1),
+ s2->start,
+ le16_to_cpu(s2->u64s));
+ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
+
+ set_btree_bset_end(n1, n1->set);
+
+ six_unlock_write(&n2->lock);
+ bch_btree_node_free_never_inserted(c, n2);
+ six_unlock_intent(&n2->lock);
+
+ memmove(new_nodes + i - 1,
+ new_nodes + i,
+ sizeof(new_nodes[0]) * (nr_new_nodes - i));
+ new_nodes[--nr_new_nodes] = NULL;
+ } else if (u64s) {
+ /* move part of n2 into n1 */
+ n1->key.k.p = n1->data->max_key =
+ bkey_unpack_pos(n1, last);
+
+ n2->data->min_key =
+ btree_type_successor(iter->btree_id,
+ n1->data->max_key);
+
+ memcpy_u64s(bset_bkey_last(s1),
+ s2->start, u64s);
+ le16_add_cpu(&s1->u64s, u64s);
+
+ memmove(s2->start,
+ bset_bkey_idx(s2, u64s),
+ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
+ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
+
+ set_btree_bset_end(n1, n1->set);
+ set_btree_bset_end(n2, n2->set);
+ }
+ }
+
+ for (i = 0; i < nr_new_nodes; i++) {
+ struct btree *n = new_nodes[i];
+
+ recalc_packed_keys(n);
+ btree_node_reset_sib_u64s(n);
+
+ bch_btree_build_aux_trees(n);
+ six_unlock_write(&n->lock);
+
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+ }
+
+ /*
+ * The keys for the old nodes get deleted. We don't want to insert keys
+ * that compare equal to the keys for the new nodes we'll also be
+ * inserting - we can't because keys on a keylist must be strictly
+ * greater than the previous keys, and we also don't need to since the
+ * key for the new node will serve the same purpose (overwriting the key
+ * for the old node).
+ */
+ for (i = 0; i < nr_old_nodes; i++) {
+ struct bkey_i delete;
+ unsigned j;
+
+ for (j = 0; j < nr_new_nodes; j++)
+ if (!bkey_cmp(old_nodes[i]->key.k.p,
+ new_nodes[j]->key.k.p))
+ goto next;
+
+ bkey_init(&delete.k);
+ delete.k.p = old_nodes[i]->key.k.p;
+ bch_keylist_add_in_order(&keylist, &delete);
+next:
+ i = i; /* dummy statement: a label must be followed by a statement */
+ }
+
+ /*
+ * Keys for the new nodes get inserted: bch_btree_insert_keys() only
+ * does the lookup once and thus expects the keys to be in sorted order
+ * so we have to make sure the new keys are correctly ordered with
+ * respect to the deleted keys added in the previous loop
+ */
+ for (i = 0; i < nr_new_nodes; i++)
+ bch_keylist_add_in_order(&keylist, &new_nodes[i]->key);
+
+ /* Insert the newly coalesced nodes */
+ bch_btree_insert_node(parent, iter, &keylist, res, as);
+
+ BUG_ON(!bch_keylist_empty(&keylist));
+
+ BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]);
+
+ BUG_ON(!bch_btree_iter_node_replace(iter, new_nodes[0]));
+
+ for (i = 0; i < nr_new_nodes; i++)
+ btree_open_bucket_put(c, new_nodes[i]);
+
+ /* Free the old nodes and update our sliding window */
+ for (i = 0; i < nr_old_nodes; i++) {
+ bch_btree_node_free_inmem(iter, old_nodes[i]);
+ six_unlock_intent(&old_nodes[i]->lock);
+
+ /*
+ * the index update might have triggered a split, in which case
+ * the nodes we coalesced - the new nodes we just created -
+ * might not be sibling nodes anymore - don't add them to the
+ * sliding window (except the first):
+ */
+ if (!i) {
+ old_nodes[i] = new_nodes[i];
+ } else {
+ old_nodes[i] = NULL;
+ if (new_nodes[i])
+ six_unlock_intent(&new_nodes[i]->lock);
+ }
+ }
+out:
+ bch_keylist_free(&keylist, NULL);
+ bch_btree_reserve_put(c, res);
+}
+
+static int bch_coalesce_btree(struct cache_set *c, enum btree_id btree_id)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ unsigned i;
+
+ /* Sliding window of adjacent btree nodes */
+ struct btree *merge[GC_MERGE_NODES];
+ u32 lock_seq[GC_MERGE_NODES];
+
+ /*
+ * XXX: We don't have a good way of positively matching on sibling nodes
+ * that have the same parent - this code works by handling the cases
+ * where they might not have the same parent, and is thus fragile. Ugh.
+ *
+ * Perhaps redo this to use multiple linked iterators?
+ */
+ memset(merge, 0, sizeof(merge));
+
+ __for_each_btree_node(&iter, c, btree_id, POS_MIN, 0, b, U8_MAX) {
+ memmove(merge + 1, merge,
+ sizeof(merge) - sizeof(merge[0]));
+ memmove(lock_seq + 1, lock_seq,
+ sizeof(lock_seq) - sizeof(lock_seq[0]));
+
+ merge[0] = b;
+
+ for (i = 1; i < GC_MERGE_NODES; i++) {
+ if (!merge[i] ||
+ !six_relock_intent(&merge[i]->lock, lock_seq[i]))
+ break;
+
+ if (merge[i]->level != merge[0]->level) {
+ six_unlock_intent(&merge[i]->lock);
+ break;
+ }
+ }
+ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
+
+ bch_coalesce_nodes(merge, &iter);
+
+ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
+ lock_seq[i] = merge[i]->lock.state.seq;
+ six_unlock_intent(&merge[i]->lock);
+ }
+
+ lock_seq[0] = merge[0]->lock.state.seq;
+
+ if (test_bit(CACHE_SET_GC_STOPPING, &c->flags)) {
+ bch_btree_iter_unlock(&iter);
+ return -ESHUTDOWN;
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+
+ /*
+ * If the parent node wasn't relocked, it might have been split
+ * and the nodes in our sliding window might not have the same
+ * parent anymore - blow away the sliding window:
+ */
+ if (iter.nodes[iter.level + 1] &&
+ !btree_node_intent_locked(&iter, iter.level + 1))
+ memset(merge + 1, 0,
+ (GC_MERGE_NODES - 1) * sizeof(merge[0]));
+ }
+ return bch_btree_iter_unlock(&iter);
+}
+
+/**
+ * bch_coalesce - coalesce adjacent nodes with low occupancy
+ */
+void bch_coalesce(struct cache_set *c)
+{
+ u64 start_time;
+ enum btree_id id;
+
+ if (btree_gc_coalesce_disabled(c))
+ return;
+
+ if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
+ return;
+
+ down_read(&c->gc_lock);
+ trace_bcache_gc_coalesce_start(c);
+ start_time = local_clock();
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ int ret = c->btree_roots[id].b
+ ? bch_coalesce_btree(c, id)
+ : 0;
+
+ if (ret) {
+ if (ret != -ESHUTDOWN)
+ bch_err(c, "btree coalescing failed: %d", ret);
+ set_bit(CACHE_SET_GC_FAILURE, &c->flags);
+ up_read(&c->gc_lock);
+ return;
+ }
+ }
+
+ bch_time_stats_update(&c->btree_coalesce_time, start_time);
+ trace_bcache_gc_coalesce_end(c);
+ up_read(&c->gc_lock);
+}
+
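+/*
+ * GC thread: wakes up when the write IO clock has advanced by 1/16th of the
+ * cache set's capacity (or when kicked via c->kick_gc), then runs a full mark
+ * and sweep pass followed by btree node coalescing:
+ */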
+static int bch_gc_thread(void *arg)
+{
+ struct cache_set *c = arg;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last = atomic_long_read(&clock->now);
+ unsigned last_kick = atomic_read(&c->kick_gc);
+
+ set_freezable();
+
+ while (1) {
+ unsigned long next = last + c->capacity / 16;
+
+ while (atomic_long_read(&clock->now) < next) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ if (atomic_read(&c->kick_gc) != last_kick) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ bch_io_clock_schedule_timeout(clock, next);
+ try_to_freeze();
+ }
+
+ last = atomic_long_read(&clock->now);
+ last_kick = atomic_read(&c->kick_gc);
+
+ bch_gc(c);
+ bch_coalesce(c);
+
+ debug_check_no_locks_held();
+ }
+
+ return 0;
+}
+
+void bch_gc_thread_stop(struct cache_set *c)
+{
+ set_bit(CACHE_SET_GC_STOPPING, &c->flags);
+
+ if (!IS_ERR_OR_NULL(c->gc_thread))
+ kthread_stop(c->gc_thread);
+}
+
+int bch_gc_thread_start(struct cache_set *c)
+{
+ clear_bit(CACHE_SET_GC_STOPPING, &c->flags);
+
+ c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
+ if (IS_ERR(c->gc_thread))
+ return PTR_ERR(c->gc_thread);
+
+ wake_up_process(c->gc_thread);
+ return 0;
+}
+
+/* Initial GC computes bucket marks during startup */
+
+static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ struct range_checks r;
+
+ btree_node_range_checks_init(&r, 0);
+
+ if (!c->btree_roots[id].b)
+ return;
+
+ /*
+ * We have to hit every btree node before starting journal replay, in
+ * order for the journal seq blacklist machinery to work:
+ */
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ btree_node_range_checks(c, b, &r);
+
+ if (btree_node_has_ptrs(b)) {
+ struct btree_node_iter node_iter;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+
+ for_each_btree_node_key_unpack(b, k, &node_iter,
+ btree_node_is_extents(b),
+ &unpacked)
+ btree_mark_key(c, b, k);
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+
+ bch_btree_iter_unlock(&iter);
+
+ __bch_btree_mark_key(c, BKEY_TYPE_BTREE,
+ bkey_i_to_s_c(&c->btree_roots[id].b->key));
+}
+
+int bch_initial_gc(struct cache_set *c, struct list_head *journal)
+{
+ enum btree_id id;
+
+ if (journal) {
+ for (id = 0; id < BTREE_ID_NR; id++)
+ bch_initial_gc_btree(c, id);
+
+ bch_journal_mark(c, journal);
+ }
+
+ bch_mark_metadata(c);
+
+ gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+ set_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags);
+
+ return 0;
+}
diff --git a/libbcache/btree_gc.h b/libbcache/btree_gc.h
new file mode 100644
index 0000000..91d31c0
--- /dev/null
+++ b/libbcache/btree_gc.h
@@ -0,0 +1,103 @@
+#ifndef _BCACHE_GC_H
+#define _BCACHE_GC_H
+
+#include "btree_types.h"
+
+enum bkey_type;
+
+void bch_coalesce(struct cache_set *);
+void bch_gc(struct cache_set *);
+void bch_gc_thread_stop(struct cache_set *);
+int bch_gc_thread_start(struct cache_set *);
+int bch_initial_gc(struct cache_set *, struct list_head *);
+u8 bch_btree_key_recalc_oldest_gen(struct cache_set *, struct bkey_s_c);
+u8 __bch_btree_mark_key(struct cache_set *, enum bkey_type,
+ struct bkey_s_c);
+
+/*
+ * For concurrent mark and sweep (with other index updates), we define a total
+ * ordering of _all_ references GC walks:
+ *
+ * Note that some references will have the same GC position as others - e.g.
+ * everything within the same btree node; in those cases we're relying on
+ * whatever locking exists for where those references live, i.e. the write lock
+ * on a btree node.
+ *
+ * That locking is also required to ensure GC doesn't pass the updater in
+ * between the updater adding/removing the reference and updating the GC marks;
+ * without that, we would at best double count sometimes.
+ *
+ * That part is important - whenever calling bch_mark_pointers(), a lock _must_
+ * be held that prevents GC from passing the position the updater is at.
+ *
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
+ * position inside its cmpxchg loop, so crap magically works).
+ */
+
+/* Position of (the start of) a gc phase: */
+static inline struct gc_pos gc_phase(enum gc_phase phase)
+{
+ return (struct gc_pos) {
+ .phase = phase,
+ .pos = POS_MIN,
+ .level = 0,
+ };
+}
+
+#define GC_POS_MIN gc_phase(0)
+
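+/* The total order on gc positions: compare phase first, then pos, then level: */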
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+ if (l.phase != r.phase)
+ return l.phase < r.phase ? -1 : 1;
+ if (bkey_cmp(l.pos, r.pos))
+ return bkey_cmp(l.pos, r.pos);
+ if (l.level != r.level)
+ return l.level < r.level ? -1 : 1;
+ return 0;
+}
+
+/*
+ * GC position of the pointers within a btree node: note, _not_ for &b->key
+ * itself, that lives in the parent node:
+ */
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+{
+ return (struct gc_pos) {
+ .phase = b->btree_id,
+ .pos = b->key.k.p,
+ .level = b->level,
+ };
+}
+
+/*
+ * GC position of the pointer to a btree root: we don't use
+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with
+ * btree_split() increasing the tree depth - the new root will have level > the
+ * old root and thus have a greater gc position than the old root, but that
+ * would be incorrect since once gc has marked the root it's not coming back.
+ */
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+{
+ return (struct gc_pos) {
+ .phase = (int) id,
+ .pos = POS_MAX,
+ .level = U8_MAX,
+ };
+}
+
+static inline bool gc_will_visit(struct cache_set *c, struct gc_pos pos)
+{
+ unsigned seq;
+ bool ret;
+
+ do {
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ return ret;
+}
+
+#endif
diff --git a/libbcache/btree_io.c b/libbcache/btree_io.c
new file mode 100644
index 0000000..ff976b5
--- /dev/null
+++ b/libbcache/btree_io.c
@@ -0,0 +1,1674 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+
+#include <trace/events/bcache.h>
+
+static void verify_no_dups(struct btree *b,
+ struct bkey_packed *start,
+ struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bkey_packed *k;
+
+ for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
+ struct bkey l = bkey_unpack_key(b, k);
+ struct bkey r = bkey_unpack_key(b, bkey_next(k));
+
+ BUG_ON(btree_node_is_extents(b)
+ ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
+ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
+ //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+ }
+#endif
+}
+
+static void clear_needs_whiteout(struct bset *i)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ k->needs_whiteout = false;
+}
+
+static void set_needs_whiteout(struct bset *i)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ k->needs_whiteout = true;
+}
+
+static void btree_bounce_free(struct cache_set *c, unsigned order,
+ bool used_mempool, void *p)
+{
+ if (used_mempool)
+ mempool_free(virt_to_page(p), &c->btree_bounce_pool);
+ else
+ free_pages((unsigned long) p, order);
+}
+
+static void *btree_bounce_alloc(struct cache_set *c, unsigned order,
+ bool *used_mempool)
+{
+ void *p;
+
+ BUG_ON(1 << order > btree_pages(c));
+
+ *used_mempool = false;
+ p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
+ if (p)
+ return p;
+
+ *used_mempool = true;
+ return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
+}
+
+typedef int (*sort_cmp_fn)(struct btree *,
+ struct bkey_packed *,
+ struct bkey_packed *);
+
+struct sort_iter {
+ struct btree *b;
+ unsigned used;
+
+ struct sort_iter_set {
+ struct bkey_packed *k, *end;
+ } data[MAX_BSETS + 1];
+};
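+
+/*
+ * A small merge iterator: up to MAX_BSETS + 1 ranges of packed keys, kept
+ * ordered by each range's current key with an insertion sort style sift
+ * (__sort_iter_sift() below), so sort_iter_peek() is always data[0].
+ */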
+
+static void sort_iter_init(struct sort_iter *iter, struct btree *b)
+{
+ memset(iter, 0, sizeof(*iter));
+ iter->b = b;
+}
+
+static inline void __sort_iter_sift(struct sort_iter *iter,
+ unsigned from,
+ sort_cmp_fn cmp)
+{
+ unsigned i;
+
+ for (i = from;
+ i + 1 < iter->used &&
+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+ i++)
+ swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ __sort_iter_sift(iter, 0, cmp);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ unsigned i = iter->used;
+
+ while (i--)
+ __sort_iter_sift(iter, i, cmp);
+}
+
+static void sort_iter_add(struct sort_iter *iter,
+ struct bkey_packed *k,
+ struct bkey_packed *end)
+{
+ BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
+
+ if (k != end)
+ iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+ return iter->used ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ iter->data->k = bkey_next(iter->data->k);
+
+ BUG_ON(iter->data->k > iter->data->end);
+
+ if (iter->data->k == iter->data->end)
+ memmove(&iter->data[0],
+ &iter->data[1],
+ sizeof(iter->data[0]) * --iter->used);
+ else
+ sort_iter_sift(iter, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+ sort_cmp_fn cmp)
+{
+ struct bkey_packed *ret = sort_iter_peek(iter);
+
+ if (ret)
+ sort_iter_advance(iter, cmp);
+
+ return ret;
+}
+
+static inline int sort_key_whiteouts_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bkey_cmp_packed(b, l, r);
+}
+
+static unsigned sort_key_whiteouts(struct bkey_packed *dst,
+ struct sort_iter *iter)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, sort_key_whiteouts_cmp);
+
+ while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
+ bkey_copy(out, in);
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_extent_whiteouts_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ struct bkey ul = bkey_unpack_key(b, l);
+ struct bkey ur = bkey_unpack_key(b, r);
+
+ return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
+}
+
+static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
+ struct sort_iter *iter)
+{
+ const struct bkey_format *f = &iter->b->format;
+ struct bkey_packed *in, *out = dst;
+ struct bkey_i l, r;
+ bool prev = false, l_packed;
+ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE);
+ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET);
+ u64 new_size;
+
+ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
+
+ sort_iter_sort(iter, sort_extent_whiteouts_cmp);
+
+ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+ EBUG_ON(bkeyp_val_u64s(f, in));
+ EBUG_ON(in->type != KEY_TYPE_DISCARD);
+
+ r.k = bkey_unpack_key(iter->b, in);
+
+ if (prev &&
+ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
+ if (bkey_cmp(l.k.p, r.k.p) >= 0)
+ continue;
+
+ new_size = l_packed
+ ? min(max_packed_size, max_packed_offset -
+ bkey_start_offset(&l.k))
+ : KEY_SIZE_MAX;
+
+ new_size = min(new_size, r.k.p.offset -
+ bkey_start_offset(&l.k));
+
+ BUG_ON(new_size < l.k.size);
+
+ bch_key_resize(&l.k, new_size);
+
+ if (bkey_cmp(l.k.p, r.k.p) >= 0)
+ continue;
+
+ bch_cut_front(l.k.p, &r);
+ }
+
+ if (prev) {
+ if (!bkey_pack(out, &l, f)) {
+ BUG_ON(l_packed);
+ bkey_copy(out, &l);
+ }
+ out = bkey_next(out);
+ }
+
+ l = r;
+ prev = true;
+ l_packed = bkey_packed(in);
+ }
+
+ if (prev) {
+ if (!bkey_pack(out, &l, f)) {
+ BUG_ON(l_packed);
+ bkey_copy(out, &l);
+ }
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
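+/*
+ * Decide whether a bset is worth compacting: in lazy mode, only if less than
+ * 75% of its u64s are live (or it's an unwritten bset and we're compacting
+ * anyways); otherwise, any written bset with dead u64s qualifies.
+ */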
+static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
+ bool compacting,
+ enum compact_mode mode)
+{
+ unsigned live_u64s = b->nr.bset_u64s[t - b->set];
+ unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
+
+ if (live_u64s == bset_u64s)
+ return 0;
+
+ if (mode == COMPACT_LAZY) {
+ if (live_u64s * 4 < bset_u64s * 3 ||
+ (compacting && bset_unwritten(b, bset(b, t))))
+ return bset_u64s - live_u64s;
+ } else {
+ if (bset_written(b, bset(b, t)))
+ return bset_u64s - live_u64s;
+ }
+
+ return 0;
+}
+
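+/*
+ * Compact a btree node in memory: deleted extent keys are dropped outright,
+ * whiteouts that still need to be written are gathered (via a bounce buffer)
+ * into the unwritten whiteout area at the end of the node, and - except in
+ * COMPACT_WRITTEN_NO_WRITE_LOCK mode - the remaining live keys are packed
+ * back down within each bset.
+ */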
+bool __bch_compact_whiteouts(struct cache_set *c, struct btree *b,
+ enum compact_mode mode)
+{
+ const struct bkey_format *f = &b->format;
+ struct bset_tree *t;
+ struct bkey_packed *whiteouts = NULL;
+ struct bkey_packed *u_start, *u_pos;
+ struct sort_iter sort_iter;
+ unsigned order, whiteout_u64s = 0, u64s;
+ bool used_mempool, compacting = false;
+
+ for_each_bset(b, t)
+ whiteout_u64s += should_compact_bset(b, t,
+ whiteout_u64s != 0, mode);
+
+ if (!whiteout_u64s)
+ return false;
+
+ sort_iter_init(&sort_iter, b);
+
+ whiteout_u64s += b->whiteout_u64s;
+ order = get_order(whiteout_u64s * sizeof(u64));
+
+ whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+ u_start = u_pos = whiteouts;
+
+ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
+ b->whiteout_u64s);
+ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
+
+ sort_iter_add(&sort_iter, u_start, u_pos);
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k, *n, *out, *start, *end;
+ struct btree_node_entry *src = NULL, *dst = NULL;
+
+ if (t != b->set && bset_unwritten(b, i)) {
+ src = container_of(i, struct btree_node_entry, keys);
+ dst = max(write_block(b),
+				  (void *) btree_bkey_last(b, t - 1));
+ }
+
+ if (!should_compact_bset(b, t, compacting, mode)) {
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src) +
+ le16_to_cpu(src->keys.u64s) *
+ sizeof(u64));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+ continue;
+ }
+
+ compacting = true;
+ u_start = u_pos;
+ start = i->start;
+ end = bset_bkey_last(i);
+
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
+ n = bkey_next(k);
+
+ if (bkey_deleted(k) && btree_node_is_extents(b))
+ continue;
+
+ if (bkey_whiteout(k) && !k->needs_whiteout)
+ continue;
+
+ if (bkey_whiteout(k)) {
+ unreserve_whiteout(b, t, k);
+ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
+ set_bkeyp_val_u64s(f, u_pos, 0);
+ u_pos = bkey_next(u_pos);
+ } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+ bkey_copy(out, k);
+ out = bkey_next(out);
+ }
+ }
+
+ sort_iter_add(&sort_iter, u_start, u_pos);
+
+ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ set_btree_bset_end(b, t);
+ bch_bset_set_no_aux_tree(b, t);
+ }
+ }
+
+ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
+
+ BUG_ON((void *) unwritten_whiteouts_start(c, b) <
+ (void *) btree_bkey_last(b, bset_tree_last(b)));
+
+ u64s = btree_node_is_extents(b)
+ ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
+ &sort_iter)
+ : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
+ &sort_iter);
+
+ BUG_ON(u64s > b->whiteout_u64s);
+ BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
+ BUG_ON(u_pos != whiteouts && !u64s);
+
+ if (u64s != b->whiteout_u64s) {
+ void *src = unwritten_whiteouts_start(c, b);
+
+ b->whiteout_u64s = u64s;
+ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
+ }
+
+ verify_no_dups(b,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+
+ btree_bounce_free(c, order, used_mempool, whiteouts);
+
+ if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
+ bch_btree_build_aux_trees(b);
+
+ bch_btree_keys_u64s_remaining(c, b);
+ bch_verify_btree_nr_keys(b);
+
+ return true;
+}
+
+static bool bch_drop_whiteouts(struct btree *b)
+{
+ struct bset_tree *t;
+ bool ret = false;
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k, *n, *out, *start, *end;
+
+		if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
+ continue;
+
+ start = btree_bkey_first(b, t);
+ end = btree_bkey_last(b, t);
+
+ if (bset_unwritten(b, i) &&
+ t != b->set) {
+ struct bset *dst =
+ max_t(struct bset *, write_block(b),
+				      (void *) btree_bkey_last(b, t - 1));
+
+ memmove(dst, i, sizeof(struct bset));
+ i = dst;
+ set_btree_bset(b, t, i);
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
+ n = bkey_next(k);
+
+ if (!bkey_whiteout(k)) {
+ bkey_copy(out, k);
+ out = bkey_next(out);
+ }
+ }
+
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ bch_bset_set_no_aux_tree(b, t);
+ ret = true;
+ }
+
+ bch_verify_btree_nr_keys(b);
+
+ return ret;
+}
+
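+/*
+ * Sort order for non-extent keys when merging bsets: by position, with
+ * whiteouts ordered before live keys at the same position, so that
+ * sort_keys() can coalesce a whiteout with the following key at the same
+ * position and just propagate needs_whiteout instead of emitting both.
+ */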
+static inline int sort_keys_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bkey_cmp_packed(b, l, r) ?:
+ (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+ (int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+static unsigned sort_keys(struct bkey_packed *dst,
+ struct sort_iter *iter,
+ bool filter_whiteouts)
+{
+ const struct bkey_format *f = &iter->b->format;
+ struct bkey_packed *in, *next, *out = dst;
+
+ sort_iter_sort(iter, sort_keys_cmp);
+
+ while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+ if (bkey_whiteout(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
+ continue;
+
+ if (bkey_whiteout(in) &&
+ (next = sort_iter_peek(iter)) &&
+ !bkey_cmp_packed(iter->b, in, next)) {
+ BUG_ON(in->needs_whiteout &&
+ next->needs_whiteout);
+ /*
+ * XXX racy, called with read lock from write path
+ *
+ * leads to spurious BUG_ON() in bkey_unpack_key() in
+ * debug mode
+ */
+ next->needs_whiteout |= in->needs_whiteout;
+ continue;
+ }
+
+ if (bkey_whiteout(in)) {
+ memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
+ set_bkeyp_val_u64s(f, out, 0);
+ } else {
+ bkey_copy(out, in);
+ }
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_extents_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bkey_cmp_packed(b, l, r) ?:
+ (int) bkey_deleted(l) - (int) bkey_deleted(r);
+}
+
+static unsigned sort_extents(struct bkey_packed *dst,
+ struct sort_iter *iter,
+ bool filter_whiteouts)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, sort_extents_cmp);
+
+ while ((in = sort_iter_next(iter, sort_extents_cmp))) {
+ if (bkey_deleted(in))
+ continue;
+
+ if (bkey_whiteout(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
+ continue;
+
+ bkey_copy(out, in);
+ out = bkey_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
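+/*
+ * Merge bsets [start_idx, end_idx) of @b into a single bset, sorting through
+ * a bounce buffer: when sorting the entire node the buffer is node sized and
+ * simply swapped with b->data, otherwise the sorted result is copied back
+ * into the first bset being sorted.
+ */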
+static void btree_node_sort(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter,
+ unsigned start_idx,
+ unsigned end_idx,
+ bool filter_whiteouts)
+{
+ struct btree_node *out;
+ struct sort_iter sort_iter;
+ struct bset_tree *t;
+ struct bset *start_bset = bset(b, &b->set[start_idx]);
+ bool used_mempool = false;
+ u64 start_time;
+ unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
+ bool sorting_entire_node = start_idx == 0 &&
+ end_idx == b->nsets;
+
+ sort_iter_init(&sort_iter, b);
+
+ for (t = b->set + start_idx;
+ t < b->set + end_idx;
+ t++) {
+ u64s += le16_to_cpu(bset(b, t)->u64s);
+ sort_iter_add(&sort_iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ }
+
+ order = sorting_entire_node
+ ? btree_page_order(c)
+ : get_order(__set_bytes(b->data, u64s));
+
+ out = btree_bounce_alloc(c, order, &used_mempool);
+
+ start_time = local_clock();
+
+ if (btree_node_is_extents(b))
+ filter_whiteouts = bset_written(b, start_bset);
+
+ u64s = btree_node_is_extents(b)
+ ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
+ : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+
+ out->keys.u64s = cpu_to_le16(u64s);
+
+ BUG_ON((void *) bset_bkey_last(&out->keys) >
+ (void *) out + (PAGE_SIZE << order));
+
+ if (sorting_entire_node)
+ bch_time_stats_update(&c->btree_sort_time, start_time);
+
+ /* Make sure we preserve bset journal_seq: */
+ for (t = b->set + start_idx + 1;
+ t < b->set + end_idx;
+ t++)
+ start_bset->journal_seq =
+ max(start_bset->journal_seq,
+ bset(b, t)->journal_seq);
+
+ if (sorting_entire_node) {
+ unsigned u64s = le16_to_cpu(out->keys.u64s);
+
+ BUG_ON(order != btree_page_order(c));
+
+ /*
+ * Our temporary buffer is the same size as the btree node's
+ * buffer, we can just swap buffers instead of doing a big
+ * memcpy()
+ */
+ *out = *b->data;
+ out->keys.u64s = cpu_to_le16(u64s);
+ swap(out, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ } else {
+ start_bset->u64s = out->keys.u64s;
+ memcpy_u64s(start_bset->start,
+ out->keys.start,
+ le16_to_cpu(out->keys.u64s));
+ }
+
+ for (i = start_idx + 1; i < end_idx; i++)
+ b->nr.bset_u64s[start_idx] +=
+ b->nr.bset_u64s[i];
+
+ b->nsets -= shift;
+
+ for (i = start_idx + 1; i < b->nsets; i++) {
+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
+ b->set[i] = b->set[i + shift];
+ }
+
+ for (i = b->nsets; i < MAX_BSETS; i++)
+ b->nr.bset_u64s[i] = 0;
+
+ set_btree_bset_end(b, &b->set[start_idx]);
+ bch_bset_set_no_aux_tree(b, &b->set[start_idx]);
+
+ btree_bounce_free(c, order, used_mempool, out);
+
+ bch_verify_btree_nr_keys(b);
+}
+
+/* Sort + repack in a new format: */
+static struct btree_nr_keys sort_repack(struct bset *dst,
+ struct btree *src,
+ struct btree_node_iter *src_iter,
+ struct bkey_format *out_f,
+ bool filter_whiteouts)
+{
+ struct bkey_format *in_f = &src->format;
+ struct bkey_packed *in, *out = bset_bkey_last(dst);
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ while ((in = bch_btree_node_iter_next_all(src_iter, src))) {
+ if (filter_whiteouts && bkey_whiteout(in))
+ continue;
+
+ if (bch_bkey_transform(out_f, out, bkey_packed(in)
+ ? in_f : &bch_bkey_format_current, in))
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bkey_unpack(src, (void *) out, in);
+
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_next(out);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+/* Sort, repack, and merge: */
+static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
+ struct bset *dst,
+ struct btree *src,
+ struct btree_node_iter *iter,
+ struct bkey_format *out_f,
+ bool filter_whiteouts,
+ key_filter_fn filter,
+ key_merge_fn merge)
+{
+ struct bkey_packed *k, *prev = NULL, *out;
+ struct btree_nr_keys nr;
+ BKEY_PADDED(k) tmp;
+
+ memset(&nr, 0, sizeof(nr));
+
+ while ((k = bch_btree_node_iter_next_all(iter, src))) {
+ if (filter_whiteouts && bkey_whiteout(k))
+ continue;
+
+ /*
+ * The filter might modify pointers, so we have to unpack the
+ * key and values to &tmp.k:
+ */
+ bkey_unpack(src, &tmp.k, k);
+
+ if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
+ continue;
+
+ /* prev is always unpacked, for key merging: */
+
+ if (prev &&
+ merge &&
+ merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
+ continue;
+
+ /*
+ * the current key becomes the new prev: advance prev, then
+ * copy the current key - but first pack prev (in place):
+ */
+ if (prev) {
+ bkey_pack(prev, (void *) prev, out_f);
+
+ btree_keys_account_key_add(&nr, 0, prev);
+ prev = bkey_next(prev);
+ } else {
+ prev = bset_bkey_last(dst);
+ }
+
+ bkey_copy(prev, &tmp.k);
+ }
+
+ if (prev) {
+ bkey_pack(prev, (void *) prev, out_f);
+ btree_keys_account_key_add(&nr, 0, prev);
+ out = bkey_next(prev);
+ } else {
+ out = bset_bkey_last(dst);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
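+/*
+ * Copy all live keys from @src into @dst (which must have a single bset),
+ * repacking them into @dst's key format and - for key types that support it -
+ * normalizing and merging adjacent keys as we go.
+ */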
+void bch_btree_sort_into(struct cache_set *c,
+ struct btree *dst,
+ struct btree *src)
+{
+ struct btree_nr_keys nr;
+ struct btree_node_iter src_iter;
+ u64 start_time = local_clock();
+
+ BUG_ON(dst->nsets != 1);
+
+ bch_bset_set_no_aux_tree(dst, dst->set);
+
+ bch_btree_node_iter_init_from_start(&src_iter, src,
+ btree_node_is_extents(src));
+
+ if (btree_node_ops(src)->key_normalize ||
+ btree_node_ops(src)->key_merge)
+ nr = sort_repack_merge(c, btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true,
+ btree_node_ops(src)->key_normalize,
+ btree_node_ops(src)->key_merge);
+ else
+ nr = sort_repack(btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true);
+
+ bch_time_stats_update(&c->btree_sort_time, start_time);
+
+ set_btree_bset_end(dst, dst->set);
+
+ dst->nr.live_u64s += nr.live_u64s;
+ dst->nr.bset_u64s[0] += nr.bset_u64s[0];
+ dst->nr.packed_keys += nr.packed_keys;
+ dst->nr.unpacked_keys += nr.unpacked_keys;
+
+ bch_verify_btree_nr_keys(dst);
+}
+
+#define SORT_CRIT (4096 / sizeof(u64))
+
+/*
+ * We're about to add another bset to the btree node, so if there are
+ * currently too many bsets, sort some of them together:
+ */
+static bool btree_node_compact(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter)
+{
+ unsigned unwritten_idx;
+ bool ret = false;
+
+ for (unwritten_idx = 0;
+ unwritten_idx < b->nsets;
+ unwritten_idx++)
+ if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+ break;
+
+ if (b->nsets - unwritten_idx > 1) {
+ btree_node_sort(c, b, iter, unwritten_idx,
+ b->nsets, false);
+ ret = true;
+ }
+
+ if (unwritten_idx > 1) {
+ btree_node_sort(c, b, iter, 0, unwritten_idx, false);
+ ret = true;
+ }
+
+ return ret;
+}
+
+void bch_btree_build_aux_trees(struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ bch_bset_build_aux_tree(b, t,
+ bset_unwritten(b, bset(b, t)) &&
+ t == bset_tree_last(b));
+}
+
+/*
+ * bch_btree_init_next - initialize a new (unwritten) bset that can then be
+ * inserted into
+ *
+ * Safe to call if there already is an unwritten bset - will only add a new bset
+ * if @b doesn't already have one.
+ *
+ * If we sorted (i.e. invalidated iterators), @iter is reinitialized.
+ */
+void bch_btree_init_next(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter)
+{
+ struct btree_node_entry *bne;
+ bool did_sort;
+
+ EBUG_ON(!(b->lock.state.seq & 1));
+ EBUG_ON(iter && iter->nodes[b->level] != b);
+
+ did_sort = btree_node_compact(c, b, iter);
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch_bset_init_next(b, &bne->keys);
+
+ bch_btree_build_aux_trees(b);
+
+ if (iter && did_sort)
+ bch_btree_iter_reinit_node(iter, b);
+}
+
+/*
+ * We seed the checksum with the entire first pointer (dev, gen and offset),
+ * since for btree nodes we have to store the checksum with the data instead of
+ * the pointer - this helps guard against reading a valid btree node that is not
+ * the node we actually wanted:
+ */
+#define btree_csum_set(_b, _i) \
+({ \
+ void *_data = (void *) (_i) + 8; \
+ void *_end = bset_bkey_last(&(_i)->keys); \
+ \
+ bch_checksum_update(BSET_CSUM_TYPE(&(_i)->keys), \
+ bkey_i_to_extent_c(&(_b)->key)->v._data[0], \
+ _data, \
+ _end - _data) ^ 0xffffffffffffffffULL; \
+})
+
+#define btree_node_error(b, c, ptr, fmt, ...) \
+ cache_set_inconsistent(c, \
+ "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
+ (b)->btree_id, (b)->level, btree_node_root(c, b) \
+ ? btree_node_root(c, b)->level : -1, \
+ PTR_BUCKET_NR(ca, ptr), (b)->written, \
+ (i)->u64s, ##__VA_ARGS__)
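+/*
+ * Note: in addition to its named arguments, btree_node_error() relies on
+ * 'ca' and 'i' being in scope at the call site.
+ */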
+
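+/*
+ * Validate a single bset as it's read in: keys that are zero length or extend
+ * past the end of the bset cause the bset to be truncated, invalid keys are
+ * dropped, endianness is fixed up if needed, and *whiteout_u64s is set to the
+ * size of the leading whiteout region when the bset was written with separate
+ * whiteouts.
+ */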
+static const char *validate_bset(struct cache_set *c, struct btree *b,
+ struct cache *ca,
+ const struct bch_extent_ptr *ptr,
+ struct bset *i, unsigned sectors,
+ unsigned *whiteout_u64s)
+{
+ struct bkey_packed *k, *prev = NULL;
+ bool seen_non_whiteout = false;
+
+ if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
+ return "unsupported bset version";
+
+ if (b->written + sectors > c->sb.btree_node_size)
+ return "bset past end of btree node";
+
+ if (i != &b->data->keys && !i->u64s)
+ btree_node_error(b, c, ptr, "empty set");
+
+ if (!BSET_SEPARATE_WHITEOUTS(i)) {
+ seen_non_whiteout = true;
+		*whiteout_u64s = 0;
+ }
+
+ for (k = i->start;
+ k != bset_bkey_last(i);) {
+ struct bkey_s_c u;
+ struct bkey tmp;
+ const char *invalid;
+
+ if (!k->u64s) {
+ btree_node_error(b, c, ptr,
+ "KEY_U64s 0: %zu bytes of metadata lost",
+ (void *) bset_bkey_last(i) - (void *) k);
+
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
+ break;
+ }
+
+ if (bkey_next(k) > bset_bkey_last(i)) {
+ btree_node_error(b, c, ptr,
+ "key extends past end of bset");
+
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
+ break;
+ }
+
+ if (k->format > KEY_FORMAT_CURRENT) {
+ btree_node_error(b, c, ptr,
+ "invalid bkey format %u", k->format);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) bset_bkey_last(i) - (u64 *) k);
+ continue;
+ }
+
+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
+ bch_bkey_swab(btree_node_type(b), &b->format, k);
+
+ u = bkey_disassemble(b, k, &tmp);
+
+ invalid = btree_bkey_invalid(c, b, u);
+ if (invalid) {
+ char buf[160];
+
+ bch_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), u);
+ btree_node_error(b, c, ptr,
+ "invalid bkey %s: %s", buf, invalid);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) bset_bkey_last(i) - (u64 *) k);
+ continue;
+ }
+
+ /*
+ * with the separate whiteouts thing (used for extents), the
+ * second set of keys actually can have whiteouts too, so we
+ * can't solely go off bkey_whiteout()...
+ */
+
+ if (!seen_non_whiteout &&
+ (!bkey_whiteout(k) ||
+ (prev && bkey_cmp_left_packed_byval(b, prev,
+ bkey_start_pos(u.k)) > 0))) {
+ *whiteout_u64s = k->_data - i->_data;
+ seen_non_whiteout = true;
+ }
+
+ prev = k;
+ k = bkey_next(k);
+ }
+
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ b->written += sectors;
+ return NULL;
+}
+
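+/*
+ * Process a freshly read btree node: walk the bsets on disk, verifying
+ * checksums and validating keys, push each one onto a node iterator, then
+ * merge them all (fixing any overlapping extents) into a single sorted bset
+ * in the node's buffer.
+ */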
+void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
+ struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ struct btree_node_entry *bne;
+ struct bset *i = &b->data->keys;
+ struct btree_node_iter *iter;
+ struct btree_node *sorted;
+ bool used_mempool;
+ unsigned u64s;
+ const char *err;
+ int ret;
+
+ iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
+ __bch_btree_node_iter_init(iter, btree_node_is_extents(b));
+
+ err = "dynamic fault";
+ if (bch_meta_read_fault("btree"))
+ goto err;
+
+ while (b->written < c->sb.btree_node_size) {
+ unsigned sectors, whiteout_u64s = 0;
+
+ if (!b->written) {
+ i = &b->data->keys;
+
+ err = "unknown checksum type";
+ if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ goto err;
+
+ /* XXX: retry checksum errors */
+
+ err = "bad checksum";
+ if (le64_to_cpu(b->data->csum) !=
+ btree_csum_set(b, b->data))
+ goto err;
+
+ sectors = __set_blocks(b->data,
+ le16_to_cpu(b->data->keys.u64s),
+ block_bytes(c)) << c->block_bits;
+
+ err = "bad magic";
+ if (le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb))
+ goto err;
+
+ err = "bad btree header";
+ if (!b->data->keys.seq)
+ goto err;
+
+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
+ bch_bpos_swab(&b->data->min_key);
+ bch_bpos_swab(&b->data->max_key);
+ }
+
+ err = "incorrect max key";
+ if (bkey_cmp(b->data->max_key, b->key.k.p))
+ goto err;
+
+ err = "incorrect level";
+ if (BSET_BTREE_LEVEL(i) != b->level)
+ goto err;
+
+ err = bch_bkey_format_validate(&b->data->format);
+ if (err)
+ goto err;
+
+ set_btree_bset(b, b->set, &b->data->keys);
+
+ btree_node_set_format(b, b->data->format);
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
+
+ if (i->seq != b->data->keys.seq)
+ break;
+
+ err = "unknown checksum type";
+ if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ goto err;
+
+ err = "bad checksum";
+ if (le64_to_cpu(bne->csum) !=
+ btree_csum_set(b, bne))
+ goto err;
+
+ sectors = __set_blocks(bne,
+ le16_to_cpu(bne->keys.u64s),
+ block_bytes(c)) << c->block_bits;
+ }
+
+ err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
+ if (err)
+ goto err;
+
+ err = "insufficient memory";
+ ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
+ if (ret < 0)
+ goto err;
+
+ if (ret)
+ continue;
+
+ __bch_btree_node_iter_push(iter, b,
+ i->start,
+ bkey_idx(i, whiteout_u64s));
+
+ __bch_btree_node_iter_push(iter, b,
+ bkey_idx(i, whiteout_u64s),
+ bset_bkey_last(i));
+ }
+
+ err = "corrupted btree";
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_bytes(c);
+ bne = (void *) bne + block_bytes(c))
+ if (bne->keys.seq == b->data->keys.seq)
+ goto err;
+
+ sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
+ sorted->keys.u64s = 0;
+
+ b->nr = btree_node_is_extents(b)
+ ? bch_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
+ : bch_key_sort_fix_overlapping(&sorted->keys, b, iter);
+
+ u64s = le16_to_cpu(sorted->keys.u64s);
+ *sorted = *b->data;
+ sorted->keys.u64s = cpu_to_le16(u64s);
+ swap(sorted, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ b->nsets = 1;
+
+ BUG_ON(b->nr.live_u64s != u64s);
+
+ btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
+
+ bch_bset_build_aux_tree(b, b->set, false);
+
+ set_needs_whiteout(btree_bset_first(b));
+
+ btree_node_reset_sib_u64s(b);
+out:
+ mempool_free(iter, &c->fill_iter);
+ return;
+err:
+ set_btree_node_read_error(b);
+ btree_node_error(b, c, ptr, "%s", err);
+ goto out;
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+ closure_put(bio->bi_private);
+}
+
+void bch_btree_node_read(struct cache_set *c, struct btree *b)
+{
+ uint64_t start_time = local_clock();
+ struct closure cl;
+ struct bio *bio;
+ struct extent_pick_ptr pick;
+
+ trace_bcache_btree_read(c, b);
+
+ closure_init_stack(&cl);
+
+ pick = bch_btree_pick_ptr(c, b);
+ if (cache_set_fatal_err_on(!pick.ca, c,
+ "no cache device for btree node")) {
+ set_btree_node_read_error(b);
+ return;
+ }
+
+ bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ bio->bi_bdev = pick.ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = &cl;
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
+
+ bch_bio_map(bio, b->data);
+
+ closure_get(&cl);
+ bch_generic_make_request(bio, c);
+ closure_sync(&cl);
+
+ if (cache_fatal_io_err_on(bio->bi_error,
+ pick.ca, "IO error reading bucket %zu",
+ PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
+ bch_meta_read_fault("btree")) {
+ set_btree_node_read_error(b);
+ goto out;
+ }
+
+ bch_btree_node_read_done(c, b, pick.ca, &pick.ptr);
+ bch_time_stats_update(&c->btree_read_time, start_time);
+out:
+ bio_put(bio);
+ percpu_ref_put(&pick.ca->ref);
+}
+
+int bch_btree_root_read(struct cache_set *c, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+{
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = mca_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+
+ b = mca_alloc(c);
+ mca_cannibalize_unlock(c);
+
+ BUG_ON(IS_ERR(b));
+
+ bkey_copy(&b->key, k);
+ BUG_ON(mca_hash_insert(c, b, level, id));
+
+ bch_btree_node_read(c, b);
+ six_unlock_write(&b->lock);
+
+ if (btree_node_read_error(b)) {
+ six_unlock_intent(&b->lock);
+ return -EIO;
+ }
+
+ bch_btree_set_root_initial(c, b, NULL);
+ six_unlock_intent(&b->lock);
+
+ return 0;
+}
+
+void bch_btree_complete_write(struct cache_set *c, struct btree *b,
+ struct btree_write *w)
+{
+ bch_journal_pin_drop(&c->journal, &w->journal);
+ closure_wake_up(&w->wait);
+}
+
+static void btree_node_write_done(struct cache_set *c, struct btree *b)
+{
+ struct btree_write *w = btree_prev_write(b);
+
+ /*
+ * Before calling bch_btree_complete_write() - if the write errored, we
+ * have to halt new journal writes before they see this btree node
+ * write as completed:
+ */
+ if (btree_node_write_error(b))
+ bch_journal_halt(&c->journal);
+
+ bch_btree_complete_write(c, b, w);
+ btree_node_io_unlock(b);
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+ struct btree *b = bio->bi_private;
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct cache_set *c = wbio->c;
+ struct bio *orig = wbio->split ? wbio->orig : NULL;
+ struct closure *cl = !wbio->split ? wbio->cl : NULL;
+ struct cache *ca = wbio->ca;
+
+ if (cache_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
+ bch_meta_write_fault("btree"))
+ set_btree_node_write_error(b);
+
+ if (wbio->bounce)
+ btree_bounce_free(c,
+ wbio->order,
+ wbio->used_mempool,
+ page_address(bio->bi_io_vec[0].bv_page));
+
+ if (wbio->put_bio)
+ bio_put(bio);
+
+ if (orig) {
+ bio_endio(orig);
+ } else {
+ btree_node_write_done(c, b);
+ if (cl)
+ closure_put(cl);
+ }
+
+ if (ca)
+ percpu_ref_put(&ca->ref);
+}
+
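+/*
+ * Write out the unwritten part of a btree node: whiteouts are compacted
+ * first, then the unwritten bsets are merge sorted through a bounce buffer
+ * into a single checksummed bset entry, which is appended on disk after
+ * whatever has already been written (b->written).
+ */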
+void __bch_btree_node_write(struct cache_set *c, struct btree *b,
+ struct closure *parent,
+ enum six_lock_type lock_type_held,
+ int idx_to_write)
+{
+ struct bio *bio;
+ struct bch_write_bio *wbio;
+ struct bset_tree *t;
+ struct bset *i;
+ struct btree_node *bn = NULL;
+ struct btree_node_entry *bne = NULL;
+ BKEY_PADDED(key) k;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ struct sort_iter sort_iter;
+ unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
+ u64 seq = 0;
+ bool used_mempool;
+ unsigned long old, new;
+ void *data;
+
+ /*
+ * We may only have a read lock on the btree node - the dirty bit is our
+ * "lock" against racing with other threads that may be trying to start
+ * a write, we do a write iff we clear the dirty bit. Since setting the
+ * dirty bit requires a write lock, we can't race with other threads
+ * redirtying it:
+ */
+ do {
+ old = new = READ_ONCE(b->flags);
+
+ if (!(old & (1 << BTREE_NODE_dirty)))
+ return;
+
+ if (idx_to_write >= 0 &&
+ idx_to_write != !!(old & (1 << BTREE_NODE_write_idx)))
+ return;
+
+ if (old & (1 << BTREE_NODE_write_in_flight)) {
+ wait_on_bit_io(&b->flags,
+ BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+ continue;
+ }
+
+ new &= ~(1 << BTREE_NODE_dirty);
+ new |= (1 << BTREE_NODE_write_in_flight);
+ new |= (1 << BTREE_NODE_just_written);
+ new ^= (1 << BTREE_NODE_write_idx);
+ } while (cmpxchg_acquire(&b->flags, old, new) != old);
+
+ BUG_ON(!list_empty(&b->write_blocked));
+
+ BUG_ON(b->written >= c->sb.btree_node_size);
+ BUG_ON(bset_written(b, btree_bset_last(b)));
+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb));
+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+
+ if (lock_type_held == SIX_LOCK_intent) {
+ six_lock_write(&b->lock);
+ __bch_compact_whiteouts(c, b, COMPACT_WRITTEN);
+ six_unlock_write(&b->lock);
+ } else {
+ __bch_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
+ }
+
+ BUG_ON(b->uncompacted_whiteout_u64s);
+
+ sort_iter_init(&sort_iter, b);
+
+ bytes = !b->written
+ ? sizeof(struct btree_node)
+ : sizeof(struct btree_node_entry);
+
+ bytes += b->whiteout_u64s * sizeof(u64);
+
+ for_each_bset(b, t) {
+ i = bset(b, t);
+
+ if (bset_written(b, i))
+ continue;
+
+ bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+ sort_iter_add(&sort_iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ seq = max(seq, le64_to_cpu(i->journal_seq));
+ }
+
+ order = get_order(bytes);
+ data = btree_bounce_alloc(c, order, &used_mempool);
+
+ if (!b->written) {
+ bn = data;
+ *bn = *b->data;
+ i = &bn->keys;
+ } else {
+ bne = data;
+ bne->keys = b->data->keys;
+ i = &bne->keys;
+ }
+
+ i->journal_seq = cpu_to_le64(seq);
+ i->u64s = 0;
+
+ if (!btree_node_is_extents(b)) {
+ sort_iter_add(&sort_iter,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
+ } else {
+ memcpy_u64s(i->start,
+ unwritten_whiteouts_start(c, b),
+ b->whiteout_u64s);
+ i->u64s = cpu_to_le16(b->whiteout_u64s);
+ SET_BSET_SEPARATE_WHITEOUTS(i, true);
+ }
+
+ b->whiteout_u64s = 0;
+
+ u64s = btree_node_is_extents(b)
+ ? sort_extents(bset_bkey_last(i), &sort_iter, false)
+ : sort_keys(i->start, &sort_iter, false);
+ le16_add_cpu(&i->u64s, u64s);
+
+ clear_needs_whiteout(i);
+
+ if (b->written && !i->u64s) {
+ /* Nothing to write: */
+ btree_bounce_free(c, order, used_mempool, data);
+ btree_node_write_done(c, b);
+ return;
+ }
+
+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+ BUG_ON(i->seq != b->data->keys.seq);
+
+ i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+ SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+
+ if (bn)
+ bn->csum = cpu_to_le64(btree_csum_set(b, bn));
+ else
+ bne->csum = cpu_to_le64(btree_csum_set(b, bne));
+
+ bytes_to_write = (void *) bset_bkey_last(i) - data;
+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+ memset(data + bytes_to_write, 0,
+ (sectors_to_write << 9) - bytes_to_write);
+
+ BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
+
+ trace_bcache_btree_write(b, bytes_to_write, sectors_to_write);
+
+ /*
+ * We handle btree write errors by immediately halting the journal -
+ * after we've done that, we can't issue any subsequent btree writes
+ * because they might have pointers to new nodes that failed to write.
+ *
+ * Furthermore, there's no point in doing any more btree writes because
+ * with the journal stopped, we're never going to update the journal to
+ * reflect that those writes were done and the data flushed from the
+ * journal:
+ *
+ * Make sure to update b->written so bch_btree_init_next() doesn't
+ * break:
+ */
+ if (bch_journal_error(&c->journal)) {
+ set_btree_node_write_error(b);
+ b->written += sectors_to_write;
+
+ btree_bounce_free(c, order, used_mempool, data);
+ btree_node_write_done(c, b);
+ return;
+ }
+
+ bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
+
+ wbio = to_wbio(bio);
+ wbio->cl = parent;
+ wbio->bounce = true;
+ wbio->put_bio = true;
+ wbio->order = order;
+ wbio->used_mempool = used_mempool;
+ bio->bi_iter.bi_size = sectors_to_write << 9;
+ bio->bi_end_io = btree_node_write_endio;
+ bio->bi_private = b;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
+
+ if (parent)
+ closure_get(parent);
+
+ bch_bio_map(bio, data);
+
+ /*
+ * If we're appending to a leaf node, we don't technically need FUA -
+ * this write just needs to be persisted before the next journal write,
+ * which will be marked FLUSH|FUA.
+ *
+ * Similarly if we're writing a new btree root - the pointer is going to
+ * be in the next journal entry.
+ *
+ * But if we're writing a new btree node (that isn't a root) or
+ * appending to a non leaf btree node, we need either FUA or a flush
+ * when we write the parent with the new pointer. FUA is cheaper than a
+ * flush, and writes appending to leaf nodes aren't blocking anything so
+ * just make all btree node writes FUA to keep things sane.
+ */
+
+ bkey_copy(&k.key, &b->key);
+ e = bkey_i_to_s_extent(&k.key);
+
+ extent_for_each_ptr(e, ptr)
+ ptr->offset += b->written;
+
+ rcu_read_lock();
+ extent_for_each_online_device(c, e, ptr, ca)
+ atomic64_add(sectors_to_write, &ca->btree_sectors_written);
+ rcu_read_unlock();
+
+ b->written += sectors_to_write;
+
+ bch_submit_wbio_replicas(wbio, c, &k.key, true);
+}
+
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b)
+{
+ bool invalidated_iter = false;
+ struct btree_node_entry *bne;
+ struct bset_tree *t;
+
+ if (!btree_node_just_written(b))
+ return false;
+
+ BUG_ON(b->whiteout_u64s);
+ BUG_ON(b->uncompacted_whiteout_u64s);
+
+ clear_btree_node_just_written(b);
+
+ /*
+ * Note: immediately after write, bset_unwritten()/bset_written() don't
+ * work - the amount of data we had to write after compaction might have
+ * been smaller than the offset of the last bset.
+ *
+ * However, we know that all bsets have been written here, as long as
+ * we're still holding the write lock:
+ */
+
+ /*
+ * XXX: decide if we really want to unconditionally sort down to a
+ * single bset:
+ */
+ if (b->nsets > 1) {
+ btree_node_sort(c, b, NULL, 0, b->nsets, true);
+ invalidated_iter = true;
+ } else {
+ invalidated_iter = bch_drop_whiteouts(b);
+ }
+
+ for_each_bset(b, t)
+ set_needs_whiteout(bset(b, t));
+
+ bch_btree_verify(c, b);
+
+ /*
+ * If later we don't unconditionally sort down to a single bset, we have
+ * to ensure this is still true:
+ */
+ BUG_ON((void *) bset_bkey_last(btree_bset_last(b)) > write_block(b));
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch_bset_init_next(b, &bne->keys);
+
+ bch_btree_build_aux_trees(b);
+
+ return invalidated_iter;
+}
+
+/*
+ * Use this one if the node is intent locked:
+ */
+void bch_btree_node_write(struct cache_set *c, struct btree *b,
+ struct closure *parent,
+ enum six_lock_type lock_type_held,
+ int idx_to_write)
+{
+ BUG_ON(lock_type_held == SIX_LOCK_write);
+
+ if (lock_type_held == SIX_LOCK_intent ||
+ six_trylock_convert(&b->lock, SIX_LOCK_read,
+ SIX_LOCK_intent)) {
+ __bch_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
+
+ six_lock_write(&b->lock);
+ bch_btree_post_write_cleanup(c, b);
+ six_unlock_write(&b->lock);
+
+ if (lock_type_held == SIX_LOCK_read)
+ six_lock_downgrade(&b->lock);
+ } else {
+ __bch_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
+ }
+}
+
+static void bch_btree_node_write_dirty(struct cache_set *c, struct btree *b,
+ struct closure *parent)
+{
+ six_lock_read(&b->lock);
+ BUG_ON(b->level);
+
+ bch_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
+ six_unlock_read(&b->lock);
+}
+
+/*
+ * Write all dirty btree nodes to disk, including roots
+ */
+void bch_btree_flush(struct cache_set *c)
+{
+ struct closure cl;
+ struct btree *b;
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ bool dropped_lock;
+ unsigned i;
+
+ closure_init_stack(&cl);
+
+ rcu_read_lock();
+
+ do {
+ dropped_lock = false;
+ i = 0;
+restart:
+ tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
+ &c->btree_cache_table);
+
+ for (; i < tbl->size; i++)
+ rht_for_each_entry_rcu(b, pos, tbl, i, hash)
+ /*
+ * XXX - locking for b->level, when called from
+ * bch_journal_move()
+ */
+ if (!b->level && btree_node_dirty(b)) {
+ rcu_read_unlock();
+ bch_btree_node_write_dirty(c, b, &cl);
+ dropped_lock = true;
+ rcu_read_lock();
+ goto restart;
+ }
+ } while (dropped_lock);
+
+ rcu_read_unlock();
+
+ closure_sync(&cl);
+}
+
+/**
+ * bch_btree_node_flush_journal_entries - flush any journal entries that
+ * contain keys from this node
+ *
+ * The bset's journal sequence number is used for preserving ordering of index
+ * updates across unclean shutdowns - it's used to ignore bsets newer than the
+ * most recent journal entry.
+ *
+ * But when rewriting btree nodes we compact all the bsets in a btree node - and
+ * if we compacted a bset that should be ignored with bsets we do need, that
+ * would be bad. So to avoid that, prior to making the new node visible, ensure
+ * that the journal has been flushed so that all the bsets we compacted are
+ * visible.
+ */
+void bch_btree_node_flush_journal_entries(struct cache_set *c,
+ struct btree *b,
+ struct closure *cl)
+{
+ int i = b->nsets;
+
+ /*
+ * Journal sequence numbers in the different bsets will always be in
+ * ascending order, we only need to flush the highest - except that the
+ * most recent bset might not have a journal sequence number yet, so we
+ * need to loop:
+ */
+ while (i--) {
+ u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
+
+ if (seq) {
+ bch_journal_flush_seq_async(&c->journal, seq, cl);
+ break;
+ }
+ }
+}
diff --git a/libbcache/btree_io.h b/libbcache/btree_io.h
new file mode 100644
index 0000000..866cc6c
--- /dev/null
+++ b/libbcache/btree_io.h
@@ -0,0 +1,73 @@
+#ifndef _BCACHE_BTREE_IO_H
+#define _BCACHE_BTREE_IO_H
+
+struct cache_set;
+struct btree_write;
+struct btree;
+struct btree_iter;
+
+static inline void btree_node_io_unlock(struct btree *b)
+{
+ EBUG_ON(!btree_node_write_in_flight(b));
+ clear_btree_node_write_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static inline void btree_node_io_lock(struct btree *b)
+{
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+enum compact_mode {
+ COMPACT_LAZY,
+ COMPACT_WRITTEN,
+ COMPACT_WRITTEN_NO_WRITE_LOCK,
+};
+
+bool __bch_compact_whiteouts(struct cache_set *, struct btree *, enum compact_mode);
+
+static inline bool bch_maybe_compact_whiteouts(struct cache_set *c, struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ unsigned live_u64s = b->nr.bset_u64s[t - b->set];
+ unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
+
+ if (live_u64s * 4 < bset_u64s * 3)
+ goto compact;
+ }
+
+ return false;
+compact:
+ return __bch_compact_whiteouts(c, b, COMPACT_LAZY);
+}
+
+void bch_btree_sort_into(struct cache_set *, struct btree *, struct btree *);
+
+void bch_btree_build_aux_trees(struct btree *);
+void bch_btree_init_next(struct cache_set *, struct btree *,
+ struct btree_iter *);
+
+void bch_btree_node_read_done(struct cache_set *, struct btree *,
+ struct cache *, const struct bch_extent_ptr *);
+void bch_btree_node_read(struct cache_set *, struct btree *);
+int bch_btree_root_read(struct cache_set *, enum btree_id,
+ const struct bkey_i *, unsigned);
+
+void bch_btree_complete_write(struct cache_set *, struct btree *,
+ struct btree_write *);
+
+void __bch_btree_node_write(struct cache_set *, struct btree *,
+ struct closure *, enum six_lock_type, int);
+bool bch_btree_post_write_cleanup(struct cache_set *, struct btree *);
+
+void bch_btree_node_write(struct cache_set *, struct btree *,
+ struct closure *, enum six_lock_type, int);
+
+void bch_btree_flush(struct cache_set *);
+void bch_btree_node_flush_journal_entries(struct cache_set *, struct btree *,
+ struct closure *);
+
+#endif /* _BCACHE_BTREE_IO_H */
diff --git a/libbcache/btree_iter.c b/libbcache/btree_iter.c
new file mode 100644
index 0000000..a9859e3
--- /dev/null
+++ b/libbcache/btree_iter.c
@@ -0,0 +1,1150 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <trace/events/bcache.h>
+
+#define BTREE_ITER_NOT_END ((struct btree *) 1)
+
+static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+{
+ return iter->nodes[l] && iter->nodes[l] != BTREE_ITER_NOT_END;
+}
+
+/* Btree node locking: */
+
+/*
+ * Updates the saved lock sequence number, so that btree_node_relock() will
+ * succeed:
+ */
+void btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ EBUG_ON(iter->nodes[b->level] != b);
+ EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
+
+ for_each_linked_btree_node(iter, b, linked)
+ linked->lock_seq[b->level] += 2;
+
+ iter->lock_seq[b->level] += 2;
+
+ six_unlock_write(&b->lock);
+}
+
+void btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+ unsigned readers = 0;
+
+ EBUG_ON(iter->nodes[b->level] != b);
+ EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
+
+ if (six_trylock_write(&b->lock))
+ return;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[b->level] == b &&
+ btree_node_read_locked(linked, b->level))
+ readers++;
+
+ if (likely(!readers)) {
+ six_lock_write(&b->lock);
+ } else {
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ atomic64_sub(__SIX_VAL(read_lock, readers),
+ &b->lock.state.counter);
+ six_lock_write(&b->lock);
+ atomic64_add(__SIX_VAL(read_lock, readers),
+ &b->lock.state.counter);
+ }
+}
+
+/* versions that allow iter to be null: */
+void __btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+{
+ if (likely(iter))
+ btree_node_unlock_write(b, iter);
+ else
+ six_unlock_write(&b->lock);
+}
+
+void __btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+ if (likely(iter))
+ btree_node_lock_write(b, iter);
+ else
+ six_lock_write(&b->lock);
+}
+
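+/*
+ * Try to re-take a lock we previously dropped: either relock via the saved
+ * lock sequence number (which fails if the node has since been modified),
+ * or borrow the lock from a linked iterator that still holds it.
+ */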
+bool btree_node_relock(struct btree_iter *iter, unsigned level)
+{
+ struct btree_iter *linked;
+ struct btree *b = iter->nodes[level];
+ enum btree_node_locked_type want = btree_lock_want(iter, level);
+ enum btree_node_locked_type have = btree_node_locked_type(iter, level);
+
+ if (want == have)
+ return true;
+
+ if (!is_btree_node(iter, level))
+ return false;
+
+ if (race_fault())
+ return false;
+
+ if (have != BTREE_NODE_UNLOCKED
+ ? six_trylock_convert(&b->lock, have, want)
+ : six_relock_type(&b->lock, want, iter->lock_seq[level]))
+ goto success;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b &&
+ btree_node_locked_type(linked, level) == want &&
+ iter->lock_seq[level] == b->lock.state.seq) {
+ btree_node_unlock(iter, level);
+ six_lock_increment(&b->lock, want);
+ goto success;
+ }
+
+ return false;
+success:
+ mark_btree_node_unlocked(iter, level);
+ mark_btree_node_locked(iter, level, want);
+ return true;
+}
+
+/* Slowpath: */
+bool __bch_btree_node_lock(struct btree *b, struct bpos pos,
+ unsigned level,
+ struct btree_iter *iter,
+ enum six_lock_type type)
+{
+ struct btree_iter *linked;
+
+ /* Can't have children locked before ancestors: */
+ EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
+
+ /*
+ * Can't hold any read locks while we block taking an intent lock - see
+ * below for reasoning, and we should have already dropped any read
+ * locks in the current iterator
+ */
+ EBUG_ON(type == SIX_LOCK_intent &&
+ iter->nodes_locked != iter->nodes_intent_locked);
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b &&
+ btree_node_locked_type(linked, level) == type) {
+ six_lock_increment(&b->lock, type);
+ return true;
+ }
+
+ /*
+ * Must lock btree nodes in key order - this case happens when locking
+ * the prev sibling in btree node merging:
+ */
+ if (iter->nodes_locked &&
+ __ffs(iter->nodes_locked) == level &&
+ __btree_iter_cmp(iter->btree_id, pos, iter))
+ return false;
+
+ for_each_linked_btree_iter(iter, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ /*
+ * Can't block taking an intent lock if we have _any_ nodes read
+ * locked:
+ *
+ * - Our read lock blocks another thread with an intent lock on
+ * the same node from getting a write lock, and thus from
+ * dropping its intent lock
+ *
+ * - And the other thread may have multiple nodes intent locked:
+ * both the node we want to intent lock, and the node we
+ * already have read locked - deadlock:
+ */
+ if (type == SIX_LOCK_intent &&
+ linked->nodes_locked != linked->nodes_intent_locked) {
+ linked->locks_want = max(linked->locks_want,
+ iter->locks_want);
+ return false;
+ }
+
+ /* We have to lock btree nodes in key order: */
+ if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
+ return false;
+
+ /*
+ * Interior nodes must be locked before their descendants: if
+ * another iterator has possible descendants locked of the node
+ * we're about to lock, it must have the ancestors locked too:
+ */
+ if (linked->btree_id == iter->btree_id &&
+ level > __fls(linked->nodes_locked)) {
+ linked->locks_want = max(linked->locks_want,
+ iter->locks_want);
+ return false;
+ }
+ }
+
+ six_lock_type(&b->lock, type);
+ return true;
+}
+
+/* Btree iterator locking: */
+
+static void btree_iter_drop_extra_locks(struct btree_iter *iter)
+{
+ unsigned l;
+
+ while (iter->nodes_locked &&
+ (l = __fls(iter->nodes_locked)) > iter->locks_want) {
+ if (!btree_node_locked(iter, l))
+ panic("l %u nodes_locked %u\n", l, iter->nodes_locked);
+
+ if (l > iter->level) {
+ btree_node_unlock(iter, l);
+ } else if (btree_node_intent_locked(iter, l)) {
+ six_lock_downgrade(&iter->nodes[l]->lock);
+ iter->nodes_intent_locked ^= 1 << l;
+ }
+ }
+}
+
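+/*
+ * Change how many levels of the btree this iterator should hold intent locks
+ * on, dropping locks (on this and any linked iterators) that are no longer
+ * wanted and relocking the ones that are; returns false if the additional
+ * locks couldn't be taken.
+ */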
+bool __bch_btree_iter_set_locks_want(struct btree_iter *iter,
+ unsigned new_locks_want)
+{
+ struct btree_iter *linked;
+ unsigned l;
+
+ /* Drop locks we don't want anymore: */
+ if (new_locks_want < iter->locks_want)
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->locks_want > new_locks_want) {
+ linked->locks_want = max_t(unsigned, 1,
+ new_locks_want);
+ btree_iter_drop_extra_locks(linked);
+ }
+
+ iter->locks_want = new_locks_want;
+ btree_iter_drop_extra_locks(iter);
+
+ for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+ if (!btree_node_relock(iter, l))
+ goto fail;
+
+ return true;
+fail:
+ /*
+ * Just an optimization: ancestor nodes must be locked before child
+ * nodes, so set locks_want on iterators that might lock ancestors
+ * before us to avoid getting -EINTR later:
+ */
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->btree_id == iter->btree_id &&
+ btree_iter_cmp(linked, iter) <= 0)
+ linked->locks_want = max_t(unsigned, linked->locks_want,
+ new_locks_want);
+ return false;
+}
+
+static int __bch_btree_iter_unlock(struct btree_iter *iter)
+{
+ BUG_ON(iter->error == -EINTR);
+
+ while (iter->nodes_locked)
+ btree_node_unlock(iter, __ffs(iter->nodes_locked));
+
+ return iter->error;
+}
+
+int bch_btree_iter_unlock(struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ for_each_linked_btree_iter(iter, linked)
+ __bch_btree_iter_unlock(linked);
+ return __bch_btree_iter_unlock(iter);
+}
+
+/* Btree iterator: */
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static void __bch_btree_iter_verify(struct btree_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter *node_iter = &iter->node_iters[b->level];
+ struct btree_node_iter tmp = *node_iter;
+ struct bkey_packed *k;
+
+ bch_btree_node_iter_verify(node_iter, b);
+
+ /*
+ * For interior nodes, the iterator will have skipped past
+ * deleted keys:
+ */
+ k = b->level
+ ? bch_btree_node_iter_prev(&tmp, b)
+ : bch_btree_node_iter_prev_all(&tmp, b);
+ if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
+ iter->is_extents)) {
+ char buf[100];
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+		panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
+ buf, iter->pos.inode, iter->pos.offset);
+ }
+
+ k = bch_btree_node_iter_peek_all(node_iter, b);
+ if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
+ iter->is_extents)) {
+ char buf[100];
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+		panic("next key should not be before iter pos:\n%llu:%llu\n%s\n",
+ iter->pos.inode, iter->pos.offset, buf);
+ }
+}
+
+void bch_btree_iter_verify(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+
+ if (iter->nodes[b->level] == b)
+ __bch_btree_iter_verify(iter, b);
+
+ for_each_linked_btree_node(iter, b, linked)
+ __bch_btree_iter_verify(iter, b);
+}
+
+#endif
+
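+/*
+ * After keys have been inserted or overwritten in @b - shifting the bset by
+ * new_u64s - clobber_u64s u64s at @where - adjust @node_iter's offsets so it
+ * still points at the same logical position.
+ */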
+static void __bch_btree_node_iter_fix(struct btree_iter *iter,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ const struct bkey_packed *end = btree_bkey_last(b, t);
+ struct btree_node_iter_set *set;
+ unsigned offset = __btree_node_key_to_offset(b, where);
+ int shift = new_u64s - clobber_u64s;
+ unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
+
+ btree_node_iter_for_each(node_iter, set)
+ if (set->end == old_end)
+ goto found;
+
+	/* didn't find the bset in the iterator - might have to re-add it: */
+ if (new_u64s &&
+ btree_iter_pos_cmp_packed(b, &iter->pos, where,
+ iter->is_extents))
+ bch_btree_node_iter_push(node_iter, b, where, end);
+ return;
+found:
+ set->end = (int) set->end + shift;
+
+ /* Iterator hasn't gotten to the key that changed yet: */
+ if (set->k < offset)
+ return;
+
+ if (new_u64s &&
+ btree_iter_pos_cmp_packed(b, &iter->pos, where,
+ iter->is_extents)) {
+ set->k = offset;
+ bch_btree_node_iter_sort(node_iter, b);
+ } else if (set->k < offset + clobber_u64s) {
+ set->k = offset + new_u64s;
+ if (set->k == set->end)
+ *set = node_iter->data[--node_iter->used];
+ bch_btree_node_iter_sort(node_iter, b);
+ } else {
+ set->k = (int) set->k + shift;
+ }
+
+ /*
+ * Interior nodes are special because iterators for interior nodes don't
+ * obey the usual invariants regarding the iterator position:
+ *
+ * We may have whiteouts that compare greater than the iterator
+ * position, and logically should be in the iterator, but that we
+ * skipped past to find the first live key greater than the iterator
+ * position. This becomes an issue when we insert a new key that is
+ * greater than the current iterator position, but smaller than the
+ * whiteouts we've already skipped past - this happens in the course of
+ * a btree split.
+ *
+	 * We have to rewind the iterator back to before those whiteouts here,
+ * else bkey_node_iter_prev() is not going to work and who knows what
+ * else would happen. And we have to do it manually, because here we've
+ * already done the insert and the iterator is currently inconsistent:
+ *
+ * We've got multiple competing invariants, here - we have to be careful
+ * about rewinding iterators for interior nodes, because they should
+ * always point to the key for the child node the btree iterator points
+ * to.
+ */
+ if (b->level && new_u64s && !bkey_deleted(where) &&
+ btree_iter_pos_cmp_packed(b, &iter->pos, where,
+ iter->is_extents)) {
+ struct bset_tree *t;
+ struct bkey_packed *k;
+
+ for_each_bset(b, t) {
+ if (bch_bkey_to_bset(b, where) == t)
+ continue;
+
+ k = bkey_prev_all(b, t,
+ bch_btree_node_iter_bset_pos(node_iter, b, t));
+ if (k &&
+ __btree_node_iter_cmp(node_iter, b,
+ k, where) > 0) {
+ struct btree_node_iter_set *set;
+ unsigned offset =
+ __btree_node_key_to_offset(b, bkey_next(k));
+
+ btree_node_iter_for_each(node_iter, set)
+ if (set->k == offset) {
+ set->k = __btree_node_key_to_offset(b, k);
+ bch_btree_node_iter_sort(node_iter, b);
+ goto next_bset;
+ }
+
+ bch_btree_node_iter_push(node_iter, b, k,
+ btree_bkey_last(b, t));
+ }
+next_bset:
+ t = t;
+ }
+ }
+}
+
+void bch_btree_node_iter_fix(struct btree_iter *iter,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ struct btree_iter *linked;
+
+ if (node_iter != &iter->node_iters[b->level])
+ __bch_btree_node_iter_fix(iter, b, node_iter, t,
+ where, clobber_u64s, new_u64s);
+
+ if (iter->nodes[b->level] == b)
+ __bch_btree_node_iter_fix(iter, b,
+ &iter->node_iters[b->level], t,
+ where, clobber_u64s, new_u64s);
+
+ for_each_linked_btree_node(iter, b, linked)
+ __bch_btree_node_iter_fix(linked, b,
+ &linked->node_iters[b->level], t,
+ where, clobber_u64s, new_u64s);
+
+ /* interior node iterators are... special... */
+ if (!b->level)
+ bch_btree_iter_verify(iter, b);
+}
+
+/* peek_all() doesn't skip deleted keys */
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter)
+{
+ struct btree *b = iter->nodes[iter->level];
+ struct bkey_packed *k =
+ bch_btree_node_iter_peek_all(&iter->node_iters[iter->level], b);
+ struct bkey_s_c ret;
+
+ EBUG_ON(!btree_node_locked(iter, iter->level));
+
+ if (!k)
+ return bkey_s_c_null;
+
+ ret = bkey_disassemble(b, k, &iter->k);
+
+ if (debug_check_bkeys(iter->c))
+ bkey_debugcheck(iter->c, b, ret);
+
+ return ret;
+}
+
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter)
+{
+ struct btree *b = iter->nodes[iter->level];
+ struct bkey_packed *k =
+ bch_btree_node_iter_peek(&iter->node_iters[iter->level], b);
+ struct bkey_s_c ret;
+
+ EBUG_ON(!btree_node_locked(iter, iter->level));
+
+ if (!k)
+ return bkey_s_c_null;
+
+ ret = bkey_disassemble(b, k, &iter->k);
+
+ if (debug_check_bkeys(iter->c))
+ bkey_debugcheck(iter->c, b, ret);
+
+ return ret;
+}
+
+static inline void __btree_iter_advance(struct btree_iter *iter)
+{
+ bch_btree_node_iter_advance(&iter->node_iters[iter->level],
+ iter->nodes[iter->level]);
+}
+
+/*
+ * Verify that iterator for parent node points to child node:
+ */
+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+{
+ bool parent_locked;
+ struct bkey_packed *k;
+
+ if (!IS_ENABLED(CONFIG_BCACHE_DEBUG) ||
+ !iter->nodes[b->level + 1])
+ return;
+
+ parent_locked = btree_node_locked(iter, b->level + 1);
+
+ if (!btree_node_relock(iter, b->level + 1))
+ return;
+
+ k = bch_btree_node_iter_peek_all(&iter->node_iters[b->level + 1],
+ iter->nodes[b->level + 1]);
+ if (!k ||
+ bkey_deleted(k) ||
+ bkey_cmp_left_packed(iter->nodes[b->level + 1],
+ k, &b->key.k.p)) {
+ char buf[100];
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ bch_bkey_to_text(buf, sizeof(buf), &uk);
+ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
+ buf, b->key.k.p.inode, b->key.k.p.offset);
+ }
+
+ if (!parent_locked)
+ btree_node_unlock(iter, b->level + 1);
+}
+
+static inline void __btree_iter_init(struct btree_iter *iter,
+ struct btree *b)
+{
+ bch_btree_node_iter_init(&iter->node_iters[b->level], b,
+ iter->pos, iter->is_extents,
+ btree_node_is_extents(b));
+
+ /* Skip to first non whiteout: */
+ if (b->level)
+ bch_btree_node_iter_peek(&iter->node_iters[b->level], b);
+}
+
+static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+ struct btree *b)
+{
+ return iter->btree_id == b->btree_id &&
+ bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
+ btree_iter_pos_cmp(iter->pos, &b->key.k, iter->is_extents);
+}
+
+static inline void btree_iter_node_set(struct btree_iter *iter,
+ struct btree *b)
+{
+ btree_iter_verify_new_node(iter, b);
+
+ EBUG_ON(!btree_iter_pos_in_node(iter, b));
+ EBUG_ON(b->lock.state.seq & 1);
+
+ iter->lock_seq[b->level] = b->lock.state.seq;
+ iter->nodes[b->level] = b;
+ __btree_iter_init(iter, b);
+}
+
+/*
+ * A btree node is being replaced - update the iterator to point to the new
+ * node:
+ */
+bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (btree_iter_pos_in_node(linked, b)) {
+ /*
+ * bch_btree_iter_node_drop() has already been called -
+ * the old node we're replacing has already been
+ * unlocked and the pointer invalidated
+ */
+ BUG_ON(btree_node_locked(linked, b->level));
+
+ /*
+ * If @linked wants this node read locked, we don't want
+ * to actually take the read lock now because it's not
+ * legal to hold read locks on other nodes while we take
+ * write locks, so the journal can make forward
+ * progress...
+ *
+ * Instead, btree_iter_node_set() sets things up so
+ * btree_node_relock() will succeed:
+ */
+
+ if (btree_want_intent(linked, b->level)) {
+ six_lock_increment(&b->lock, SIX_LOCK_intent);
+ mark_btree_node_intent_locked(linked, b->level);
+ }
+
+ btree_iter_node_set(linked, b);
+ }
+
+ if (!btree_iter_pos_in_node(iter, b)) {
+ six_unlock_intent(&b->lock);
+ return false;
+ }
+
+ mark_btree_node_intent_locked(iter, b->level);
+ btree_iter_node_set(iter, b);
+ return true;
+}
+
+void bch_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+ unsigned level = b->level;
+
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->nodes[level] == b) {
+ btree_node_unlock(linked, level);
+ linked->nodes[level] = BTREE_ITER_NOT_END;
+ }
+}
+
+void bch_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
+{
+ unsigned level = b->level;
+
+ if (iter->nodes[level] == b) {
+ BUG_ON(b->lock.state.intent_lock != 1);
+ btree_node_unlock(iter, level);
+ iter->nodes[level] = BTREE_ITER_NOT_END;
+ }
+}
+
+/*
+ * A btree node has been modified in such a way as to invalidate iterators - fix
+ * them:
+ */
+void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+{
+ struct btree_iter *linked;
+
+ for_each_linked_btree_node(iter, b, linked)
+ __btree_iter_init(linked, b);
+ __btree_iter_init(iter, b);
+}
+
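+/*
+ * Lock the btree root: the root pointer can change while we're blocked
+ * waiting on the lock, so loop until the node we locked is still the root at
+ * the level we expect.
+ */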
+static inline int btree_iter_lock_root(struct btree_iter *iter,
+ unsigned depth_want)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b;
+ enum six_lock_type lock_type;
+ unsigned i;
+
+ EBUG_ON(iter->nodes_locked);
+
+ while (1) {
+ b = READ_ONCE(c->btree_roots[iter->btree_id].b);
+ iter->level = READ_ONCE(b->level);
+
+ if (unlikely(iter->level < depth_want)) {
+ /*
+ * the root is at a lower depth than the depth we want:
+ * got to the end of the btree, or we're walking nodes
+ * greater than some depth and there are no nodes >=
+ * that depth
+ */
+ iter->level = depth_want;
+ iter->nodes[iter->level] = NULL;
+ return 0;
+ }
+
+ lock_type = btree_lock_want(iter, iter->level);
+ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
+ iter, lock_type)))
+ return -EINTR;
+
+ if (likely(b == c->btree_roots[iter->btree_id].b &&
+ b->level == iter->level &&
+ !race_fault())) {
+ for (i = 0; i < iter->level; i++)
+ iter->nodes[i] = BTREE_ITER_NOT_END;
+ iter->nodes[iter->level] = b;
+
+ mark_btree_node_locked(iter, iter->level, lock_type);
+ btree_iter_node_set(iter, b);
+ return 0;
+		}
+
+ six_unlock_type(&b->lock, lock_type);
+ }
+}
+
+static inline int btree_iter_down(struct btree_iter *iter)
+{
+ struct btree *b;
+ struct bkey_s_c k = __btree_iter_peek(iter);
+ unsigned level = iter->level - 1;
+ enum six_lock_type lock_type = btree_lock_want(iter, level);
+ BKEY_PADDED(k) tmp;
+
+ bkey_reassemble(&tmp.k, k);
+
+ b = bch_btree_node_get(iter, &tmp.k, level, lock_type);
+ if (unlikely(IS_ERR(b)))
+ return PTR_ERR(b);
+
+ iter->level = level;
+ mark_btree_node_locked(iter, level, lock_type);
+ btree_iter_node_set(iter, b);
+ return 0;
+}
+
+static void btree_iter_up(struct btree_iter *iter)
+{
+ btree_node_unlock(iter, iter->level++);
+}
+
+int __must_check __bch_btree_iter_traverse(struct btree_iter *);
+
+static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
+{
+ struct cache_set *c = iter->c;
+ struct btree_iter *linked, *sorted_iters, **i;
+retry_all:
+ bch_btree_iter_unlock(iter);
+
+ if (ret != -ENOMEM && ret != -EINTR)
+ goto io_error;
+
+ if (ret == -ENOMEM) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = mca_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+ }
+
+ /*
+ * Linked iters are normally a circular singly linked list - break the
+ * cycle while we sort them:
+ */
+ linked = iter->next;
+ iter->next = NULL;
+ sorted_iters = NULL;
+
+ while (linked) {
+ iter = linked;
+ linked = linked->next;
+
+ i = &sorted_iters;
+ while (*i && btree_iter_cmp(iter, *i) > 0)
+ i = &(*i)->next;
+
+ iter->next = *i;
+ *i = iter;
+ }
+
+ /* Make list circular again: */
+ iter = sorted_iters;
+ while (iter->next)
+ iter = iter->next;
+ iter->next = sorted_iters;
+
+ /* Now, redo traversals in correct order: */
+
+ iter = sorted_iters;
+ do {
+retry:
+ ret = __bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ if (ret == -EINTR)
+ goto retry;
+ goto retry_all;
+ }
+
+ iter = iter->next;
+ } while (iter != sorted_iters);
+
+ ret = btree_iter_linked(iter) ? -EINTR : 0;
+out:
+ mca_cannibalize_unlock(c);
+ return ret;
+io_error:
+ BUG_ON(ret != -EIO);
+
+ iter->error = ret;
+ iter->nodes[iter->level] = NULL;
+ goto out;
+}
+
+/*
+ * This is the main state machine for walking down the btree - walks down to a
+ * specified depth
+ *
+ * Returns 0 on success, -EIO on error (error reading in a btree node).
+ *
+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is
+ * stashed in the iterator and returned from bch_btree_iter_unlock().
+ */
+int __must_check __bch_btree_iter_traverse(struct btree_iter *iter)
+{
+ unsigned depth_want = iter->level;
+
+ /* make sure we have all the intent locks we need - ugh */
+ if (unlikely(iter->nodes[iter->level] &&
+ iter->level + 1 < iter->locks_want)) {
+ unsigned i;
+
+ for (i = iter->level + 1;
+ i < iter->locks_want && iter->nodes[i];
+ i++)
+ if (!btree_node_relock(iter, i)) {
+ while (iter->nodes[iter->level] &&
+ iter->level + 1 < iter->locks_want)
+ btree_iter_up(iter);
+ break;
+ }
+ }
+
+ /*
+ * If the current node isn't locked, go up until we have a locked node
+ * or run out of nodes:
+ */
+ while (iter->nodes[iter->level] &&
+ !(is_btree_node(iter, iter->level) &&
+ btree_node_relock(iter, iter->level) &&
+ btree_iter_pos_cmp(iter->pos,
+ &iter->nodes[iter->level]->key.k,
+ iter->is_extents)))
+ btree_iter_up(iter);
+
+ /*
+ * If we've got a btree node locked (i.e. we aren't about to relock the
+ * root) - advance its node iterator if necessary:
+ */
+ if (iter->nodes[iter->level]) {
+ struct bkey_s_c k;
+
+ while ((k = __btree_iter_peek_all(iter)).k &&
+ !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents))
+ __btree_iter_advance(iter);
+ }
+
+ /*
+ * Note: iter->nodes[iter->level] may be temporarily NULL here - to
+ * other code that would indicate we got to the end of the btree; here
+ * it indicates that relocking the root failed - it's critical that
+ * btree_iter_lock_root() comes next and that it can't fail
+ */
+ while (iter->level > depth_want) {
+ int ret = iter->nodes[iter->level]
+ ? btree_iter_down(iter)
+ : btree_iter_lock_root(iter, depth_want);
+ if (unlikely(ret)) {
+ iter->level = depth_want;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int __must_check bch_btree_iter_traverse(struct btree_iter *iter)
+{
+ int ret;
+
+ if (unlikely(!iter->nodes[iter->level]))
+ return 0;
+
+ iter->at_end_of_leaf = false;
+
+ ret = __bch_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ ret = btree_iter_traverse_error(iter, ret);
+
+ return ret;
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+struct btree *bch_btree_iter_peek_node(struct btree_iter *iter)
+{
+ struct btree *b;
+ int ret;
+
+ EBUG_ON(iter->is_extents);
+
+ ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return NULL;
+
+ b = iter->nodes[iter->level];
+
+ if (b) {
+ EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+ iter->pos = b->key.k.p;
+ }
+
+ return b;
+}
+
+struct btree *bch_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
+{
+ struct btree *b;
+ int ret;
+
+ EBUG_ON(iter->is_extents);
+
+ btree_iter_up(iter);
+
+ if (!iter->nodes[iter->level])
+ return NULL;
+
+ /* parent node usually won't be locked: redo traversal if necessary */
+ ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return NULL;
+
+ b = iter->nodes[iter->level];
+ if (!b)
+ return b;
+
+ if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+ /* Haven't gotten to the end of the parent node: */
+
+ /* ick: */
+ iter->pos = iter->btree_id == BTREE_ID_INODES
+ ? btree_type_successor(iter->btree_id, iter->pos)
+ : bkey_successor(iter->pos);
+ iter->level = depth;
+
+ ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return NULL;
+
+ b = iter->nodes[iter->level];
+ }
+
+ iter->pos = b->key.k.p;
+
+ return b;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bkey_packed *k;
+
+ EBUG_ON(iter->level != 0);
+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
+ EBUG_ON(!btree_node_locked(iter, 0));
+ EBUG_ON(bkey_cmp(new_pos, b->key.k.p) > 0);
+
+ while ((k = bch_btree_node_iter_peek_all(node_iter, b)) &&
+ !btree_iter_pos_cmp_packed(b, &new_pos, k,
+ iter->is_extents))
+ bch_btree_node_iter_advance(node_iter, b);
+
+ if (!k &&
+ !btree_iter_pos_cmp(new_pos, &b->key.k, iter->is_extents))
+ iter->at_end_of_leaf = true;
+
+ iter->pos = new_pos;
+}
+
+void bch_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
+ iter->pos = new_pos;
+}
+
+void bch_btree_iter_advance_pos(struct btree_iter *iter)
+{
+ /*
+ * We use iter->k instead of iter->pos for extents: iter->pos will be
+ * equal to the start of the extent we returned, but we need to advance
+ * to the end of the extent we returned.
+ */
+ bch_btree_iter_set_pos(iter,
+ btree_type_successor(iter->btree_id, iter->k.p));
+}
+
+/* XXX: expensive */
+void bch_btree_iter_rewind(struct btree_iter *iter, struct bpos pos)
+{
+ /* incapable of rewinding across nodes: */
+ BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0);
+
+ iter->pos = pos;
+ __btree_iter_init(iter, iter->nodes[iter->level]);
+}
+
+struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ while (1) {
+ ret = bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ return bkey_s_c_err(ret);
+ }
+
+ k = __btree_iter_peek(iter);
+ if (likely(k.k)) {
+ /*
+ * iter->pos should always be equal to the key we just
+ * returned - except extents can straddle iter->pos:
+ */
+ if (!iter->is_extents ||
+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ bch_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+ return k;
+ }
+
+ iter->pos = iter->nodes[0]->key.k.p;
+
+ if (!bkey_cmp(iter->pos, POS_MAX)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ bch_btree_iter_unlock(iter);
+ return bkey_s_c_null;
+ }
+
+ iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+ }
+}
+
+struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ struct bkey n;
+ int ret;
+
+ while (1) {
+ ret = bch_btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ return bkey_s_c_err(ret);
+ }
+
+ k = __btree_iter_peek_all(iter);
+recheck:
+ if (!k.k || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) {
+ /* hole */
+ bkey_init(&n);
+ n.p = iter->pos;
+
+ if (iter->is_extents) {
+ if (n.p.offset == KEY_OFFSET_MAX) {
+ iter->pos = bkey_successor(iter->pos);
+ goto recheck;
+ }
+
+ if (!k.k)
+ k.k = &iter->nodes[0]->key.k;
+
+ bch_key_resize(&n,
+ min_t(u64, KEY_SIZE_MAX,
+ (k.k->p.inode == n.p.inode
+ ? bkey_start_offset(k.k)
+ : KEY_OFFSET_MAX) -
+ n.p.offset));
+
+ EBUG_ON(!n.size);
+ }
+
+ iter->k = n;
+ return (struct bkey_s_c) { &iter->k, NULL };
+ } else if (!bkey_deleted(k.k)) {
+ return k;
+ } else {
+ __btree_iter_advance(iter);
+ }
+ }
+}
+
+void __bch_btree_iter_init(struct btree_iter *iter, struct cache_set *c,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned depth)
+{
+ iter->level = depth;
+ /* bch_bkey_ops isn't used much, this would be a cache miss */
+ /* iter->is_extents = bch_bkey_ops[btree_id]->is_extents; */
+ iter->is_extents = btree_id == BTREE_ID_EXTENTS;
+ iter->nodes_locked = 0;
+ iter->nodes_intent_locked = 0;
+ iter->locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ iter->btree_id = btree_id;
+ iter->at_end_of_leaf = 0;
+ iter->error = 0;
+ iter->c = c;
+ iter->pos = pos;
+ memset(iter->nodes, 0, sizeof(iter->nodes));
+ iter->nodes[iter->level] = BTREE_ITER_NOT_END;
+ iter->next = iter;
+
+ prefetch(c->btree_roots[btree_id].b);
+}
+
+void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
+{
+ BUG_ON(btree_iter_linked(new));
+
+ new->next = iter->next;
+ iter->next = new;
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ unsigned nr_iters = 1;
+
+ for_each_linked_btree_iter(iter, new)
+ nr_iters++;
+
+ BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
+ }
+}
+
+void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
+{
+ bch_btree_iter_unlock(dst);
+ memcpy(dst, src, offsetof(struct btree_iter, next));
+ dst->nodes_locked = dst->nodes_intent_locked = 0;
+}
diff --git a/libbcache/btree_iter.h b/libbcache/btree_iter.h
new file mode 100644
index 0000000..9835334
--- /dev/null
+++ b/libbcache/btree_iter.h
@@ -0,0 +1,282 @@
+#ifndef _BCACHE_BTREE_ITER_H
+#define _BCACHE_BTREE_ITER_H
+
+#include "btree_types.h"
+
+struct btree_iter {
+ /* Current btree depth */
+ u8 level;
+
+ /*
+ * Used in bch_btree_iter_traverse(), to indicate whether we're
+ * searching for @pos or the first key strictly greater than @pos
+ */
+ u8 is_extents;
+
+ /* Bitmasks for read/intent locks held per level */
+ u8 nodes_locked;
+ u8 nodes_intent_locked;
+
+ /* Btree level below which we start taking intent locks */
+ u8 locks_want;
+
+ enum btree_id btree_id:8;
+
+ /*
+ * indicates we need to call bch_btree_iter_traverse() to revalidate
+ * iterator:
+ */
+ u8 at_end_of_leaf;
+
+ s8 error;
+
+ struct cache_set *c;
+
+ /* Current position of the iterator */
+ struct bpos pos;
+
+ u32 lock_seq[BTREE_MAX_DEPTH];
+
+ /*
+ * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root().
+ *
+ * This is because iter->nodes[iter->level] == NULL is how
+ * btree_iter_next_node() knows that it's finished with a depth first
+ * traversal. Just unlocking a node (with btree_node_unlock()) is fine;
+ * if you really don't want that node used again (e.g. btree_split()
+ * freed it), decrementing lock_seq will cause btree_node_relock() to
+ * always fail (though since freeing a btree node takes a write lock on
+ * the node, which increments the node's lock seq, that's not actually
+ * necessary in that example).
+ *
+ * One extra slot for a sentinel NULL:
+ */
+ struct btree *nodes[BTREE_MAX_DEPTH + 1];
+ struct btree_node_iter node_iters[BTREE_MAX_DEPTH];
+
+ /*
+ * Current unpacked key - so that bch_btree_iter_next()/
+ * bch_btree_iter_next_with_holes() can correctly advance pos.
+ */
+ struct bkey k;
+
+ /*
+ * Circular linked list of linked iterators: linked iterators share
+ * locks (e.g. two linked iterators may have the same node intent
+ * locked, or read and write locked, at the same time), and insertions
+ * through one iterator won't invalidate the other linked iterators.
+ */
+
+ /* Must come last: */
+ struct btree_iter *next;
+};
+
+static inline bool btree_iter_linked(const struct btree_iter *iter)
+{
+ return iter->next != iter;
+}
+
+/**
+ * for_each_linked_btree_iter - iterate over all iterators linked with @_iter
+ */
+#define for_each_linked_btree_iter(_iter, _linked) \
+ for ((_linked) = (_iter)->next; \
+ (_linked) != (_iter); \
+ (_linked) = (_linked)->next)
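
As a usage sketch of the linking machinery (illustrative only - the helper and variable names below are hypothetical, and bch_btree_iter_init()/bch_btree_iter_link() are declared further down in this header): two iterators over different btrees can be linked so they share locks and survive each other's insertions, and the macro above then visits every iterator linked with a given one:

static void example_link_iters(struct cache_set *c, struct bpos pos)
{
	struct btree_iter extent_iter, inode_iter, *linked;

	bch_btree_iter_init(&extent_iter, c, BTREE_ID_EXTENTS, pos);
	bch_btree_iter_init(&inode_iter, c, BTREE_ID_INODES, POS(pos.inode, 0));
	bch_btree_iter_link(&extent_iter, &inode_iter);

	/* visits inode_iter (and anything else linked with extent_iter): */
	for_each_linked_btree_iter(&extent_iter, linked)
		BUG_ON(linked->c != c);

	bch_btree_iter_unlock(&inode_iter);
	bch_btree_iter_unlock(&extent_iter);
}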
+
+static inline struct btree_iter *
+__next_linked_btree_node(struct btree_iter *iter, struct btree *b,
+ struct btree_iter *linked)
+{
+ do {
+ linked = linked->next;
+
+ if (linked == iter)
+ return NULL;
+
+ /*
+ * We don't compare the low bits of the lock sequence numbers
+ * because @iter might have taken a write lock on @b, and we
+ * don't want to skip the linked iterator if the sequence
+ * numbers were equal before taking that write lock. The lock
+ * sequence number is incremented by taking and releasing write
+ * locks and is even when unlocked:
+ */
+ } while (linked->nodes[b->level] != b ||
+ linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1);
+
+ return linked;
+}
+
+/**
+ * for_each_linked_btree_node - iterate over all iterators linked with @_iter
+ * that also point to @_b
+ *
+ * @_b is assumed to be locked by @_iter
+ *
+ * Filters out iterators that don't have a valid btree_node iterator for @_b -
+ * i.e. iterators for which btree_node_relock() would not succeed.
+ */
+#define for_each_linked_btree_node(_iter, _b, _linked) \
+ for ((_linked) = (_iter); \
+ ((_linked) = __next_linked_btree_node(_iter, _b, _linked));)
+
+#ifdef CONFIG_BCACHE_DEBUG
+void bch_btree_iter_verify(struct btree_iter *, struct btree *);
+#else
+static inline void bch_btree_iter_verify(struct btree_iter *iter,
+ struct btree *b) {}
+#endif
+
+void bch_btree_node_iter_fix(struct btree_iter *, struct btree *,
+ struct btree_node_iter *, struct bset_tree *,
+ struct bkey_packed *, unsigned, unsigned);
+
+int bch_btree_iter_unlock(struct btree_iter *);
+bool __bch_btree_iter_set_locks_want(struct btree_iter *, unsigned);
+
+static inline bool bch_btree_iter_set_locks_want(struct btree_iter *iter,
+ unsigned new_locks_want)
+{
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (iter->locks_want == new_locks_want &&
+ iter->nodes_intent_locked == (1 << new_locks_want) - 1)
+ return true;
+
+ return __bch_btree_iter_set_locks_want(iter, new_locks_want);
+}
+
+bool bch_btree_iter_node_replace(struct btree_iter *, struct btree *);
+void bch_btree_iter_node_drop_linked(struct btree_iter *, struct btree *);
+void bch_btree_iter_node_drop(struct btree_iter *, struct btree *);
+
+void bch_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+
+int __must_check bch_btree_iter_traverse(struct btree_iter *);
+
+struct btree *bch_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch_btree_iter_next_node(struct btree_iter *, unsigned);
+
+struct bkey_s_c bch_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *);
+void bch_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
+void bch_btree_iter_set_pos(struct btree_iter *, struct bpos);
+void bch_btree_iter_advance_pos(struct btree_iter *);
+void bch_btree_iter_rewind(struct btree_iter *, struct bpos);
+
+void __bch_btree_iter_init(struct btree_iter *, struct cache_set *,
+ enum btree_id, struct bpos, unsigned, unsigned);
+
+static inline void bch_btree_iter_init(struct btree_iter *iter,
+ struct cache_set *c,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ __bch_btree_iter_init(iter, c, btree_id, pos, 0, 0);
+}
+
+static inline void bch_btree_iter_init_intent(struct btree_iter *iter,
+ struct cache_set *c,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ __bch_btree_iter_init(iter, c, btree_id, pos, 1, 0);
+}
+
+void bch_btree_iter_link(struct btree_iter *, struct btree_iter *);
+void bch_btree_iter_copy(struct btree_iter *, struct btree_iter *);
+
+static inline struct bpos btree_type_successor(enum btree_id id,
+ struct bpos pos)
+{
+ if (id == BTREE_ID_INODES) {
+ pos.inode++;
+ pos.offset = 0;
+ } else if (id != BTREE_ID_EXTENTS) {
+ pos = bkey_successor(pos);
+ }
+
+ return pos;
+}
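
A quick sketch of the three cases (assumes the POS() initializer and BTREE_ID_DIRENTS from the on-disk format headers, and that bkey_successor() advances pos.offset by one):

static inline void btree_type_successor_examples(void)
{
	/* inodes btree: next inode, offset reset to 0 */
	BUG_ON(bkey_cmp(btree_type_successor(BTREE_ID_INODES, POS(5, 123)),
			POS(6, 0)));

	/* extents btree: unchanged - callers advance past the extent's end
	 * via bch_btree_iter_advance_pos() */
	BUG_ON(bkey_cmp(btree_type_successor(BTREE_ID_EXTENTS, POS(5, 123)),
			POS(5, 123)));

	/* everything else: bkey_successor(), the very next position */
	BUG_ON(bkey_cmp(btree_type_successor(BTREE_ID_DIRENTS, POS(5, 123)),
			POS(5, 124)));
}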
+
+static inline int __btree_iter_cmp(enum btree_id id,
+ struct bpos pos,
+ const struct btree_iter *r)
+{
+ if (id != r->btree_id)
+ return id < r->btree_id ? -1 : 1;
+ return bkey_cmp(pos, r->pos);
+}
+
+static inline int btree_iter_cmp(const struct btree_iter *l,
+ const struct btree_iter *r)
+{
+ return __btree_iter_cmp(l->btree_id, l->pos, r);
+}
+
+#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \
+ _b, _locks_want) \
+ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
+ _start, _locks_want, _depth), \
+ (_iter)->is_extents = false, \
+ _b = bch_btree_iter_peek_node(_iter); \
+ (_b); \
+ (_b) = bch_btree_iter_next_node(_iter, _depth))
+
+#define for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b) \
+ __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b, 0)
+
+#define __for_each_btree_key(_iter, _c, _btree_id, _start, \
+ _k, _locks_want) \
+ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
+ _start, _locks_want, 0); \
+ !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek(_iter)).k); \
+ bch_btree_iter_advance_pos(_iter))
+
+#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \
+ __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 0)
+
+#define for_each_btree_key_intent(_iter, _c, _btree_id, _start, _k) \
+ __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 1)
+
+#define __for_each_btree_key_with_holes(_iter, _c, _btree_id, \
+ _start, _k, _locks_want) \
+ for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
+ _start, _locks_want, 0); \
+ !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek_with_holes(_iter)).k);\
+ bch_btree_iter_advance_pos(_iter))
+
+#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \
+ __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 0)
+
+#define for_each_btree_key_with_holes_intent(_iter, _c, _btree_id, \
+ _start, _k) \
+ __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1)
+
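A minimal usage sketch of these macros (hypothetical helper; assumes the POS() initializer): any error hit during traversal is stashed in the iterator and returned by bch_btree_iter_unlock():

static int count_extents(struct cache_set *c, u64 inum, u64 *nr)
{
	struct btree_iter iter;
	struct bkey_s_c k;

	*nr = 0;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) {
		if (k.k->p.inode != inum)
			break;
		(*nr)++;
	}

	/* any error hit during traversal was stashed in the iterator: */
	return bch_btree_iter_unlock(&iter);
}
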
+static inline int btree_iter_err(struct bkey_s_c k)
+{
+ return IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
+}
+
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
+ */
+static inline void bch_btree_iter_cond_resched(struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ if (need_resched()) {
+ for_each_linked_btree_iter(iter, linked)
+ bch_btree_iter_unlock(linked);
+ bch_btree_iter_unlock(iter);
+ schedule();
+ } else if (race_fault()) {
+ for_each_linked_btree_iter(iter, linked)
+ bch_btree_iter_unlock(linked);
+ bch_btree_iter_unlock(iter);
+ }
+}
+
+#endif /* _BCACHE_BTREE_ITER_H */
diff --git a/libbcache/btree_locking.h b/libbcache/btree_locking.h
new file mode 100644
index 0000000..76f85c0
--- /dev/null
+++ b/libbcache/btree_locking.h
@@ -0,0 +1,119 @@
+#ifndef _BCACHE_BTREE_LOCKING_H
+#define _BCACHE_BTREE_LOCKING_H
+
+/*
+ * Only for internal btree use:
+ *
+ * The btree iterator tracks what locks it wants to take, and what locks it
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
+ * updating the iterator state
+ */
+
+#include "btree_iter.h"
+#include "six.h"
+
+/* matches six lock types */
+enum btree_node_locked_type {
+ BTREE_NODE_UNLOCKED = -1,
+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+};
+
+static inline int btree_node_locked_type(struct btree_iter *iter,
+ unsigned level)
+{
+ /*
+ * We're relying on the fact that if nodes_intent_locked is set
+ * nodes_locked must be set as well, so that we can compute without
+ * branches:
+ */
+ return BTREE_NODE_UNLOCKED +
+ ((iter->nodes_locked >> level) & 1) +
+ ((iter->nodes_intent_locked >> level) & 1);
+}
+
+static inline bool btree_node_intent_locked(struct btree_iter *iter,
+ unsigned level)
+{
+ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+}
+
+static inline bool btree_node_read_locked(struct btree_iter *iter,
+ unsigned level)
+{
+ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+}
+
+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+{
+ return iter->nodes_locked & (1 << level);
+}
+
+static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+ unsigned level)
+{
+ iter->nodes_locked &= ~(1 << level);
+ iter->nodes_intent_locked &= ~(1 << level);
+}
+
+static inline void mark_btree_node_locked(struct btree_iter *iter,
+ unsigned level,
+ enum six_lock_type type)
+{
+ /* relying on this to avoid a branch */
+ BUILD_BUG_ON(SIX_LOCK_read != 0);
+ BUILD_BUG_ON(SIX_LOCK_intent != 1);
+
+ iter->nodes_locked |= 1 << level;
+ iter->nodes_intent_locked |= type << level;
+}
+
+static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+ unsigned level)
+{
+ mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+}
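
To make the bitmask encoding concrete, a small sketch (the values follow directly from the helpers above): marking level 2 intent locked sets bit 2 in both masks, so btree_node_locked_type() evaluates to -1 + 1 + 1 == SIX_LOCK_intent without branching:

static inline void btree_lock_bitmask_example(void)
{
	struct btree_iter iter = { .nodes_locked = 0 };

	/* level 2 intent locked: bit 2 set in both masks */
	mark_btree_node_locked(&iter, 2, SIX_LOCK_intent);
	BUG_ON(!btree_node_intent_locked(&iter, 2));

	/* level 0 read locked: bit 0 set in nodes_locked only */
	mark_btree_node_locked(&iter, 0, SIX_LOCK_read);
	BUG_ON(!btree_node_read_locked(&iter, 0));
	BUG_ON(btree_node_locked(&iter, 1));
}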
+
+static inline enum six_lock_type
+btree_lock_want(struct btree_iter *iter, int level)
+{
+ return level < iter->locks_want
+ ? SIX_LOCK_intent
+ : SIX_LOCK_read;
+}
+
+static inline bool btree_want_intent(struct btree_iter *iter, int level)
+{
+ return btree_lock_want(iter, level) == SIX_LOCK_intent;
+}
+
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+{
+ int lock_type = btree_node_locked_type(iter, level);
+
+ if (lock_type != BTREE_NODE_UNLOCKED)
+ six_unlock_type(&iter->nodes[level]->lock, lock_type);
+ mark_btree_node_unlocked(iter, level);
+}
+
+bool __bch_btree_node_lock(struct btree *, struct bpos, unsigned,
+ struct btree_iter *, enum six_lock_type);
+
+static inline bool btree_node_lock(struct btree *b, struct bpos pos,
+ unsigned level,
+ struct btree_iter *iter,
+ enum six_lock_type type)
+{
+ return likely(six_trylock_type(&b->lock, type)) ||
+ __bch_btree_node_lock(b, pos, level, iter, type);
+}
+
+bool btree_node_relock(struct btree_iter *, unsigned);
+
+void btree_node_unlock_write(struct btree *, struct btree_iter *);
+void btree_node_lock_write(struct btree *, struct btree_iter *);
+
+void __btree_node_unlock_write(struct btree *, struct btree_iter *);
+void __btree_node_lock_write(struct btree *, struct btree_iter *);
+
+#endif /* _BCACHE_BTREE_LOCKING_H */
diff --git a/libbcache/btree_types.h b/libbcache/btree_types.h
new file mode 100644
index 0000000..3632a04
--- /dev/null
+++ b/libbcache/btree_types.h
@@ -0,0 +1,322 @@
+#ifndef _BCACHE_BTREE_TYPES_H
+#define _BCACHE_BTREE_TYPES_H
+
+#include <linux/bcache.h>
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+
+#include "bkey_methods.h"
+#include "journal_types.h"
+#include "six.h"
+
+struct cache_set;
+struct open_bucket;
+struct btree_interior_update;
+
+#define MAX_BSETS 3U
+
+struct btree_nr_keys {
+
+ /*
+ * Amount of live metadata (i.e. size of node after a compaction) in
+ * units of u64s
+ */
+ u16 live_u64s;
+ u16 bset_u64s[MAX_BSETS];
+
+ /* live keys only: */
+ u16 packed_keys;
+ u16 unpacked_keys;
+};
+
+struct bset_tree {
+ /*
+ * We construct a binary tree in an array as if the array
+ * started at 1, so that things line up on the same cachelines
+ * better: see comments in bset.c at cacheline_to_bkey() for
+ * details
+ */
+
+ /* size of the binary tree and prev array */
+ u16 size;
+
+ /* function of size - precalculated for to_inorder() */
+ u16 extra;
+
+ u16 data_offset;
+ u16 aux_data_offset;
+ u16 end_offset;
+
+ struct bpos max_key;
+};
+
+struct btree_write {
+ struct journal_entry_pin journal;
+ struct closure_waitlist wait;
+};
+
+struct btree {
+ /* Hottest entries first */
+ struct rhash_head hash;
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+ struct six_lock lock;
+
+ unsigned long flags;
+ u16 written;
+ u8 level;
+ u8 btree_id;
+ u8 nsets;
+ u8 nr_key_bits;
+
+ struct bkey_format format;
+
+ struct btree_node *data;
+ void *aux_data;
+
+ /*
+ * Sets of sorted keys - the real btree node - plus a binary search tree
+ *
+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+ * to the memory we have allocated for this btree node. Additionally,
+ * set[0]->data points to the entire btree node as it exists on disk.
+ */
+ struct bset_tree set[MAX_BSETS];
+
+ struct btree_nr_keys nr;
+ u16 sib_u64s[2];
+ u16 whiteout_u64s;
+ u16 uncompacted_whiteout_u64s;
+ u8 page_order;
+ u8 unpack_fn_len;
+
+ /*
+ * XXX: add a delete sequence number, so when btree_node_relock() fails
+ * because the lock sequence number has changed - i.e. the contents were
+ * modified - we can still relock the node if it's still the one we
+ * want, without redoing the traversal
+ */
+
+ /*
+ * For asynchronous splits/interior node updates:
+ * When we do a split, we allocate new child nodes and update the parent
+ * node to point to them: we update the parent in memory immediately,
+ * but then we must wait until the children have been written out before
+ * the update to the parent can be written - this is a list of the
+ * btree_interior_updates that are blocking this node from being
+ * written:
+ */
+ struct list_head write_blocked;
+
+ struct open_bucket *ob;
+
+ /* lru list */
+ struct list_head list;
+
+ struct btree_write writes[2];
+
+#ifdef CONFIG_BCACHE_DEBUG
+ bool *expensive_debug_checks;
+#endif
+};
+
+#define BTREE_FLAG(flag) \
+static inline bool btree_node_ ## flag(struct btree *b) \
+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void set_btree_node_ ## flag(struct btree *b) \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void clear_btree_node_ ## flag(struct btree *b) \
+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
+
+enum btree_flags {
+ BTREE_NODE_read_error,
+ BTREE_NODE_write_error,
+ BTREE_NODE_dirty,
+ BTREE_NODE_write_idx,
+ BTREE_NODE_accessed,
+ BTREE_NODE_write_in_flight,
+ BTREE_NODE_just_written,
+};
+
+BTREE_FLAG(read_error);
+BTREE_FLAG(write_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(write_idx);
+BTREE_FLAG(accessed);
+BTREE_FLAG(write_in_flight);
+BTREE_FLAG(just_written);
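
BTREE_FLAG(dirty), for example, expands to btree_node_dirty(), set_btree_node_dirty() and clear_btree_node_dirty(); the update path uses them roughly like this (sketch, hypothetical helper name):

static void example_mark_dirty(struct btree *b)
{
	if (!btree_node_dirty(b))
		set_btree_node_dirty(b);

	/* ...and clear_btree_node_dirty(b) once the node has been written */
}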
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+ return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+ return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset_tree *bset_tree_last(struct btree *b)
+{
+ EBUG_ON(!b->nsets);
+ return b->set + b->nsets - 1;
+}
+
+static inline struct bset *bset(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return (void *) b->data + t->data_offset * sizeof(u64);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+ return bset(b, b->set);
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+ return bset(b, bset_tree_last(b));
+}
+
+static inline u16
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
+{
+ size_t ret = (u64 *) k - (u64 *) b->data - 1;
+
+ EBUG_ON(ret > U16_MAX);
+ return ret;
+}
+
+static inline struct bkey_packed *
+__btree_node_offset_to_key(const struct btree *b, u16 k)
+{
+ return (void *) ((u64 *) b->data + k + 1);
+}
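
The two helpers are inverses of each other - a small consistency sketch (hypothetical helper name):

static inline void btree_node_offset_check(const struct btree *b)
{
	const struct bkey_packed *k = bset(b, b->set)->start;

	/* converting a key to its u64 offset and back yields the same pointer */
	BUG_ON(__btree_node_offset_to_key(b, __btree_node_key_to_offset(b, k)) != k);
}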
+
+#define __bkey_idx(_set, _offset) \
+ ((_set)->_data + (_offset))
+
+#define bkey_idx(_set, _offset) \
+ ((typeof(&(_set)->start[0])) __bkey_idx((_set), (_offset)))
+
+#define __bset_bkey_last(_set) \
+ __bkey_idx((_set), (_set)->u64s)
+
+#define bset_bkey_last(_set) \
+ bkey_idx((_set), le16_to_cpu((_set)->u64s))
+
+#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
+
+#define btree_bkey_last(_b, _t) \
+({ \
+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
+ bset_bkey_last(bset(_b, _t))); \
+ \
+ __btree_node_offset_to_key(_b, (_t)->end_offset); \
+})
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+ t->end_offset =
+ __btree_node_key_to_offset(b, bset_bkey_last(bset(b, t)));
+ btree_bkey_last(b, t);
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+ const struct bset *i)
+{
+ t->data_offset = (u64 *) i - (u64 *) b->data;
+
+ EBUG_ON(bset(b, t) != i);
+
+ set_btree_bset_end(b, t);
+}
+
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+ return i - (void *) b->data;
+}
+
+/* Type of keys @b contains: */
+static inline enum bkey_type btree_node_type(struct btree *b)
+{
+ return b->level ? BKEY_TYPE_BTREE : b->btree_id;
+}
+
+static inline const struct bkey_ops *btree_node_ops(struct btree *b)
+{
+ return bch_bkey_ops[btree_node_type(b)];
+}
+
+static inline bool btree_node_has_ptrs(struct btree *b)
+{
+ return btree_type_has_ptrs(btree_node_type(b));
+}
+
+static inline bool btree_node_is_extents(struct btree *b)
+{
+ return btree_node_type(b) == BKEY_TYPE_EXTENTS;
+}
+
+struct btree_root {
+ struct btree *b;
+
+ struct btree_interior_update *as;
+
+ /* On disk root - see async splits: */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ u8 level;
+ u8 alive;
+};
+
+/*
+ * Optional hook that will be called just prior to a btree node update, when
+ * we're holding the write lock and we know what key is about to be overwritten:
+ */
+
+struct btree_iter;
+struct bucket_stats_cache_set;
+struct btree_node_iter;
+
+enum extent_insert_hook_ret {
+ BTREE_HOOK_DO_INSERT,
+ BTREE_HOOK_NO_INSERT,
+ BTREE_HOOK_RESTART_TRANS,
+};
+
+struct extent_insert_hook {
+ enum extent_insert_hook_ret
+ (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
+ struct bkey_s_c, const struct bkey_i *);
+};
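
A minimal hook implementation might look like the following sketch - the struct, function and parameter names are hypothetical; only the fn signature and return values come from the definitions above:

struct count_hook {
	struct extent_insert_hook	hook;
	u64				inserts;
};

static enum extent_insert_hook_ret
count_inserts_hook(struct extent_insert_hook *hook,
		   struct bpos committed_pos, struct bpos next_pos,
		   struct bkey_s_c k, const struct bkey_i *insert)
{
	struct count_hook *h = container_of(hook, struct count_hook, hook);

	/* count the update and let the insert proceed unchanged: */
	h->inserts++;
	return BTREE_HOOK_DO_INSERT;
}

/* usage: struct count_hook h = { .hook.fn = count_inserts_hook }; */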
+
+enum btree_insert_ret {
+ BTREE_INSERT_OK,
+ /* extent spanned multiple leaf nodes: have to traverse to next node: */
+ BTREE_INSERT_NEED_TRAVERSE,
+ /* write lock held for too long */
+ BTREE_INSERT_NEED_RESCHED,
+ /* leaf node needs to be split */
+ BTREE_INSERT_BTREE_NODE_FULL,
+ BTREE_INSERT_JOURNAL_RES_FULL,
+ BTREE_INSERT_ENOSPC,
+ BTREE_INSERT_NEED_GC_LOCK,
+};
+
+enum btree_gc_coalesce_fail_reason {
+ BTREE_GC_COALESCE_FAIL_RESERVE_GET,
+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
+};
+
+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
+ struct btree *,
+ struct btree_node_iter *);
+
+#endif /* _BCACHE_BTREE_TYPES_H */
diff --git a/libbcache/btree_update.c b/libbcache/btree_update.c
new file mode 100644
index 0000000..95406a4
--- /dev/null
+++ b/libbcache/btree_update.c
@@ -0,0 +1,2343 @@
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "super.h"
+
+#include <linux/random.h>
+#include <linux/sort.h>
+#include <trace/events/bcache.h>
+
+static void btree_interior_update_updated_root(struct cache_set *,
+ struct btree_interior_update *,
+ enum btree_id);
+
+/* Calculate ideal packed bkey format for new btree nodes: */
+
+void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+{
+ struct bkey_packed *k;
+ struct bset_tree *t;
+ struct bkey uk;
+
+ bch_bkey_format_add_pos(s, b->data->min_key);
+
+ for_each_bset(b, t)
+ for (k = btree_bkey_first(b, t);
+ k != btree_bkey_last(b, t);
+ k = bkey_next(k))
+ if (!bkey_whiteout(k)) {
+ uk = bkey_unpack_key(b, k);
+ bch_bkey_format_add_key(s, &uk);
+ }
+}
+
+static struct bkey_format bch_btree_calc_format(struct btree *b)
+{
+ struct bkey_format_state s;
+
+ bch_bkey_format_init(&s);
+ __bch_btree_calc_format(&s, b);
+
+ return bch_bkey_format_done(&s);
+}
+
+static size_t btree_node_u64s_with_format(struct btree *b,
+ struct bkey_format *new_f)
+{
+ struct bkey_format *old_f = &b->format;
+
+ /* stupid integer promotion rules */
+ ssize_t delta =
+ (((int) new_f->key_u64s - old_f->key_u64s) *
+ (int) b->nr.packed_keys) +
+ (((int) new_f->key_u64s - BKEY_U64s) *
+ (int) b->nr.unpacked_keys);
+
+ BUG_ON(delta + b->nr.live_u64s < 0);
+
+ return b->nr.live_u64s + delta;
+}
+
+/**
+ * btree_node_format_fits - check if we could rewrite node with a new format
+ *
+ * This assumes all keys can pack with the new format -- it just checks if
+ * the re-packed keys would fit inside the node itself.
+ */
+bool bch_btree_node_format_fits(struct cache_set *c, struct btree *b,
+ struct bkey_format *new_f)
+{
+ size_t u64s = btree_node_u64s_with_format(b, new_f);
+
+ return __set_bytes(b->data, u64s) < btree_bytes(c);
+}
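
To make the size estimate concrete, a worked example with made-up numbers:

/*
 * Sketch: old format key_u64s = 2, proposed key_u64s = 3,
 * nr.packed_keys = 100, nr.unpacked_keys = 10, nr.live_u64s = 500:
 *
 *	delta = (3 - 2) * 100 + (3 - BKEY_U64s) * 10
 *
 * i.e. every packed key grows by one u64, and every currently-unpacked
 * key would now take 3 u64s instead of BKEY_U64s. The new format fits
 * if __set_bytes() of (500 + delta) u64s is still less than
 * btree_bytes(c).
 */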
+
+/* Btree node freeing/allocation: */
+
+/*
+ * We're doing the index update that makes @b unreachable, update stuff to
+ * reflect that:
+ *
+ * Must be called _before_ btree_interior_update_updated_root() or
+ * btree_interior_update_updated_btree():
+ */
+static void bch_btree_node_free_index(struct cache_set *c, struct btree *b,
+ enum btree_id id, struct bkey_s_c k,
+ struct bucket_stats_cache_set *stats)
+{
+ struct btree_interior_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ for_each_pending_btree_node_free(c, as, d)
+ if (!bkey_cmp(k.k->p, d->key.k.p) &&
+ bkey_val_bytes(k.k) == bkey_val_bytes(&d->key.k) &&
+ !memcmp(k.v, &d->key.v, bkey_val_bytes(k.k)))
+ goto found;
+
+ BUG();
+found:
+ d->index_update_done = true;
+
+ /*
+ * Btree nodes are accounted as freed in cache_set_stats when they're
+ * freed from the index:
+ */
+ stats->s[S_COMPRESSED][S_META] -= c->sb.btree_node_size;
+ stats->s[S_UNCOMPRESSED][S_META] -= c->sb.btree_node_size;
+
+ /*
+ * We're dropping @k from the btree, but it's still live until the
+ * index update is persistent so we need to keep a reference around for
+ * mark and sweep to find - that's primarily what the
+ * btree_node_pending_free list is for.
+ *
+ * So here (when we set index_update_done = true), we're moving an
+ * existing reference to a different part of the larger "gc keyspace" -
+ * and the new position comes after the old position, since GC marks
+ * the pending free list after it walks the btree.
+ *
+ * If we move the reference while mark and sweep is _between_ the old
+ * and the new position, mark and sweep will see the reference twice
+ * and it'll get double accounted - so check for that here and subtract
+ * to cancel out one of mark and sweep's markings if necessary:
+ */
+
+ /*
+ * bch_mark_key() compares the current gc pos to the pos we're
+ * moving this reference from, hence one comparison here:
+ */
+ if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
+ struct bucket_stats_cache_set tmp = { 0 };
+
+ bch_mark_key(c, bkey_i_to_s_c(&d->key),
+ -c->sb.btree_node_size, true, b
+ ? gc_pos_btree_node(b)
+ : gc_pos_btree_root(id),
+ &tmp, 0);
+ /*
+ * Don't apply tmp - pending deletes aren't tracked in
+ * cache_set_stats:
+ */
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void __btree_node_free(struct cache_set *c, struct btree *b,
+ struct btree_iter *iter)
+{
+ trace_bcache_btree_node_free(c, b);
+
+ BUG_ON(b == btree_node_root(c, b));
+ BUG_ON(b->ob);
+ BUG_ON(!list_empty(&b->write_blocked));
+
+ six_lock_write(&b->lock);
+
+ if (btree_node_dirty(b))
+ bch_btree_complete_write(c, b, btree_current_write(b));
+ clear_btree_node_dirty(b);
+
+ mca_hash_remove(c, b);
+
+ mutex_lock(&c->btree_cache_lock);
+ list_move(&b->list, &c->btree_cache_freeable);
+ mutex_unlock(&c->btree_cache_lock);
+
+ /*
+ * By using six_unlock_write() directly instead of
+ * btree_node_unlock_write(), we don't update the iterator's sequence
+ * numbers and cause future btree_node_relock() calls to fail:
+ */
+ six_unlock_write(&b->lock);
+}
+
+void bch_btree_node_free_never_inserted(struct cache_set *c, struct btree *b)
+{
+ struct open_bucket *ob = b->ob;
+
+ b->ob = NULL;
+
+ __btree_node_free(c, b, NULL);
+
+ bch_open_bucket_put(c, ob);
+}
+
+void bch_btree_node_free_inmem(struct btree_iter *iter, struct btree *b)
+{
+ bch_btree_iter_node_drop_linked(iter, b);
+
+ __btree_node_free(iter->c, b, iter);
+
+ bch_btree_iter_node_drop(iter, b);
+}
+
+static void bch_btree_node_free_ondisk(struct cache_set *c,
+ struct pending_btree_node_free *pending)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+
+ BUG_ON(!pending->index_update_done);
+
+ bch_mark_key(c, bkey_i_to_s_c(&pending->key),
+ -c->sb.btree_node_size, true,
+ gc_phase(GC_PHASE_PENDING_DELETE),
+ &stats, 0);
+ /*
+ * Don't apply stats - pending deletes aren't tracked in
+ * cache_set_stats:
+ */
+}
+
+void btree_open_bucket_put(struct cache_set *c, struct btree *b)
+{
+ bch_open_bucket_put(c, b->ob);
+ b->ob = NULL;
+}
+
+static struct btree *__bch_btree_node_alloc(struct cache_set *c,
+ bool use_reserve,
+ struct disk_reservation *res,
+ struct closure *cl)
+{
+ BKEY_PADDED(k) tmp;
+ struct open_bucket *ob;
+ struct btree *b;
+ unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ if (c->btree_reserve_cache_nr > reserve) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ ob = a->ob;
+ bkey_copy(&tmp.k, &a->k);
+ mutex_unlock(&c->btree_reserve_cache_lock);
+ goto mem_alloc;
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+retry:
+ /* alloc_sectors is weird, I suppose */
+ bkey_extent_init(&tmp.k);
+ tmp.k.k.size = c->sb.btree_node_size;
+
+ ob = bch_alloc_sectors(c, &c->btree_write_point,
+ bkey_i_to_extent(&tmp.k),
+ res->nr_replicas,
+ use_reserve ? RESERVE_BTREE : RESERVE_NONE,
+ cl);
+ if (IS_ERR(ob))
+ return ERR_CAST(ob);
+
+ if (tmp.k.k.size < c->sb.btree_node_size) {
+ bch_open_bucket_put(c, ob);
+ goto retry;
+ }
+mem_alloc:
+ b = mca_alloc(c);
+
+ /* we hold cannibalize_lock: */
+ BUG_ON(IS_ERR(b));
+ BUG_ON(b->ob);
+
+ bkey_copy(&b->key, &tmp.k);
+ b->key.k.size = 0;
+ b->ob = ob;
+
+ return b;
+}
+
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+ unsigned level, enum btree_id id,
+ struct btree_reserve *reserve)
+{
+ struct btree *b;
+
+ BUG_ON(!reserve->nr);
+
+ b = reserve->b[--reserve->nr];
+
+ BUG_ON(mca_hash_insert(c, b, level, id));
+
+ set_btree_node_accessed(b);
+ set_btree_node_dirty(b);
+
+ bch_bset_init_first(b, &b->data->keys);
+ memset(&b->nr, 0, sizeof(b->nr));
+ b->data->magic = cpu_to_le64(bset_magic(&c->disk_sb));
+ SET_BSET_BTREE_LEVEL(&b->data->keys, level);
+
+ bch_btree_build_aux_trees(b);
+
+ bch_check_mark_super(c, &b->key, true);
+
+ trace_bcache_btree_node_alloc(c, b);
+ return b;
+}
+
+struct btree *__btree_node_alloc_replacement(struct cache_set *c,
+ struct btree *b,
+ struct bkey_format format,
+ struct btree_reserve *reserve)
+{
+ struct btree *n;
+
+ n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve);
+
+ n->data->min_key = b->data->min_key;
+ n->data->max_key = b->data->max_key;
+ n->data->format = format;
+
+ btree_node_set_format(n, format);
+
+ bch_btree_sort_into(c, n, b);
+
+ btree_node_reset_sib_u64s(n);
+
+ n->key.k.p = b->key.k.p;
+ trace_bcache_btree_node_alloc_replacement(c, b, n);
+
+ return n;
+}
+
+struct btree *btree_node_alloc_replacement(struct cache_set *c,
+ struct btree *b,
+ struct btree_reserve *reserve)
+{
+ struct bkey_format new_f = bch_btree_calc_format(b);
+
+ /*
+ * The keys might expand with the new format - if they wouldn't fit in
+ * the btree node anymore, use the old format for now:
+ */
+ if (!bch_btree_node_format_fits(c, b, &new_f))
+ new_f = b->format;
+
+ return __btree_node_alloc_replacement(c, b, new_f, reserve);
+}
+
+static void bch_btree_set_root_inmem(struct cache_set *c, struct btree *b,
+ struct btree_reserve *btree_reserve)
+{
+ struct btree *old = btree_node_root(c, b);
+
+ /* Root nodes cannot be reaped */
+ mutex_lock(&c->btree_cache_lock);
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache_lock);
+
+ mutex_lock(&c->btree_root_lock);
+ btree_node_root(c, b) = b;
+ mutex_unlock(&c->btree_root_lock);
+
+ if (btree_reserve) {
+ /*
+ * New allocation (we're not being called because we're in
+ * bch_btree_root_read()) - do marking while holding
+ * btree_root_lock:
+ */
+ struct bucket_stats_cache_set stats = { 0 };
+
+ bch_mark_key(c, bkey_i_to_s_c(&b->key),
+ c->sb.btree_node_size, true,
+ gc_pos_btree_root(b->btree_id),
+ &stats, 0);
+
+ if (old)
+ bch_btree_node_free_index(c, NULL, old->btree_id,
+ bkey_i_to_s_c(&old->key),
+ &stats);
+ bch_cache_set_stats_apply(c, &stats, &btree_reserve->disk_res,
+ gc_pos_btree_root(b->btree_id));
+ }
+
+ bch_recalc_btree_reserve(c);
+}
+
+static void bch_btree_set_root_ondisk(struct cache_set *c, struct btree *b)
+{
+ struct btree_root *r = &c->btree_roots[b->btree_id];
+
+ mutex_lock(&c->btree_root_lock);
+
+ BUG_ON(b != r->b);
+ bkey_copy(&r->key, &b->key);
+ r->level = b->level;
+ r->alive = true;
+
+ mutex_unlock(&c->btree_root_lock);
+}
+
+/*
+ * Only for cache set bringup, when first reading the btree roots or allocating
+ * btree roots when initializing a new cache set:
+ */
+void bch_btree_set_root_initial(struct cache_set *c, struct btree *b,
+ struct btree_reserve *btree_reserve)
+{
+ BUG_ON(btree_node_root(c, b));
+
+ bch_btree_set_root_inmem(c, b, btree_reserve);
+ bch_btree_set_root_ondisk(c, b);
+}
+
+/**
+ * bch_btree_set_root - update the root in memory and on disk
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks. However, you must hold an intent lock on the
+ * old root.
+ *
+ * Note: This allocates a journal entry but doesn't add any keys to
+ * it. All the btree roots are part of every journal write, so there
+ * is nothing new to be done. This just guarantees that there is a
+ * journal write.
+ */
+static void bch_btree_set_root(struct btree_iter *iter, struct btree *b,
+ struct btree_interior_update *as,
+ struct btree_reserve *btree_reserve)
+{
+ struct cache_set *c = iter->c;
+ struct btree *old;
+
+ trace_bcache_btree_set_root(c, b);
+ BUG_ON(!b->written);
+
+ old = btree_node_root(c, b);
+
+ /*
+ * Ensure no one is using the old root while we switch to the
+ * new root:
+ */
+ btree_node_lock_write(old, iter);
+
+ bch_btree_set_root_inmem(c, b, btree_reserve);
+
+ btree_interior_update_updated_root(c, as, iter->btree_id);
+
+ /*
+ * Unlock old root after new root is visible:
+ *
+ * The new root isn't persistent, but that's ok: we still have
+ * an intent lock on the new root, and any updates that would
+ * depend on the new root would have to update the new root.
+ */
+ btree_node_unlock_write(old, iter);
+}
+
+static struct btree *__btree_root_alloc(struct cache_set *c, unsigned level,
+ enum btree_id id,
+ struct btree_reserve *reserve)
+{
+ struct btree *b = bch_btree_node_alloc(c, level, id, reserve);
+
+ b->data->min_key = POS_MIN;
+ b->data->max_key = POS_MAX;
+ b->data->format = bch_btree_calc_format(b);
+ b->key.k.p = POS_MAX;
+
+ btree_node_set_format(b, b->data->format);
+ bch_btree_build_aux_trees(b);
+
+ six_unlock_write(&b->lock);
+
+ return b;
+}
+
+void bch_btree_reserve_put(struct cache_set *c, struct btree_reserve *reserve)
+{
+ bch_disk_reservation_put(c, &reserve->disk_res);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+
+ while (reserve->nr) {
+ struct btree *b = reserve->b[--reserve->nr];
+
+ six_unlock_write(&b->lock);
+
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+ a->ob = b->ob;
+ b->ob = NULL;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch_open_bucket_put(c, b->ob);
+ b->ob = NULL;
+ }
+
+ __btree_node_free(c, b, NULL);
+
+ six_unlock_intent(&b->lock);
+ }
+
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ mempool_free(reserve, &c->btree_reserve_pool);
+}
+
+static struct btree_reserve *__bch_btree_reserve_get(struct cache_set *c,
+ unsigned nr_nodes,
+ unsigned flags,
+ struct closure *cl)
+{
+ struct btree_reserve *reserve;
+ struct btree *b;
+ struct disk_reservation disk_res = { 0, 0 };
+ unsigned sectors = nr_nodes * c->sb.btree_node_size;
+ int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD|
+ BCH_DISK_RESERVATION_METADATA;
+
+ if (flags & BTREE_INSERT_NOFAIL)
+ disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
+
+ /*
+ * This check isn't necessary for correctness - it's just to potentially
+ * prevent us from doing a lot of work that'll end up being wasted:
+ */
+ ret = bch_journal_error(&c->journal);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (bch_disk_reservation_get(c, &disk_res, sectors, disk_res_flags))
+ return ERR_PTR(-ENOSPC);
+
+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+
+ /*
+ * Protects reaping from the btree node cache and using the btree node
+ * open bucket reserve:
+ */
+ ret = mca_cannibalize_lock(c, cl);
+ if (ret) {
+ bch_disk_reservation_put(c, &disk_res);
+ return ERR_PTR(ret);
+ }
+
+ reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
+
+ reserve->disk_res = disk_res;
+ reserve->nr = 0;
+
+ while (reserve->nr < nr_nodes) {
+ b = __bch_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
+ &disk_res, cl);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err_free;
+ }
+
+ reserve->b[reserve->nr++] = b;
+ }
+
+ mca_cannibalize_unlock(c);
+ return reserve;
+err_free:
+ bch_btree_reserve_put(c, reserve);
+ mca_cannibalize_unlock(c);
+ trace_bcache_btree_reserve_get_fail(c, nr_nodes, cl);
+ return ERR_PTR(ret);
+}
+
+struct btree_reserve *bch_btree_reserve_get(struct cache_set *c,
+ struct btree *b,
+ unsigned extra_nodes,
+ unsigned flags,
+ struct closure *cl)
+{
+ unsigned depth = btree_node_root(c, b)->level - b->level;
+ unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes;
+
+ return __bch_btree_reserve_get(c, nr_nodes, flags, cl);
+}
+
+int bch_btree_root_alloc(struct cache_set *c, enum btree_id id,
+ struct closure *writes)
+{
+ struct closure cl;
+ struct btree_reserve *reserve;
+ struct btree *b;
+
+ closure_init_stack(&cl);
+
+ while (1) {
+ /* XXX haven't calculated capacity yet :/ */
+ reserve = __bch_btree_reserve_get(c, 1, 0, &cl);
+ if (!IS_ERR(reserve))
+ break;
+
+ if (PTR_ERR(reserve) == -ENOSPC)
+ return PTR_ERR(reserve);
+
+ closure_sync(&cl);
+ }
+
+ b = __btree_root_alloc(c, 0, id, reserve);
+
+ bch_btree_node_write(c, b, writes, SIX_LOCK_intent, -1);
+
+ bch_btree_set_root_initial(c, b, reserve);
+ btree_open_bucket_put(c, b);
+ six_unlock_intent(&b->lock);
+
+ bch_btree_reserve_put(c, reserve);
+
+ return 0;
+}
+
+static void bch_insert_fixup_btree_ptr(struct btree_iter *iter,
+ struct btree *b,
+ struct bkey_i *insert,
+ struct btree_node_iter *node_iter,
+ struct disk_reservation *disk_res)
+{
+ struct cache_set *c = iter->c;
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bkey_packed *k;
+ struct bkey tmp;
+
+ if (bkey_extent_is_data(&insert->k))
+ bch_mark_key(c, bkey_i_to_s_c(insert),
+ c->sb.btree_node_size, true,
+ gc_pos_btree_node(b), &stats, 0);
+
+ while ((k = bch_btree_node_iter_peek_all(node_iter, b)) &&
+ !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
+ bch_btree_node_iter_advance(node_iter, b);
+
+ /*
+ * If we're overwriting, look up pending delete and mark so that gc
+ * marks it on the pending delete list:
+ */
+ if (k && !bkey_cmp_packed(b, k, &insert->k))
+ bch_btree_node_free_index(c, b, iter->btree_id,
+ bkey_disassemble(b, k, &tmp),
+ &stats);
+
+ bch_cache_set_stats_apply(c, &stats, disk_res, gc_pos_btree_node(b));
+
+ bch_btree_bset_insert_key(iter, b, node_iter, insert);
+ set_btree_node_dirty(b);
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents: */
+bool bch_btree_bset_insert_key(struct btree_iter *iter,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
+{
+ const struct bkey_format *f = &b->format;
+ struct bkey_packed *k;
+ struct bset_tree *t;
+ unsigned clobber_u64s;
+
+ EBUG_ON(btree_node_just_written(b));
+ EBUG_ON(bset_written(b, btree_bset_last(b)));
+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+ EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
+ bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
+
+ k = bch_btree_node_iter_peek_all(node_iter, b);
+ if (k && !bkey_cmp_packed(b, k, &insert->k)) {
+ BUG_ON(bkey_whiteout(k));
+
+ t = bch_bkey_to_bset(b, k);
+
+ if (bset_unwritten(b, bset(b, t)) &&
+ bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
+ BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
+
+ k->type = insert->k.type;
+ memcpy_u64s(bkeyp_val(f, k), &insert->v,
+ bkey_val_u64s(&insert->k));
+ return true;
+ }
+
+ insert->k.needs_whiteout = k->needs_whiteout;
+
+ btree_keys_account_key_drop(&b->nr, t - b->set, k);
+
+ if (t == bset_tree_last(b)) {
+ clobber_u64s = k->u64s;
+
+ /*
+ * If we're deleting, and the key we're deleting doesn't
+ * need a whiteout (it wasn't overwriting a key that had
+ * been written to disk) - just delete it:
+ */
+ if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
+ bch_bset_delete(b, k, clobber_u64s);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ k, clobber_u64s, 0);
+ return true;
+ }
+
+ goto overwrite;
+ }
+
+ k->type = KEY_TYPE_DELETED;
+ bch_btree_node_iter_fix(iter, b, node_iter, t, k,
+ k->u64s, k->u64s);
+
+ if (bkey_whiteout(&insert->k)) {
+ reserve_whiteout(b, t, k);
+ return true;
+ } else {
+ k->needs_whiteout = false;
+ }
+ } else {
+ /*
+ * Deleting, but the key to delete wasn't found - nothing to do:
+ */
+ if (bkey_whiteout(&insert->k))
+ return false;
+
+ insert->k.needs_whiteout = false;
+ }
+
+ t = bset_tree_last(b);
+ k = bch_btree_node_iter_bset_pos(node_iter, b, t);
+ clobber_u64s = 0;
+overwrite:
+ bch_bset_insert(b, node_iter, k, insert, clobber_u64s);
+ if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
+ bch_btree_node_iter_fix(iter, b, node_iter, t, k,
+ clobber_u64s, k->u64s);
+ return true;
+}
+
+static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+ unsigned i)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct btree_write *w = container_of(pin, struct btree_write, journal);
+ struct btree *b = container_of(w, struct btree, writes[i]);
+
+ six_lock_read(&b->lock);
+ /*
+ * Reusing a btree node can race with the journal reclaim code calling
+ * the journal pin flush fn, and there's no good fix for this: we don't
+ * really want journal_pin_drop() to block until the flush fn is no
+ * longer running, because journal_pin_drop() is called from the btree
+ * node write endio function, and we can't wait on the flush fn to
+ * finish running in mca_reap() - where we make reused btree nodes ready
+ * to use again - because there, we're holding the lock this function
+ * needs - deadlock.
+ *
+ * So, the b->level check is a hack so we don't try to write nodes we
+ * shouldn't:
+ */
+ if (!b->level)
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
+ six_unlock_read(&b->lock);
+}
+
+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin)
+{
+ return __btree_node_flush(j, pin, 0);
+}
+
+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin)
+{
+ return __btree_node_flush(j, pin, 1);
+}
+
+void bch_btree_journal_key(struct btree_insert *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct cache_set *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree *b = iter->nodes[0];
+ struct btree_write *w = btree_current_write(b);
+
+ EBUG_ON(iter->level || b->level);
+ EBUG_ON(!trans->journal_res.ref &&
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+ if (!journal_pin_active(&w->journal))
+ bch_journal_pin_add(j, &w->journal,
+ btree_node_write_idx(b) == 0
+ ? btree_node_flush0
+ : btree_node_flush1);
+
+ if (trans->journal_res.ref) {
+ u64 seq = trans->journal_res.seq;
+ bool needs_whiteout = insert->k.needs_whiteout;
+
+ /*
+ * We have a bug where we're seeing an extent with an invalid crc
+ * entry in the journal - this assertion is here to help track it down:
+ */
+ BUG_ON(bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert)));
+
+ /* ick */
+ insert->k.needs_whiteout = false;
+ bch_journal_add_keys(j, &trans->journal_res,
+ b->btree_id, insert);
+ insert->k.needs_whiteout = needs_whiteout;
+
+ if (trans->journal_seq)
+ *trans->journal_seq = seq;
+ btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
+ }
+
+ if (!btree_node_dirty(b))
+ set_btree_node_dirty(b);
+}
+
+static enum btree_insert_ret
+bch_insert_fixup_key(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ struct btree_iter *iter = insert->iter;
+
+ BUG_ON(iter->level);
+
+ if (bch_btree_bset_insert_key(iter,
+ iter->nodes[0],
+ &iter->node_iters[0],
+ insert->k))
+ bch_btree_journal_key(trans, iter, insert->k);
+
+ trans->did_work = true;
+ return BTREE_INSERT_OK;
+}
+
+static void verify_keys_sorted(struct keylist *l)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ struct bkey_i *k;
+
+ for_each_keylist_key(l, k)
+ BUG_ON(bkey_next(k) != l->top &&
+ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
+#endif
+}
+
+static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter)
+{
+ struct cache_set *c = iter->c;
+
+ btree_node_lock_write(b, iter);
+
+ if (btree_node_just_written(b) &&
+ bch_btree_post_write_cleanup(c, b))
+ bch_btree_iter_reinit_node(iter, b);
+
+ /*
+ * If the last bset has been written, or if it's gotten too big - start
+ * a new bset to insert into:
+ */
+ if (want_new_bset(c, b))
+ bch_btree_init_next(c, b, iter);
+}
+
+/* Asynchronous interior node update machinery */
+
+struct btree_interior_update *
+bch_btree_interior_update_alloc(struct cache_set *c)
+{
+ struct btree_interior_update *as;
+
+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
+ memset(as, 0, sizeof(*as));
+ closure_init(&as->cl, &c->cl);
+ as->c = c;
+ as->mode = BTREE_INTERIOR_NO_UPDATE;
+
+ bch_keylist_init(&as->parent_keys, as->inline_keys,
+ ARRAY_SIZE(as->inline_keys));
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add(&as->list, &c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ return as;
+}
+
+static void btree_interior_update_free(struct closure *cl)
+{
+ struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
+
+ mempool_free(as, &as->c->btree_interior_update_pool);
+}
+
+static void btree_interior_update_nodes_reachable(struct closure *cl)
+{
+ struct btree_interior_update *as =
+ container_of(cl, struct btree_interior_update, cl);
+ struct cache_set *c = as->c;
+ unsigned i;
+
+ bch_journal_pin_drop(&c->journal, &as->journal);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ for (i = 0; i < as->nr_pending; i++)
+ bch_btree_node_free_ondisk(c, &as->pending[i]);
+ as->nr_pending = 0;
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_del(&as->list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ closure_wake_up(&as->wait);
+
+ closure_return_with_destructor(cl, btree_interior_update_free);
+}
+
+static void btree_interior_update_nodes_written(struct closure *cl)
+{
+ struct btree_interior_update *as =
+ container_of(cl, struct btree_interior_update, cl);
+ struct cache_set *c = as->c;
+ struct btree *b;
+
+ if (bch_journal_error(&c->journal)) {
+ /* XXX what? */
+ }
+
+ /* XXX: missing error handling, damnit */
+
+ /* check for journal error, bail out if we flushed */
+
+ /*
+ * We did an update to a parent node where the pointers we added pointed
+ * to child nodes that weren't written yet: now, the child nodes have
+ * been written so we can write out the update to the interior node.
+ */
+retry:
+ mutex_lock(&c->btree_interior_update_lock);
+ switch (as->mode) {
+ case BTREE_INTERIOR_NO_UPDATE:
+ BUG();
+ case BTREE_INTERIOR_UPDATING_NODE:
+ /* The usual case: */
+ b = READ_ONCE(as->b);
+
+ if (!six_trylock_read(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ six_lock_read(&b->lock);
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+
+ BUG_ON(!btree_node_dirty(b));
+ closure_wait(&btree_current_write(b)->wait, cl);
+
+ list_del(&as->write_blocked_list);
+
+ if (list_empty(&b->write_blocked))
+ bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
+ six_unlock_read(&b->lock);
+ break;
+
+ case BTREE_INTERIOR_UPDATING_AS:
+ /*
+ * The btree node we originally updated has been freed and is
+ * being rewritten - so we don't need to write anything here, we
+ * just need to signal to that btree_interior_update that it's ok
+ * to make the new replacement node visible:
+ */
+ closure_put(&as->parent_as->cl);
+
+ /*
+ * and then we have to wait on that btree_interior_update to finish:
+ */
+ closure_wait(&as->parent_as->wait, cl);
+ break;
+
+ case BTREE_INTERIOR_UPDATING_ROOT:
+ /* b is the new btree root: */
+ b = READ_ONCE(as->b);
+
+ if (!six_trylock_read(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ six_lock_read(&b->lock);
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+
+ BUG_ON(c->btree_roots[b->btree_id].as != as);
+ c->btree_roots[b->btree_id].as = NULL;
+
+ bch_btree_set_root_ondisk(c, b);
+
+ /*
+		 * We don't have to wait on anything here (before
+ * btree_interior_update_nodes_reachable frees the old nodes
+ * ondisk) - we've ensured that the very next journal write will
+ * have the pointer to the new root, and before the allocator
+ * can reuse the old nodes it'll have to do a journal commit:
+ */
+ six_unlock_read(&b->lock);
+ }
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ */
+static void btree_interior_update_updated_btree(struct cache_set *c,
+ struct btree_interior_update *as,
+ struct btree *b)
+{
+ mutex_lock(&c->btree_interior_update_lock);
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(!btree_node_dirty(b));
+
+ as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->b = b;
+ list_add(&as->write_blocked_list, &b->write_blocked);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
+
+ continue_at(&as->cl, btree_interior_update_nodes_written,
+ system_freezable_wq);
+}
+
+static void btree_interior_update_updated_root(struct cache_set *c,
+ struct btree_interior_update *as,
+ enum btree_id btree_id)
+{
+ struct btree_root *r = &c->btree_roots[btree_id];
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+
+ /*
+ * Old root might not be persistent yet - if so, redirect its
+ * btree_interior_update operation to point to us:
+ */
+ if (r->as) {
+ BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
+
+ r->as->b = NULL;
+ r->as->mode = BTREE_INTERIOR_UPDATING_AS;
+ r->as->parent_as = as;
+ closure_get(&as->cl);
+ }
+
+ as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->b = r->b;
+ r->as = as;
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
+
+ continue_at(&as->cl, btree_interior_update_nodes_written,
+ system_freezable_wq);
+}
+
+static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+ struct btree_interior_update *as =
+ container_of(pin, struct btree_interior_update, journal);
+
+ bch_journal_flush_seq_async(j, as->journal_seq, NULL);
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding btree_interior_updates - redirect @b's
+ * btree_interior_updates to point to this btree_interior_update:
+ */
+void bch_btree_interior_update_will_free_node(struct cache_set *c,
+ struct btree_interior_update *as,
+ struct btree *b)
+{
+ struct btree_interior_update *p, *n;
+ struct pending_btree_node_free *d;
+ struct bset_tree *t;
+
+ /*
+ * Does this node have data that hasn't been written in the journal?
+ *
+ * If so, we have to wait for the corresponding journal entry to be
+ * written before making the new nodes reachable - we can't just carry
+ * over the bset->journal_seq tracking, since we'll be mixing those keys
+ * in with keys that aren't in the journal anymore:
+ */
+ for_each_bset(b, t)
+ as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+
+ /*
+ * Does this node have unwritten data that has a pin on the journal?
+ *
+ * If so, transfer that pin to the btree_interior_update operation -
+ * note that if we're freeing multiple nodes, we only need to keep the
+ * oldest pin of any of the nodes we're freeing. We'll release the pin
+ * when the new nodes are persistent and reachable on disk:
+ */
+ bch_journal_pin_add_if_older(&c->journal,
+ &b->writes[0].journal,
+ &as->journal, interior_update_flush);
+ bch_journal_pin_add_if_older(&c->journal,
+ &b->writes[1].journal,
+ &as->journal, interior_update_flush);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Does this node have any btree_interior_update operations preventing
+ * it from being written?
+ *
+ * If so, redirect them to point to this btree_interior_update: we can
+ * write out our new nodes, but we won't make them visible until those
+ * operations complete
+ */
+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
+ BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
+
+ p->mode = BTREE_INTERIOR_UPDATING_AS;
+ list_del(&p->write_blocked_list);
+ p->b = NULL;
+ p->parent_as = as;
+ closure_get(&as->cl);
+ }
+
+ /* Add this node to the list of nodes being freed: */
+ BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+ d = &as->pending[as->nr_pending++];
+ d->index_update_done = false;
+ d->seq = b->data->keys.seq;
+ d->btree_id = b->btree_id;
+ d->level = b->level;
+ bkey_copy(&d->key, &b->key);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_node_interior_verify(struct btree *b)
+{
+ struct btree_node_iter iter;
+ struct bkey_packed *k;
+
+ BUG_ON(!b->level);
+
+ bch_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
+#if 1
+ BUG_ON(!(k = bch_btree_node_iter_peek(&iter, b)) ||
+ bkey_cmp_left_packed(b, k, &b->key.k.p));
+
+ BUG_ON((bch_btree_node_iter_advance(&iter, b),
+ !bch_btree_node_iter_end(&iter)));
+#else
+ const char *msg;
+
+ msg = "not found";
+ k = bch_btree_node_iter_peek(&iter, b);
+ if (!k)
+ goto err;
+
+ msg = "isn't what it should be";
+ if (bkey_cmp_left_packed(b, k, &b->key.k.p))
+ goto err;
+
+ bch_btree_node_iter_advance(&iter, b);
+
+ msg = "isn't last key";
+ if (!bch_btree_node_iter_end(&iter))
+ goto err;
+ return;
+err:
+ bch_dump_btree_node(b);
+ printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
+ b->key.k.p.offset, msg);
+ BUG();
+#endif
+}
+
+static enum btree_insert_ret
+bch_btree_insert_keys_interior(struct btree *b,
+ struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_interior_update *as,
+ struct btree_reserve *res)
+{
+ struct cache_set *c = iter->c;
+ struct btree_iter *linked;
+ struct btree_node_iter node_iter;
+ struct bkey_i *insert = bch_keylist_front(insert_keys);
+ struct bkey_packed *k;
+
+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+ BUG_ON(!b->level);
+ BUG_ON(!as || as->b);
+ verify_keys_sorted(insert_keys);
+
+ btree_node_lock_for_insert(b, iter);
+
+ if (bch_keylist_u64s(insert_keys) >
+ bch_btree_keys_u64s_remaining(c, b)) {
+ btree_node_unlock_write(b, iter);
+ return BTREE_INSERT_BTREE_NODE_FULL;
+ }
+
+ /* Don't screw up @iter's position: */
+ node_iter = iter->node_iters[b->level];
+
+ /*
+ * btree_split(), btree_gc_coalesce() will insert keys before
+ * the iterator's current position - they know the keys go in
+ * the node the iterator points to:
+ */
+ while ((k = bch_btree_node_iter_prev_all(&node_iter, b)) &&
+ (bkey_cmp_packed(b, k, &insert->k) >= 0))
+ ;
+
+ while (!bch_keylist_empty(insert_keys)) {
+ insert = bch_keylist_front(insert_keys);
+
+ bch_insert_fixup_btree_ptr(iter, b, insert,
+ &node_iter, &res->disk_res);
+ bch_keylist_pop_front(insert_keys);
+ }
+
+ btree_interior_update_updated_btree(c, as, b);
+
+ for_each_linked_btree_node(iter, b, linked)
+ bch_btree_node_iter_peek(&linked->node_iters[b->level],
+ b);
+ bch_btree_node_iter_peek(&iter->node_iters[b->level], b);
+
+ bch_btree_iter_verify(iter, b);
+
+ if (bch_maybe_compact_whiteouts(c, b))
+ bch_btree_iter_reinit_node(iter, b);
+
+ btree_node_unlock_write(b, iter);
+
+ btree_node_interior_verify(b);
+ return BTREE_INSERT_OK;
+}
+
+/*
+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher
+ * node)
+ */
+static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
+ struct btree_reserve *reserve)
+{
+ size_t nr_packed = 0, nr_unpacked = 0;
+ struct btree *n2;
+ struct bset *set1, *set2;
+ struct bkey_packed *k, *prev = NULL;
+
+ n2 = bch_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
+ n2->data->max_key = n1->data->max_key;
+ n2->data->format = n1->format;
+ n2->key.k.p = n1->key.k.p;
+
+ btree_node_set_format(n2, n2->data->format);
+
+ set1 = btree_bset_first(n1);
+ set2 = btree_bset_first(n2);
+
+ /*
+ * Has to be a linear search because we don't have an auxiliary
+ * search tree yet
+ */
+ k = set1->start;
+ while (1) {
+ if (bkey_next(k) == bset_bkey_last(set1))
+ break;
+ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
+ break;
+
+ if (bkey_packed(k))
+ nr_packed++;
+ else
+ nr_unpacked++;
+
+ prev = k;
+ k = bkey_next(k);
+ }
+
+ BUG_ON(!prev);
+
+ n1->key.k.p = bkey_unpack_pos(n1, prev);
+ n1->data->max_key = n1->key.k.p;
+ n2->data->min_key =
+ btree_type_successor(n1->btree_id, n1->key.k.p);
+
+ set2->u64s = cpu_to_le16((u64 *) bset_bkey_last(set1) - (u64 *) k);
+ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
+
+ set_btree_bset_end(n1, n1->set);
+ set_btree_bset_end(n2, n2->set);
+
+ n2->nr.live_u64s = le16_to_cpu(set2->u64s);
+ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s);
+ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed;
+ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked;
+
+ n1->nr.live_u64s = le16_to_cpu(set1->u64s);
+ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
+ n1->nr.packed_keys = nr_packed;
+ n1->nr.unpacked_keys = nr_unpacked;
+
+ BUG_ON(!set1->u64s);
+ BUG_ON(!set2->u64s);
+
+ memcpy_u64s(set2->start,
+ bset_bkey_last(set1),
+ le16_to_cpu(set2->u64s));
+
+ btree_node_reset_sib_u64s(n1);
+ btree_node_reset_sib_u64s(n2);
+
+ bch_verify_btree_nr_keys(n1);
+ bch_verify_btree_nr_keys(n2);
+
+ if (n1->level) {
+ btree_node_interior_verify(n1);
+ btree_node_interior_verify(n2);
+ }
+
+ return n2;
+}
+
+/*
+ * For updates to interior nodes, we've got to do the insert before we split
+ * because the stuff we're inserting has to be inserted atomically. Post split,
+ * the keys might have to go in different nodes and the split would no longer be
+ * atomic.
+ *
+ * Worse, if the insert comes from btree node coalescing and we do the insert
+ * after the split (and pick the pivot), the pivot we pick might be between
+ * nodes that were coalesced - and thus in the middle of a child node post
+ * coalescing:
+ */
+static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
+ struct keylist *keys,
+ struct btree_reserve *res)
+{
+ struct btree_node_iter node_iter;
+ struct bkey_i *k = bch_keylist_front(keys);
+ struct bkey_packed *p;
+ struct bset *i;
+
+ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
+
+ bch_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
+
+ while (!bch_keylist_empty(keys)) {
+ k = bch_keylist_front(keys);
+
+ BUG_ON(bch_keylist_u64s(keys) >
+ bch_btree_keys_u64s_remaining(iter->c, b));
+ BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
+ BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
+
+ bch_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res);
+ bch_keylist_pop_front(keys);
+ }
+
+ /*
+ * We can't tolerate whiteouts here - with whiteouts there can be
+ * duplicate keys, and it would be rather bad if we picked a duplicate
+ * for the pivot:
+ */
+ i = btree_bset_first(b);
+ p = i->start;
+ while (p != bset_bkey_last(i))
+ if (bkey_deleted(p)) {
+ le16_add_cpu(&i->u64s, -p->u64s);
+ set_btree_bset_end(b, b->set);
+ memmove_u64s_down(p, bkey_next(p),
+ (u64 *) bset_bkey_last(i) -
+ (u64 *) p);
+ } else
+ p = bkey_next(p);
+
+ BUG_ON(b->nsets != 1 ||
+ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
+
+ btree_node_interior_verify(b);
+}
+
+static void btree_split(struct btree *b, struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_reserve *reserve,
+ struct btree_interior_update *as)
+{
+ struct cache_set *c = iter->c;
+ struct btree *parent = iter->nodes[b->level + 1];
+ struct btree *n1, *n2 = NULL, *n3 = NULL;
+ u64 start_time = local_clock();
+
+ BUG_ON(!parent && (b != btree_node_root(c, b)));
+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+
+ bch_btree_interior_update_will_free_node(c, as, b);
+
+ n1 = btree_node_alloc_replacement(c, b, reserve);
+ if (b->level)
+ btree_split_insert_keys(iter, n1, insert_keys, reserve);
+
+ if (__set_blocks(n1->data,
+ le16_to_cpu(n1->data->keys.u64s),
+ block_bytes(c)) > BTREE_SPLIT_THRESHOLD(c)) {
+ trace_bcache_btree_node_split(c, b, b->nr.live_u64s);
+
+ n2 = __btree_split_node(iter, n1, reserve);
+
+ bch_btree_build_aux_trees(n2);
+ bch_btree_build_aux_trees(n1);
+ six_unlock_write(&n2->lock);
+ six_unlock_write(&n1->lock);
+
+ bch_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1);
+
+ /*
+		 * Note that on recursive splits parent_keys == insert_keys, so we
+ * can't start adding new keys to parent_keys before emptying it
+ * out (which we did with btree_split_insert_keys() above)
+ */
+ bch_keylist_add(&as->parent_keys, &n1->key);
+ bch_keylist_add(&as->parent_keys, &n2->key);
+
+ if (!parent) {
+ /* Depth increases, make a new root */
+ n3 = __btree_root_alloc(c, b->level + 1,
+ iter->btree_id,
+ reserve);
+ n3->sib_u64s[0] = U16_MAX;
+ n3->sib_u64s[1] = U16_MAX;
+
+ btree_split_insert_keys(iter, n3, &as->parent_keys,
+ reserve);
+ bch_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1);
+ }
+ } else {
+ trace_bcache_btree_node_compact(c, b, b->nr.live_u64s);
+
+ bch_btree_build_aux_trees(n1);
+ six_unlock_write(&n1->lock);
+
+ bch_keylist_add(&as->parent_keys, &n1->key);
+ }
+
+ bch_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1);
+
+ /* New nodes all written, now make them visible: */
+
+ if (parent) {
+ /* Split a non root node */
+ bch_btree_insert_node(parent, iter, &as->parent_keys,
+ reserve, as);
+ } else if (n3) {
+ bch_btree_set_root(iter, n3, as, reserve);
+ } else {
+ /* Root filled up but didn't need to be split */
+ bch_btree_set_root(iter, n1, as, reserve);
+ }
+
+ btree_open_bucket_put(c, n1);
+ if (n2)
+ btree_open_bucket_put(c, n2);
+ if (n3)
+ btree_open_bucket_put(c, n3);
+
+ /*
+ * Note - at this point other linked iterators could still have @b read
+ * locked; we're depending on the bch_btree_iter_node_replace() calls
+ * below removing all references to @b so we don't return with other
+ * iterators pointing to a node they have locked that's been freed.
+ *
+	 * We have to free the node first because the bch_btree_iter_node_replace()
+ * calls will drop _our_ iterator's reference - and intent lock - to @b.
+ */
+ bch_btree_node_free_inmem(iter, b);
+
+ /* Successful split, update the iterator to point to the new nodes: */
+
+ if (n3)
+ bch_btree_iter_node_replace(iter, n3);
+ if (n2)
+ bch_btree_iter_node_replace(iter, n2);
+ bch_btree_iter_node_replace(iter, n1);
+
+ bch_time_stats_update(&c->btree_split_time, start_time);
+}
+
+/**
+ * bch_btree_insert_node - insert bkeys into a given btree node
+ *
+ * @b: btree node to insert into (must not be a leaf)
+ * @iter: btree iterator
+ * @insert_keys: list of keys to insert
+ * @reserve: btree reserve to allocate new nodes from, if a split is needed
+ * @as: btree_interior_update tracking this insert
+ *
+ * Inserts as many keys as it can into a given btree node, splitting it if full.
+ * If a split occurred, this function will return early. This can only happen
+ * for leaf nodes -- inserts into interior nodes have to be atomic.
+ */
+void bch_btree_insert_node(struct btree *b,
+ struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_reserve *reserve,
+ struct btree_interior_update *as)
+{
+ BUG_ON(!b->level);
+ BUG_ON(!reserve || !as);
+
+ switch (bch_btree_insert_keys_interior(b, iter, insert_keys,
+ as, reserve)) {
+ case BTREE_INSERT_OK:
+ break;
+ case BTREE_INSERT_BTREE_NODE_FULL:
+ btree_split(b, iter, insert_keys, reserve, as);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b = iter->nodes[0];
+ struct btree_reserve *reserve;
+ struct btree_interior_update *as;
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+	/* Hack, because gc and splitting nodes don't mix yet: */
+ if (!down_read_trylock(&c->gc_lock)) {
+ bch_btree_iter_unlock(iter);
+ down_read(&c->gc_lock);
+ }
+
+ /*
+ * XXX: figure out how far we might need to split,
+ * instead of locking/reserving all the way to the root:
+ */
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ reserve = bch_btree_reserve_get(c, b, 0, flags, &cl);
+ if (IS_ERR(reserve)) {
+ ret = PTR_ERR(reserve);
+ if (ret == -EAGAIN) {
+ bch_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ return -EINTR;
+ }
+ goto out;
+ }
+
+ as = bch_btree_interior_update_alloc(c);
+
+ btree_split(b, iter, NULL, reserve, as);
+ bch_btree_reserve_put(c, reserve);
+
+ bch_btree_iter_set_locks_want(iter, 1);
+out:
+ up_read(&c->gc_lock);
+ return ret;
+}
+
+enum btree_node_sibling {
+ btree_prev_sib,
+ btree_next_sib,
+};
+
+static struct btree *btree_node_get_sibling(struct btree_iter *iter,
+ struct btree *b,
+ enum btree_node_sibling sib)
+{
+ struct btree *parent;
+ struct btree_node_iter node_iter;
+ struct bkey_packed *k;
+ BKEY_PADDED(k) tmp;
+ struct btree *ret;
+ unsigned level = b->level;
+
+ parent = iter->nodes[level + 1];
+ if (!parent)
+ return NULL;
+
+ if (!btree_node_relock(iter, level + 1)) {
+ bch_btree_iter_set_locks_want(iter, level + 2);
+ return ERR_PTR(-EINTR);
+ }
+
+ node_iter = iter->node_iters[parent->level];
+
+ k = bch_btree_node_iter_peek_all(&node_iter, parent);
+ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
+
+ do {
+ k = sib == btree_prev_sib
+ ? bch_btree_node_iter_prev_all(&node_iter, parent)
+ : (bch_btree_node_iter_advance(&node_iter, parent),
+ bch_btree_node_iter_peek_all(&node_iter, parent));
+ if (!k)
+ return NULL;
+ } while (bkey_deleted(k));
+
+ bkey_unpack(parent, &tmp.k, k);
+
+ ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
+
+ if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
+ btree_node_unlock(iter, level);
+ ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
+ }
+
+ if (!IS_ERR(ret) && !btree_node_relock(iter, level)) {
+ six_unlock_intent(&ret->lock);
+ ret = ERR_PTR(-EINTR);
+ }
+
+ return ret;
+}
+
+static int __foreground_maybe_merge(struct btree_iter *iter,
+ enum btree_node_sibling sib)
+{
+ struct cache_set *c = iter->c;
+ struct btree_reserve *reserve;
+ struct btree_interior_update *as;
+ struct bkey_format_state new_s;
+ struct bkey_format new_f;
+ struct bkey_i delete;
+ struct btree *b, *m, *n, *prev, *next, *parent;
+ struct closure cl;
+ size_t sib_u64s;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+retry:
+ if (!btree_node_relock(iter, iter->level))
+ return 0;
+
+ b = iter->nodes[iter->level];
+
+ parent = iter->nodes[b->level + 1];
+ if (!parent)
+ return 0;
+
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+ return 0;
+
+ /* XXX: can't be holding read locks */
+ m = btree_node_get_sibling(iter, b, sib);
+ if (IS_ERR(m)) {
+ ret = PTR_ERR(m);
+ goto out;
+ }
+
+ /* NULL means no sibling: */
+ if (!m) {
+ b->sib_u64s[sib] = U16_MAX;
+ return 0;
+ }
+
+ if (sib == btree_prev_sib) {
+ prev = m;
+ next = b;
+ } else {
+ prev = b;
+ next = m;
+ }
+
+ bch_bkey_format_init(&new_s);
+ __bch_btree_calc_format(&new_s, b);
+ __bch_btree_calc_format(&new_s, m);
+ new_f = bch_bkey_format_done(&new_s);
+
+ sib_u64s = btree_node_u64s_with_format(b, &new_f) +
+ btree_node_u64s_with_format(m, &new_f);
+
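+	/*
+	 * Combined size is above the hysteresis threshold: record sib_u64s
+	 * pulled halfway back towards it, i.e. (sib_u64s + hysteresis) / 2:
+	 */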
+ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ sib_u64s /= 2;
+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ }
+
+ sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ b->sib_u64s[sib] = sib_u64s;
+
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
+ six_unlock_intent(&m->lock);
+ return 0;
+ }
+
+ /* We're changing btree topology, doesn't mix with gc: */
+ if (!down_read_trylock(&c->gc_lock)) {
+ six_unlock_intent(&m->lock);
+ bch_btree_iter_unlock(iter);
+
+ down_read(&c->gc_lock);
+ up_read(&c->gc_lock);
+ ret = -EINTR;
+ goto out;
+ }
+
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
+ ret = -EINTR;
+ goto out_unlock;
+ }
+
+ reserve = bch_btree_reserve_get(c, b, 0,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE,
+ &cl);
+ if (IS_ERR(reserve)) {
+ ret = PTR_ERR(reserve);
+ goto out_unlock;
+ }
+
+ as = bch_btree_interior_update_alloc(c);
+
+ bch_btree_interior_update_will_free_node(c, as, b);
+ bch_btree_interior_update_will_free_node(c, as, m);
+
+ n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve);
+ n->data->min_key = prev->data->min_key;
+ n->data->max_key = next->data->max_key;
+ n->data->format = new_f;
+ n->key.k.p = next->key.k.p;
+
+ btree_node_set_format(n, new_f);
+
+ bch_btree_sort_into(c, n, prev);
+ bch_btree_sort_into(c, n, next);
+
+ bch_btree_build_aux_trees(n);
+ six_unlock_write(&n->lock);
+
+ bkey_init(&delete.k);
+ delete.k.p = prev->key.k.p;
+ bch_keylist_add(&as->parent_keys, &delete);
+ bch_keylist_add(&as->parent_keys, &n->key);
+
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+
+ bch_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
+
+ btree_open_bucket_put(c, n);
+ bch_btree_node_free_inmem(iter, b);
+ bch_btree_node_free_inmem(iter, m);
+ bch_btree_iter_node_replace(iter, n);
+
+ bch_btree_iter_verify(iter, n);
+
+ bch_btree_reserve_put(c, reserve);
+out_unlock:
+ if (ret != -EINTR && ret != -EAGAIN)
+ bch_btree_iter_set_locks_want(iter, 1);
+ six_unlock_intent(&m->lock);
+ up_read(&c->gc_lock);
+out:
+ if (ret == -EAGAIN || ret == -EINTR) {
+ bch_btree_iter_unlock(iter);
+ ret = -EINTR;
+ }
+
+ closure_sync(&cl);
+
+ if (ret == -EINTR) {
+ ret = bch_btree_iter_traverse(iter);
+ if (!ret)
+ goto retry;
+ }
+
+ return ret;
+}
+
+static inline int foreground_maybe_merge(struct btree_iter *iter,
+ enum btree_node_sibling sib)
+{
+ struct cache_set *c = iter->c;
+ struct btree *b;
+
+ if (!btree_node_locked(iter, iter->level))
+ return 0;
+
+ b = iter->nodes[iter->level];
+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+ return 0;
+
+ return __foreground_maybe_merge(iter, sib);
+}
+
+/**
+ * btree_insert_key - insert one key into a leaf node
+ */
+static enum btree_insert_ret
+btree_insert_key(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ struct cache_set *c = trans->c;
+ struct btree_iter *iter = insert->iter;
+ struct btree *b = iter->nodes[0];
+ enum btree_insert_ret ret;
+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+
+ ret = !btree_node_is_extents(b)
+ ? bch_insert_fixup_key(trans, insert)
+ : bch_insert_fixup_extent(trans, insert);
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch_maybe_compact_whiteouts(iter->c, b))
+ bch_btree_iter_reinit_node(iter, b);
+
+ trace_bcache_btree_insert_key(c, b, insert->k);
+ return ret;
+}
+
+static bool same_leaf_as_prev(struct btree_insert *trans,
+ struct btree_insert_entry *i)
+{
+ /*
+ * Because we sorted the transaction entries, if multiple iterators
+ * point to the same leaf node they'll always be adjacent now:
+ */
+ return i != trans->entries &&
+ i[0].iter->nodes[0] == i[-1].iter->nodes[0];
+}
+
+#define trans_for_each_entry(trans, i) \
+ for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+
+static void multi_lock_write(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_entry(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ btree_node_lock_for_insert(i->iter->nodes[0], i->iter);
+}
+
+static void multi_unlock_write(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_entry(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ btree_node_unlock_write(i->iter->nodes[0], i->iter);
+}
+
+static int btree_trans_entry_cmp(const void *_l, const void *_r)
+{
+ const struct btree_insert_entry *l = _l;
+ const struct btree_insert_entry *r = _r;
+
+ return btree_iter_cmp(l->iter, r->iter);
+}
+
+/* Normal update interface: */
+
+/**
+ * __bch_btree_insert_at - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ * if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: cache set read only
+ * -EIO: journal or btree node IO error
+ */
+int __bch_btree_insert_at(struct btree_insert *trans)
+{
+ struct cache_set *c = trans->c;
+ struct btree_insert_entry *i;
+ struct btree_iter *split = NULL;
+ bool cycle_gc_lock = false;
+ unsigned u64s;
+ int ret;
+
+ trans_for_each_entry(trans, i) {
+ EBUG_ON(i->iter->level);
+ EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+ }
+
+ sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
+ btree_trans_entry_cmp, NULL);
+
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return -EROFS;
+retry_locks:
+ ret = -EINTR;
+ trans_for_each_entry(trans, i)
+ if (!bch_btree_iter_set_locks_want(i->iter, 1))
+ goto err;
+retry:
+ trans->did_work = false;
+ u64s = 0;
+ trans_for_each_entry(trans, i)
+ if (!i->done)
+ u64s += jset_u64s(i->k->k.u64s);
+
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+
+ ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
+ ? bch_journal_res_get(&c->journal,
+ &trans->journal_res,
+ u64s, u64s)
+ : 0;
+ if (ret)
+ goto err;
+
+ multi_lock_write(trans);
+
+ u64s = 0;
+ trans_for_each_entry(trans, i) {
+ /* Multiple inserts might go to same leaf: */
+ if (!same_leaf_as_prev(trans, i))
+ u64s = 0;
+
+ /*
+ * bch_btree_node_insert_fits() must be called under write lock:
+ * with only an intent lock, another thread can still call
+ * bch_btree_node_write(), converting an unwritten bset to a
+ * written one
+ */
+ if (!i->done) {
+ u64s += i->k->k.u64s;
+ if (!bch_btree_node_insert_fits(c,
+ i->iter->nodes[0], u64s)) {
+ split = i->iter;
+ goto unlock;
+ }
+ }
+ }
+
+ ret = 0;
+ split = NULL;
+ cycle_gc_lock = false;
+
+ trans_for_each_entry(trans, i) {
+ if (i->done)
+ continue;
+
+ switch (btree_insert_key(trans, i)) {
+ case BTREE_INSERT_OK:
+ i->done = true;
+ break;
+ case BTREE_INSERT_JOURNAL_RES_FULL:
+ case BTREE_INSERT_NEED_TRAVERSE:
+ ret = -EINTR;
+ break;
+ case BTREE_INSERT_NEED_RESCHED:
+ ret = -EAGAIN;
+ break;
+ case BTREE_INSERT_BTREE_NODE_FULL:
+ split = i->iter;
+ break;
+ case BTREE_INSERT_ENOSPC:
+ ret = -ENOSPC;
+ break;
+ case BTREE_INSERT_NEED_GC_LOCK:
+ cycle_gc_lock = true;
+ ret = -EINTR;
+ break;
+ default:
+ BUG();
+ }
+
+ if (!trans->did_work && (ret || split))
+ break;
+ }
+unlock:
+ multi_unlock_write(trans);
+ bch_journal_res_put(&c->journal, &trans->journal_res);
+
+ if (split)
+ goto split;
+ if (ret)
+ goto err;
+
+ /*
+ * hack: iterators are inconsistent when they hit end of leaf, until
+ * traversed again
+ */
+ trans_for_each_entry(trans, i)
+ if (i->iter->at_end_of_leaf)
+ goto out;
+
+ trans_for_each_entry(trans, i)
+ if (!same_leaf_as_prev(trans, i)) {
+ foreground_maybe_merge(i->iter, btree_prev_sib);
+ foreground_maybe_merge(i->iter, btree_next_sib);
+ }
+out:
+ /* make sure we didn't lose an error: */
+ if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG))
+ trans_for_each_entry(trans, i)
+ BUG_ON(!i->done);
+
+ percpu_ref_put(&c->writes);
+ return ret;
+split:
+ /*
+ * have to drop journal res before splitting, because splitting means
+ * allocating new btree nodes, and holding a journal reservation
+ * potentially blocks the allocator:
+ */
+ ret = bch_btree_split_leaf(split, trans->flags);
+ if (ret)
+ goto err;
+ /*
+ * if the split didn't have to drop locks the insert will still be
+ * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
+ * and is overwriting won't have changed)
+ */
+ goto retry_locks;
+err:
+ if (cycle_gc_lock) {
+ down_read(&c->gc_lock);
+ up_read(&c->gc_lock);
+ }
+
+ if (ret == -EINTR) {
+ trans_for_each_entry(trans, i) {
+ int ret2 = bch_btree_iter_traverse(i->iter);
+ if (ret2) {
+ ret = ret2;
+ goto out;
+ }
+ }
+
+ /*
+		 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
+ * dropped locks:
+ */
+ if (!(trans->flags & BTREE_INSERT_ATOMIC))
+ goto retry;
+ }
+
+ goto out;
+}
+
+int bch_btree_insert_list_at(struct btree_iter *iter,
+ struct keylist *keys,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq, unsigned flags)
+{
+ BUG_ON(flags & BTREE_INSERT_ATOMIC);
+ BUG_ON(bch_keylist_empty(keys));
+ verify_keys_sorted(keys);
+
+ while (!bch_keylist_empty(keys)) {
+ /* need to traverse between each insert */
+ int ret = bch_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ ret = bch_btree_insert_at(iter->c, disk_res, hook,
+ journal_seq, flags,
+ BTREE_INSERT_ENTRY(iter, bch_keylist_front(keys)));
+ if (ret)
+ return ret;
+
+ bch_keylist_pop_front(keys);
+ }
+
+ return 0;
+}
+
+/**
+ * bch_btree_insert_check_key - insert dummy key into btree
+ *
+ * We insert a dummy key with a random cookie value on a cache miss, then
+ * compare exchange on it once the cache promotion or backing device read
+ * completes. This
+ * ensures that if this key is written to after the read, the read will
+ * lose and not overwrite the key with stale data.
+ *
+ * Return values:
+ * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
+ * -EINTR: btree node was changed while upgrading to write lock
+ */
+int bch_btree_insert_check_key(struct btree_iter *iter,
+ struct bkey_i *check_key)
+{
+ struct bpos saved_pos = iter->pos;
+ struct bkey_i_cookie *cookie;
+ BKEY_PADDED(key) tmp;
+ int ret;
+
+ BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
+
+ check_key->k.type = KEY_TYPE_COOKIE;
+ set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
+
+ cookie = bkey_i_to_cookie(check_key);
+ get_random_bytes(&cookie->v, sizeof(cookie->v));
+
+ bkey_copy(&tmp.key, check_key);
+
+ ret = bch_btree_insert_at(iter->c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(iter, &tmp.key));
+
+ bch_btree_iter_rewind(iter, saved_pos);
+
+ return ret;
+}
+
+/**
+ * bch_btree_insert - insert a key into the btree @id
+ * @c: pointer to struct cache_set
+ * @id: btree to insert into
+ * @k: key to insert
+ * @disk_res: disk reservation
+ * @hook: extent insert callback
+ * @journal_seq: optional pointer to a journal sequence number
+ * @flags: BTREE_INSERT_* flags
+ */
+int bch_btree_insert(struct cache_set *c, enum btree_id id,
+ struct bkey_i *k,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq, int flags)
+{
+ struct btree_iter iter;
+ int ret, ret2;
+
+ bch_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k));
+
+ ret = bch_btree_iter_traverse(&iter);
+ if (unlikely(ret))
+ goto out;
+
+ ret = bch_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+ BTREE_INSERT_ENTRY(&iter, k));
+out: ret2 = bch_btree_iter_unlock(&iter);
+
+ return ret ?: ret2;
+}
+
+/**
+ * bch_btree_update - like bch_btree_insert(), but returns -ENOENT instead of
+ * inserting when there's no existing key to overwrite
+ */
+int bch_btree_update(struct cache_set *c, enum btree_id id,
+ struct bkey_i *k, u64 *journal_seq)
+{
+ struct btree_iter iter;
+ struct bkey_s_c u;
+ int ret;
+
+ EBUG_ON(id == BTREE_ID_EXTENTS);
+
+ bch_btree_iter_init_intent(&iter, c, id, k->k.p);
+
+ u = bch_btree_iter_peek_with_holes(&iter);
+ ret = btree_iter_err(u);
+ if (ret)
+ return ret;
+
+ if (bkey_deleted(u.k)) {
+ bch_btree_iter_unlock(&iter);
+ return -ENOENT;
+ }
+
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, 0,
+ BTREE_INSERT_ENTRY(&iter, k));
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+/*
+ * bch_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch_btree_delete_range(struct cache_set *c, enum btree_id id,
+ struct bpos start,
+ struct bpos end,
+ u64 version,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch_btree_iter_init_intent(&iter, c, id, start);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ /* really shouldn't be using a bare, unpadded bkey_i */
+ struct bkey_i delete;
+
+ if (bkey_cmp(iter.pos, end) >= 0)
+ break;
+
+ bkey_init(&delete.k);
+
+ /*
+ * For extents, iter.pos won't necessarily be the same as
+ * bkey_start_pos(k.k) (for non extents they always will be the
+ * same). It's important that we delete starting from iter.pos
+ * because the range we want to delete could start in the middle
+ * of k.
+ *
+ * (bch_btree_iter_peek() does guarantee that iter.pos >=
+ * bkey_start_pos(k.k)).
+ */
+ delete.k.p = iter.pos;
+ delete.k.version = version;
+
+ if (iter.is_extents) {
+ /*
+ * The extents btree is special - KEY_TYPE_DISCARD is
+ * used for deletions, not KEY_TYPE_DELETED. This is an
+ * internal implementation detail that probably
+ * shouldn't be exposed (internally, KEY_TYPE_DELETED is
+ * used as a proxy for k->size == 0):
+ */
+ delete.k.type = KEY_TYPE_DISCARD;
+
+ /* create the biggest key we can */
+ bch_key_resize(&delete.k, max_sectors);
+ bch_cut_back(end, &delete.k);
+ }
+
+ ret = bch_btree_insert_at(c, disk_res, hook, journal_seq,
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &delete));
+ if (ret)
+ break;
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+/**
+ * bch_btree_node_rewrite - Rewrite/move a btree node
+ *
+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
+ * bch_btree_reserve_get() has to wait)
+ */
+int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
+ struct closure *cl)
+{
+ struct cache_set *c = iter->c;
+ struct btree *n, *parent = iter->nodes[b->level + 1];
+ struct btree_reserve *reserve;
+ struct btree_interior_update *as;
+ unsigned flags = BTREE_INSERT_NOFAIL;
+
+ /*
+	 * if the caller is going to wait when allocating the reserve fails,
+	 * then this is a rewrite that must succeed:
+ */
+ if (cl)
+ flags |= BTREE_INSERT_USE_RESERVE;
+
+ if (!bch_btree_iter_set_locks_want(iter, U8_MAX))
+ return -EINTR;
+
+ reserve = bch_btree_reserve_get(c, b, 0, flags, cl);
+ if (IS_ERR(reserve)) {
+ trace_bcache_btree_gc_rewrite_node_fail(c, b);
+ return PTR_ERR(reserve);
+ }
+
+ as = bch_btree_interior_update_alloc(c);
+
+ bch_btree_interior_update_will_free_node(c, as, b);
+
+ n = btree_node_alloc_replacement(c, b, reserve);
+
+ bch_btree_build_aux_trees(n);
+ six_unlock_write(&n->lock);
+
+ trace_bcache_btree_gc_rewrite_node(c, b);
+
+ bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+
+ if (parent) {
+ bch_btree_insert_node(parent, iter,
+ &keylist_single(&n->key),
+ reserve, as);
+ } else {
+ bch_btree_set_root(iter, n, as, reserve);
+ }
+
+ btree_open_bucket_put(c, n);
+
+ bch_btree_node_free_inmem(iter, b);
+
+ BUG_ON(!bch_btree_iter_node_replace(iter, n));
+
+ bch_btree_reserve_put(c, reserve);
+ return 0;
+}
diff --git a/libbcache/btree_update.h b/libbcache/btree_update.h
new file mode 100644
index 0000000..0154441
--- /dev/null
+++ b/libbcache/btree_update.h
@@ -0,0 +1,421 @@
+#ifndef _BCACHE_BTREE_INSERT_H
+#define _BCACHE_BTREE_INSERT_H
+
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "journal.h"
+
+struct cache_set;
+struct bkey_format_state;
+struct bkey_format;
+struct btree;
+
+#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
+
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
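+/* i.e. the hysteresis threshold is 5x the merge threshold - 5/3 of btree_max_u64s(c) */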
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+ b->sib_u64s[0] = b->nr.live_u64s;
+ b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+struct btree_reserve {
+ struct disk_reservation disk_res;
+ unsigned nr;
+ struct btree *b[BTREE_RESERVE_MAX];
+};
+
+void __bch_btree_calc_format(struct bkey_format_state *, struct btree *);
+bool bch_btree_node_format_fits(struct cache_set *c, struct btree *,
+ struct bkey_format *);
+
+/* Btree node freeing/allocation: */
+
+/*
+ * Tracks a btree node that has been (or is about to be) freed in memory, but
+ * has _not_ yet been freed on disk (because the write that makes the new
+ * node(s) visible and frees the old hasn't completed yet)
+ */
+struct pending_btree_node_free {
+ bool index_update_done;
+
+ __le64 seq;
+ enum btree_id btree_id;
+ unsigned level;
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ */
+struct btree_interior_update {
+ struct closure cl;
+ struct cache_set *c;
+
+ struct list_head list;
+
+ /* What kind of update are we doing? */
+ enum {
+ BTREE_INTERIOR_NO_UPDATE,
+ BTREE_INTERIOR_UPDATING_NODE,
+ BTREE_INTERIOR_UPDATING_ROOT,
+ BTREE_INTERIOR_UPDATING_AS,
+ } mode;
+
+ /*
+ * BTREE_INTERIOR_UPDATING_NODE:
+ * The update that made the new nodes visible was a regular update to an
+ * existing interior node - @b. We can't write out the update to @b
+ * until the new nodes we created are finished writing, so we block @b
+	 * from writing by putting this btree_interior_update on the
+ * @b->write_blocked list with @write_blocked_list:
+ */
+ struct btree *b;
+ struct list_head write_blocked_list;
+
+ /*
+	 * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so we're
+	 * now blocking another btree_interior_update
+ * @parent_as - btree_interior_update that's waiting on our nodes to finish
+ * writing, before it can make new nodes visible on disk
+ * @wait - list of child btree_interior_updates that are waiting on this
+ * btree_interior_update to make all the new nodes visible before they can free
+ * their old btree nodes
+ */
+ struct btree_interior_update *parent_as;
+ struct closure_waitlist wait;
+
+ /*
+ * We may be freeing nodes that were dirty, and thus had journal entries
+ * pinned: we need to transfer the oldest of those pins to the
+ * btree_interior_update operation, and release it when the new node(s)
+ * are all persistent and reachable:
+ */
+ struct journal_entry_pin journal;
+
+ u64 journal_seq;
+
+ /*
+ * Nodes being freed:
+	 * Protected by c->btree_interior_update_lock
+ */
+ struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
+ unsigned nr_pending;
+
+ /* Only here to reduce stack usage on recursive splits: */
+ struct keylist parent_keys;
+ /*
+ * Enough room for btree_split's keys without realloc - btree node
+	 * pointers never have crc/compression info, so we only need to account
+ * for the pointers for three keys
+ */
+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+#define for_each_pending_btree_node_free(c, as, p) \
+ list_for_each_entry(as, &c->btree_interior_update_list, list) \
+ for (p = as->pending; p < as->pending + as->nr_pending; p++)
+
+void bch_btree_node_free_inmem(struct btree_iter *, struct btree *);
+void bch_btree_node_free_never_inserted(struct cache_set *, struct btree *);
+
+void btree_open_bucket_put(struct cache_set *c, struct btree *);
+
+struct btree *__btree_node_alloc_replacement(struct cache_set *,
+ struct btree *,
+ struct bkey_format,
+ struct btree_reserve *);
+struct btree *btree_node_alloc_replacement(struct cache_set *, struct btree *,
+ struct btree_reserve *);
+
+struct btree_interior_update *
+bch_btree_interior_update_alloc(struct cache_set *);
+
+void bch_btree_interior_update_will_free_node(struct cache_set *,
+ struct btree_interior_update *,
+ struct btree *);
+
+void bch_btree_set_root_initial(struct cache_set *, struct btree *,
+ struct btree_reserve *);
+
+void bch_btree_reserve_put(struct cache_set *, struct btree_reserve *);
+struct btree_reserve *bch_btree_reserve_get(struct cache_set *,
+ struct btree *, unsigned,
+ unsigned, struct closure *);
+
+int bch_btree_root_alloc(struct cache_set *, enum btree_id, struct closure *);
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+bool bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
+ struct btree_node_iter *, struct bkey_i *);
+void bch_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
+ struct bkey_i *);
+
+static inline void *btree_data_end(struct cache_set *c, struct btree *b)
+{
+ return (void *) b->data + btree_bytes(c);
+}
+
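+/*
+ * Unwritten whiteouts are stashed at the very end of the node's buffer, just
+ * below btree_data_end(); b->whiteout_u64s is their current size in u64s:
+ */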
+static inline struct bkey_packed *unwritten_whiteouts_start(struct cache_set *c,
+ struct btree *b)
+{
+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct cache_set *c,
+ struct btree *b)
+{
+ return btree_data_end(c, b);
+}
+
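+/*
+ * b->written is in 512 byte sectors: write_block() points at the first block
+ * of this node not yet written to disk:
+ */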
+static inline void *write_block(struct btree *b)
+{
+ return (void *) b->data + (b->written << 9);
+}
+
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+ return (void *) i < write_block(b);
+}
+
+static inline bool bset_unwritten(struct btree *b, struct bset *i)
+{
+ return (void *) i > write_block(b);
+}
+
+static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
+ struct bset *i)
+{
+ return round_up(bset_byte_offset(b, bset_bkey_last(i)),
+ block_bytes(c)) >> 9;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
+ struct btree *b)
+{
+ struct bset *i = btree_bset_last(b);
+ unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) +
+ b->whiteout_u64s +
+ b->uncompacted_whiteout_u64s;
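+	/* btree_node_size is in 512 byte sectors; << 6 converts sectors to u64s: */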
+ unsigned total = c->sb.btree_node_size << 6;
+
+ EBUG_ON(used > total);
+
+ if (bset_written(b, i))
+ return 0;
+
+ return total - used;
+}
+
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+ /*
+ * Could buffer up larger amounts of keys for btrees with larger keys,
+ * pending benchmarking:
+ */
+ return 4 << 10;
+}
+
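+/*
+ * Returns a pointer to where a new bset (struct btree_node_entry) would start
+ * if it's worth starting one - i.e. the current bset has already been written,
+ * or has grown past btree_write_set_buffer() - and there's room for it;
+ * NULL otherwise:
+ */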
+static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
+ struct btree *b)
+{
+ struct bset *i = btree_bset_last(b);
+ unsigned offset = max_t(unsigned, b->written << 9,
+ bset_byte_offset(b, bset_bkey_last(i)));
+ ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
+ (offset + sizeof(struct btree_node_entry) +
+ b->whiteout_u64s * sizeof(u64) +
+ b->uncompacted_whiteout_u64s * sizeof(u64));
+
+ EBUG_ON(offset > btree_bytes(c));
+
+ if ((unlikely(bset_written(b, i)) && n > 0) ||
+ (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) >
+ btree_write_set_buffer(b)) && n > btree_write_set_buffer(b)))
+ return (void *) b->data + offset;
+
+ return NULL;
+}
+
+/*
+ * write lock must be held on @b (else the dirty bset that we were going to
+ * insert into could be written out from under us)
+ */
+static inline bool bch_btree_node_insert_fits(struct cache_set *c,
+ struct btree *b, unsigned u64s)
+{
+ if (btree_node_is_extents(b)) {
+		/*
+		 * The insert key might split an existing key
+		 * (bch_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
+		 */
+ u64s += BKEY_EXTENT_U64s_MAX;
+ }
+
+ return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ if (bset_written(b, bset(b, t))) {
+ EBUG_ON(b->uncompacted_whiteout_u64s <
+ bkeyp_key_u64s(&b->format, k));
+ b->uncompacted_whiteout_u64s -=
+ bkeyp_key_u64s(&b->format, k);
+ }
+}
+
+static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ if (bset_written(b, bset(b, t))) {
+ BUG_ON(!k->needs_whiteout);
+ b->uncompacted_whiteout_u64s +=
+ bkeyp_key_u64s(&b->format, k);
+ }
+}
+
+void bch_btree_insert_node(struct btree *, struct btree_iter *,
+ struct keylist *, struct btree_reserve *,
+ struct btree_interior_update *as);
+
+/* Normal update interface: */
+
+struct btree_insert {
+ struct cache_set *c;
+ struct disk_reservation *disk_res;
+ struct journal_res journal_res;
+ u64 *journal_seq;
+ struct extent_insert_hook *hook;
+ unsigned flags;
+ bool did_work;
+
+ unsigned short nr;
+ struct btree_insert_entry {
+ struct btree_iter *iter;
+ struct bkey_i *k;
+ /*
+ * true if entire key was inserted - can only be false for
+ * extents
+ */
+ bool done;
+ } *entries;
+};
+
+int __bch_btree_insert_at(struct btree_insert *);
+
+#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N
+#define COUNT_ARGS(...) _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
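+/*
+ * e.g. COUNT_ARGS(a, b, c) evaluates to 3 - used by bch_btree_insert_at()
+ * below so callers don't have to pass the number of BTREE_INSERT_ENTRY()s
+ * explicitly:
+ */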
+
+#define BTREE_INSERT_ENTRY(_iter, _k) \
+ ((struct btree_insert_entry) { \
+ .iter = (_iter), \
+ .k = (_k), \
+ .done = false, \
+ })
+
+/**
+ * bch_btree_insert_at - insert one or more keys at iterator positions
+ * @_c: pointer to struct cache_set
+ * @_disk_res: disk reservation
+ * @_hook: extent insert callback
+ * @_journal_seq: optional pointer to a journal sequence number
+ * @_flags: BTREE_INSERT_* flags
+ *
+ * The remaining arguments are one or more BTREE_INSERT_ENTRY()s, each pairing
+ * a btree iterator with the key to insert at that iterator's position.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ * if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: cache set read only
+ * -EIO: journal or btree node IO error
+ */
+#define bch_btree_insert_at(_c, _disk_res, _hook, \
+ _journal_seq, _flags, ...) \
+ __bch_btree_insert_at(&(struct btree_insert) { \
+ .c = (_c), \
+ .disk_res = (_disk_res), \
+ .journal_seq = (_journal_seq), \
+ .hook = (_hook), \
+ .flags = (_flags), \
+ .nr = COUNT_ARGS(__VA_ARGS__), \
+ .entries = (struct btree_insert_entry[]) { \
+ __VA_ARGS__ \
+ }})
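+/*
+ * Illustrative sketch, mirroring the bch_btree_insert() caller in
+ * btree_update.c:
+ *
+ *	ret = bch_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+ *				  BTREE_INSERT_ENTRY(&iter, k));
+ *
+ * Multiple BTREE_INSERT_ENTRY()s may be passed to update several iterator
+ * positions in one (sorted, multi-locked) transaction.
+ */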
+
+/*
+ * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent
+ * locks, -EAGAIN if need to wait on btree reserve
+ */
+#define BTREE_INSERT_ATOMIC (1 << 0)
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL (1 << 1)
+
+/* for copygc, or when merging btree nodes */
+#define BTREE_INSERT_USE_RESERVE (1 << 2)
+
+/*
+ * Insert is for journal replay: don't get journal reservations, or mark extents
+ * (bch_mark_key)
+ */
+#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
+
+int bch_btree_insert_list_at(struct btree_iter *, struct keylist *,
+ struct disk_reservation *,
+ struct extent_insert_hook *, u64 *, unsigned);
+
+static inline bool journal_res_insert_fits(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ unsigned u64s = 0;
+ struct btree_insert_entry *i;
+
+ /*
+ * If we didn't get a journal reservation, we're in journal replay and
+ * we're not journalling updates:
+ */
+ if (!trans->journal_res.ref)
+ return true;
+
+ for (i = insert; i < trans->entries + trans->nr; i++)
+ u64s += jset_u64s(i->k->k.u64s);
+
+ return u64s <= trans->journal_res.u64s;
+}
+
+int bch_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
+int bch_btree_insert(struct cache_set *, enum btree_id, struct bkey_i *,
+ struct disk_reservation *,
+ struct extent_insert_hook *, u64 *, int flags);
+int bch_btree_update(struct cache_set *, enum btree_id,
+ struct bkey_i *, u64 *);
+
+int bch_btree_delete_range(struct cache_set *, enum btree_id,
+ struct bpos, struct bpos, u64,
+ struct disk_reservation *,
+ struct extent_insert_hook *, u64 *);
+
+int bch_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *);
+
+#endif /* _BCACHE_BTREE_INSERT_H */
+
diff --git a/libbcache/buckets.c b/libbcache/buckets.c
new file mode 100644
index 0000000..3398b25
--- /dev/null
+++ b/libbcache/buckets.c
@@ -0,0 +1,755 @@
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ *
+ * Bucket states:
+ * - free bucket: mark == 0
+ * The bucket contains no data and will not be read
+ *
+ * - allocator bucket: owned_by_allocator == 1
+ * The bucket is on a free list, or it is an open bucket
+ *
+ * - cached bucket: owned_by_allocator == 0 &&
+ * dirty_sectors == 0 &&
+ * cached_sectors > 0
+ * The bucket contains data but may be safely discarded as there are
+ * enough replicas of the data on other cache devices, or it has been
+ * written back to the backing device
+ *
+ * - dirty bucket: owned_by_allocator == 0 &&
+ * dirty_sectors > 0
+ * The bucket contains data that we must not discard (either the only copy,
+ * or one of the 'main copies' for data requiring multiple replicas)
+ *
+ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
+ * This is a btree node, journal or gen/prio bucket
+ *
+ * Lifecycle:
+ *
+ * bucket invalidated => bucket on freelist => open bucket =>
+ * [dirty bucket =>] cached bucket => bucket invalidated => ...
+ *
+ * Note that cache promotion can skip the dirty bucket step, as data
+ * is copied from a deeper tier to a shallower tier, onto a cached
+ * bucket.
+ * Note also that a cached bucket can spontaneously become dirty --
+ * see below.
+ *
+ * Only a traversal of the key space can determine whether a bucket is
+ * truly dirty or cached.
+ *
+ * Transitions:
+ *
+ * - free => allocator: bucket was invalidated
+ * - cached => allocator: bucket was invalidated
+ *
+ * - allocator => dirty: open bucket was filled up
+ * - allocator => cached: open bucket was filled up
+ * - allocator => metadata: metadata was allocated
+ *
+ * - dirty => cached: dirty sectors were copied to a deeper tier
+ * - dirty => free: dirty sectors were overwritten or moved (copy gc)
+ * - cached => free: cached sectors were overwritten
+ *
+ * - metadata => free: metadata was freed
+ *
+ * Oddities:
+ * - cached => dirty: a device was removed so formerly replicated data
+ * is no longer sufficiently replicated
+ * - free => cached: cannot happen
+ * - free => dirty: cannot happen
+ * - free => metadata: cannot happen
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "buckets.h"
+
+#include <linux/preempt.h>
+#include <trace/events/bcache.h>
+
+#ifdef DEBUG_BUCKETS
+
+#define lg_local_lock lg_global_lock
+#define lg_local_unlock lg_global_unlock
+
+static void bch_cache_set_stats_verify(struct cache_set *c)
+{
+ struct bucket_stats_cache_set stats =
+ __bch_bucket_stats_read_cache_set(c);
+
+ if ((s64) stats.sectors_dirty < 0)
+ panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty);
+
+ if ((s64) stats.sectors_cached < 0)
+ panic("sectors_cached underflow: %lli\n", stats.sectors_cached);
+
+ if ((s64) stats.sectors_meta < 0)
+ panic("sectors_meta underflow: %lli\n", stats.sectors_meta);
+
+ if ((s64) stats.sectors_persistent_reserved < 0)
+ panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved);
+
+ if ((s64) stats.sectors_online_reserved < 0)
+ panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved);
+}
+
+#else
+
+static void bch_cache_set_stats_verify(struct cache_set *c) {}
+
+#endif
+
+void bch_bucket_seq_cleanup(struct cache_set *c)
+{
+ u16 last_seq_ondisk = c->journal.last_seq_ondisk;
+ struct cache *ca;
+ struct bucket *g;
+ struct bucket_mark m;
+ unsigned i;
+
+ for_each_cache(ca, c, i)
+ for_each_bucket(g, ca) {
+ bucket_cmpxchg(g, m, ({
+ if (!m.wait_on_journal ||
+ ((s16) last_seq_ondisk -
+ (s16) m.journal_seq < 0))
+ break;
+
+ m.wait_on_journal = 0;
+ }));
+ }
+}
+
+#define bucket_stats_add(_acc, _stats) \
+do { \
+ typeof(_acc) _a = (_acc), _s = (_stats); \
+ unsigned i; \
+ \
+ for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
+ ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
+} while (0)
+
+#define bucket_stats_read_raw(_stats) \
+({ \
+ typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
+ int cpu; \
+ \
+ for_each_possible_cpu(cpu) \
+ bucket_stats_add(&_acc, per_cpu_ptr((_stats), cpu)); \
+ \
+ _acc; \
+})
+
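+/*
+ * Returns the summed percpu counters once gc has finished (GC_PHASE_DONE),
+ * otherwise the cached copy; the gc_pos seqlock retry loop guards against
+ * racing with gc changing phase:
+ */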
+#define bucket_stats_read_cached(_c, _cached, _uncached) \
+({ \
+ typeof(_cached) _ret; \
+ unsigned _seq; \
+ \
+ do { \
+ _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
+ _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
+ ? bucket_stats_read_raw(_uncached) \
+ : (_cached); \
+ } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
+ \
+ _ret; \
+})
+
+struct bucket_stats_cache __bch_bucket_stats_read_cache(struct cache *ca)
+{
+ return bucket_stats_read_raw(ca->bucket_stats_percpu);
+}
+
+struct bucket_stats_cache bch_bucket_stats_read_cache(struct cache *ca)
+{
+ return bucket_stats_read_cached(ca->set,
+ ca->bucket_stats_cached,
+ ca->bucket_stats_percpu);
+}
+
+struct bucket_stats_cache_set
+__bch_bucket_stats_read_cache_set(struct cache_set *c)
+{
+ return bucket_stats_read_raw(c->bucket_stats_percpu);
+}
+
+struct bucket_stats_cache_set
+bch_bucket_stats_read_cache_set(struct cache_set *c)
+{
+ return bucket_stats_read_cached(c,
+ c->bucket_stats_cached,
+ c->bucket_stats_percpu);
+}
+
+static inline int is_meta_bucket(struct bucket_mark m)
+{
+ return !m.owned_by_allocator && m.is_metadata;
+}
+
+static inline int is_dirty_bucket(struct bucket_mark m)
+{
+ return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors;
+}
+
+static inline int is_cached_bucket(struct bucket_mark m)
+{
+ return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors;
+}
+
+void bch_cache_set_stats_apply(struct cache_set *c,
+ struct bucket_stats_cache_set *stats,
+ struct disk_reservation *disk_res,
+ struct gc_pos gc_pos)
+{
+ s64 added =
+ stats->s[S_COMPRESSED][S_META] +
+ stats->s[S_COMPRESSED][S_DIRTY] +
+ stats->persistent_reserved +
+ stats->online_reserved;
+
+ /*
+ * Not allowed to reduce sectors_available except by getting a
+ * reservation:
+ */
+ BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
+
+ if (added > 0) {
+ disk_res->sectors -= added;
+ stats->online_reserved -= added;
+ }
+
+ lg_local_lock(&c->bucket_stats_lock);
+ /* online_reserved not subject to gc: */
+ this_cpu_ptr(c->bucket_stats_percpu)->online_reserved +=
+ stats->online_reserved;
+ stats->online_reserved = 0;
+
+ if (!gc_will_visit(c, gc_pos))
+ bucket_stats_add(this_cpu_ptr(c->bucket_stats_percpu), stats);
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+
+ memset(stats, 0, sizeof(*stats));
+}
+
+static void bucket_stats_update(struct cache *ca,
+ struct bucket_mark old, struct bucket_mark new,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *cache_set_stats)
+{
+ struct cache_set *c = ca->set;
+ struct bucket_stats_cache *cache_stats;
+
+ BUG_ON(!may_make_unavailable &&
+ is_available_bucket(old) &&
+ !is_available_bucket(new) &&
+ c->gc_pos.phase == GC_PHASE_DONE);
+
+ if (cache_set_stats) {
+ cache_set_stats->s[S_COMPRESSED][S_CACHED] +=
+ (int) new.cached_sectors - (int) old.cached_sectors;
+
+ cache_set_stats->s[S_COMPRESSED]
+ [old.is_metadata ? S_META : S_DIRTY] -=
+ old.dirty_sectors;
+
+ cache_set_stats->s[S_COMPRESSED]
+ [new.is_metadata ? S_META : S_DIRTY] +=
+ new.dirty_sectors;
+ }
+
+ preempt_disable();
+ cache_stats = this_cpu_ptr(ca->bucket_stats_percpu);
+
+ cache_stats->sectors_cached +=
+ (int) new.cached_sectors - (int) old.cached_sectors;
+
+ if (old.is_metadata)
+ cache_stats->sectors_meta -= old.dirty_sectors;
+ else
+ cache_stats->sectors_dirty -= old.dirty_sectors;
+
+ if (new.is_metadata)
+ cache_stats->sectors_meta += new.dirty_sectors;
+ else
+ cache_stats->sectors_dirty += new.dirty_sectors;
+
+ cache_stats->buckets_alloc +=
+ (int) new.owned_by_allocator - (int) old.owned_by_allocator;
+
+ cache_stats->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old);
+ cache_stats->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
+ cache_stats->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old);
+ preempt_enable();
+
+ if (!is_available_bucket(old) && is_available_bucket(new))
+ bch_wake_allocator(ca);
+}
+
+void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, ({
+ new.owned_by_allocator = 1;
+ new.is_metadata = 0;
+ new.cached_sectors = 0;
+ new.dirty_sectors = 0;
+ new.copygc = 0;
+ new.gen++;
+ }));
+
+ BUG_ON(old.dirty_sectors);
+
+ bucket_stats_update(ca, old, new, true, &stats);
+
+ /*
+ * Ick:
+ *
+ * Only stats.sectors_cached should be nonzero: this is important
+ * because in this path we modify cache_set_stats based on how the
+ * bucket_mark was modified, and the sector counts in bucket_mark are
+ * subject to (saturating) overflow - and if they did overflow, the
+ * cache set stats will now be off. We can tolerate this for
+ * sectors_cached, but not anything else:
+ */
+ stats.s[S_COMPRESSED][S_CACHED] = 0;
+ stats.s[S_UNCOMPRESSED][S_CACHED] = 0;
+ BUG_ON(!bch_is_zero(&stats, sizeof(stats)));
+
+ if (!old.owned_by_allocator && old.cached_sectors)
+ trace_bcache_invalidate(ca, g - ca->buckets,
+ old.cached_sectors);
+}
+
+void bch_mark_free_bucket(struct cache *ca, struct bucket *g)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, ({
+ new.owned_by_allocator = 0;
+ new.is_metadata = 0;
+ new.cached_sectors = 0;
+ new.dirty_sectors = 0;
+ }));
+
+ bucket_stats_update(ca, old, new, false, &stats);
+}
+
+void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g,
+ bool owned_by_allocator)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator);
+
+ bucket_stats_update(ca, old, new, true, &stats);
+}
+
+void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g,
+ bool may_make_unavailable)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+ struct bucket_mark old, new;
+
+ old = bucket_cmpxchg(g, new, ({
+ new.is_metadata = 1;
+ new.had_metadata = 1;
+ }));
+
+ BUG_ON(old.cached_sectors);
+ BUG_ON(old.dirty_sectors);
+
+ bucket_stats_update(ca, old, new, may_make_unavailable, &stats);
+}
+
+#define saturated_add(ca, dst, src, max) \
+do { \
+ BUG_ON((int) (dst) + (src) < 0); \
+ if ((dst) == (max)) \
+ ; \
+ else if ((dst) + (src) <= (max)) \
+ dst += (src); \
+ else { \
+ dst = (max); \
+ trace_bcache_sectors_saturated(ca); \
+ } \
+} while (0)
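+
+/*
+ * Illustrative example of the saturating behaviour (not compiled; "dirty" is
+ * a stand-in for a bucket's dirty sector count). With GC_MAX_SECTORS_USED ==
+ * (1U << 15) - 1 == 32767:
+ */
+#if 0
+	u16 dirty = 32765;
+
+	saturated_add(ca, dirty, 5, GC_MAX_SECTORS_USED);
+	/* dirty is now clamped at 32767 and the saturation tracepoint fired */
+#endif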
+
+#if 0
+/* Reverting this until the copygc + compression issue is fixed: */
+
+static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return crc_compression_type(crc)
+ ? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc)
+ : sectors;
+}
+
+static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return crc_compression_type(crc)
+ ? min_t(unsigned, crc_compressed_size(crc), sectors)
+ : sectors;
+}
+#else
+static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return sectors;
+}
+
+static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
+{
+ return sectors;
+}
+#endif
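+
+/*
+ * Worked example, illustrative only: once the #if 0 branch above is
+ * re-enabled, an extent whose crc records an uncompressed size of 128
+ * sectors stored in 64 compressed sectors would account a 32 sector
+ * reference as
+ *
+ *   __disk_sectors(crc, 32)       == 32 * 64 / 128 == 16
+ *   __compressed_sectors(crc, 32) == min(64, 32)   == 32
+ *
+ * i.e. on-disk usage is scaled by the compression ratio, while the
+ * compressed count is capped at the referenced length.
+ */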
+
+/*
+ * Checking against gc's position has to be done here, inside the cmpxchg()
+ * loop, to avoid racing with the start of gc clearing all the marks - GC does
+ * that with the gc pos seqlock held.
+ */
+static void bch_mark_pointer(struct cache_set *c,
+ struct bkey_s_c_extent e,
+ struct cache *ca,
+ const union bch_extent_crc *crc,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum s_alloc type,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *stats,
+ bool gc_will_visit, u64 journal_seq)
+{
+ struct bucket_mark old, new;
+ unsigned saturated;
+ struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
+ u64 v = READ_ONCE(g->_mark.counter);
+ unsigned old_sectors, new_sectors;
+ int disk_sectors, compressed_sectors;
+
+ if (sectors > 0) {
+ old_sectors = 0;
+ new_sectors = sectors;
+ } else {
+ old_sectors = e.k->size;
+ new_sectors = e.k->size + sectors;
+ }
+
+ disk_sectors = -__disk_sectors(crc, old_sectors)
+ + __disk_sectors(crc, new_sectors);
+ compressed_sectors = -__compressed_sectors(crc, old_sectors)
+ + __compressed_sectors(crc, new_sectors);
+
+ if (gc_will_visit) {
+ if (journal_seq)
+ bucket_cmpxchg(g, new, new.journal_seq = journal_seq);
+
+ goto out;
+ }
+
+ do {
+ new.counter = old.counter = v;
+ saturated = 0;
+
+ /*
+ * Check this after reading bucket mark to guard against
+ * the allocator invalidating a bucket after we've already
+ * checked the gen
+ */
+ if (gen_after(old.gen, ptr->gen)) {
+ EBUG_ON(type != S_CACHED &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
+ return;
+ }
+
+ EBUG_ON(type != S_CACHED &&
+ !may_make_unavailable &&
+ is_available_bucket(old) &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
+
+ if (type != S_CACHED &&
+ new.dirty_sectors == GC_MAX_SECTORS_USED &&
+ disk_sectors < 0)
+ saturated = -disk_sectors;
+
+ if (type == S_CACHED)
+ saturated_add(ca, new.cached_sectors, disk_sectors,
+ GC_MAX_SECTORS_USED);
+ else
+ saturated_add(ca, new.dirty_sectors, disk_sectors,
+ GC_MAX_SECTORS_USED);
+
+ if (!new.dirty_sectors &&
+ !new.cached_sectors) {
+ new.is_metadata = false;
+
+ if (journal_seq) {
+ new.wait_on_journal = true;
+ new.journal_seq = journal_seq;
+ }
+ } else {
+ new.is_metadata = (type == S_META);
+ }
+
+ new.had_metadata |= new.is_metadata;
+ } while ((v = cmpxchg(&g->_mark.counter,
+ old.counter,
+ new.counter)) != old.counter);
+
+ bucket_stats_update(ca, old, new, may_make_unavailable, NULL);
+
+ if (saturated &&
+ atomic_long_add_return(saturated,
+ &ca->saturated_count) >=
+ ca->free_inc.size << ca->bucket_bits) {
+ if (c->gc_thread) {
+ trace_bcache_gc_sectors_saturated(c);
+ wake_up_process(c->gc_thread);
+ }
+ }
+out:
+ stats->s[S_COMPRESSED][type] += compressed_sectors;
+ stats->s[S_UNCOMPRESSED][type] += sectors;
+}
+
+static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e,
+ s64 sectors, bool metadata,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *stats,
+ bool gc_will_visit, u64 journal_seq)
+{
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ struct cache *ca;
+ enum s_alloc type = metadata ? S_META : S_DIRTY;
+
+ BUG_ON(metadata && bkey_extent_is_cached(e.k));
+ BUG_ON(!sectors);
+
+ rcu_read_lock();
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
+
+ trace_bcache_mark_bucket(ca, e.k, ptr, sectors, dirty);
+
+ bch_mark_pointer(c, e, ca, crc, ptr, sectors,
+ dirty ? type : S_CACHED,
+ may_make_unavailable,
+ stats, gc_will_visit, journal_seq);
+ }
+ rcu_read_unlock();
+}
+
+static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata,
+ bool may_make_unavailable,
+ struct bucket_stats_cache_set *stats,
+ bool gc_will_visit, u64 journal_seq)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ bch_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
+ may_make_unavailable, stats,
+ gc_will_visit, journal_seq);
+ break;
+ case BCH_RESERVATION:
+ stats->persistent_reserved += sectors;
+ break;
+ }
+}
+
+void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata,
+ struct bucket_stats_cache_set *stats)
+{
+ __bch_mark_key(c, k, sectors, metadata, true, stats, false, 0);
+}
+
+void bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata)
+{
+ struct bucket_stats_cache_set stats = { 0 };
+
+ __bch_gc_mark_key(c, k, sectors, metadata, &stats);
+
+ preempt_disable();
+ bucket_stats_add(this_cpu_ptr(c->bucket_stats_percpu), &stats);
+ preempt_enable();
+}
+
+void bch_mark_key(struct cache_set *c, struct bkey_s_c k,
+ s64 sectors, bool metadata, struct gc_pos gc_pos,
+ struct bucket_stats_cache_set *stats, u64 journal_seq)
+{
+ /*
+ * synchronization w.r.t. GC:
+ *
+ * Normally, bucket sector counts/marks are updated on the fly, as
+ * references are added/removed from the btree, the lists of buckets the
+ * allocator owns, other metadata buckets, etc.
+ *
+ * When GC is in progress and going to mark this reference, we do _not_
+ * mark this reference here, to avoid double counting - GC will count it
+ * when it gets to it.
+ *
+ * To know whether we should mark a given reference (GC either isn't
+ * running, or has already marked references at this position) we
+ * construct a total order for everything GC walks. Then, we can simply
+ * compare the position of the reference we're marking - @gc_pos - with
+ * GC's current position. If GC is going to mark this reference, GC's
+ * current position will be less than @gc_pos; if GC's current position
+ * is greater than @gc_pos GC has either already walked this position,
+ * or isn't running.
+ *
+ * To avoid racing with GC's position changing, we have to deal with
+ * - GC's position being set to GC_POS_MIN when GC starts:
+ * bucket_stats_lock guards against this
+ * - GC's position overtaking @gc_pos: we guard against this with
+ * whatever lock protects the data structure the reference lives in
+ * (e.g. the btree node lock, or the relevant allocator lock).
+ */
+ lg_local_lock(&c->bucket_stats_lock);
+ __bch_mark_key(c, k, sectors, metadata, false, stats,
+ gc_will_visit(c, gc_pos), journal_seq);
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+}
+
+static u64 __recalc_sectors_available(struct cache_set *c)
+{
+ return c->capacity - cache_set_sectors_used(c);
+}
+
+/* Used by gc when it's starting: */
+void bch_recalc_sectors_available(struct cache_set *c)
+{
+ int cpu;
+
+ lg_global_lock(&c->bucket_stats_lock);
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(c->bucket_stats_percpu, cpu)->available_cache = 0;
+
+ atomic64_set(&c->sectors_available,
+ __recalc_sectors_available(c));
+
+ lg_global_unlock(&c->bucket_stats_lock);
+}
+
+void bch_disk_reservation_put(struct cache_set *c,
+ struct disk_reservation *res)
+{
+ if (res->sectors) {
+ lg_local_lock(&c->bucket_stats_lock);
+ this_cpu_sub(c->bucket_stats_percpu->online_reserved,
+ res->sectors);
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+
+ res->sectors = 0;
+ }
+}
+
+#define SECTORS_CACHE 1024
+
+int bch_disk_reservation_add(struct cache_set *c,
+ struct disk_reservation *res,
+ unsigned sectors, int flags)
+{
+ struct bucket_stats_cache_set *stats;
+ u64 old, new, v;
+ s64 sectors_available;
+ int ret;
+
+ sectors *= res->nr_replicas;
+
+ lg_local_lock(&c->bucket_stats_lock);
+ stats = this_cpu_ptr(c->bucket_stats_percpu);
+
+ if (sectors >= stats->available_cache)
+ goto out;
+
+ v = atomic64_read(&c->sectors_available);
+ do {
+ old = v;
+ if (old < sectors) {
+ lg_local_unlock(&c->bucket_stats_lock);
+ goto recalculate;
+ }
+
+ new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
+ old, new)) != old);
+
+ stats->available_cache += old - new;
+out:
+ stats->available_cache -= sectors;
+ stats->online_reserved += sectors;
+ res->sectors += sectors;
+
+ bch_cache_set_stats_verify(c);
+ lg_local_unlock(&c->bucket_stats_lock);
+ return 0;
+
+recalculate:
+ /*
+ * GC recalculates sectors_available when it starts, so that hopefully
+ * we don't normally end up blocking here:
+ */
+
+ /*
+	 * Annoyingly, we can be called from extent_insert_fixup() with btree
+ * locks held:
+ */
+
+ if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
+ if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
+ down_read(&c->gc_lock);
+ else if (!down_read_trylock(&c->gc_lock))
+ return -EINTR;
+ }
+ lg_global_lock(&c->bucket_stats_lock);
+
+ sectors_available = __recalc_sectors_available(c);
+
+ if (sectors <= sectors_available ||
+ (flags & BCH_DISK_RESERVATION_NOFAIL)) {
+ atomic64_set(&c->sectors_available,
+ max_t(s64, 0, sectors_available - sectors));
+ stats->online_reserved += sectors;
+ res->sectors += sectors;
+ ret = 0;
+ } else {
+ atomic64_set(&c->sectors_available, sectors_available);
+ ret = -ENOSPC;
+ }
+
+ bch_cache_set_stats_verify(c);
+ lg_global_unlock(&c->bucket_stats_lock);
+ if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
+
+ return ret;
+}
+
+int bch_disk_reservation_get(struct cache_set *c,
+ struct disk_reservation *res,
+ unsigned sectors, int flags)
+{
+ res->sectors = 0;
+ res->gen = c->capacity_gen;
+ res->nr_replicas = (flags & BCH_DISK_RESERVATION_METADATA)
+ ? c->opts.metadata_replicas
+ : c->opts.data_replicas;
+
+ return bch_disk_reservation_add(c, res, sectors, flags);
+}
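+
+/*
+ * Illustrative sketch of how the pieces above fit together for a data insert
+ * (not part of this interface; "k", "gc_pos" and "journal_seq" stand in for
+ * values a real caller already has, and error handling is minimal):
+ */
+#if 0
+static int example_insert_accounting(struct cache_set *c, struct bkey_s_c k,
+				     struct gc_pos gc_pos, u64 journal_seq)
+{
+	struct bucket_stats_cache_set stats = { 0 };
+	struct disk_reservation res;
+	int ret;
+
+	/* Reserve space up front; may fail with -ENOSPC: */
+	ret = bch_disk_reservation_get(c, &res, k.k->size, 0);
+	if (ret)
+		return ret;
+
+	/* Account the new reference, buffering the deltas in @stats: */
+	bch_mark_key(c, k, k.k->size, false, gc_pos, &stats, journal_seq);
+
+	/* Fold the buffered deltas into the cache set wide counters: */
+	bch_cache_set_stats_apply(c, &stats, &res, gc_pos);
+
+	bch_disk_reservation_put(c, &res);
+	return 0;
+}
+#endif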
diff --git a/libbcache/buckets.h b/libbcache/buckets.h
new file mode 100644
index 0000000..35100eb
--- /dev/null
+++ b/libbcache/buckets.h
@@ -0,0 +1,272 @@
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#ifndef _BUCKETS_H
+#define _BUCKETS_H
+
+#include "buckets_types.h"
+#include "super.h"
+
+#define for_each_bucket(b, ca) \
+ for (b = (ca)->buckets + (ca)->mi.first_bucket; \
+ b < (ca)->buckets + (ca)->mi.nbuckets; b++)
+
+#define bucket_cmpxchg(g, new, expr) \
+({ \
+ u64 _v = READ_ONCE((g)->_mark.counter); \
+ struct bucket_mark _old; \
+ \
+ do { \
+ (new).counter = _old.counter = _v; \
+ expr; \
+ } while ((_v = cmpxchg(&(g)->_mark.counter, \
+ _old.counter, \
+ (new).counter)) != _old.counter);\
+ _old; \
+})
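+
+/*
+ * Illustrative usage only (real callers live in buckets.c): because every
+ * field of bucket_mark aliases the single 64 bit counter, a multi-field
+ * update is made atomic with one cmpxchg loop:
+ */
+#if 0
+	struct bucket_mark old, new;
+
+	old = bucket_cmpxchg(g, new, ({
+		new.owned_by_allocator	= 0;
+		new.cached_sectors	= 0;
+	}));
+#endif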
+
+/*
+ * bucket_gc_gen() returns the difference between the bucket's current gen and
+ * the oldest gen of any pointer into that bucket in the btree.
+ */
+
+static inline u8 bucket_gc_gen(struct cache *ca, struct bucket *g)
+{
+ unsigned long r = g - ca->buckets;
+ return g->mark.gen - ca->oldest_gens[r];
+}
+
+static inline struct cache *PTR_CACHE(const struct cache_set *c,
+ const struct bch_extent_ptr *ptr)
+{
+ EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_in_set);
+
+ return rcu_dereference(c->cache[ptr->dev]);
+}
+
+static inline size_t PTR_BUCKET_NR(const struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return sector_to_bucket(ca, ptr->offset);
+}
+
+/*
+ * Returns 0 if no pointers or device offline - only for tracepoints!
+ */
+static inline size_t PTR_BUCKET_NR_TRACE(const struct cache_set *c,
+ const struct bkey_i *k,
+ unsigned ptr)
+{
+ size_t bucket = 0;
+#if 0
+ if (bkey_extent_is_data(&k->k)) {
+ const struct bch_extent_ptr *ptr;
+ const struct cache *ca;
+
+ rcu_read_lock();
+ extent_for_each_online_device(c, bkey_i_to_s_c_extent(k), ptr, ca) {
+ bucket = PTR_BUCKET_NR(ca, ptr);
+ break;
+ }
+ rcu_read_unlock();
+ }
+#endif
+ return bucket;
+}
+
+static inline struct bucket *PTR_BUCKET(const struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return ca->buckets + PTR_BUCKET_NR(ca, ptr);
+}
+
+static inline u8 __gen_after(u8 a, u8 b)
+{
+ u8 r = a - b;
+
+ return r > 128U ? 0 : r;
+}
+
+static inline u8 gen_after(u8 a, u8 b)
+{
+ u8 r = a - b;
+
+ BUG_ON(r > 128U);
+
+ return r;
+}
+
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated.
+ *
+ * Warning: PTR_CACHE(c, k, ptr) must equal ca.
+ */
+static inline u8 ptr_stale(const struct cache *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen);
+}
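+
+/*
+ * Worked example: gens are 8 bit and wrap, so "newer" means the distance
+ * a - b modulo 256 is small. With a bucket gen of 2 and a pointer gen of
+ * 254, gen_after(2, 254) == (u8) (2 - 254) == 4, so ptr_stale() returns 4:
+ * the pointer is four generations stale. A return of 0 means the pointer is
+ * current.
+ */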
+
+/* bucket heaps */
+
+static inline bool bucket_min_cmp(struct bucket_heap_entry l,
+ struct bucket_heap_entry r)
+{
+ return l.val < r.val;
+}
+
+static inline bool bucket_max_cmp(struct bucket_heap_entry l,
+ struct bucket_heap_entry r)
+{
+ return l.val > r.val;
+}
+
+static inline void bucket_heap_push(struct cache *ca, struct bucket *g,
+ unsigned long val)
+{
+ struct bucket_heap_entry new = { g, val };
+
+ if (!heap_full(&ca->heap))
+ heap_add(&ca->heap, new, bucket_min_cmp);
+ else if (bucket_min_cmp(new, heap_peek(&ca->heap))) {
+ ca->heap.data[0] = new;
+ heap_sift(&ca->heap, 0, bucket_min_cmp);
+ }
+}
+
+/* bucket gc marks */
+
+/*
+ * The dirty and cached sector counts saturate. If this occurs, reference
+ * counting alone will not free the bucket, and a btree GC must be performed.
+ */
+#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
+
+static inline bool bucket_unused(struct bucket *g)
+{
+ return !g->mark.counter;
+}
+
+static inline unsigned bucket_sectors_used(struct bucket *g)
+{
+ return g->mark.dirty_sectors + g->mark.cached_sectors;
+}
+
+/* Per device stats: */
+
+struct bucket_stats_cache __bch_bucket_stats_read_cache(struct cache *);
+struct bucket_stats_cache bch_bucket_stats_read_cache(struct cache *);
+
+static inline u64 __buckets_available_cache(struct cache *ca,
+ struct bucket_stats_cache stats)
+{
+ return max_t(s64, 0,
+ ca->mi.nbuckets - ca->mi.first_bucket -
+ stats.buckets_dirty -
+ stats.buckets_alloc -
+ stats.buckets_meta);
+}
+
+/*
+ * Number of reclaimable buckets - only for use by the allocator thread:
+ */
+static inline u64 buckets_available_cache(struct cache *ca)
+{
+ return __buckets_available_cache(ca, bch_bucket_stats_read_cache(ca));
+}
+
+static inline u64 __buckets_free_cache(struct cache *ca,
+ struct bucket_stats_cache stats)
+{
+ return __buckets_available_cache(ca, stats) +
+ fifo_used(&ca->free[RESERVE_NONE]) +
+ fifo_used(&ca->free_inc);
+}
+
+static inline u64 buckets_free_cache(struct cache *ca)
+{
+ return __buckets_free_cache(ca, bch_bucket_stats_read_cache(ca));
+}
+
+/* Cache set stats: */
+
+struct bucket_stats_cache_set __bch_bucket_stats_read_cache_set(struct cache_set *);
+struct bucket_stats_cache_set bch_bucket_stats_read_cache_set(struct cache_set *);
+void bch_cache_set_stats_apply(struct cache_set *,
+ struct bucket_stats_cache_set *,
+ struct disk_reservation *,
+ struct gc_pos);
+
+static inline u64 __cache_set_sectors_used(struct cache_set *c)
+{
+ struct bucket_stats_cache_set stats = __bch_bucket_stats_read_cache_set(c);
+ u64 reserved = stats.persistent_reserved +
+ stats.online_reserved;
+
+ return stats.s[S_COMPRESSED][S_META] +
+ stats.s[S_COMPRESSED][S_DIRTY] +
+ reserved +
+ (reserved >> 7);
+}
+
+static inline u64 cache_set_sectors_used(struct cache_set *c)
+{
+ return min(c->capacity, __cache_set_sectors_used(c));
+}
+
+/* XXX: kill? */
+static inline u64 sectors_available(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+ u64 ret = 0;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i)
+ ret += buckets_available_cache(ca) << ca->bucket_bits;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool is_available_bucket(struct bucket_mark mark)
+{
+ return (!mark.owned_by_allocator &&
+ !mark.is_metadata &&
+ !mark.dirty_sectors);
+}
+
+void bch_bucket_seq_cleanup(struct cache_set *);
+
+void bch_invalidate_bucket(struct cache *, struct bucket *);
+void bch_mark_free_bucket(struct cache *, struct bucket *);
+void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool);
+void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool);
+
+void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
+ struct bucket_stats_cache_set *);
+void bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool);
+void bch_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
+ struct gc_pos, struct bucket_stats_cache_set *, u64);
+
+void bch_recalc_sectors_available(struct cache_set *);
+
+void bch_disk_reservation_put(struct cache_set *,
+ struct disk_reservation *);
+
+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+#define BCH_DISK_RESERVATION_METADATA (1 << 1)
+#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 2)
+#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 3)
+
+int bch_disk_reservation_add(struct cache_set *,
+ struct disk_reservation *,
+ unsigned, int);
+int bch_disk_reservation_get(struct cache_set *,
+ struct disk_reservation *,
+ unsigned, int);
+
+#endif /* _BUCKETS_H */
diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h
new file mode 100644
index 0000000..6bbdcd2
--- /dev/null
+++ b/libbcache/buckets_types.h
@@ -0,0 +1,99 @@
+#ifndef _BUCKETS_TYPES_H
+#define _BUCKETS_TYPES_H
+
+struct bucket_mark {
+ union {
+ struct {
+ u64 counter;
+ };
+
+ struct {
+ u8 gen;
+
+ /* generation copygc is going to move this bucket into */
+ unsigned copygc:1;
+ unsigned wait_on_journal:1;
+
+ /*
+ * If this bucket ever had metadata in it, the allocator must
+ * increment its gen before we reuse it:
+ */
+ unsigned had_metadata:1;
+
+ unsigned owned_by_allocator:1;
+ unsigned is_metadata:1;
+
+ u16 cached_sectors;
+ u16 dirty_sectors;
+
+ /*
+ * low bits of journal sequence number when this bucket was most
+ * recently modified:
+ */
+ u16 journal_seq;
+ };
+ };
+};
+
+struct bucket {
+ union {
+ struct {
+ u16 read_prio;
+ u16 write_prio;
+ };
+ u16 prio[2];
+ };
+
+ union {
+ struct bucket_mark _mark;
+ const struct bucket_mark mark;
+ };
+};
+
+struct bucket_stats_cache {
+ u64 buckets_dirty;
+ u64 buckets_cached;
+ u64 buckets_meta;
+ u64 buckets_alloc;
+
+ u64 sectors_dirty;
+ u64 sectors_cached;
+ u64 sectors_meta;
+};
+
+enum s_alloc {
+ S_META,
+ S_DIRTY,
+ S_CACHED,
+ S_ALLOC_NR,
+};
+
+enum s_compressed {
+ S_COMPRESSED,
+ S_UNCOMPRESSED,
+ S_COMPRESSED_NR,
+};
+
+struct bucket_stats_cache_set {
+ /* all fields are in units of 512 byte sectors: */
+ u64 s[S_COMPRESSED_NR][S_ALLOC_NR];
+ u64 persistent_reserved;
+ u64 online_reserved;
+ u64 available_cache;
+};
+
+struct bucket_heap_entry {
+ struct bucket *g;
+ unsigned long val;
+};
+
+/*
+ * A reservation for space on disk:
+ */
+struct disk_reservation {
+ u64 sectors;
+ u32 gen;
+ unsigned nr_replicas;
+};
+
+#endif /* _BUCKETS_TYPES_H */
diff --git a/libbcache/chardev.c b/libbcache/chardev.c
new file mode 100644
index 0000000..0b020c8
--- /dev/null
+++ b/libbcache/chardev.c
@@ -0,0 +1,319 @@
+/*
+ * This file adds support for a character device /dev/bcache that is used to
+ * atomically register a list of devices, remove a device from a cache_set
+ * and add a device to a cache set.
+ *
+ * Copyright (c) 2014 Datera, Inc.
+ *
+ */
+
+#include "bcache.h"
+#include "super.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/major.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/ioctl.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bcache-ioctl.h>
+
+static long bch_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+ struct bch_ioctl_assemble arg;
+ const char *err;
+ u64 *user_devs = NULL;
+ char **devs = NULL;
+ unsigned i;
+ int ret = -EFAULT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+	user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+	if (!user_devs)
+		return -ENOMEM;
+
+	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+	if (!devs) {
+		kfree(user_devs);
+		return -ENOMEM;
+	}
+
+ if (copy_from_user(user_devs, user_arg->devs,
+ sizeof(u64) * arg.nr_devs))
+ goto err;
+
+ for (i = 0; i < arg.nr_devs; i++) {
+ devs[i] = strndup_user((const char __user *)(unsigned long)
+ user_devs[i],
+ PATH_MAX);
+ if (!devs[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
+ err = bch_register_cache_set(devs, arg.nr_devs,
+ cache_set_opts_empty(),
+ NULL);
+ if (err) {
+ pr_err("Could not register cache set: %s", err);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = 0;
+err:
+ if (devs)
+ for (i = 0; i < arg.nr_devs; i++)
+ kfree(devs[i]);
+	kfree(devs);
+	kfree(user_devs);
+ return ret;
+}
+
+static long bch_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+ struct bch_ioctl_incremental arg;
+ const char *err;
+ char *path;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ if (!path)
+ return -ENOMEM;
+
+ err = bch_register_one(path);
+ kfree(path);
+
+ if (err) {
+ pr_err("Could not register bcache devices: %s", err);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static long bch_global_ioctl(unsigned cmd, void __user *arg)
+{
+ switch (cmd) {
+ case BCH_IOCTL_ASSEMBLE:
+ return bch_ioctl_assemble(arg);
+ case BCH_IOCTL_INCREMENTAL:
+ return bch_ioctl_incremental(arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long bch_ioctl_stop(struct cache_set *c)
+{
+ bch_cache_set_stop(c);
+ return 0;
+}
+
+static long bch_ioctl_disk_add(struct cache_set *c,
+ struct bch_ioctl_disk_add __user *user_arg)
+{
+ struct bch_ioctl_disk_add arg;
+ char *path;
+ int ret;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ if (!path)
+ return -ENOMEM;
+
+ ret = bch_cache_set_add_cache(c, path);
+ kfree(path);
+
+ return ret;
+}
+
+/* returns with ref on ca->ref */
+static struct cache *bch_device_lookup(struct cache_set *c,
+ const char __user *dev)
+{
+ struct block_device *bdev;
+ struct cache *ca;
+ char *path;
+ unsigned i;
+
+ path = strndup_user(dev, PATH_MAX);
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = lookup_bdev(strim(path));
+ kfree(path);
+ if (IS_ERR(bdev))
+ return ERR_CAST(bdev);
+
+ for_each_cache(ca, c, i)
+ if (ca->disk_sb.bdev == bdev)
+ goto found;
+
+ ca = NULL;
+found:
+ bdput(bdev);
+ return ca;
+}
+
+static long bch_ioctl_disk_remove(struct cache_set *c,
+ struct bch_ioctl_disk_remove __user *user_arg)
+{
+ struct bch_ioctl_disk_remove arg;
+ struct cache *ca;
+ int ret;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch_cache_remove(ca, arg.flags & BCH_FORCE_IF_DATA_MISSING)
+ ? 0 : -EBUSY;
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static long bch_ioctl_disk_fail(struct cache_set *c,
+ struct bch_ioctl_disk_fail __user *user_arg)
+{
+ struct bch_ioctl_disk_fail arg;
+ struct cache *ca;
+ int ret;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ /* XXX: failed not actually implemented yet */
+ ret = bch_cache_remove(ca, true);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static struct cache_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid)
+{
+ struct cache_member *mi = c->disk_mi;
+ unsigned i;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ for (i = 0; i < c->disk_sb.nr_in_set; i++)
+ if (!memcmp(&mi[i].uuid, &uuid, sizeof(uuid)))
+ return &mi[i];
+
+ return NULL;
+}
+
+static long bch_ioctl_disk_remove_by_uuid(struct cache_set *c,
+ struct bch_ioctl_disk_remove_by_uuid __user *user_arg)
+{
+	struct bch_ioctl_disk_remove_by_uuid arg;
+ struct cache_member *m;
+ int ret = -ENOENT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ mutex_lock(&bch_register_lock);
+ if ((m = bch_uuid_lookup(c, arg.dev))) {
+ /* XXX: */
+ SET_CACHE_STATE(m, CACHE_FAILED);
+ bcache_write_super(c);
+ ret = 0;
+ }
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
+static long bch_ioctl_disk_fail_by_uuid(struct cache_set *c,
+ struct bch_ioctl_disk_fail_by_uuid __user *user_arg)
+{
+ struct bch_ioctl_disk_fail_by_uuid arg;
+ struct cache_member *m;
+ int ret = -ENOENT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ mutex_lock(&bch_register_lock);
+ if ((m = bch_uuid_lookup(c, arg.dev))) {
+ SET_CACHE_STATE(m, CACHE_FAILED);
+ bcache_write_super(c);
+ ret = 0;
+ }
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
+static long bch_ioctl_query_uuid(struct cache_set *c,
+ struct bch_ioctl_query_uuid __user *user_arg)
+{
+	return copy_to_user(&user_arg->uuid,
+			    &c->disk_sb.user_uuid,
+			    sizeof(c->disk_sb.user_uuid))
+		? -EFAULT : 0;
+}
+
+long bch_cache_set_ioctl(struct cache_set *c, unsigned cmd, void __user *arg)
+{
+ /* ioctls that don't require admin cap: */
+ switch (cmd) {
+ case BCH_IOCTL_QUERY_UUID:
+ return bch_ioctl_query_uuid(c, arg);
+ }
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* ioctls that do require admin cap: */
+ switch (cmd) {
+ case BCH_IOCTL_RUN:
+ return -ENOTTY;
+ case BCH_IOCTL_STOP:
+ return bch_ioctl_stop(c);
+
+ case BCH_IOCTL_DISK_ADD:
+ return bch_ioctl_disk_add(c, arg);
+ case BCH_IOCTL_DISK_REMOVE:
+ return bch_ioctl_disk_remove(c, arg);
+ case BCH_IOCTL_DISK_FAIL:
+ return bch_ioctl_disk_fail(c, arg);
+
+ case BCH_IOCTL_DISK_REMOVE_BY_UUID:
+ return bch_ioctl_disk_remove_by_uuid(c, arg);
+ case BCH_IOCTL_DISK_FAIL_BY_UUID:
+ return bch_ioctl_disk_fail_by_uuid(c, arg);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long bch_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+ struct cache_set *c = filp->private_data;
+ void __user *arg = (void __user *) v;
+
+ return c
+ ? bch_cache_set_ioctl(c, cmd, arg)
+ : bch_global_ioctl(cmd, arg);
+}
+
+const struct file_operations bch_chardev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = bch_chardev_ioctl,
+ .open = nonseekable_open,
+};
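+
+/*
+ * Illustrative userspace counterpart (not part of this patch): issuing
+ * BCH_IOCTL_QUERY_UUID against an open control device. The path of the per
+ * cache set control node is left to the caller, and query_uuid_example is a
+ * placeholder name; the ioctl number and argument struct come from
+ * linux/bcache-ioctl.h as included above.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <linux/bcache-ioctl.h>
+
+static int query_uuid_example(const char *ctrl_path)
+{
+	struct bch_ioctl_query_uuid arg;
+	int fd = open(ctrl_path, O_RDONLY);
+
+	if (fd < 0)
+		return -1;
+
+	if (ioctl(fd, BCH_IOCTL_QUERY_UUID, &arg)) {
+		close(fd);
+		return -1;
+	}
+
+	close(fd);
+	/* arg.uuid now holds the cache set's user-visible UUID */
+	return 0;
+}
+#endif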
diff --git a/libbcache/chardev.h b/libbcache/chardev.h
new file mode 100644
index 0000000..657bf2b
--- /dev/null
+++ b/libbcache/chardev.h
@@ -0,0 +1,7 @@
+#ifndef _BCACHE_CHARDEV_H
+#define _BCACHE_CHARDEV_H
+
+long bch_cache_set_ioctl(struct cache_set *, unsigned, void __user *);
+extern const struct file_operations bch_chardev_fops;
+
+#endif /* _BCACHE_CHARDEV_H */
diff --git a/libbcache/checksum.c b/libbcache/checksum.c
new file mode 100644
index 0000000..beae0b2
--- /dev/null
+++ b/libbcache/checksum.c
@@ -0,0 +1,174 @@
+
+#include "bcache.h"
+#include "checksum.h"
+
+#include <linux/crc32c.h>
+#include <crypto/chacha20.h>
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+
+/*
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
+ * use permitted, subject to terms of PostgreSQL license; see.)
+ *
+ * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
+ * usual sort of implementation. (See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
+ * If we have no working 64-bit type, then fake it with two 32-bit registers.
+ *
+ * The present implementation is a normal (not "reflected", in Williams'
+ * terms) 64-bit CRC, using initial all-ones register contents and a final
+ * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
+ * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+*/
+
+static const u64 crc_table[256] = {
+ 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
+ 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
+ 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
+ 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
+ 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
+ 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
+ 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
+ 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
+ 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
+ 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
+ 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
+ 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
+ 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
+ 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
+ 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
+ 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
+ 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
+ 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
+ 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
+ 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
+ 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
+ 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
+ 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
+ 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
+ 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
+ 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
+ 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
+ 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
+ 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
+ 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
+ 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
+ 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
+ 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
+ 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
+ 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
+ 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
+ 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
+ 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
+ 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
+ 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
+ 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
+ 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
+ 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
+ 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
+ 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
+ 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
+ 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
+ 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
+ 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
+ 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
+ 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
+ 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
+ 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
+ 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
+ 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
+ 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
+ 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
+ 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
+ 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
+ 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
+ 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
+ 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
+ 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
+ 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
+ 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
+ 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
+ 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
+ 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
+ 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
+ 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
+ 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
+ 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
+ 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
+ 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
+ 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
+ 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
+ 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
+ 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
+ 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
+ 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
+ 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
+ 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
+ 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
+ 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
+ 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
+ 0x9AFCE626CE85B507ULL,
+};
+
+u64 bch_crc64_update(u64 crc, const void *_data, size_t len)
+{
+ const unsigned char *data = _data;
+
+ while (len--) {
+ int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
+ crc = crc_table[i] ^ (crc << 8);
+ }
+
+ return crc;
+}
+
+u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+{
+ switch (type) {
+ case BCH_CSUM_NONE:
+ return 0;
+ case BCH_CSUM_CRC32C:
+ return crc32c(crc, data, len);
+ case BCH_CSUM_CRC64:
+ return bch_crc64_update(crc, data, len);
+ default:
+ BUG();
+ }
+}
+
+u64 bch_checksum(unsigned type, const void *data, size_t len)
+{
+ u64 crc = 0xffffffffffffffffULL;
+
+ crc = bch_checksum_update(type, crc, data, len);
+
+ return crc ^ 0xffffffffffffffffULL;
+}
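+
+/*
+ * Illustrative usage only ("buf" and "len" are placeholders): the seeding
+ * and final inversion are handled inside bch_checksum(), so callers just
+ * pass the checksum type, buffer and length:
+ */
+#if 0
+	u64 csum = bch_checksum(BCH_CSUM_CRC64, buf, len);
+#endif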
+
+u32 bch_checksum_bio(struct bio *bio, unsigned type)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ u32 csum = U32_MAX;
+
+ if (type == BCH_CSUM_NONE)
+ return 0;
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *p = kmap_atomic(bv.bv_page);
+
+ csum = bch_checksum_update(type, csum,
+ p + bv.bv_offset,
+ bv.bv_len);
+ kunmap_atomic(p);
+ }
+
+	return csum ^ U32_MAX;
+}
diff --git a/libbcache/checksum.h b/libbcache/checksum.h
new file mode 100644
index 0000000..196b7e8
--- /dev/null
+++ b/libbcache/checksum.h
@@ -0,0 +1,24 @@
+#ifndef _BCACHE_CHECKSUM_H
+#define _BCACHE_CHECKSUM_H
+
+#include "btree_types.h"
+
+u64 bch_crc64_update(u64, const void *, size_t);
+
+u64 bch_checksum_update(unsigned, u64, const void *, size_t);
+u64 bch_checksum(unsigned, const void *, size_t);
+u32 bch_checksum_bio(struct bio *, unsigned);
+
+/*
+ * This is used for various on disk data structures - cache_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first 8 bytes of these structs
+ */
+#define __csum_set(i, u64s, type) \
+({ \
+ const void *start = ((const void *) (i)) + sizeof(u64); \
+ const void *end = __bkey_idx(i, u64s); \
+ \
+ bch_checksum(type, start, end - start); \
+})
+
+#endif /* _BCACHE_CHECKSUM_H */
diff --git a/libbcache/clock.c b/libbcache/clock.c
new file mode 100644
index 0000000..8218769
--- /dev/null
+++ b/libbcache/clock.c
@@ -0,0 +1,161 @@
+#include "bcache.h"
+#include "clock.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r)
+{
+ return time_after(l->expire, r->expire);
+}
+
+void bch_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer)
+ goto out;
+
+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
+out:
+ spin_unlock(&clock->timer_lock);
+}
+
+void bch_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer) {
+ heap_del(&clock->timers, i, io_timer_cmp);
+ break;
+ }
+
+ spin_unlock(&clock->timer_lock);
+}
+
+struct io_clock_wait {
+ struct io_timer timer;
+ struct task_struct *task;
+ int expired;
+};
+
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+ struct io_clock_wait *wait = container_of(timer,
+ struct io_clock_wait, timer);
+
+ wait->expired = 1;
+ wake_up_process(wait->task);
+}
+
+void bch_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+ struct io_clock_wait wait;
+
+ /* XXX: calculate sleep time rigorously */
+ wait.timer.expire = until;
+ wait.timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch_io_timer_add(clock, &wait.timer);
+
+ schedule();
+
+ bch_io_timer_del(clock, &wait.timer);
+}
+
+/*
+ * _only_ to be used from a kthread
+ */
+void bch_kthread_io_clock_wait(struct io_clock *clock,
+ unsigned long until)
+{
+ struct io_clock_wait wait;
+
+ /* XXX: calculate sleep time rigorously */
+ wait.timer.expire = until;
+ wait.timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch_io_timer_add(clock, &wait.timer);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+
+ if (wait.expired)
+ break;
+
+ schedule();
+ try_to_freeze();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ bch_io_timer_del(clock, &wait.timer);
+}
+
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+ unsigned long now)
+{
+ struct io_timer *ret = NULL;
+
+ spin_lock(&clock->timer_lock);
+
+ if (clock->timers.used &&
+ time_after_eq(now, clock->timers.data[0]->expire))
+ heap_pop(&clock->timers, ret, io_timer_cmp);
+
+ spin_unlock(&clock->timer_lock);
+
+ return ret;
+}
+
+void bch_increment_clock(struct cache_set *c, unsigned sectors, int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+ struct io_timer *timer;
+ unsigned long now;
+
+	/* Buffer up IO_CLOCK_PCPU_SECTORS worth of IO in the percpu counter */
+ preempt_disable();
+
+ if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
+ IO_CLOCK_PCPU_SECTORS)) {
+ preempt_enable();
+ return;
+ }
+
+ sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
+ preempt_enable();
+ now = atomic_long_add_return(sectors, &clock->now);
+
+ while ((timer = get_expired_timer(clock, now)))
+ timer->fn(timer);
+}
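+
+/*
+ * Illustrative sketch of arming an io_timer (example_* names are
+ * placeholders): the timer fires once this clock has advanced roughly
+ * another 1024 sectors of IO. The callback runs from whatever context calls
+ * bch_increment_clock(), so it must not sleep.
+ */
+#if 0
+static void example_timer_fn(struct io_timer *timer)
+{
+	/* e.g. wake up a throttled writer */
+}
+
+static void example_arm_timer(struct io_clock *clock, struct io_timer *timer)
+{
+	timer->fn	= example_timer_fn;
+	timer->expire	= atomic_long_read(&clock->now) + 1024;
+	bch_io_timer_add(clock, timer);
+}
+#endif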
+
+void bch_io_clock_exit(struct io_clock *clock)
+{
+ free_heap(&clock->timers);
+ free_percpu(clock->pcpu_buf);
+}
+
+int bch_io_clock_init(struct io_clock *clock)
+{
+ atomic_long_set(&clock->now, 0);
+ spin_lock_init(&clock->timer_lock);
+
+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+ if (!clock->pcpu_buf)
+ return -ENOMEM;
+
+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/libbcache/clock.h b/libbcache/clock.h
new file mode 100644
index 0000000..f59f071
--- /dev/null
+++ b/libbcache/clock.h
@@ -0,0 +1,23 @@
+#ifndef _BCACHE_CLOCK_H
+#define _BCACHE_CLOCK_H
+
+void bch_io_timer_add(struct io_clock *, struct io_timer *);
+void bch_io_timer_del(struct io_clock *, struct io_timer *);
+void bch_kthread_io_clock_wait(struct io_clock *, unsigned long);
+void bch_increment_clock(struct cache_set *, unsigned, int);
+
+void bch_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+
+#define bch_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_timeout(wq, condition, timeout); \
+ __ret; \
+})
+
+void bch_io_clock_exit(struct io_clock *);
+int bch_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHE_CLOCK_H */
diff --git a/libbcache/clock_types.h b/libbcache/clock_types.h
new file mode 100644
index 0000000..4a02f46
--- /dev/null
+++ b/libbcache/clock_types.h
@@ -0,0 +1,34 @@
+#ifndef _BCACHE_CLOCK_TYPES_H
+#define _BCACHE_CLOCK_TYPES_H
+
+#include "util.h"
+
+#define NR_IO_TIMERS 8
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+struct io_timer {
+ io_timer_fn fn;
+ unsigned long expire;
+};
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS 128
+
+struct io_clock {
+ atomic_long_t now;
+ u16 __percpu *pcpu_buf;
+
+ spinlock_t timer_lock;
+ DECLARE_HEAP(struct io_timer *, timers);
+};
+
+#endif /* _BCACHE_CLOCK_TYPES_H */
+
diff --git a/libbcache/closure.c b/libbcache/closure.c
new file mode 100644
index 0000000..f6f4dd9
--- /dev/null
+++ b/libbcache/closure.c
@@ -0,0 +1,210 @@
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include "closure.h"
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+ int r = flags & CLOSURE_REMAINING_MASK;
+
+ BUG_ON(flags & CLOSURE_GUARD_MASK);
+ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
+
+ if (!r) {
+ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
+ atomic_set(&cl->remaining,
+ CLOSURE_REMAINING_INITIALIZER);
+ closure_queue(cl);
+ } else {
+ struct closure *parent = cl->parent;
+ closure_fn *destructor = cl->fn;
+
+ closure_debug_destroy(cl);
+
+ if (destructor)
+ destructor(cl);
+
+ if (parent)
+ closure_put(parent);
+ }
+ }
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL(closure_sub);
+
+/**
+ * closure_put - decrement a closure's refcount
+ */
+void closure_put(struct closure *cl)
+{
+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL(closure_put);
+
+/**
+ * __closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+ struct llist_node *list, *next;
+ struct closure *cl;
+
+ /*
+ * Grab entire list, reverse order to preserve FIFO ordering, and wake
+ * everything up
+ */
+ for (list = llist_reverse_order(llist_del_all(&wait_list->list));
+ list;
+ list = next) {
+ next = llist_next(list);
+ cl = container_of(list, struct closure, list);
+
+ closure_set_waiting(cl, 0);
+ closure_sub(cl, CLOSURE_WAITING + 1);
+ }
+}
+EXPORT_SYMBOL(__closure_wake_up);
+
+/**
+ * closure_wait - add a closure to a waitlist
+ *
+ * @waitlist will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ *
+ */
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ return false;
+
+ closure_set_waiting(cl, _RET_IP_);
+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+ llist_add(&cl->list, &waitlist->list);
+
+ return true;
+}
+EXPORT_SYMBOL(closure_wait);
+
+struct closure_syncer {
+ struct task_struct *task;
+ int done;
+};
+
+static void closure_sync_fn(struct closure *cl)
+{
+ cl->s->done = 1;
+ wake_up_process(cl->s->task);
+}
+
+void __sched __closure_sync(struct closure *cl)
+{
+ struct closure_syncer s = { .task = current };
+
+ cl->s = &s;
+ continue_at_noreturn(cl, closure_sync_fn, NULL);
+
+ while (1) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ smp_mb();
+ if (s.done)
+ break;
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(__closure_sync);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_ALIVE;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_add(&cl->all, &closure_list);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_DEAD;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_del(&cl->all);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+ struct closure *cl;
+
+ spin_lock_irq(&closure_list_lock);
+
+ list_for_each_entry(cl, &closure_list, all) {
+ int r = atomic_read(&cl->remaining);
+
+ seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+ cl, (void *) cl->ip, cl->fn, cl->parent,
+ r & CLOSURE_REMAINING_MASK);
+
+ seq_printf(f, "%s%s\n",
+ test_bit(WORK_STRUCT_PENDING_BIT,
+ work_data_bits(&cl->work)) ? "Q" : "",
+ r & CLOSURE_RUNNING ? "R" : "");
+
+ if (r & CLOSURE_WAITING)
+ seq_printf(f, " W %pF\n",
+ (void *) cl->waiting_on);
+
+ seq_puts(f, "\n");
+ }
+
+ spin_unlock_irq(&closure_list_lock);
+ return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+ .owner = THIS_MODULE,
+ .open = debug_seq_open,
+ .read = seq_read,
+ .release = single_release
+};
+
+void __init closure_debug_init(void)
+{
+ debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+}
+
+#endif
diff --git a/libbcache/closure.h b/libbcache/closure.h
new file mode 100644
index 0000000..b55254b
--- /dev/null
+++ b/libbcache/closure.h
@@ -0,0 +1,387 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ * continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, requires a 'return' immediately following the
+ * location where this macro is referenced, to return to the calling function.
+ * There's good reason for this.
+ *
+ * To safely use closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio)
+ * {
+ * closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
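+
+/*
+ * Illustrative sketch of the pattern described above (foo_* names are
+ * placeholders; the caller is assumed to have done closure_init(cl, parent)):
+ */
+#if 0
+static void foo_complete(struct closure *cl);
+
+static void foo_endio(struct bio *bio)
+{
+	struct closure *cl = bio->bi_private;
+
+	closure_put(cl);
+}
+
+static void foo_read(struct closure *cl, struct bio *bio1, struct bio *bio2)
+{
+	closure_get(cl);
+	bio1->bi_private = cl;
+	bio1->bi_end_io	 = foo_endio;
+	submit_bio(bio1);
+
+	closure_get(cl);
+	bio2->bi_private = cl;
+	bio2->bi_end_io	 = foo_endio;
+	submit_bio(bio2);
+
+	/* foo_complete() runs out of system_wq once both bios have finished: */
+	continue_at(cl, foo_complete, system_wq);
+}
+#endif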
+
+struct closure;
+struct closure_syncer;
+typedef void (closure_fn) (struct closure *);
+
+struct closure_waitlist {
+ struct llist_head list;
+};
+
+enum closure_state {
+ /*
+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+ * the thread that owns the closure, and cleared by the thread that's
+ * waking up the closure.
+ *
+ * The rest are for debugging and don't affect behaviour:
+ *
+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+ * closure_init() and when closure_put() runs the next function), and
+ * must be cleared before remaining hits 0. Primarily to help guard
+ * against incorrect usage and accidentally transferring references.
+ * continue_at() and closure_return() clear it for you, if you're doing
+ * something unusual you can use closure_set_dead() which also helps
+ * annotate where references are being transferred.
+ */
+
+ CLOSURE_BITS_START = (1U << 27),
+ CLOSURE_DESTRUCTOR = (1U << 27),
+ CLOSURE_WAITING = (1U << 29),
+ CLOSURE_RUNNING = (1U << 31),
+};
+
+#define CLOSURE_GUARD_MASK \
+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
+
+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
+
+struct closure {
+ union {
+ struct {
+ struct workqueue_struct *wq;
+ struct closure_syncer *s;
+ struct llist_node list;
+ closure_fn *fn;
+ };
+ struct work_struct work;
+ };
+
+ struct closure *parent;
+
+ atomic_t remaining;
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#define CLOSURE_MAGIC_DEAD 0xc054dead
+#define CLOSURE_MAGIC_ALIVE 0xc054a11e
+
+ unsigned magic;
+ struct list_head all;
+ unsigned long ip;
+ unsigned long waiting_on;
+#endif
+};
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+ * closure_sync - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+ __closure_sync(cl);
+}
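+
+/*
+ * Illustrative usage only: the synchronous wait pattern with an on-stack
+ * closure ("do_async_thing" is a placeholder for code that takes a ref with
+ * closure_get() and drops it with closure_put() on completion):
+ */
+#if 0
+	struct closure cl;
+
+	closure_init_stack(&cl);
+	do_async_thing(&cl);
+	closure_sync(&cl);	/* sleeps until only our ref remains */
+#endif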
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+void closure_debug_init(void);
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_init(void) {}
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->waiting_on = f;
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+ atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+ struct workqueue_struct *wq)
+{
+ closure_set_ip(cl);
+ cl->fn = fn;
+ cl->wq = wq;
+	/* ensure fn/wq are visible before the atomic_dec() in closure_put(): */
+ smp_mb__before_atomic();
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+ struct workqueue_struct *wq = cl->wq;
+
+ if (wq) {
+ INIT_WORK(&cl->work, cl->work.func);
+ queue_work(wq, &cl->work);
+ } else
+ cl->fn(cl);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ BUG_ON((atomic_inc_return(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) <= 1);
+#else
+ atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl: closure to initialize
+ * @parent: parent of the new closure. cl will take a refcount on it for its
+ * lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+ cl->fn = NULL;
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+ closure_debug_create(cl);
+ closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list.
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+ smp_mb();
+ __closure_wake_up(list);
+}
+
+#define continue_at_noreturn(_cl, _fn, _wq) \
+do { \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_sub(_cl, CLOSURE_RUNNING + 1); \
+} while (0)
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * NOTE: This macro expands to a return in the calling function!
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
+ */
+#define continue_at(_cl, _fn, _wq) \
+do { \
+ continue_at_noreturn(_cl, _fn, _wq); \
+ return; \
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * NOTE: like continue_at(), this macro expands to a return in the caller!
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq) \
+do { \
+ closure_set_ip(_cl); \
+ if (_wq) { \
+ INIT_WORK(&(_cl)->work, (void *) _fn); \
+ queue_work((_wq), &(_cl)->work); \
+ } else { \
+ (_fn)(_cl); \
+ } \
+ return; \
+} while (0)
+
+#define closure_return_with_destructor_noreturn(_cl, _destructor) \
+do { \
+ set_closure_fn(_cl, _destructor, NULL); \
+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
+} while (0)
+
+/**
+ * closure_return_with_destructor - finish execution of a closure, with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor) \
+do { \
+ closure_return_with_destructor_noreturn(_cl, _destructor); \
+ return; \
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+ struct workqueue_struct *wq,
+ struct closure *parent)
+{
+ closure_init(cl, parent);
+ continue_at_nobarrier(cl, fn, wq);
+}
+
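+/*
+ * Illustrative sketches of how the primitives above compose. The functions
+ * below are hypothetical and not part of this patch; only the closure calls
+ * themselves come from this header. Guarded out since they exist purely as
+ * documentation.
+ */
+#if 0
+/* async chaining with continue_at()/closure_return(): */
+static void foo_finish(struct closure *cl)
+{
+	/* every outstanding ref has been dropped; return to the parent */
+	closure_return(cl);
+}
+
+static void foo_do_thing(struct closure *cl)
+{
+	/*
+	 * hand a ref to some async work whose completion calls closure_put(),
+	 * then run foo_finish() out of system_wq once all refs are gone:
+	 */
+	closure_get(cl);
+	/* ... submit async work that ends with closure_put(cl) ... */
+
+	continue_at(cl, foo_finish, system_wq);
+}
+
+/* synchronous wait with a stack closure (cf. __bch_btree_verify()): */
+static void foo_wait_example(void)
+{
+	struct closure cl;
+
+	closure_init_stack(&cl);
+	closure_get(&cl);
+	/* ... submit async work that ends with closure_put(&cl) ... */
+	closure_sync(&cl);
+}
+#endif
+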
+#endif /* _LINUX_CLOSURE_H */
diff --git a/libbcache/compress.c b/libbcache/compress.c
new file mode 100644
index 0000000..f7bfd57
--- /dev/null
+++ b/libbcache/compress.c
@@ -0,0 +1,458 @@
+#include "bcache.h"
+#include "compress.h"
+#include "io.h"
+
+#include <linux/lz4.h>
+#include <linux/zlib.h>
+
+enum bounced {
+ BOUNCED_MAPPED,
+ BOUNCED_KMALLOCED,
+ BOUNCED_VMALLOCED,
+ BOUNCED_MEMPOOLED,
+};
+
+static void *__bounce_alloc(struct cache_set *c, unsigned size,
+ unsigned *bounced, int direction)
+{
+ void *data;
+
+ *bounced = BOUNCED_KMALLOCED;
+ data = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
+ if (data)
+ return data;
+
+ *bounced = BOUNCED_MEMPOOLED;
+ data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT);
+ if (data)
+ return page_address(data);
+
+ *bounced = BOUNCED_VMALLOCED;
+ data = vmalloc(size);
+ if (data)
+ return data;
+
+ *bounced = BOUNCED_MEMPOOLED;
+ data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO);
+ return page_address(data);
+}
+
+static void *__bio_map_or_bounce(struct cache_set *c,
+ struct bio *bio, struct bvec_iter start,
+ unsigned *bounced, int direction)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned nr_pages = 0;
+ struct page *stack_pages[16];
+ struct page **pages = NULL;
+ bool first = true;
+ unsigned prev_end = PAGE_SIZE;
+ void *data;
+
+ BUG_ON(bvec_iter_sectors(start) > BCH_COMPRESSED_EXTENT_MAX);
+
+ *bounced = BOUNCED_MAPPED;
+
+ __bio_for_each_segment(bv, bio, iter, start) {
+ if ((!first && bv.bv_offset) ||
+ prev_end != PAGE_SIZE)
+ goto bounce;
+
+ prev_end = bv.bv_offset + bv.bv_len;
+ nr_pages++;
+ first = false;
+ }
+
+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
+
+ pages = nr_pages > ARRAY_SIZE(stack_pages)
+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
+ : stack_pages;
+ if (!pages)
+ goto bounce;
+
+ nr_pages = 0;
+ __bio_for_each_segment(bv, bio, iter, start)
+ pages[nr_pages++] = bv.bv_page;
+
+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (pages != stack_pages)
+ kfree(pages);
+
+ if (!data)
+ goto bounce;
+
+ return data + bio_iter_offset(bio, start);
+bounce:
+ data = __bounce_alloc(c, start.bi_size, bounced, direction);
+
+ if (direction == READ)
+ memcpy_from_bio(data, bio, start);
+
+ return data;
+}
+
+static void *bio_map_or_bounce(struct cache_set *c, struct bio *bio,
+ unsigned *bounced, int direction)
+{
+ return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction);
+}
+
+static void bio_unmap_or_unbounce(struct cache_set *c, void *data,
+ unsigned bounced, int direction)
+{
+ if (!data)
+ return;
+
+ switch (bounced) {
+ case BOUNCED_MAPPED:
+ vunmap((void *) ((unsigned long) data & PAGE_MASK));
+ return;
+ case BOUNCED_KMALLOCED:
+ kfree(data);
+ return;
+ case BOUNCED_VMALLOCED:
+ vfree(data);
+ return;
+ case BOUNCED_MEMPOOLED:
+ mempool_free(virt_to_page(data), &c->compression_bounce[direction]);
+ return;
+ }
+}
+
+static int __bio_uncompress(struct cache_set *c, struct bio *src,
+ void *dst_data, struct bch_extent_crc64 crc)
+{
+ void *src_data = NULL;
+ unsigned src_bounced;
+ size_t src_len = src->bi_iter.bi_size;
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret;
+
+ src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
+
+ switch (crc.compression_type) {
+ case BCH_COMPRESSION_LZ4:
+ ret = lz4_decompress(src_data, &src_len,
+ dst_data, dst_len);
+ if (ret) {
+ ret = -EIO;
+ goto err;
+ }
+ break;
+ case BCH_COMPRESSION_GZIP: {
+ void *workspace;
+ z_stream strm;
+
+ workspace = kmalloc(zlib_inflate_workspacesize(),
+ GFP_NOIO|__GFP_NOWARN);
+ if (!workspace) {
+ mutex_lock(&c->zlib_workspace_lock);
+ workspace = c->zlib_workspace;
+ }
+
+ strm.workspace = workspace;
+ strm.next_in = src_data;
+ strm.avail_in = src_len;
+ strm.next_out = dst_data;
+ strm.avail_out = dst_len;
+ zlib_inflateInit2(&strm, -MAX_WBITS);
+
+ ret = zlib_inflate(&strm, Z_FINISH);
+
+ if (workspace == c->zlib_workspace)
+ mutex_unlock(&c->zlib_workspace_lock);
+ else
+ kfree(workspace);
+
+ if (ret != Z_STREAM_END) {
+ ret = -EIO;
+ goto err;
+ }
+ break;
+ }
+ default:
+ BUG();
+ }
+ ret = 0;
+err:
+ bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
+ return ret;
+}
+
+int bch_bio_uncompress_inplace(struct cache_set *c, struct bio *bio,
+ unsigned live_data_sectors,
+ struct bch_extent_crc64 crc)
+{
+ void *dst_data = NULL;
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret = -ENOMEM;
+
+ BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
+
+ /* XXX mempoolify */
+ dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN);
+ if (!dst_data) {
+ dst_data = vmalloc(dst_len);
+ if (!dst_data)
+ goto err;
+ }
+
+ ret = __bio_uncompress(c, bio, dst_data, crc);
+ if (ret)
+ goto err;
+
+ while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (!bv->bv_page)
+ goto use_mempool;
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+ bio->bi_vcnt++;
+ }
+
+ bio->bi_iter.bi_size = live_data_sectors << 9;
+copy_data:
+ memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9));
+err:
+ kvfree(dst_data);
+ return ret;
+use_mempool:
+ /*
+ * We already allocated from mempool, we can't allocate from it again
+ * without freeing the pages we already allocated or else we could
+ * deadlock:
+ */
+
+ bch_bio_free_pages_pool(c, bio);
+ bch_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
+ goto copy_data;
+}
+
+int bch_bio_uncompress(struct cache_set *c, struct bio *src,
+ struct bio *dst, struct bvec_iter dst_iter,
+ struct bch_extent_crc64 crc)
+{
+ void *dst_data = NULL;
+ unsigned dst_bounced;
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret = -ENOMEM;
+
+ dst_data = dst_len == dst_iter.bi_size
+ ? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE)
+ : __bounce_alloc(c, dst_len, &dst_bounced, WRITE);
+
+ ret = __bio_uncompress(c, src, dst_data, crc);
+ if (ret)
+ goto err;
+
+ if (dst_bounced)
+ memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9));
+err:
+ bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
+ return ret;
+}
+
+static int __bio_compress(struct cache_set *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned compression_type)
+{
+ void *src_data = NULL, *dst_data = NULL;
+ unsigned src_bounced, dst_bounced, pad;
+ int ret = -1;
+
+ dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE);
+ src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
+
+ switch (compression_type) {
+ case BCH_COMPRESSION_LZ4: {
+ void *workspace;
+
+ *dst_len = dst->bi_iter.bi_size;
+ *src_len = src->bi_iter.bi_size;
+
+ workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
+retry_compress:
+ ret = lz4_compress(src_data, *src_len,
+ dst_data, dst_len,
+ workspace);
+ /*
+ * On error, the compressed data was bigger than dst_len, and
+ * -ret is the amount of data we were able to compress - round
+ * down to nearest block and try again:
+ */
+ if (ret && round_down(-ret, block_bytes(c)) > *dst_len) {
+ BUG_ON(ret > 0);
+
+ /* not supposed to happen */
+ if (WARN_ON(-ret >= *src_len)) {
+ mempool_free(workspace, &c->lz4_workspace_pool);
+ goto err;
+ }
+
+ *src_len = round_down(-ret, block_bytes(c));
+ if (!*src_len) {
+ mempool_free(workspace, &c->lz4_workspace_pool);
+ goto err;
+ }
+
+ goto retry_compress;
+ }
+ mempool_free(workspace, &c->lz4_workspace_pool);
+
+ if (ret)
+ goto err;
+ break;
+ }
+ case BCH_COMPRESSION_GZIP: {
+ void *workspace;
+ z_stream strm;
+
+ workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
+ DEF_MEM_LEVEL),
+ GFP_NOIO|__GFP_NOWARN);
+ if (!workspace) {
+ mutex_lock(&c->zlib_workspace_lock);
+ workspace = c->zlib_workspace;
+ }
+
+ strm.workspace = workspace;
+ strm.next_in = src_data;
+ strm.avail_in = min(src->bi_iter.bi_size,
+ dst->bi_iter.bi_size);
+ strm.next_out = dst_data;
+ strm.avail_out = dst->bi_iter.bi_size;
+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
+
+ ret = zlib_deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END) {
+ ret = -EIO;
+ goto zlib_err;
+ }
+
+ ret = zlib_deflateEnd(&strm);
+ if (ret != Z_OK) {
+ ret = -EIO;
+ goto zlib_err;
+ }
+
+ ret = 0;
+zlib_err:
+ if (workspace == c->zlib_workspace)
+ mutex_unlock(&c->zlib_workspace_lock);
+ else
+ kfree(workspace);
+
+ if (ret)
+ goto err;
+
+ *dst_len = strm.total_out;
+ *src_len = strm.total_in;
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ BUG_ON(!*dst_len);
+
+ /* Didn't get smaller: */
+ if (round_up(*dst_len, block_bytes(c)) >= *src_len) {
+ ret = -1;
+ goto err;
+ }
+
+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+ memset(dst_data + *dst_len, 0, pad);
+ *dst_len += pad;
+
+ if (dst_bounced)
+ memcpy_to_bio(dst, dst->bi_iter, dst_data);
+err:
+ bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
+ bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
+ return ret;
+}
+
+void bch_bio_compress(struct cache_set *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned *compression_type)
+{
+ unsigned orig_dst = dst->bi_iter.bi_size;
+ unsigned orig_src = src->bi_iter.bi_size;
+
+ /* Don't consume more than BCH_COMPRESSED_EXTENT_MAX from @src: */
+ src->bi_iter.bi_size =
+ min(src->bi_iter.bi_size, BCH_COMPRESSED_EXTENT_MAX << 9);
+
+ /* Don't generate a bigger output than input: */
+ dst->bi_iter.bi_size =
+ min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+ /* If it's only one block, don't bother trying to compress: */
+ if (*compression_type != BCH_COMPRESSION_NONE &&
+ bio_sectors(src) > c->sb.block_size &&
+ !__bio_compress(c, dst, dst_len, src, src_len, *compression_type))
+ goto out;
+
+ /* If compressing failed (didn't get smaller), just copy: */
+ *compression_type = BCH_COMPRESSION_NONE;
+ *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ bio_copy_data(dst, src);
+out:
+ dst->bi_iter.bi_size = orig_dst;
+ src->bi_iter.bi_size = orig_src;
+}
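+
+/*
+ * Sketch of the calling convention for bch_bio_compress() (the caller below is
+ * hypothetical, not part of this patch): the caller picks a compression type;
+ * on return *compression_type may have been downgraded to BCH_COMPRESSION_NONE,
+ * and *src_len/*dst_len say how much of @src was consumed and how much output
+ * landed in @dst.
+ */
+#if 0
+static void example_compress(struct cache_set *c, struct bio *dst, struct bio *src)
+{
+	size_t src_len, dst_len;
+	unsigned type = BCH_COMPRESSION_LZ4;
+
+	bch_bio_compress(c, dst, &dst_len, src, &src_len, &type);
+
+	/*
+	 * dst now holds dst_len bytes representing the first src_len bytes of
+	 * src; type is BCH_COMPRESSION_NONE if compressing didn't help.
+	 */
+}
+#endif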
+
+void bch_compress_free(struct cache_set *c)
+{
+ vfree(c->zlib_workspace);
+ mempool_exit(&c->lz4_workspace_pool);
+ mempool_exit(&c->compression_bounce[WRITE]);
+ mempool_exit(&c->compression_bounce[READ]);
+ free_percpu(c->bio_decompress_worker);
+}
+
+#define COMPRESSION_WORKSPACE_SIZE \
+ max_t(size_t, zlib_inflate_workspacesize(), \
+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
+
+int bch_compress_init(struct cache_set *c)
+{
+ int ret, cpu;
+
+ c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
+ if (!c->bio_decompress_worker)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bio_decompress_worker *d =
+ per_cpu_ptr(c->bio_decompress_worker, cpu);
+
+ d->c = c;
+ INIT_WORK(&d->work, bch_bio_decompress_work);
+ init_llist_head(&d->bio_list);
+ }
+
+ ret = mempool_init_page_pool(&c->compression_bounce[READ], 1,
+ get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
+ if (ret)
+ return ret;
+
+ ret = mempool_init_page_pool(&c->compression_bounce[WRITE], 1,
+ get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
+ if (ret)
+ return ret;
+
+ ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, 1,
+ LZ4_MEM_COMPRESS);
+ if (ret)
+ return ret;
+
+ c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
+ if (!c->zlib_workspace)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/libbcache/compress.h b/libbcache/compress.h
new file mode 100644
index 0000000..02578ef
--- /dev/null
+++ b/libbcache/compress.h
@@ -0,0 +1,14 @@
+#ifndef _BCACHE_COMPRESS_H
+#define _BCACHE_COMPRESS_H
+
+int bch_bio_uncompress_inplace(struct cache_set *, struct bio *,
+ unsigned, struct bch_extent_crc64);
+int bch_bio_uncompress(struct cache_set *, struct bio *, struct bio *,
+ struct bvec_iter, struct bch_extent_crc64);
+void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
+ struct bio *, size_t *, unsigned *);
+
+void bch_compress_free(struct cache_set *);
+int bch_compress_init(struct cache_set *);
+
+#endif /* _BCACHE_COMPRESS_H */
diff --git a/libbcache/debug.c b/libbcache/debug.c
new file mode 100644
index 0000000..1be2e60
--- /dev/null
+++ b/libbcache/debug.c
@@ -0,0 +1,513 @@
+/*
+ * Assorted bcache debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "fs-gc.h"
+#include "inode.h"
+#include "io.h"
+#include "super.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bch_debug;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+static void btree_verify_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+
+ closure_put(cl);
+}
+
+void __bch_btree_verify(struct cache_set *c, struct btree *b)
+{
+ struct btree *v = c->verify_data;
+ struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
+ struct bset *sorted, *inmemory;
+ struct extent_pick_ptr pick;
+ struct bio *bio;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ btree_node_io_lock(b);
+ mutex_lock(&c->verify_lock);
+
+ n_ondisk = c->verify_ondisk;
+ n_sorted = c->verify_data->data;
+ n_inmemory = b->data;
+
+ bkey_copy(&v->key, &b->key);
+ v->written = 0;
+ v->level = b->level;
+ v->btree_id = b->btree_id;
+ bch_btree_keys_init(v, &c->expensive_debug_checks);
+
+ pick = bch_btree_pick_ptr(c, b);
+ if (IS_ERR_OR_NULL(pick.ca)) {
+ /* don't leak the locks taken above */
+ mutex_unlock(&c->verify_lock);
+ btree_node_io_unlock(b);
+ return;
+ }
+
+ bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ bio->bi_bdev = pick.ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bio->bi_iter.bi_size = btree_bytes(c);
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
+ bio->bi_private = &cl;
+ bio->bi_end_io = btree_verify_endio;
+ bch_bio_map(bio, n_sorted);
+
+ closure_get(&cl);
+ bch_generic_make_request(bio, c);
+ closure_sync(&cl);
+
+ bio_put(bio);
+
+ memcpy(n_ondisk, n_sorted, btree_bytes(c));
+
+ bch_btree_node_read_done(c, v, pick.ca, &pick.ptr);
+ n_sorted = c->verify_data->data;
+
+ percpu_ref_put(&pick.ca->ref);
+
+ sorted = &n_sorted->keys;
+ inmemory = &n_inmemory->keys;
+
+ if (inmemory->u64s != sorted->u64s ||
+ memcmp(inmemory->start,
+ sorted->start,
+ (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+ unsigned offset = 0, sectors;
+ struct bset *i;
+ unsigned j;
+
+ console_lock();
+
+ printk(KERN_ERR "*** in memory:\n");
+ bch_dump_bset(b, inmemory, 0);
+
+ printk(KERN_ERR "*** read back in:\n");
+ bch_dump_bset(v, sorted, 0);
+
+ while (offset < b->written) {
+ if (!offset) {
+ i = &n_ondisk->keys;
+ sectors = __set_blocks(n_ondisk,
+ le16_to_cpu(n_ondisk->keys.u64s),
+ block_bytes(c)) <<
+ c->block_bits;
+ } else {
+ struct btree_node_entry *bne =
+ (void *) n_ondisk + (offset << 9);
+ i = &bne->keys;
+
+ sectors = __set_blocks(bne,
+ le16_to_cpu(bne->keys.u64s),
+ block_bytes(c)) <<
+ c->block_bits;
+ }
+
+ printk(KERN_ERR "*** on disk block %u:\n", offset);
+ bch_dump_bset(b, i, offset);
+
+ offset += sectors;
+ }
+
+ printk(KERN_ERR "*** block %u/%u not written\n",
+ offset >> c->block_bits, btree_blocks(c));
+
+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
+ if (inmemory->_data[j] != sorted->_data[j])
+ break;
+
+ printk(KERN_ERR "b->written %u\n", b->written);
+
+ console_unlock();
+ panic("verify failed at %u\n", j);
+ }
+
+ mutex_unlock(&c->verify_lock);
+ btree_node_io_unlock(b);
+}
+
+void bch_data_verify(struct cached_dev *dc, struct bio *bio)
+{
+ char name[BDEVNAME_SIZE];
+ struct bio *check;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ check = bio_clone(bio, GFP_NOIO);
+ if (!check)
+ return;
+ bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC);
+
+ if (bio_alloc_pages(check, GFP_NOIO))
+ goto out_put;
+
+ submit_bio_wait(check);
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *p1 = kmap_atomic(bv.bv_page);
+ void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
+
+ if (memcmp(p1 + bv.bv_offset,
+ p2 + bv.bv_offset,
+ bv.bv_len))
+ panic("verify failed at dev %s sector %llu\n",
+ bdevname(dc->disk_sb.bdev, name),
+ (uint64_t) bio->bi_iter.bi_sector);
+
+ kunmap_atomic(p1);
+ }
+
+ bio_free_pages(check);
+out_put:
+ bio_put(check);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: cache set refcounting */
+
+struct dump_iter {
+ struct bpos from;
+ struct cache_set *c;
+ enum btree_id id;
+
+ char buf[PAGE_SIZE];
+ size_t bytes; /* what's currently in buf */
+
+ char __user *ubuf; /* destination user buffer */
+ size_t size; /* size of requested read */
+ ssize_t ret; /* bytes read so far */
+};
+
+static int flush_buf(struct dump_iter *i)
+{
+ if (i->bytes) {
+ size_t bytes = min(i->bytes, i->size);
+
+ if (copy_to_user(i->ubuf, i->buf, bytes))
+ return -EFAULT;
+
+ i->ret += bytes;
+ i->ubuf += bytes;
+ i->size -= bytes;
+ i->bytes -= bytes;
+ memmove(i->buf, i->buf + bytes, i->bytes);
+ }
+
+ return 0;
+}
+
+static int bch_dump_open(struct inode *inode, struct file *file)
+{
+ struct btree_debug *bd = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+
+ file->private_data = i;
+ i->from = POS_MIN;
+ i->c = container_of(bd, struct cache_set, btree_debug[bd->id]);
+ i->id = bd->id;
+
+ return 0;
+}
+
+static int bch_dump_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t bch_read_btree(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ return i->ret;
+
+ bch_btree_iter_init(&iter, i->c, i->id, i->from);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(err = btree_iter_err(k))) {
+ bch_bkey_val_to_text(i->c, bkey_type(0, i->id),
+ i->buf, sizeof(i->buf), k);
+ i->bytes = strlen(i->buf);
+ BUG_ON(i->bytes >= PAGE_SIZE);
+ i->buf[i->bytes] = '\n';
+ i->bytes++;
+
+ bch_btree_iter_advance_pos(&iter);
+ i->from = iter.pos;
+
+ err = flush_buf(i);
+ if (err)
+ break;
+
+ if (!i->size)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch_dump_open,
+ .release = bch_dump_release,
+ .read = bch_read_btree,
+};
+
+static int print_btree_node(struct dump_iter *i, struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ struct bset_stats stats;
+
+ memset(&stats, 0, sizeof(stats));
+
+ bch_btree_keys_stats(b, &stats);
+
+ i->bytes = scnprintf(i->buf, sizeof(i->buf),
+ "l %u %llu:%llu - %llu:%llu:\n"
+ " format: u64s %u fields %u %u %u %u %u\n"
+ " unpack fn len: %u\n"
+ " bytes used %zu/%zu (%zu%% full)\n"
+ " sib u64s: %u, %u (merge threshold %zu)\n"
+ " nr packed keys %u\n"
+ " nr unpacked keys %u\n"
+ " floats %zu\n"
+ " failed unpacked %zu\n"
+ " failed prev %zu\n"
+ " failed overflow %zu\n",
+ b->level,
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ b->data->max_key.inode,
+ b->data->max_key.offset,
+ f->key_u64s,
+ f->bits_per_field[0],
+ f->bits_per_field[1],
+ f->bits_per_field[2],
+ f->bits_per_field[3],
+ f->bits_per_field[4],
+ b->unpack_fn_len,
+ b->nr.live_u64s * sizeof(u64),
+ btree_bytes(i->c) - sizeof(struct btree_node),
+ b->nr.live_u64s * 100 / btree_max_u64s(i->c),
+ b->sib_u64s[0],
+ b->sib_u64s[1],
+ BTREE_FOREGROUND_MERGE_THRESHOLD(i->c),
+ b->nr.packed_keys,
+ b->nr.unpacked_keys,
+ stats.floats,
+ stats.failed_unpacked,
+ stats.failed_prev,
+ stats.failed_overflow);
+
+ return flush_buf(i);
+}
+
+static ssize_t bch_read_btree_formats(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_iter iter;
+ struct btree *b;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size || !bkey_cmp(POS_MAX, i->from))
+ return i->ret;
+
+ for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
+ err = print_btree_node(i, b);
+ if (err)
+ break;
+
+ /*
+ * can't easily correctly restart a btree node traversal across
+ * all nodes, meh
+ */
+ i->from = bkey_cmp(POS_MAX, b->key.k.p)
+ ? bkey_successor(b->key.k.p)
+ : b->key.k.p;
+
+ if (!i->size)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_format_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch_dump_open,
+ .release = bch_dump_release,
+ .read = bch_read_btree_formats,
+};
+
+static ssize_t bch_read_bfloat_failed(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct btree *prev_node = NULL;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ return i->ret;
+
+ bch_btree_iter_init(&iter, i->c, i->id, i->from);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(err = btree_iter_err(k))) {
+ struct btree *b = iter.nodes[0];
+ struct btree_node_iter *node_iter = &iter.node_iters[0];
+ struct bkey_packed *_k = bch_btree_node_iter_peek(node_iter, b);
+
+ if (iter.nodes[0] != prev_node) {
+ err = print_btree_node(i, iter.nodes[0]);
+ if (err)
+ break;
+ }
+ prev_node = iter.nodes[0];
+
+ i->bytes = bch_bkey_print_bfloat(b, _k, i->buf, sizeof(i->buf));
+
+ err = flush_buf(i);
+ if (err)
+ break;
+
+ bch_btree_iter_advance_pos(&iter);
+ i->from = iter.pos;
+
+ err = flush_buf(i);
+ if (err)
+ break;
+
+ if (!i->size)
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch_dump_open,
+ .release = bch_dump_release,
+ .read = bch_read_bfloat_failed,
+};
+
+void bch_debug_exit_cache_set(struct cache_set *c)
+{
+ if (!IS_ERR_OR_NULL(c->debug))
+ debugfs_remove_recursive(c->debug);
+}
+
+void bch_debug_init_cache_set(struct cache_set *c)
+{
+ struct btree_debug *bd;
+ char name[100];
+
+ if (IS_ERR_OR_NULL(bch_debug))
+ return;
+
+ snprintf(name, sizeof(name), "%pU", c->disk_sb.user_uuid.b);
+ c->debug = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->debug))
+ return;
+
+ for (bd = c->btree_debug;
+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
+ bd++) {
+ bd->id = bd - c->btree_debug;
+ bd->btree = debugfs_create_file(bch_btree_id_names[bd->id],
+ 0400, c->debug, bd,
+ &btree_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-formats",
+ bch_btree_id_names[bd->id]);
+
+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
+ &btree_format_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-bfloat-failed",
+ bch_btree_id_names[bd->id]);
+
+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
+ &bfloat_failed_debug_ops);
+ }
+}
+
+#endif
+
+void bch_debug_exit(void)
+{
+ if (!IS_ERR_OR_NULL(bch_debug))
+ debugfs_remove_recursive(bch_debug);
+}
+
+int __init bch_debug_init(void)
+{
+ int ret = 0;
+
+ bch_debug = debugfs_create_dir("bcache", NULL);
+ return ret;
+}
diff --git a/libbcache/debug.h b/libbcache/debug.h
new file mode 100644
index 0000000..a3635e6
--- /dev/null
+++ b/libbcache/debug.h
@@ -0,0 +1,65 @@
+#ifndef _BCACHE_DEBUG_H
+#define _BCACHE_DEBUG_H
+
+#include "bcache.h"
+
+struct bio;
+struct btree;
+struct cached_dev;
+struct cache_set;
+
+#define BCH_DEBUG_PARAM(name, description) extern bool bch_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#define BCH_DEBUG_PARAM(name, description) \
+ static inline bool name(struct cache_set *c) \
+ { return bch_##name || c->name; }
+BCH_DEBUG_PARAMS_ALWAYS()
+#undef BCH_DEBUG_PARAM
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define BCH_DEBUG_PARAM(name, description) \
+ static inline bool name(struct cache_set *c) \
+ { return bch_##name || c->name; }
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+void __bch_btree_verify(struct cache_set *, struct btree *);
+void bch_data_verify(struct cached_dev *, struct bio *);
+
+#define bypass_torture_test(d) ((d)->bypass_torture_test)
+
+#else /* DEBUG */
+
+#define BCH_DEBUG_PARAM(name, description) \
+ static inline bool name(struct cache_set *c) { return false; }
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+static inline void __bch_btree_verify(struct cache_set *c, struct btree *b) {}
+static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
+
+#define bypass_torture_test(d) 0
+
+#endif
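+
+/*
+ * For illustration: assuming BCH_DEBUG_PARAMS_DEBUG() contains an entry such
+ * as BCH_DEBUG_PARAM(verify_btree_ondisk, "..."), the X-macro blocks above
+ * expand, in debug builds, to roughly the predicate below (and to a stub
+ * returning false in non-debug builds):
+ */
+#if 0
+static inline bool verify_btree_ondisk(struct cache_set *c)
+{
+	return bch_verify_btree_ondisk || c->verify_btree_ondisk;
+}
+#endif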
+
+static inline void bch_btree_verify(struct cache_set *c, struct btree *b)
+{
+ if (verify_btree_ondisk(c))
+ __bch_btree_verify(c, b);
+}
+
+#ifdef CONFIG_DEBUG_FS
+void bch_debug_exit_cache_set(struct cache_set *);
+void bch_debug_init_cache_set(struct cache_set *);
+#else
+static inline void bch_debug_exit_cache_set(struct cache_set *c) {}
+static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+#endif
+
+void bch_debug_exit(void);
+int bch_debug_init(void);
+
+#endif /* _BCACHE_DEBUG_H */
diff --git a/libbcache/dirent.c b/libbcache/dirent.c
new file mode 100644
index 0000000..920ad2f
--- /dev/null
+++ b/libbcache/dirent.c
@@ -0,0 +1,449 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "dirent.h"
+#include "fs.h"
+#include "keylist.h"
+#include "str_hash.h"
+
+#include <linux/dcache.h>
+
+static unsigned dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+ unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent);
+
+ while (len && !d.v->d_name[len - 1])
+ --len;
+
+ return len;
+}
+
+static u64 bch_dirent_hash(const struct bch_hash_info *info,
+ const struct qstr *name)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_SHA1: {
+ SHASH_DESC_ON_STACK(desc, bch_sha1);
+ u8 digest[SHA1_DIGEST_SIZE];
+ u64 ret;
+ desc->tfm = bch_sha1;
+ desc->flags = 0;
+ crypto_shash_init(desc);
+
+ crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
+
+ crypto_shash_update(desc, (void *) name->name, name->len);
+ crypto_shash_final(desc, digest);
+ memcpy(&ret, &digest, sizeof(ret));
+ return max_t(u64, ret >> 1, 2);
+ }
+ default: {
+ struct bch_str_hash_ctx ctx;
+
+ bch_str_hash_init(&ctx, info->type);
+ bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
+
+ bch_str_hash_update(&ctx, info->type, name->name, name->len);
+
+ /* [0,2) reserved for dots */
+ return max_t(u64, bch_str_hash_end(&ctx, info->type), 2);
+ }
+ }
+}
+
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch_dirent_hash(info, key);
+}
+
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr name = QSTR_INIT(d.v->d_name, dirent_name_bytes(d));
+
+ return bch_dirent_hash(info, &name);
+}
+
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ int len = dirent_name_bytes(l);
+ const struct qstr *r = _r;
+
+ return len - r->len ?: memcmp(l.v->d_name, r->name, len);
+}
+
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
+ int l_len = dirent_name_bytes(l);
+ int r_len = dirent_name_bytes(r);
+
+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
+}
+
+static const struct bch_hash_desc dirent_hash_desc = {
+ .btree_id = BTREE_ID_DIRENTS,
+ .key_type = BCH_DIRENT,
+ .whiteout_type = BCH_DIRENT_WHITEOUT,
+ .hash_key = dirent_hash_key,
+ .hash_bkey = dirent_hash_bkey,
+ .cmp_key = dirent_cmp_key,
+ .cmp_bkey = dirent_cmp_bkey,
+};
+
+static const char *bch_dirent_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ return bkey_val_bytes(k.k) < sizeof(struct bch_dirent)
+ ? "value too small"
+ : NULL;
+
+ case BCH_DIRENT_WHITEOUT:
+ return bkey_val_bytes(k.k) != 0
+ ? "value size should be zero"
+ : NULL;
+
+ default:
+ return "invalid type";
+ }
+}
+
+static void bch_dirent_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d;
+
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ d = bkey_s_c_to_dirent(k);
+
+ if (size) {
+ unsigned n = min_t(unsigned, size,
+ dirent_name_bytes(d));
+ memcpy(buf, d.v->d_name, n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ scnprintf(buf, size, " -> %llu", d.v->d_inum);
+ break;
+ case BCH_DIRENT_WHITEOUT:
+ scnprintf(buf, size, "whiteout");
+ break;
+ }
+}
+
+const struct bkey_ops bch_bkey_dirent_ops = {
+ .key_invalid = bch_dirent_invalid,
+ .val_to_text = bch_dirent_to_text,
+};
+
+static struct bkey_i_dirent *dirent_create_key(u8 type,
+ const struct qstr *name, u64 dst)
+{
+ struct bkey_i_dirent *dirent;
+ unsigned u64s = BKEY_U64s +
+ DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len,
+ sizeof(u64));
+
+ dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+ if (!dirent)
+ return NULL;
+
+ bkey_dirent_init(&dirent->k_i);
+ dirent->k.u64s = u64s;
+ dirent->v.d_inum = cpu_to_le64(dst);
+ dirent->v.d_type = type;
+
+ memcpy(dirent->v.d_name, name->name, name->len);
+ memset(dirent->v.d_name + name->len, 0,
+ bkey_val_bytes(&dirent->k) -
+ (sizeof(struct bch_dirent) + name->len));
+
+ EBUG_ON(dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+
+ return dirent;
+}
+
+int bch_dirent_create(struct cache_set *c, struct inode *dir, u8 type,
+ const struct qstr *name, u64 dst_inum)
+{
+ struct bch_inode_info *ei = to_bch_ei(dir);
+ struct bkey_i_dirent *dirent;
+ int ret;
+
+ dirent = dirent_create_key(type, name, dst_inum);
+ if (!dirent)
+ return -ENOMEM;
+
+ ret = bch_hash_set(dirent_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &ei->journal_seq,
+ &dirent->k_i, BCH_HASH_SET_MUST_CREATE);
+ kfree(dirent);
+
+ return ret;
+}
+
+static void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
+static struct bpos bch_dirent_pos(struct bch_inode_info *ei,
+ const struct qstr *name)
+{
+ return POS(ei->vfs_inode.i_ino, bch_dirent_hash(&ei->str_hash, name));
+}
+
+int bch_dirent_rename(struct cache_set *c,
+ struct inode *src_dir, const struct qstr *src_name,
+ struct inode *dst_dir, const struct qstr *dst_name,
+ u64 *journal_seq, enum bch_rename_mode mode)
+{
+ struct bch_inode_info *src_ei = to_bch_ei(src_dir);
+ struct bch_inode_info *dst_ei = to_bch_ei(dst_dir);
+ struct btree_iter src_iter, dst_iter, whiteout_iter;
+ struct bkey_s_c old_src, old_dst;
+ struct bkey delete;
+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
+ struct bpos src_pos = bch_dirent_pos(src_ei, src_name);
+ struct bpos dst_pos = bch_dirent_pos(dst_ei, dst_name);
+ bool need_whiteout;
+ int ret = -ENOMEM;
+
+ bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos);
+ bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos);
+ bch_btree_iter_link(&src_iter, &dst_iter);
+
+ bch_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos);
+ bch_btree_iter_link(&src_iter, &whiteout_iter);
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ new_src = dirent_create_key(0, src_name, 0);
+ if (!new_src)
+ goto err;
+ } else {
+ new_src = (void *) &delete;
+ }
+
+ new_dst = dirent_create_key(0, dst_name, 0);
+ if (!new_dst)
+ goto err;
+retry:
+ /*
+ * Note that on -EINTR/dropped locks we're not restarting the lookup
+ * from the original hashed position (like we do when creating dirents,
+ * in bch_hash_set) - we never move existing dirents to a different slot:
+ */
+ old_src = bch_hash_lookup_at(dirent_hash_desc,
+ &src_ei->str_hash,
+ &src_iter, src_name);
+ if ((ret = btree_iter_err(old_src)))
+ goto err;
+
+ ret = bch_hash_needs_whiteout(dirent_hash_desc,
+ &src_ei->str_hash,
+ &whiteout_iter, &src_iter);
+ if (ret < 0)
+ goto err;
+ need_whiteout = ret;
+
+ /*
+ * Note that in BCH_RENAME mode, we're _not_ checking if
+ * the target already exists - we're relying on the VFS
+ * to do that check for us for correctness:
+ */
+ old_dst = mode == BCH_RENAME
+ ? bch_hash_hole_at(dirent_hash_desc, &dst_iter)
+ : bch_hash_lookup_at(dirent_hash_desc,
+ &dst_ei->str_hash,
+ &dst_iter, dst_name);
+ if ((ret = btree_iter_err(old_dst)))
+ goto err;
+
+ switch (mode) {
+ case BCH_RENAME:
+ bkey_init(&new_src->k);
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+ /*
+ * If we couldn't insert new_dst at its hashed
+ * position (dst_pos) due to a hash collision,
+ * and we're going to be deleting in
+ * between the hashed position and the first empty
+ * slot we found - just overwrite the pos we
+ * were going to delete:
+ *
+ * Note: this is a correctness issue, in this
+ * situation bch_hash_needs_whiteout() could
+ * return false when the whiteout would have
+ * been needed if we inserted at the pos
+ * __dirent_find_hole() found
+ */
+ new_dst->k.p = src_iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL,
+ journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&src_iter,
+ &new_dst->k_i));
+ goto err;
+ }
+
+ if (need_whiteout)
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ break;
+ case BCH_RENAME_OVERWRITE:
+ bkey_init(&new_src->k);
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+ /*
+ * Same case described above -
+ * bch_hash_needs_whiteout could spuriously
+ * return false, but we have to insert at
+ * dst_iter.pos because we're overwriting
+ * another dirent:
+ */
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ } else if (need_whiteout)
+ new_src->k.type = BCH_DIRENT_WHITEOUT;
+ break;
+ case BCH_RENAME_EXCHANGE:
+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ break;
+ }
+
+ new_src->k.p = src_iter.pos;
+ new_dst->k.p = dst_iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i),
+ BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i));
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch_btree_iter_unlock(&whiteout_iter);
+ bch_btree_iter_unlock(&dst_iter);
+ bch_btree_iter_unlock(&src_iter);
+
+ if (new_src != (void *) &delete)
+ kfree(new_src);
+ kfree(new_dst);
+ return ret;
+}
+
+int bch_dirent_delete(struct cache_set *c, struct inode *dir,
+ const struct qstr *name)
+{
+ struct bch_inode_info *ei = to_bch_ei(dir);
+
+ return bch_hash_delete(dirent_hash_desc, &ei->str_hash,
+ c, ei->vfs_inode.i_ino,
+ &ei->journal_seq, name);
+}
+
+u64 bch_dirent_lookup(struct cache_set *c, struct inode *dir,
+ const struct qstr *name)
+{
+ struct bch_inode_info *ei = to_bch_ei(dir);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 inum;
+
+ k = bch_hash_lookup(dirent_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &iter, name);
+ if (IS_ERR(k.k)) {
+ bch_btree_iter_unlock(&iter);
+ return 0;
+ }
+
+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+ bch_btree_iter_unlock(&iter);
+
+ return inum;
+}
+
+int bch_empty_dir(struct cache_set *c, u64 dir_inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) {
+ if (k.k->p.inode > dir_inum)
+ break;
+
+ if (k.k->type == BCH_DIRENT) {
+ ret = -ENOTEMPTY;
+ break;
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+int bch_readdir(struct cache_set *c, struct file *file,
+ struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent dirent;
+ unsigned len;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+ POS(inode->i_ino, ctx->pos), k) {
+ if (k.k->type != BCH_DIRENT)
+ continue;
+
+ dirent = bkey_s_c_to_dirent(k);
+
+ pr_debug("saw %llu:%llu (%s) -> %llu",
+ k.k->p.inode, k.k->p.offset,
+ dirent.v->d_name, dirent.v->d_inum);
+
+ if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0)
+ continue;
+
+ if (k.k->p.inode > inode->i_ino)
+ break;
+
+ len = dirent_name_bytes(dirent);
+
+ pr_debug("emitting %s", dirent.v->d_name);
+
+ /*
+ * XXX: dir_emit() can fault and block, while we're holding
+ * locks
+ */
+ if (!dir_emit(ctx, dirent.v->d_name, len,
+ le64_to_cpu(dirent.v->d_inum),
+ dirent.v->d_type))
+ break;
+
+ ctx->pos = k.k->p.offset + 1;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return 0;
+}
diff --git a/libbcache/dirent.h b/libbcache/dirent.h
new file mode 100644
index 0000000..e18089b
--- /dev/null
+++ b/libbcache/dirent.h
@@ -0,0 +1,32 @@
+#ifndef _BCACHE_DIRENT_H
+#define _BCACHE_DIRENT_H
+
+extern const struct bkey_ops bch_bkey_dirent_ops;
+
+struct qstr;
+struct file;
+struct dir_context;
+struct cache_set;
+
+int bch_dirent_create(struct cache_set *c, struct inode *, u8,
+ const struct qstr *, u64);
+int bch_dirent_delete(struct cache_set *c, struct inode *, const struct qstr *);
+
+enum bch_rename_mode {
+ BCH_RENAME,
+ BCH_RENAME_OVERWRITE,
+ BCH_RENAME_EXCHANGE,
+};
+
+int bch_dirent_rename(struct cache_set *,
+ struct inode *, const struct qstr *,
+ struct inode *, const struct qstr *,
+ u64 *, enum bch_rename_mode);
+
+u64 bch_dirent_lookup(struct cache_set *c, struct inode *,
+ const struct qstr *);
+int bch_empty_dir(struct cache_set *, u64);
+int bch_readdir(struct cache_set *, struct file *, struct dir_context *);
+
+#endif /* _BCACHE_DIRENT_H */
+
diff --git a/libbcache/error.c b/libbcache/error.c
new file mode 100644
index 0000000..9ba33ef
--- /dev/null
+++ b/libbcache/error.c
@@ -0,0 +1,140 @@
+#include "bcache.h"
+#include "error.h"
+#include "io.h"
+#include "notify.h"
+#include "super.h"
+
+void bch_inconsistent_error(struct cache_set *c)
+{
+ set_bit(CACHE_SET_ERROR, &c->flags);
+
+ switch (c->opts.errors) {
+ case BCH_ON_ERROR_CONTINUE:
+ break;
+ case BCH_ON_ERROR_RO:
+ if (!test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags)) {
+ /* XXX do something better here? */
+ bch_cache_set_stop(c);
+ return;
+ }
+
+ if (bch_cache_set_emergency_read_only(c))
+ bch_err(c, "emergency read only");
+ break;
+ case BCH_ON_ERROR_PANIC:
+ panic(bch_fmt(c, "panic after error"));
+ break;
+ }
+}
+
+void bch_fatal_error(struct cache_set *c)
+{
+ if (bch_cache_set_emergency_read_only(c))
+ bch_err(c, "emergency read only");
+}
+
+/* Nonfatal IO errors, IO error/latency accounting: */
+
+/* Just does IO error accounting: */
+void bch_account_io_completion(struct cache *ca)
+{
+ /*
+ * The halflife of an error is:
+ * log2(1/2)/log2(127/128) * error_decay ~= 88 * error_decay IOs
+ */
+
+ if (ca->set->error_decay) {
+ unsigned count = atomic_inc_return(&ca->io_count);
+
+ while (count > ca->set->error_decay) {
+ unsigned errors;
+ unsigned old = count;
+ unsigned new = count - ca->set->error_decay;
+
+ /*
+ * First we subtract refresh from count; each time we
+ * successfully do so, we rescale the errors once:
+ */
+
+ count = atomic_cmpxchg(&ca->io_count, old, new);
+
+ if (count == old) {
+ count = new;
+
+ errors = atomic_read(&ca->io_errors);
+ do {
+ old = errors;
+ new = ((uint64_t) errors * 127) / 128;
+ errors = atomic_cmpxchg(&ca->io_errors,
+ old, new);
+ } while (old != errors);
+ }
+ }
+ }
+}
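+
+/*
+ * Worked example of the decay above (illustrative only; the function below is
+ * not part of this patch): every time io_count exceeds error_decay, io_errors
+ * is scaled by 127/128, so the error count halves after roughly
+ * log(2)/log(128/127) ~= 88 such steps.
+ */
+#if 0
+static unsigned error_halflife_in_decay_steps(void)
+{
+	unsigned errors = 1U << 20, steps = 0;
+
+	while (errors > (1U << 20) / 2) {
+		errors = ((u64) errors * 127) / 128;
+		steps++;
+	}
+
+	return steps; /* ~88 */
+}
+#endif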
+
+/* IO error accounting and latency accounting: */
+void bch_account_io_completion_time(struct cache *ca,
+ unsigned submit_time_us, int op)
+{
+ struct cache_set *c;
+ unsigned threshold;
+
+ if (!ca)
+ return;
+
+ c = ca->set;
+ threshold = op_is_write(op)
+ ? c->congested_write_threshold_us
+ : c->congested_read_threshold_us;
+
+ if (threshold && submit_time_us) {
+ unsigned t = local_clock_us();
+
+ int us = t - submit_time_us;
+ int congested = atomic_read(&c->congested);
+
+ if (us > (int) threshold) {
+ int ms = us / 1024;
+ c->congested_last_us = t;
+
+ ms = min(ms, CONGESTED_MAX + congested);
+ atomic_sub(ms, &c->congested);
+ } else if (congested < 0)
+ atomic_inc(&c->congested);
+ }
+
+ bch_account_io_completion(ca);
+}
+
+void bch_nonfatal_io_error_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, io_error_work);
+ struct cache_set *c = ca->set;
+ unsigned errors = atomic_read(&ca->io_errors);
+ char buf[BDEVNAME_SIZE];
+ bool dev;
+
+ if (errors < c->error_limit) {
+ bch_notify_cache_error(ca, false);
+ } else {
+ bch_notify_cache_error(ca, true);
+
+ mutex_lock(&bch_register_lock);
+ dev = bch_cache_may_remove(ca);
+ if (dev
+ ? bch_cache_read_only(ca)
+ : bch_cache_set_emergency_read_only(c))
+ bch_err(c,
+ "too many IO errors on %s, setting %s RO",
+ bdevname(ca->disk_sb.bdev, buf),
+ dev ? "device" : "filesystem");
+ mutex_unlock(&bch_register_lock);
+ }
+}
+
+void bch_nonfatal_io_error(struct cache *ca)
+{
+ atomic_add(1 << IO_ERROR_SHIFT, &ca->io_errors);
+ queue_work(system_long_wq, &ca->io_error_work);
+}
diff --git a/libbcache/error.h b/libbcache/error.h
new file mode 100644
index 0000000..9eb9335
--- /dev/null
+++ b/libbcache/error.h
@@ -0,0 +1,238 @@
+#ifndef _BCACHE_ERROR_H
+#define _BCACHE_ERROR_H
+
+#include <linux/printk.h>
+
+struct cache;
+struct cache_set;
+
+/*
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
+ * superblock as such
+ */
+
+/* Error messages: */
+
+#define __bch_cache_error(ca, fmt, ...) \
+do { \
+ char _buf[BDEVNAME_SIZE]; \
+ bch_err((ca)->set, "%s: " fmt, \
+ bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+} while (0)
+
+/*
+ * Very fatal logic/inconsistency errors: these indicate that we've majorly
+ * screwed up at runtime, i.e. it's not likely that it was just caused by the
+ * data on disk being inconsistent. These BUG():
+ *
+ * XXX: audit and convert to inconsistent() checks
+ */
+
+#define cache_set_bug(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ BUG(); \
+} while (0)
+
+#define cache_set_bug_on(cond, c, ...) \
+do { \
+ if (cond) \
+ cache_set_bug(c, __VA_ARGS__); \
+} while (0)
+
+/*
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
+ * initial recovery, they don't indicate a bug in the running code - we walk all
+ * the metadata before modifying anything. If they occur at runtime, they
+ * indicate either a bug in the running code or (less likely) data is being
+ * silently corrupted under us.
+ *
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
+ * BCH_ON_ERROR_CONTINUE mode
+ */
+
+void bch_inconsistent_error(struct cache_set *);
+
+#define cache_set_inconsistent(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch_inconsistent_error(c); \
+} while (0)
+
+#define cache_set_inconsistent_on(cond, c, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_set_inconsistent(c, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Later we might want to mark only the particular device inconsistent, not the
+ * entire cache set:
+ */
+
+#define cache_inconsistent(ca, ...) \
+do { \
+ __bch_cache_error(ca, __VA_ARGS__); \
+ bch_inconsistent_error((ca)->set); \
+} while (0)
+
+#define cache_inconsistent_on(cond, ca, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_inconsistent(ca, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally
+ * be able to repair:
+ */
+
+enum {
+ BCH_FSCK_OK = 0,
+ BCH_FSCK_ERRORS_NOT_FIXED = 1,
+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
+ BCH_FSCK_UNKNOWN_VERSION = 4,
+};
+
+#define unfixable_fsck_err(c, msg, ...) \
+do { \
+ bch_err(c, msg " (repair unimplemented)", ##__VA_ARGS__); \
+ ret = BCH_FSCK_REPAIR_UNIMPLEMENTED; \
+ goto fsck_err; \
+} while (0)
+
+#define unfixable_fsck_err_on(cond, c, ...) \
+do { \
+ if (cond) \
+ unfixable_fsck_err(c, __VA_ARGS__); \
+} while (0)
+
+#define fsck_err(c, msg, ...) \
+do { \
+ if (!(c)->opts.fix_errors) { \
+ bch_err(c, msg, ##__VA_ARGS__); \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+ set_bit(CACHE_SET_FSCK_FIXED_ERRORS, &(c)->flags); \
+ bch_err(c, msg ", fixing", ##__VA_ARGS__); \
+} while (0)
+
+#define fsck_err_on(cond, c, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) \
+ fsck_err(c, __VA_ARGS__); \
+ _ret; \
+})
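+
+/*
+ * The fsck_err*() macros above assume the calling function declares a local
+ * "ret" and provides a "fsck_err" label; a minimal (hypothetical) user looks
+ * like this, where something_is_wrong stands for whatever check is being made:
+ */
+#if 0
+static int check_something(struct cache_set *c)
+{
+	int ret = BCH_FSCK_OK;
+
+	unfixable_fsck_err_on(something_is_wrong, c,
+			      "describe the inconsistency here");
+fsck_err:
+	return ret;
+}
+#endif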
+
+/*
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
+ * mode - pretty much just due to metadata IO errors:
+ */
+
+void bch_fatal_error(struct cache_set *);
+
+#define cache_set_fatal_error(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch_fatal_error(c); \
+} while (0)
+
+#define cache_set_fatal_err_on(cond, c, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_set_fatal_error(c, __VA_ARGS__); \
+ _ret; \
+})
+
+#define cache_fatal_error(ca, ...) \
+do { \
+ __bch_cache_error(ca, __VA_ARGS__); \
+ bch_fatal_error((ca)->set); \
+} while (0)
+
+#define cache_fatal_io_error(ca, fmt, ...) \
+do { \
+ char _buf[BDEVNAME_SIZE]; \
+ \
+ printk_ratelimited(KERN_ERR bch_fmt((ca)->set, \
+ "fatal IO error on %s for " fmt), \
+ bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+ bch_fatal_error((ca)->set); \
+} while (0)
+
+#define cache_fatal_io_err_on(cond, ca, ...) \
+({ \
+ int _ret = !!(cond); \
+ \
+ if (_ret) \
+ cache_fatal_io_error(ca, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Nonfatal IO errors: either recoverable metadata IO (because we have
+ * replicas), or data IO - we need to log it and print out a message, but we
+ * don't (necessarily) want to shut down the fs:
+ */
+
+void bch_account_io_completion(struct cache *);
+void bch_account_io_completion_time(struct cache *, unsigned, int);
+
+void bch_nonfatal_io_error_work(struct work_struct *);
+
+/* Does the error handling without logging a message */
+void bch_nonfatal_io_error(struct cache *);
+
+#if 0
+#define cache_set_nonfatal_io_error(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch_nonfatal_io_error(c); \
+} while (0)
+#endif
+
+/* Logs message and handles the error: */
+#define cache_nonfatal_io_error(ca, fmt, ...) \
+do { \
+ char _buf[BDEVNAME_SIZE]; \
+ \
+ printk_ratelimited(KERN_ERR bch_fmt((ca)->set, \
+ "IO error on %s for " fmt), \
+ bdevname((ca)->disk_sb.bdev, _buf), ##__VA_ARGS__); \
+ bch_nonfatal_io_error(ca); \
+} while (0)
+
+#define cache_nonfatal_io_err_on(cond, ca, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) \
+ cache_nonfatal_io_error(ca, __VA_ARGS__); \
+ _ret; \
+})
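+
+/*
+ * Sketch of how the nonfatal IO error macros are meant to be used (the endio
+ * handler and its bi_private convention are hypothetical, not from this
+ * patch); note the fmt string is appended to "IO error on %s for ":
+ */
+#if 0
+static void example_endio(struct bio *bio)
+{
+	struct cache *ca = bio->bi_private;
+
+	cache_nonfatal_io_err_on(bio->bi_error, ca, "data read");
+}
+#endif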
+
+/* kill? */
+
+#define __bcache_io_error(c, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch_fmt(c, \
+ "IO error: " fmt), ##__VA_ARGS__)
+
+#define bcache_io_error(c, bio, fmt, ...) \
+do { \
+ __bcache_io_error(c, fmt, ##__VA_ARGS__); \
+ (bio)->bi_error = -EIO; \
+} while (0)
+
+#endif /* _BCACHE_ERROR_H */
diff --git a/libbcache/extents.c b/libbcache/extents.c
new file mode 100644
index 0000000..45fa220
--- /dev/null
+++ b/libbcache/extents.c
@@ -0,0 +1,2514 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Code for managing the extent btree and dynamically updating the writeback
+ * dirty sector count.
+ */
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "debug.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "writeback.h"
+#include "xattr.h"
+
+#include <trace/events/bcache.h>
+
+static bool __bch_extent_normalize(struct cache_set *, struct bkey_s, bool);
+static enum merge_result bch_extent_merge(struct cache_set *, struct btree *,
+ struct bkey_i *, struct bkey_i *);
+
+static void sort_key_next(struct btree_node_iter *iter,
+ struct btree *b,
+ struct btree_node_iter_set *i)
+{
+ i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+
+ if (i->k == i->end)
+ *i = iter->data[--iter->used];
+}
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+#define key_sort_cmp(l, r) \
+({ \
+ int _c = bkey_cmp_packed(b, \
+ __btree_node_offset_to_key(b, (l).k), \
+ __btree_node_offset_to_key(b, (r).k)); \
+ \
+ _c ? _c > 0 : (l).k > (r).k; \
+})
+
+static inline bool should_drop_next_key(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
+ struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
+
+ if (bkey_whiteout(k))
+ return true;
+
+ if (iter->used < 2)
+ return false;
+
+ if (iter->used > 2 &&
+ key_sort_cmp(r[0], r[1]))
+ r++;
+
+ /*
+ * key_sort_cmp() ensures that when keys compare equal the older key
+ * comes first; so if l->k compares equal to r->k then l->k is older and
+ * should be dropped.
+ */
+ return !bkey_cmp_packed(b,
+ __btree_node_offset_to_key(b, l->k),
+ __btree_node_offset_to_key(b, r->k));
+}
+
+struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *dst,
+ struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct bkey_packed *out = dst->start;
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ heap_resort(iter, key_sort_cmp);
+
+ while (!bch_btree_node_iter_end(iter)) {
+ if (!should_drop_next_key(iter, b)) {
+ struct bkey_packed *k =
+ __btree_node_offset_to_key(b, iter->data->k);
+
+ bkey_copy(out, k);
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_next(out);
+ }
+
+ sort_key_next(iter, b, iter->data);
+ heap_sift(iter, 0, key_sort_cmp);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+/* Common among btree and extent ptrs */
+
+bool bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
+{
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr)
+ if (ptr->dev == dev)
+ return true;
+
+ return false;
+}
+
+unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *start)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr_ptrs = 0;
+
+ extent_for_each_ptr_from(e, ptr, start)
+ nr_ptrs++;
+
+ return nr_ptrs;
+}
+
+unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
+{
+ return bch_extent_nr_ptrs_from(e, &e.v->start->ptr);
+}
+
+/* returns true if equal */
+static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
+{
+ return extent_crc_type(l) == extent_crc_type(r) &&
+ !memcmp(l, r, extent_entry_bytes(to_entry(l)));
+}
+
+/* Increment pointers after @crc by crc's offset until the next crc entry: */
+void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
+{
+ union bch_extent_entry *entry;
+
+ extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
+ if (!extent_entry_is_ptr(entry))
+ return;
+
+ entry->ptr.offset += crc_offset(crc);
+ }
+}
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ *
+ * XXX: to guard against data being corrupted while in memory, instead of
+ * recomputing the checksum here, it would be better, in the read path, instead
+ * of computing the checksum of the entire extent:
+ *
+ * | extent |
+ *
+ * compute the checksums of the live and dead data separately
+ * | dead data || live data || dead data |
+ *
+ * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
+ * use crc_live here (that we verified was correct earlier)
+ */
+void bch_extent_narrow_crcs(struct bkey_s_extent e)
+{
+ union bch_extent_crc *crc;
+ bool have_wide = false, have_narrow = false;
+ u64 csum = 0;
+ unsigned csum_type = 0;
+
+ extent_for_each_crc(e, crc) {
+ if (crc_compression_type(crc))
+ continue;
+
+ if (crc_uncompressed_size(e.k, crc) != e.k->size) {
+ have_wide = true;
+ } else {
+ have_narrow = true;
+ csum = crc_csum(crc);
+ csum_type = crc_csum_type(crc);
+ }
+ }
+
+ if (!have_wide || !have_narrow)
+ return;
+
+ extent_for_each_crc(e, crc) {
+ if (crc_compression_type(crc))
+ continue;
+
+ if (crc_uncompressed_size(e.k, crc) != e.k->size) {
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ BUG();
+ case BCH_EXTENT_CRC32:
+ if (bch_crc_size[csum_type] > sizeof(crc->crc32.csum))
+ continue;
+
+ bch_extent_crc_narrow_pointers(e, crc);
+ crc->crc32.compressed_size = e.k->size;
+ crc->crc32.uncompressed_size = e.k->size;
+ crc->crc32.offset = 0;
+ crc->crc32.csum_type = csum_type;
+ crc->crc32.csum = csum;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (bch_crc_size[csum_type] > sizeof(crc->crc64.csum))
+ continue;
+
+ bch_extent_crc_narrow_pointers(e, crc);
+ crc->crc64.compressed_size = e.k->size;
+ crc->crc64.uncompressed_size = e.k->size;
+ crc->crc64.offset = 0;
+ crc->crc64.csum_type = csum_type;
+ crc->crc64.csum = csum;
+ break;
+ }
+ }
+ }
+}
+
+void bch_extent_drop_redundant_crcs(struct bkey_s_extent e)
+{
+ union bch_extent_entry *entry = e.v->start;
+ union bch_extent_crc *crc, *prev = NULL;
+
+ while (entry != extent_entry_last(e)) {
+ union bch_extent_entry *next = extent_entry_next(entry);
+ size_t crc_u64s = extent_entry_u64s(entry);
+
+ if (!extent_entry_is_crc(entry))
+ goto next;
+
+ crc = entry_to_crc(entry);
+
+ if (next == extent_entry_last(e)) {
+ /* crc entry with no pointers after it: */
+ goto drop;
+ }
+
+ if (extent_entry_is_crc(next)) {
+ /* no pointers before next crc entry: */
+ goto drop;
+ }
+
+ if (prev && crc_cmp(crc, prev)) {
+ /* identical to previous crc entry: */
+ goto drop;
+ }
+
+ if (!prev &&
+ !crc_csum_type(crc) &&
+ !crc_compression_type(crc)) {
+ /* null crc entry: */
+ bch_extent_crc_narrow_pointers(e, crc);
+ goto drop;
+ }
+
+ prev = crc;
+next:
+ entry = next;
+ continue;
+drop:
+ memmove_u64s_down(crc, next,
+ (u64 *) extent_entry_last(e) - (u64 *) next);
+ e.k->u64s -= crc_u64s;
+ }
+
+ EBUG_ON(bkey_val_u64s(e.k) && !bch_extent_nr_ptrs(e.c));
+}
+
+static bool should_drop_ptr(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ struct cache *ca;
+
+ return (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr);
+}
+
+static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
+{
+ struct bch_extent_ptr *ptr = &e.v->start->ptr;
+ bool dropped = false;
+
+ /*
+ * We don't want to change which pointers are considered cached/dirty,
+ * so don't remove pointers that are considered dirty:
+ */
+ rcu_read_lock();
+ while ((ptr = extent_ptr_next(e, ptr)) &&
+ !bch_extent_ptr_is_dirty(c, e.c, ptr))
+ if (should_drop_ptr(c, e.c, ptr)) {
+ __bch_extent_drop_ptr(e, ptr);
+ dropped = true;
+ } else
+ ptr++;
+ rcu_read_unlock();
+
+ if (dropped)
+ bch_extent_drop_redundant_crcs(e);
+}
+
+static bool bch_ptr_normalize(struct cache_set *c, struct btree *bk,
+ struct bkey_s k)
+{
+ return __bch_extent_normalize(c, k, false);
+}
+
+static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+{
+ u64 *d = (u64 *) bkeyp_val(f, k);
+ unsigned i;
+
+ for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+ d[i] = swab64(d[i]);
+}
+
+static const char *extent_ptr_invalid(struct bkey_s_c_extent e,
+ const struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk)
+{
+ const struct bch_extent_ptr *ptr2;
+ const struct cache_member_cpu *m = mi->m + ptr->dev;
+
+ if (ptr->dev > mi->nr_in_set || !m->valid)
+ return "pointer to invalid device";
+
+ extent_for_each_ptr(e, ptr2)
+ if (ptr != ptr2 && ptr->dev == ptr2->dev)
+ return "multiple pointers to same device";
+
+ if (ptr->offset + size_ondisk > m->bucket_size * m->nbuckets)
+ return "offset past end of device";
+
+ if (ptr->offset < m->bucket_size * m->first_bucket)
+ return "offset before first bucket";
+
+ if ((ptr->offset & (m->bucket_size - 1)) + size_ondisk > m->bucket_size)
+ return "spans multiple buckets";
+
+ return NULL;
+}
+
+static size_t extent_print_ptrs(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c_extent e)
+{
+ char *out = buf, *end = buf + size;
+ const union bch_extent_entry *entry;
+ const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ bool first = true;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ rcu_read_lock();
+ extent_for_each_entry(e, entry) {
+ if (!first)
+ p(" ");
+
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc = entry_to_crc(entry);
+ p("crc: c_size %u size %u offset %u csum %u compress %u",
+ crc_compressed_size(e.k, crc),
+ crc_uncompressed_size(e.k, crc),
+ crc_offset(crc), crc_csum_type(crc),
+ crc_compression_type(crc));
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ ptr = &entry->ptr;
+ p("ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr)
+ ? " stale" : "");
+ break;
+ default:
+ p("(invalid extent entry %.16llx)", *((u64 *) entry));
+ goto out;
+ }
+
+ first = false;
+ }
+out:
+ rcu_read_unlock();
+
+ if (bkey_extent_is_cached(e.k))
+ p(" cached");
+#undef p
+ return out - buf;
+}
+
+/* Btree ptrs */
+
+static const char *bch_btree_ptr_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (bkey_extent_is_cached(k.k))
+ return "cached";
+
+ if (k.k->size)
+ return "nonzero key size";
+
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+ return "value too big";
+
+ switch (k.k->type) {
+ case BCH_EXTENT: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ struct cache_member_rcu *mi;
+ const char *reason;
+
+ extent_for_each_entry(e, entry)
+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+ return "invalid extent entry type";
+
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ reason = extent_ptr_invalid(e, mi, ptr,
+ c->sb.btree_node_size);
+
+ if (reason) {
+ cache_member_info_put();
+ return reason;
+ }
+ }
+
+ cache_member_info_put();
+
+ if (crc)
+ return "has crc field";
+
+ return NULL;
+ }
+
+ default:
+ return "invalid value type";
+ }
+}
+
+static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned seq;
+ const char *err;
+ char buf[160];
+ struct bucket *g;
+ struct cache *ca;
+ unsigned replicas = 0;
+ bool bad;
+
+ rcu_read_lock();
+
+ extent_for_each_online_device(c, e, ptr, ca) {
+ replicas++;
+
+ if ((ca = PTR_CACHE(c, ptr))) {
+ g = PTR_BUCKET(ca, ptr);
+
+ err = "stale";
+ if (ptr_stale(ca, ptr))
+ goto err;
+
+ do {
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+ !g->mark.is_metadata;
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ err = "inconsistent";
+ if (bad)
+ goto err;
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (replicas < c->sb.meta_replicas_have) {
+ bch_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), k);
+ cache_set_bug(c,
+ "btree key bad (too few replicas, %u < %u): %s",
+ replicas, c->sb.meta_replicas_have, buf);
+ return;
+ }
+
+ return;
+err:
+ bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
+ cache_set_bug(c, "%s btree pointer %s: bucket %zi prio %i "
+ "gen %i last_gc %i mark %08x",
+ err, buf, PTR_BUCKET_NR(ca, ptr),
+ g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+ ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
+ (unsigned) g->mark.counter);
+ rcu_read_unlock();
+}
+
+static void bch_btree_ptr_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_btree_ptr_invalid(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
+struct extent_pick_ptr
+bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+ const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ rcu_read_lock();
+
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ struct btree *root = btree_node_root(c, b);
+
+ if (cache_set_inconsistent_on(crc, c,
+ "btree node pointer with crc at btree %u level %u/%u bucket %zu",
+ b->btree_id, b->level, root ? root->level : -1,
+ PTR_BUCKET_NR(ca, ptr)))
+ break;
+
+ if (cache_inconsistent_on(ptr_stale(ca, ptr), ca,
+ "stale btree node pointer at btree %u level %u/%u bucket %zu",
+ b->btree_id, b->level, root ? root->level : -1,
+ PTR_BUCKET_NR(ca, ptr)))
+ continue;
+
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca };
+ }
+
+ rcu_read_unlock();
+
+ return (struct extent_pick_ptr) { .ca = NULL, };
+}
+
+const struct bkey_ops bch_bkey_btree_ops = {
+ .key_invalid = bch_btree_ptr_invalid,
+ .key_debugcheck = btree_ptr_debugcheck,
+ .val_to_text = bch_btree_ptr_to_text,
+ .swab = bch_ptr_swab,
+};
+
+/* Extents */
+
+static bool __bch_cut_front(struct bpos where, struct bkey_s k)
+{
+ u64 len = 0;
+
+ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+ return false;
+
+ EBUG_ON(bkey_cmp(where, k.k->p) > 0);
+
+ len = k.k->p.offset - where.offset;
+
+ BUG_ON(len > k.k->size);
+
+ /*
+ * Don't readjust offset if the key size is now 0, because that could
+ * cause offset to point to the next bucket:
+ */
+ if (!len)
+ __set_bkey_deleted(k.k);
+ else if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_extent e = bkey_s_to_extent(k);
+ struct bch_extent_ptr *ptr;
+ union bch_extent_crc *crc, *prev_crc = NULL;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ ptr->offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC32:
+ if (prev_crc != crc)
+ crc->crc32.offset += e.k->size - len;
+ break;
+ case BCH_EXTENT_CRC64:
+ if (prev_crc != crc)
+ crc->crc64.offset += e.k->size - len;
+ break;
+ }
+ prev_crc = crc;
+ }
+ }
+
+ k.k->size = len;
+
+ return true;
+}
+
+bool bch_cut_front(struct bpos where, struct bkey_i *k)
+{
+ return __bch_cut_front(where, bkey_i_to_s(k));
+}
+
+bool bch_cut_back(struct bpos where, struct bkey *k)
+{
+ u64 len = 0;
+
+ if (bkey_cmp(where, k->p) >= 0)
+ return false;
+
+ EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
+
+ len = where.offset - bkey_start_offset(k);
+
+ BUG_ON(len > k->size);
+
+ k->p = where;
+ k->size = len;
+
+ if (!len)
+ __set_bkey_deleted(k);
+
+ return true;
+}
+
+/**
+ * bch_key_resize - adjust size of @k
+ *
+ * bkey_start_offset(k) will be preserved, modifies where the extent ends
+ */
+void bch_key_resize(struct bkey *k,
+ unsigned new_size)
+{
+ k->p.offset -= k->size;
+ k->p.offset += new_size;
+ k->size = new_size;
+}
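+
+/*
+ * For illustration only - a hypothetical helper showing how the primitives
+ * above compose to trim an extent down to a subrange [start, end) that lies
+ * within it:
+ */
+static inline void bch_extent_trim_sketch(struct bkey_i *k,
+					  struct bpos start, struct bpos end)
+{
+	bch_cut_front(start, k);	/* drop the part before @start */
+	bch_cut_back(end, &k->k);	/* drop the part from @end onwards */
+}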
+
+/*
+ * In extent_sort_fix_overlapping(), insert_fixup_extent(),
+ * extent_merge_inline() - we're modifying keys in place that are packed. To do
+ * that we have to unpack the key, modify the unpacked key - then this
+ * copies/repacks the unpacked to the original as necessary.
+ */
+static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
+ struct bkey_packed *dst, struct bkey *src)
+{
+ struct bkey_format *f = &b->format;
+ struct bkey_i *dst_unpacked;
+ bool ret;
+
+ if ((dst_unpacked = packed_to_bkey(dst))) {
+ dst_unpacked->k = *src;
+ ret = true;
+ } else {
+ ret = bkey_pack_key(dst, src, f);
+ }
+
+ if (ret && iter)
+ bch_verify_key_order(b, iter, dst);
+
+ return ret;
+}
+
+static void extent_save(struct btree *b, struct btree_node_iter *iter,
+ struct bkey_packed *dst, struct bkey *src)
+{
+ BUG_ON(!__extent_save(b, iter, dst, src));
+}
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for sort_fix_overlapping() - if there are multiple keys that
+ * compare equal in different sets, we have to process them newest to oldest.
+ */
+#define extent_sort_cmp(l, r) \
+({ \
+ struct bkey _ul = bkey_unpack_key(b, \
+ __btree_node_offset_to_key(b, (l).k)); \
+ struct bkey _ur = bkey_unpack_key(b, \
+ __btree_node_offset_to_key(b, (r).k)); \
+ \
+ int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \
+ _c ? _c > 0 : (l).k < (r).k; \
+})
+
+static inline void extent_sort_sift(struct btree_node_iter *iter,
+ struct btree *b, size_t i)
+{
+ heap_sift(iter, i, extent_sort_cmp);
+}
+
+static inline void extent_sort_next(struct btree_node_iter *iter,
+ struct btree *b,
+ struct btree_node_iter_set *i)
+{
+ sort_key_next(iter, b, i);
+ heap_sift(iter, i - iter->data, extent_sort_cmp);
+}
+
+static void extent_sort_append(struct cache_set *c,
+ struct btree *b,
+ struct btree_nr_keys *nr,
+ struct bkey_packed *start,
+ struct bkey_packed **prev,
+ struct bkey_packed *k)
+{
+ struct bkey_format *f = &b->format;
+ BKEY_PADDED(k) tmp;
+
+ if (bkey_whiteout(k))
+ return;
+
+ bkey_unpack(b, &tmp.k, k);
+
+ if (*prev &&
+ bch_extent_merge(c, b, (void *) *prev, &tmp.k))
+ return;
+
+ if (*prev) {
+ bkey_pack(*prev, (void *) *prev, f);
+
+ btree_keys_account_key_add(nr, 0, *prev);
+ *prev = bkey_next(*prev);
+ } else {
+ *prev = start;
+ }
+
+ bkey_copy(*prev, &tmp.k);
+}
+
+struct btree_nr_keys bch_extent_sort_fix_overlapping(struct cache_set *c,
+ struct bset *dst,
+ struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct bkey_format *f = &b->format;
+ struct btree_node_iter_set *_l = iter->data, *_r;
+ struct bkey_packed *prev = NULL, *out, *lk, *rk;
+ struct bkey l_unpacked, r_unpacked;
+ struct bkey_s l, r;
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ heap_resort(iter, extent_sort_cmp);
+
+ while (!bch_btree_node_iter_end(iter)) {
+ lk = __btree_node_offset_to_key(b, _l->k);
+
+ if (iter->used == 1) {
+ extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+ extent_sort_next(iter, b, _l);
+ continue;
+ }
+
+ _r = iter->data + 1;
+ if (iter->used > 2 &&
+ extent_sort_cmp(_r[0], _r[1]))
+ _r++;
+
+ rk = __btree_node_offset_to_key(b, _r->k);
+
+ l = __bkey_disassemble(b, lk, &l_unpacked);
+ r = __bkey_disassemble(b, rk, &r_unpacked);
+
+ /* If current key and next key don't overlap, just append */
+ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+ extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+ extent_sort_next(iter, b, _l);
+ continue;
+ }
+
+ /* Skip 0 size keys */
+ if (!r.k->size) {
+ extent_sort_next(iter, b, _r);
+ continue;
+ }
+
+ /*
+ * overlap: keep the newer key and trim the older key so they
+ * don't overlap. comparing pointers tells us which one is
+ * newer, since the bsets are appended one after the other.
+ */
+
+ /* can't happen because of comparison func */
+ BUG_ON(_l->k < _r->k &&
+ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+ if (_l->k > _r->k) {
+ /* l wins, trim r */
+ if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+ sort_key_next(iter, b, _r);
+ } else {
+ __bch_cut_front(l.k->p, r);
+ extent_save(b, NULL, rk, r.k);
+ }
+
+ extent_sort_sift(iter, b, _r - iter->data);
+ } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+ BKEY_PADDED(k) tmp;
+
+ /*
+ * r wins, but it overlaps in the middle of l - split l:
+ */
+ bkey_reassemble(&tmp.k, l.s_c);
+ bch_cut_back(bkey_start_pos(r.k), &tmp.k.k);
+
+ __bch_cut_front(r.k->p, l);
+ extent_save(b, NULL, lk, l.k);
+
+ extent_sort_sift(iter, b, 0);
+
+ extent_sort_append(c, b, &nr, dst->start, &prev,
+ bkey_to_packed(&tmp.k));
+ } else {
+ bch_cut_back(bkey_start_pos(r.k), l.k);
+ extent_save(b, NULL, lk, l.k);
+ }
+ }
+
+ if (prev) {
+ bkey_pack(prev, (void *) prev, f);
+ btree_keys_account_key_add(&nr, 0, prev);
+ out = bkey_next(prev);
+ } else {
+ out = dst->start;
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
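+/*
+ * State for inserting one extent: @committed tracks how far through the
+ * insert we've gotten - everything up to @committed has been journalled and
+ * inserted (see extent_insert_committed()) and iter->pos is kept in sync
+ * with it. @stats accumulates sector accounting that gets applied via
+ * bch_cache_set_stats_apply() at the end; @whiteout, @do_journal and
+ * @deleting are only used when the insert is a deletion.
+ */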
+struct extent_insert_state {
+ struct btree_insert *trans;
+ struct btree_insert_entry *insert;
+ struct bpos committed;
+ struct bucket_stats_cache_set stats;
+
+ /* for deleting: */
+ struct bkey_i whiteout;
+ bool do_journal;
+ bool deleting;
+};
+
+static void bch_add_sectors(struct extent_insert_state *s,
+ struct bkey_s_c k, u64 offset, s64 sectors)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree *b = s->insert->iter->nodes[0];
+
+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
+
+ if (!sectors)
+ return;
+
+ bch_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
+ &s->stats, s->trans->journal_res.seq);
+
+ if (bkey_extent_is_data(k.k) &&
+ !bkey_extent_is_cached(k.k))
+ bcache_dev_sectors_dirty_add(c, k.k->p.inode, offset, sectors);
+}
+
+static void bch_subtract_sectors(struct extent_insert_state *s,
+ struct bkey_s_c k, u64 offset, s64 sectors)
+{
+ bch_add_sectors(s, k, offset, -sectors);
+}
+
+/* These wrappers subtract exactly the sectors that we're removing from @k */
+static void bch_cut_subtract_back(struct extent_insert_state *s,
+ struct bpos where, struct bkey_s k)
+{
+ bch_subtract_sectors(s, k.s_c, where.offset,
+ k.k->p.offset - where.offset);
+ bch_cut_back(where, k.k);
+}
+
+static void bch_cut_subtract_front(struct extent_insert_state *s,
+ struct bpos where, struct bkey_s k)
+{
+ bch_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
+ where.offset - bkey_start_offset(k.k));
+ __bch_cut_front(where, k);
+}
+
+static void bch_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
+{
+ if (k.k->size)
+ bch_subtract_sectors(s, k.s_c,
+ bkey_start_offset(k.k), k.k->size);
+ k.k->size = 0;
+ __set_bkey_deleted(k.k);
+}
+
+/*
+ * Note: If this returns true because only some pointers matched,
+ * we can lose some caching that had happened in the interim.
+ * Because cache promotion only promotes the part of the extent
+ * actually read, and not the whole extent, and due to the key
+ * splitting done in bch_extent_insert_fixup, preserving such
+ * caching is difficult.
+ */
+static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
+{
+ struct bkey_s_c_extent le, re;
+ const struct bch_extent_ptr *lp, *rp;
+ s64 offset;
+
+ BUG_ON(!l.k->size || !r.k->size);
+
+ if (l.k->type != r.k->type ||
+ l.k->version != r.k->version)
+ return false;
+
+ switch (l.k->type) {
+ case KEY_TYPE_COOKIE:
+ return !memcmp(bkey_s_c_to_cookie(l).v,
+ bkey_s_c_to_cookie(r).v,
+ sizeof(struct bch_cookie));
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ le = bkey_s_c_to_extent(l);
+ re = bkey_s_c_to_extent(r);
+
+ /*
+ * bkey_cmpxchg() handles partial matches - when either l or r
+ * has been trimmed - so we need just to handle l or r not
+ * starting at the same place when checking for a match here.
+ *
+ * If the starts of the keys are different, we just apply that
+ * offset to the device pointer offsets when checking those -
+ * matching how bch_cut_front() adjusts device pointer offsets
+ * when adjusting the start of a key:
+ */
+ offset = bkey_start_offset(l.k) - bkey_start_offset(r.k);
+
+ /*
+ * XXX: perhaps we only raced with copygc or tiering replacing
+ * one of the pointers: it should suffice to find _any_ matching
+ * pointer
+ */
+
+ if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k))
+ return false;
+
+ extent_for_each_ptr(le, lp) {
+ const union bch_extent_entry *entry =
+ bkey_idx(re.v, (u64 *) lp - le.v->_data);
+
+ if (!extent_entry_is_ptr(entry))
+ return false;
+
+ rp = &entry->ptr;
+
+ if (lp->offset != rp->offset + offset ||
+ lp->dev != rp->dev ||
+ lp->gen != rp->gen)
+ return false;
+ }
+
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Returns true on success, false on failure (and false means @new no longer
+ * overlaps with @k)
+ *
+ * If returned true, we may have inserted up to one key in @b.
+ * If returned false, we may have inserted up to two keys in @b.
+ *
+ * On return, there is room in @res for at least one more key of the same size
+ * as @new.
+ */
+enum extent_insert_hook_ret bch_extent_cmpxchg(struct extent_insert_hook *hook,
+ struct bpos committed_pos,
+ struct bpos next_pos,
+ struct bkey_s_c k,
+ const struct bkey_i *new)
+{
+ struct bch_replace_info *replace = container_of(hook,
+ struct bch_replace_info, hook);
+ struct bkey_i *old = &replace->key;
+
+ EBUG_ON(bkey_cmp(committed_pos, bkey_start_pos(&new->k)) < 0);
+
+ /* must have something to compare against */
+ EBUG_ON(!bkey_val_u64s(&old->k));
+
+ /* new must be a subset of old */
+ EBUG_ON(bkey_cmp(new->k.p, old->k.p) > 0 ||
+ bkey_cmp(bkey_start_pos(&new->k), bkey_start_pos(&old->k)) < 0);
+
+ if (k.k && bch_extent_cmpxchg_cmp(k, bkey_i_to_s_c(old))) {
+ replace->successes++;
+ return BTREE_HOOK_DO_INSERT;
+ } else {
+ replace->failures++;
+ return BTREE_HOOK_NO_INSERT;
+ }
+}
+
+static bool bch_extent_merge_inline(struct cache_set *,
+ struct btree_iter *,
+ struct bkey_packed *,
+ struct bkey_packed *,
+ bool);
+
+#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC)
+
+static enum btree_insert_ret
+extent_insert_should_stop(struct extent_insert_state *s)
+{
+ struct btree *b = s->insert->iter->nodes[0];
+
+ /*
+ * Check if we have sufficient space in both the btree node and the
+ * journal reservation:
+ *
+ * Each insert checks for room in the journal entry, but we check for
+ * room in the btree node up-front. In the worst case, bkey_cmpxchg()
+	 * will insert two keys, and one iteration of this loop will insert one
+ * key, so we need room for three keys.
+ */
+ if (!bch_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
+ return BTREE_INSERT_BTREE_NODE_FULL;
+ else if (!journal_res_insert_fits(s->trans, s->insert))
+ return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
+ else
+ return BTREE_INSERT_OK;
+}
+
+static void extent_bset_insert(struct cache_set *c, struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed *where =
+ bch_btree_node_iter_bset_pos(node_iter, b, t);
+ struct bkey_packed *prev = bkey_prev(b, t, where);
+ struct bkey_packed *next_live_key = where;
+ unsigned clobber_u64s;
+
+ if (prev)
+ where = bkey_next(prev);
+
+ while (next_live_key != btree_bkey_last(b, t) &&
+ bkey_deleted(next_live_key))
+ next_live_key = bkey_next(next_live_key);
+
+ /*
+ * Everything between where and next_live_key is now deleted keys, and
+ * is overwritten:
+ */
+ clobber_u64s = (u64 *) next_live_key - (u64 *) where;
+
+ if (prev &&
+ bch_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
+ goto drop_deleted_keys;
+
+ if (next_live_key != btree_bkey_last(b, t) &&
+ bch_extent_merge_inline(c, iter, bkey_to_packed(insert),
+ next_live_key, false))
+ goto drop_deleted_keys;
+
+ bch_bset_insert(b, node_iter, where, insert, clobber_u64s);
+ bch_btree_node_iter_fix(iter, b, node_iter, t, where,
+ clobber_u64s, where->u64s);
+ return;
+drop_deleted_keys:
+ bch_bset_delete(b, where, clobber_u64s);
+ bch_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
+}
+
+static void extent_insert_committed(struct extent_insert_state *s)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree_iter *iter = s->insert->iter;
+ struct bkey_i *insert = !s->deleting
+ ? s->insert->k
+ : &s->whiteout;
+ BKEY_PADDED(k) split;
+
+ EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
+ EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
+
+ if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
+ return;
+
+ if (s->deleting && !s->do_journal) {
+ bch_cut_front(s->committed, insert);
+ goto done;
+ }
+
+ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+
+ bkey_copy(&split.k, insert);
+
+ if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+ bkey_cmp(s->committed, insert->k.p) &&
+ bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
+ /* XXX: possibly need to increase our reservation? */
+ bch_cut_subtract_back(s, s->committed,
+ bkey_i_to_s(&split.k));
+ bch_cut_front(s->committed, insert);
+ bch_add_sectors(s, bkey_i_to_s_c(insert),
+ bkey_start_offset(&insert->k),
+ insert->k.size);
+ } else {
+ bch_cut_back(s->committed, &split.k.k);
+ bch_cut_front(s->committed, insert);
+ }
+
+ if (debug_check_bkeys(c))
+ bkey_debugcheck(c, iter->nodes[iter->level],
+ bkey_i_to_s_c(&split.k));
+
+ bch_btree_journal_key(s->trans, iter, &split.k);
+
+ if (!s->deleting)
+ extent_bset_insert(c, iter, &split.k);
+done:
+ bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+
+ insert->k.needs_whiteout = false;
+ s->do_journal = false;
+ s->trans->did_work = true;
+}
+
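+/*
+ * How the hook return values are handled below: BTREE_HOOK_DO_INSERT - keep
+ * going and insert up to @next_pos; BTREE_HOOK_NO_INSERT - the range up to
+ * @next_pos must not be inserted, so commit what we have so far and cut that
+ * range off the front of the insert key; BTREE_HOOK_RESTART_TRANS - bail out
+ * so the caller can restart the transaction.
+ */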
+static enum extent_insert_hook_ret
+__extent_insert_advance_pos(struct extent_insert_state *s,
+ struct bpos next_pos,
+ struct bkey_s_c k)
+{
+ struct extent_insert_hook *hook = s->trans->hook;
+ enum extent_insert_hook_ret ret;
+
+ if (k.k && k.k->size &&
+ s->insert->k->k.version &&
+ k.k->version > s->insert->k->k.version)
+ ret = BTREE_HOOK_NO_INSERT;
+ else if (hook)
+ ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
+ else
+ ret = BTREE_HOOK_DO_INSERT;
+
+ EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
+
+ switch (ret) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ extent_insert_committed(s);
+ bch_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
+
+ bch_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
+ break;
+ case BTREE_HOOK_RESTART_TRANS:
+ return ret;
+ }
+
+ s->committed = next_pos;
+ return ret;
+}
+
+/*
+ * Update iter->pos, marking how much of @insert we've processed, and call hook
+ * fn:
+ */
+static enum extent_insert_hook_ret
+extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
+{
+ struct btree *b = s->insert->iter->nodes[0];
+ struct bpos next_pos = bpos_min(s->insert->k->k.p,
+ k.k ? k.k->p : b->key.k.p);
+
+ /* hole? */
+ if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
+ bool have_uncommitted = bkey_cmp(s->committed,
+ bkey_start_pos(&s->insert->k->k)) > 0;
+
+ switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
+ bkey_s_c_null)) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ /*
+ * we had to split @insert and insert the committed
+ * part - need to bail out and recheck journal
+ * reservation/btree node before we advance pos past @k:
+ */
+ if (have_uncommitted)
+ return BTREE_HOOK_NO_INSERT;
+ break;
+ case BTREE_HOOK_RESTART_TRANS:
+ return BTREE_HOOK_RESTART_TRANS;
+ }
+ }
+
+ /* avoid redundant calls to hook fn: */
+ if (!bkey_cmp(s->committed, next_pos))
+ return BTREE_HOOK_DO_INSERT;
+
+ return __extent_insert_advance_pos(s, next_pos, k);
+}
+
+static enum btree_insert_ret
+extent_insert_check_split_compressed(struct extent_insert_state *s,
+ struct bkey_s_c k,
+ enum bch_extent_overlap overlap)
+{
+ struct cache_set *c = s->trans->c;
+ unsigned sectors;
+
+ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+ (sectors = bkey_extent_is_compressed(c, k))) {
+ int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
+
+ if (s->trans->flags & BTREE_INSERT_NOFAIL)
+ flags |= BCH_DISK_RESERVATION_NOFAIL;
+
+ switch (bch_disk_reservation_add(c,
+ s->trans->disk_res,
+ sectors, flags)) {
+ case 0:
+ break;
+ case -ENOSPC:
+ return BTREE_INSERT_ENOSPC;
+ case -EINTR:
+ return BTREE_INSERT_NEED_GC_LOCK;
+ default:
+ BUG();
+ }
+ }
+
+ return BTREE_INSERT_OK;
+}
+
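+/*
+ * The four ways @insert can overlap an existing key k, as handled by
+ * extent_squash() below (rough sketch):
+ *
+ *	FRONT:	insert:	|------|	BACK:	insert:	    |------|
+ *		k:	    |------|		k:	|------|
+ *
+ *	ALL:	insert:	|----------|	MIDDLE:	insert:	   |--|
+ *		k:	   |----|		k:	|----------|
+ *
+ * FRONT and BACK trim k, ALL drops k entirely, and MIDDLE splits k in two
+ * around @insert.
+ */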
+static enum btree_insert_ret
+extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
+ struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
+ enum bch_extent_overlap overlap)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree_iter *iter = s->insert->iter;
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+
+ switch (overlap) {
+ case BCH_EXTENT_OVERLAP_FRONT:
+ /* insert overlaps with start of k: */
+ bch_cut_subtract_front(s, insert->k.p, k);
+ BUG_ON(bkey_deleted(k.k));
+ extent_save(b, node_iter, _k, k.k);
+ break;
+
+ case BCH_EXTENT_OVERLAP_BACK:
+ /* insert overlaps with end of k: */
+ bch_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
+ BUG_ON(bkey_deleted(k.k));
+ extent_save(b, node_iter, _k, k.k);
+
+ /*
+ * As the auxiliary tree is indexed by the end of the
+ * key and we've just changed the end, update the
+ * auxiliary tree.
+ */
+ bch_bset_fix_invalidated_key(b, t, _k);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ _k, _k->u64s, _k->u64s);
+ break;
+
+ case BCH_EXTENT_OVERLAP_ALL: {
+ struct bpos orig_pos = k.k->p;
+
+ /* The insert key completely covers k, invalidate k */
+ if (!bkey_whiteout(k.k))
+ btree_keys_account_key_drop(&b->nr,
+ t - b->set, _k);
+
+ bch_drop_subtract(s, k);
+ k.k->p = bkey_start_pos(&insert->k);
+ if (!__extent_save(b, node_iter, _k, k.k)) {
+ /*
+ * Couldn't repack: we aren't necessarily able
+ * to repack if the new key is outside the range
+ * of the old extent, so we have to split
+ * @insert:
+ */
+ k.k->p = orig_pos;
+ extent_save(b, node_iter, _k, k.k);
+
+ if (extent_insert_advance_pos(s, k.s_c) ==
+ BTREE_HOOK_RESTART_TRANS)
+ return BTREE_INSERT_NEED_TRAVERSE;
+
+ extent_insert_committed(s);
+ /*
+			 * We split and inserted up to k.k->p - that
+ * has to coincide with iter->pos, so that we
+ * don't have anything more we have to insert
+ * until we recheck our journal reservation:
+ */
+ EBUG_ON(bkey_cmp(s->committed, k.k->p));
+ } else {
+ bch_bset_fix_invalidated_key(b, t, _k);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ _k, _k->u64s, _k->u64s);
+ }
+
+ break;
+ }
+ case BCH_EXTENT_OVERLAP_MIDDLE: {
+ BKEY_PADDED(k) split;
+ /*
+ * The insert key falls 'in the middle' of k
+ * The insert key splits k in 3:
+ * - start only in k, preserve
+ * - middle common section, invalidate in k
+ * - end only in k, preserve
+ *
+ * We update the old key to preserve the start,
+ * insert will be the new common section,
+ * we manually insert the end that we are preserving.
+ *
+ * modify k _before_ doing the insert (which will move
+ * what k points to)
+ */
+ bkey_reassemble(&split.k, k.s_c);
+ split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
+
+ bch_cut_back(bkey_start_pos(&insert->k), &split.k.k);
+ BUG_ON(bkey_deleted(&split.k.k));
+
+ bch_cut_subtract_front(s, insert->k.p, k);
+ BUG_ON(bkey_deleted(k.k));
+ extent_save(b, node_iter, _k, k.k);
+
+ bch_add_sectors(s, bkey_i_to_s_c(&split.k),
+ bkey_start_offset(&split.k.k),
+ split.k.k.size);
+ extent_bset_insert(c, iter, &split.k);
+ break;
+ }
+ }
+
+ return BTREE_INSERT_OK;
+}
+
+static enum btree_insert_ret
+bch_delete_fixup_extent(struct extent_insert_state *s)
+{
+ struct cache_set *c = s->trans->c;
+ struct btree_iter *iter = s->insert->iter;
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+ struct bkey_i *insert = s->insert->k;
+ enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+ s->whiteout = *insert;
+ s->do_journal = false;
+
+ while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+ (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+ (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
+ struct bset_tree *t = bch_bkey_to_bset(b, _k);
+ struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+ enum bch_extent_overlap overlap;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+ EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ break;
+
+ if (bkey_whiteout(k.k)) {
+ s->committed = bpos_min(insert->k.p, k.k->p);
+ goto next;
+ }
+
+ overlap = bch_extent_overlap(&insert->k, k.k);
+
+ ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
+
+ switch (extent_insert_advance_pos(s, k.s_c)) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ continue;
+ case BTREE_HOOK_RESTART_TRANS:
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+ goto stop;
+ }
+
+ s->do_journal = true;
+
+ if (overlap == BCH_EXTENT_OVERLAP_ALL) {
+ btree_keys_account_key_drop(&b->nr,
+ t - b->set, _k);
+ bch_subtract_sectors(s, k.s_c,
+ bkey_start_offset(k.k), k.k->size);
+ _k->type = KEY_TYPE_DISCARD;
+ reserve_whiteout(b, t, _k);
+ } else if (k.k->needs_whiteout ||
+ bset_written(b, bset(b, t))) {
+ struct bkey_i discard = *insert;
+
+ switch (overlap) {
+ case BCH_EXTENT_OVERLAP_FRONT:
+ bch_cut_front(bkey_start_pos(k.k), &discard);
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ bch_cut_back(k.k->p, &discard.k);
+ break;
+ default:
+ break;
+ }
+
+ discard.k.needs_whiteout = true;
+
+ ret = extent_squash(s, insert, t, _k, k, overlap);
+ BUG_ON(ret != BTREE_INSERT_OK);
+
+ extent_bset_insert(c, iter, &discard);
+ } else {
+ ret = extent_squash(s, insert, t, _k, k, overlap);
+ BUG_ON(ret != BTREE_INSERT_OK);
+ }
+next:
+ bch_cut_front(s->committed, insert);
+ bch_btree_iter_set_pos_same_leaf(iter, s->committed);
+ }
+
+ if (bkey_cmp(s->committed, insert->k.p) < 0 &&
+ ret == BTREE_INSERT_OK &&
+ extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+stop:
+ extent_insert_committed(s);
+
+ bch_cache_set_stats_apply(c, &s->stats, s->trans->disk_res,
+ gc_pos_btree_node(b));
+
+ EBUG_ON(bkey_cmp(iter->pos, s->committed));
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+
+ bch_cut_front(iter->pos, insert);
+
+ if (insert->k.size && iter->at_end_of_leaf)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+
+ EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
+
+ return ret;
+}
+
+/**
+ * bch_extent_insert_fixup - insert a new extent and deal with overlaps
+ *
+ * this may result in not actually doing the insert, or inserting some subset
+ * of the insert key. For cmpxchg operations this is where that logic lives.
+ *
+ * All subsets of @insert that need to be inserted are inserted using
+ * bch_btree_insert_and_journal(). If the btree node or the journal
+ * reservation fills up, this function returns early, setting @iter->pos to
+ * the prefix of @insert that actually got inserted.
+ *
+ * BSET INVARIANTS: this function is responsible for maintaining all the
+ * invariants for bsets of extents in memory. things get really hairy with 0
+ * size extents
+ *
+ * within one bset:
+ *
+ * bkey_start_pos(bkey_next(k)) >= k->p
+ * or bkey_start_offset(bkey_next(k)) >= k->p.offset
+ *
+ * i.e. strict ordering, no overlapping extents.
+ *
+ * multiple bsets (i.e. full btree node):
+ *
+ * ∀ k, j
+ * k.size != 0 ∧ j.size != 0 →
+ *   ¬ (k.p > bkey_start_pos(j) ∧ k.p < j.p)
+ *
+ * i.e. no two overlapping keys _of nonzero size_
+ *
+ * We can't realistically maintain this invariant for zero size keys because of
+ * the key merging done in bch_btree_insert_key() - for two mergeable keys k, j
+ * there may be another 0 size key between them in another bset, and it will
+ * thus overlap with the merged key.
+ *
+ * In addition, iter->pos indicates how much of @insert has been processed.
+ * If iter->pos has not yet reached the end of @insert, key insertion needs
+ * to continue/be retried.
+ */
+enum btree_insert_ret
+bch_insert_fixup_extent(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ struct cache_set *c = trans->c;
+ struct btree_iter *iter = insert->iter;
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+ enum btree_insert_ret ret = BTREE_INSERT_OK;
+
+ struct extent_insert_state s = {
+ .trans = trans,
+ .insert = insert,
+ .committed = insert->iter->pos,
+ .deleting = bkey_whiteout(&insert->k->k),
+ };
+
+ EBUG_ON(iter->level);
+ EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
+
+ if (s.deleting)
+ return bch_delete_fixup_extent(&s);
+
+ /*
+ * As we process overlapping extents, we advance @iter->pos both to
+ * signal to our caller (btree_insert_key()) how much of @insert->k has
+ * been inserted, and also to keep @iter->pos consistent with
+ * @insert->k and the node iterator that we're advancing:
+ */
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+
+ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ bch_add_sectors(&s, bkey_i_to_s_c(insert->k),
+ bkey_start_offset(&insert->k->k),
+ insert->k->k.size);
+
+ while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
+ (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
+ (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
+ struct bset_tree *t = bch_bkey_to_bset(b, _k);
+ struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+ enum bch_extent_overlap overlap;
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+ EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
+
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
+ break;
+
+ overlap = bch_extent_overlap(&insert->k->k, k.k);
+
+ ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
+
+ if (!k.k->size)
+ goto squash;
+
+ /*
+ * Only call advance pos & call hook for nonzero size extents:
+ * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
+ * overlaps with @k:
+ */
+ switch (extent_insert_advance_pos(&s, k.s_c)) {
+ case BTREE_HOOK_DO_INSERT:
+ break;
+ case BTREE_HOOK_NO_INSERT:
+ continue;
+ case BTREE_HOOK_RESTART_TRANS:
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+ goto stop;
+ }
+
+ if (k.k->size &&
+ (k.k->needs_whiteout || bset_written(b, bset(b, t))))
+ insert->k->k.needs_whiteout = true;
+
+ if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+ bkey_whiteout(k.k) &&
+ k.k->needs_whiteout) {
+ unreserve_whiteout(b, t, _k);
+ _k->needs_whiteout = false;
+ }
+squash:
+ ret = extent_squash(&s, insert->k, t, _k, k, overlap);
+ if (ret != BTREE_INSERT_OK)
+ goto stop;
+ }
+
+ if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
+ ret == BTREE_INSERT_OK &&
+ extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+stop:
+ extent_insert_committed(&s);
+ /*
+ * Subtract any remaining sectors from @insert, if we bailed out early
+ * and didn't fully insert @insert:
+ */
+ if (insert->k->k.size &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ bch_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
+ bkey_start_offset(&insert->k->k),
+ insert->k->k.size);
+
+ bch_cache_set_stats_apply(c, &s.stats, trans->disk_res,
+ gc_pos_btree_node(b));
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
+ EBUG_ON(bkey_cmp(iter->pos, s.committed));
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+
+ if (insert->k->k.size && iter->at_end_of_leaf)
+ ret = BTREE_INSERT_NEED_TRAVERSE;
+
+ EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
+
+ return ret;
+}
+
+static const char *bch_extent_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+ return "value too big";
+
+ if (!k.k->size)
+ return "zero key size";
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ const union bch_extent_crc *crc;
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+ unsigned size_ondisk = e.k->size;
+ const char *reason;
+
+ extent_for_each_entry(e, entry) {
+ reason = "invalid extent entry type";
+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+ goto invalid;
+
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ crc = entry_to_crc(entry);
+
+ reason = "checksum offset + key size > uncompressed size";
+ if (crc_offset(crc) + e.k->size >
+ crc_uncompressed_size(e.k, crc))
+ goto invalid;
+
+ size_ondisk = crc_compressed_size(e.k, crc);
+
+ reason = "invalid checksum type";
+ if (crc_csum_type(crc) >= BCH_CSUM_NR)
+ goto invalid;
+
+ reason = "invalid compression type";
+ if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
+ goto invalid;
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ reason = extent_ptr_invalid(e, mi,
+ &entry->ptr, size_ondisk);
+ if (reason)
+ goto invalid;
+ break;
+ }
+ }
+
+ cache_member_info_put();
+ return NULL;
+invalid:
+ cache_member_info_put();
+ return reason;
+ }
+
+ case BCH_RESERVATION:
+ return NULL;
+
+ default:
+ return "invalid value type";
+ }
+}
+
+static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
+ struct bkey_s_c_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_rcu *mi;
+ struct cache *ca;
+ struct bucket *g;
+ unsigned seq, stale;
+ char buf[160];
+ bool bad;
+ unsigned ptrs_per_tier[CACHE_TIERS];
+ unsigned tier, replicas = 0;
+
+ /*
+ * XXX: we should be doing most/all of these checks at startup time,
+ * where we check bkey_invalid() in btree_node_read_done()
+ *
+ * But note that we can't check for stale pointers or incorrect gc marks
+ * until after journal replay is done (it might be an extent that's
+ * going to get overwritten during replay)
+ */
+
+ memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
+
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr(e, ptr) {
+ bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
+
+ replicas++;
+
+ if (ptr->dev >= mi->nr_in_set)
+ goto bad_device;
+
+ /*
+ * If journal replay hasn't finished, we might be seeing keys
+ * that will be overwritten by the time journal replay is done:
+ */
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ continue;
+
+ if (!mi->m[ptr->dev].valid)
+ goto bad_device;
+
+ tier = mi->m[ptr->dev].tier;
+ ptrs_per_tier[tier]++;
+
+ stale = 0;
+
+ if ((ca = PTR_CACHE(c, ptr))) {
+ g = PTR_BUCKET(ca, ptr);
+
+ do {
+ struct bucket_mark mark;
+
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ mark = READ_ONCE(g->mark);
+
+ /* between mark and bucket gen */
+ smp_rmb();
+
+ stale = ptr_stale(ca, ptr);
+
+ cache_set_bug_on(stale && dirty, c,
+ "stale dirty pointer");
+
+ cache_set_bug_on(stale > 96, c,
+ "key too stale: %i",
+ stale);
+
+ if (stale)
+ break;
+
+ bad = (mark.is_metadata ||
+ (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+ !mark.owned_by_allocator &&
+ !(dirty
+ ? mark.dirty_sectors
+ : mark.cached_sectors)));
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ if (bad)
+ goto bad_ptr;
+ }
+ }
+ cache_member_info_put();
+
+ if (replicas > BCH_REPLICAS_MAX) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c,
+ "extent key bad (too many replicas: %u): %s",
+ replicas, buf);
+ return;
+ }
+
+ if (!bkey_extent_is_cached(e.k) &&
+ replicas < c->sb.data_replicas_have) {
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c,
+ "extent key bad (too few replicas, %u < %u): %s",
+ replicas, c->sb.data_replicas_have, buf);
+ return;
+ }
+
+ return;
+
+bad_device:
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c, "extent pointer to dev %u missing device: %s",
+ ptr->dev, buf);
+ cache_member_info_put();
+ return;
+
+bad_ptr:
+ bch_bkey_val_to_text(c, btree_node_type(b), buf,
+ sizeof(buf), e.s_c);
+ cache_set_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
+ "gen %i last_gc %i mark 0x%08x",
+ buf, PTR_BUCKET_NR(ca, ptr),
+ g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+ ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
+ (unsigned) g->mark.counter);
+ cache_member_info_put();
+ return;
+}
+
+static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
+ struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
+ case BCH_RESERVATION:
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch_extent_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+ const char *invalid;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ if (bkey_extent_is_data(k.k))
+ out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+
+ invalid = bch_extent_invalid(c, k);
+ if (invalid)
+ p(" invalid: %s", invalid);
+#undef p
+}
+
+static unsigned PTR_TIER(struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr)
+{
+ return ptr->dev < mi->nr_in_set
+ ? mi->m[ptr->dev].tier
+ : UINT_MAX;
+}
+
+void bch_extent_entry_append(struct bkey_i_extent *e,
+ union bch_extent_entry *entry)
+{
+ BUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
+ BKEY_EXTENT_VAL_U64s_MAX);
+
+ memcpy_u64s(extent_entry_last(extent_i_to_s(e)),
+ entry,
+ extent_entry_u64s(entry));
+ e->k.u64s += extent_entry_u64s(entry);
+}
+
+const unsigned bch_crc_size[] = {
+ [BCH_CSUM_NONE] = 0,
+ [BCH_CSUM_CRC32C] = 4,
+ [BCH_CSUM_CRC64] = 8,
+};
+
+static void bch_extent_crc_init(union bch_extent_crc *crc,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type)
+{
+ if (bch_crc_size[csum_type] <= 4 &&
+ uncompressed_size <= CRC32_EXTENT_SIZE_MAX) {
+ crc->crc32 = (struct bch_extent_crc32) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc32,
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ };
+ } else {
+ BUG_ON(uncompressed_size > CRC64_EXTENT_SIZE_MAX);
+
+ crc->crc64 = (struct bch_extent_crc64) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc64,
+ .compressed_size = compressed_size,
+ .uncompressed_size = uncompressed_size,
+ .offset = 0,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum = csum,
+ };
+ }
+}
+
+void bch_extent_crc_append(struct bkey_i_extent *e,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type)
+{
+ union bch_extent_crc *crc;
+ union bch_extent_crc new;
+
+ BUG_ON(compressed_size > uncompressed_size);
+ BUG_ON(uncompressed_size != e->k.size);
+ BUG_ON(!compressed_size || !uncompressed_size);
+
+ /*
+ * Look up the last crc entry, so we can check if we need to add
+ * another:
+ */
+ extent_for_each_crc(extent_i_to_s(e), crc)
+ ;
+
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ if (!csum_type && !compression_type)
+ return;
+ break;
+ case BCH_EXTENT_CRC32:
+ case BCH_EXTENT_CRC64:
+ if (crc_compressed_size(&e->k, crc) == compressed_size &&
+ crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
+ crc_offset(crc) == 0 &&
+ crc_compression_type(crc) == compression_type &&
+ crc_csum_type(crc) == csum_type &&
+ crc_csum(crc) == csum)
+ return;
+ break;
+ }
+
+ bch_extent_crc_init(&new,
+ compressed_size,
+ uncompressed_size,
+ compression_type,
+ csum, csum_type);
+ bch_extent_entry_append(e, to_entry(&new));
+}
+
+static void __extent_sort_ptrs(struct cache_member_rcu *mi,
+ struct bkey_s_extent src)
+{
+ struct bch_extent_ptr *src_ptr, *dst_ptr;
+ union bch_extent_crc *src_crc, *dst_crc;
+ union bch_extent_crc _src;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_extent dst;
+ size_t u64s, crc_u64s;
+ u64 *p;
+
+ /*
+ * Insertion sort:
+ *
+ * Note: this sort needs to be stable, because pointer order determines
+	 * pointer dirtiness.
+ */
+
+ tmp.k.k = *src.k;
+ dst = bkey_i_to_s_extent(&tmp.k);
+ set_bkey_val_u64s(dst.k, 0);
+
+ extent_for_each_ptr_crc(src, src_ptr, src_crc) {
+ extent_for_each_ptr_crc(dst, dst_ptr, dst_crc)
+ if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr))
+ goto found;
+
+ dst_ptr = &extent_entry_last(dst)->ptr;
+ dst_crc = NULL;
+found:
+ /* found insert position: */
+
+ /*
+ * we're making sure everything has a crc at this point, if
+ * dst_ptr points to a pointer it better have a crc:
+ */
+ BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc);
+ BUG_ON(dst_crc &&
+ (extent_entry_next(to_entry(dst_crc)) !=
+ to_entry(dst_ptr)));
+
+ if (!src_crc) {
+ bch_extent_crc_init(&_src, src.k->size,
+ src.k->size, 0, 0, 0);
+ src_crc = &_src;
+ }
+
+ p = dst_ptr != &extent_entry_last(dst)->ptr
+ ? (void *) dst_crc
+ : (void *) dst_ptr;
+
+ crc_u64s = extent_entry_u64s(to_entry(src_crc));
+ u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64);
+
+ memmove_u64s_up(p + u64s, p,
+ (u64 *) extent_entry_last(dst) - (u64 *) p);
+ set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s);
+
+ memcpy_u64s(p, src_crc, crc_u64s);
+ memcpy_u64s(p + crc_u64s, src_ptr,
+ sizeof(*src_ptr) / sizeof(u64));
+ }
+
+ /* Sort done - now drop redundant crc entries: */
+ bch_extent_drop_redundant_crcs(dst);
+
+ memcpy_u64s(src.v, dst.v, bkey_val_u64s(dst.k));
+ set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k));
+}
+
+static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
+{
+ struct cache_member_rcu *mi;
+ struct bch_extent_ptr *ptr, *prev = NULL;
+ union bch_extent_crc *crc;
+
+ /*
+ * First check if any pointers are out of order before doing the actual
+ * sort:
+ */
+ mi = cache_member_info_get(c);
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ if (prev &&
+ PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) {
+ __extent_sort_ptrs(mi, e);
+ break;
+ }
+ prev = ptr;
+ }
+
+ cache_member_info_put();
+}
+
+/*
+ * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
+ bool sort)
+{
+ struct bkey_s_extent e;
+
+ switch (k.k->type) {
+ case KEY_TYPE_ERROR:
+ return false;
+
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_COOKIE:
+ return true;
+
+ case KEY_TYPE_DISCARD:
+ return !k.k->version;
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_to_extent(k);
+
+ bch_extent_drop_stale(c, e);
+
+ if (sort)
+ extent_sort_ptrs(c, e);
+
+ if (!bkey_val_u64s(e.k)) {
+ if (bkey_extent_is_cached(e.k)) {
+ k.k->type = KEY_TYPE_DISCARD;
+ if (!k.k->version)
+ return true;
+ } else {
+ k.k->type = KEY_TYPE_ERROR;
+ }
+ }
+
+ return false;
+ case BCH_RESERVATION:
+ return false;
+ default:
+ BUG();
+ }
+}
+
+bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
+{
+ return __bch_extent_normalize(c, k, true);
+}
+
+/*
+ * This picks a non-stale pointer, preferably from a device other than
+ * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale
+ * pointers to other devices, it will still pick a pointer from avoid.
+ * Note that it prefers lower-numbered pointers to higher-numbered pointers
+ * as the pointers are sorted by tier, hence preferring pointers to tier 0
+ * rather than pointers to tier 1.
+ */
+void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
+ struct cache *avoid,
+ struct extent_pick_ptr *ret)
+{
+ struct bkey_s_c_extent e;
+ const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
+ struct cache *ca;
+
+ switch (k.k->type) {
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_DISCARD:
+ case KEY_TYPE_COOKIE:
+ ret->ca = NULL;
+ return;
+
+ case KEY_TYPE_ERROR:
+ ret->ca = ERR_PTR(-EIO);
+ return;
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_c_to_extent(k);
+ rcu_read_lock();
+ ret->ca = NULL;
+
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca)
+ if (!ptr_stale(ca, ptr)) {
+ *ret = (struct extent_pick_ptr) {
+ .crc = crc_to_64(e.k, crc),
+ .ptr = *ptr,
+ .ca = ca,
+ };
+
+ if (ca != avoid)
+ break;
+ }
+
+ if (ret->ca)
+ percpu_ref_get(&ret->ca->ref);
+ else if (!bkey_extent_is_cached(e.k))
+ ret->ca = ERR_PTR(-EIO);
+
+ rcu_read_unlock();
+ return;
+
+ case BCH_RESERVATION:
+ ret->ca = NULL;
+ return;
+
+ default:
+ BUG();
+ }
+}
+
+static enum merge_result bch_extent_merge(struct cache_set *c,
+ struct btree *bk,
+ struct bkey_i *l, struct bkey_i *r)
+{
+ struct bkey_s_extent el, er;
+ union bch_extent_entry *en_l, *en_r;
+
+ if (key_merging_disabled(c))
+ return BCH_MERGE_NOMERGE;
+
+ /*
+ * Generic header checks
+ * Assumes left and right are in order
+ * Left and right must be exactly aligned
+ */
+
+ if (l->k.u64s != r->k.u64s ||
+ l->k.type != r->k.type ||
+ l->k.version != r->k.version ||
+ bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
+ return BCH_MERGE_NOMERGE;
+
+ switch (l->k.type) {
+ case KEY_TYPE_DELETED:
+ case KEY_TYPE_DISCARD:
+ case KEY_TYPE_ERROR:
+ case BCH_RESERVATION:
+ /* These types are mergeable, and no val to check */
+ break;
+
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ el = bkey_i_to_s_extent(l);
+ er = bkey_i_to_s_extent(r);
+
+ extent_for_each_entry(el, en_l) {
+ struct bch_extent_ptr *lp, *rp;
+ struct cache_member_cpu *m;
+
+ en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data);
+
+ if ((extent_entry_type(en_l) !=
+ extent_entry_type(en_r)) ||
+ extent_entry_is_crc(en_l))
+ return BCH_MERGE_NOMERGE;
+
+ lp = &en_l->ptr;
+ rp = &en_r->ptr;
+
+ if (lp->offset + el.k->size != rp->offset ||
+ lp->dev != rp->dev ||
+ lp->gen != rp->gen)
+ return BCH_MERGE_NOMERGE;
+
+ /* We don't allow extents to straddle buckets: */
+
+ m = cache_member_info_get(c)->m + lp->dev;
+ if ((lp->offset & ~((u64) m->bucket_size - 1)) !=
+ (rp->offset & ~((u64) m->bucket_size - 1))) {
+ cache_member_info_put();
+ return BCH_MERGE_NOMERGE;
+			}
+ cache_member_info_put();
+ }
+
+ break;
+ default:
+ return BCH_MERGE_NOMERGE;
+ }
+
+ l->k.needs_whiteout |= r->k.needs_whiteout;
+
+ /* Keys with no pointers aren't restricted to one bucket and could
+ * overflow KEY_SIZE
+ */
+ if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
+ bch_key_resize(&l->k, KEY_SIZE_MAX);
+ bch_cut_front(l->k.p, r);
+ return BCH_MERGE_PARTIAL;
+ }
+
+ bch_key_resize(&l->k, l->k.size + r->k.size);
+
+ return BCH_MERGE_MERGE;
+}
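+
+/*
+ * e.g. (made-up numbers) two extents bch_extent_merge() would merge:
+ *
+ *	l: p = 8:64, size 8, ptr { dev 0, offset 1024, gen 3 }
+ *	r: p = 8:72, size 8, ptr { dev 0, offset 1032, gen 3 }
+ *
+ * l ends exactly where r starts and the device pointers are contiguous, so
+ * the result is a single extent p = 8:72, size 16 - unless the merged
+ * pointers would straddle a bucket (no merge) or the merged size would
+ * overflow KEY_SIZE_MAX (partial merge).
+ */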
+
+static void extent_i_save(struct btree *b, struct bkey_packed *dst,
+ struct bkey_i *src)
+{
+ struct bkey_format *f = &b->format;
+ struct bkey_i *dst_unpacked;
+
+ BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
+
+ /*
+ * We don't want the bch_verify_key_order() call in extent_save(),
+ * because we may be out of order with deleted keys that are about to be
+ * removed by extent_bset_insert()
+ */
+
+ if ((dst_unpacked = packed_to_bkey(dst)))
+ bkey_copy(dst_unpacked, src);
+ else
+ BUG_ON(!bkey_pack(dst, src, f));
+}
+
+static bool extent_merge_one_overlapping(struct btree_iter *iter,
+ struct bpos new_pos,
+ struct bset_tree *t,
+ struct bkey_packed *k, struct bkey uk,
+ bool check, bool could_pack)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+
+ BUG_ON(!bkey_deleted(k));
+
+ if (check) {
+ return !bkey_packed(k) || could_pack;
+ } else {
+ uk.p = new_pos;
+ extent_save(b, node_iter, k, &uk);
+ bch_bset_fix_invalidated_key(b, t, k);
+ bch_btree_node_iter_fix(iter, b, node_iter, t,
+ k, k->u64s, k->u64s);
+ return true;
+ }
+}
+
+static bool extent_merge_do_overlapping(struct btree_iter *iter,
+ struct bkey *m, bool back_merge)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct bset_tree *t;
+ struct bkey_packed *k;
+ struct bkey uk;
+ struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
+ bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
+ bool check = true;
+
+ /*
+ * @m is the new merged extent:
+ *
+ * The merge took place in the last bset; we know there can't be any 0
+ * size extents overlapping with m there because if so they would have
+ * been between the two extents we merged.
+ *
+ * But in the other bsets, we have to check for and fix such extents:
+ */
+do_fixup:
+ for_each_bset(b, t) {
+ if (t == bset_tree_last(b))
+ break;
+
+ /*
+ * if we don't find this bset in the iterator we already got to
+ * the end of that bset, so start searching from the end.
+ */
+ k = bch_btree_node_iter_bset_pos(node_iter, b, t);
+
+ if (k == btree_bkey_last(b, t))
+ k = bkey_prev_all(b, t, k);
+ if (!k)
+ continue;
+
+ if (back_merge) {
+ /*
+ * Back merge: 0 size extents will be before the key
+ * that was just inserted (and thus the iterator
+ * position) - walk backwards to find them
+ */
+ for (;
+ k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
+ k = bkey_prev_all(b, t, k)) {
+ if (bkey_cmp(uk.p, m->p) >= 0)
+ continue;
+
+ if (!extent_merge_one_overlapping(iter, new_pos,
+ t, k, uk, check, could_pack))
+ return false;
+ }
+ } else {
+ /* Front merge - walk forwards */
+ for (;
+ k != btree_bkey_last(b, t) &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(uk.p, m->p) < 0);
+ k = bkey_next(k)) {
+ if (bkey_cmp(uk.p,
+ bkey_start_pos(m)) <= 0)
+ continue;
+
+ if (!extent_merge_one_overlapping(iter, new_pos,
+ t, k, uk, check, could_pack))
+ return false;
+ }
+ }
+ }
+
+ if (check) {
+ check = false;
+ goto do_fixup;
+ }
+
+ return true;
+}
+
+/*
+ * When merging an extent that we're inserting into a btree node, the new merged
+ * extent could overlap with an existing 0 size extent - if we don't fix that,
+ * it'll break the btree node iterator so this code finds those 0 size extents
+ * and shifts them out of the way.
+ *
+ * Also unpacks and repacks.
+ */
+static bool bch_extent_merge_inline(struct cache_set *c,
+ struct btree_iter *iter,
+ struct bkey_packed *l,
+ struct bkey_packed *r,
+ bool back_merge)
+{
+ struct btree *b = iter->nodes[0];
+ struct btree_node_iter *node_iter = &iter->node_iters[0];
+ const struct bkey_format *f = &b->format;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed *m;
+ BKEY_PADDED(k) li;
+ BKEY_PADDED(k) ri;
+ struct bkey_i *mi;
+ struct bkey tmp;
+
+ /*
+ * We need to save copies of both l and r, because we might get a
+	 * partial merge (which modifies both) and then fail to repack
+ */
+ bkey_unpack(b, &li.k, l);
+ bkey_unpack(b, &ri.k, r);
+
+ m = back_merge ? l : r;
+ mi = back_merge ? &li.k : &ri.k;
+
+ /* l & r should be in last bset: */
+ EBUG_ON(bch_bkey_to_bset(b, m) != t);
+
+ switch (bch_extent_merge(c, b, &li.k, &ri.k)) {
+ case BCH_MERGE_NOMERGE:
+ return false;
+ case BCH_MERGE_PARTIAL:
+ if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &mi->k, f))
+ return false;
+
+ if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+ return false;
+
+ extent_i_save(b, m, mi);
+ bch_bset_fix_invalidated_key(b, t, m);
+
+ /*
+ * Update iterator to reflect what we just inserted - otherwise,
+ * the iter_fix() call is going to put us _before_ the key we
+ * just partially merged with:
+ */
+ if (back_merge)
+ bch_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
+
+ bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
+ t, m, m->u64s, m->u64s);
+
+ if (!back_merge)
+ bkey_copy(packed_to_bkey(l), &li.k);
+ else
+ bkey_copy(packed_to_bkey(r), &ri.k);
+ return false;
+ case BCH_MERGE_MERGE:
+ if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &li.k.k, f))
+ return false;
+
+ if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+ return false;
+
+ extent_i_save(b, m, &li.k);
+ bch_bset_fix_invalidated_key(b, t, m);
+
+ bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
+ t, m, m->u64s, m->u64s);
+ return true;
+ default:
+ BUG();
+ }
+}
+
+const struct bkey_ops bch_bkey_extent_ops = {
+ .key_invalid = bch_extent_invalid,
+ .key_debugcheck = bch_extent_debugcheck,
+ .val_to_text = bch_extent_to_text,
+ .swab = bch_ptr_swab,
+ .key_normalize = bch_ptr_normalize,
+ .key_merge = bch_extent_merge,
+ .is_extents = true,
+};
diff --git a/libbcache/extents.h b/libbcache/extents.h
new file mode 100644
index 0000000..2dc6446
--- /dev/null
+++ b/libbcache/extents.h
@@ -0,0 +1,494 @@
+#ifndef _BCACHE_EXTENTS_H
+#define _BCACHE_EXTENTS_H
+
+#include "bkey.h"
+
+#include <linux/bcache.h>
+
+struct bch_replace_info;
+union bch_extent_crc;
+struct btree_iter;
+struct btree_insert;
+struct btree_insert_entry;
+
+struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *,
+ struct btree *,
+ struct btree_node_iter *);
+struct btree_nr_keys bch_extent_sort_fix_overlapping(struct cache_set *c,
+ struct bset *,
+ struct btree *,
+ struct btree_node_iter *);
+
+extern const struct bkey_ops bch_bkey_btree_ops;
+extern const struct bkey_ops bch_bkey_extent_ops;
+
+struct cache_set;
+struct journal_res;
+
+struct extent_pick_ptr {
+ struct bch_extent_crc64 crc;
+ struct bch_extent_ptr ptr;
+ struct cache *ca;
+};
+
+struct extent_pick_ptr
+bch_btree_pick_ptr(struct cache_set *, const struct btree *);
+
+void bch_extent_pick_ptr_avoiding(struct cache_set *, struct bkey_s_c,
+ struct cache *, struct extent_pick_ptr *);
+
+static inline void
+bch_extent_pick_ptr(struct cache_set *c, struct bkey_s_c k,
+ struct extent_pick_ptr *ret)
+{
+ bch_extent_pick_ptr_avoiding(c, k, NULL, ret);
+}
+
+enum extent_insert_hook_ret
+bch_extent_cmpxchg(struct extent_insert_hook *, struct bpos, struct bpos,
+ struct bkey_s_c, const struct bkey_i *);
+
+enum btree_insert_ret
+bch_insert_fixup_extent(struct btree_insert *,
+ struct btree_insert_entry *);
+
+bool bch_extent_normalize(struct cache_set *, struct bkey_s);
+
+unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent,
+ const struct bch_extent_ptr *);
+unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent);
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+ switch (k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ case BCH_RESERVATION:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_cached(const struct bkey *k)
+{
+ return k->type == BCH_EXTENT_CACHED;
+}
+
+static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
+{
+ EBUG_ON(k->type != BCH_EXTENT &&
+ k->type != BCH_EXTENT_CACHED);
+
+ k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
+}
+
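+/*
+ * Note: extent entries encode their type as a single set bit in the type field
+ * (see e.g. extent_ptr_append() below, which sets 1 << BCH_EXTENT_ENTRY_ptr),
+ * so __ffs() on the type field recovers the bch_extent_entry_type enum:
+ */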
+static inline unsigned
+__extent_entry_type(const union bch_extent_entry *e)
+{
+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
+}
+
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+ int ret = __ffs(e->type);
+
+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
+
+ return ret;
+}
+
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
+{
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return sizeof(struct bch_extent_crc32);
+ case BCH_EXTENT_ENTRY_crc64:
+ return sizeof(struct bch_extent_crc64);
+ case BCH_EXTENT_ENTRY_ptr:
+ return sizeof(struct bch_extent_ptr);
+ default:
+ BUG();
+ }
+}
+
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
+{
+ return extent_entry_bytes(entry) / sizeof(u64);
+}
+
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+ return !extent_entry_is_ptr(e);
+}
+
+union bch_extent_crc {
+ u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+};
+
+/* downcast, preserves const */
+#define to_entry(_entry) \
+({ \
+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
+ !type_is(_entry, struct bch_extent_ptr *)); \
+ \
+ __builtin_choose_expr( \
+ (type_is_exact(_entry, const union bch_extent_crc *) || \
+ type_is_exact(_entry, const struct bch_extent_ptr *)), \
+ (const union bch_extent_entry *) (_entry), \
+ (union bch_extent_entry *) (_entry)); \
+})
+
+#define __entry_to_crc(_entry) \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const union bch_extent_crc *) (_entry), \
+ (union bch_extent_crc *) (_entry))
+
+#define entry_to_crc(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
+ \
+ __entry_to_crc(_entry); \
+})
+
+#define entry_to_ptr(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
+ \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const struct bch_extent_ptr *) (_entry), \
+ (struct bch_extent_ptr *) (_entry)); \
+})
+
+enum bch_extent_crc_type {
+ BCH_EXTENT_CRC_NONE,
+ BCH_EXTENT_CRC32,
+ BCH_EXTENT_CRC64,
+};
+
+static inline enum bch_extent_crc_type
+extent_crc_type(const union bch_extent_crc *crc)
+{
+ if (!crc)
+ return BCH_EXTENT_CRC_NONE;
+
+ switch (extent_entry_type(to_entry(crc))) {
+ case BCH_EXTENT_ENTRY_crc32:
+ return BCH_EXTENT_CRC32;
+ case BCH_EXTENT_ENTRY_crc64:
+ return BCH_EXTENT_CRC64;
+ default:
+ BUG();
+ }
+}
+
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+#define extent_entry_last(_e) \
+ bkey_idx((_e).v, bkey_val_u64s((_e).k))
+
+/* Iterate over all entries: */
+
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ for ((_entry) = _start; \
+ (_entry) < extent_entry_last(_e); \
+ (_entry) = extent_entry_next(_entry))
+
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+/* Iterate over crcs only: */
+
+#define extent_crc_next(_e, _p) \
+({ \
+ typeof(&(_e).v->start[0]) _entry = _p; \
+ \
+ while ((_entry) < extent_entry_last(_e) && \
+ !extent_entry_is_crc(_entry)) \
+ (_entry) = extent_entry_next(_entry); \
+ \
+ entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
+})
+
+#define extent_for_each_crc(_e, _crc) \
+ for ((_crc) = extent_crc_next(_e, (_e).v->start); \
+ (_crc); \
+ (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
+
+/* Iterate over pointers, with crcs: */
+
+#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
+({ \
+ __label__ out; \
+ typeof(&(_e).v->start[0]) _entry; \
+ \
+ extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
+ if (extent_entry_is_crc(_entry)) { \
+ (_crc) = entry_to_crc(_entry); \
+ } else { \
+ _ptr = entry_to_ptr(_entry); \
+ if (_filter) \
+ goto out; \
+ } \
+ \
+ _ptr = NULL; \
+out: \
+ _ptr; \
+})
+
+#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
+ for ((_crc) = NULL, \
+ (_ptr) = &(_e).v->start->ptr; \
+ ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
+ (_ptr)++)
+
+#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
+
+#define extent_for_each_online_device_crc(_c, _e, _crc, _ptr, _ca) \
+ extent_for_each_ptr_crc_filter(_e, _ptr, _crc, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+/* Iterate over pointers only, and from a given position: */
+
+#define extent_ptr_next_filter(_e, _ptr, _filter) \
+({ \
+ typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \
+ \
+ extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
+})
+
+#define extent_ptr_next(_e, _ptr) \
+ extent_ptr_next_filter(_e, _ptr, true)
+
+#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter) \
+ for ((_ptr) = (_start); \
+ ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
+ (_ptr)++)
+
+#define extent_for_each_ptr_from(_e, _ptr, _start) \
+ extent_for_each_ptr_from_filter(_e, _ptr, _start, true)
+
+#define extent_for_each_ptr(_e, _ptr) \
+ extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, true)
+
+#define extent_for_each_online_device(_c, _e, _ptr, _ca) \
+ extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, \
+ ((_ca) = PTR_CACHE(_c, _ptr)))
+
+#define extent_ptr_prev(_e, _ptr) \
+({ \
+ typeof(&(_e).v->start->ptr) _p; \
+ typeof(&(_e).v->start->ptr) _prev = NULL; \
+ \
+ extent_for_each_ptr(_e, _p) { \
+ if (_p == (_ptr)) \
+ break; \
+ _prev = _p; \
+ } \
+ \
+ _prev; \
+})
+
+/*
+ * Use this when you'll be dropping pointers as you iterate. Quadratic,
+ * unfortunately:
+ */
+#define extent_for_each_ptr_backwards(_e, _ptr) \
+ for ((_ptr) = extent_ptr_prev(_e, NULL); \
+ (_ptr); \
+ (_ptr) = extent_ptr_prev(_e, _ptr))
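+
+/*
+ * A minimal usage sketch (illustrative only, not built): counting the pointers
+ * in an extent with the iteration macros above - equivalent in spirit to
+ * bch_extent_nr_ptrs():
+ */
+#if 0
+static unsigned example_count_ptrs(struct bkey_s_c_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr = 0;
+
+ extent_for_each_ptr(e, ptr)
+ nr++;
+
+ return nr;
+}
+#endif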
+
+void bch_extent_entry_append(struct bkey_i_extent *, union bch_extent_entry *);
+void bch_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
+ unsigned, u64, unsigned);
+
+static inline void extent_ptr_append(struct bkey_i_extent *e,
+ struct bch_extent_ptr ptr)
+{
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ bch_extent_entry_append(e, to_entry(&ptr));
+}
+
+/* XXX: inefficient */
+static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ if (bkey_extent_is_cached(e.k))
+ return false;
+
+ /* Dirty pointers come last */
+ return bch_extent_nr_ptrs_from(e, ptr) <= c->opts.data_replicas;
+}
+
+extern const unsigned bch_crc_size[];
+
+static inline struct bch_extent_crc64 crc_to_64(const struct bkey *k,
+ const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return (struct bch_extent_crc64) {
+ .compressed_size = k->size,
+ .uncompressed_size = k->size,
+ };
+ case BCH_EXTENT_CRC32:
+ return (struct bch_extent_crc64) {
+ .compressed_size = crc->crc32.compressed_size,
+ .uncompressed_size = crc->crc32.uncompressed_size,
+ .offset = crc->crc32.offset,
+ .csum_type = crc->crc32.csum_type,
+ .compression_type = crc->crc32.compression_type,
+ .csum = crc->crc32.csum,
+ };
+ case BCH_EXTENT_CRC64:
+ return crc->crc64;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned crc_compressed_size(const struct bkey *k,
+ const union bch_extent_crc *crc)
+{
+ return crc_to_64(k, crc).compressed_size;
+}
+
+static inline unsigned crc_uncompressed_size(const struct bkey *k,
+ const union bch_extent_crc *crc)
+{
+ return crc_to_64(k, crc).uncompressed_size;
+}
+
+static inline unsigned crc_offset(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.offset;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.offset;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.csum_type;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.csum_type;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.compression_type;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.compression_type;
+ default:
+ BUG();
+ }
+}
+
+static inline u64 crc_csum(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return 0;
+ case BCH_EXTENT_CRC32:
+ return crc->crc32.csum;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.csum;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_extent e;
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ unsigned ret = 0;
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (bch_extent_ptr_is_dirty(c, e, ptr) &&
+ crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
+ crc_compressed_size(e.k, crc) < k.k->size)
+ ret = max_t(unsigned, ret,
+ crc_compressed_size(e.k, crc));
+ }
+
+ return ret;
+}
+
+void bch_extent_narrow_crcs(struct bkey_s_extent);
+void bch_extent_drop_redundant_crcs(struct bkey_s_extent);
+
+/* Doesn't cleanup redundant crcs */
+static inline void __bch_extent_drop_ptr(struct bkey_s_extent e,
+ struct bch_extent_ptr *ptr)
+{
+ EBUG_ON(ptr < &e.v->start->ptr ||
+ ptr >= &extent_entry_last(e)->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+ memmove_u64s_down(ptr, ptr + 1,
+ (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
+ e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+static inline void bch_extent_drop_ptr(struct bkey_s_extent e,
+ struct bch_extent_ptr *ptr)
+{
+ __bch_extent_drop_ptr(e, ptr);
+ bch_extent_drop_redundant_crcs(e);
+}
+
+bool bch_extent_has_device(struct bkey_s_c_extent, unsigned);
+
+bool bch_cut_front(struct bpos, struct bkey_i *);
+bool bch_cut_back(struct bpos, struct bkey *);
+void bch_key_resize(struct bkey *, unsigned);
+
+#endif /* _BCACHE_EXTENTS_H */
diff --git a/libbcache/eytzinger.h b/libbcache/eytzinger.h
new file mode 100644
index 0000000..13d54e5
--- /dev/null
+++ b/libbcache/eytzinger.h
@@ -0,0 +1,196 @@
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
+
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "util.h"
+
+/*
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
+ * array
+ *
+ * We use one based indexing, not zero based: with one based indexing, each
+ * level of the tree starts at a power of two - leading to better alignment -
+ * and it's what you want for implementing next/prev and to/from inorder.
+ *
+ * To/from inorder also uses 1 based indexing.
+ *
+ * Size parameter is treated as if we were using 0 based indexing, however:
+ * valid nodes, and inorder indices, are in the range [1..size)
+ */
+
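+/*
+ * For example (illustrative only): with size 8, the valid node indices are
+ * [1..8), laid out level by level as 1 / 2 3 / 4 5 6 7. So
+ * eytzinger_left_child(2) == 4, eytzinger_right_child(3) == 7, and an inorder
+ * walk of the node indices visits 4 2 5 1 6 3 7.
+ */
+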
+static inline unsigned eytzinger_child(unsigned j, unsigned child)
+{
+ EBUG_ON(child > 1);
+
+ return (j << 1) + child;
+}
+
+static inline unsigned eytzinger_left_child(unsigned j)
+{
+ return eytzinger_child(j, 0);
+}
+
+static inline unsigned eytzinger_right_child(unsigned j)
+{
+ return eytzinger_child(j, 1);
+}
+
+static inline unsigned eytzinger_first(unsigned size)
+{
+ return rounddown_pow_of_two(size - 1);
+}
+
+static inline unsigned eytzinger_last(unsigned size)
+{
+ return rounddown_pow_of_two(size) - 1;
+}
+
+/*
+ * eytzinger_next() and eytzinger_prev() have the nice properties that
+ *
+ * eytzinger_next(0) == eytzinger_first()
+ * eytzinger_prev(0) == eytzinger_last()
+ *
+ * eytzinger_prev(eytzinger_first()) == 0
+ * eytzinger_next(eytzinger_last()) == 0
+ */
+
+static inline unsigned eytzinger_next(unsigned j, unsigned size)
+{
+ EBUG_ON(j >= size);
+
+ if (eytzinger_right_child(j) < size) {
+ j = eytzinger_right_child(j);
+
+ j <<= __fls(size) - __fls(j);
+ j >>= j >= size;
+ } else {
+ j >>= ffz(j) + 1;
+ }
+
+ return j;
+}
+
+static inline unsigned eytzinger_prev(unsigned j, unsigned size)
+{
+ EBUG_ON(j >= size);
+
+ if (eytzinger_left_child(j) < size) {
+ j = eytzinger_left_child(j);
+
+ j <<= __fls(size) - __fls(j);
+ j -= 1;
+ j >>= j >= size;
+ } else {
+ j >>= __ffs(j) + 1;
+ }
+
+ return j;
+}
+
+static inline unsigned eytzinger_extra(unsigned size)
+{
+ return (size - rounddown_pow_of_two(size - 1)) << 1;
+}
+
+static inline unsigned __eytzinger_to_inorder(unsigned j, unsigned size,
+ unsigned extra)
+{
+ unsigned b = __fls(j);
+ unsigned shift = __fls(size - 1) - b;
+ int s;
+
+ EBUG_ON(!j || j >= size);
+
+ j ^= 1U << b;
+ j <<= 1;
+ j |= 1;
+ j <<= shift;
+
+ /*
+ * sign bit trick:
+ *
+ * if (j > extra)
+ * j -= (j - extra) >> 1;
+ */
+ s = extra - j;
+ j += (s >> 1) & (s >> 31);
+
+ return j;
+}
+
+static inline unsigned __inorder_to_eytzinger(unsigned j, unsigned size,
+ unsigned extra)
+{
+ unsigned shift;
+ int s;
+
+ EBUG_ON(!j || j >= size);
+
+ /*
+ * sign bit trick:
+ *
+ * if (j > extra)
+ * j += j - extra;
+ */
+ s = extra - j;
+ j -= s & (s >> 31);
+
+ shift = __ffs(j);
+
+ j >>= shift + 1;
+ j |= 1U << (__fls(size - 1) - shift);
+
+ return j;
+}
+
+static inline unsigned eytzinger_to_inorder(unsigned j, unsigned size)
+{
+ return __eytzinger_to_inorder(j, size, eytzinger_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger(unsigned j, unsigned size)
+{
+ return __inorder_to_eytzinger(j, size, eytzinger_extra(size));
+}
+
+#define eytzinger_for_each(_i, _size) \
+ for ((_i) = eytzinger_first((_size)); \
+ (_i) != 0; \
+ (_i) = eytzinger_next((_i), (_size)))
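+
+/*
+ * A minimal usage sketch (illustrative only, not built): summing an array laid
+ * out in eytzinger order. @tree is assumed to have valid entries at indices
+ * [1..size):
+ */
+#if 0
+static u64 eytzinger_sum(const u64 *tree, unsigned size)
+{
+ unsigned i;
+ u64 sum = 0;
+
+ eytzinger_for_each(i, size)
+ sum += tree[i];
+
+ return sum;
+}
+#endif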
+
+#if 0
+void eytzinger_test(void)
+{
+ unsigned i, j, size;
+
+ for (size = 2;
+ size < 65536000;
+ size++) {
+ if (!(size % 4096))
+ printk(KERN_INFO "tree size %u\n", size);
+
+ assert(eytzinger_prev(0, size) == eytzinger_last(size));
+ assert(eytzinger_next(0, size) == eytzinger_first(size));
+
+ assert(eytzinger_prev(eytzinger_first(size), size) == 0);
+ assert(eytzinger_next(eytzinger_last(size), size) == 0);
+
+ eytzinger_for_each(j, size) {
+ i = eytzinger_to_inorder(j, size);
+ assert(inorder_to_eytzinger(i, size) == j);
+
+ if (j != eytzinger_last(size)) {
+ unsigned next = eytzinger_next(j, size);
+
+ assert(eytzinger_prev(next, size) == j);
+ }
+ }
+ }
+
+}
+#endif
+
+#endif /* _EYTZINGER_H */
diff --git a/libbcache/fifo.h b/libbcache/fifo.h
new file mode 100644
index 0000000..2908ca2
--- /dev/null
+++ b/libbcache/fifo.h
@@ -0,0 +1,123 @@
+#ifndef _BCACHE_FIFO_H
+#define _BCACHE_FIFO_H
+
+#define DECLARE_FIFO(type, name) \
+ struct { \
+ size_t front, back, size, mask; \
+ type *data; \
+ } name
+
+#define init_fifo(fifo, _size, _gfp) \
+({ \
+ bool _ret = true; \
+ gfp_t gfp_flags = (_gfp); \
+ \
+ if (gfp_flags & GFP_KERNEL) \
+ gfp_flags |= __GFP_NOWARN; \
+ \
+ (fifo)->size = (_size); \
+ (fifo)->front = (fifo)->back = 0; \
+ (fifo)->data = NULL; \
+ \
+ if ((fifo)->size) { \
+ size_t _allocated_size, _bytes; \
+ \
+ _allocated_size = roundup_pow_of_two((fifo)->size); \
+ _bytes = _allocated_size * sizeof(*(fifo)->data); \
+ \
+ (fifo)->mask = _allocated_size - 1; \
+ \
+ if (_bytes < KMALLOC_MAX_SIZE) \
+ (fifo)->data = kmalloc(_bytes, gfp_flags); \
+ if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL)) \
+ (fifo)->data = vmalloc(_bytes); \
+ if ((!(fifo)->data)) \
+ _ret = false; \
+ } \
+ _ret; \
+})
+
+#define free_fifo(fifo) \
+do { \
+ kvfree((fifo)->data); \
+ (fifo)->data = NULL; \
+} while (0)
+
+#define fifo_swap(l, r) \
+do { \
+ swap((l)->front, (r)->front); \
+ swap((l)->back, (r)->back); \
+ swap((l)->size, (r)->size); \
+ swap((l)->mask, (r)->mask); \
+ swap((l)->data, (r)->data); \
+} while (0)
+
+#define fifo_move(dest, src) \
+do { \
+ typeof(*((dest)->data)) _t; \
+ while (!fifo_full(dest) && \
+ fifo_pop(src, _t)) \
+ fifo_push(dest, _t); \
+} while (0)
+
+#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
+
+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+
+#define fifo_push_back(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) \
+ (fifo)->data[(fifo)->back++ & (fifo)->mask] = (i); \
+ _r; \
+})
+
+#define fifo_pop_front(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
+ _r; \
+})
+
+#define fifo_push_front(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) \
+ (fifo)->data[--(fifo)->front & (fifo)->mask] = (i); \
+ _r; \
+})
+
+#define fifo_pop_back(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
+ _r; \
+})
+
+#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
+#define fifo_peek(fifo) fifo_peek_front(fifo)
+
+#define fifo_for_each_entry(_entry, _fifo, _iter) \
+ for (_iter = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ _iter++)
+
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
+ for (_iter = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ _iter++)
+
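+/*
+ * A minimal usage sketch (illustrative only, not built) of the FIFO macros
+ * above; the names here are hypothetical:
+ */
+#if 0
+static u64 example_fifo(void)
+{
+ DECLARE_FIFO(u64, fifo);
+ u64 v, sum = 0;
+
+ if (!init_fifo(&fifo, 16, GFP_KERNEL))
+ return 0;
+
+ fifo_push(&fifo, 1);
+ fifo_push(&fifo, 2);
+
+ while (fifo_pop(&fifo, v))
+ sum += v;
+
+ free_fifo(&fifo);
+ return sum;
+}
+#endif
+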
+#endif /* _BCACHE_FIFO_H */
+
diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c
new file mode 100644
index 0000000..bd2a867
--- /dev/null
+++ b/libbcache/fs-gc.c
@@ -0,0 +1,475 @@
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "inode.h"
+#include "keylist.h"
+#include "super.h"
+
+#include <linux/generic-radix-tree.h>
+
+struct nlink {
+ u32 count;
+ u32 dir_count;
+};
+
+DECLARE_GENRADIX_TYPE(nlinks, struct nlink);
+
+static void inc_link(struct cache_set *c, struct nlinks *links,
+ u64 range_start, u64 *range_end,
+ u64 inum, bool dir)
+{
+ struct nlink *link;
+
+ if (inum < range_start || inum >= *range_end)
+ return;
+
+ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
+ if (!link) {
+ bch_verbose(c, "allocation failed during fs gc - will need another pass");
+ *range_end = inum;
+ return;
+ }
+
+ if (dir)
+ link->dir_count++;
+ else
+ link->count++;
+}
+
+/*
+ * XXX: should do a DFS (via filesystem hierarchy), and make sure all dirents
+ * are reachable
+ */
+
+noinline_for_stack
+static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links,
+ u64 range_start, u64 *range_end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u64 d_inum;
+ int ret;
+
+ inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) {
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ d = bkey_s_c_to_dirent(k);
+ d_inum = le64_to_cpu(d.v->d_inum);
+
+ if (d.v->d_type == DT_DIR)
+ inc_link(c, links, range_start, range_end,
+ d.k->p.inode, true);
+
+ inc_link(c, links, range_start, range_end,
+ d_inum, false);
+
+ break;
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ bch_err(c, "error in fs gc: btree error %i while walking dirents", ret);
+
+ return ret;
+}
+
+s64 bch_count_inode_sectors(struct cache_set *c, u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 sectors = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) {
+ if (k.k->p.inode != inum)
+ break;
+
+ if (bkey_extent_is_allocation(k.k))
+ sectors += k.k->size;
+ }
+
+ return bch_btree_iter_unlock(&iter) ?: sectors;
+}
+
+static int bch_gc_do_inode(struct cache_set *c, struct btree_iter *iter,
+ struct bkey_s_c_inode inode, struct nlink link)
+{
+ u16 i_mode = le16_to_cpu(inode.v->i_mode);
+ u32 i_flags = le32_to_cpu(inode.v->i_flags);
+ u32 i_nlink = le32_to_cpu(inode.v->i_nlink);
+ u64 i_size = le64_to_cpu(inode.v->i_size);
+ s64 i_sectors = 0;
+ int ret = 0;
+ u32 real_i_nlink;
+
+ fsck_err_on(i_nlink < link.count, c,
+ "inode %llu i_link too small (%u < %u, type %i)",
+ inode.k->p.inode, i_nlink,
+ link.count, mode_to_type(i_mode));
+
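+ /*
+ * Expected link count: a directory gets one link from the dirent in
+ * its parent plus one for ".", and one ".." per subdirectory
+ * (dir_count); since a directory may only have a single hardlink
+ * (checked below), that works out to link.count * 2 + link.dir_count.
+ * Anything else just gets one link per dirent pointing at it:
+ */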
+ if (S_ISDIR(i_mode)) {
+ unfixable_fsck_err_on(link.count > 1, c,
+ "directory %llu with multiple hardlinks: %u",
+ inode.k->p.inode, link.count);
+
+ real_i_nlink = link.count * 2 + link.dir_count;
+ } else {
+ unfixable_fsck_err_on(link.dir_count, c,
+ "found dirents for non directory %llu",
+ inode.k->p.inode);
+
+ real_i_nlink = link.count + link.dir_count;
+ }
+
+ if (!link.count) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but found orphaned inode %llu",
+ inode.k->p.inode);
+
+ unfixable_fsck_err_on(S_ISDIR(i_mode) &&
+ bch_empty_dir(c, inode.k->p.inode), c,
+ "non empty directory with link count 0, "
+ "inode nlink %u, dir links found %u",
+ i_nlink, link.dir_count);
+
+ bch_verbose(c, "deleting inode %llu", inode.k->p.inode);
+
+ ret = bch_inode_rm(c, inode.k->p.inode);
+ if (ret)
+ bch_err(c, "error in fs gc: error %i "
+ "while deleting inode", ret);
+ return ret;
+ }
+
+ if (i_flags & BCH_INODE_I_SIZE_DIRTY) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but inode %llu has i_size dirty",
+ inode.k->p.inode);
+
+ bch_verbose(c, "truncating inode %llu", inode.k->p.inode);
+
+ /*
+ * XXX: need to truncate partial blocks too here - or ideally
+ * just switch units to bytes and that issue goes away
+ */
+
+ ret = bch_inode_truncate(c, inode.k->p.inode,
+ round_up(i_size, PAGE_SIZE) >> 9,
+ NULL, NULL);
+ if (ret) {
+ bch_err(c, "error in fs gc: error %i "
+ "truncating inode", ret);
+ return ret;
+ }
+
+ /*
+ * We truncated without our normal sector accounting hook, just
+ * make sure we recalculate it:
+ */
+ i_flags |= BCH_INODE_I_SECTORS_DIRTY;
+ }
+
+ if (i_flags & BCH_INODE_I_SECTORS_DIRTY) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but inode %llu has i_sectors dirty",
+ inode.k->p.inode);
+
+ bch_verbose(c, "recounting sectors for inode %llu",
+ inode.k->p.inode);
+
+ i_sectors = bch_count_inode_sectors(c, inode.k->p.inode);
+ if (i_sectors < 0) {
+ bch_err(c, "error in fs gc: error %i "
+ "recounting inode sectors",
+ (int) i_sectors);
+ return i_sectors;
+ }
+ }
+
+ if (i_nlink != real_i_nlink) {
+ fsck_err_on(c->sb.clean, c,
+ "filesystem marked clean, "
+ "but inode %llu has wrong i_nlink "
+ "(type %u i_nlink %u, should be %u)",
+ inode.k->p.inode, mode_to_type(i_mode),
+ i_nlink, real_i_nlink);
+
+ bch_verbose(c, "setting inode %llu nlinks from %u to %u",
+ inode.k->p.inode, i_nlink, real_i_nlink);
+ }
+
+ if (i_nlink != real_i_nlink ||
+ i_flags & BCH_INODE_I_SECTORS_DIRTY ||
+ i_flags & BCH_INODE_I_SIZE_DIRTY) {
+ struct bkey_i_inode update;
+
+ bkey_reassemble(&update.k_i, inode.s_c);
+ update.v.i_nlink = cpu_to_le32(real_i_nlink);
+ update.v.i_flags = cpu_to_le32(i_flags &
+ ~(BCH_INODE_I_SIZE_DIRTY|
+ BCH_INODE_I_SECTORS_DIRTY));
+
+ if (i_flags & BCH_INODE_I_SECTORS_DIRTY)
+ update.v.i_sectors = cpu_to_le64(i_sectors);
+
+ ret = bch_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(iter, &update.k_i));
+ if (ret && ret != -EINTR)
+ bch_err(c, "error in fs gc: error %i "
+ "updating inode", ret);
+ }
+fsck_err:
+ return ret;
+}
+
+noinline_for_stack
+static int bch_gc_walk_inodes(struct cache_set *c, struct nlinks *links,
+ u64 range_start, u64 range_end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct nlink *link, zero_links = { 0, 0 };
+ struct genradix_iter nlinks_iter;
+ int ret = 0, ret2 = 0;
+ u64 nlinks_pos;
+
+ bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0));
+ genradix_iter_init(&nlinks_iter);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
+peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
+
+ if (!link && (!k.k || iter.pos.inode >= range_end))
+ break;
+
+ nlinks_pos = range_start + nlinks_iter.pos;
+ if (iter.pos.inode > nlinks_pos) {
+ unfixable_fsck_err_on(link && link->count, c,
+ "missing inode %llu (nlink %u)",
+ nlinks_pos, link->count);
+ genradix_iter_advance(&nlinks_iter, links);
+ goto peek_nlinks;
+ }
+
+ if (iter.pos.inode < nlinks_pos || !link)
+ link = &zero_links;
+
+ if (k.k && k.k->type == BCH_INODE_FS) {
+ /*
+ * Avoid potential deadlocks with iter for
+ * truncate/rm/etc.:
+ */
+ bch_btree_iter_unlock(&iter);
+
+ ret = bch_gc_do_inode(c, &iter,
+ bkey_s_c_to_inode(k),
+ *link);
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
+
+ if (link->count)
+ atomic_long_inc(&c->nr_inodes);
+ } else {
+ unfixable_fsck_err_on(link->count, c,
+ "missing inode %llu (nlink %u)",
+ nlinks_pos, link->count);
+ }
+
+ if (nlinks_pos == iter.pos.inode)
+ genradix_iter_advance(&nlinks_iter, links);
+
+ bch_btree_iter_advance_pos(&iter);
+ bch_btree_iter_cond_resched(&iter);
+ }
+fsck_err:
+ ret2 = bch_btree_iter_unlock(&iter);
+ if (ret2)
+ bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2);
+
+ return ret ?: ret2;
+}
+
+int bch_gc_inode_nlinks(struct cache_set *c)
+{
+ struct nlinks links;
+ u64 this_iter_range_start, next_iter_range_start = 0;
+ int ret = 0;
+
+ genradix_init(&links);
+
+ do {
+ this_iter_range_start = next_iter_range_start;
+ next_iter_range_start = U64_MAX;
+
+ ret = bch_gc_walk_dirents(c, &links,
+ this_iter_range_start,
+ &next_iter_range_start);
+ if (ret)
+ break;
+
+ ret = bch_gc_walk_inodes(c, &links,
+ this_iter_range_start,
+ next_iter_range_start);
+ if (ret)
+ break;
+
+ genradix_free(&links);
+ } while (next_iter_range_start != U64_MAX);
+
+ genradix_free(&links);
+
+ return ret;
+}
+
+static void next_inode(struct cache_set *c, u64 inum, u64 *cur_inum,
+ struct bkey_i_inode *inode,
+ bool *first_this_inode, bool *have_inode,
+ u64 *i_size, u16 *i_mode)
+{
+ *first_this_inode = inum != *cur_inum;
+ *cur_inum = inum;
+
+ if (*first_this_inode) {
+ *have_inode = !bch_inode_find_by_inum(c, inum, inode);
+
+ if (*have_inode) {
+ *i_mode = le16_to_cpu(inode->v.i_mode);
+ *i_size = le64_to_cpu(inode->v.i_size);
+ }
+ }
+}
+
+/*
+ * Checks for inconsistencies that shouldn't happen, unless we have a bug.
+ * Doesn't fix them yet, mainly because they haven't yet been observed:
+ */
+int bch_fsck(struct cache_set *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_inode inode;
+ bool first_this_inode, have_inode;
+ u64 cur_inum, i_sectors;
+ u64 i_size = 0;
+ u16 i_mode = 0;
+ int ret = 0;
+
+ cur_inum = -1;
+ have_inode = false;
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ if (k.k->type == KEY_TYPE_DISCARD)
+ continue;
+
+ next_inode(c, k.k->p.inode, &cur_inum, &inode,
+ &first_this_inode, &have_inode,
+ &i_size, &i_mode);
+
+ unfixable_fsck_err_on(!have_inode, c,
+ "extent type %u for missing inode %llu",
+ k.k->type, k.k->p.inode);
+
+ unfixable_fsck_err_on(first_this_inode && have_inode &&
+ le64_to_cpu(inode.v.i_sectors) !=
+ (i_sectors = bch_count_inode_sectors(c, cur_inum)),
+ c, "i_sectors wrong: got %llu, should be %llu",
+ le64_to_cpu(inode.v.i_sectors), i_sectors);
+
+ unfixable_fsck_err_on(have_inode &&
+ !S_ISREG(i_mode) && !S_ISLNK(i_mode), c,
+ "extent type %u for non regular file, inode %llu mode %o",
+ k.k->type, k.k->p.inode, i_mode);
+
+ unfixable_fsck_err_on(k.k->type != BCH_RESERVATION &&
+ k.k->p.offset > round_up(i_size, PAGE_SIZE) >> 9, c,
+ "extent type %u offset %llu past end of inode %llu, i_size %llu",
+ k.k->type, k.k->p.offset, k.k->p.inode, i_size);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ cur_inum = -1;
+ have_inode = false;
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ struct bkey_s_c_dirent d;
+ struct bkey_i_inode target;
+ bool have_target;
+ u64 d_inum;
+
+ next_inode(c, k.k->p.inode, &cur_inum, &inode,
+ &first_this_inode, &have_inode,
+ &i_size, &i_mode);
+
+ unfixable_fsck_err_on(!have_inode, c,
+ "dirent in nonexisting directory %llu",
+ k.k->p.inode);
+
+ unfixable_fsck_err_on(!S_ISDIR(i_mode), c,
+ "dirent in non directory inode %llu, type %u",
+ k.k->p.inode, mode_to_type(i_mode));
+
+ if (k.k->type != BCH_DIRENT)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ d_inum = le64_to_cpu(d.v->d_inum);
+
+ unfixable_fsck_err_on(d_inum == d.k->p.inode, c,
+ "dirent points to own directory");
+
+ have_target = !bch_inode_find_by_inum(c, d_inum, &target);
+
+ unfixable_fsck_err_on(!have_target, c,
+ "dirent points to missing inode %llu, type %u filename %s",
+ d_inum, d.v->d_type, d.v->d_name);
+
+ unfixable_fsck_err_on(have_target &&
+ d.v->d_type !=
+ mode_to_type(le16_to_cpu(target.v.i_mode)), c,
+ "incorrect d_type: got %u should be %u, filename %s",
+ d.v->d_type,
+ mode_to_type(le16_to_cpu(target.v.i_mode)),
+ d.v->d_name);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ cur_inum = -1;
+ have_inode = false;
+ for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ next_inode(c, k.k->p.inode, &cur_inum, &inode,
+ &first_this_inode, &have_inode,
+ &i_size, &i_mode);
+
+ unfixable_fsck_err_on(!have_inode, c,
+ "xattr for missing inode %llu",
+ k.k->p.inode);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ return 0;
+fsck_err:
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
diff --git a/libbcache/fs-gc.h b/libbcache/fs-gc.h
new file mode 100644
index 0000000..c44086c
--- /dev/null
+++ b/libbcache/fs-gc.h
@@ -0,0 +1,8 @@
+#ifndef _BCACHE_FS_GC_H
+#define _BCACHE_FS_GC_H
+
+s64 bch_count_inode_sectors(struct cache_set *, u64);
+int bch_gc_inode_nlinks(struct cache_set *);
+int bch_fsck(struct cache_set *);
+
+#endif /* _BCACHE_FS_GC_H */
diff --git a/libbcache/fs-io.c b/libbcache/fs-io.c
new file mode 100644
index 0000000..942baeb
--- /dev/null
+++ b/libbcache/fs-io.c
@@ -0,0 +1,2457 @@
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "fs-io.h"
+#include "inode.h"
+#include "journal.h"
+#include "io.h"
+#include "keylist.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include <trace/events/writeback.h>
+
+struct bio_set *bch_writepage_bioset;
+struct bio_set *bch_dio_read_bioset;
+struct bio_set *bch_dio_write_bioset;
+
+/* pagecache_block must be held */
+static int write_invalidate_inode_pages_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * XXX: the way this is currently implemented, we can spin if a process
+ * is continually redirtying a specific page
+ */
+ do {
+ if (!mapping->nrpages &&
+ !mapping->nrexceptional)
+ return 0;
+
+ ret = filemap_write_and_wait_range(mapping, start, end);
+ if (ret)
+ break;
+
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ } while (ret == -EBUSY);
+
+ return ret;
+}
+
+/* i_size updates: */
+
+static int inode_set_size(struct bch_inode_info *ei, struct bch_inode *bi,
+ void *p)
+{
+ loff_t *new_i_size = p;
+ unsigned i_flags = le32_to_cpu(bi->i_flags);
+
+ lockdep_assert_held(&ei->update_lock);
+
+ bi->i_size = cpu_to_le64(*new_i_size);
+
+ if (atomic_long_read(&ei->i_size_dirty_count))
+ i_flags |= BCH_INODE_I_SIZE_DIRTY;
+ else
+ i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+
+ bi->i_flags = cpu_to_le32(i_flags);
+
+ return 0;
+}
+
+static int __must_check bch_write_inode_size(struct cache_set *c,
+ struct bch_inode_info *ei,
+ loff_t new_size)
+{
+ return __bch_write_inode(c, ei, inode_set_size, &new_size);
+}
+
+static inline void i_size_dirty_put(struct bch_inode_info *ei)
+{
+ atomic_long_dec_bug(&ei->i_size_dirty_count);
+}
+
+static inline void i_size_dirty_get(struct bch_inode_info *ei)
+{
+ lockdep_assert_held(&ei->vfs_inode.i_rwsem);
+
+ atomic_long_inc(&ei->i_size_dirty_count);
+}
+
+/* i_sectors accounting: */
+
+static enum extent_insert_hook_ret
+i_sectors_hook_fn(struct extent_insert_hook *hook,
+ struct bpos committed_pos,
+ struct bpos next_pos,
+ struct bkey_s_c k,
+ const struct bkey_i *insert)
+{
+ struct i_sectors_hook *h = container_of(hook,
+ struct i_sectors_hook, hook);
+ s64 sectors = next_pos.offset - committed_pos.offset;
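+ /*
+ * sign is +1 when inserting an allocation over a hole, -1 when a
+ * non-allocating key overwrites an allocation (e.g. hole punch), and 0
+ * when the allocation status doesn't change:
+ */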
+ int sign = bkey_extent_is_allocation(&insert->k) -
+ (k.k && bkey_extent_is_allocation(k.k));
+
+ EBUG_ON(!(h->ei->i_flags & BCH_INODE_I_SECTORS_DIRTY));
+ EBUG_ON(!atomic_long_read(&h->ei->i_sectors_dirty_count));
+
+ h->sectors += sectors * sign;
+
+ return BTREE_HOOK_DO_INSERT;
+}
+
+static int inode_set_i_sectors_dirty(struct bch_inode_info *ei,
+ struct bch_inode *bi, void *p)
+{
+ BUG_ON(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY);
+
+ bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags)|
+ BCH_INODE_I_SECTORS_DIRTY);
+ return 0;
+}
+
+static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei,
+ struct bch_inode *bi, void *p)
+{
+ BUG_ON(!(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY));
+
+ bi->i_sectors = cpu_to_le64(atomic64_read(&ei->i_sectors));
+ bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags) &
+ ~BCH_INODE_I_SECTORS_DIRTY);
+ return 0;
+}
+
+static void i_sectors_dirty_put(struct bch_inode_info *ei,
+ struct i_sectors_hook *h)
+{
+ struct inode *inode = &ei->vfs_inode;
+
+ if (h->sectors) {
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += h->sectors;
+ spin_unlock(&inode->i_lock);
+
+ atomic64_add(h->sectors, &ei->i_sectors);
+ EBUG_ON(atomic64_read(&ei->i_sectors) < 0);
+ }
+
+ EBUG_ON(atomic_long_read(&ei->i_sectors_dirty_count) <= 0);
+
+ mutex_lock(&ei->update_lock);
+
+ if (atomic_long_dec_and_test(&ei->i_sectors_dirty_count)) {
+ struct cache_set *c = ei->vfs_inode.i_sb->s_fs_info;
+ int ret = __bch_write_inode(c, ei, inode_clear_i_sectors_dirty, NULL);
+
+ ret = ret;
+ }
+
+ mutex_unlock(&ei->update_lock);
+}
+
+static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei,
+ struct i_sectors_hook *h)
+{
+ int ret = 0;
+
+ h->hook.fn = i_sectors_hook_fn;
+ h->sectors = 0;
+#ifdef CONFIG_BCACHE_DEBUG
+ h->ei = ei;
+#endif
+
+ if (atomic_long_inc_not_zero(&ei->i_sectors_dirty_count))
+ return 0;
+
+ mutex_lock(&ei->update_lock);
+
+ if (!(ei->i_flags & BCH_INODE_I_SECTORS_DIRTY)) {
+ struct cache_set *c = ei->vfs_inode.i_sb->s_fs_info;
+
+ ret = __bch_write_inode(c, ei, inode_set_i_sectors_dirty, NULL);
+ }
+
+ if (!ret)
+ atomic_long_inc(&ei->i_sectors_dirty_count);
+
+ mutex_unlock(&ei->update_lock);
+
+ return ret;
+}
+
+struct bchfs_extent_trans_hook {
+ struct bchfs_write_op *op;
+ struct extent_insert_hook hook;
+ struct bkey_i_inode new_inode;
+ bool need_inode_update;
+};
+
+static enum extent_insert_hook_ret
+bchfs_extent_update_hook(struct extent_insert_hook *hook,
+ struct bpos committed_pos,
+ struct bpos next_pos,
+ struct bkey_s_c k,
+ const struct bkey_i *insert)
+{
+ struct bchfs_extent_trans_hook *h = container_of(hook,
+ struct bchfs_extent_trans_hook, hook);
+ struct bch_inode_info *ei = h->op->ei;
+ struct inode *inode = &ei->vfs_inode;
+ int sign = bkey_extent_is_allocation(&insert->k) -
+ (k.k && bkey_extent_is_allocation(k.k));
+ s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
+ u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
+
+ BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
+
+ /* XXX: ei->i_size locking */
+ if (offset > ei->i_size) {
+ BUG_ON(ei->i_flags & BCH_INODE_I_SIZE_DIRTY);
+
+ if (!h->need_inode_update) {
+ h->need_inode_update = true;
+ return BTREE_HOOK_RESTART_TRANS;
+ }
+
+ h->new_inode.v.i_size = cpu_to_le64(offset);
+ ei->i_size = offset;
+
+ if (h->op->is_dio)
+ i_size_write(inode, offset);
+ }
+
+ if (sectors) {
+ if (!h->need_inode_update) {
+ h->need_inode_update = true;
+ return BTREE_HOOK_RESTART_TRANS;
+ }
+
+ le64_add_cpu(&h->new_inode.v.i_sectors, sectors);
+ atomic64_add(sectors, &ei->i_sectors);
+
+ h->op->sectors_added += sectors;
+
+ if (h->op->is_dio) {
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += sectors;
+ spin_unlock(&inode->i_lock);
+ }
+ }
+
+ return BTREE_HOOK_DO_INSERT;
+}
+
+static int bchfs_write_index_update(struct bch_write_op *wop)
+{
+ struct bchfs_write_op *op = container_of(wop,
+ struct bchfs_write_op, op);
+ struct keylist *keys = &op->op.insert_keys;
+ struct btree_iter extent_iter, inode_iter;
+ struct bchfs_extent_trans_hook hook;
+ struct bkey_i *k = bch_keylist_front(keys);
+ int ret;
+
+ BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino);
+
+ bch_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch_keylist_front(keys)->k));
+ bch_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES,
+ POS(extent_iter.pos.inode, 0));
+
+ hook.op = op;
+ hook.hook.fn = bchfs_extent_update_hook;
+ hook.need_inode_update = false;
+
+ do {
+ ret = bch_btree_iter_traverse(&extent_iter);
+ if (ret)
+ goto err;
+
+ /* XXX: ei->i_size locking */
+ k = bch_keylist_front(keys);
+ if (min(k->k.p.offset << 9, op->new_i_size) > op->ei->i_size)
+ hook.need_inode_update = true;
+
+ if (hook.need_inode_update) {
+ struct bkey_s_c inode;
+
+ if (!btree_iter_linked(&inode_iter))
+ bch_btree_iter_link(&extent_iter, &inode_iter);
+
+ inode = bch_btree_iter_peek_with_holes(&inode_iter);
+ if ((ret = btree_iter_err(inode)))
+ goto err;
+
+ if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
+ "inode %llu not found when updating",
+ extent_iter.pos.inode)) {
+ ret = -ENOENT;
+ break;
+ }
+
+ bkey_reassemble(&hook.new_inode.k_i, inode);
+
+ ret = bch_btree_insert_at(wop->c, &wop->res,
+ &hook.hook, op_journal_seq(wop),
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&extent_iter, k),
+ BTREE_INSERT_ENTRY(&inode_iter, &hook.new_inode.k_i));
+ } else {
+ ret = bch_btree_insert_at(wop->c, &wop->res,
+ &hook.hook, op_journal_seq(wop),
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&extent_iter, k));
+ }
+err:
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
+
+ bch_keylist_pop_front(keys);
+ } while (!bch_keylist_empty(keys));
+
+ bch_btree_iter_unlock(&extent_iter);
+ bch_btree_iter_unlock(&inode_iter);
+
+ return ret;
+}
+
+/* page state: */
+
+/* stored in page->private: */
+
+/*
+ * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could
+ * almost protect it with the page lock, except that bch_writepage_io_done has
+ * to update the sector counts (and does so from interrupt/bottom half context).
+ */
+struct bch_page_state {
+union { struct {
+ /*
+ * BCH_PAGE_ALLOCATED: page is _fully_ written on disk, and not
+ * compressed - which means to write this page we don't have to reserve
+ * space (the new write will never take up more space on disk than what
+ * it's overwriting)
+ *
+ * BCH_PAGE_UNALLOCATED: page is not fully written on disk, or is
+ * compressed - before writing we have to reserve space with
+ * bch_reserve_sectors()
+ *
+ * BCH_PAGE_RESERVED: page has space reserved on disk (reservation will
+ * be consumed when the page is written).
+ */
+ enum {
+ BCH_PAGE_UNALLOCATED = 0,
+ BCH_PAGE_ALLOCATED,
+ } alloc_state:2;
+
+ /* Owns PAGE_SECTORS sized reservation: */
+ unsigned reserved:1;
+
+ /*
+ * Number of sectors on disk - for i_blocks
+ * Uncompressed size, not compressed size:
+ */
+ u8 sectors;
+ u8 dirty_sectors;
+};
+ /* for cmpxchg: */
+ unsigned long v;
+};
+};
+
+#define page_state_cmpxchg(_ptr, _new, _expr) \
+({ \
+ unsigned long _v = READ_ONCE((_ptr)->v); \
+ struct bch_page_state _old; \
+ \
+ do { \
+ _old.v = _new.v = _v; \
+ _expr; \
+ \
+ EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\
+ } while (_old.v != _new.v && \
+ (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \
+ \
+ _old; \
+})
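+
+/*
+ * Note that the _expr body may bail out of the enclosing function (e.g. via
+ * return) to abandon the update - bch_put_page_reservation() below does
+ * exactly that when no reservation is held.
+ */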
+
+static inline struct bch_page_state *page_state(struct page *page)
+{
+ struct bch_page_state *s = (void *) &page->private;
+
+ BUILD_BUG_ON(sizeof(*s) > sizeof(page->private));
+
+ if (!PagePrivate(page))
+ SetPagePrivate(page);
+
+ return s;
+}
+
+static void bch_put_page_reservation(struct cache_set *c, struct page *page)
+{
+ struct disk_reservation res = { .sectors = PAGE_SECTORS };
+ struct bch_page_state s;
+
+ s = page_state_cmpxchg(page_state(page), s, {
+ if (!s.reserved)
+ return;
+ s.reserved = 0;
+ });
+
+ bch_disk_reservation_put(c, &res);
+}
+
+static int bch_get_page_reservation(struct cache_set *c, struct page *page,
+ bool check_enospc)
+{
+ struct bch_page_state *s = page_state(page), new;
+ struct disk_reservation res;
+ int ret = 0;
+
+ BUG_ON(s->alloc_state == BCH_PAGE_ALLOCATED &&
+ s->sectors != PAGE_SECTORS);
+
+ if (s->reserved ||
+ s->alloc_state == BCH_PAGE_ALLOCATED)
+ return 0;
+
+ ret = bch_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ return ret;
+
+ page_state_cmpxchg(s, new, {
+ if (new.reserved) {
+ bch_disk_reservation_put(c, &res);
+ return 0;
+ }
+ new.reserved = 1;
+ });
+
+ return 0;
+}
+
+static void bch_clear_page_bits(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct disk_reservation res = { .sectors = PAGE_SECTORS };
+ struct bch_page_state s;
+
+ if (!PagePrivate(page))
+ return;
+
+ s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
+ ClearPagePrivate(page);
+
+ if (s.dirty_sectors) {
+ spin_lock(&inode->i_lock);
+ inode->i_blocks -= s.dirty_sectors;
+ spin_unlock(&inode->i_lock);
+ }
+
+ if (s.reserved)
+ bch_disk_reservation_put(c, &res);
+}
+
+int bch_set_page_dirty(struct page *page)
+{
+ struct bch_page_state old, new;
+
+ old = page_state_cmpxchg(page_state(page), new,
+ new.dirty_sectors = PAGE_SECTORS - new.sectors;
+ );
+
+ if (old.dirty_sectors != new.dirty_sectors) {
+ struct inode *inode = page->mapping->host;
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += new.dirty_sectors - old.dirty_sectors;
+ spin_unlock(&inode->i_lock);
+ }
+
+ return __set_page_dirty_nobuffers(page);
+}
+
+/* readpages/writepages: */
+
+static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
+{
+ sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
+
+ return bio->bi_vcnt < bio->bi_max_vecs &&
+ bio_end_sector(bio) == offset;
+}
+
+static int bio_add_page_contig(struct bio *bio, struct page *page)
+{
+ sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
+
+ BUG_ON(!bio->bi_max_vecs);
+
+ if (!bio->bi_vcnt)
+ bio->bi_iter.bi_sector = offset;
+ else if (!bio_can_add_page_contig(bio, page))
+ return -1;
+
+ bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
+ .bv_page = page,
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0,
+ };
+
+ bio->bi_iter.bi_size += PAGE_SIZE;
+
+ return 0;
+}
+
+static void bch_readpages_end_io(struct bio *bio)
+{
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ if (!bio->bi_error) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+
+ bio_put(bio);
+}
+
+static inline struct page *__readpage_next_page(struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ struct page *page;
+ int ret;
+
+ while (*nr_pages) {
+ page = list_entry(pages->prev, struct page, lru);
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+
+ ret = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS);
+
+ /* if add_to_page_cache_lru() succeeded, page is locked: */
+ put_page(page);
+
+ if (!ret)
+ return page;
+
+ (*nr_pages)--;
+ }
+
+ return NULL;
+}
+
+#define for_each_readpage_page(_mapping, _pages, _nr_pages, _page) \
+ for (; \
+ ((_page) = __readpage_next_page(_mapping, _pages, &(_nr_pages)));\
+ (_nr_pages)--)
+
+static void bch_mark_pages_unalloc(struct bio *bio)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_segment(bv, bio, iter)
+ page_state(bv.bv_page)->alloc_state = BCH_PAGE_UNALLOCATED;
+}
+
+static void bch_add_page_sectors(struct bio *bio, const struct bkey *k)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_segment(bv, bio, iter) {
+ struct bch_page_state *s = page_state(bv.bv_page);
+
+ /* sectors in @k from the start of this page: */
+ unsigned k_sectors = k->size - (iter.bi_sector - k->p.offset);
+
+ unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
+
+ BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
+
+ s->sectors += page_sectors;
+ }
+}
+
+static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode)
+{
+ struct bio *bio = &rbio->bio;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bio_vec *bv;
+ unsigned i;
+ int ret;
+
+ bch_increment_clock(c, bio_sectors(bio), READ);
+
+ /*
+ * Initialize page state:
+ * If a page is partly allocated and partly a hole, we want it to be
+ * marked BCH_PAGE_UNALLOCATED - so we initially mark all pages
+ * allocated and then mark them unallocated as we find holes:
+ *
+ * Note that the bio hasn't been split yet - it's the only bio that
+ * points to these pages. As we walk extents and split @bio, that will
+ * no longer necessarily be true - the splits won't necessarily be on
+ * page boundaries:
+ */
+ bio_for_each_segment_all(bv, bio, i) {
+ struct bch_page_state *s = page_state(bv->bv_page);
+
+ EBUG_ON(s->reserved);
+
+ s->alloc_state = BCH_PAGE_ALLOCATED;
+ s->sectors = 0;
+ }
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bio->bi_iter.bi_sector), k) {
+ BKEY_PADDED(k) tmp;
+ struct extent_pick_ptr pick;
+ unsigned bytes, sectors;
+ bool is_last;
+
+ bkey_reassemble(&tmp.k, k);
+ bch_btree_iter_unlock(&iter);
+ k = bkey_i_to_s_c(&tmp.k);
+
+ if (!bkey_extent_is_allocation(k.k) ||
+ bkey_extent_is_compressed(c, k))
+ bch_mark_pages_unalloc(bio);
+
+ bch_extent_pick_ptr(c, k, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, bio, "no device to read from");
+ bio_endio(bio);
+ return;
+ }
+
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ is_last = bytes == bio->bi_iter.bi_size;
+ swap(bio->bi_iter.bi_size, bytes);
+
+ if (bkey_extent_is_allocation(k.k))
+ bch_add_page_sectors(bio, k.k);
+
+ if (pick.ca) {
+ PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ c->prio_clock[READ].hand;
+
+ bch_read_extent(c, rbio, k, &pick,
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE|
+ (is_last ? BCH_READ_IS_LAST : 0));
+ } else {
+ zero_fill_bio_iter(bio, bio->bi_iter);
+
+ if (is_last)
+ bio_endio(bio);
+ }
+
+ if (is_last)
+ return;
+
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ bcache_io_error(c, bio, "btree IO error %i", ret);
+ bio_endio(bio);
+}
+
+int bch_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_read_bio *rbio = NULL;
+ struct page *page;
+
+ pr_debug("reading %u pages", nr_pages);
+
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_get(&mapping->add_lock);
+
+ for_each_readpage_page(mapping, pages, nr_pages, page) {
+again:
+ if (!rbio) {
+ rbio = container_of(bio_alloc_bioset(GFP_NOFS,
+ min_t(unsigned, nr_pages,
+ BIO_MAX_PAGES),
+ &c->bio_read),
+ struct bch_read_bio, bio);
+
+ rbio->bio.bi_end_io = bch_readpages_end_io;
+ }
+
+ if (bio_add_page_contig(&rbio->bio, page)) {
+ bchfs_read(c, rbio, inode->i_ino);
+ rbio = NULL;
+ goto again;
+ }
+ }
+
+ if (rbio)
+ bchfs_read(c, rbio, inode->i_ino);
+
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_put(&mapping->add_lock);
+
+ pr_debug("success");
+ return 0;
+}
+
+int bch_readpage(struct file *file, struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_read_bio *rbio;
+
+ rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
+ &c->bio_read),
+ struct bch_read_bio, bio);
+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
+ rbio->bio.bi_end_io = bch_readpages_end_io;
+
+ bio_add_page_contig(&rbio->bio, page);
+ bchfs_read(c, rbio, inode->i_ino);
+
+ return 0;
+}
+
+struct bch_writepage_state {
+ struct bch_writepage_io *io;
+};
+
+static void bch_writepage_io_free(struct closure *cl)
+{
+ struct bch_writepage_io *io = container_of(cl,
+ struct bch_writepage_io, cl);
+ struct bio *bio = &io->bio.bio;
+
+ bio_put(bio);
+}
+
+static void bch_writepage_io_done(struct closure *cl)
+{
+ struct bch_writepage_io *io = container_of(cl,
+ struct bch_writepage_io, cl);
+ struct cache_set *c = io->op.op.c;
+ struct bio *bio = &io->bio.bio;
+ struct bio_vec *bvec;
+ unsigned i;
+
+ atomic_sub(bio->bi_vcnt, &c->writeback_pages);
+ wake_up(&c->writeback_wait);
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
+ if (io->op.op.error) {
+ SetPageError(page);
+ if (page->mapping)
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+
+ if (io->op.op.written >= PAGE_SECTORS) {
+ struct bch_page_state old, new;
+
+ old = page_state_cmpxchg(page_state(page), new, {
+ new.sectors = PAGE_SECTORS;
+ new.dirty_sectors = 0;
+ });
+
+ io->op.sectors_added -= old.dirty_sectors;
+ io->op.op.written -= PAGE_SECTORS;
+ }
+ }
+
+ /*
+ * racing with fallocate can cause us to add fewer sectors than
+ * expected - but we shouldn't add more sectors than expected:
+ *
+ * (an error (due to going RO) halfway through a page can throw that
+ * off slightly)
+ */
+ BUG_ON(io->op.sectors_added >= (s64) PAGE_SECTORS);
+
+ /*
+ * PageWriteback is effectively our ref on the inode - fixup i_blocks
+ * before calling end_page_writeback:
+ */
+ if (io->op.sectors_added) {
+ struct inode *inode = &io->op.ei->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += io->op.sectors_added;
+ spin_unlock(&inode->i_lock);
+ }
+
+ bio_for_each_segment_all(bvec, bio, i)
+ end_page_writeback(bvec->bv_page);
+
+ closure_return_with_destructor(&io->cl, bch_writepage_io_free);
+}
+
+static void bch_writepage_do_io(struct bch_writepage_state *w)
+{
+ struct bch_writepage_io *io = w->io;
+
+ w->io = NULL;
+ atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages);
+
+ io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector;
+
+ closure_call(&io->op.op.cl, bch_write, NULL, &io->cl);
+ continue_at(&io->cl, bch_writepage_io_done, NULL);
+}
+
+/*
+ * Get a bch_writepage_io and add @page to it - appending to an existing one if
+ * possible, else allocating a new one:
+ */
+static void bch_writepage_io_alloc(struct cache_set *c,
+ struct bch_writepage_state *w,
+ struct bch_inode_info *ei,
+ struct page *page)
+{
+ u64 inum = ei->vfs_inode.i_ino;
+
+ if (!w->io) {
+alloc_io:
+ w->io = container_of(bio_alloc_bioset(GFP_NOFS,
+ BIO_MAX_PAGES,
+ bch_writepage_bioset),
+ struct bch_writepage_io, bio.bio);
+
+ closure_init(&w->io->cl, NULL);
+ w->io->op.ei = ei;
+ w->io->op.sectors_added = 0;
+ w->io->op.is_dio = false;
+ bch_write_op_init(&w->io->op.op, c, &w->io->bio,
+ (struct disk_reservation) {
+ .nr_replicas = c->opts.data_replicas,
+ },
+ foreground_write_point(c, inum),
+ POS(inum, 0),
+ &ei->journal_seq, 0);
+ w->io->op.op.index_update_fn = bchfs_write_index_update;
+ }
+
+ if (bio_add_page_contig(&w->io->bio.bio, page)) {
+ bch_writepage_do_io(w);
+ goto alloc_io;
+ }
+
+ /*
+ * We shouldn't ever be handed pages for multiple inodes in a single
+ * pass - right?
+ */
+ BUG_ON(ei != w->io->op.ei);
+}
+
+static int __bch_writepage(struct cache_set *c, struct page *page,
+ struct writeback_control *wbc,
+ struct bch_writepage_state *w)
+{
+ struct inode *inode = page->mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct bch_page_state new, old;
+ unsigned offset;
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
+
+ EBUG_ON(!PageUptodate(page));
+
+ /* Is the page fully inside i_size? */
+ if (page->index < end_index)
+ goto do_io;
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ offset = i_size & (PAGE_SIZE - 1);
+ if (page->index > end_index || !offset) {
+ unlock_page(page);
+ return 0;
+ }
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ zero_user_segment(page, offset, PAGE_SIZE);
+do_io:
+ bch_writepage_io_alloc(c, w, ei, page);
+
+ /* while page is locked: */
+ w->io->op.new_i_size = i_size;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ w->io->bio.bio.bi_opf |= WRITE_SYNC;
+
+ /* Before unlocking the page, transfer reservation to w->io: */
+ old = page_state_cmpxchg(page_state(page), new, {
+ BUG_ON(!new.reserved &&
+ (new.sectors != PAGE_SECTORS ||
+ new.alloc_state != BCH_PAGE_ALLOCATED));
+
+ if (new.alloc_state == BCH_PAGE_ALLOCATED &&
+ w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
+ new.alloc_state = BCH_PAGE_UNALLOCATED;
+ else if (!new.reserved)
+ goto out;
+ new.reserved = 0;
+ });
+
+ w->io->op.op.res.sectors += PAGE_SECTORS * (old.reserved - new.reserved);
+out:
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+
+ return 0;
+}
+
+int bch_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct cache_set *c = mapping->host->i_sb->s_fs_info;
+ struct bch_writepage_state w = { NULL };
+ struct pagecache_iter iter;
+ struct page *page;
+ int ret = 0;
+ int done = 0;
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int cycled;
+ int range_whole = 0;
+ int tag;
+
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+
+ done_index = index;
+get_pages:
+ for_each_pagecache_tag(&iter, mapping, tag, index, end, page) {
+ done_index = page->index;
+
+ if (w.io &&
+ !bio_can_add_page_contig(&w.io->bio.bio, page))
+ bch_writepage_do_io(&w);
+
+ if (!w.io &&
+ atomic_read(&c->writeback_pages) >=
+ c->writeback_pages_max) {
+ /* don't sleep with pages pinned: */
+ pagecache_iter_release(&iter);
+
+ __wait_event(c->writeback_wait,
+ atomic_read(&c->writeback_pages) <
+ c->writeback_pages_max);
+ goto get_pages;
+ }
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data integrity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
+ ret = __bch_writepage(c, page, wbc, &w);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ }
+ }
+
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
+ }
+ }
+ pagecache_iter_release(&iter);
+
+ if (w.io)
+ bch_writepage_do_io(&w);
+
+ if (!cycled && !done) {
+ /*
+ * range_cyclic:
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ cycled = 1;
+ index = 0;
+ end = writeback_index - 1;
+ goto retry;
+ }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
+
+ return ret;
+}
+
+int bch_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct cache_set *c = page->mapping->host->i_sb->s_fs_info;
+ struct bch_writepage_state w = { NULL };
+ int ret;
+
+ ret = __bch_writepage(c, page, wbc, &w);
+ if (w.io)
+ bch_writepage_do_io(&w);
+
+ return ret;
+}
+
+static void bch_read_single_page_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+static int bch_read_single_page(struct page *page,
+ struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_read_bio *rbio;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
+ &c->bio_read),
+ struct bch_read_bio, bio);
+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
+ rbio->bio.bi_private = &done;
+ rbio->bio.bi_end_io = bch_read_single_page_end_io;
+ bio_add_page_contig(&rbio->bio, page);
+
+ bchfs_read(c, rbio, inode->i_ino);
+ wait_for_completion(&done);
+
+ ret = rbio->bio.bi_error;
+ bio_put(&rbio->bio);
+
+ if (ret < 0)
+ return ret;
+
+ SetPageUptodate(page);
+ return 0;
+}
+
+int bch_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ struct page *page;
+ int ret = -ENOMEM;
+
+ BUG_ON(inode_unhashed(mapping->host));
+
+ /* Not strictly necessary - same reason as mkwrite(): */
+ pagecache_add_get(&mapping->add_lock);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ goto err_unlock;
+
+ if (PageUptodate(page))
+ goto out;
+
+ /* If we're writing entire page, don't need to read it in first: */
+ if (len == PAGE_SIZE)
+ goto out;
+
+ if (!offset && pos + len >= inode->i_size) {
+ zero_user_segment(page, len, PAGE_SIZE);
+ flush_dcache_page(page);
+ goto out;
+ }
+
+ if (index > inode->i_size >> PAGE_SHIFT) {
+ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
+ flush_dcache_page(page);
+ goto out;
+ }
+readpage:
+ ret = bch_read_single_page(page, mapping);
+ if (ret)
+ goto err;
+out:
+ ret = bch_get_page_reservation(c, page, true);
+ if (ret) {
+ if (!PageUptodate(page)) {
+ /*
+ * If the page hasn't been read in, we won't know if we
+ * actually need a reservation - we don't actually need
+ * to read here, we just need to check if the page is
+ * fully backed by uncompressed data:
+ */
+ goto readpage;
+ }
+
+ goto err;
+ }
+
+ *pagep = page;
+ return 0;
+err:
+ unlock_page(page);
+ put_page(page);
+ *pagep = NULL;
+err_unlock:
+ pagecache_add_put(&mapping->add_lock);
+ return ret;
+}
+
+int bch_write_end(struct file *filp, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (unlikely(copied < len && !PageUptodate(page))) {
+ /*
+ * The page needs to be read in, but that would destroy
+ * our partial write - simplest thing is to just force
+ * userspace to redo the write:
+ */
+ zero_user(page, 0, PAGE_SIZE);
+ flush_dcache_page(page);
+ copied = 0;
+ }
+
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+
+ if (copied) {
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ } else {
+ bch_put_page_reservation(c, page);
+ }
+
+ unlock_page(page);
+ put_page(page);
+ pagecache_add_put(&mapping->add_lock);
+
+ return copied;
+}
+
+/* O_DIRECT */
+
+static void bch_dio_read_complete(struct closure *cl)
+{
+ struct dio_read *dio = container_of(cl, struct dio_read, cl);
+
+ dio->req->ki_complete(dio->req, dio->ret, 0);
+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+}
+
+static void bch_direct_IO_read_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+
+ if (bio->bi_error)
+ dio->ret = bio->bi_error;
+
+ closure_put(&dio->cl);
+}
+
+static void bch_direct_IO_read_split_endio(struct bio *bio)
+{
+ bch_direct_IO_read_endio(bio);
+ bio_check_pages_dirty(bio); /* transfers ownership */
+}
+
+static int bch_direct_IO_read(struct cache_set *c, struct kiocb *req,
+ struct file *file, struct inode *inode,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct dio_read *dio;
+ struct bio *bio;
+ bool sync = is_sync_kiocb(req);
+ ssize_t ret;
+
+ if ((offset|iter->count) & (block_bytes(c) - 1))
+ return -EINVAL;
+
+ ret = min_t(loff_t, iter->count,
+ max_t(loff_t, 0, i_size_read(inode) - offset));
+ iov_iter_truncate(iter, round_up(ret, block_bytes(c)));
+
+ if (!ret)
+ return ret;
+
+ bio = bio_alloc_bioset(GFP_KERNEL,
+ iov_iter_npages(iter, BIO_MAX_PAGES),
+ bch_dio_read_bioset);
+
+ bio->bi_end_io = bch_direct_IO_read_endio;
+
+ dio = container_of(bio, struct dio_read, rbio.bio);
+ closure_init(&dio->cl, NULL);
+
+ /*
+ * this is a _really_ horrible hack just to avoid an atomic sub at the
+ * end:
+ */
+ if (!sync) {
+ set_closure_fn(&dio->cl, bch_dio_read_complete, NULL);
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_RUNNING +
+ CLOSURE_DESTRUCTOR);
+ } else {
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER + 1);
+ }
+
+ dio->req = req;
+ dio->ret = ret;
+
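+ /*
+ * The first bio is the one embedded in the dio (allocated above from
+ * bch_dio_read_bioset); jump into the loop to submit it, and only
+ * allocate further bios from c->bio_read if the iter doesn't fit in
+ * one:
+ */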
+ goto start;
+ while (iter->count) {
+ bio = bio_alloc_bioset(GFP_KERNEL,
+ iov_iter_npages(iter, BIO_MAX_PAGES),
+ &c->bio_read);
+ bio->bi_end_io = bch_direct_IO_read_split_endio;
+start:
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_private = dio;
+
+ ret = bio_get_user_pages(bio, iter, 1);
+ if (ret < 0) {
+ /* XXX: fault inject this path */
+ bio->bi_error = ret;
+ bio_endio(bio);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+ bio_set_pages_dirty(bio);
+
+ if (iter->count)
+ closure_get(&dio->cl);
+
+ bch_read(c, container_of(bio,
+ struct bch_read_bio, bio),
+ inode->i_ino);
+ }
+
+ if (sync) {
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ return ret;
+ } else {
+ return -EIOCBQUEUED;
+ }
+}
+
+static long __bch_dio_write_complete(struct dio_write *dio)
+{
+ struct file *file = dio->req->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = file->f_inode;
+ long ret = dio->error ?: dio->written;
+
+ bch_disk_reservation_put(dio->c, &dio->res);
+
+ __pagecache_block_put(&mapping->add_lock);
+ inode_dio_end(inode);
+
+ if (dio->iovec && dio->iovec != dio->inline_vecs)
+ kfree(dio->iovec);
+
+ bio_put(&dio->bio.bio);
+ return ret;
+}
+
+static void bch_dio_write_complete(struct closure *cl)
+{
+ struct dio_write *dio = container_of(cl, struct dio_write, cl);
+ struct kiocb *req = dio->req;
+
+ req->ki_complete(req, __bch_dio_write_complete(dio), 0);
+}
+
+static void bch_dio_write_done(struct dio_write *dio)
+{
+ struct bio_vec *bv;
+ int i;
+
+ dio->written += dio->iop.op.written << 9;
+
+ if (dio->iop.op.error)
+ dio->error = dio->iop.op.error;
+
+ bio_for_each_segment_all(bv, &dio->bio.bio, i)
+ put_page(bv->bv_page);
+
+ if (dio->iter.count)
+ bio_reset(&dio->bio.bio);
+}
+
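+/*
+ * Issue the next chunk of an O_DIRECT write: pin as many user pages as fit in
+ * the bio, carve the corresponding sectors off the dio's disk reservation,
+ * and submit a bch_write op at the current file offset under the dio's
+ * closure:
+ */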
+static void bch_do_direct_IO_write(struct dio_write *dio)
+{
+ struct file *file = dio->req->ki_filp;
+ struct inode *inode = file->f_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct bio *bio = &dio->bio.bio;
+ unsigned flags = 0;
+ int ret;
+
+ if ((dio->req->ki_flags & IOCB_DSYNC) &&
+ !dio->c->opts.journal_flush_disabled)
+ flags |= BCH_WRITE_FLUSH;
+
+ bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9;
+
+ ret = bio_get_user_pages(bio, &dio->iter, 0);
+ if (ret < 0) {
+ /*
+ * these didn't get initialized, but bch_dio_write_done() will
+ * look at them:
+ */
+ dio->iop.op.error = 0;
+ dio->iop.op.written = 0;
+ dio->error = ret;
+ return;
+ }
+
+ dio->iop.ei = ei;
+ dio->iop.sectors_added = 0;
+ dio->iop.is_dio = true;
+ dio->iop.new_i_size = U64_MAX;
+ bch_write_op_init(&dio->iop.op, dio->c, &dio->bio,
+ dio->res,
+ foreground_write_point(dio->c, inode->i_ino),
+ POS(inode->i_ino, bio->bi_iter.bi_sector),
+ &ei->journal_seq, flags);
+ dio->iop.op.index_update_fn = bchfs_write_index_update;
+
+ dio->res.sectors -= bio_sectors(bio);
+ dio->iop.op.res.sectors = bio_sectors(bio);
+
+ task_io_account_write(bio->bi_iter.bi_size);
+
+ closure_call(&dio->iop.op.cl, bch_write, NULL, &dio->cl);
+}
+
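+/*
+ * Async O_DIRECT write loop, run from the write op's closure: after each op
+ * completes, keep issuing further ops (re-taking the submitter's mm so
+ * get_user_pages() works) until the iter is exhausted or an error occurs,
+ * then complete the kiocb:
+ */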
+static void bch_dio_write_loop_async(struct closure *cl)
+{
+ struct dio_write *dio =
+ container_of(cl, struct dio_write, cl);
+ struct address_space *mapping = dio->req->ki_filp->f_mapping;
+
+ bch_dio_write_done(dio);
+
+ if (dio->iter.count && !dio->error) {
+ use_mm(dio->mm);
+ pagecache_block_get(&mapping->add_lock);
+
+ bch_do_direct_IO_write(dio);
+
+ pagecache_block_put(&mapping->add_lock);
+ unuse_mm(dio->mm);
+
+ continue_at(&dio->cl, bch_dio_write_loop_async, NULL);
+ } else {
+#if 0
+ closure_return_with_destructor(cl, bch_dio_write_complete);
+#else
+ closure_debug_destroy(cl);
+ bch_dio_write_complete(cl);
+#endif
+ }
+}
+
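+/*
+ * O_DIRECT write entry point: takes a disk reservation for the entire write
+ * up front, then issues write ops either synchronously in a loop here, or via
+ * the async closure-driven loop above (copying the caller's iovec first for
+ * the async case):
+ */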
+static int bch_direct_IO_write(struct cache_set *c, struct kiocb *req,
+ struct file *file, struct inode *inode,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct dio_write *dio;
+ struct bio *bio;
+ ssize_t ret;
+ bool sync = is_sync_kiocb(req);
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (unlikely(!iter->count))
+ return 0;
+
+ if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
+ return -EINVAL;
+
+ bio = bio_alloc_bioset(GFP_KERNEL,
+ iov_iter_npages(iter, BIO_MAX_PAGES),
+ bch_dio_write_bioset);
+ dio = container_of(bio, struct dio_write, bio.bio);
+ dio->req = req;
+ dio->c = c;
+ dio->written = 0;
+ dio->error = 0;
+ dio->offset = offset;
+ dio->iovec = NULL;
+ dio->iter = *iter;
+ dio->mm = current->mm;
+ closure_init(&dio->cl, NULL);
+
+ if (offset + iter->count > inode->i_size)
+ sync = true;
+
+ /*
+ * XXX: we shouldn't return -ENOSPC if we're overwriting existing data -
+ * if getting a reservation fails we should check if we are doing an
+ * overwrite.
+ *
+ * Have to then guard against racing with truncate (deleting data that
+ * we would have been overwriting)
+ */
+ ret = bch_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
+ if (unlikely(ret)) {
+ closure_debug_destroy(&dio->cl);
+ bio_put(bio);
+ return ret;
+ }
+
+ inode_dio_begin(inode);
+ __pagecache_block_get(&mapping->add_lock);
+
+ if (sync) {
+ do {
+ bch_do_direct_IO_write(dio);
+
+ closure_sync(&dio->cl);
+ bch_dio_write_done(dio);
+ } while (dio->iter.count && !dio->error);
+
+ closure_debug_destroy(&dio->cl);
+ return __bch_dio_write_complete(dio);
+ } else {
+ bch_do_direct_IO_write(dio);
+
+ if (dio->iter.count && !dio->error) {
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ dio->iovec = kmalloc(dio->iter.nr_segs *
+ sizeof(struct iovec),
+ GFP_KERNEL);
+ if (!dio->iovec)
+ dio->error = -ENOMEM;
+ } else {
+ dio->iovec = dio->inline_vecs;
+ }
+
+ /* the kmalloc() above may have failed: */
+ if (dio->iovec) {
+ memcpy(dio->iovec,
+ dio->iter.iov,
+ dio->iter.nr_segs * sizeof(struct iovec));
+ dio->iter.iov = dio->iovec;
+ }
+ }
+
+ continue_at_noreturn(&dio->cl, bch_dio_write_loop_async, NULL);
+ return -EIOCBQUEUED;
+ }
+}
+
+ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct inode *inode = file->f_inode;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return ((iov_iter_rw(iter) == WRITE)
+ ? bch_direct_IO_write
+ : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos);
+}
+
+static ssize_t
+bch_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_inode;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct address_space *mapping = file->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ ssize_t ret;
+
+ pagecache_block_get(&mapping->add_lock);
+
+ /* Write and invalidate pagecache range that we're writing to: */
+ ret = write_invalidate_inode_pages_range(file->f_mapping, pos,
+ pos + iov_iter_count(iter) - 1);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch_direct_IO_write(c, iocb, file, inode, iter, pos);
+err:
+ pagecache_block_put(&mapping->add_lock);
+
+ return ret;
+}
+
+static ssize_t __bch_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out;
+
+ ret = file_update_time(file);
+ if (ret)
+ goto out;
+
+ ret = iocb->ki_flags & IOCB_DIRECT
+ ? bch_direct_write(iocb, from)
+ : generic_perform_write(file, from, iocb->ki_pos);
+
+ if (likely(ret > 0))
+ iocb->ki_pos += ret;
+out:
+ current->backing_dev_info = NULL;
+ return ret;
+}
+
+ssize_t bch_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ bool direct = iocb->ki_flags & IOCB_DIRECT;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = __bch_write_iter(iocb, from);
+ inode_unlock(inode);
+
+ if (ret > 0 && !direct)
+ ret = generic_write_sync(iocb, ret);
+
+ return ret;
+}
+
+int bch_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct address_space *mapping = inode->i_mapping;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int ret = VM_FAULT_LOCKED;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+
+ /*
+ * Not strictly necessary, but helps avoid dio writes livelocking in
+ * write_invalidate_inode_pages_range() - can drop this if/when we get
+ * a write_invalidate_inode_pages_range() that works without dropping
+ * the page lock before invalidating the page
+ */
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_get(&mapping->add_lock);
+
+ lock_page(page);
+ if (page->mapping != mapping ||
+ page_offset(page) > i_size_read(inode)) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ if (bch_get_page_reservation(c, page, true)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+out:
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_put(&mapping->add_lock);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+void bch_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ EBUG_ON(!PageLocked(page));
+ EBUG_ON(PageWriteback(page));
+
+ if (offset || length < PAGE_SIZE)
+ return;
+
+ bch_clear_page_bits(page);
+}
+
+int bch_releasepage(struct page *page, gfp_t gfp_mask)
+{
+ EBUG_ON(!PageLocked(page));
+ EBUG_ON(PageWriteback(page));
+
+ if (PageDirty(page))
+ return 0;
+
+ bch_clear_page_bits(page);
+ return 1;
+}
+
+#ifdef CONFIG_MIGRATION
+int bch_migrate_page(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ int ret;
+
+ ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (PagePrivate(page)) {
+ *page_state(newpage) = *page_state(page);
+ ClearPagePrivate(page);
+ }
+
+ migrate_page_copy(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
+int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int ret;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ return bch_journal_flush_seq(&c->journal, ei->journal_seq);
+}
+
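+/*
+ * Zero out the part of a partially-covered page that lies inside
+ * [start, end): if the page isn't cached and the extents btree shows no data
+ * backing it there's nothing to do; otherwise read it in if necessary, zero
+ * the affected range and redirty it:
+ */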
+static int __bch_truncate_page(struct address_space *mapping,
+ pgoff_t index, loff_t start, loff_t end)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ unsigned start_offset = start & (PAGE_SIZE - 1);
+ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+ struct page *page;
+ int ret = 0;
+
+ /* Page boundary? Nothing to do */
+ if (!((index == start >> PAGE_SHIFT && start_offset) ||
+ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
+ return 0;
+
+ /* Above i_size? */
+ if (index << PAGE_SHIFT >= inode->i_size)
+ return 0;
+
+ page = find_lock_page(mapping, index);
+ if (!page) {
+ struct btree_iter iter;
+ struct bkey_s_c k = bkey_s_c_null;
+
+ /*
+ * XXX: we're doing two index lookups when we end up reading the
+ * page
+ */
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino,
+ index << (PAGE_SHIFT - 9)), k) {
+ if (bkey_cmp(bkey_start_pos(k.k),
+ POS(inode->i_ino,
+ (index + 1) << (PAGE_SHIFT - 9))) >= 0)
+ break;
+
+ if (k.k->type != KEY_TYPE_DISCARD &&
+ k.k->type != BCH_RESERVATION) {
+ bch_btree_iter_unlock(&iter);
+ goto create;
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+ return 0;
+create:
+ page = find_or_create_page(mapping, index, GFP_KERNEL);
+ if (unlikely(!page)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (!PageUptodate(page)) {
+ ret = bch_read_single_page(page, mapping);
+ if (ret)
+ goto unlock;
+ }
+
+ /*
+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+ *
+ * XXX: because we aren't currently tracking whether the page has actual
+ * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+ */
+ ret = bch_get_page_reservation(c, page, false);
+ BUG_ON(ret);
+
+ if (index == start >> PAGE_SHIFT &&
+ index == end >> PAGE_SHIFT)
+ zero_user_segment(page, start_offset, end_offset);
+ else if (index == start >> PAGE_SHIFT)
+ zero_user_segment(page, start_offset, PAGE_SIZE);
+ else if (index == end >> PAGE_SHIFT)
+ zero_user_segment(page, 0, end_offset);
+
+ if (!PageDirty(page))
+ set_page_dirty(page);
+unlock:
+ unlock_page(page);
+ put_page(page);
+out:
+ return ret;
+}
+
+static int bch_truncate_page(struct address_space *mapping, loff_t from)
+{
+ return __bch_truncate_page(mapping, from >> PAGE_SHIFT,
+ from, from + PAGE_SIZE);
+}
+
+int bch_truncate(struct inode *inode, struct iattr *iattr)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ bool shrink = iattr->ia_size <= inode->i_size;
+ int ret = 0;
+
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ truncate_setsize(inode, iattr->ia_size);
+
+ /* sync appends.. */
+ /* XXX what protects ei->i_size? */
+ if (iattr->ia_size > ei->i_size)
+ ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX);
+ if (ret)
+ goto err_put_pagecache;
+
+ mutex_lock(&ei->update_lock);
+ i_size_dirty_get(ei);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+
+ if (unlikely(ret))
+ goto err;
+
+ /*
+ * There might be persistent reservations (from fallocate())
+ * above i_size, which bch_inode_truncate() will discard - we're
+ * only supposed to discard them if we're doing a real truncate
+ * here (new i_size < current i_size):
+ */
+ if (shrink) {
+ struct i_sectors_hook i_sectors_hook;
+ int ret;
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch_truncate_page(inode->i_mapping, iattr->ia_size);
+ if (unlikely(ret)) {
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+ goto err;
+ }
+
+ ret = bch_inode_truncate(c, inode->i_ino,
+ round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &ei->journal_seq);
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+
+ if (unlikely(ret))
+ goto err;
+ }
+
+ mutex_lock(&ei->update_lock);
+ setattr_copy(inode, iattr);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ /* clear I_SIZE_DIRTY: */
+ i_size_dirty_put(ei);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+
+ pagecache_block_put(&mapping->add_lock);
+
+ return 0;
+err:
+ i_size_dirty_put(ei);
+err_put_pagecache:
+ pagecache_block_put(&mapping->add_lock);
+ return ret;
+}
+
+static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ u64 ino = inode->i_ino;
+ u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
+ u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
+ int ret = 0;
+
+ inode_lock(inode);
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ ret = __bch_truncate_page(inode->i_mapping,
+ offset >> PAGE_SHIFT,
+ offset, offset + len);
+ if (unlikely(ret))
+ goto out;
+
+ if (offset >> PAGE_SHIFT !=
+ (offset + len) >> PAGE_SHIFT) {
+ ret = __bch_truncate_page(inode->i_mapping,
+ (offset + len) >> PAGE_SHIFT,
+ offset, offset + len);
+ if (unlikely(ret))
+ goto out;
+ }
+
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
+ if (discard_start < discard_end) {
+ struct disk_reservation disk_res;
+ struct i_sectors_hook i_sectors_hook;
+ int ret;
+
+ BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0));
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (unlikely(ret))
+ goto out;
+
+ ret = bch_discard(c,
+ POS(ino, discard_start),
+ POS(ino, discard_end),
+ 0,
+ &disk_res,
+ &i_sectors_hook.hook,
+ &ei->journal_seq);
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+ bch_disk_reservation_put(c, &disk_res);
+ }
+out:
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+
+ return ret;
+}
+
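+/*
+ * FALLOC_FL_COLLAPSE_RANGE: using a pair of linked btree iterators, copy each
+ * extent past the collapsed range back by len bytes, then truncate off the
+ * now-duplicated tail and shrink i_size:
+ */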
+static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter src;
+ struct btree_iter dst;
+ BKEY_PADDED(k) copy;
+ struct bkey_s_c k;
+ struct i_sectors_hook i_sectors_hook;
+ loff_t new_size;
+ int ret;
+
+ if ((offset | len) & (PAGE_SIZE - 1))
+ return -EINVAL;
+
+ bch_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9));
+ /* position will be set from dst iter's position: */
+ bch_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch_btree_iter_link(&src, &dst);
+
+ /*
+ * We need i_mutex to keep the page cache consistent with the extents
+ * btree, and the btree consistent with i_size - we don't need outside
+ * locking for the extents btree itself, because we're using linked
+ * iterators
+ */
+ inode_lock(inode);
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ ret = -EINVAL;
+ if (offset + len >= inode->i_size)
+ goto err;
+
+ if (inode->i_size < len)
+ goto err;
+
+ new_size = inode->i_size - len;
+
+ ret = write_invalidate_inode_pages_range(inode->i_mapping,
+ offset, LLONG_MAX);
+ if (ret)
+ goto err;
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (ret)
+ goto err;
+
+ while (bkey_cmp(dst.pos,
+ POS(inode->i_ino,
+ round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
+ struct disk_reservation disk_res;
+
+ bch_btree_iter_set_pos(&src,
+ POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
+
+ ret = bch_btree_iter_traverse(&dst);
+ if (ret)
+ goto btree_iter_err;
+
+ k = bch_btree_iter_peek_with_holes(&src);
+ if ((ret = btree_iter_err(k)))
+ goto btree_iter_err;
+
+ bkey_reassemble(&copy.k, k);
+
+ if (bkey_deleted(&copy.k.k))
+ copy.k.k.type = KEY_TYPE_DISCARD;
+
+ bch_cut_front(src.pos, &copy.k);
+ copy.k.k.p.offset -= len >> 9;
+
+ BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(&copy.k.k)));
+
+ ret = bch_disk_reservation_get(c, &disk_res, copy.k.k.size,
+ BCH_DISK_RESERVATION_NOFAIL);
+ BUG_ON(ret);
+
+ ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
+ &ei->journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&dst, &copy.k));
+ bch_disk_reservation_put(c, &disk_res);
+btree_iter_err:
+ if (ret < 0 && ret != -EINTR)
+ goto err_unwind;
+
+ bch_btree_iter_cond_resched(&src);
+ }
+
+ bch_btree_iter_unlock(&src);
+ bch_btree_iter_unlock(&dst);
+
+ ret = bch_inode_truncate(c, inode->i_ino,
+ round_up(new_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &ei->journal_seq);
+ if (ret)
+ goto err_unwind;
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+
+ mutex_lock(&ei->update_lock);
+ i_size_write(inode, new_size);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+
+ return ret;
+err_unwind:
+ /*
+ * XXX: we've left data with multiple pointers... which isn't a _super_
+ * serious problem...
+ */
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+err:
+ bch_btree_iter_unlock(&src);
+ bch_btree_iter_unlock(&dst);
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+ return ret;
+}
+
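+/*
+ * fallocate/FALLOC_FL_ZERO_RANGE: walk the affected range and replace holes
+ * (and, for zero range, existing data) with BCH_RESERVATION keys, taking a
+ * disk reservation for anything not already allocated as uncompressed data:
+ */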
+static long bch_fallocate(struct inode *inode, int mode,
+ loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct i_sectors_hook i_sectors_hook;
+ struct btree_iter iter;
+ struct bkey_i reservation;
+ struct bkey_s_c k;
+ struct bpos end;
+ loff_t block_start, block_end;
+ loff_t new_size = offset + len;
+ unsigned sectors;
+ int ret;
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ inode_lock(inode);
+ inode_dio_wait(inode);
+ pagecache_block_get(&mapping->add_lock);
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ new_size > inode->i_size) {
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto err;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = __bch_truncate_page(inode->i_mapping,
+ offset >> PAGE_SHIFT,
+ offset, offset + len);
+
+ if (!ret &&
+ offset >> PAGE_SHIFT !=
+ (offset + len) >> PAGE_SHIFT)
+ ret = __bch_truncate_page(inode->i_mapping,
+ (offset + len) >> PAGE_SHIFT,
+ offset, offset + len);
+
+ if (unlikely(ret))
+ goto err;
+
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
+ block_start = round_up(offset, PAGE_SIZE);
+ block_end = round_down(offset + len, PAGE_SIZE);
+ } else {
+ block_start = round_down(offset, PAGE_SIZE);
+ block_end = round_up(offset + len, PAGE_SIZE);
+ }
+
+ bch_btree_iter_set_pos(&iter, POS(inode->i_ino, block_start >> 9));
+ end = POS(inode->i_ino, block_end >> 9);
+
+ ret = i_sectors_dirty_get(ei, &i_sectors_hook);
+ if (unlikely(ret))
+ goto err;
+
+ while (bkey_cmp(iter.pos, end) < 0) {
+ struct disk_reservation disk_res = { 0 };
+
+ k = bch_btree_iter_peek_with_holes(&iter);
+ if ((ret = btree_iter_err(k)))
+ goto btree_iter_err;
+
+ /* already reserved */
+ if (k.k->type == BCH_RESERVATION) {
+ bch_btree_iter_advance_pos(&iter);
+ continue;
+ }
+
+ if (bkey_extent_is_data(k.k)) {
+ if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+ bch_btree_iter_advance_pos(&iter);
+ continue;
+ }
+ }
+
+ bkey_init(&reservation.k);
+ reservation.k.type = BCH_RESERVATION;
+ reservation.k.p = k.k->p;
+ reservation.k.size = k.k->size;
+
+ bch_cut_front(iter.pos, &reservation);
+ bch_cut_back(end, &reservation.k);
+
+ sectors = reservation.k.size;
+
+ if (!bkey_extent_is_allocation(k.k) ||
+ bkey_extent_is_compressed(c, k)) {
+ ret = bch_disk_reservation_get(c, &disk_res,
+ sectors, 0);
+ if (ret)
+ goto err_put_sectors_dirty;
+ }
+
+ ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
+ &ei->journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &reservation));
+ bch_disk_reservation_put(c, &disk_res);
+btree_iter_err:
+ if (ret < 0 && ret != -EINTR)
+ goto err_put_sectors_dirty;
+
+ }
+ bch_btree_iter_unlock(&iter);
+
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ new_size > inode->i_size) {
+ i_size_write(inode, new_size);
+
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+ }
+
+ /* blech */
+ if ((mode & FALLOC_FL_KEEP_SIZE) &&
+ (mode & FALLOC_FL_ZERO_RANGE) &&
+ ei->i_size != inode->i_size) {
+ /* sync appends.. */
+ ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX);
+ if (ret)
+ goto err;
+
+ if (ei->i_size != inode->i_size) {
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode_size(c, ei, inode->i_size);
+ mutex_unlock(&ei->update_lock);
+ }
+ }
+
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+
+ return 0;
+err_put_sectors_dirty:
+ i_sectors_dirty_put(ei, &i_sectors_hook);
+err:
+ bch_btree_iter_unlock(&iter);
+ pagecache_block_put(&mapping->add_lock);
+ inode_unlock(inode);
+ return ret;
+}
+
+long bch_fallocate_dispatch(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
+
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+ return bch_fallocate(inode, mode, offset, len);
+
+ if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+ return bch_fpunch(inode, offset, len);
+
+ if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ return bch_fcollapse(inode, offset, len);
+
+ return -EOPNOTSUPP;
+}
+
+static bool page_is_data(struct page *page)
+{
+ /* XXX: should only have to check PageDirty */
+ return PagePrivate(page) &&
+ (page_state(page)->sectors ||
+ page_state(page)->dirty_sectors);
+}
+
+static loff_t bch_next_pagecache_data(struct inode *inode,
+ loff_t start_offset,
+ loff_t end_offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ pgoff_t index;
+
+ for (index = start_offset >> PAGE_SHIFT;
+ index < end_offset >> PAGE_SHIFT;
+ index++) {
+ if (find_get_pages(mapping, index, 1, &page)) {
+ lock_page(page);
+ index = page->index;
+
+ if (page_is_data(page))
+ end_offset =
+ min(end_offset,
+ max(start_offset,
+ ((loff_t) index) << PAGE_SHIFT));
+ unlock_page(page);
+ put_page(page);
+ } else {
+ break;
+ }
+ }
+
+ return end_offset;
+}
+
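+/*
+ * SEEK_DATA: find the first data extent at or after @offset in the btree,
+ * then check the pagecache for dirty data that may precede it:
+ */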
+static loff_t bch_seek_data(struct file *file, u64 offset)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 isize, next_data = MAX_LFS_FILESIZE;
+ int ret;
+
+ isize = i_size_read(inode);
+ if (offset >= isize)
+ return -ENXIO;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9), k) {
+ if (k.k->p.inode != inode->i_ino) {
+ break;
+ } else if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ }
+
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ if (next_data > offset)
+ next_data = bch_next_pagecache_data(inode, offset, next_data);
+
+ if (next_data > isize)
+ return -ENXIO;
+
+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
+
+static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page;
+ bool ret;
+
+ page = find_lock_entry(mapping, index);
+ if (!page || radix_tree_exception(page))
+ return false;
+
+ ret = page_is_data(page);
+ unlock_page(page);
+
+ return ret;
+}
+
+static loff_t bch_next_pagecache_hole(struct inode *inode,
+ loff_t start_offset,
+ loff_t end_offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+
+ for (index = start_offset >> PAGE_SHIFT;
+ index < end_offset >> PAGE_SHIFT;
+ index++)
+ if (!page_slot_is_data(mapping, index))
+ end_offset = max(start_offset,
+ ((loff_t) index) << PAGE_SHIFT);
+
+ return end_offset;
+}
+
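+/*
+ * SEEK_HOLE: a hole has to be a hole in both the extents btree and the
+ * pagecache, so for each candidate hole in the btree check that the
+ * corresponding pagecache range is also empty:
+ */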
+static loff_t bch_seek_hole(struct file *file, u64 offset)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 isize, next_hole = MAX_LFS_FILESIZE;
+ int ret;
+
+ isize = i_size_read(inode);
+ if (offset >= isize)
+ return -ENXIO;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9), k) {
+ if (k.k->p.inode != inode->i_ino) {
+ next_hole = bch_next_pagecache_hole(inode,
+ offset, MAX_LFS_FILESIZE);
+ break;
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch_next_pagecache_hole(inode,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ }
+
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ if (next_hole > isize)
+ next_hole = isize;
+
+ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
+}
+
+loff_t bch_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ return generic_file_llseek(file, offset, whence);
+ case SEEK_DATA:
+ return bch_seek_data(file, offset);
+ case SEEK_HOLE:
+ return bch_seek_hole(file, offset);
+ }
+
+ return -EINVAL;
+}
diff --git a/libbcache/fs-io.h b/libbcache/fs-io.h
new file mode 100644
index 0000000..d598bc8
--- /dev/null
+++ b/libbcache/fs-io.h
@@ -0,0 +1,96 @@
+#ifndef _BCACHE_FS_IO_H
+#define _BCACHE_FS_IO_H
+
+#include "buckets.h"
+#include <linux/uio.h>
+
+int bch_set_page_dirty(struct page *);
+
+int bch_writepage(struct page *, struct writeback_control *);
+int bch_readpage(struct file *, struct page *);
+
+int bch_writepages(struct address_space *, struct writeback_control *);
+int bch_readpages(struct file *, struct address_space *,
+ struct list_head *, unsigned);
+
+int bch_write_begin(struct file *, struct address_space *, loff_t,
+ unsigned, unsigned, struct page **, void **);
+int bch_write_end(struct file *, struct address_space *, loff_t,
+ unsigned, unsigned, struct page *, void *);
+
+ssize_t bch_direct_IO(struct kiocb *, struct iov_iter *);
+
+ssize_t bch_write_iter(struct kiocb *, struct iov_iter *);
+
+int bch_fsync(struct file *, loff_t, loff_t, int);
+
+int bch_truncate(struct inode *, struct iattr *);
+long bch_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+
+loff_t bch_llseek(struct file *, loff_t, int);
+
+int bch_page_mkwrite(struct vm_area_struct *, struct vm_fault *);
+void bch_invalidatepage(struct page *, unsigned int, unsigned int);
+int bch_releasepage(struct page *, gfp_t);
+int bch_migrate_page(struct address_space *, struct page *,
+ struct page *, enum migrate_mode);
+
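+/*
+ * Extent insert hook passed to btree updates so that the change in allocated
+ * sectors can be accumulated in ->sectors and later applied to the inode
+ * (see i_sectors_dirty_get()/i_sectors_dirty_put() in fs-io.c):
+ */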
+struct i_sectors_hook {
+ struct extent_insert_hook hook;
+ s64 sectors;
+ struct bch_inode_info *ei;
+};
+
+struct bchfs_write_op {
+ struct bch_inode_info *ei;
+ s64 sectors_added;
+ bool is_dio;
+ u64 new_i_size;
+ struct bch_write_op op;
+};
+
+struct bch_writepage_io {
+ struct closure cl;
+
+ struct bchfs_write_op op;
+
+ /* must come last: */
+ struct bch_write_bio bio;
+};
+
+extern struct bio_set *bch_writepage_bioset;
+
+struct dio_write {
+ struct closure cl;
+ struct kiocb *req;
+ struct cache_set *c;
+ long written;
+ long error;
+ loff_t offset;
+
+ struct disk_reservation res;
+
+ struct iovec *iovec;
+ struct iovec inline_vecs[UIO_FASTIOV];
+ struct iov_iter iter;
+
+ struct mm_struct *mm;
+
+ struct bchfs_write_op iop;
+
+ /* must be last: */
+ struct bch_write_bio bio;
+};
+
+extern struct bio_set *bch_dio_write_bioset;
+
+struct dio_read {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ struct bch_read_bio rbio;
+};
+
+extern struct bio_set *bch_dio_read_bioset;
+
+#endif /* _BCACHE_FS_IO_H */
diff --git a/libbcache/fs.c b/libbcache/fs.c
new file mode 100644
index 0000000..1f01e48
--- /dev/null
+++ b/libbcache/fs.c
@@ -0,0 +1,1506 @@
+
+#include "bcache.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "fs-io.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/compat.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/random.h>
+#include <linux/statfs.h>
+#include <linux/xattr.h>
+
+static struct kmem_cache *bch_inode_cache;
+
+static void bch_inode_init(struct bch_inode_info *, struct bkey_s_c_inode);
+
+/*
+ * I_SIZE_DIRTY requires special handling:
+ *
+ * To the recovery code, the flag means that there is stale data past i_size
+ * that needs to be deleted; it's used for implementing atomic appends and
+ * truncates.
+ *
+ * On append, we set I_SIZE_DIRTY before doing the write, then after the write
+ * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
+ * that exposes the data we just wrote.
+ *
+ * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
+ * i_size to the new smaller size, then we delete the data that we just made
+ * invisible, and then we clear I_SIZE_DIRTY.
+ *
+ * Because there can be multiple appends in flight at a time, we need a refcount
+ * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
+ * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
+ *
+ * Because write_inode() can be called at any time, i_size_dirty_count means
+ * something different to the runtime code - it means to write_inode() "don't
+ * update i_size yet".
+ *
+ * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
+ * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
+ * be set explicitly.
+ */
+
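+/*
+ * Re-read the inode from the inodes btree, apply the optional @set callback
+ * and the current VFS inode fields, and write it back atomically, retrying on
+ * -EINTR:
+ */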
+int __must_check __bch_write_inode(struct cache_set *c,
+ struct bch_inode_info *ei,
+ inode_set_fn set,
+ void *p)
+{
+ struct btree_iter iter;
+ struct inode *inode = &ei->vfs_inode;
+ struct bkey_i_inode new_inode;
+ struct bch_inode *bi;
+ u64 inum = inode->i_ino;
+ int ret;
+
+ lockdep_assert_held(&ei->update_lock);
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0));
+
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
+
+ if ((ret = btree_iter_err(k)))
+ goto out;
+
+ if (WARN_ONCE(k.k->type != BCH_INODE_FS,
+ "inode %llu not found when updating", inum)) {
+ bch_btree_iter_unlock(&iter);
+ return -ENOENT;
+ }
+
+ bkey_reassemble(&new_inode.k_i, k);
+ bi = &new_inode.v;
+
+ if (set) {
+ ret = set(ei, bi, p);
+ if (ret)
+ goto out;
+ }
+
+ bi->i_mode = cpu_to_le16(inode->i_mode);
+ bi->i_uid = cpu_to_le32(i_uid_read(inode));
+ bi->i_gid = cpu_to_le32(i_gid_read(inode));
+ bi->i_nlink = cpu_to_le32(inode->i_nlink);
+ bi->i_dev = cpu_to_le32(inode->i_rdev);
+ bi->i_atime = cpu_to_le64(timespec_to_ns(&inode->i_atime));
+ bi->i_mtime = cpu_to_le64(timespec_to_ns(&inode->i_mtime));
+ bi->i_ctime = cpu_to_le64(timespec_to_ns(&inode->i_ctime));
+
+ ret = bch_btree_insert_at(c, NULL, NULL, &ei->journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &new_inode.k_i));
+ } while (ret == -EINTR);
+
+ if (!ret) {
+ ei->i_size = le64_to_cpu(bi->i_size);
+ ei->i_flags = le32_to_cpu(bi->i_flags);
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+
+ return ret < 0 ? ret : 0;
+}
+
+int __must_check bch_write_inode(struct cache_set *c,
+ struct bch_inode_info *ei)
+{
+ return __bch_write_inode(c, ei, NULL, NULL);
+}
+
+int bch_inc_nlink(struct cache_set *c, struct bch_inode_info *ei)
+{
+ int ret;
+
+ mutex_lock(&ei->update_lock);
+ inc_nlink(&ei->vfs_inode);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+
+ return ret;
+}
+
+int bch_dec_nlink(struct cache_set *c, struct bch_inode_info *ei)
+{
+ int ret;
+
+ mutex_lock(&ei->update_lock);
+ drop_nlink(&ei->vfs_inode);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+
+ return ret;
+}
+
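+/*
+ * Look up the VFS inode for @inum, reading the bcache inode from the inodes
+ * btree if it wasn't already cached:
+ */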
+static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
+{
+ struct cache_set *c = sb->s_fs_info;
+ struct inode *inode;
+ struct bch_inode_info *ei;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ pr_debug("inum %llu", inum);
+
+ inode = iget_locked(sb, inum);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0));
+ k = bch_btree_iter_peek_with_holes(&iter);
+
+ if ((ret = btree_iter_err(k)) || k.k->type != BCH_INODE_FS) {
+ ret = bch_btree_iter_unlock(&iter);
+ iget_failed(inode);
+ return ERR_PTR(ret ?: -ENOENT);
+ }
+
+ ei = to_bch_ei(inode);
+ bch_inode_init(ei, bkey_s_c_to_inode(k));
+
+ ei->journal_seq = bch_inode_journal_seq(&c->journal, inum);
+
+ unlock_new_inode(inode);
+
+ bch_btree_iter_unlock(&iter);
+
+ return inode;
+}
+
+static struct inode *bch_vfs_inode_create(struct cache_set *c,
+ struct inode *parent,
+ umode_t mode, dev_t rdev)
+{
+ struct inode *inode;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ struct bch_inode_info *ei;
+ struct bch_inode *bi;
+ struct bkey_i_inode bkey_inode;
+ struct timespec ts = CURRENT_TIME;
+ s64 now = timespec_to_ns(&ts);
+ int ret;
+
+ inode = new_inode(parent->i_sb);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+
+ inode_init_owner(inode, parent, mode);
+
+ ret = posix_acl_create(parent, &inode->i_mode, &default_acl, &acl);
+ if (ret) {
+ make_bad_inode(inode);
+ goto err;
+ }
+
+ ei = to_bch_ei(inode);
+
+ bi = &bkey_inode_init(&bkey_inode.k_i)->v;
+ bi->i_uid = cpu_to_le32(i_uid_read(inode));
+ bi->i_gid = cpu_to_le32(i_gid_read(inode));
+
+ bi->i_mode = cpu_to_le16(inode->i_mode);
+ bi->i_dev = cpu_to_le32(rdev);
+ bi->i_atime = cpu_to_le64(now);
+ bi->i_mtime = cpu_to_le64(now);
+ bi->i_ctime = cpu_to_le64(now);
+ bi->i_nlink = cpu_to_le32(S_ISDIR(mode) ? 2 : 1);
+
+ get_random_bytes(&bi->i_hash_seed, sizeof(bi->i_hash_seed));
+ SET_INODE_STR_HASH_TYPE(bi, c->sb.str_hash_type);
+
+ ret = bch_inode_create(c, &bkey_inode.k_i,
+ BLOCKDEV_INODE_MAX, 0,
+ &c->unused_inode_hint);
+ if (unlikely(ret)) {
+ /*
+ * indicate to bch_evict_inode that the inode was never actually
+ * created:
+ */
+ make_bad_inode(inode);
+ goto err;
+ }
+
+ bch_inode_init(ei, inode_i_to_s_c(&bkey_inode));
+
+ if (default_acl) {
+ ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ insert_inode_hash(inode);
+ atomic_long_inc(&c->nr_inodes);
+out:
+ posix_acl_release(default_acl);
+ posix_acl_release(acl);
+ return inode;
+err:
+ clear_nlink(inode);
+ iput(inode);
+ inode = ERR_PTR(ret);
+ goto out;
+}
+
+static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir,
+ u8 type, const struct qstr *name,
+ struct inode *dst)
+{
+ int ret;
+
+ ret = bch_dirent_create(c, dir, type, name, dst->i_ino);
+ if (unlikely(ret))
+ return ret;
+
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty_sync(dir);
+ return 0;
+}
+
+static int __bch_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct bch_inode_info *dir_ei = to_bch_ei(dir);
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+ struct bch_inode_info *ei;
+ int ret;
+
+ inode = bch_vfs_inode_create(c, dir, mode, rdev);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ ei = to_bch_ei(inode);
+
+ ret = bch_vfs_dirent_create(c, dir, mode_to_type(mode),
+ &dentry->d_name, inode);
+ if (unlikely(ret)) {
+ clear_nlink(inode);
+ iput(inode);
+ return ret;
+ }
+
+ if (dir_ei->journal_seq > ei->journal_seq)
+ ei->journal_seq = dir_ei->journal_seq;
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+/* methods */
+
+static struct dentry *bch_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = NULL;
+ u64 inum;
+
+ inum = bch_dirent_lookup(c, dir, &dentry->d_name);
+
+ if (inum)
+ inode = bch_vfs_inode_get(dir->i_sb, inum);
+
+ return d_splice_alias(inode, dentry);
+}
+
+static int bch_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ return __bch_create(dir, dentry, mode|S_IFREG, 0);
+}
+
+static int bch_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = old_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ inode->i_ctime = CURRENT_TIME;
+
+ ret = bch_inc_nlink(c, ei);
+ if (ret)
+ return ret;
+
+ ihold(inode);
+
+ ret = bch_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode),
+ &dentry->d_name, inode);
+ if (unlikely(ret)) {
+ bch_dec_nlink(c, ei);
+ iput(inode);
+ return ret;
+ }
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+static int bch_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct bch_inode_info *dir_ei = to_bch_ei(dir);
+ struct inode *inode = dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ ret = bch_dirent_delete(c, dir, &dentry->d_name);
+ if (ret)
+ return ret;
+
+ if (dir_ei->journal_seq > ei->journal_seq)
+ ei->journal_seq = dir_ei->journal_seq;
+
+ inode->i_ctime = dir->i_ctime;
+
+ if (S_ISDIR(inode->i_mode)) {
+ bch_dec_nlink(c, dir_ei);
+ drop_nlink(inode);
+ }
+
+ drop_nlink(inode);
+ if (inode->i_nlink) {
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+ }
+
+ return 0;
+}
+
+static int bch_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+ struct bch_inode_info *ei, *dir_ei = to_bch_ei(dir);
+ int ret;
+
+ inode = bch_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ ei = to_bch_ei(inode);
+
+ inode_lock(inode);
+ ret = page_symlink(inode, symname, strlen(symname) + 1);
+ inode_unlock(inode);
+
+ if (unlikely(ret))
+ goto err;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (unlikely(ret))
+ goto err;
+
+ /* XXX: racy */
+ if (dir_ei->journal_seq < ei->journal_seq)
+ dir_ei->journal_seq = ei->journal_seq;
+
+ ret = bch_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode);
+ if (unlikely(ret))
+ goto err;
+
+ d_instantiate(dentry, inode);
+ return 0;
+err:
+ clear_nlink(inode);
+ iput(inode);
+ return ret;
+}
+
+static int bch_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ int ret;
+
+ lockdep_assert_held(&dir->i_rwsem);
+
+ ret = __bch_create(dir, dentry, mode|S_IFDIR, 0);
+ if (unlikely(ret))
+ return ret;
+
+ bch_inc_nlink(c, to_bch_ei(dir));
+
+ return 0;
+}
+
+static int bch_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+
+ if (bch_empty_dir(c, inode->i_ino))
+ return -ENOTEMPTY;
+
+ return bch_unlink(dir, dentry);
+}
+
+static int bch_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ return __bch_create(dir, dentry, mode, rdev);
+}
+
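+/*
+ * Plain rename (without RENAME_EXCHANGE): the four cases below differ in
+ * whether the existing target is overwritten and in how directory and victim
+ * link counts are adjusted:
+ */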
+static int bch_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct cache_set *c = old_dir->i_sb->s_fs_info;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(old_inode);
+ struct inode *new_inode = new_dentry->d_inode;
+ struct timespec now = CURRENT_TIME;
+ int ret;
+
+ lockdep_assert_held(&old_dir->i_rwsem);
+ lockdep_assert_held(&new_dir->i_rwsem);
+
+ if (new_inode)
+ filemap_write_and_wait_range(old_inode->i_mapping,
+ 0, LLONG_MAX);
+
+ if (new_inode && S_ISDIR(old_inode->i_mode)) {
+ lockdep_assert_held(&new_inode->i_rwsem);
+
+ if (!S_ISDIR(new_inode->i_mode))
+ return -ENOTDIR;
+
+ if (bch_empty_dir(c, new_inode->i_ino))
+ return -ENOTEMPTY;
+
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME_OVERWRITE);
+ if (unlikely(ret))
+ return ret;
+
+ clear_nlink(new_inode);
+ bch_dec_nlink(c, to_bch_ei(old_dir));
+ } else if (new_inode) {
+ lockdep_assert_held(&new_inode->i_rwsem);
+
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME_OVERWRITE);
+ if (unlikely(ret))
+ return ret;
+
+ new_inode->i_ctime = now;
+ bch_dec_nlink(c, to_bch_ei(new_inode));
+ } else if (S_ISDIR(old_inode->i_mode)) {
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME);
+ if (unlikely(ret))
+ return ret;
+
+ bch_inc_nlink(c, to_bch_ei(new_dir));
+ bch_dec_nlink(c, to_bch_ei(old_dir));
+ } else {
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = now;
+ new_dir->i_ctime = new_dir->i_mtime = now;
+ mark_inode_dirty_sync(old_dir);
+ mark_inode_dirty_sync(new_dir);
+
+ old_inode->i_ctime = now;
+ mark_inode_dirty_sync(old_inode);
+
+ return 0;
+}
+
+static int bch_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct cache_set *c = old_dir->i_sb->s_fs_info;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(old_inode);
+ struct timespec now = CURRENT_TIME;
+ int ret;
+
+ ret = bch_dirent_rename(c,
+ old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name,
+ &ei->journal_seq, BCH_RENAME_EXCHANGE);
+ if (unlikely(ret))
+ return ret;
+
+ if (S_ISDIR(old_inode->i_mode) !=
+ S_ISDIR(new_inode->i_mode)) {
+ if (S_ISDIR(old_inode->i_mode)) {
+ bch_inc_nlink(c, to_bch_ei(new_dir));
+ bch_dec_nlink(c, to_bch_ei(old_dir));
+ } else {
+ bch_dec_nlink(c, to_bch_ei(new_dir));
+ bch_inc_nlink(c, to_bch_ei(old_dir));
+ }
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = now;
+ new_dir->i_ctime = new_dir->i_mtime = now;
+ mark_inode_dirty_sync(old_dir);
+ mark_inode_dirty_sync(new_dir);
+
+ old_inode->i_ctime = now;
+ new_inode->i_ctime = now;
+ mark_inode_dirty_sync(old_inode);
+ mark_inode_dirty_sync(new_inode);
+
+ return 0;
+}
+
+static int bch_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned flags)
+{
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE)
+ return bch_rename_exchange(old_dir, old_dentry,
+ new_dir, new_dentry);
+
+ return bch_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static int bch_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int ret = 0;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ pr_debug("i_size was %llu update has %llu",
+ inode->i_size, iattr->ia_size);
+
+ ret = setattr_prepare(dentry, iattr);
+ if (ret)
+ return ret;
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ ret = bch_truncate(inode, iattr);
+ } else {
+ mutex_lock(&ei->update_lock);
+ setattr_copy(inode, iattr);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+ }
+
+ if (unlikely(ret))
+ return ret;
+
+ if (iattr->ia_valid & ATTR_MODE)
+ ret = posix_acl_chmod(inode, inode->i_mode);
+
+ return ret;
+}
+
+static int bch_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+
+ /* XXX: i_nlink should be 0? */
+ inode = bch_vfs_inode_create(c, dir, mode, 0);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
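+/*
+ * Emit one fiemap extent per pointer of a data extent (flagging compressed
+ * pointers as encoded), or a single delalloc/unwritten extent for a
+ * reservation:
+ */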
+static int bch_fill_extent(struct fiemap_extent_info *info,
+ const struct bkey_i *k, unsigned flags)
+{
+ if (bkey_extent_is_data(&k->k)) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+ int ret;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ int flags2 = 0;
+ u64 offset = ptr->offset;
+
+ if (crc_compression_type(crc))
+ flags2 |= FIEMAP_EXTENT_ENCODED;
+ else
+ offset += crc_offset(crc);
+
+ if ((offset & (PAGE_SECTORS - 1)) ||
+ (e.k->size & (PAGE_SECTORS - 1)))
+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+ ret = fiemap_fill_next_extent(info,
+ bkey_start_offset(e.k) << 9,
+ offset << 9,
+ e.k->size << 9, flags|flags2);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+ } else if (k->k.type == BCH_RESERVATION) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(&k->k) << 9,
+ 0, k->k.size << 9,
+ flags|
+ FIEMAP_EXTENT_DELALLOC|
+ FIEMAP_EXTENT_UNWRITTEN);
+ } else {
+ BUG();
+ }
+}
+
+static int bch_fiemap(struct inode *inode, struct fiemap_extent_info *info,
+ u64 start, u64 len)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ BKEY_PADDED(k) tmp;
+ bool have_extent = false;
+ int ret = 0;
+
+ if (start + len < start)
+ return -EINVAL;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, start >> 9), k)
+ if (bkey_extent_is_data(k.k) ||
+ k.k->type == BCH_RESERVATION) {
+ if (bkey_cmp(bkey_start_pos(k.k),
+ POS(inode->i_ino, (start + len) >> 9)) >= 0)
+ break;
+
+ if (have_extent) {
+ ret = bch_fill_extent(info, &tmp.k, 0);
+ if (ret)
+ goto out;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ have_extent = true;
+ }
+
+ if (have_extent)
+ ret = bch_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
+out:
+ bch_btree_iter_unlock(&iter);
+ return ret < 0 ? ret : 0;
+}
+
+static const struct vm_operations_struct bch_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = bch_page_mkwrite,
+};
+
+static int bch_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+
+ vma->vm_ops = &bch_vm_ops;
+ return 0;
+}
+
+/* Inode flags: */
+
+static const unsigned bch_inode_flags_to_vfs_flags_map[] = {
+ [__BCH_INODE_SYNC] = S_SYNC,
+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
+ [__BCH_INODE_APPEND] = S_APPEND,
+ [__BCH_INODE_NOATIME] = S_NOATIME,
+};
+
+static const unsigned bch_inode_flags_to_user_flags_map[] = {
+ [__BCH_INODE_SYNC] = FS_SYNC_FL,
+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_APPEND] = FS_APPEND_FL,
+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL,
+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL,
+};
+
+/* Set VFS inode flags from bcache inode: */
+static void bch_inode_flags_to_vfs(struct inode *inode)
+{
+ unsigned i, flags = to_bch_ei(inode)->i_flags;
+
+ for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_vfs_flags_map); i++)
+ if (flags & (1 << i))
+ inode->i_flags |= bch_inode_flags_to_vfs_flags_map[i];
+ else
+ inode->i_flags &= ~bch_inode_flags_to_vfs_flags_map[i];
+}
+
+/* Get FS_IOC_GETFLAGS flags from bcache inode: */
+static unsigned bch_inode_flags_to_user_flags(unsigned flags)
+{
+ unsigned i, ret = 0;
+
+ for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++)
+ if (flags & (1 << i))
+ ret |= bch_inode_flags_to_user_flags_map[i];
+
+ return ret;
+}
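+
+/*
+ * For illustration: an inode with __BCH_INODE_APPEND set in its bcache flags
+ * gets S_APPEND set in inode->i_flags by bch_inode_flags_to_vfs(), and
+ * FS_IOC_GETFLAGS reports it as FS_APPEND_FL via
+ * bch_inode_flags_to_user_flags().
+ */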
+
+static int bch_inode_user_flags_set(struct bch_inode_info *ei,
+ struct bch_inode *bi,
+ void *p)
+{
+ /*
+ * We're relying on btree locking here for exclusion with other ioctl
+ * calls - use the flags in the btree (@bi), not ei->i_flags:
+ */
+ unsigned bch_flags = le32_to_cpu(bi->i_flags);
+ unsigned oldflags = bch_inode_flags_to_user_flags(bch_flags);
+ unsigned newflags = *((unsigned *) p);
+ unsigned i;
+
+ if (((newflags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++) {
+ if (newflags & bch_inode_flags_to_user_flags_map[i])
+ bch_flags |= (1 << i);
+ else
+ bch_flags &= ~(1 << i);
+
+ newflags &= ~bch_inode_flags_to_user_flags_map[i];
+ oldflags &= ~bch_inode_flags_to_user_flags_map[i];
+ }
+
+ if (oldflags != newflags)
+ return -EOPNOTSUPP;
+
+ bi->i_flags = cpu_to_le32(bch_flags);
+ ei->vfs_inode.i_ctime = CURRENT_TIME;
+
+ return 0;
+}
+
+#define FS_IOC_GOINGDOWN _IOR ('X', 125, __u32)
+
+static long bch_fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct cache_set *c = sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ unsigned flags;
+ int ret;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return put_user(bch_inode_flags_to_user_flags(ei->i_flags),
+ (int __user *) arg);
+
+ case FS_IOC_SETFLAGS: {
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
+
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto setflags_out;
+ }
+
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ (flags & (FS_NODUMP_FL|FS_NOATIME_FL)) != flags) {
+ ret = -EINVAL;
+ goto setflags_out;
+ }
+
+ inode_lock(inode);
+
+ mutex_lock(&ei->update_lock);
+ ret = __bch_write_inode(c, ei, bch_inode_user_flags_set, &flags);
+ mutex_unlock(&ei->update_lock);
+
+ if (!ret)
+ bch_inode_flags_to_vfs(inode);
+
+ inode_unlock(inode);
+setflags_out:
+ mnt_drop_write_file(filp);
+ return ret;
+ }
+
+ case FS_IOC_GETVERSION:
+ return -ENOTTY;
+ case FS_IOC_SETVERSION:
+ return -ENOTTY;
+
+ case FS_IOC_GOINGDOWN:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ down_write(&sb->s_umount);
+ sb->s_flags |= MS_RDONLY;
+ bch_cache_set_emergency_read_only(c);
+ up_write(&sb->s_umount);
+ return 0;
+
+ default:
+ return bch_cache_set_ioctl(c, cmd, (void __user *) arg);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long bch_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put an int from/to user */
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return bch_fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+/* Directories: */
+
+static loff_t bch_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_file_llseek_size(file, offset, whence,
+ S64_MAX, S64_MAX);
+}
+
+static int bch_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return bch_readdir(c, file, ctx);
+}
+
+static const struct file_operations bch_file_operations = {
+ .llseek = bch_llseek,
+ .read_iter = generic_file_read_iter,
+ .write_iter = bch_write_iter,
+ .mmap = bch_mmap,
+ .open = generic_file_open,
+ .fsync = bch_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = bch_fallocate_dispatch,
+ .unlocked_ioctl = bch_fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_file_inode_operations = {
+ .setattr = bch_setattr,
+ .fiemap = bch_fiemap,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct inode_operations bch_dir_inode_operations = {
+ .lookup = bch_lookup,
+ .create = bch_create,
+ .link = bch_link,
+ .unlink = bch_unlink,
+ .symlink = bch_symlink,
+ .mkdir = bch_mkdir,
+ .rmdir = bch_rmdir,
+ .mknod = bch_mknod,
+ .rename = bch_rename2,
+ .setattr = bch_setattr,
+ .tmpfile = bch_tmpfile,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct file_operations bch_dir_file_operations = {
+ .llseek = bch_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = bch_vfs_readdir,
+ .fsync = bch_fsync,
+ .unlocked_ioctl = bch_fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .get_link = page_get_link,
+ .setattr = bch_setattr,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct inode_operations bch_special_inode_operations = {
+ .setattr = bch_setattr,
+ .listxattr = bch_xattr_list,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct address_space_operations bch_address_space_operations = {
+ .writepage = bch_writepage,
+ .readpage = bch_readpage,
+ .writepages = bch_writepages,
+ .readpages = bch_readpages,
+ .set_page_dirty = bch_set_page_dirty,
+ .write_begin = bch_write_begin,
+ .write_end = bch_write_end,
+ .invalidatepage = bch_invalidatepage,
+ .releasepage = bch_releasepage,
+ .direct_IO = bch_direct_IO,
+#ifdef CONFIG_MIGRATION
+ .migratepage = bch_migrate_page,
+#endif
+ .error_remove_page = generic_error_remove_page,
+};
+
+static void bch_inode_init(struct bch_inode_info *ei,
+ struct bkey_s_c_inode bkey_inode)
+{
+ struct inode *inode = &ei->vfs_inode;
+ const struct bch_inode *bi = bkey_inode.v;
+
+ pr_debug("init inode %llu with mode %o",
+ bkey_inode.k->p.inode, bi->i_mode);
+
+ ei->i_flags = le32_to_cpu(bi->i_flags);
+ ei->i_size = le64_to_cpu(bi->i_size);
+
+ inode->i_mode = le16_to_cpu(bi->i_mode);
+ i_uid_write(inode, le32_to_cpu(bi->i_uid));
+ i_gid_write(inode, le32_to_cpu(bi->i_gid));
+
+ atomic64_set(&ei->i_sectors, le64_to_cpu(bi->i_sectors));
+ inode->i_blocks = atomic64_read(&ei->i_sectors);
+
+ inode->i_ino = bkey_inode.k->p.inode;
+ set_nlink(inode, le32_to_cpu(bi->i_nlink));
+ inode->i_rdev = le32_to_cpu(bi->i_dev);
+ inode->i_size = le64_to_cpu(bi->i_size);
+ inode->i_atime = ns_to_timespec(le64_to_cpu(bi->i_atime));
+ inode->i_mtime = ns_to_timespec(le64_to_cpu(bi->i_mtime));
+ inode->i_ctime = ns_to_timespec(le64_to_cpu(bi->i_ctime));
+ bch_inode_flags_to_vfs(inode);
+
+ ei->str_hash.seed = le64_to_cpu(bi->i_hash_seed);
+ ei->str_hash.type = INODE_STR_HASH_TYPE(bi);
+
+ inode->i_mapping->a_ops = &bch_address_space_operations;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &bch_file_inode_operations;
+ inode->i_fop = &bch_file_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &bch_dir_inode_operations;
+ inode->i_fop = &bch_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode_nohighmem(inode);
+ inode->i_op = &bch_symlink_inode_operations;
+ break;
+ default:
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ inode->i_op = &bch_special_inode_operations;
+ break;
+ }
+}
+
+static struct inode *bch_alloc_inode(struct super_block *sb)
+{
+ struct bch_inode_info *ei;
+
+ ei = kmem_cache_alloc(bch_inode_cache, GFP_NOFS);
+ if (!ei)
+ return NULL;
+
+ pr_debug("allocated %p", &ei->vfs_inode);
+
+ inode_init_once(&ei->vfs_inode);
+ mutex_init(&ei->update_lock);
+ ei->journal_seq = 0;
+ atomic_long_set(&ei->i_size_dirty_count, 0);
+ atomic_long_set(&ei->i_sectors_dirty_count, 0);
+
+ return &ei->vfs_inode;
+}
+
+static void bch_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+
+ kmem_cache_free(bch_inode_cache, to_bch_ei(inode));
+}
+
+static void bch_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, bch_i_callback);
+}
+
+static int bch_vfs_write_inode(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ mutex_lock(&ei->update_lock);
+ ret = bch_write_inode(c, ei);
+ mutex_unlock(&ei->update_lock);
+
+ if (c->opts.journal_flush_disabled)
+ return ret;
+
+ if (!ret && wbc->sync_mode == WB_SYNC_ALL)
+ ret = bch_journal_flush_seq(&c->journal, ei->journal_seq);
+
+ return ret;
+}
+
+static void bch_evict_inode(struct inode *inode)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ truncate_inode_pages_final(&inode->i_data);
+
+ if (!bch_journal_error(&c->journal) && !is_bad_inode(inode)) {
+ struct bch_inode_info *ei = to_bch_ei(inode);
+
+		/* XXX - we only want to check this stuff if there weren't IO errors: */
+ BUG_ON(atomic_long_read(&ei->i_sectors_dirty_count));
+ BUG_ON(atomic64_read(&ei->i_sectors) != inode->i_blocks);
+ }
+
+ clear_inode(inode);
+
+ if (!inode->i_nlink && !is_bad_inode(inode)) {
+ bch_inode_rm(c, inode->i_ino);
+ atomic_long_dec(&c->nr_inodes);
+ }
+}
+
+static int bch_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct cache_set *c = sb->s_fs_info;
+ u64 fsid;
+
+ buf->f_type = BCACHE_STATFS_MAGIC;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT;
+ buf->f_bfree = (c->capacity - cache_set_sectors_used(c)) >> PAGE_SECTOR_SHIFT;
+ buf->f_bavail = buf->f_bfree;
+ buf->f_files = atomic_long_read(&c->nr_inodes);
+ buf->f_ffree = U64_MAX;
+
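+	/*
+	 * Fold the 128-bit user UUID down to 64 bits by XORing its two halves,
+	 * then split that into the two 32-bit words of f_fsid:
+	 */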
+ fsid = le64_to_cpup((void *) c->disk_sb.user_uuid.b) ^
+ le64_to_cpup((void *) c->disk_sb.user_uuid.b + sizeof(u64));
+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_namelen = NAME_MAX;
+
+ return 0;
+}
+
+static int bch_sync_fs(struct super_block *sb, int wait)
+{
+ struct cache_set *c = sb->s_fs_info;
+
+ if (!wait) {
+ bch_journal_flush_async(&c->journal, NULL);
+ return 0;
+ }
+
+ return bch_journal_flush(&c->journal);
+}
+
+static struct cache_set *bdev_to_cache_set(struct block_device *bdev)
+{
+ struct cache_set *c;
+ struct cache *ca;
+ unsigned i;
+
+ rcu_read_lock();
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ for_each_cache_rcu(ca, c, i)
+ if (ca->disk_sb.bdev == bdev) {
+ rcu_read_unlock();
+ return c;
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static struct cache_set *bch_open_as_blockdevs(const char *_dev_name,
+ struct cache_set_opts opts)
+{
+ size_t nr_devs = 0, i = 0;
+ char *dev_name, *s, **devs;
+ struct cache_set *c = NULL;
+ const char *err;
+
+ dev_name = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return NULL;
+
+ for (s = dev_name; s; s = strchr(s + 1, ':'))
+ nr_devs++;
+
+ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
+ if (!devs)
+ goto err;
+
+ for (i = 0, s = dev_name;
+ s;
+ (s = strchr(s, ':')) && (*s++ = '\0'))
+ devs[i++] = s;
+
+ err = bch_register_cache_set(devs, nr_devs, opts, &c);
+ if (err) {
+ /*
+ * Already open?
+ * Look up each block device, make sure they all belong to a
+ * cache set and they all belong to the _same_ cache set
+ */
+
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < nr_devs; i++) {
+ struct block_device *bdev = lookup_bdev(devs[i]);
+ struct cache_set *c2;
+
+ if (IS_ERR(bdev))
+ goto err_unlock;
+
+ c2 = bdev_to_cache_set(bdev);
+ bdput(bdev);
+
+ if (!c)
+ c = c2;
+
+ if (c != c2)
+ goto err_unlock;
+ }
+
+ if (!c)
+ goto err_unlock;
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags)) {
+ err = "incomplete cache set";
+ c = NULL;
+ goto err_unlock;
+ }
+
+ closure_get(&c->cl);
+ mutex_unlock(&bch_register_lock);
+ }
+
+ set_bit(CACHE_SET_BDEV_MOUNTED, &c->flags);
+err:
+ kfree(devs);
+ kfree(dev_name);
+
+ return c;
+err_unlock:
+ mutex_unlock(&bch_register_lock);
+ pr_err("register_cache_set err %s", err);
+ goto err;
+}
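+
+/*
+ * Note on the device name format (the mount command below is illustrative,
+ * not taken from this patch): bch_open_as_blockdevs() splits @_dev_name on
+ * ':', so a multi device filesystem is mounted with something like
+ *
+ *	mount -t bcache /dev/sda:/dev/sdb /mnt
+ *
+ * and the devices are either registered as a new cache set or matched
+ * against one that's already open.
+ */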
+
+static int bch_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct cache_set *c = sb->s_fs_info;
+ struct cache_set_opts opts;
+ int ret;
+
+ ret = bch_parse_options(&opts, *flags, data);
+ if (ret)
+ return ret;
+
+ mutex_lock(&bch_register_lock);
+
+ if (opts.read_only >= 0 &&
+ opts.read_only != c->opts.read_only) {
+ const char *err = NULL;
+
+ if (opts.read_only) {
+ bch_cache_set_read_only_sync(c);
+
+ sb->s_flags |= MS_RDONLY;
+ } else {
+ err = bch_cache_set_read_write(c);
+ if (err) {
+ bch_err(c, "error going rw: %s", err);
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ sb->s_flags &= ~MS_RDONLY;
+ }
+
+ c->opts.read_only = opts.read_only;
+ }
+
+ if (opts.errors >= 0)
+ c->opts.errors = opts.errors;
+
+unlock:
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
+static const struct super_operations bch_super_operations = {
+ .alloc_inode = bch_alloc_inode,
+ .destroy_inode = bch_destroy_inode,
+ .write_inode = bch_vfs_write_inode,
+ .evict_inode = bch_evict_inode,
+ .sync_fs = bch_sync_fs,
+ .statfs = bch_statfs,
+ .show_options = generic_show_options,
+ .remount_fs = bch_remount,
+#if 0
+ .put_super = bch_put_super,
+ .freeze_fs = bch_freeze,
+ .unfreeze_fs = bch_unfreeze,
+#endif
+};
+
+static int bch_test_super(struct super_block *s, void *data)
+{
+ return s->s_fs_info == data;
+}
+
+static int bch_set_super(struct super_block *s, void *data)
+{
+ s->s_fs_info = data;
+ return 0;
+}
+
+static struct dentry *bch_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct cache_set *c;
+ struct cache *ca;
+ struct super_block *sb;
+ struct inode *inode;
+ struct cache_set_opts opts;
+ unsigned i;
+ int ret;
+
+ ret = bch_parse_options(&opts, flags, data);
+ if (ret)
+ return ERR_PTR(ret);
+
+ c = bch_open_as_blockdevs(dev_name, opts);
+ if (!c)
+ return ERR_PTR(-ENOENT);
+
+ sb = sget(fs_type, bch_test_super, bch_set_super, flags|MS_NOSEC, c);
+ if (IS_ERR(sb)) {
+ closure_put(&c->cl);
+ return ERR_CAST(sb);
+ }
+
+ BUG_ON(sb->s_fs_info != c);
+
+ if (sb->s_root) {
+ closure_put(&c->cl);
+
+ if ((flags ^ sb->s_flags) & MS_RDONLY) {
+ ret = -EBUSY;
+ goto err_put_super;
+ }
+ goto out;
+ }
+
+ /* XXX: blocksize */
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &bch_super_operations;
+ sb->s_xattr = bch_xattr_handlers;
+ sb->s_magic = BCACHE_STATFS_MAGIC;
+ sb->s_time_gran = 1;
+ c->vfs_sb = sb;
+ sb->s_bdi = &c->bdi;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i) {
+ struct block_device *bdev = ca->disk_sb.bdev;
+
+ BUILD_BUG_ON(sizeof(sb->s_id) < BDEVNAME_SIZE);
+
+ bdevname(bdev, sb->s_id);
+
+ /* XXX: do we even need s_bdev? */
+ sb->s_bdev = bdev;
+ sb->s_dev = bdev->bd_dev;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (opts.posix_acl < 0)
+ sb->s_flags |= MS_POSIXACL;
+ else
+ sb->s_flags |= opts.posix_acl ? MS_POSIXACL : 0;
+
+ inode = bch_vfs_inode_get(sb, BCACHE_ROOT_INO);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto err_put_super;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ ret = -ENOMEM;
+ goto err_put_super;
+ }
+
+ sb->s_flags |= MS_ACTIVE;
+out:
+ return dget(sb->s_root);
+
+err_put_super:
+ deactivate_locked_super(sb);
+ return ERR_PTR(ret);
+}
+
+static void bch_kill_sb(struct super_block *sb)
+{
+ struct cache_set *c = sb->s_fs_info;
+
+ generic_shutdown_super(sb);
+
+ if (test_bit(CACHE_SET_BDEV_MOUNTED, &c->flags)) {
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ c->stop_completion = &complete;
+ bch_cache_set_stop(c);
+ closure_put(&c->cl);
+
+ /* Killable? */
+ wait_for_completion(&complete);
+ } else
+ closure_put(&c->cl);
+}
+
+static struct file_system_type bcache_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bcache",
+ .mount = bch_mount,
+ .kill_sb = bch_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+MODULE_ALIAS_FS("bcache");
+
+void bch_fs_exit(void)
+{
+ unregister_filesystem(&bcache_fs_type);
+ if (bch_dio_write_bioset)
+ bioset_free(bch_dio_write_bioset);
+ if (bch_dio_read_bioset)
+ bioset_free(bch_dio_read_bioset);
+ if (bch_writepage_bioset)
+ bioset_free(bch_writepage_bioset);
+ if (bch_inode_cache)
+ kmem_cache_destroy(bch_inode_cache);
+}
+
+int __init bch_fs_init(void)
+{
+ int ret = -ENOMEM;
+
+ bch_inode_cache = KMEM_CACHE(bch_inode_info, 0);
+ if (!bch_inode_cache)
+ goto err;
+
+ bch_writepage_bioset =
+ bioset_create(4, offsetof(struct bch_writepage_io, bio.bio));
+ if (!bch_writepage_bioset)
+ goto err;
+
+ bch_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
+ if (!bch_dio_read_bioset)
+ goto err;
+
+ bch_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio));
+ if (!bch_dio_write_bioset)
+ goto err;
+
+ ret = register_filesystem(&bcache_fs_type);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ bch_fs_exit();
+ return ret;
+}
diff --git a/libbcache/fs.h b/libbcache/fs.h
new file mode 100644
index 0000000..c982024
--- /dev/null
+++ b/libbcache/fs.h
@@ -0,0 +1,49 @@
+#ifndef _BCACHE_FS_H
+#define _BCACHE_FS_H
+
+#include "str_hash.h"
+
+#include <linux/seqlock.h>
+
+struct bch_inode_info {
+ struct inode vfs_inode;
+
+ struct mutex update_lock;
+ u64 journal_seq;
+
+ atomic_long_t i_size_dirty_count;
+
+ /*
+ * these are updated whenever we update the inode in the btree - for
+ * e.g. fsync
+ */
+ u64 i_size;
+ u32 i_flags;
+
+ atomic_long_t i_sectors_dirty_count;
+ atomic64_t i_sectors;
+
+ struct bch_hash_info str_hash;
+};
+
+#define to_bch_ei(_inode) \
+ container_of(_inode, struct bch_inode_info, vfs_inode)
+
+static inline u8 mode_to_type(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
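+
+/*
+ * The value returned by mode_to_type() lines up with the DT_* directory
+ * entry types - e.g. (S_IFDIR >> 12) & 15 == 4 == DT_DIR, and
+ * (S_IFREG >> 12) & 15 == 8 == DT_REG.
+ */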
+
+/* returns 0 if we want to do the update; any error is passed up */
+typedef int (*inode_set_fn)(struct bch_inode_info *,
+ struct bch_inode *, void *);
+
+int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
+ inode_set_fn, void *);
+int __must_check bch_write_inode(struct cache_set *,
+ struct bch_inode_info *);
+
+void bch_fs_exit(void);
+int bch_fs_init(void);
+
+#endif /* _BCACHE_FS_H */
diff --git a/libbcache/inode.c b/libbcache/inode.c
new file mode 100644
index 0000000..d36de43
--- /dev/null
+++ b/libbcache/inode.c
@@ -0,0 +1,283 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "inode.h"
+#include "io.h"
+#include "keylist.h"
+
+ssize_t bch_inode_status(char *buf, size_t len, const struct bkey *k)
+{
+ if (k->p.offset)
+ return scnprintf(buf, len, "offset nonzero: %llu", k->p.offset);
+
+ if (k->size)
+ return scnprintf(buf, len, "size nonzero: %u", k->size);
+
+ switch (k->type) {
+ case KEY_TYPE_DELETED:
+ return scnprintf(buf, len, "deleted");
+ case KEY_TYPE_DISCARD:
+ return scnprintf(buf, len, "discarded");
+ case KEY_TYPE_ERROR:
+ return scnprintf(buf, len, "error");
+ case KEY_TYPE_COOKIE:
+ return scnprintf(buf, len, "cookie");
+
+ case BCH_INODE_FS:
+ if (bkey_val_bytes(k) != sizeof(struct bch_inode))
+ return scnprintf(buf, len, "bad size: %zu",
+ bkey_val_bytes(k));
+
+ if (k->p.inode < BLOCKDEV_INODE_MAX)
+ return scnprintf(buf, len,
+ "fs inode in blockdev range: %llu",
+ k->p.inode);
+ return 0;
+
+ case BCH_INODE_BLOCKDEV:
+ if (bkey_val_bytes(k) != sizeof(struct bch_inode_blockdev))
+ return scnprintf(buf, len, "bad size: %zu",
+ bkey_val_bytes(k));
+
+ if (k->p.inode >= BLOCKDEV_INODE_MAX)
+ return scnprintf(buf, len,
+ "blockdev inode in fs range: %llu",
+ k->p.inode);
+ return 0;
+
+ default:
+ return scnprintf(buf, len, "unknown inode type: %u", k->type);
+ }
+}
+
+static const char *bch_inode_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ if (k.k->p.offset)
+ return "nonzero offset";
+
+ switch (k.k->type) {
+ case BCH_INODE_FS: {
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode))
+ return "incorrect value size";
+
+ if (k.k->p.inode < BLOCKDEV_INODE_MAX)
+ return "fs inode in blockdev range";
+
+ if (INODE_STR_HASH_TYPE(inode.v) >= BCH_STR_HASH_NR)
+ return "invalid str hash type";
+
+ return NULL;
+ }
+ case BCH_INODE_BLOCKDEV:
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
+ return "incorrect value size";
+
+ if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+ return "blockdev inode in fs range";
+
+ return NULL;
+ default:
+ return "invalid type";
+ }
+}
+
+static void bch_inode_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_inode inode;
+
+ switch (k.k->type) {
+ case BCH_INODE_FS:
+ inode = bkey_s_c_to_inode(k);
+
+ scnprintf(buf, size, "i_size %llu", inode.v->i_size);
+ break;
+ }
+}
+
+const struct bkey_ops bch_bkey_inode_ops = {
+ .key_invalid = bch_inode_invalid,
+ .val_to_text = bch_inode_to_text,
+};
+
+int bch_inode_create(struct cache_set *c, struct bkey_i *inode,
+ u64 min, u64 max, u64 *hint)
+{
+ struct btree_iter iter;
+ bool searched_from_start = false;
+ int ret;
+
+ if (!max)
+ max = ULLONG_MAX;
+
+ if (c->opts.inodes_32bit)
+ max = min_t(u64, max, U32_MAX);
+
+ if (*hint >= max || *hint < min)
+ *hint = min;
+
+ if (*hint == min)
+ searched_from_start = true;
+again:
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0));
+
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
+
+ ret = btree_iter_err(k);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
+
+ if (k.k->type < BCH_INODE_FS) {
+ inode->k.p = k.k->p;
+
+ pr_debug("inserting inode %llu (size %u)",
+ inode->k.p.inode, inode->k.u64s);
+
+ ret = bch_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, inode));
+
+ if (ret == -EINTR)
+ continue;
+
+ bch_btree_iter_unlock(&iter);
+ if (!ret)
+ *hint = k.k->p.inode + 1;
+
+ return ret;
+ } else {
+ if (iter.pos.inode == max)
+ break;
+ /* slot used */
+ bch_btree_iter_advance_pos(&iter);
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
+ if (!searched_from_start) {
+ /* Retry from start */
+ *hint = min;
+ searched_from_start = true;
+ goto again;
+ }
+
+ return -ENOSPC;
+}
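+
+/*
+ * A sketch of how bch_inode_create() is meant to be used (illustrative, not
+ * taken from a specific caller): allocate a new fs inode number above the
+ * blockdev inode range, remembering where the search left off so the next
+ * allocation starts from there:
+ *
+ *	u64 hint = 0;
+ *	int ret = bch_inode_create(c, &inode->k_i, BLOCKDEV_INODE_MAX, 0, &hint);
+ *
+ * max == 0 means no upper bound (clamped to U32_MAX if the inodes_32bit
+ * option is set), and the search wraps back around to min once before
+ * returning -ENOSPC.
+ */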
+
+int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size,
+ struct extent_insert_hook *hook, u64 *journal_seq)
+{
+ return bch_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0),
+ 0, NULL, hook, journal_seq);
+}
+
+int bch_inode_rm(struct cache_set *c, u64 inode_nr)
+{
+ struct bkey_i delete;
+ int ret;
+
+ ret = bch_inode_truncate(c, inode_nr, 0, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = bch_btree_delete_range(c, BTREE_ID_XATTRS,
+ POS(inode_nr, 0),
+ POS(inode_nr + 1, 0),
+ 0, NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If this was a directory, there shouldn't be any real dirents left -
+ * but there could be whiteouts (from hash collisions) that we should
+ * delete:
+ *
+	 * XXX: the dirent code could ideally delete whiteouts when they're no
+	 * longer needed
+ */
+ ret = bch_btree_delete_range(c, BTREE_ID_DIRENTS,
+ POS(inode_nr, 0),
+ POS(inode_nr + 1, 0),
+ 0, NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ bkey_init(&delete.k);
+ delete.k.p.inode = inode_nr;
+
+ return bch_btree_insert(c, BTREE_ID_INODES, &delete, NULL,
+ NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+int bch_inode_update(struct cache_set *c, struct bkey_i *inode,
+ u64 *journal_seq)
+{
+ return bch_btree_update(c, BTREE_ID_INODES, inode, journal_seq);
+}
+
+int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr,
+ struct bkey_i_inode *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = -ENOENT;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
+ POS(inode_nr, 0), k) {
+ switch (k.k->type) {
+ case BCH_INODE_FS:
+ ret = 0;
+ bkey_reassemble(&inode->k_i, k);
+ break;
+ default:
+ /* hole, not found */
+ break;
+ }
+
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+int bch_cached_dev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid,
+ struct bkey_i_inode_blockdev *ret)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), k) {
+ if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+ break;
+
+ if (k.k->type == BCH_INODE_BLOCKDEV) {
+ struct bkey_s_c_inode_blockdev inode =
+ bkey_s_c_to_inode_blockdev(k);
+
+ pr_debug("found inode %llu: %pU (u64s %u)",
+ inode.k->p.inode, inode.v->i_uuid.b,
+ inode.k->u64s);
+
+ if (CACHED_DEV(inode.v) &&
+ !memcmp(uuid, &inode.v->i_uuid, 16)) {
+ bkey_reassemble(&ret->k_i, k);
+ bch_btree_iter_unlock(&iter);
+ return 0;
+ }
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ bch_btree_iter_unlock(&iter);
+ return -ENOENT;
+}
diff --git a/libbcache/inode.h b/libbcache/inode.h
new file mode 100644
index 0000000..d8b28c7
--- /dev/null
+++ b/libbcache/inode.h
@@ -0,0 +1,18 @@
+#ifndef _BCACHE_INODE_H
+#define _BCACHE_INODE_H
+
+extern const struct bkey_ops bch_bkey_inode_ops;
+
+ssize_t bch_inode_status(char *, size_t, const struct bkey *);
+
+int bch_inode_create(struct cache_set *, struct bkey_i *, u64, u64, u64 *);
+int bch_inode_truncate(struct cache_set *, u64, u64,
+ struct extent_insert_hook *, u64 *);
+int bch_inode_rm(struct cache_set *, u64);
+int bch_inode_update(struct cache_set *, struct bkey_i *, u64 *);
+
+int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *);
+int bch_cached_dev_inode_find_by_uuid(struct cache_set *, uuid_le *,
+ struct bkey_i_inode_blockdev *);
+
+#endif
diff --git a/libbcache/io.c b/libbcache/io.c
new file mode 100644
index 0000000..7219b65
--- /dev/null
+++ b/libbcache/io.c
@@ -0,0 +1,1378 @@
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "compress.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "notify.h"
+#include "stats.h"
+#include "super.h"
+
+#include <linux/blkdev.h>
+#include <linux/random.h>
+
+#include <trace/events/bcache.h>
+
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio_set_flag(bio, BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
+
+void bch_generic_make_request(struct bio *bio, struct cache_set *c)
+{
+ if (current->bio_list) {
+ spin_lock(&c->bio_submit_lock);
+ bio_list_add(&c->bio_submit_list, bio);
+ spin_unlock(&c->bio_submit_lock);
+ queue_work(bcache_io_wq, &c->bio_submit_work);
+ } else {
+ generic_make_request(bio);
+ }
+}
+
+void bch_bio_submit_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(work, struct cache_set,
+ bio_submit_work);
+ struct bio_list bl;
+ struct bio *bio;
+
+ spin_lock(&c->bio_submit_lock);
+ bl = c->bio_submit_list;
+ bio_list_init(&c->bio_submit_list);
+ spin_unlock(&c->bio_submit_lock);
+
+ while ((bio = bio_list_pop(&bl)))
+ generic_make_request(bio);
+}
+
+/* Allocate, free from mempool: */
+
+void bch_bio_free_pages_pool(struct cache_set *c, struct bio *bio)
+{
+ struct bio_vec *bv;
+ unsigned i;
+
+ bio_for_each_segment_all(bv, bio, i)
+ if (bv->bv_page != ZERO_PAGE(0))
+ mempool_free(bv->bv_page, &c->bio_bounce_pages);
+ bio->bi_vcnt = 0;
+}
+
+static void bch_bio_alloc_page_pool(struct cache_set *c, struct bio *bio,
+ bool *using_mempool)
+{
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
+
+ if (likely(!*using_mempool)) {
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bv->bv_page)) {
+ mutex_lock(&c->bio_bounce_pages_lock);
+ *using_mempool = true;
+ goto pool_alloc;
+
+ }
+ } else {
+pool_alloc:
+ bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+ }
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+}
+
+void bch_bio_alloc_pages_pool(struct cache_set *c, struct bio *bio,
+ size_t bytes)
+{
+ bool using_mempool = false;
+
+ bio->bi_iter.bi_size = bytes;
+
+ while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
+ bch_bio_alloc_page_pool(c, bio, &using_mempool);
+
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Bios with headers */
+
+static void bch_submit_wbio(struct cache_set *c, struct bch_write_bio *wbio,
+ struct cache *ca, const struct bch_extent_ptr *ptr,
+ bool punt)
+{
+ wbio->ca = ca;
+ wbio->submit_time_us = local_clock_us();
+ wbio->bio.bi_iter.bi_sector = ptr->offset;
+ wbio->bio.bi_bdev = ca ? ca->disk_sb.bdev : NULL;
+
+ if (!ca)
+ bcache_io_error(c, &wbio->bio, "device has been removed");
+ else if (punt)
+ bch_generic_make_request(&wbio->bio, c);
+ else
+ generic_make_request(&wbio->bio);
+}
+
+void bch_submit_wbio_replicas(struct bch_write_bio *wbio, struct cache_set *c,
+ const struct bkey_i *k, bool punt)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct bch_write_bio *n;
+ struct cache *ca;
+
+ wbio->split = false;
+ wbio->c = c;
+
+ extent_for_each_ptr(e, ptr) {
+ rcu_read_lock();
+ ca = PTR_CACHE(c, ptr);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca) {
+ bch_submit_wbio(c, wbio, ca, ptr, punt);
+ break;
+ }
+
+ if (ptr + 1 < &extent_entry_last(e)->ptr) {
+ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
+ &ca->replica_set));
+
+ n->bio.bi_end_io = wbio->bio.bi_end_io;
+ n->bio.bi_private = wbio->bio.bi_private;
+ n->c = c;
+ n->orig = &wbio->bio;
+ n->bounce = false;
+ n->split = true;
+ n->put_bio = true;
+ n->bio.bi_opf = wbio->bio.bi_opf;
+ __bio_inc_remaining(n->orig);
+ } else {
+ n = wbio;
+ }
+
+ if (!journal_flushes_device(ca))
+ n->bio.bi_opf |= REQ_FUA;
+
+ bch_submit_wbio(c, n, ca, ptr, punt);
+ }
+}
+
+/* IO errors */
+
+/* Writes */
+
+static struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+ return op->alloc_reserve == RESERVE_MOVINGGC
+ ? op->c->copygc_wq
+ : op->c->wq;
+}
+
+static void __bch_write(struct closure *);
+
+static void bch_write_done(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+
+ BUG_ON(!(op->flags & BCH_WRITE_DONE));
+
+ if (!op->error && (op->flags & BCH_WRITE_FLUSH))
+ op->error = bch_journal_error(&op->c->journal);
+
+ bch_disk_reservation_put(op->c, &op->res);
+ percpu_ref_put(&op->c->writes);
+ bch_keylist_free(&op->insert_keys, op->inline_keys);
+ closure_return(cl);
+}
+
+static u64 keylist_sectors(struct keylist *keys)
+{
+ struct bkey_i *k;
+ u64 ret = 0;
+
+ for_each_keylist_key(keys, k)
+ ret += k->k.size;
+
+ return ret;
+}
+
+static int bch_write_index_default(struct bch_write_op *op)
+{
+ struct keylist *keys = &op->insert_keys;
+ struct btree_iter iter;
+ int ret;
+
+ bch_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch_keylist_front(keys)->k));
+
+ ret = bch_btree_insert_list_at(&iter, keys, &op->res,
+ NULL, op_journal_seq(op),
+ BTREE_INSERT_NOFAIL);
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+/**
+ * bch_write_index - after a write, update index to point to new data
+ */
+static void bch_write_index(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct cache_set *c = op->c;
+ struct keylist *keys = &op->insert_keys;
+ unsigned i;
+
+ op->flags |= BCH_WRITE_LOOPED;
+
+ if (!bch_keylist_empty(keys)) {
+ u64 sectors_start = keylist_sectors(keys);
+ int ret = op->index_update_fn(op);
+
+ BUG_ON(keylist_sectors(keys) && !ret);
+
+ op->written += sectors_start - keylist_sectors(keys);
+
+ if (ret) {
+ __bcache_io_error(c, "btree IO error %i", ret);
+ op->error = ret;
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
+ if (op->open_buckets[i]) {
+ bch_open_bucket_put(c,
+ c->open_buckets +
+ op->open_buckets[i]);
+ op->open_buckets[i] = 0;
+ }
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ continue_at(cl, __bch_write, op->io_wq);
+
+ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
+ bch_journal_flush_seq_async(&c->journal,
+ *op_journal_seq(op),
+ cl);
+ continue_at(cl, bch_write_done, index_update_wq(op));
+ } else {
+ continue_at_nobarrier(cl, bch_write_done, NULL);
+ }
+}
+
+/**
+ * bch_write_discard - discard range of keys
+ *
+ * Used to implement discard, and to handle when writethrough write hits
+ * a write error on the cache device.
+ */
+static void bch_write_discard(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bio *bio = &op->bio->bio;
+ struct bpos end = op->pos;
+
+ end.offset += bio_sectors(bio);
+
+ op->error = bch_discard(op->c, op->pos, end, op->version,
+ &op->res, NULL, NULL);
+}
+
+/*
+ * Convert extents to be inserted to discards after an error:
+ */
+static void bch_write_io_error(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+
+ if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
+ struct bkey_i *src = bch_keylist_front(&op->insert_keys);
+ struct bkey_i *dst = bch_keylist_front(&op->insert_keys);
+
+ /*
+ * Our data write just errored, which means we've got a bunch
+ * of keys to insert that point to data that wasn't
+ * successfully written.
+ *
+ * We don't have to insert those keys but we still have to
+ * invalidate that region of the cache - so, if we just strip
+ * off all the pointers from the keys we'll accomplish just
+ * that.
+ */
+
+ while (src != op->insert_keys.top) {
+ struct bkey_i *n = bkey_next(src);
+
+ set_bkey_val_u64s(&src->k, 0);
+ src->k.type = KEY_TYPE_DISCARD;
+ bkey_copy(dst, src);
+
+ dst = bkey_next(dst);
+ src = n;
+ }
+
+ op->insert_keys.top = dst;
+ op->flags |= BCH_WRITE_DISCARD;
+ } else {
+ /* TODO: We could try to recover from this. */
+ while (!bch_keylist_empty(&op->insert_keys))
+ bch_keylist_pop_front(&op->insert_keys);
+
+ op->error = -EIO;
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ bch_write_index(cl);
+}
+
+static void bch_write_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct cache_set *c = wbio->c;
+ struct bio *orig = wbio->orig;
+ struct cache *ca = wbio->ca;
+
+ if (cache_nonfatal_io_err_on(bio->bi_error, ca,
+ "data write"))
+ set_closure_fn(cl, bch_write_io_error, index_update_wq(op));
+
+ bch_account_io_completion_time(ca, wbio->submit_time_us,
+ REQ_OP_WRITE);
+ if (ca)
+ percpu_ref_put(&ca->ref);
+
+ if (bio->bi_error && orig)
+ orig->bi_error = bio->bi_error;
+
+ if (wbio->bounce)
+ bch_bio_free_pages_pool(c, bio);
+
+ if (wbio->put_bio)
+ bio_put(bio);
+
+ if (orig)
+ bio_endio(orig);
+ else
+ closure_put(cl);
+}
+
+static void init_append_extent(struct bch_write_op *op,
+ unsigned compressed_size,
+ unsigned uncompressed_size,
+ unsigned compression_type,
+ u64 csum, unsigned csum_type,
+ struct open_bucket *ob)
+{
+ struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
+
+ op->pos.offset += uncompressed_size;
+ e->k.p = op->pos;
+ e->k.size = uncompressed_size;
+
+ bch_extent_crc_append(e, compressed_size,
+ uncompressed_size,
+ compression_type,
+ csum, csum_type);
+
+ bch_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas,
+ ob, compressed_size);
+
+ bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED));
+ bch_keylist_push(&op->insert_keys);
+}
+
+static int bch_write_extent(struct bch_write_op *op,
+ struct open_bucket *ob,
+ struct bio *orig)
+{
+ struct cache_set *c = op->c;
+ struct bio *bio;
+ struct bch_write_bio *wbio;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+ struct bkey_i *key_to_write;
+ unsigned csum_type = c->opts.data_checksum;
+ unsigned compression_type = op->compression_type;
+ int ret;
+
+ /* don't refetch csum type/compression type */
+ barrier();
+
+ /* Need to decompress data? */
+ if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
+ (op->crc.uncompressed_size != op->size ||
+ op->crc.compressed_size > ob->sectors_free)) {
+ int ret;
+
+ ret = bch_bio_uncompress_inplace(c, orig, op->size, op->crc);
+ if (ret)
+ return ret;
+
+ op->flags &= ~BCH_WRITE_DATA_COMPRESSED;
+ }
+
+ if (op->flags & BCH_WRITE_DATA_COMPRESSED) {
+ init_append_extent(op,
+ op->crc.compressed_size,
+ op->crc.uncompressed_size,
+ op->crc.compression_type,
+ op->crc.csum,
+ op->crc.csum_type,
+ ob);
+
+ bio = orig;
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = false;
+ wbio->put_bio = false;
+ ret = 0;
+ } else if (csum_type != BCH_CSUM_NONE ||
+ compression_type != BCH_COMPRESSION_NONE) {
+ /* all units here in bytes */
+ unsigned total_output = 0, output_available =
+ min(ob->sectors_free << 9, orig->bi_iter.bi_size);
+ u64 csum;
+
+ bio = bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(output_available, PAGE_SIZE),
+ &c->bio_write);
+ /*
+ * XXX: can't use mempool for more than
+ * BCH_COMPRESSED_EXTENT_MAX worth of pages
+ */
+ bch_bio_alloc_pages_pool(c, bio, output_available);
+
+ /* copy WRITE_SYNC flag */
+ bio->bi_opf = orig->bi_opf;
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = true;
+ wbio->put_bio = true;
+
+ do {
+ unsigned fragment_compression_type = compression_type;
+ size_t dst_len, src_len;
+
+ bch_bio_compress(c, bio, &dst_len,
+ orig, &src_len,
+ &fragment_compression_type);
+
+ BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size);
+ BUG_ON(!src_len || src_len > orig->bi_iter.bi_size);
+ BUG_ON(dst_len & (block_bytes(c) - 1));
+ BUG_ON(src_len & (block_bytes(c) - 1));
+
+ swap(bio->bi_iter.bi_size, dst_len);
+ csum = bch_checksum_bio(bio, csum_type);
+ swap(bio->bi_iter.bi_size, dst_len);
+
+ init_append_extent(op,
+ dst_len >> 9, src_len >> 9,
+ fragment_compression_type,
+ csum, csum_type, ob);
+
+ total_output += dst_len;
+ bio_advance(bio, dst_len);
+ bio_advance(orig, src_len);
+ } while (bio->bi_iter.bi_size &&
+ orig->bi_iter.bi_size &&
+ !bch_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX));
+
+ BUG_ON(total_output > output_available);
+
+ memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
+ bio->bi_iter.bi_size = total_output;
+
+ /*
+ * Free unneeded pages after compressing:
+ */
+ while (bio->bi_vcnt * PAGE_SIZE >
+ round_up(bio->bi_iter.bi_size, PAGE_SIZE))
+ mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
+ &c->bio_bounce_pages);
+
+ ret = orig->bi_iter.bi_size != 0;
+ } else {
+ bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
+ &c->bio_write);
+
+ wbio = to_wbio(bio);
+ wbio->orig = NULL;
+ wbio->bounce = false;
+ wbio->put_bio = bio != orig;
+
+ init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
+ compression_type, 0, csum_type, ob);
+
+ ret = bio != orig;
+ }
+
+ bio->bi_end_io = bch_write_endio;
+ bio->bi_private = &op->cl;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ closure_get(bio->bi_private);
+
+ /* might have done a realloc... */
+
+ key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+ if (!(op->flags & BCH_WRITE_CACHED))
+ bch_check_mark_super(c, key_to_write, false);
+
+#ifndef CONFIG_BCACHE_NO_IO
+ bch_submit_wbio_replicas(to_wbio(bio), c, key_to_write, false);
+#else
+ to_wbio(bio)->ca = NULL;
+ bio_endio(bio);
+#endif
+ return ret;
+}
+
+static void __bch_write(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct cache_set *c = op->c;
+ struct bio *bio = &op->bio->bio;
+ unsigned open_bucket_nr = 0;
+ struct open_bucket *b;
+ int ret;
+
+ memset(op->open_buckets, 0, sizeof(op->open_buckets));
+
+ if (op->flags & BCH_WRITE_DISCARD) {
+ op->flags |= BCH_WRITE_DONE;
+ bch_write_discard(cl);
+ bio_put(bio);
+ continue_at(cl, bch_write_done, index_update_wq(op));
+ }
+
+ /*
+ * Journal writes are marked REQ_PREFLUSH; if the original write was a
+ * flush, it'll wait on the journal write.
+ */
+ bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
+
+ do {
+ EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
+ EBUG_ON(!bio_sectors(bio));
+
+ if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
+ continue_at(cl, bch_write_index, index_update_wq(op));
+
+		/* for the device pointers and 1 for the checksum */
+ if (bch_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX))
+ continue_at(cl, bch_write_index, index_update_wq(op));
+
+ b = bch_alloc_sectors_start(c, op->wp, op->nr_replicas,
+ op->alloc_reserve,
+ (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+ EBUG_ON(!b);
+
+ if (unlikely(IS_ERR(b))) {
+ if (unlikely(PTR_ERR(b) != -EAGAIN)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
+
+ /*
+ * If we already have some keys, must insert them first
+ * before allocating another open bucket. We only hit
+ * this case if open_bucket_nr > 1.
+ */
+ if (!bch_keylist_empty(&op->insert_keys))
+ continue_at(cl, bch_write_index,
+ index_update_wq(op));
+
+ /*
+ * If we've looped, we're running out of a workqueue -
+ * not the bch_write() caller's context - and we don't
+ * want to block the workqueue:
+ */
+ if (op->flags & BCH_WRITE_LOOPED)
+ continue_at(cl, __bch_write, op->io_wq);
+
+ /*
+ * Otherwise, we do want to block the caller on alloc
+ * failure instead of letting it queue up more and more
+ * writes:
+			 * XXX: this technically needs a try_to_freeze() -
+			 * except that's not safe because the caller may have
+			 * issued other IO... hmm..
+ */
+ closure_sync(cl);
+ continue;
+ }
+
+ BUG_ON(b - c->open_buckets == 0 ||
+ b - c->open_buckets > U8_MAX);
+ op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
+
+ ret = bch_write_extent(op, b, bio);
+
+ bch_alloc_sectors_done(c, op->wp, b);
+
+ if (ret < 0)
+ goto err;
+ } while (ret);
+
+ op->flags |= BCH_WRITE_DONE;
+ continue_at(cl, bch_write_index, index_update_wq(op));
+err:
+ if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
+ /*
+ * If we were writing cached data, not doing the write is fine
+ * so long as we discard whatever would have been overwritten -
+ * then it's equivalent to doing the write and immediately
+ * reclaiming it.
+ */
+
+ bch_write_discard(cl);
+ } else {
+ /*
+ * Right now we can only error here if we went RO - the
+ * allocation failed, but we already checked for -ENOSPC when we
+ * got our reservation.
+ *
+ * XXX capacity might have changed, but we don't check for that
+ * yet:
+ */
+ op->error = ret;
+ }
+
+ op->flags |= BCH_WRITE_DONE;
+
+ /*
+ * No reason not to insert keys for whatever data was successfully
+ * written (especially for a cmpxchg operation that's moving data
+ * around)
+ */
+ continue_at(cl, !bch_keylist_empty(&op->insert_keys)
+ ? bch_write_index
+ : bch_write_done, index_update_wq(op));
+}
+
+void bch_wake_delayed_writes(unsigned long data)
+{
+ struct cache_set *c = (void *) data;
+ struct bch_write_op *op;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
+
+ while ((op = c->write_wait_head)) {
+ if (!test_bit(CACHE_SET_RO, &c->flags) &&
+ !test_bit(CACHE_SET_STOPPING, &c->flags) &&
+ time_after(op->expires, jiffies)) {
+ mod_timer(&c->foreground_write_wakeup, op->expires);
+ break;
+ }
+
+ c->write_wait_head = op->next;
+ if (!c->write_wait_head)
+ c->write_wait_tail = NULL;
+
+ closure_put(&op->cl);
+ }
+
+ spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
+}
+
+/**
+ * bch_write - handle a write to a cache device or flash only volume
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in op->bio; bi_sector is used for the key offset, and
+ * op->pos.inode is used for the key inode.
+ *
+ * If BCH_WRITE_DISCARD is set in op->flags, instead of inserting the data it
+ * invalidates the region of the cache represented by op->bio and op->pos.inode.
+ */
+void bch_write(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bio *bio = &op->bio->bio;
+ struct cache_set *c = op->c;
+ u64 inode = op->pos.inode;
+
+ trace_bcache_write(c, inode, bio,
+ !(op->flags & BCH_WRITE_CACHED),
+ op->flags & BCH_WRITE_DISCARD);
+
+ if (!percpu_ref_tryget(&c->writes)) {
+ __bcache_io_error(c, "read only");
+ op->error = -EROFS;
+ bch_disk_reservation_put(c, &op->res);
+ closure_return(cl);
+ }
+
+ if (!(op->flags & BCH_WRITE_DISCARD))
+ bch_increment_clock(c, bio_sectors(bio), WRITE);
+
+ if (!(op->flags & BCH_WRITE_DISCARD))
+ bch_mark_foreground_write(c, bio_sectors(bio));
+ else
+ bch_mark_discard(c, bio_sectors(bio));
+
+ /* Don't call bch_next_delay() if rate is >= 1 GB/sec */
+
+ if (c->foreground_write_ratelimit_enabled &&
+ c->foreground_write_pd.rate.rate < (1 << 30) &&
+ !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) {
+ unsigned long flags;
+ u64 delay;
+
+ spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
+ bch_ratelimit_increment(&c->foreground_write_pd.rate,
+ bio->bi_iter.bi_size);
+
+ delay = bch_ratelimit_delay(&c->foreground_write_pd.rate);
+
+ if (delay >= HZ / 100) {
+ trace_bcache_write_throttle(c, inode, bio, delay);
+
+ closure_get(&op->cl); /* list takes a ref */
+
+ op->expires = jiffies + delay;
+ op->next = NULL;
+
+ if (c->write_wait_tail)
+ c->write_wait_tail->next = op;
+ else
+ c->write_wait_head = op;
+ c->write_wait_tail = op;
+
+ if (!timer_pending(&c->foreground_write_wakeup))
+ mod_timer(&c->foreground_write_wakeup,
+ op->expires);
+
+ spin_unlock_irqrestore(&c->foreground_write_pd_lock,
+ flags);
+ continue_at(cl, __bch_write, index_update_wq(op));
+ }
+
+ spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
+ }
+
+ continue_at_nobarrier(cl, __bch_write, NULL);
+}
+
+void bch_write_op_init(struct bch_write_op *op, struct cache_set *c,
+ struct bch_write_bio *bio, struct disk_reservation res,
+ struct write_point *wp, struct bpos pos,
+ u64 *journal_seq, unsigned flags)
+{
+ op->c = c;
+ op->io_wq = index_update_wq(op);
+ op->bio = bio;
+ op->written = 0;
+ op->error = 0;
+ op->flags = flags;
+ op->compression_type = c->opts.compression;
+ op->nr_replicas = res.nr_replicas;
+ op->alloc_reserve = RESERVE_NONE;
+ op->pos = pos;
+ op->version = 0;
+ op->res = res;
+ op->wp = wp;
+
+ if (journal_seq) {
+ op->journal_seq_p = journal_seq;
+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+ } else {
+ op->journal_seq = 0;
+ }
+
+ op->index_update_fn = bch_write_index_default;
+
+ bch_keylist_init(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys));
+
+ if (version_stress_test(c))
+ get_random_bytes(&op->version, sizeof(op->version));
+}
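+
+/*
+ * A minimal usage sketch (mirroring what the cache promote path further down
+ * does): the op embeds a closure, so after bch_write_op_init() the write is
+ * kicked off with closure_call() and completion is signalled on the parent
+ * closure:
+ *
+ *	bch_write_op_init(&op, c, wbio, res, wp, pos, journal_seq, flags);
+ *	closure_call(&op.cl, bch_write, c->wq, parent_cl);
+ */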
+
+/* Discard */
+
+/* bch_discard - discard a range of keys from start to end.
+ * @c		cache set
+ * @start	start position of the discard
+ * @end		end position of the discard (exclusive)
+ * @version	version of discard (0ULL if none)
+ *
+ * Returns:
+ *	 0 on success
+ *	<0 on error
+ *
+ * XXX: this needs to be refactored with inode_truncate, or more
+ * appropriately inode_truncate should call this
+ */
+int bch_discard(struct cache_set *c, struct bpos start,
+ struct bpos end, u64 version,
+ struct disk_reservation *disk_res,
+ struct extent_insert_hook *hook,
+ u64 *journal_seq)
+{
+ return bch_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version,
+ disk_res, hook, journal_seq);
+}
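+
+/*
+ * For example (an illustrative call, not from this patch), discarding every
+ * extent belonging to inode @inum, with no disk reservation, insert hook or
+ * journal sequence tracking:
+ *
+ *	bch_discard(c, POS(inum, 0), POS(inum + 1, 0), 0, NULL, NULL, NULL);
+ *
+ * which is essentially what bch_inode_truncate() does for a truncate to 0.
+ */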
+
+/* Cache promotion on read */
+
+struct cache_promote_op {
+ struct closure cl;
+ struct migrate_write write;
+ struct bio_vec bi_inline_vecs[0]; /* must be last */
+};
+
+/* Read */
+
+static int bio_checksum_uncompress(struct cache_set *c,
+ struct bch_read_bio *rbio)
+{
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch_rbio_parent(rbio)->bio;
+ struct bvec_iter dst_iter = rbio->parent_iter;
+ u64 csum;
+ int ret = 0;
+
+ /*
+ * reset iterator for checksumming and copying bounced data: here we've
+	 * set rbio->crc.compressed_size to the amount of data we actually read,
+ * which was not necessarily the full extent if we were only bouncing
+ * in order to promote
+ */
+ if (rbio->bounce) {
+ src->bi_iter.bi_size = rbio->crc.compressed_size << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
+ } else {
+ src->bi_iter = rbio->parent_iter;
+ }
+
+ csum = bch_checksum_bio(src, rbio->crc.csum_type);
+ if (cache_nonfatal_io_err_on(rbio->crc.csum != csum, rbio->ca,
+ "data checksum error, inode %llu offset %llu: expected %0llx got %0llx (type %u)",
+ rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
+ rbio->crc.csum, csum, rbio->crc.csum_type))
+ ret = -EIO;
+
+ /*
+ * If there was a checksum error, still copy the data back - unless it
+ * was compressed, we don't want to decompress bad data:
+ */
+ if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
+ if (!ret) {
+ ret = bch_bio_uncompress(c, src, dst,
+ dst_iter, rbio->crc);
+ if (ret)
+ __bcache_io_error(c, "decompression error");
+ }
+ } else if (rbio->bounce) {
+ bio_advance(src, rbio->crc.offset << 9);
+ bio_copy_data_iter(dst, dst_iter,
+ src, src->bi_iter);
+ }
+
+ return ret;
+}
+
+static void bch_rbio_free(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ struct bio *bio = &rbio->bio;
+
+ BUG_ON(rbio->ca);
+ BUG_ON(!rbio->split);
+
+ if (rbio->promote)
+ kfree(rbio->promote);
+ if (rbio->bounce)
+ bch_bio_free_pages_pool(c, bio);
+
+ bio_put(bio);
+}
+
+static void bch_rbio_done(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ struct bio *orig = &bch_rbio_parent(rbio)->bio;
+
+ percpu_ref_put(&rbio->ca->ref);
+ rbio->ca = NULL;
+
+ if (rbio->split) {
+ if (rbio->bio.bi_error)
+ orig->bi_error = rbio->bio.bi_error;
+
+ bio_endio(orig);
+ bch_rbio_free(c, rbio);
+ } else {
+ if (rbio->promote)
+ kfree(rbio->promote);
+
+ orig->bi_end_io = rbio->orig_bi_end_io;
+ bio_endio_nodec(orig);
+ }
+}
+
+/*
+ * Decide if we want to retry the read - returns true if read is being retried,
+ * false if caller should pass error on up
+ */
+static void bch_read_error_maybe_retry(struct cache_set *c,
+ struct bch_read_bio *rbio,
+ int error)
+{
+ unsigned long flags;
+
+ if ((error == -EINTR) &&
+ (rbio->flags & BCH_READ_RETRY_IF_STALE)) {
+ atomic_long_inc(&c->cache_read_races);
+ goto retry;
+ }
+
+ if (error == -EIO) {
+ /* io error - do we have another replica? */
+ }
+
+ bch_rbio_parent(rbio)->bio.bi_error = error;
+ bch_rbio_done(c, rbio);
+ return;
+retry:
+ percpu_ref_put(&rbio->ca->ref);
+ rbio->ca = NULL;
+
+ spin_lock_irqsave(&c->read_retry_lock, flags);
+ bio_list_add(&c->read_retry_list, &rbio->bio);
+ spin_unlock_irqrestore(&c->read_retry_lock, flags);
+ queue_work(c->wq, &c->read_retry_work);
+}
+
+static void cache_promote_done(struct closure *cl)
+{
+ struct cache_promote_op *op =
+ container_of(cl, struct cache_promote_op, cl);
+
+ bch_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
+ kfree(op);
+}
+
+/* Inner part that may run in process context */
+static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ int ret;
+
+ ret = bio_checksum_uncompress(c, rbio);
+ if (ret) {
+ bch_read_error_maybe_retry(c, rbio, ret);
+ return;
+ }
+
+ if (rbio->promote &&
+ !test_bit(CACHE_SET_RO, &c->flags) &&
+ !test_bit(CACHE_SET_STOPPING, &c->flags)) {
+ struct cache_promote_op *promote = rbio->promote;
+ struct closure *cl = &promote->cl;
+
+ BUG_ON(!rbio->split || !rbio->bounce);
+
+ /* we now own pages: */
+ swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
+ rbio->promote = NULL;
+
+ bch_rbio_done(c, rbio);
+
+ closure_init(cl, &c->cl);
+ closure_call(&promote->write.op.cl, bch_write, c->wq, cl);
+ closure_return_with_destructor(cl, cache_promote_done);
+ } else {
+ bch_rbio_done(c, rbio);
+ }
+}
+
+void bch_bio_decompress_work(struct work_struct *work)
+{
+ struct bio_decompress_worker *d =
+ container_of(work, struct bio_decompress_worker, work);
+ struct llist_node *list, *next;
+ struct bch_read_bio *rbio;
+
+ while ((list = llist_del_all(&d->bio_list)))
+ for (list = llist_reverse_order(list);
+ list;
+ list = next) {
+ next = llist_next(list);
+ rbio = container_of(list, struct bch_read_bio, list);
+
+ __bch_read_endio(d->c, rbio);
+ }
+}
+
+static void bch_read_endio(struct bio *bio)
+{
+ struct bch_read_bio *rbio =
+ container_of(bio, struct bch_read_bio, bio);
+ struct cache_set *c = rbio->ca->set;
+ int stale = ((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ ptr_stale(rbio->ca, &rbio->ptr) ? -EINTR : 0;
+ int error = bio->bi_error ?: stale;
+
+ bch_account_io_completion_time(rbio->ca, rbio->submit_time_us, REQ_OP_READ);
+
+ cache_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read");
+
+ if (error) {
+ bch_read_error_maybe_retry(c, rbio, error);
+ return;
+ }
+
+ if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
+ struct bio_decompress_worker *d;
+
+ preempt_disable();
+ d = this_cpu_ptr(c->bio_decompress_worker);
+ llist_add(&rbio->list, &d->bio_list);
+ queue_work(system_unbound_wq, &d->work);
+ preempt_enable();
+ } else {
+ __bch_read_endio(c, rbio);
+ }
+}
+
+void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
+ struct bvec_iter iter, struct bkey_s_c k,
+ struct extent_pick_ptr *pick, unsigned flags)
+{
+ struct bch_read_bio *rbio;
+ struct cache_promote_op *promote_op = NULL;
+ unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
+ bool bounce = false, split, read_full = false;
+
+ EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
+ k.k->p.offset < bvec_iter_end_sector(iter));
+
+ /* only promote if we're not reading from the fastest tier: */
+
+ /*
+ * XXX: multiple promotes can race with each other, wastefully. Keep a
+ * list of outstanding promotes?
+ */
+ if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) {
+ /*
+ * biovec needs to be big enough to hold decompressed data, if
+ * the bch_write_extent() has to decompress/recompress it:
+ */
+ unsigned sectors =
+ max_t(unsigned, k.k->size,
+ pick->crc.uncompressed_size);
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+
+ promote_op = kmalloc(sizeof(*promote_op) +
+ sizeof(struct bio_vec) * pages, GFP_NOIO);
+ if (promote_op) {
+ struct bio *promote_bio = &promote_op->write.wbio.bio;
+
+ bio_init(promote_bio);
+ promote_bio->bi_max_vecs = pages;
+ promote_bio->bi_io_vec = promote_bio->bi_inline_vecs;
+ bounce = true;
+ /* could also set read_full */
+ }
+ }
+
+ /*
+	 * note: if compression_type and csum_type are both none, then
+ * compressed/uncompressed size is zero
+ */
+ if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
+ (pick->crc.csum_type != BCH_CSUM_NONE &&
+ (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
+ (flags & BCH_READ_FORCE_BOUNCE)))) {
+ read_full = true;
+ bounce = true;
+ }
+
+ if (bounce) {
+ unsigned sectors = read_full
+ ? (pick->crc.compressed_size ?: k.k->size)
+ : bvec_iter_sectors(iter);
+
+ rbio = container_of(bio_alloc_bioset(GFP_NOIO,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ &c->bio_read_split),
+ struct bch_read_bio, bio);
+
+ bch_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+ split = true;
+ } else if (!(flags & BCH_READ_MAY_REUSE_BIO) ||
+ !(flags & BCH_READ_IS_LAST)) {
+ /*
+ * Have to clone if there were any splits, because of error
+ * reporting: if a split errored and retrying didn't work, then when
+ * the error is reported to the parent (us) we can't tell whether it
+ * came from our portion of the bio - in which case we should retry -
+ * or from the whole bio, in which case retrying would lose the
+ * error.
+ */
+ rbio = container_of(bio_clone_fast(&orig->bio,
+ GFP_NOIO, &c->bio_read_split),
+ struct bch_read_bio, bio);
+ rbio->bio.bi_iter = iter;
+ split = true;
+ } else {
+ rbio = orig;
+ rbio->bio.bi_iter = iter;
+ split = false;
+ BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+ }
+
+ if (!(flags & BCH_READ_IS_LAST))
+ __bio_inc_remaining(&orig->bio);
+
+ if (split)
+ rbio->parent = orig;
+ else
+ rbio->orig_bi_end_io = orig->bio.bi_end_io;
+ rbio->parent_iter = iter;
+
+ rbio->inode = k.k->p.inode;
+ rbio->flags = flags;
+ rbio->bounce = bounce;
+ rbio->split = split;
+ rbio->crc = pick->crc;
+ /*
+ * crc.compressed_size will be 0 if there wasn't any checksum
+ * information; we also need to stash the original size of the bio if
+ * we bounced (which isn't necessarily the original key size, if we
+ * only bounced for promoting)
+ */
+ rbio->crc.compressed_size = bio_sectors(&rbio->bio);
+ rbio->ptr = pick->ptr;
+ rbio->ca = pick->ca;
+ rbio->promote = promote_op;
+
+ rbio->bio.bi_bdev = pick->ca->disk_sb.bdev;
+ rbio->bio.bi_opf = orig->bio.bi_opf;
+ rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
+ rbio->bio.bi_end_io = bch_read_endio;
+
+ if (promote_op) {
+ struct bio *promote_bio = &promote_op->write.wbio.bio;
+
+ promote_bio->bi_iter = rbio->bio.bi_iter;
+ memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+
+ bch_migrate_write_init(c, &promote_op->write,
+ &c->promote_write_point,
+ k, NULL,
+ BCH_WRITE_ALLOC_NOWAIT);
+ promote_op->write.promote = true;
+
+ if (rbio->crc.compression_type) {
+ promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
+ promote_op->write.op.crc = rbio->crc;
+ promote_op->write.op.size = k.k->size;
+ } else if (read_full) {
+ /*
+ * Adjust bio to correspond to _live_ portion of @k -
+ * which might be less than what we're actually reading:
+ */
+ bio_advance(promote_bio, rbio->crc.offset << 9);
+ BUG_ON(bio_sectors(promote_bio) < k.k->size);
+ promote_bio->bi_iter.bi_size = k.k->size << 9;
+ } else {
+ /*
+ * Set insert pos to correspond to what we're actually
+ * reading:
+ */
+ promote_op->write.op.pos.offset = iter.bi_sector;
+ }
+
+ promote_bio->bi_iter.bi_sector =
+ promote_op->write.op.pos.offset;
+ }
+
+ /* _after_ promote stuff has looked at rbio->crc.offset */
+ if (read_full)
+ rbio->crc.offset += skip;
+ else
+ rbio->bio.bi_iter.bi_sector += skip;
+
+ rbio->submit_time_us = local_clock_us();
+
+#ifndef CONFIG_BCACHE_NO_IO
+ generic_make_request(&rbio->bio);
+#else
+ bio_endio(&rbio->bio);
+#endif
+}
+
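+/*
+ * Walk the extents btree over the range described by @bvec_iter and @inode,
+ * issuing one read (or zero-fill, for holes) per extent; the btree iterator
+ * is unlocked before each IO is submitted.
+ */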
+static void bch_read_iter(struct cache_set *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ unsigned flags)
+{
+ struct bio *bio = &rbio->bio;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector), k) {
+ BKEY_PADDED(k) tmp;
+ struct extent_pick_ptr pick;
+ unsigned bytes, sectors;
+ bool is_last;
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch_btree_iter_unlock(&iter);
+
+ bch_extent_pick_ptr(c, k, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, bio, "no device to read from");
+ bio_endio(bio);
+ return;
+ }
+
+ sectors = min_t(u64, k.k->p.offset,
+ bvec_iter_end_sector(bvec_iter)) -
+ bvec_iter.bi_sector;
+ bytes = sectors << 9;
+ is_last = bytes == bvec_iter.bi_size;
+ swap(bvec_iter.bi_size, bytes);
+
+ if (is_last)
+ flags |= BCH_READ_IS_LAST;
+
+ if (pick.ca) {
+ PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ c->prio_clock[READ].hand;
+
+ bch_read_extent_iter(c, rbio, bvec_iter,
+ k, &pick, flags);
+
+ flags &= ~BCH_READ_MAY_REUSE_BIO;
+ } else {
+ zero_fill_bio_iter(bio, bvec_iter);
+
+ if (is_last)
+ bio_endio(bio);
+ }
+
+ if (is_last)
+ return;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(bio, &bvec_iter, bytes);
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ bcache_io_error(c, bio, "btree IO error %i", ret);
+ bio_endio(bio);
+}
+
+void bch_read(struct cache_set *c, struct bch_read_bio *bio, u64 inode)
+{
+ bch_increment_clock(c, bio_sectors(&bio->bio), READ);
+
+ bch_read_iter(c, bio, bio->bio.bi_iter, inode,
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE|
+ BCH_READ_MAY_REUSE_BIO);
+}
+EXPORT_SYMBOL(bch_read);
+
+/**
+ * bch_read_retry - re-submit a bio originally from bch_read()
+ */
+static void bch_read_retry(struct cache_set *c, struct bch_read_bio *rbio)
+{
+ struct bch_read_bio *parent = bch_rbio_parent(rbio);
+ struct bvec_iter iter = rbio->parent_iter;
+ u64 inode = rbio->inode;
+
+ trace_bcache_read_retry(&rbio->bio);
+
+ if (rbio->split)
+ bch_rbio_free(c, rbio);
+ else
+ rbio->bio.bi_end_io = rbio->orig_bi_end_io;
+
+ bch_read_iter(c, parent, iter, inode,
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_PROMOTE);
+}
+
+void bch_read_retry_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(work, struct cache_set,
+ read_retry_work);
+ struct bch_read_bio *rbio;
+ struct bio *bio;
+ unsigned long flags;
+
+ while (1) {
+ spin_lock_irqsave(&c->read_retry_lock, flags);
+ bio = bio_list_pop(&c->read_retry_list);
+ spin_unlock_irqrestore(&c->read_retry_lock, flags);
+
+ if (!bio)
+ break;
+
+ rbio = container_of(bio, struct bch_read_bio, bio);
+ bch_read_retry(c, rbio);
+ }
+}
diff --git a/libbcache/io.h b/libbcache/io.h
new file mode 100644
index 0000000..b7668b4
--- /dev/null
+++ b/libbcache/io.h
@@ -0,0 +1,90 @@
+#ifndef _BCACHE_IO_H
+#define _BCACHE_IO_H
+
+#include "io_types.h"
+
+#define to_wbio(_bio) \
+ container_of((_bio), struct bch_write_bio, bio)
+
+#define to_rbio(_bio) \
+ container_of((_bio), struct bch_read_bio, bio)
+
+void bch_bio_free_pages_pool(struct cache_set *, struct bio *);
+void bch_bio_alloc_pages_pool(struct cache_set *, struct bio *, size_t);
+
+enum bch_write_flags {
+ BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
+ BCH_WRITE_DISCARD = (1 << 1),
+ BCH_WRITE_CACHED = (1 << 2),
+ BCH_WRITE_FLUSH = (1 << 3),
+ BCH_WRITE_DISCARD_ON_ERROR = (1 << 4),
+ BCH_WRITE_DATA_COMPRESSED = (1 << 5),
+
+ /* Internal: */
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
+ BCH_WRITE_DONE = (1 << 7),
+ BCH_WRITE_LOOPED = (1 << 8),
+};
+
+static inline u64 *op_journal_seq(struct bch_write_op *op)
+{
+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
+ ? op->journal_seq_p : &op->journal_seq;
+}
+
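+/*
+ * Hash an arbitrary value onto one of the cache set's foreground write
+ * points, spreading writers across the available write points:
+ */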
+static inline struct write_point *foreground_write_point(struct cache_set *c,
+ unsigned long v)
+{
+ return c->write_points +
+ hash_long(v, ilog2(ARRAY_SIZE(c->write_points)));
+}
+
+void bch_write_op_init(struct bch_write_op *, struct cache_set *,
+ struct bch_write_bio *,
+ struct disk_reservation, struct write_point *,
+ struct bpos, u64 *, unsigned);
+void bch_write(struct closure *);
+
+struct cache_promote_op;
+
+struct extent_pick_ptr;
+
+void bch_read_extent_iter(struct cache_set *, struct bch_read_bio *,
+ struct bvec_iter, struct bkey_s_c k,
+ struct extent_pick_ptr *, unsigned);
+
+static inline void bch_read_extent(struct cache_set *c,
+ struct bch_read_bio *orig,
+ struct bkey_s_c k,
+ struct extent_pick_ptr *pick,
+ unsigned flags)
+{
+ bch_read_extent_iter(c, orig, orig->bio.bi_iter,
+ k, pick, flags);
+}
+
+enum bch_read_flags {
+ BCH_READ_FORCE_BOUNCE = 1 << 0,
+ BCH_READ_RETRY_IF_STALE = 1 << 1,
+ BCH_READ_PROMOTE = 1 << 2,
+ BCH_READ_IS_LAST = 1 << 3,
+ BCH_READ_MAY_REUSE_BIO = 1 << 4,
+};
+
+void bch_read(struct cache_set *, struct bch_read_bio *, u64);
+
+void bch_generic_make_request(struct bio *, struct cache_set *);
+void bch_bio_submit_work(struct work_struct *);
+void bch_submit_wbio_replicas(struct bch_write_bio *, struct cache_set *,
+ const struct bkey_i *, bool);
+
+int bch_discard(struct cache_set *, struct bpos, struct bpos,
+ u64, struct disk_reservation *,
+ struct extent_insert_hook *, u64 *);
+
+void bch_read_retry_work(struct work_struct *);
+void bch_wake_delayed_writes(unsigned long data);
+
+void bch_bio_decompress_work(struct work_struct *);
+
+#endif /* _BCACHE_IO_H */
diff --git a/libbcache/io_types.h b/libbcache/io_types.h
new file mode 100644
index 0000000..f7d99cd
--- /dev/null
+++ b/libbcache/io_types.h
@@ -0,0 +1,148 @@
+#ifndef _BCACHE_IO_TYPES_H
+#define _BCACHE_IO_TYPES_H
+
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "keylist_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bch_read_bio {
+ /*
+ * Reads will often have to be split, and if the extent being read from
+ * was checksummed or compressed we'll also have to allocate bounce
+ * buffers and copy the data back into the original bio.
+ *
+ * If we didn't have to split, we have to save and restore the original
+ * bi_end_io - @split below indicates which:
+ */
+ union {
+ struct bch_read_bio *parent;
+ bio_end_io_t *orig_bi_end_io;
+ };
+
+ /*
+ * Saved copy of parent->bi_iter, from submission time - allows us to
+ * resubmit on IO error, and also to copy data back to the original bio
+ * when we're bouncing:
+ */
+ struct bvec_iter parent_iter;
+
+ /*
+ * If we have to retry the read (IO error, checksum failure, read stale
+ * data (raced with allocator), we retry the portion of the parent bio
+ * that failed (i.e. this bio's portion, parent_iter).
+ *
+ * But we need to stash the inode somewhere:
+ */
+ u64 inode;
+
+ unsigned submit_time_us;
+ u16 flags;
+ u8 bounce:1,
+ split:1;
+
+ struct bch_extent_crc64 crc;
+ struct bch_extent_ptr ptr;
+ struct cache *ca;
+
+ struct cache_promote_op *promote;
+
+ /* bio_decompress_worker list */
+ struct llist_node list;
+
+ struct bio bio;
+};
+
+static inline struct bch_read_bio *
+bch_rbio_parent(struct bch_read_bio *rbio)
+{
+ return rbio->split ? rbio->parent : rbio;
+}
+
+struct bch_write_bio {
+ struct cache_set *c;
+ struct cache *ca;
+ union {
+ struct bio *orig;
+ struct closure *cl;
+ };
+
+ unsigned submit_time_us;
+ unsigned split:1,
+ bounce:1,
+ put_bio:1;
+
+ /* Only for btree writes: */
+ unsigned used_mempool:1;
+ u8 order;
+
+ struct bio bio;
+};
+
+struct bch_replace_info {
+ struct extent_insert_hook hook;
+ /* How many insertions succeeded */
+ unsigned successes;
+ /* How many insertions failed */
+ unsigned failures;
+ BKEY_PADDED(key);
+};
+
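+/*
+ * State for a single write: the target write point, disk reservation,
+ * checksum/compression settings, and the keylist of index updates that go
+ * with the data (applied via index_update_fn).
+ */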
+struct bch_write_op {
+ struct closure cl;
+ struct cache_set *c;
+ struct workqueue_struct *io_wq;
+ struct bch_write_bio *bio;
+
+ unsigned written; /* sectors */
+
+ short error;
+
+ u16 flags;
+ unsigned compression_type:4;
+ unsigned nr_replicas:4;
+ unsigned alloc_reserve:4;
+
+ struct bpos pos;
+ unsigned version;
+
+ /* For BCH_WRITE_DATA_COMPRESSED: */
+ struct bch_extent_crc64 crc;
+ unsigned size;
+
+ struct disk_reservation res;
+
+ struct write_point *wp;
+
+ union {
+ u8 open_buckets[16];
+ struct {
+ struct bch_write_op *next;
+ unsigned long expires;
+ };
+ };
+
+ /*
+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we
+ * still need to stash the journal_seq somewhere:
+ */
+ union {
+ u64 *journal_seq_p;
+ u64 journal_seq;
+ };
+
+ int (*index_update_fn)(struct bch_write_op *);
+
+ struct keylist insert_keys;
+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+};
+
+struct bio_decompress_worker {
+ struct cache_set *c;
+ struct work_struct work;
+ struct llist_head bio_list;
+};
+
+#endif /* _BCACHE_IO_TYPES_H */
diff --git a/libbcache/journal.c b/libbcache/journal.c
new file mode 100644
index 0000000..ffc9573
--- /dev/null
+++ b/libbcache/journal.c
@@ -0,0 +1,2585 @@
+/*
+ * bcache journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "bkey_methods.h"
+#include "buckets.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "journal.h"
+#include "super.h"
+
+#include <trace/events/bcache.h>
+
+static void journal_write(struct closure *);
+static void journal_reclaim_fast(struct journal *);
+static void journal_pin_add_entry(struct journal *,
+ struct journal_entry_pin_list *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
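+/*
+ * The journal is double buffered: j->buf[reservations.idx] is the entry
+ * currently being filled, while the other buffer holds the previous entry,
+ * which may still be in flight:
+ */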
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ return j->buf + j->reservations.idx;
+}
+
+static inline struct journal_buf *journal_prev_buf(struct journal *j)
+{
+ return j->buf + !j->reservations.idx;
+}
+
+/* Sequence number of oldest dirty journal entry */
+
+static inline u64 last_seq(struct journal *j)
+{
+ return atomic64_read(&j->seq) - fifo_used(&j->pin) + 1;
+}
+
+static inline u64 journal_pin_seq(struct journal *j,
+ struct journal_entry_pin_list *pin_list)
+{
+ return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
+}
+
+#define for_each_jset_entry(entry, jset) \
+ for (entry = (jset)->start; \
+ entry < bkey_idx(jset, le32_to_cpu((jset)->u64s)); \
+ entry = jset_keys_next(entry))
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+ struct jset_entry *entry, unsigned type)
+{
+ while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) {
+ if (JOURNAL_ENTRY_TYPE(entry) == type)
+ return entry;
+
+ entry = jset_keys_next(entry);
+ }
+
+ return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type) \
+ for (entry = (jset)->start; \
+ (entry = __jset_entry_type_next(jset, entry, type)); \
+ entry = jset_keys_next(entry))
+
+#define for_each_jset_key(k, _n, entry, jset) \
+ for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
+ for (k = (entry)->start; \
+ (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\
+ (_n = bkey_next(k), 1)); \
+ k = _n)
+
+static inline void bch_journal_add_entry(struct journal_buf *buf,
+ const void *data, size_t u64s,
+ unsigned type, enum btree_id id,
+ unsigned level)
+{
+ struct jset *jset = buf->data;
+
+ bch_journal_add_entry_at(buf, data, u64s, type, id, level,
+ le32_to_cpu(jset->u64s));
+ le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+}
+
+static struct jset_entry *bch_journal_find_entry(struct jset *j, unsigned type,
+ enum btree_id id)
+{
+ struct jset_entry *entry;
+
+ for_each_jset_entry_type(entry, j, type)
+ if (entry->btree_id == id)
+ return entry;
+
+ return NULL;
+}
+
+struct bkey_i *bch_journal_find_btree_root(struct cache_set *c, struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry =
+ bch_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id);
+
+ if (!entry)
+ return NULL;
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+static void bch_journal_add_btree_root(struct journal_buf *buf,
+ enum btree_id id, struct bkey_i *k,
+ unsigned level)
+{
+ bch_journal_add_entry(buf, k, k->k.u64s,
+ JOURNAL_ENTRY_BTREE_ROOT, id, level);
+}
+
+static inline void bch_journal_add_prios(struct journal *j,
+ struct journal_buf *buf)
+{
+ /*
+ * no prio bucket ptrs yet... XXX should change the allocator so this
+ * can't happen:
+ */
+ if (!buf->nr_prio_buckets)
+ return;
+
+ bch_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets,
+ JOURNAL_ENTRY_PRIO_PTRS, 0, 0);
+}
+
+static void journal_seq_blacklist_flush(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct cache_set *c =
+ container_of(j, struct cache_set, journal);
+ struct journal_seq_blacklist *bl =
+ container_of(pin, struct journal_seq_blacklist, pin);
+ struct blacklisted_node n;
+ struct closure cl;
+ unsigned i;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ for (i = 0;; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ bch_btree_iter_init(&iter, c, n.btree_id, n.pos);
+ iter.is_extents = false;
+redo_peek:
+ b = bch_btree_iter_peek_node(&iter);
+
+ /* The node might have already been rewritten: */
+
+ if (b->data->keys.seq == n.seq &&
+ !bkey_cmp(b->key.k.p, n.pos)) {
+ ret = bch_btree_node_rewrite(&iter, b, &cl);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ closure_sync(&cl);
+
+ if (ret == -EAGAIN ||
+ ret == -EINTR)
+ goto redo_peek;
+
+ /* -EROFS or perhaps -ENOSPC - bail out: */
+ /* XXX warn here */
+ return;
+ }
+ }
+
+ bch_btree_iter_unlock(&iter);
+ }
+
+ closure_sync(&cl);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ for (i = 0;; i++) {
+ struct btree_interior_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ /*
+ * Is the node on the list of pending interior node updates -
+ * being freed? If so, wait for that to finish:
+ */
+ for_each_pending_btree_node_free(c, as, d)
+ if (n.seq == d->seq &&
+ n.btree_id == d->btree_id &&
+ !d->level &&
+ !bkey_cmp(n.pos, d->key.k.p)) {
+ closure_wait(&as->wait, &cl);
+ mutex_unlock(&c->btree_interior_update_lock);
+ closure_sync(&cl);
+ break;
+ }
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ mutex_lock(&j->blacklist_lock);
+
+ bch_journal_pin_drop(j, &bl->pin);
+ list_del(&bl->list);
+ kfree(bl->entries);
+ kfree(bl);
+
+ mutex_unlock(&j->blacklist_lock);
+}
+
+static struct journal_seq_blacklist *
+journal_seq_blacklist_find(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (seq == bl->seq)
+ return bl;
+
+ return NULL;
+}
+
+static struct journal_seq_blacklist *
+bch_journal_seq_blacklisted_new(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return NULL;
+
+ bl->seq = seq;
+ list_add_tail(&bl->list, &j->seq_blacklist);
+ return bl;
+}
+
+/*
+ * Returns true if @seq is newer than the most recent journal entry that got
+ * written, and data corresponding to @seq should be ignored - also marks @seq
+ * as blacklisted so that on future restarts the corresponding data will still
+ * be ignored:
+ */
+int bch_journal_seq_should_ignore(struct cache_set *c, u64 seq, struct btree *b)
+{
+ struct journal *j = &c->journal;
+ struct journal_seq_blacklist *bl = NULL;
+ struct blacklisted_node *n;
+ u64 journal_seq, i;
+ int ret = 0;
+
+ if (!seq)
+ return 0;
+
+ journal_seq = atomic64_read(&j->seq);
+
+ /* Interior node updates aren't journalled: */
+ BUG_ON(b->level);
+ BUG_ON(seq > journal_seq && test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags));
+
+ if (seq <= journal_seq) {
+ if (list_empty_careful(&j->seq_blacklist))
+ return 0;
+
+ mutex_lock(&j->blacklist_lock);
+ ret = journal_seq_blacklist_find(j, seq) != NULL;
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+ }
+
+ /*
+ * Decrease this back to j->seq + 2 when we next rev the on disk format:
+ * increasing it temporarily to work around bug in old kernels
+ */
+ cache_set_inconsistent_on(seq > journal_seq + 4, c,
+ "bset journal seq too far in the future: %llu > %llu",
+ seq, journal_seq);
+
+ bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
+ b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
+
+ /*
+ * When we start the journal, bch_journal_start() will skip over @seq:
+ */
+
+ mutex_lock(&j->blacklist_lock);
+
+ for (i = journal_seq + 1; i <= seq; i++) {
+ bl = journal_seq_blacklist_find(j, i) ?:
+ bch_journal_seq_blacklisted_new(j, i);
+
+ if (!bl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
+ if (b->data->keys.seq == n->seq &&
+ b->btree_id == n->btree_id &&
+ !bkey_cmp(b->key.k.p, n->pos))
+ goto found_entry;
+
+ if (!bl->nr_entries ||
+ is_power_of_2(bl->nr_entries)) {
+ n = krealloc(bl->entries,
+ max(bl->nr_entries * 2, 8UL) * sizeof(*n),
+ GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ bl->entries = n;
+ }
+
+ bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
+ .seq = b->data->keys.seq,
+ .btree_id = b->btree_id,
+ .pos = b->key.k.p,
+ };
+found_entry:
+ ret = 1;
+out:
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+}
+
+/*
+ * Journal replay/recovery:
+ *
+ * This code is all driven from run_cache_set(); we first read the journal
+ * entries, do some other stuff, then we mark all the keys in the journal
+ * entries (same as garbage collection would), then we replay them - reinserting
+ * them into the cache in precisely the same order as they appear in the
+ * journal.
+ *
+ * We only journal keys that go in leaf nodes, which simplifies things quite a
+ * bit.
+ */
+
+struct journal_list {
+ struct closure cl;
+ struct mutex lock;
+ struct mutex cache_set_buffer_lock;
+ struct list_head *head;
+ int ret;
+};
+
+#define JOURNAL_ENTRY_ADD_OK 0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ */
+static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
+ struct jset *j)
+{
+ struct journal_replay *i, *pos;
+ struct list_head *where;
+ size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+ __le64 last_seq;
+ int ret;
+
+ mutex_lock(&jlist->lock);
+
+ last_seq = !list_empty(jlist->head)
+ ? list_last_entry(jlist->head, struct journal_replay,
+ list)->j.last_seq
+ : 0;
+
+ /* Is this entry older than the range we need? */
+ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+ goto out;
+ }
+
+ /* Drop entries we don't need anymore */
+ list_for_each_entry_safe(i, pos, jlist->head, list) {
+ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+ break;
+ list_del(&i->list);
+ kvfree(i);
+ }
+
+ list_for_each_entry_reverse(i, jlist->head, list) {
+ /* Duplicate? */
+ if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+ fsck_err_on(bytes != __set_bytes(&i->j,
+ le32_to_cpu(i->j.u64s)) ||
+ memcmp(j, &i->j, bytes), c,
+ "found duplicate but non identical journal entries (seq %llu)",
+ le64_to_cpu(j->seq));
+
+ ret = JOURNAL_ENTRY_ADD_OK;
+ goto out;
+ }
+
+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+ where = &i->list;
+ goto add;
+ }
+ }
+
+ where = jlist->head;
+add:
+ i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(&i->j, j, bytes);
+ list_add(&i->list, where);
+ ret = JOURNAL_ENTRY_ADD_OK;
+out:
+fsck_err:
+ mutex_unlock(&jlist->lock);
+ return ret;
+}
+
+static void journal_entry_null_range(void *start, void *end)
+{
+ struct jset_entry *entry;
+
+ for (entry = start; entry != end; entry = jset_keys_next(entry)) {
+ entry->u64s = 0;
+ entry->btree_id = 0;
+ entry->level = 0;
+ entry->flags = 0;
+ SET_JOURNAL_ENTRY_TYPE(entry, 0);
+ }
+}
+
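+/*
+ * Validate a single key within a journal entry; invalid keys are dropped or
+ * truncated in place (via fsck_err) so that replay can continue with
+ * whatever remains:
+ */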
+static int journal_validate_key(struct cache_set *c, struct jset *j,
+ struct jset_entry *entry,
+ struct bkey_i *k, enum bkey_type key_type,
+ const char *type)
+{
+ void *next = jset_keys_next(entry);
+ const char *invalid;
+ char buf[160];
+ int ret = 0;
+
+ if (fsck_err_on(!k->k.u64s, c,
+ "invalid %s in journal: k->u64s 0", type)) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+
+ if (fsck_err_on((void *) bkey_next(k) >
+ (void *) jset_keys_next(entry), c,
+ "invalid %s in journal: extends past end of journal entry",
+ type)) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+
+ if (fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
+ "invalid %s in journal: bad format %u",
+ type, k->k.format)) {
+ le16_add_cpu(&entry->u64s, -k->k.u64s);
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+
+ if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN)
+ bch_bkey_swab(key_type, NULL, bkey_to_packed(k));
+
+ invalid = bkey_invalid(c, key_type, bkey_i_to_s_c(k));
+ if (invalid) {
+ bch_bkey_val_to_text(c, key_type, buf, sizeof(buf),
+ bkey_i_to_s_c(k));
+ fsck_err(c, "invalid %s in journal: %s", type, buf);
+
+ le16_add_cpu(&entry->u64s, -k->k.u64s);
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(jset_keys_next(entry), next);
+ return 0;
+ }
+fsck_err:
+ return ret;
+}
+
+#define JOURNAL_ENTRY_REREAD 5
+#define JOURNAL_ENTRY_NONE 6
+#define JOURNAL_ENTRY_BAD 7
+
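+/*
+ * Validate a jset read from disk: returns 0 if valid, JOURNAL_ENTRY_NONE if
+ * it doesn't look like a journal entry at all, JOURNAL_ENTRY_REREAD if the
+ * entry extends past what's been read so far, JOURNAL_ENTRY_BAD if the size
+ * or checksum is bad, or a fatal fsck error code:
+ */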
+static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read)
+{
+ struct jset_entry *entry;
+ size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+ u64 got, expect;
+ int ret = 0;
+
+ if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb))
+ return JOURNAL_ENTRY_NONE;
+
+ if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
+ bch_err(c, "unknown journal entry version %u",
+ le32_to_cpu(j->version));
+ return BCH_FSCK_UNKNOWN_VERSION;
+ }
+
+ if (fsck_err_on(bytes > bucket_sectors_left << 9 ||
+ bytes > c->journal.entry_size_max, c,
+ "journal entry too big (%zu bytes), sector %lluu",
+ bytes, sector)) {
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ if (bytes > sectors_read << 9)
+ return JOURNAL_ENTRY_REREAD;
+
+ got = le64_to_cpu(j->csum);
+ expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j));
+ if (fsck_err_on(got != expect, c,
+ "journal checksum bad (got %llu expect %llu), sector %lluu",
+ got, expect, sector)) {
+ /* XXX: retry IO, when we start retrying checksum errors */
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ if (fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+ "invalid journal entry: last_seq > seq"))
+ j->last_seq = j->seq;
+
+ for_each_jset_entry(entry, j) {
+ struct bkey_i *k;
+
+ if (fsck_err_on(jset_keys_next(entry) >
+ bkey_idx(j, le32_to_cpu(j->u64s)), c,
+ "journal entry extents past end of jset")) {
+ j->u64s = cpu_to_le32((u64 *) entry - j->_data);
+ break;
+ }
+
+ switch (JOURNAL_ENTRY_TYPE(entry)) {
+ case JOURNAL_ENTRY_BTREE_KEYS:
+ for (k = entry->start;
+ k < bkey_idx(entry, le16_to_cpu(entry->u64s));
+ k = bkey_next(k)) {
+ ret = journal_validate_key(c, j, entry, k,
+ bkey_type(entry->level,
+ entry->btree_id),
+ "key");
+ if (ret)
+ goto fsck_err;
+ }
+ break;
+
+ case JOURNAL_ENTRY_BTREE_ROOT:
+ k = entry->start;
+
+ if (fsck_err_on(!entry->u64s ||
+ le16_to_cpu(entry->u64s) != k->k.u64s, c,
+ "invalid btree root journal entry: wrong number of keys")) {
+ journal_entry_null_range(entry,
+ jset_keys_next(entry));
+ continue;
+ }
+
+ ret = journal_validate_key(c, j, entry, k,
+ BKEY_TYPE_BTREE, "btree root");
+ if (ret)
+ goto fsck_err;
+ break;
+
+ case JOURNAL_ENTRY_PRIO_PTRS:
+ break;
+
+ case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED:
+ if (fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry,
+ jset_keys_next(entry));
+ }
+
+ break;
+ default:
+ fsck_err(c, "invalid journal entry type %llu",
+ JOURNAL_ENTRY_TYPE(entry));
+ journal_entry_null_range(entry, jset_keys_next(entry));
+ break;
+ }
+ }
+
+fsck_err:
+ return ret;
+}
+
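+/*
+ * Read and validate every journal entry in a single journal bucket, adding
+ * the good ones to the replay list; *seq is updated to the highest sequence
+ * number seen and *entries_found is set if anything was added:
+ */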
+static int journal_read_bucket(struct cache *ca, struct journal_list *jlist,
+ unsigned bucket, u64 *seq, bool *entries_found)
+{
+ struct cache_set *c = ca->set;
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = ja->bio;
+ struct jset *j, *data;
+ unsigned blocks, sectors_read, bucket_offset = 0;
+ unsigned max_entry_sectors = c->journal.entry_size_max >> 9;
+ u64 sector = bucket_to_sector(ca,
+ journal_bucket(ca->disk_sb.sb, bucket));
+ bool saw_bad = false;
+ int ret = 0;
+
+ data = (void *) __get_free_pages(GFP_KERNEL,
+ get_order(c->journal.entry_size_max));
+ if (!data) {
+ mutex_lock(&jlist->cache_set_buffer_lock);
+ data = c->journal.buf[0].data;
+ }
+
+ pr_debug("reading %u", bucket);
+
+ while (bucket_offset < ca->mi.bucket_size) {
+reread:
+ sectors_read = min_t(unsigned,
+ ca->mi.bucket_size - bucket_offset,
+ max_entry_sectors);
+
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = sector + bucket_offset;
+ bio->bi_iter.bi_size = sectors_read << 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bch_bio_map(bio, data);
+
+ ret = submit_bio_wait(bio);
+
+ if (cache_fatal_io_err_on(ret, ca,
+ "journal read from sector %llu",
+ sector + bucket_offset) ||
+ bch_meta_read_fault("journal")) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /*
+ * This function could be simpler now since we no longer write
+ * journal entries that overlap bucket boundaries; this means
+ * the start of a bucket will always have a valid journal entry
+ * if it has any journal entries at all.
+ */
+
+ j = data;
+ while (sectors_read) {
+ ret = journal_entry_validate(c, j,
+ sector + bucket_offset,
+ ca->mi.bucket_size - bucket_offset,
+ sectors_read);
+ switch (ret) {
+ case BCH_FSCK_OK:
+ break;
+ case JOURNAL_ENTRY_REREAD:
+ goto reread;
+ case JOURNAL_ENTRY_NONE:
+ if (!saw_bad)
+ goto out;
+ blocks = 1;
+ goto next_block;
+ case JOURNAL_ENTRY_BAD:
+ saw_bad = true;
+ blocks = 1;
+ goto next_block;
+ default:
+ goto err;
+ }
+
+ /*
+ * This happens sometimes if we don't have discards on -
+ * when we've partially overwritten a bucket with new
+ * journal entries. We don't need the rest of the
+ * bucket:
+ */
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+ goto out;
+
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+ ret = journal_entry_add(c, jlist, j);
+ switch (ret) {
+ case JOURNAL_ENTRY_ADD_OK:
+ *entries_found = true;
+ break;
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+ break;
+ default:
+ goto err;
+ }
+
+ if (le64_to_cpu(j->seq) > *seq)
+ *seq = le64_to_cpu(j->seq);
+next_block:
+ blocks = __set_blocks(j, le32_to_cpu(j->u64s),
+ block_bytes(c));
+
+ pr_debug("next");
+ bucket_offset += blocks * c->sb.block_size;
+ sectors_read -= blocks * c->sb.block_size;
+ j = ((void *) j) + blocks * block_bytes(c);
+ }
+ }
+out:
+ ret = 0;
+err:
+ if (data == c->journal.buf[0].data)
+ mutex_unlock(&jlist->cache_set_buffer_lock);
+ else
+ free_pages((unsigned long) data,
+ get_order(c->journal.entry_size_max));
+
+ return ret;
+}
+
+static void bch_journal_read_device(struct closure *cl)
+{
+#define read_bucket(b) \
+ ({ \
+ bool entries_found = false; \
+ int ret = journal_read_bucket(ca, jlist, b, \
+ &seq, &entries_found); \
+ __set_bit(b, bitmap); \
+ if (ret) { \
+ mutex_lock(&jlist->lock); \
+ jlist->ret = ret; \
+ mutex_unlock(&jlist->lock); \
+ closure_return(cl); \
+ } \
+ entries_found; \
+ })
+
+ struct journal_device *ja =
+ container_of(cl, struct journal_device, read);
+ struct cache *ca = container_of(ja, struct cache, journal);
+ struct journal_list *jlist =
+ container_of(cl->parent, struct journal_list, cl);
+ struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+
+ unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
+ DECLARE_BITMAP(bitmap, nr_buckets);
+ unsigned i, l, r;
+ u64 seq = 0;
+
+ if (!nr_buckets)
+ closure_return(cl);
+
+ bitmap_zero(bitmap, nr_buckets);
+ pr_debug("%u journal buckets", nr_buckets);
+
+ /*
+ * If the device supports discard but not secure discard, the live
+ * journal entries might not form a contiguous range - so we can't use
+ * the fibonacci hash/binary search below, and for now we just read
+ * every bucket linearly and skip straight to search_done:
+ */
+ for (i = 0; i < nr_buckets; i++)
+ read_bucket(i);
+ goto search_done;
+
+ if (!blk_queue_nonrot(q))
+ goto linear_scan;
+
+ /*
+ * Read journal buckets ordered by golden ratio hash to quickly
+ * find a sequence of buckets with valid journal entries
+ */
+ for (i = 0; i < nr_buckets; i++) {
+ l = (i * 2654435769U) % nr_buckets;
+
+ if (test_bit(l, bitmap))
+ break;
+
+ if (read_bucket(l))
+ goto bsearch;
+ }
+
+ /*
+ * If that fails, check all the buckets we haven't checked
+ * already
+ */
+ pr_debug("falling back to linear search");
+linear_scan:
+ for (l = find_first_zero_bit(bitmap, nr_buckets);
+ l < nr_buckets;
+ l = find_next_zero_bit(bitmap, nr_buckets, l + 1))
+ if (read_bucket(l))
+ goto bsearch;
+
+ /* no journal entries on this device? */
+ if (l == nr_buckets)
+ closure_return(cl);
+bsearch:
+ /* Binary search */
+ r = find_next_bit(bitmap, nr_buckets, l + 1);
+ pr_debug("starting binary search, l %u r %u", l, r);
+
+ while (l + 1 < r) {
+ unsigned m = (l + r) >> 1;
+ u64 cur_seq = seq;
+
+ read_bucket(m);
+
+ if (cur_seq != seq)
+ l = m;
+ else
+ r = m;
+ }
+
+search_done:
+ /*
+ * Find the journal bucket with the highest sequence number:
+ *
+ * If there are duplicate journal entries in multiple buckets (which
+ * definitely isn't supposed to happen, but...) - make sure to start
+ * cur_idx at the last of those buckets, so we don't deadlock trying to
+ * allocate
+ */
+ seq = 0;
+
+ for (i = 0; i < nr_buckets; i++)
+ if (ja->bucket_seq[i] >= seq &&
+ ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) {
+ /*
+ * When journal_next_bucket() goes to allocate for
+ * the first time, it'll use the bucket after
+ * ja->cur_idx
+ */
+ ja->cur_idx = i;
+ seq = ja->bucket_seq[i];
+ }
+
+ /*
+ * Set last_idx to indicate the entire journal is full and needs to be
+ * reclaimed - journal reclaim will immediately reclaim whatever isn't
+ * pinned when it first runs:
+ */
+ ja->last_idx = (ja->cur_idx + 1) % nr_buckets;
+
+ /*
+ * Read buckets in reverse order until we stop finding more journal
+ * entries:
+ */
+ for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets;
+ i != ja->cur_idx;
+ i = (i + nr_buckets - 1) % nr_buckets)
+ if (!test_bit(i, bitmap) &&
+ !read_bucket(i))
+ break;
+
+ closure_return(cl);
+#undef read_bucket
+}
+
+void bch_journal_entries_free(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct journal_replay *i =
+ list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+ kvfree(i);
+ }
+}
+
+static int journal_seq_blacklist_read(struct journal *j,
+ struct journal_replay *i,
+ struct journal_entry_pin_list *p)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct jset_entry *entry;
+ struct journal_seq_blacklist *bl;
+ u64 seq;
+
+ for_each_jset_entry_type(entry, &i->j,
+ JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
+ seq = le64_to_cpu(entry->_data[0]);
+
+ bch_verbose(c, "blacklisting existing journal seq %llu", seq);
+
+ bl = bch_journal_seq_blacklisted_new(j, seq);
+ if (!bl)
+ return -ENOMEM;
+
+ journal_pin_add_entry(j, p, &bl->pin,
+ journal_seq_blacklist_flush);
+ bl->written = true;
+ }
+
+ return 0;
+}
+
+int bch_journal_read(struct cache_set *c, struct list_head *list)
+{
+ struct jset_entry *prio_ptrs;
+ struct journal_list jlist;
+ struct journal_replay *i;
+ struct jset *j;
+ struct journal_entry_pin_list *p;
+ struct cache *ca;
+ u64 cur_seq, end_seq;
+ unsigned iter;
+ int ret = 0;
+
+ closure_init_stack(&jlist.cl);
+ mutex_init(&jlist.lock);
+ mutex_init(&jlist.cache_set_buffer_lock);
+ jlist.head = list;
+ jlist.ret = 0;
+
+ for_each_cache(ca, c, iter)
+ closure_call(&ca->journal.read,
+ bch_journal_read_device,
+ system_unbound_wq,
+ &jlist.cl);
+
+ closure_sync(&jlist.cl);
+
+ if (jlist.ret)
+ return jlist.ret;
+
+ if (list_empty(list)) {
+ bch_err(c, "no journal entries found");
+ return BCH_FSCK_REPAIR_IMPOSSIBLE;
+ }
+
+ j = &list_entry(list->prev, struct journal_replay, list)->j;
+
+ unfixable_fsck_err_on(le64_to_cpu(j->seq) -
+ le64_to_cpu(j->last_seq) + 1 >
+ c->journal.pin.size, c,
+ "too many journal entries open for refcount fifo");
+
+ c->journal.pin.back = le64_to_cpu(j->seq) -
+ le64_to_cpu(j->last_seq) + 1;
+
+ atomic64_set(&c->journal.seq, le64_to_cpu(j->seq));
+ c->journal.last_seq_ondisk = le64_to_cpu(j->last_seq);
+
+ BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq));
+
+ i = list_first_entry(list, struct journal_replay, list);
+
+ mutex_lock(&c->journal.blacklist_lock);
+
+ fifo_for_each_entry_ptr(p, &c->journal.pin, iter) {
+ u64 seq = journal_pin_seq(&c->journal, p);
+
+ INIT_LIST_HEAD(&p->list);
+
+ if (i && le64_to_cpu(i->j.seq) == seq) {
+ atomic_set(&p->count, 1);
+
+ if (journal_seq_blacklist_read(&c->journal, i, p)) {
+ mutex_unlock(&c->journal.blacklist_lock);
+ return -ENOMEM;
+ }
+
+ i = list_is_last(&i->list, list)
+ ? NULL
+ : list_next_entry(i, list);
+ } else {
+ atomic_set(&p->count, 0);
+ }
+ }
+
+ mutex_unlock(&c->journal.blacklist_lock);
+
+ cur_seq = last_seq(&c->journal);
+ end_seq = le64_to_cpu(list_last_entry(list,
+ struct journal_replay, list)->j.seq);
+
+ list_for_each_entry(i, list, list) {
+ bool blacklisted;
+
+ mutex_lock(&c->journal.blacklist_lock);
+ while (cur_seq < le64_to_cpu(i->j.seq) &&
+ journal_seq_blacklist_find(&c->journal, cur_seq))
+ cur_seq++;
+
+ blacklisted = journal_seq_blacklist_find(&c->journal,
+ le64_to_cpu(i->j.seq));
+ mutex_unlock(&c->journal.blacklist_lock);
+
+ fsck_err_on(blacklisted, c,
+ "found blacklisted journal entry %llu",
+ le64_to_cpu(i->j.seq));
+
+ fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ cur_seq, le64_to_cpu(i->j.seq) - 1,
+ last_seq(&c->journal), end_seq);
+
+ cur_seq = le64_to_cpu(i->j.seq) + 1;
+ }
+
+ prio_ptrs = bch_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0);
+ if (prio_ptrs) {
+ memcpy_u64s(c->journal.prio_buckets,
+ prio_ptrs->_data,
+ le16_to_cpu(prio_ptrs->u64s));
+ c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
+ }
+fsck_err:
+ return ret;
+}
+
+void bch_journal_mark(struct cache_set *c, struct list_head *list)
+{
+ struct bkey_i *k, *n;
+ struct jset_entry *j;
+ struct journal_replay *r;
+
+ list_for_each_entry(r, list, list)
+ for_each_jset_key(k, n, j, &r->j) {
+ enum bkey_type type = bkey_type(j->level, j->btree_id);
+ struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
+
+ if (btree_type_has_ptrs(type))
+ __bch_btree_mark_key(c, type, k_s_c);
+ }
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+void bch_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+
+ if (!need_write_just_set &&
+ test_bit(JOURNAL_NEED_WRITE, &j->flags))
+ __bch_time_stats_update(j->delay_time,
+ j->need_write_time);
+#if 0
+ closure_call(&j->io, journal_write, NULL, &c->cl);
+#else
+ /* Shut sparse up: */
+ closure_init(&j->io, &c->cl);
+ set_closure_fn(&j->io, journal_write, NULL);
+ journal_write(&j->io);
+#endif
+}
+
+static void __bch_journal_next_entry(struct journal *j)
+{
+ struct journal_entry_pin_list pin_list, *p;
+ struct journal_buf *buf;
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ BUG_ON(!fifo_push(&j->pin, pin_list));
+ p = &fifo_peek_back(&j->pin);
+
+ INIT_LIST_HEAD(&p->list);
+ atomic_set(&p->count, 1);
+
+ if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) {
+ smp_wmb();
+ j->cur_pin_list = p;
+ }
+
+ buf = journal_cur_buf(j);
+ memset(buf->has_inode, 0, sizeof(buf->has_inode));
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(atomic64_read(&j->seq));
+ buf->data->u64s = 0;
+
+ BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq));
+}
+
+static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
+{
+ unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
+
+ if (buf->nr_prio_buckets)
+ ret += JSET_KEYS_U64s + buf->nr_prio_buckets;
+
+ return ret;
+}
+
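+/*
+ * Close the currently open journal entry (if any) and switch to the other
+ * buffer, kicking off the write of the entry just closed. Returns
+ * JOURNAL_UNLOCKED if it did so (dropping j->lock), or a status indicating
+ * why it couldn't (entry already closed, previous buffer still in flight,
+ * or journal in an error state):
+ */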
+static enum {
+ JOURNAL_ENTRY_ERROR,
+ JOURNAL_ENTRY_INUSE,
+ JOURNAL_ENTRY_CLOSED,
+ JOURNAL_UNLOCKED,
+} journal_buf_switch(struct journal *j, bool need_write_just_set)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct journal_buf *buf;
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
+ return JOURNAL_ENTRY_CLOSED;
+
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return JOURNAL_ENTRY_ERROR;
+
+ if (new.prev_buf_unwritten)
+ return JOURNAL_ENTRY_INUSE;
+
+ /*
+ * avoid race between setting buf->data->u64s and
+ * journal_res_put starting write:
+ */
+ journal_state_inc(&new);
+
+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
+ new.idx++;
+ new.prev_buf_unwritten = 1;
+
+ BUG_ON(journal_state_count(new, new.idx));
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ journal_reclaim_fast(j);
+
+ clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ buf = &j->buf[old.idx];
+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ buf->data->last_seq = cpu_to_le64(last_seq(j));
+
+ j->prev_buf_sectors =
+ __set_blocks(buf->data,
+ le32_to_cpu(buf->data->u64s) +
+ journal_entry_u64s_reserve(buf),
+ block_bytes(c)) * c->sb.block_size;
+
+ BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+
+ atomic_dec_bug(&fifo_peek_back(&j->pin).count);
+ __bch_journal_next_entry(j);
+
+ cancel_delayed_work(&j->write_work);
+ spin_unlock(&j->lock);
+
+ if (c->bucket_journal_seq > 1 << 14) {
+ c->bucket_journal_seq = 0;
+ bch_bucket_seq_cleanup(c);
+ }
+
+ /* ugh - might be called from __journal_res_get() under wait_event() */
+ __set_current_state(TASK_RUNNING);
+ bch_journal_buf_put(j, old.idx, need_write_just_set);
+
+ return JOURNAL_UNLOCKED;
+}
+
+void bch_journal_halt(struct journal *j)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return;
+
+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ wake_up(&j->wait);
+ closure_wake_up(&journal_cur_buf(j)->wait);
+ closure_wake_up(&journal_prev_buf(j)->wait);
+}
+
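+/*
+ * Number of journal buckets on @ca still available for new journal writes,
+ * with a couple held in reserve so that journal replay and reclaim can
+ * always make forward progress:
+ */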
+static unsigned journal_dev_buckets_available(struct journal *j,
+ struct cache *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb);
+ unsigned next = (ja->cur_idx + 1) % nr;
+ unsigned available = (ja->last_idx + nr - next) % nr;
+
+ /*
+ * Hack to avoid a deadlock during journal replay:
+ * journal replay might require setting a new btree
+ * root, which requires writing another journal entry -
+ * thus, if the journal is full (and this happens when
+ * replaying the first journal bucket's entries) we're
+ * screwed.
+ *
+ * So don't let the journal fill up unless we're in
+ * replay:
+ */
+ if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
+ available = max((int) available - 2, 0);
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (ja->bucket_seq[ja->last_idx] >= last_seq(j))
+ available = max((int) available - 1, 0);
+
+ return available;
+}
+
+/* returns number of sectors available for next journal entry: */
+static int journal_entry_sectors(struct journal *j)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct cache *ca;
+ struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+ unsigned sectors_available = j->entry_size_max >> 9;
+ unsigned i, nr_online = 0, nr_devs = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ group_for_each_cache_rcu(ca, &j->devs, i) {
+ unsigned buckets_required = 0;
+
+ sectors_available = min_t(unsigned, sectors_available,
+ ca->mi.bucket_size);
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, if we haven't started the write
+ * for the previous entry we have to make sure we have space for
+ * it too:
+ */
+ if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+ if (j->prev_buf_sectors > ca->journal.sectors_free)
+ buckets_required++;
+
+ if (j->prev_buf_sectors + sectors_available >
+ ca->journal.sectors_free)
+ buckets_required++;
+ } else {
+ if (j->prev_buf_sectors + sectors_available >
+ ca->mi.bucket_size)
+ buckets_required++;
+
+ buckets_required++;
+ }
+
+ if (journal_dev_buckets_available(j, ca) >= buckets_required)
+ nr_devs++;
+ nr_online++;
+ }
+ rcu_read_unlock();
+
+ if (nr_online < c->opts.metadata_replicas)
+ return -EROFS;
+
+ if (nr_devs < c->opts.metadata_replicas)
+ return 0;
+
+ return sectors_available;
+}
+
+/*
+ * should _only_ be called from journal_res_get() - when we actually want a
+ * journal reservation - an open journal entry means the journal is dirty:
+ */
+static int journal_entry_open(struct journal *j)
+{
+ struct journal_buf *buf = journal_cur_buf(j);
+ ssize_t u64s;
+ int ret = 0, sectors;
+
+ lockdep_assert_held(&j->lock);
+ BUG_ON(journal_entry_is_open(j));
+
+ if (!fifo_free(&j->pin))
+ return 0;
+
+ sectors = journal_entry_sectors(j);
+ if (sectors <= 0)
+ return sectors;
+
+ j->cur_buf_sectors = sectors;
+ buf->nr_prio_buckets = j->nr_prio_buckets;
+
+ u64s = (sectors << 9) / sizeof(u64);
+
+ /* Subtract the journal header */
+ u64s -= sizeof(struct jset) / sizeof(u64);
+ /*
+ * Btree roots, prio pointers don't get added until right before we do
+ * the write:
+ */
+ u64s -= journal_entry_u64s_reserve(buf);
+ u64s = max_t(ssize_t, 0L, u64s);
+
+ BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+
+ if (u64s > le32_to_cpu(buf->data->u64s)) {
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ /*
+ * Must be set before marking the journal entry as open:
+ */
+ j->cur_entry_u64s = u64s;
+
+ do {
+ old.v = new.v = v;
+
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return false;
+
+ /* Handle any already added entries */
+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+ ret = 1;
+
+ wake_up(&j->wait);
+
+ if (j->res_get_blocked_start) {
+ __bch_time_stats_update(j->blocked_time,
+ j->res_get_blocked_start);
+ j->res_get_blocked_start = 0;
+ }
+
+ mod_delayed_work(system_freezable_wq,
+ &j->write_work,
+ msecs_to_jiffies(j->write_delay_ms));
+ }
+
+ return ret;
+}
+
+void bch_journal_start(struct cache_set *c)
+{
+ struct journal *j = &c->journal;
+ struct journal_seq_blacklist *bl;
+ struct cache *ca;
+ u64 new_seq = 0;
+ unsigned i;
+
+ for_each_cache(ca, c, i)
+ if (is_journal_device(ca))
+ bch_cache_group_add_cache(&c->journal.devs, ca);
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ new_seq = max(new_seq, bl->seq);
+
+ spin_lock(&j->lock);
+
+ set_bit(JOURNAL_STARTED, &j->flags);
+
+ while (atomic64_read(&j->seq) < new_seq) {
+ struct journal_entry_pin_list pin_list, *p;
+
+ BUG_ON(!fifo_push(&j->pin, pin_list));
+ p = &fifo_peek_back(&j->pin);
+
+ INIT_LIST_HEAD(&p->list);
+ atomic_set(&p->count, 0);
+ atomic64_inc(&j->seq);
+ }
+
+ /*
+ * journal_buf_switch() only inits the next journal entry when it
+ * closes an open journal entry - the very first journal entry gets
+ * initialized here:
+ */
+ __bch_journal_next_entry(j);
+
+ /*
+ * Adding entries to the next journal entry before allocating space on
+ * disk for the next journal entry - this is ok, because these entries
+ * only have to go down with the next journal entry we write:
+ */
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (!bl->written) {
+ bch_journal_add_entry(journal_cur_buf(j), &bl->seq, 1,
+ JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
+ 0, 0);
+
+ journal_pin_add_entry(j,
+ &fifo_peek_back(&j->pin),
+ &bl->pin,
+ journal_seq_blacklist_flush);
+ bl->written = true;
+ }
+
+ spin_unlock(&j->lock);
+
+ queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+}
+
+int bch_journal_replay(struct cache_set *c, struct list_head *list)
+{
+ int ret = 0, keys = 0, entries = 0;
+ struct journal *j = &c->journal;
+ struct bkey_i *k, *_n;
+ struct jset_entry *entry;
+ struct journal_replay *i, *n;
+
+ list_for_each_entry_safe(i, n, list, list) {
+ j->cur_pin_list =
+ &j->pin.data[((j->pin.back - 1 -
+ (atomic64_read(&j->seq) -
+ le64_to_cpu(i->j.seq))) &
+ j->pin.mask)];
+
+ for_each_jset_key(k, _n, entry, &i->j) {
+ struct disk_reservation disk_res;
+
+ /*
+ * We might cause compressed extents to be split, so we
+ * need to pass in a disk_reservation:
+ */
+ BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0));
+
+ trace_bcache_journal_replay_key(&k->k);
+
+ ret = bch_btree_insert(c, entry->btree_id, k,
+ &disk_res, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_REPLAY);
+ bch_disk_reservation_put(c, &disk_res);
+
+ if (ret)
+ goto err;
+
+ cond_resched();
+ keys++;
+ }
+
+ if (atomic_dec_and_test(&j->cur_pin_list->count))
+ wake_up(&j->wait);
+
+ entries++;
+ }
+
+ bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
+ keys, entries, (u64) atomic64_read(&j->seq));
+
+ fsck_err_on(c->sb.clean && keys, c,
+ "filesystem marked clean, but journal had keys to replay");
+
+ bch_journal_set_replay_done(&c->journal);
+err:
+ if (ret)
+ bch_err(c, "journal replay error: %d", ret);
+fsck_err:
+ bch_journal_entries_free(list);
+
+ return ret;
+}
+
+static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
+{
+ unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr;
+ u64 *p;
+ int ret;
+
+ ret = bch_super_realloc(&ca->disk_sb, u64s);
+ if (ret)
+ return ret;
+
+ p = krealloc(ca->journal.bucket_seq,
+ nr * sizeof(u64),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!p)
+ return -ENOMEM;
+
+ ca->journal.bucket_seq = p;
+ ca->disk_sb.sb->u64s = cpu_to_le16(u64s);
+
+ return 0;
+}
+
+int bch_cache_journal_alloc(struct cache *ca)
+{
+ int ret;
+ unsigned i;
+
+ if (ca->mi.tier != 0)
+ return 0;
+
+ if (dynamic_fault("bcache:add:journal_alloc"))
+ return -ENOMEM;
+
+ /*
+ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+ * is smaller:
+ */
+ ret = bch_set_nr_journal_buckets(ca,
+ clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ BCH_JOURNAL_BUCKETS_MIN,
+ min(1 << 10,
+ (1 << 20) / ca->mi.bucket_size)));
+ if (ret)
+ return ret;
+
+ for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) {
+ unsigned long r = ca->mi.first_bucket + i;
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[r], true);
+ set_journal_bucket(ca->disk_sb.sb, i, r);
+ }
+
+ return 0;
+}
+
+/* Journalling */
+
+/**
+ * journal_reclaim_fast - do the fast part of journal reclaim
+ *
+ * Called from IO submission context, does not block. Cleans up after btree
+ * write completions by advancing the journal pin FIFO past entries whose
+ * refcounts have hit zero; discards, last_idx updates and flushing are left
+ * to journal_reclaim_work().
+ */
+static void journal_reclaim_fast(struct journal *j)
+{
+ struct journal_entry_pin_list temp;
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!fifo_pop(&j->pin, temp));
+ popped = true;
+ }
+
+ if (popped)
+ wake_up(&j->wait);
+}
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, marking it as dirty:
+ */
+
+static inline void __journal_pin_add(struct journal *j,
+ struct journal_entry_pin_list *pin_list,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ BUG_ON(journal_pin_active(pin));
+
+ atomic_inc(&pin_list->count);
+ pin->pin_list = pin_list;
+ pin->flush = flush_fn;
+
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list);
+ else
+ INIT_LIST_HEAD(&pin->list);
+}
+
+static void journal_pin_add_entry(struct journal *j,
+ struct journal_entry_pin_list *pin_list,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock_irq(&j->pin_lock);
+ __journal_pin_add(j, pin_list, pin, flush_fn);
+ spin_unlock_irq(&j->pin_lock);
+}
+
+void bch_journal_pin_add(struct journal *j,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock_irq(&j->pin_lock);
+ __journal_pin_add(j, j->cur_pin_list, pin, flush_fn);
+ spin_unlock_irq(&j->pin_lock);
+}
+
+static inline bool __journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct journal_entry_pin_list *pin_list = pin->pin_list;
+
+ pin->pin_list = NULL;
+
+ /* journal_reclaim_work() might have already taken us off the list */
+ if (!list_empty_careful(&pin->list))
+ list_del_init(&pin->list);
+
+ return atomic_dec_and_test(&pin_list->count);
+}
+
+void bch_journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ unsigned long flags;
+ bool wakeup;
+
+ if (!journal_pin_active(pin))
+ return;
+
+ spin_lock_irqsave(&j->pin_lock, flags);
+ wakeup = __journal_pin_drop(j, pin);
+ spin_unlock_irqrestore(&j->pin_lock, flags);
+
+ /*
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
+ * writing a new last_seq will now make another bucket available:
+ *
+ * Nested irqsave is expensive, don't do the wakeup with lock held:
+ */
+ if (wakeup)
+ wake_up(&j->wait);
+}
+
+void bch_journal_pin_add_if_older(struct journal *j,
+ struct journal_entry_pin *src_pin,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock_irq(&j->pin_lock);
+
+ if (journal_pin_active(src_pin) &&
+ (!journal_pin_active(pin) ||
+ fifo_entry_idx(&j->pin, src_pin->pin_list) <
+ fifo_entry_idx(&j->pin, pin->pin_list))) {
+ if (journal_pin_active(pin))
+ __journal_pin_drop(j, pin);
+ __journal_pin_add(j, src_pin->pin_list,
+ pin, NULL);
+ }
+
+ spin_unlock_irq(&j->pin_lock);
+}
+
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j, u64 seq_to_flush)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *ret = NULL;
+ unsigned iter;
+
+ /* so we don't iterate over empty fifo entries below: */
+ if (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+ spin_lock(&j->lock);
+ journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+ }
+
+ spin_lock_irq(&j->pin_lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
+ if (journal_pin_seq(j, pin_list) > seq_to_flush)
+ break;
+
+ ret = list_first_entry_or_null(&pin_list->list,
+ struct journal_entry_pin, list);
+ if (ret) {
+ /* must be list_del_init(), see bch_journal_pin_drop() */
+ list_del_init(&ret->list);
+ break;
+ }
+ }
+ spin_unlock_irq(&j->pin_lock);
+
+ return ret;
+}
+
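+/*
+ * A journal bucket can be discarded (and last_idx advanced) once every
+ * entry it contains is older than the last_seq that has been written out
+ * to disk:
+ */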
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = (ja->last_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/**
+ * journal_reclaim_work - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+static void journal_reclaim_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(to_delayed_work(work),
+ struct cache_set, journal.reclaim_work);
+ struct journal *j = &c->journal;
+ struct cache *ca;
+ struct journal_entry_pin *pin;
+ u64 seq_to_flush = 0;
+ unsigned iter, nr, bucket_to_flush;
+ unsigned long next_flush;
+ bool reclaim_lock_held = false, need_flush;
+
+ /*
+ * Advance last_idx to point to the oldest journal entry containing
+ * btree node updates that have not yet been written out
+ */
+ group_for_each_cache(ca, &j->devs, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ while (should_discard_bucket(j, ja)) {
+ if (!reclaim_lock_held) {
+ /*
+ * ugh:
+ * might be called from __journal_res_get()
+ * under wait_event() - have to go back to
+ * TASK_RUNNING before doing something that
+ * would block, but only if we're doing work:
+ */
+ __set_current_state(TASK_RUNNING);
+
+ mutex_lock(&j->reclaim_lock);
+ reclaim_lock_held = true;
+ /* recheck under reclaim_lock: */
+ continue;
+ }
+
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ journal_bucket(ca->disk_sb.sb,
+ ja->last_idx)),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ spin_lock(&j->lock);
+ ja->last_idx = (ja->last_idx + 1) %
+ bch_nr_journal_buckets(ca->disk_sb.sb);
+ spin_unlock(&j->lock);
+
+ wake_up(&j->wait);
+ }
+
+ /*
+ * Write out enough btree nodes to free up 50% journal
+ * buckets
+ */
+ spin_lock(&j->lock);
+	nr = bch_nr_journal_buckets(ca->disk_sb.sb);
+ bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr;
+ seq_to_flush = max_t(u64, seq_to_flush,
+ ja->bucket_seq[bucket_to_flush]);
+ spin_unlock(&j->lock);
+ }
+
+ if (reclaim_lock_held)
+ mutex_unlock(&j->reclaim_lock);
+
+ /* Also flush if the pin fifo is more than half full */
+ seq_to_flush = max_t(s64, seq_to_flush,
+ (s64) atomic64_read(&j->seq) -
+ (j->pin.size >> 1));
+
+ /*
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
+ * make sure to flush at least one journal pin:
+ */
+ next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+ need_flush = time_after(jiffies, next_flush);
+
+ while ((pin = journal_get_next_pin(j, need_flush
+ ? U64_MAX
+ : seq_to_flush))) {
+ __set_current_state(TASK_RUNNING);
+ pin->flush(j, pin);
+ need_flush = false;
+
+ j->last_flushed = jiffies;
+ }
+
+ if (!test_bit(CACHE_SET_RO, &c->flags))
+ queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+ msecs_to_jiffies(j->reclaim_delay_ms));
+}
+
+/**
+ * journal_write_alloc - decide where to write the next journal entry, moving
+ * on to the next journal bucket if possible
+ */
+static int journal_write_alloc(struct journal *j, unsigned sectors)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+ struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ unsigned iter, replicas, replicas_want =
+ READ_ONCE(c->opts.metadata_replicas);
+
+ spin_lock(&j->lock);
+ rcu_read_lock();
+
+ /*
+ * Drop any pointers to devices that have been removed, are no longer
+ * empty, or filled up their current journal bucket:
+ *
+	 * Note that a device may have had a small amount of free space (perhaps
+	 * one sector) that wasn't enough for the smallest possible journal
+	 * entry - that's why we drop pointers to devices whose remaining space
+	 * is <= the number of sectors we need, i.e. whichever device was
+	 * limiting the current journal entry size.
+ */
+ extent_for_each_ptr_backwards(e, ptr)
+ if (!(ca = PTR_CACHE(c, ptr)) ||
+ ca->mi.state != CACHE_ACTIVE ||
+ ca->journal.sectors_free <= sectors)
+ __bch_extent_drop_ptr(e, ptr);
+ else
+ ca->journal.sectors_free -= sectors;
+
+ replicas = bch_extent_nr_ptrs(e.c);
+
+ /*
+ * Determine location of the next journal write:
+ * XXX: sort caches by free journal space
+ */
+ group_for_each_cache_rcu(ca, &j->devs, iter) {
+ struct journal_device *ja = &ca->journal;
+ unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
+
+ if (replicas >= replicas_want)
+ break;
+
+ /*
+ * Check that we can use this device, and aren't already using
+ * it:
+ */
+ if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) ||
+ !journal_dev_buckets_available(j, ca) ||
+ sectors > ca->mi.bucket_size)
+ continue;
+
+ ja->sectors_free = ca->mi.bucket_size - sectors;
+ ja->cur_idx = (ja->cur_idx + 1) % nr_buckets;
+ ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
+
+ extent_ptr_append(bkey_i_to_extent(&j->key),
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ journal_bucket(ca->disk_sb.sb,
+ ja->cur_idx)),
+ .dev = ca->sb.nr_this_dev,
+ });
+ replicas++;
+
+ trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
+ }
+
+ rcu_read_unlock();
+
+ j->prev_buf_sectors = 0;
+ spin_unlock(&j->lock);
+
+ if (replicas < replicas_want)
+ return -EROFS;
+
+ return 0;
+}
+
+static void journal_write_compact(struct jset *jset)
+{
+ struct jset_entry *i, *next, *prev = NULL;
+
+ /*
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ for (i = jset->start;
+ i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) &&
+ (next = jset_keys_next(i), true);
+ i = next) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /* Empty entry: */
+ if (!u64s)
+ continue;
+
+ /* Can we merge with previous entry? */
+ if (prev &&
+ i->btree_id == prev->btree_id &&
+ i->level == prev->level &&
+ JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
+ JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+ memmove_u64s_down(jset_keys_next(prev),
+ i->_data,
+ u64s);
+ le16_add_cpu(&prev->u64s, u64s);
+ continue;
+ }
+
+ /* Couldn't merge, move i into new position (after prev): */
+ prev = prev ? jset_keys_next(prev) : jset->start;
+ if (i != prev)
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
+ }
+
+ prev = prev ? jset_keys_next(prev) : jset->start;
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
+static void journal_write_endio(struct bio *bio)
+{
+ struct cache *ca = bio->bi_private;
+ struct journal *j = &ca->set->journal;
+
+ if (cache_fatal_io_err_on(bio->bi_error, ca, "journal write") ||
+ bch_meta_write_fault("journal"))
+ bch_journal_halt(j);
+
+ closure_put(&j->io);
+ percpu_ref_put(&ca->ref);
+}
+
+static void journal_write_done(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct journal_buf *w = journal_prev_buf(j);
+
+ j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
+
+ __bch_time_stats_update(j->write_time, j->write_start_time);
+
+ BUG_ON(!j->reservations.prev_buf_unwritten);
+ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
+ &j->reservations.counter);
+
+ /*
+ * XXX: this is racy, we could technically end up doing the wake up
+ * after the journal_buf struct has been reused for the next write
+ * (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that
+ * are waiting on the _next_ write, not this one.
+ *
+ * The wake up can't come before, because journal_flush_seq_async() is
+ * looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal
+ * write that was already in flight.
+ *
+ * The right fix is to use a lock here, but using j.lock here means it
+ * has to be a spin_lock_irqsave() lock which then requires propagating
+ * the irq()ness to other locks and it's all kinds of nastiness.
+ */
+
+ closure_wake_up(&w->wait);
+ wake_up(&j->wait);
+
+ /*
+ * Updating last_seq_ondisk may let journal_reclaim_work() discard more
+ * buckets:
+ */
+ mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+}
+
+static void journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ struct cache *ca;
+ struct journal_buf *w = journal_prev_buf(j);
+ struct bio *bio;
+ struct bch_extent_ptr *ptr;
+ unsigned i, sectors, bytes;
+
+ j->write_start_time = local_clock();
+
+ bch_journal_add_prios(j, w);
+
+ mutex_lock(&c->btree_root_lock);
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = &c->btree_roots[i];
+
+ if (r->alive)
+ bch_journal_add_btree_root(w, i, &r->key, r->level);
+ }
+ mutex_unlock(&c->btree_root_lock);
+
+ journal_write_compact(w->data);
+
+ w->data->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
+ w->data->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
+ w->data->magic = cpu_to_le64(jset_magic(&c->disk_sb));
+ w->data->version = cpu_to_le32(BCACHE_JSET_VERSION);
+
+ SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
+ SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
+ w->data->csum = cpu_to_le64(__csum_set(w->data,
+ le32_to_cpu(w->data->u64s),
+ JSET_CSUM_TYPE(w->data)));
+
+ sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s),
+ block_bytes(c)) * c->sb.block_size;
+ BUG_ON(sectors > j->prev_buf_sectors);
+
+ bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s));
+ memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+
+ if (journal_write_alloc(j, sectors)) {
+ bch_journal_halt(j);
+ bch_err(c, "Unable to allocate journal write");
+ bch_fatal_error(c);
+ closure_return_with_destructor(cl, journal_write_done);
+ }
+
+ bch_check_mark_super(c, &j->key, true);
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
+ rcu_read_lock();
+ ca = PTR_CACHE(c, ptr);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ atomic64_add(sectors, &ca->meta_sectors_written);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_iter.bi_size = sectors << 9;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio_set_op_attrs(bio, REQ_OP_WRITE,
+ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+ bch_bio_map(bio, w->data);
+
+ trace_bcache_journal_write(bio);
+ closure_bio_submit_punt(bio, cl, c);
+
+ ptr->offset += sectors;
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+ }
+
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ journal_flushes_device(ca) &&
+ !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
+ percpu_ref_get(&ca->ref);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+ closure_bio_submit_punt(bio, cl, c);
+ }
+
+ closure_return_with_destructor(cl, journal_write_done);
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+ struct journal *j = container_of(to_delayed_work(work),
+ struct journal, write_work);
+ spin_lock(&j->lock);
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED)
+ spin_unlock(&j->lock);
+}
+
+/*
+ * Given an inode number, if that inode number has data in the journal that
+ * hasn't yet been flushed, return the journal sequence number that needs to be
+ * flushed:
+ */
+u64 bch_inode_journal_seq(struct journal *j, u64 inode)
+{
+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
+ u64 seq = 0;
+
+ if (!test_bit(h, j->buf[0].has_inode) &&
+ !test_bit(h, j->buf[1].has_inode))
+ return 0;
+
+ spin_lock(&j->lock);
+ if (test_bit(h, journal_cur_buf(j)->has_inode))
+ seq = atomic64_read(&j->seq);
+ else if (test_bit(h, journal_prev_buf(j)->has_inode))
+ seq = atomic64_read(&j->seq) - 1;
+ spin_unlock(&j->lock);
+
+ return seq;
+}
+
+static int __journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned u64s_min, unsigned u64s_max)
+{
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+ int ret;
+retry:
+ ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
+ if (ret)
+ return ret;
+
+ spin_lock(&j->lock);
+ /*
+ * Recheck after taking the lock, so we don't race with another thread
+ * that just did journal_entry_open() and call journal_entry_close()
+ * unnecessarily
+ */
+ ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
+ if (ret) {
+ spin_unlock(&j->lock);
+ return 1;
+ }
+
+ /*
+ * Ok, no more room in the current journal entry - try to start a new
+ * one:
+ */
+ switch (journal_buf_switch(j, false)) {
+ case JOURNAL_ENTRY_ERROR:
+ spin_unlock(&j->lock);
+ return -EIO;
+ case JOURNAL_ENTRY_INUSE:
+ /* haven't finished writing out the previous one: */
+ spin_unlock(&j->lock);
+ trace_bcache_journal_entry_full(c);
+ goto blocked;
+ case JOURNAL_ENTRY_CLOSED:
+ break;
+ case JOURNAL_UNLOCKED:
+ goto retry;
+ }
+
+ /* We now have a new, closed journal buf - see if we can open it: */
+ ret = journal_entry_open(j);
+ spin_unlock(&j->lock);
+
+ if (ret < 0)
+ return ret;
+ if (ret)
+ goto retry;
+
+ /* Journal's full, we have to wait */
+
+ /*
+ * Direct reclaim - can't rely on reclaim from work item
+ * due to freezing..
+ */
+ journal_reclaim_work(&j->reclaim_work.work);
+
+ trace_bcache_journal_full(c);
+blocked:
+ if (!j->res_get_blocked_start)
+ j->res_get_blocked_start = local_clock() ?: 1;
+ return 0;
+}
+
+/*
+ * Essentially the entry function to the journaling code. When bcache is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * Journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the
+ * next write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ */
+int bch_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+ unsigned u64s_min, unsigned u64s_max)
+{
+ int ret;
+
+ wait_event(j->wait,
+ (ret = __journal_res_get(j, res, u64s_min,
+ u64s_max)));
+ return ret < 0 ? ret : 0;
+}
+
+void bch_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
+{
+ spin_lock(&j->lock);
+
+ BUG_ON(seq > atomic64_read(&j->seq));
+
+ if (bch_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return;
+ }
+
+ if (seq == atomic64_read(&j->seq)) {
+ if (!closure_wait(&journal_cur_buf(j)->wait, parent))
+ BUG();
+ } else if (seq + 1 == atomic64_read(&j->seq) &&
+ j->reservations.prev_buf_unwritten) {
+ if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+ BUG();
+
+ smp_mb();
+
+ /* check if raced with write completion (or failure) */
+ if (!j->reservations.prev_buf_unwritten ||
+ bch_journal_error(j))
+ closure_wake_up(&journal_prev_buf(j)->wait);
+ }
+
+ spin_unlock(&j->lock);
+}
+
+void bch_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
+{
+ spin_lock(&j->lock);
+
+ BUG_ON(seq > atomic64_read(&j->seq));
+
+ if (bch_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return;
+ }
+
+ if (seq == atomic64_read(&j->seq)) {
+ bool set_need_write = false;
+
+ if (parent &&
+ !closure_wait(&journal_cur_buf(j)->wait, parent))
+ BUG();
+
+ if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+ j->need_write_time = local_clock();
+ set_need_write = true;
+ }
+
+ switch (journal_buf_switch(j, set_need_write)) {
+ case JOURNAL_ENTRY_ERROR:
+ if (parent)
+ closure_wake_up(&journal_cur_buf(j)->wait);
+ break;
+ case JOURNAL_ENTRY_CLOSED:
+ /*
+ * Journal entry hasn't been opened yet, but caller
+ * claims it has something (seq == j->seq):
+ */
+ BUG();
+ case JOURNAL_ENTRY_INUSE:
+ break;
+ case JOURNAL_UNLOCKED:
+ return;
+ }
+ } else if (parent &&
+ seq + 1 == atomic64_read(&j->seq) &&
+ j->reservations.prev_buf_unwritten) {
+ if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+ BUG();
+
+ smp_mb();
+
+ /* check if raced with write completion (or failure) */
+ if (!j->reservations.prev_buf_unwritten ||
+ bch_journal_error(j))
+ closure_wake_up(&journal_prev_buf(j)->wait);
+ }
+
+ spin_unlock(&j->lock);
+}
+
+int bch_journal_flush_seq(struct journal *j, u64 seq)
+{
+ struct closure cl;
+ u64 start_time = local_clock();
+
+ closure_init_stack(&cl);
+ bch_journal_flush_seq_async(j, seq, &cl);
+ closure_sync(&cl);
+
+ bch_time_stats_update(j->flush_seq_time, start_time);
+
+ return bch_journal_error(j);
+}
+
+void bch_journal_meta_async(struct journal *j, struct closure *parent)
+{
+ struct journal_res res;
+ unsigned u64s = jset_u64s(0);
+
+ memset(&res, 0, sizeof(res));
+
+ bch_journal_res_get(j, &res, u64s, u64s);
+ bch_journal_res_put(j, &res);
+
+ bch_journal_flush_seq_async(j, res.seq, parent);
+}
+
+int bch_journal_meta(struct journal *j)
+{
+ struct journal_res res;
+ unsigned u64s = jset_u64s(0);
+ int ret;
+
+ memset(&res, 0, sizeof(res));
+
+ ret = bch_journal_res_get(j, &res, u64s, u64s);
+ if (ret)
+ return ret;
+
+ bch_journal_res_put(j, &res);
+
+ return bch_journal_flush_seq(j, res.seq);
+}
+
+void bch_journal_flush_async(struct journal *j, struct closure *parent)
+{
+ u64 seq, journal_seq;
+
+ spin_lock(&j->lock);
+ journal_seq = atomic64_read(&j->seq);
+
+ if (journal_entry_is_open(j)) {
+ seq = journal_seq;
+ } else if (journal_seq) {
+ seq = journal_seq - 1;
+ } else {
+ spin_unlock(&j->lock);
+ return;
+ }
+ spin_unlock(&j->lock);
+
+ bch_journal_flush_seq_async(j, seq, parent);
+}
+
+int bch_journal_flush(struct journal *j)
+{
+ u64 seq, journal_seq;
+
+ spin_lock(&j->lock);
+ journal_seq = atomic64_read(&j->seq);
+
+ if (journal_entry_is_open(j)) {
+ seq = journal_seq;
+ } else if (journal_seq) {
+ seq = journal_seq - 1;
+ } else {
+ spin_unlock(&j->lock);
+ return 0;
+ }
+ spin_unlock(&j->lock);
+
+ return bch_journal_flush_seq(j, seq);
+}
+
+void bch_journal_free(struct journal *j)
+{
+ unsigned order = get_order(j->entry_size_max);
+
+ free_pages((unsigned long) j->buf[1].data, order);
+ free_pages((unsigned long) j->buf[0].data, order);
+ free_fifo(&j->pin);
+}
+
+int bch_journal_alloc(struct journal *j, unsigned entry_size_max)
+{
+ static struct lock_class_key res_key;
+ unsigned order = get_order(entry_size_max);
+
+ spin_lock_init(&j->lock);
+ spin_lock_init(&j->pin_lock);
+ init_waitqueue_head(&j->wait);
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+ mutex_init(&j->blacklist_lock);
+ INIT_LIST_HEAD(&j->seq_blacklist);
+ spin_lock_init(&j->devs.lock);
+ mutex_init(&j->reclaim_lock);
+
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+ j->entry_size_max = entry_size_max;
+ j->write_delay_ms = 100;
+ j->reclaim_delay_ms = 100;
+
+ bkey_extent_init(&j->key);
+
+ atomic64_set(&j->reservations.counter,
+ ((union journal_res_state)
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+ !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+ssize_t bch_journal_print_debug(struct journal *j, char *buf)
+{
+ union journal_res_state *s = &j->reservations;
+ struct cache *ca;
+ unsigned iter;
+ ssize_t ret = 0;
+
+ rcu_read_lock();
+ spin_lock(&j->lock);
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "active journal entries:\t%zu\n"
+ "seq:\t\t\t%llu\n"
+ "last_seq:\t\t%llu\n"
+ "last_seq_ondisk:\t%llu\n"
+ "reservation count:\t%u\n"
+ "reservation offset:\t%u\n"
+ "current entry u64s:\t%u\n"
+ "io in flight:\t\t%i\n"
+ "need write:\t\t%i\n"
+ "dirty:\t\t\t%i\n"
+ "replay done:\t\t%i\n",
+ fifo_used(&j->pin),
+ (u64) atomic64_read(&j->seq),
+ last_seq(j),
+ j->last_seq_ondisk,
+ journal_state_count(*s, s->idx),
+ s->cur_entry_offset,
+ j->cur_entry_u64s,
+ s->prev_buf_unwritten,
+ test_bit(JOURNAL_NEED_WRITE, &j->flags),
+ journal_entry_is_open(j),
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+ group_for_each_cache_rcu(ca, &j->devs, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "dev %u:\n"
+ "\tnr\t\t%u\n"
+ "\tcur_idx\t\t%u (seq %llu)\n"
+ "\tlast_idx\t%u (seq %llu)\n",
+ iter, bch_nr_journal_buckets(ca->disk_sb.sb),
+ ja->cur_idx, ja->bucket_seq[ja->cur_idx],
+ ja->last_idx, ja->bucket_seq[ja->last_idx]);
+ }
+
+ spin_unlock(&j->lock);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool bch_journal_writing_to_device(struct cache *ca)
+{
+ struct journal *j = &ca->set->journal;
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
+ ca->sb.nr_this_dev);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * This assumes that ca has already been marked read-only so that
+ * journal_write_alloc() won't pick buckets out of ca any more.
+ * Hence, if the journal is not currently pointing to ca, there
+ * will be no new writes to journal entries in ca after all the
+ * pending ones have been flushed to disk.
+ *
+ * If the journal is being written to ca, write a new record, and
+ * journal_write_alloc() will notice that the device is no longer
+ * writeable and pick a new set of devices to write to.
+ */
+
+int bch_journal_move(struct cache *ca)
+{
+ unsigned i, nr_buckets;
+ u64 last_flushed_seq;
+ struct cache_set *c = ca->set;
+ struct journal *j = &c->journal;
+ int ret = 0; /* Success */
+
+ if (bch_journal_writing_to_device(ca)) {
+ /*
+ * bch_journal_meta will write a record and we'll wait
+ * for the write to complete.
+		 * Actually writing the journal (journal_write)
+		 * will call journal_write_alloc(), which notices that the
+		 * device is no longer writeable, and picks a new one.
+ */
+ bch_journal_meta(j);
+ BUG_ON(bch_journal_writing_to_device(ca));
+ }
+
+ /*
+ * Flush all btree updates to backing store so that any
+ * journal entries written to ca become stale and are no
+ * longer needed.
+ */
+
+ /*
+ * XXX: switch to normal journal reclaim machinery
+ */
+ bch_btree_flush(c);
+
+ /*
+ * Force a meta-data journal entry to be written so that
+ * we have newer journal entries in devices other than ca,
+ * and wait for the meta data write to complete.
+ */
+ bch_journal_meta(j);
+
+ /*
+ * Verify that we no longer need any of the journal entries in
+ * the device
+ */
+ spin_lock(&j->lock);
+ last_flushed_seq = last_seq(j);
+ spin_unlock(&j->lock);
+
+ nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
+
+ for (i = 0; i < nr_buckets; i += 1)
+ BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq);
+
+ return ret;
+}
diff --git a/libbcache/journal.h b/libbcache/journal.h
new file mode 100644
index 0000000..759ed60
--- /dev/null
+++ b/libbcache/journal.h
@@ -0,0 +1,387 @@
+#ifndef _BCACHE_JOURNAL_H
+#define _BCACHE_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal
+ * write to complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after the write delay has elapsed - 100 ms by default
+ * (the write_delay_ms field in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->bucket_seq): for each journal bucket, the highest sequence
+ * number of any journal entry it contains. Then, by comparing that against
+ * last_seq we can determine whether that journal bucket contains dirty
+ * journal entries or not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle both
+ * cases the same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
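+
+/*
+ * Illustrative sketch of the reservation lifecycle described above (not part
+ * of the interface; error handling is elided and @k is assumed to be a
+ * caller-provided struct bkey_i):
+ *
+ *	struct journal_res res = { 0 };
+ *	unsigned u64s = jset_u64s(k->k.u64s);
+ *
+ *	if (!bch_journal_res_get(j, &res, u64s, u64s)) {
+ *		bch_journal_add_keys(j, &res, BTREE_ID_EXTENTS, k);
+ *		bch_journal_res_put(j, &res);
+ *
+ *		(for a synchronous update, the caller would then wait on the
+ *		 write with bch_journal_flush_seq(j, res.seq))
+ *	}
+ */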
+
+#include <linux/hash.h>
+
+#include "journal_types.h"
+
+static inline struct jset_entry *jset_keys_next(struct jset_entry *j)
+{
+ return (void *) __bkey_idx(j, le16_to_cpu(j->u64s));
+}
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache_registration
+ */
+struct journal_replay {
+ struct list_head list;
+ struct jset j;
+};
+
+#define JOURNAL_PIN ((32 * 1024) - 1)
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+ return pin->pin_list != NULL;
+}
+
+void bch_journal_pin_add(struct journal *, struct journal_entry_pin *,
+ journal_pin_flush_fn);
+void bch_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+void bch_journal_pin_add_if_older(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
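+
+/*
+ * Illustrative sketch only (my_flush and obj are hypothetical, not part of
+ * the interface): something that keeps a journal entry dirty embeds a
+ * struct journal_entry_pin and registers a flush callback:
+ *
+ *	static void my_flush(struct journal *j, struct journal_entry_pin *pin)
+ *	{
+ *		(write out whatever contains @pin; once that write completes,
+ *		 bch_journal_pin_drop(j, pin) releases the pin)
+ *	}
+ *
+ *	bch_journal_pin_add(j, &obj->journal_pin, my_flush);
+ */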
+
+struct closure;
+struct cache_set;
+struct keylist;
+
+struct bkey_i *bch_journal_find_btree_root(struct cache_set *, struct jset *,
+ enum btree_id, unsigned *);
+
+int bch_journal_seq_should_ignore(struct cache_set *, u64, struct btree *);
+
+u64 bch_inode_journal_seq(struct journal *, u64);
+
+static inline int journal_state_count(union journal_res_state s, int idx)
+{
+ return idx == 0 ? s.buf0_count : s.buf1_count;
+}
+
+static inline void journal_state_inc(union journal_res_state *s)
+{
+ s->buf0_count += s->idx == 0;
+ s->buf1_count += s->idx == 1;
+}
+
+static inline void bch_journal_set_has_inode(struct journal_buf *buf, u64 inum)
+{
+ set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode);
+}
+
+/*
+ * Amount of space that will be taken up by some keys in the journal (i.e.
+ * including the jset_entry header)
+ */
+static inline unsigned jset_u64s(unsigned u64s)
+{
+ return u64s + sizeof(struct jset_entry) / sizeof(u64);
+}
+
+static inline void bch_journal_add_entry_at(struct journal_buf *buf,
+ const void *data, size_t u64s,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned offset)
+{
+ struct jset_entry *entry = bkey_idx(buf->data, offset);
+
+ entry->u64s = cpu_to_le16(u64s);
+ entry->btree_id = id;
+ entry->level = level;
+ entry->flags = 0;
+ SET_JOURNAL_ENTRY_TYPE(entry, type);
+
+ memcpy_u64s(entry->_data, data, u64s);
+}
+
+static inline void bch_journal_add_keys(struct journal *j, struct journal_res *res,
+ enum btree_id id, const struct bkey_i *k)
+{
+ struct journal_buf *buf = &j->buf[res->idx];
+ unsigned actual = jset_u64s(k->k.u64s);
+
+ EBUG_ON(!res->ref);
+ BUG_ON(actual > res->u64s);
+
+ bch_journal_set_has_inode(buf, k->k.p.inode);
+
+ bch_journal_add_entry_at(buf, k, k->k.u64s,
+ JOURNAL_ENTRY_BTREE_KEYS, id,
+ 0, res->offset);
+
+ res->offset += actual;
+ res->u64s -= actual;
+}
+
+void bch_journal_buf_put_slowpath(struct journal *, bool);
+
+static inline void bch_journal_buf_put(struct journal *j, unsigned idx,
+ bool need_write_just_set)
+{
+ union journal_res_state s;
+
+ s.v = atomic64_sub_return(((union journal_res_state) {
+ .buf0_count = idx == 0,
+ .buf1_count = idx == 1,
+ }).v, &j->reservations.counter);
+
+ EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
+
+ /*
+ * Do not initiate a journal write if the journal is in an error state
+ * (previous journal entry write may have failed)
+ */
+ if (s.idx != idx &&
+ !journal_state_count(s, idx) &&
+ s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
+ bch_journal_buf_put_slowpath(j, need_write_just_set);
+}
+
+/*
+ * This function releases the journal write structure so other threads can
+ * then proceed to add their keys as well.
+ */
+static inline void bch_journal_res_put(struct journal *j,
+ struct journal_res *res)
+{
+ if (!res->ref)
+ return;
+
+ lock_release(&j->res_map, 0, _RET_IP_);
+
+ while (res->u64s) {
+ bch_journal_add_entry_at(&j->buf[res->idx], NULL, 0,
+ JOURNAL_ENTRY_BTREE_KEYS,
+ 0, 0, res->offset);
+ res->offset += jset_u64s(0);
+ res->u64s -= jset_u64s(0);
+ }
+
+ bch_journal_buf_put(j, res->idx, false);
+
+ res->ref = 0;
+}
+
+int bch_journal_res_get_slowpath(struct journal *, struct journal_res *,
+ unsigned, unsigned);
+
+static inline int journal_res_get_fast(struct journal *j,
+ struct journal_res *res,
+ unsigned u64s_min,
+ unsigned u64s_max)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+
+ /*
+ * Check if there is still room in the current journal
+ * entry:
+ */
+ if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+ return 0;
+
+ res->offset = old.cur_entry_offset;
+ res->u64s = min(u64s_max, j->cur_entry_u64s -
+ old.cur_entry_offset);
+
+ journal_state_inc(&new);
+ new.cur_entry_offset += res->u64s;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ res->ref = true;
+ res->idx = new.idx;
+ res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+ return 1;
+}
+
+static inline int bch_journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned u64s_min, unsigned u64s_max)
+{
+ int ret;
+
+ EBUG_ON(res->ref);
+ EBUG_ON(u64s_max < u64s_min);
+
+ if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+ goto out;
+
+ ret = bch_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+ if (ret)
+ return ret;
+out:
+ lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
+ EBUG_ON(!res->ref);
+ return 0;
+}
+
+void bch_journal_wait_on_seq(struct journal *, u64, struct closure *);
+void bch_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch_journal_flush_async(struct journal *, struct closure *);
+void bch_journal_meta_async(struct journal *, struct closure *);
+
+int bch_journal_flush_seq(struct journal *, u64);
+int bch_journal_flush(struct journal *);
+int bch_journal_meta(struct journal *);
+
+void bch_journal_halt(struct journal *);
+
+static inline int bch_journal_error(struct journal *j)
+{
+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
+ ? -EIO : 0;
+}
+
+static inline bool is_journal_device(struct cache *ca)
+{
+ return ca->mi.state == CACHE_ACTIVE && ca->mi.tier == 0;
+}
+
+static inline bool journal_flushes_device(struct cache *ca)
+{
+ return true;
+}
+
+void bch_journal_start(struct cache_set *);
+void bch_journal_mark(struct cache_set *, struct list_head *);
+void bch_journal_entries_free(struct list_head *);
+int bch_journal_read(struct cache_set *, struct list_head *);
+int bch_journal_replay(struct cache_set *, struct list_head *);
+
+static inline void bch_journal_set_replay_done(struct journal *j)
+{
+ spin_lock(&j->lock);
+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+ set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+ j->cur_pin_list = &fifo_peek_back(&j->pin);
+ spin_unlock(&j->lock);
+}
+
+void bch_journal_free(struct journal *);
+int bch_journal_alloc(struct journal *, unsigned);
+
+ssize_t bch_journal_print_debug(struct journal *, char *);
+
+int bch_cache_journal_alloc(struct cache *);
+
+static inline __le64 *__journal_buckets(struct cache_sb *sb)
+{
+ return sb->_data + bch_journal_buckets_offset(sb);
+}
+
+static inline u64 journal_bucket(struct cache_sb *sb, unsigned nr)
+{
+ return le64_to_cpu(__journal_buckets(sb)[nr]);
+}
+
+static inline void set_journal_bucket(struct cache_sb *sb, unsigned nr, u64 bucket)
+{
+ __journal_buckets(sb)[nr] = cpu_to_le64(bucket);
+}
+
+int bch_journal_move(struct cache *);
+
+#endif /* _BCACHE_JOURNAL_H */
diff --git a/libbcache/journal_types.h b/libbcache/journal_types.h
new file mode 100644
index 0000000..e3698b5
--- /dev/null
+++ b/libbcache/journal_types.h
@@ -0,0 +1,240 @@
+#ifndef _BCACHE_JOURNAL_TYPES_H
+#define _BCACHE_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "fifo.h"
+
+struct journal_res;
+
+/*
+ * We put two of these in struct journal; we use them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_buf {
+ struct jset *data;
+ struct closure_waitlist wait;
+
+ /*
+ * ugh, prio_buckets are stupid - need to convert them to new
+ * transaction machinery when it arrives
+ */
+ unsigned nr_prio_buckets;
+
+ /* bloom filter: */
+ unsigned long has_inode[1024 / sizeof(unsigned long)];
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+struct journal_entry_pin_list {
+ struct list_head list;
+ atomic_t count;
+};
+
+struct journal;
+struct journal_entry_pin;
+typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *);
+
+struct journal_entry_pin {
+ struct list_head list;
+ journal_pin_flush_fn flush;
+ struct journal_entry_pin_list *pin_list;
+};
+
+/* corresponds to a btree node with a blacklisted bset: */
+struct blacklisted_node {
+ __le64 seq;
+ enum btree_id btree_id;
+ struct bpos pos;
+};
+
+struct journal_seq_blacklist {
+ struct list_head list;
+ u64 seq;
+ bool written;
+ struct journal_entry_pin pin;
+
+ struct blacklisted_node *entries;
+ size_t nr_entries;
+};
+
+struct journal_res {
+ bool ref;
+ u8 idx;
+ u16 u64s;
+ u32 offset;
+ u64 seq;
+};
+
+union journal_res_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 cur_entry_offset:20,
+ idx:1,
+ prev_buf_unwritten:1,
+ buf0_count:21,
+ buf1_count:21;
+ };
+};
+
+/* 4 mb, in bytes: */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20)
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
+
+/*
+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
+ * either because something's waiting on the write to complete or because it's
+ * been dirty too long and the timer's expired.
+ */
+
+enum {
+ JOURNAL_REPLAY_DONE,
+ JOURNAL_STARTED,
+ JOURNAL_NEED_WRITE,
+};
+
+/* Embedded in struct cache_set */
+struct journal {
+ /* Fastpath stuff up front: */
+
+ unsigned long flags;
+
+ union journal_res_state reservations;
+ unsigned cur_entry_u64s;
+ unsigned prev_buf_sectors;
+ unsigned cur_buf_sectors;
+ unsigned entry_size_max; /* bytes */
+
+ /*
+ * Two journal entries -- one is currently open for new entries, the
+ * other is possibly being written out.
+ */
+ struct journal_buf buf[2];
+
+ spinlock_t lock;
+
+ /* Used when waiting because the journal was full */
+ wait_queue_head_t wait;
+
+ struct closure io;
+ struct delayed_work write_work;
+
+ /* Sequence number of most recent journal entry (last entry in @pin) */
+ atomic64_t seq;
+
+ /* last_seq from the most recent journal entry written */
+ u64 last_seq_ondisk;
+
+ /*
+ * FIFO of journal entries whose btree updates have not yet been
+ * written out.
+ *
+ * Each entry is a reference count. The position in the FIFO is the
+ * entry's sequence number relative to @seq.
+ *
+ * The journal entry itself holds a reference count, put when the
+ * journal entry is written out. Each btree node modified by the journal
+ * entry also holds a reference count, put when the btree node is
+ * written.
+ *
+ * When a reference count reaches zero, the journal entry is no longer
+ * needed. When all journal entries in the oldest journal bucket are no
+ * longer needed, the bucket can be discarded and reused.
+ */
+ DECLARE_FIFO(struct journal_entry_pin_list, pin);
+ struct journal_entry_pin_list *cur_pin_list;
+
+ /*
+ * Protects the pin lists - the fifo itself is still protected by
+ * j->lock though:
+ */
+ spinlock_t pin_lock;
+
+ struct mutex blacklist_lock;
+ struct list_head seq_blacklist;
+
+ BKEY_PADDED(key);
+ struct cache_group devs;
+
+ struct delayed_work reclaim_work;
+ unsigned long last_flushed;
+
+ /* protects advancing ja->last_idx: */
+ struct mutex reclaim_lock;
+
+ /*
+ * ugh: need to get prio_buckets converted over to the eventual new
+ * transaction machinery
+ */
+ __le64 prio_buckets[MAX_CACHES_PER_SET];
+ unsigned nr_prio_buckets;
+
+ unsigned write_delay_ms;
+ unsigned reclaim_delay_ms;
+
+ u64 res_get_blocked_start;
+ u64 need_write_time;
+ u64 write_start_time;
+
+ struct time_stats *write_time;
+ struct time_stats *delay_time;
+ struct time_stats *blocked_time;
+ struct time_stats *flush_seq_time;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map res_map;
+#endif
+};
+
+/*
+ * Embedded in struct cache. First three fields refer to the array of journal
+ * buckets, in cache_sb.
+ */
+struct journal_device {
+ /*
+ * For each journal bucket, contains the max sequence number of the
+ * journal writes it contains - so we know when a bucket can be reused.
+ */
+ u64 *bucket_seq;
+
+ unsigned sectors_free;
+
+ /* Journal bucket we're currently writing to */
+ unsigned cur_idx;
+
+ /* Last journal bucket that still contains an open journal entry */
+
+ /*
+ * j->lock and j->reclaim_lock must both be held to modify, j->lock
+ * sufficient to read:
+ */
+ unsigned last_idx;
+
+ /* Bio for journal reads/writes to this device */
+ struct bio *bio;
+
+ /* for bch_journal_read_device */
+ struct closure read;
+};
+
+#endif /* _BCACHE_JOURNAL_TYPES_H */
diff --git a/libbcache/keybuf.c b/libbcache/keybuf.c
new file mode 100644
index 0000000..a3c6b03
--- /dev/null
+++ b/libbcache/keybuf.c
@@ -0,0 +1,195 @@
+
+#include "bcache.h"
+#include "btree_gc.h"
+#include "btree_iter.h"
+#include "keybuf.h"
+
+#include <trace/events/bcache.h>
+
+/*
+ * For buffered iteration over the btree, with predicates and ratelimiting and
+ * whatnot
+ */
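+
+/*
+ * Illustrative sketch of the intended usage (my_pred, buf and w are
+ * hypothetical caller state; error handling elided): callers supply a
+ * predicate, refill the buffer, then consume keys with bch_keybuf_next()
+ * and release them with bch_keybuf_put():
+ *
+ *	static bool my_pred(struct keybuf *buf, struct bkey_s_c k)
+ *	{
+ *		return bkey_extent_is_data(k.k);
+ *	}
+ *
+ *	bch_keybuf_init(&buf);
+ *	bch_refill_keybuf(c, &buf, POS_MAX, my_pred);
+ *
+ *	while ((w = bch_keybuf_next(&buf))) {
+ *		(operate on w->key, then release it:)
+ *		bch_keybuf_put(&buf, w);
+ *	}
+ */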
+
+static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
+{
+ /* Overlapping keys compare equal */
+ if (bkey_cmp(l->key.k.p, bkey_start_pos(&r->key.k)) <= 0)
+ return -1;
+ if (bkey_cmp(bkey_start_pos(&l->key.k), r->key.k.p) >= 0)
+ return 1;
+ return 0;
+}
+
+static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
+ struct keybuf_key *r)
+{
+ return clamp_t(s64, bkey_cmp(l->key.k.p, r->key.k.p), -1, 1);
+}
+
+void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
+ struct bpos end, keybuf_pred_fn *pred)
+{
+ struct bpos start = buf->last_scanned;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned nr_found = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, buf->last_scanned, k) {
+ if (bkey_cmp(k.k->p, end) >= 0) {
+ buf->last_scanned = k.k->p;
+ goto done;
+ }
+
+ if (pred(buf, k)) {
+ struct keybuf_key *w;
+
+ spin_lock(&buf->lock);
+
+ w = array_alloc(&buf->freelist);
+ if (!w) {
+ spin_unlock(&buf->lock);
+ goto done;
+ }
+
+ bkey_reassemble(&w->key, k);
+ atomic_set(&w->ref, -1); /* -1 means hasn't started */
+
+ if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
+ array_free(&buf->freelist, w);
+ else
+ nr_found++;
+
+ spin_unlock(&buf->lock);
+ }
+
+ buf->last_scanned = k.k->p;
+ bch_btree_iter_cond_resched(&iter);
+ }
+
+	/*
+	 * If we end up here, it means:
+	 * - we didn't fill up the keybuf
+	 * - we didn't hit the end key
+	 * - there were no more keys to iterate over
+	 * Therefore, we are at the end of the key space
+	 */
+ buf->last_scanned = POS_MAX;
+done:
+ bch_btree_iter_unlock(&iter);
+
+ trace_bcache_keyscan(nr_found,
+ start.inode, start.offset,
+ buf->last_scanned.inode,
+ buf->last_scanned.offset);
+
+ spin_lock(&buf->lock);
+
+ if (!RB_EMPTY_ROOT(&buf->keys)) {
+ struct keybuf_key *w;
+
+ w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+ buf->start = bkey_start_pos(&w->key.k);
+
+ w = RB_LAST(&buf->keys, struct keybuf_key, node);
+ buf->end = w->key.k.p;
+ } else {
+ buf->start = POS_MAX;
+ buf->end = POS_MAX;
+ }
+
+ spin_unlock(&buf->lock);
+}
+
+static void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+ rb_erase(&w->node, &buf->keys);
+ array_free(&buf->freelist, w);
+}
+
+void bch_keybuf_put(struct keybuf *buf, struct keybuf_key *w)
+{
+ BUG_ON(atomic_read(&w->ref) <= 0);
+
+ if (atomic_dec_and_test(&w->ref)) {
+ up(&buf->in_flight);
+
+ spin_lock(&buf->lock);
+ bch_keybuf_del(buf, w);
+ spin_unlock(&buf->lock);
+ }
+}
+
+void bch_keybuf_recalc_oldest_gens(struct cache_set *c, struct keybuf *buf)
+{
+ struct keybuf_key *w, *n;
+
+ spin_lock(&buf->lock);
+ rbtree_postorder_for_each_entry_safe(w, n,
+ &buf->keys, node)
+ bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(&w->key));
+ spin_unlock(&buf->lock);
+}
+
+bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bpos start,
+ struct bpos end)
+{
+ bool ret = false;
+ struct keybuf_key *w, *next, s = { .key.k.p = start };
+
+ if (bkey_cmp(end, buf->start) <= 0 ||
+ bkey_cmp(start, buf->end) >= 0)
+ return false;
+
+ spin_lock(&buf->lock);
+
+ for (w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
+ w && bkey_cmp(bkey_start_pos(&w->key.k), end) < 0;
+ w = next) {
+ next = RB_NEXT(w, node);
+
+ if (atomic_read(&w->ref) == -1)
+ bch_keybuf_del(buf, w);
+ else
+ ret = true;
+ }
+
+ spin_unlock(&buf->lock);
+ return ret;
+}
+
+struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
+{
+ struct keybuf_key *w;
+
+ spin_lock(&buf->lock);
+
+ w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+
+ while (w && atomic_read(&w->ref) != -1)
+ w = RB_NEXT(w, node);
+
+ if (!w) {
+ spin_unlock(&buf->lock);
+ return NULL;
+ }
+
+ atomic_set(&w->ref, 1);
+ spin_unlock(&buf->lock);
+
+ down(&buf->in_flight);
+
+ return w;
+}
+
+void bch_keybuf_init(struct keybuf *buf)
+{
+ sema_init(&buf->in_flight, KEYBUF_REFILL_BATCH / 2);
+
+ buf->last_scanned = POS_MAX;
+ buf->start = POS_MIN;
+ buf->end = POS_MIN;
+
+ buf->keys = RB_ROOT;
+
+ spin_lock_init(&buf->lock);
+ array_allocator_init(&buf->freelist);
+}
diff --git a/libbcache/keybuf.h b/libbcache/keybuf.h
new file mode 100644
index 0000000..d6fdda9
--- /dev/null
+++ b/libbcache/keybuf.h
@@ -0,0 +1,16 @@
+#ifndef _BCACHE_KEYBUF_H
+#define _BCACHE_KEYBUF_H
+
+#include "keybuf_types.h"
+
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey_s_c);
+
+void bch_keybuf_init(struct keybuf *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *,
+ struct bpos, keybuf_pred_fn *);
+void bch_keybuf_recalc_oldest_gens(struct cache_set *, struct keybuf *);
+bool bch_keybuf_check_overlapping(struct keybuf *, struct bpos, struct bpos);
+void bch_keybuf_put(struct keybuf *, struct keybuf_key *);
+struct keybuf_key *bch_keybuf_next(struct keybuf *);
+
+#endif /* _BCACHE_KEYBUF_H */
diff --git a/libbcache/keybuf_types.h b/libbcache/keybuf_types.h
new file mode 100644
index 0000000..3facc4a
--- /dev/null
+++ b/libbcache/keybuf_types.h
@@ -0,0 +1,33 @@
+#ifndef _BCACHE_KEYBUF_TYPES_H
+#define _BCACHE_KEYBUF_TYPES_H
+
+struct keybuf_key {
+ struct rb_node node;
+ BKEY_PADDED(key);
+ atomic_t ref;
+};
+
+#define KEYBUF_REFILL_BATCH 500
+
+struct keybuf {
+ struct bpos last_scanned;
+ spinlock_t lock;
+
+ /*
+ * Beginning and end of range in rb tree - so that we can skip taking
+ * lock and checking the rb tree when we need to check for overlapping
+ * keys.
+ */
+ struct bpos start;
+ struct bpos end;
+
+ struct rb_root keys;
+
+ unsigned max_in_flight;
+ struct semaphore in_flight;
+
+ DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist,
+ KEYBUF_REFILL_BATCH);
+};
+
+#endif /* _BCACHE_KEYBUF_TYPES_H */
diff --git a/libbcache/keylist.c b/libbcache/keylist.c
new file mode 100644
index 0000000..adf5eeb
--- /dev/null
+++ b/libbcache/keylist.c
@@ -0,0 +1,55 @@
+
+#include "bcache.h"
+#include "keylist.h"
+
+int bch_keylist_realloc(struct keylist *l, u64 *inline_u64s,
+ size_t nr_inline_u64s, size_t new_u64s)
+{
+ size_t oldsize = bch_keylist_u64s(l);
+ size_t newsize = oldsize + new_u64s;
+ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
+ u64 *new_keys;
+
+ newsize = roundup_pow_of_two(newsize);
+
+ if (newsize <= nr_inline_u64s ||
+ (old_buf && roundup_pow_of_two(oldsize) == newsize))
+ return 0;
+
+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
+ if (!new_keys)
+ return -ENOMEM;
+
+ if (!old_buf)
+ memcpy_u64s(new_keys, inline_u64s, oldsize);
+
+ l->keys_p = new_keys;
+ l->top_p = new_keys + oldsize;
+
+ return 0;
+}
+
+void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
+{
+ struct bkey_i *where;
+
+ for_each_keylist_key(l, where)
+ if (bkey_cmp(insert->k.p, where->k.p) < 0)
+ break;
+
+ memmove_u64s_up((u64 *) where + insert->k.u64s,
+ where,
+ ((u64 *) l->top) - ((u64 *) where));
+
+ l->top_p += insert->k.u64s;
+ bkey_copy(where, insert);
+}
+
+void bch_keylist_pop_front(struct keylist *l)
+{
+ l->top_p -= bch_keylist_front(l)->k.u64s;
+
+ memmove_u64s_down(l->keys,
+ bkey_next(l->keys),
+ bch_keylist_u64s(l));
+}
diff --git a/libbcache/keylist.h b/libbcache/keylist.h
new file mode 100644
index 0000000..1166f94
--- /dev/null
+++ b/libbcache/keylist.h
@@ -0,0 +1,62 @@
+#ifndef _BCACHE_KEYLIST_H
+#define _BCACHE_KEYLIST_H
+
+#include "keylist_types.h"
+
+int bch_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
+void bch_keylist_add_in_order(struct keylist *, struct bkey_i *);
+void bch_keylist_pop_front(struct keylist *);
+
+static inline void bch_keylist_init(struct keylist *l, u64 *inline_keys,
+ size_t nr_inline_u64s)
+{
+ l->top_p = l->keys_p = inline_keys;
+}
+
+static inline void bch_keylist_free(struct keylist *l, u64 *inline_keys)
+{
+ if (l->keys_p != inline_keys)
+ kfree(l->keys_p);
+ memset(l, 0, sizeof(*l));
+}
+
+static inline void bch_keylist_push(struct keylist *l)
+{
+ l->top = bkey_next(l->top);
+}
+
+static inline void bch_keylist_add(struct keylist *l, const struct bkey_i *k)
+{
+ bkey_copy(l->top, k);
+ bch_keylist_push(l);
+}
+
+static inline bool bch_keylist_empty(struct keylist *l)
+{
+ return l->top == l->keys;
+}
+
+static inline size_t bch_keylist_u64s(struct keylist *l)
+{
+ return l->top_p - l->keys_p;
+}
+
+static inline size_t bch_keylist_bytes(struct keylist *l)
+{
+ return bch_keylist_u64s(l) * sizeof(u64);
+}
+
+static inline struct bkey_i *bch_keylist_front(struct keylist *l)
+{
+ return l->keys;
+}
+
+#define for_each_keylist_key(_keylist, _k) \
+ for (_k = (_keylist)->keys; \
+ _k != (_keylist)->top; \
+ _k = bkey_next(_k))
+
+#define keylist_single(k) \
+ ((struct keylist) { .keys = k, .top = bkey_next(k) })
+
+#endif /* _BCACHE_KEYLIST_H */
diff --git a/libbcache/keylist_types.h b/libbcache/keylist_types.h
new file mode 100644
index 0000000..195785b
--- /dev/null
+++ b/libbcache/keylist_types.h
@@ -0,0 +1,15 @@
+#ifndef _BCACHE_KEYLIST_TYPES_H
+#define _BCACHE_KEYLIST_TYPES_H
+
+struct keylist {
+ union {
+ struct bkey_i *keys;
+ u64 *keys_p;
+ };
+ union {
+ struct bkey_i *top;
+ u64 *top_p;
+ };
+};
+
+#endif /* _BCACHE_KEYLIST_TYPES_H */
diff --git a/libbcache/migrate.c b/libbcache/migrate.c
new file mode 100644
index 0000000..5a26e22
--- /dev/null
+++ b/libbcache/migrate.c
@@ -0,0 +1,369 @@
+/*
+ * Code for moving data off a device.
+ */
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+
+static int issue_migration_move(struct cache *ca,
+ struct moving_context *ctxt,
+ struct bkey_s_c k)
+{
+ struct cache_set *c = ca->set;
+ struct disk_reservation res;
+ const struct bch_extent_ptr *ptr;
+ int ret;
+
+ if (bch_disk_reservation_get(c, &res, k.k->size, 0))
+ return -ENOSPC;
+
+ extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
+ if (ptr->dev == ca->sb.nr_this_dev)
+ goto found;
+
+ BUG();
+found:
+ /* XXX: we need to be doing something with the disk reservation */
+
+ ret = bch_data_move(c, ctxt, &c->migration_write_point, k, ptr);
+ if (ret)
+ bch_disk_reservation_put(c, &res);
+ return ret;
+}
+
+#define MAX_DATA_OFF_ITER 10
+
+/*
+ * This moves only the data off, leaving the meta-data (if any) in place.
+ * It walks the key space, and for any key with a valid pointer to the
+ * relevant device, it copies it elsewhere, updating the key to point to
+ * the copy.
+ * The meta-data is moved off by bch_move_meta_data_off_device.
+ *
+ * Note: If the number of data replicas desired is > 1, ideally, any
+ * new copies would not be made in a device that already has a
+ * copy (if there are enough devices).
+ * This is _not_ currently implemented. The multiple replicas can
+ * land in the same device even if there are others available.
+ */
+
+int bch_move_data_off_device(struct cache *ca)
+{
+ struct moving_context ctxt;
+ struct cache_set *c = ca->set;
+ unsigned pass = 0;
+ u64 seen_key_count;
+ int ret = 0;
+
+ BUG_ON(ca->mi.state == CACHE_ACTIVE);
+
+ bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
+ ctxt.avoid = ca;
+
+ /*
+ * In theory, only one pass should be necessary as we've
+ * quiesced all writes before calling this.
+ *
+ * However, in practice, more than one pass may be necessary:
+	 * - Some move fails due to an error. We can find this out
+ * from the moving_context.
+ * - Some key swap failed because some of the pointers in the
+ * key in the tree changed due to caching behavior, btree gc
+ * pruning stale pointers, or tiering (if the device being
+ * removed is in tier 0). A smarter bkey_cmpxchg would
+ * handle these cases.
+ *
+ * Thus this scans the tree one more time than strictly necessary,
+ * but that can be viewed as a verification pass.
+ */
+
+ do {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ seen_key_count = 0;
+ atomic_set(&ctxt.error_count, 0);
+ atomic_set(&ctxt.error_flags, 0);
+
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (!bch_move_ctxt_wait(&ctxt) &&
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
+ if (!bkey_extent_is_data(k.k) ||
+ !bch_extent_has_device(bkey_s_c_to_extent(k),
+ ca->sb.nr_this_dev))
+ goto next;
+
+ ret = issue_migration_move(ca, &ctxt, k);
+ if (ret == -ENOMEM) {
+ bch_btree_iter_unlock(&iter);
+
+ /*
+ * memory allocation failure, wait for some IO
+ * to finish
+ */
+ bch_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+ if (ret == -ENOSPC)
+ break;
+ BUG_ON(ret);
+
+ seen_key_count++;
+next:
+ bch_btree_iter_advance_pos(&iter);
+ bch_btree_iter_cond_resched(&iter);
+
+ }
+ bch_btree_iter_unlock(&iter);
+ bch_move_ctxt_exit(&ctxt);
+
+ if (ret)
+ return ret;
+ } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
+
+ if (seen_key_count) {
+ pr_err("Unable to migrate all data in %d iterations.",
+ MAX_DATA_OFF_ITER);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * This walks the btree, and for any node on the relevant device it moves the
+ * node elsewhere.
+ */
+static int bch_move_btree_off(struct cache *ca, enum btree_id id)
+{
+ struct cache_set *c = ca->set;
+ struct btree_iter iter;
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ BUG_ON(ca->mi.state == CACHE_ACTIVE);
+
+ closure_init_stack(&cl);
+
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+retry:
+ if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+ continue;
+
+ ret = bch_btree_node_rewrite(&iter, b, &cl);
+ if (ret == -EINTR || ret == -ENOSPC) {
+ /*
+ * Drop locks to upgrade locks or wait on
+ * reserve: after retaking, recheck in case we
+ * raced.
+ */
+ bch_btree_iter_unlock(&iter);
+ closure_sync(&cl);
+ b = bch_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
+
+ bch_btree_iter_set_locks_want(&iter, 0);
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret; /* btree IO error */
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
+
+ BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev));
+ }
+ bch_btree_iter_unlock(&iter);
+ }
+
+ return 0;
+}
+
+/*
+ * This moves only the meta-data off, leaving the data (if any) in place.
+ * The data is moved off by bch_move_data_off_device, if desired, and
+ * called first.
+ *
+ * Before calling this, allocation of buckets to the device must have
+ * been disabled, otherwise we'll continue to write meta-data to the device
+ * when new buckets are picked for meta-data writes.
+ * In addition, the copying gc and allocator threads for the device
+ * must have been stopped. The allocator thread is the only thread
+ * that writes prio/gen information.
+ *
+ * Meta-data consists of:
+ * - Btree nodes
+ * - Prio/gen information
+ * - Journal entries
+ * - Superblock
+ *
+ * This has to move the btree nodes and the journal only:
+ * - prio/gen information is not written once the allocator thread is stopped.
+ * also, as the prio/gen information is per-device it is not moved.
+ * - the superblock will be written by the caller once after everything
+ * is stopped.
+ *
+ * Note that currently there is no way to stop btree node and journal
+ * meta-data writes to a device without moving the meta-data because
+ * once a bucket is open for a btree node, unless a replacement btree
+ * node is allocated (and the tree updated), the bucket will continue
+ * to be written with updates. Similarly for the journal (it gets
+ * written until filled).
+ *
+ * This routine leaves the data (if any) in place. Whether the data
+ * should be moved off is a decision independent of whether the meta
+ * data should be moved off and stopped:
+ *
+ * - For device removal, both data and meta-data are moved off, in
+ * that order.
+ *
+ * - However, for turning a device read-only without removing it, only
+ * meta-data is moved off since that's the only way to prevent it
+ * from being written. Data is left in the device, but no new data
+ * is written.
+ */
+
+int bch_move_meta_data_off_device(struct cache *ca)
+{
+ unsigned i;
+ int ret;
+
+ /* 1st, Move the btree nodes off the device */
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ ret = bch_move_btree_off(ca, i);
+ if (ret)
+ return ret;
+ }
+
+ /* There are no prios/gens to move -- they are already in the device. */
+
+ /* 2nd. Move the journal off the device */
+
+ ret = bch_journal_move(ca);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Flagging data bad when forcibly removing a device after failing to
+ * migrate the data off the device.
+ */
+
+static int bch_flag_key_bad(struct btree_iter *iter,
+ struct cache *ca,
+ struct bkey_s_c_extent orig)
+{
+ BKEY_PADDED(key) tmp;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct cache_set *c = ca->set;
+
+ bkey_reassemble(&tmp.key, orig.s_c);
+ e = bkey_i_to_s_extent(&tmp.key);
+
+ extent_for_each_ptr_backwards(e, ptr)
+ if (ptr->dev == ca->sb.nr_this_dev)
+ bch_extent_drop_ptr(e, ptr);
+
+ /*
+ * If the new extent no longer has any pointers, bch_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+ */
+ bch_extent_normalize(c, e.s);
+
+ return bch_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(iter, &tmp.key));
+}
+
+/*
+ * This doesn't actually move any data -- it marks a key as bad if it
+ * points to the device being forcibly removed and has no other valid
+ * pointers. If a key does have other valid pointers, the pointers to
+ * the removed device are simply dropped from it instead.
+ *
+ * This is only called if bch_move_data_off_device() above failed, meaning
+ * that we've already tried to move the data MAX_DATA_OFF_ITER times and
+ * are not likely to succeed if we try again.
+ */
+
+int bch_flag_data_bad(struct cache *ca)
+{
+ int ret = 0;
+ struct bkey_s_c k;
+ struct bkey_s_c_extent e;
+ struct btree_iter iter;
+
+ bch_btree_iter_init(&iter, ca->set, BTREE_ID_EXTENTS, POS_MIN);
+
+ while ((k = bch_btree_iter_peek(&iter)).k &&
+ !(ret = btree_iter_err(k))) {
+ if (!bkey_extent_is_data(k.k))
+ goto advance;
+
+ e = bkey_s_c_to_extent(k);
+ if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+ goto advance;
+
+ ret = bch_flag_key_bad(&iter, ca, e);
+
+ /*
+ * don't want to leave ret == -EINTR, since if we raced and
+ * something else overwrote the key we could spuriously return
+ * -EINTR below:
+ */
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ break;
+
+ /*
+ * If the replica we're dropping was dirty and there is an
+ * additional cached replica, the cached replica will now be
+ * considered dirty - upon inserting the new version of the key,
+ * the bucket accounting will be updated to reflect the fact
+ * that the cached data is now dirty and everything works out as
+ * if by magic without us having to do anything.
+ *
+ * The one thing we need to be concerned with here is there's a
+ * race between when we drop any stale pointers from the key
+ * we're about to insert, and when the key actually gets
+ * inserted and the cached data is marked as dirty - we could
+ * end up trying to insert a key with a pointer that should be
+ * dirty, but points to stale data.
+ *
+ * If that happens the insert code just bails out and doesn't do
+ * the insert - however, it doesn't return an error. Hence we
+ * need to always recheck the current key before advancing to
+ * the next:
+ */
+ continue;
+advance:
+ bch_btree_iter_advance_pos(&iter);
+ }
+
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
diff --git a/libbcache/migrate.h b/libbcache/migrate.h
new file mode 100644
index 0000000..55636e0
--- /dev/null
+++ b/libbcache/migrate.h
@@ -0,0 +1,8 @@
+#ifndef _BCACHE_MIGRATE_H
+#define _BCACHE_MIGRATE_H
+
+int bch_move_data_off_device(struct cache *);
+int bch_move_meta_data_off_device(struct cache *);
+int bch_flag_data_bad(struct cache *);
+
+#endif /* _BCACHE_MIGRATE_H */
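The two comment blocks above (on bch_move_meta_data_off_device() and bch_flag_data_bad()) imply an ordering for forced device removal. The sketch below is illustrative only, not part of the patch: it assumes the caller has already disabled allocation and stopped the allocator/copygc threads for the device, and the function name is hypothetical.

static int example_force_remove_device(struct cache *ca)
{
	int ret;

	/* user data first; if migration keeps failing, flag it bad */
	ret = bch_move_data_off_device(ca);
	if (ret)
		ret = bch_flag_data_bad(ca);
	if (ret)
		return ret;

	/* then btree nodes and journal; the superblock is rewritten by
	 * the caller once everything is stopped */
	return bch_move_meta_data_off_device(ca);
}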
diff --git a/libbcache/move.c b/libbcache/move.c
new file mode 100644
index 0000000..f3ab9e8
--- /dev/null
+++ b/libbcache/move.c
@@ -0,0 +1,388 @@
+
+#include "bcache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "io.h"
+#include "move.h"
+#include "super.h"
+#include "keylist.h"
+
+#include <linux/ioprio.h>
+
+#include <trace/events/bcache.h>
+
+static struct bch_extent_ptr *bkey_find_ptr(struct cache_set *c,
+ struct bkey_s_extent e,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_extent_ptr *ptr2;
+ struct cache_member_rcu *mi;
+ unsigned bucket_bits;
+
+ mi = cache_member_info_get(c);
+ bucket_bits = ilog2(mi->m[ptr.dev].bucket_size);
+ cache_member_info_put();
+
+ extent_for_each_ptr(e, ptr2)
+ if (ptr2->dev == ptr.dev &&
+ ptr2->gen == ptr.gen &&
+ (ptr2->offset >> bucket_bits) ==
+ (ptr.offset >> bucket_bits))
+ return ptr2;
+
+ return NULL;
+}
+
+static struct bch_extent_ptr *bch_migrate_matching_ptr(struct migrate_write *m,
+ struct bkey_s_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_ptr *ret;
+
+ if (m->move)
+ ret = bkey_find_ptr(m->op.c, e, m->move_ptr);
+ else
+ extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr)
+ if ((ret = bkey_find_ptr(m->op.c, e, *ptr)))
+ break;
+
+ return ret;
+}
+
+static int bch_migrate_index_update(struct bch_write_op *op)
+{
+ struct cache_set *c = op->c;
+ struct migrate_write *m =
+ container_of(op, struct migrate_write, op);
+ struct keylist *keys = &op->insert_keys;
+ struct btree_iter iter;
+ int ret = 0;
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch_keylist_front(keys)->k));
+
+ while (1) {
+ struct bkey_i *insert = bch_keylist_front(keys);
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
+ struct bch_extent_ptr *ptr;
+ struct bkey_s_extent e;
+ BKEY_PADDED(k) new;
+
+ if (!k.k) {
+ ret = bch_btree_iter_unlock(&iter);
+ break;
+ }
+
+ if (!bkey_extent_is_data(k.k))
+ goto nomatch;
+
+ bkey_reassemble(&new.k, k);
+ bch_cut_front(iter.pos, &new.k);
+ bch_cut_back(insert->k.p, &new.k.k);
+ e = bkey_i_to_s_extent(&new.k);
+
+ /* hack - promotes can race: */
+ if (m->promote)
+ extent_for_each_ptr(bkey_i_to_s_extent(insert), ptr)
+ if (bch_extent_has_device(e.c, ptr->dev))
+ goto nomatch;
+
+ ptr = bch_migrate_matching_ptr(m, e);
+ if (ptr) {
+ unsigned insert_flags =
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL;
+
+ /* copygc uses btree node reserve: */
+ if (m->move)
+ insert_flags |= BTREE_INSERT_USE_RESERVE;
+
+ if (m->move)
+ __bch_extent_drop_ptr(e, ptr);
+
+ memcpy_u64s(extent_entry_last(e),
+ &insert->v,
+ bkey_val_u64s(&insert->k));
+ e.k->u64s += bkey_val_u64s(&insert->k);
+
+ bch_extent_narrow_crcs(e);
+ bch_extent_drop_redundant_crcs(e);
+ bch_extent_normalize(c, e.s);
+
+ ret = bch_btree_insert_at(c, &op->res,
+ NULL, op_journal_seq(op),
+ insert_flags,
+ BTREE_INSERT_ENTRY(&iter, &new.k));
+ if (ret && ret != -EINTR)
+ break;
+ } else {
+nomatch:
+ bch_btree_iter_advance_pos(&iter);
+ }
+
+ while (bkey_cmp(iter.pos, bch_keylist_front(keys)->k.p) >= 0) {
+ bch_keylist_pop_front(keys);
+ if (bch_keylist_empty(keys))
+ goto out;
+ }
+
+ bch_cut_front(iter.pos, bch_keylist_front(keys));
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+void bch_migrate_write_init(struct cache_set *c,
+ struct migrate_write *m,
+ struct write_point *wp,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *move_ptr,
+ unsigned flags)
+{
+ bkey_reassemble(&m->key, k);
+
+ m->promote = false;
+ m->move = move_ptr != NULL;
+ if (move_ptr)
+ m->move_ptr = *move_ptr;
+
+ if (bkey_extent_is_cached(k.k))
+ flags |= BCH_WRITE_CACHED;
+
+ bch_write_op_init(&m->op, c, &m->wbio,
+ (struct disk_reservation) { 0 },
+ wp,
+ bkey_start_pos(k.k),
+ NULL, flags);
+
+ if (m->move)
+ m->op.alloc_reserve = RESERVE_MOVINGGC;
+
+ m->op.nr_replicas = 1;
+ m->op.index_update_fn = bch_migrate_index_update;
+}
+
+static void migrate_bio_init(struct moving_io *io, struct bio *bio,
+ unsigned sectors)
+{
+ bio_init(bio);
+ bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ bio->bi_iter.bi_size = sectors << 9;
+ bio->bi_max_vecs = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ bio->bi_private = &io->cl;
+ bio->bi_io_vec = io->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+}
+
+static void moving_io_destructor(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->ctxt;
+ struct bio_vec *bv;
+ int i;
+
+ //if (io->replace.failures)
+ // trace_bcache_copy_collision(q, &io->key.k);
+
+ atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
+ wake_up(&ctxt->wait);
+
+ bio_for_each_segment_all(bv, &io->write.wbio.bio, i)
+ if (bv->bv_page)
+ __free_page(bv->bv_page);
+
+ kfree(io);
+}
+
+static void moving_error(struct moving_context *ctxt, unsigned flag)
+{
+ atomic_inc(&ctxt->error_count);
+ //atomic_or(flag, &ctxt->error_flags);
+}
+
+static void moving_io_after_write(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->ctxt;
+
+ if (io->write.op.error)
+ moving_error(ctxt, MOVING_FLAG_WRITE);
+
+ moving_io_destructor(cl);
+}
+
+static void write_moving(struct moving_io *io)
+{
+ struct bch_write_op *op = &io->write.op;
+
+ if (op->error) {
+ closure_return_with_destructor(&io->cl, moving_io_destructor);
+ } else {
+ closure_call(&op->cl, bch_write, NULL, &io->cl);
+ closure_return_with_destructor(&io->cl, moving_io_after_write);
+ }
+}
+
+static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+{
+ struct moving_io *io =
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
+
+ return io && io->read_completed ? io : NULL;
+}
+
+static void read_moving_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->ctxt;
+
+ trace_bcache_move_read_done(&io->write.key.k);
+
+ if (bio->bi_error) {
+ io->write.op.error = bio->bi_error;
+ moving_error(io->ctxt, MOVING_FLAG_READ);
+ }
+
+ io->read_completed = true;
+ if (next_pending_write(ctxt))
+ wake_up(&ctxt->wait);
+
+ closure_put(&ctxt->cl);
+}
+
+static void __bch_data_move(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct cache_set *c = io->write.op.c;
+ struct extent_pick_ptr pick;
+
+ bch_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key),
+ io->ctxt->avoid, &pick);
+ if (IS_ERR_OR_NULL(pick.ca))
+ closure_return_with_destructor(cl, moving_io_destructor);
+
+ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k);
+ io->rbio.bio.bi_end_io = read_moving_endio;
+
+ /*
+ * dropped by read_moving_endio() - guards against use after free of
+ * ctxt when doing wakeup
+ */
+ closure_get(&io->ctxt->cl);
+
+ bch_read_extent(c, &io->rbio,
+ bkey_i_to_s_c(&io->write.key),
+ &pick, BCH_READ_IS_LAST);
+}
+
+int bch_data_move(struct cache_set *c,
+ struct moving_context *ctxt,
+ struct write_point *wp,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *move_ptr)
+{
+ struct moving_io *io;
+
+ io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(k.k->size, PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ return -ENOMEM;
+
+ io->ctxt = ctxt;
+
+ migrate_bio_init(io, &io->rbio.bio, k.k->size);
+
+ if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
+ kfree(io);
+ return -ENOMEM;
+ }
+
+ migrate_bio_init(io, &io->write.wbio.bio, k.k->size);
+ bio_get(&io->write.wbio.bio);
+ io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
+
+ bch_migrate_write_init(c, &io->write, wp, k, move_ptr, 0);
+
+ trace_bcache_move_read(&io->write.key.k);
+
+ ctxt->keys_moved++;
+ ctxt->sectors_moved += k.k->size;
+ if (ctxt->rate)
+ bch_ratelimit_increment(ctxt->rate, k.k->size);
+
+ atomic_add(k.k->size, &ctxt->sectors_in_flight);
+ list_add_tail(&io->list, &ctxt->reads);
+
+ closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl);
+ return 0;
+}
+
+static void do_pending_writes(struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ while ((io = next_pending_write(ctxt))) {
+ list_del(&io->list);
+ trace_bcache_move_write(&io->write.key.k);
+ write_moving(io);
+ }
+}
+
+#define move_ctxt_wait_event(_ctxt, _cond) \
+do { \
+ do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ __wait_event((_ctxt)->wait, \
+ next_pending_write(_ctxt) || (_cond)); \
+} while (1)
+
+int bch_move_ctxt_wait(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt,
+ atomic_read(&ctxt->sectors_in_flight) <
+ ctxt->max_sectors_in_flight);
+
+ return ctxt->rate
+ ? bch_ratelimit_wait_freezable_stoppable(ctxt->rate)
+ : 0;
+}
+
+void bch_move_ctxt_wait_for_io(struct moving_context *ctxt)
+{
+ unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
+
+ move_ctxt_wait_event(ctxt,
+ !atomic_read(&ctxt->sectors_in_flight) ||
+ atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
+}
+
+void bch_move_ctxt_exit(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
+ closure_sync(&ctxt->cl);
+
+ EBUG_ON(!list_empty(&ctxt->reads));
+ EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
+}
+
+void bch_move_ctxt_init(struct moving_context *ctxt,
+ struct bch_ratelimit *rate,
+ unsigned max_sectors_in_flight)
+{
+ memset(ctxt, 0, sizeof(*ctxt));
+ closure_init_stack(&ctxt->cl);
+
+ ctxt->rate = rate;
+ ctxt->max_sectors_in_flight = max_sectors_in_flight;
+
+ INIT_LIST_HEAD(&ctxt->reads);
+ init_waitqueue_head(&ctxt->wait);
+}
diff --git a/libbcache/move.h b/libbcache/move.h
new file mode 100644
index 0000000..787023e
--- /dev/null
+++ b/libbcache/move.h
@@ -0,0 +1,87 @@
+#ifndef _BCACHE_MOVE_H
+#define _BCACHE_MOVE_H
+
+#include "buckets.h"
+#include "io_types.h"
+#include "move_types.h"
+
+enum moving_flag_bitnos {
+ MOVING_FLAG_BITNO_READ = 0,
+ MOVING_FLAG_BITNO_WRITE,
+};
+
+#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
+#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
+
+struct migrate_write {
+ BKEY_PADDED(key);
+ bool promote;
+ bool move;
+ struct bch_extent_ptr move_ptr;
+ struct bch_write_op op;
+ struct bch_write_bio wbio;
+};
+
+void bch_migrate_write_init(struct cache_set *,
+ struct migrate_write *,
+ struct write_point *,
+ struct bkey_s_c,
+ const struct bch_extent_ptr *,
+ unsigned);
+
+#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
+
+struct moving_context {
+ /* Closure for waiting on all reads and writes to complete */
+ struct closure cl;
+
+ /* Number and types of errors reported */
+ atomic_t error_count;
+ atomic_t error_flags;
+
+ /* Key and sector moves issued, updated from submission context */
+ u64 keys_moved;
+ u64 sectors_moved;
+
+ /* Rate-limiter counting submitted reads */
+ struct bch_ratelimit *rate;
+
+ /* Try to avoid reading the following device */
+ struct cache *avoid;
+
+ struct list_head reads;
+
+ /* Configuration */
+ unsigned max_sectors_in_flight;
+ atomic_t sectors_in_flight;
+
+ wait_queue_head_t wait;
+};
+
+struct moving_io {
+ struct list_head list;
+ struct rb_node node;
+ struct closure cl;
+ struct moving_context *ctxt;
+ struct migrate_write write;
+ bool read_completed;
+
+ struct bch_read_bio rbio;
+ /* Must be last since it is variable size */
+ struct bio_vec bi_inline_vecs[0];
+};
+
+int bch_data_move(struct cache_set *,
+ struct moving_context *,
+ struct write_point *,
+ struct bkey_s_c,
+ const struct bch_extent_ptr *);
+
+int bch_move_ctxt_wait(struct moving_context *);
+void bch_move_ctxt_wait_for_io(struct moving_context *);
+
+void bch_move_ctxt_exit(struct moving_context *);
+void bch_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
+ unsigned);
+
+#endif /* _BCACHE_MOVE_H */
diff --git a/libbcache/move_types.h b/libbcache/move_types.h
new file mode 100644
index 0000000..0e2275e
--- /dev/null
+++ b/libbcache/move_types.h
@@ -0,0 +1,4 @@
+#ifndef _BCACHE_MOVE_TYPES_H
+#define _BCACHE_MOVE_TYPES_H
+
+#endif /* _BCACHE_MOVE_TYPES_H */
diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c
new file mode 100644
index 0000000..3c85d49
--- /dev/null
+++ b/libbcache/movinggc.c
@@ -0,0 +1,298 @@
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "movinggc.h"
+
+#include <trace/events/bcache.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+
+/* Moving GC - IO loop */
+
+static const struct bch_extent_ptr *moving_pred(struct cache *ca,
+ struct bkey_s_c k)
+{
+ const struct bch_extent_ptr *ptr;
+
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr(e, ptr)
+ if ((ca->sb.nr_this_dev == ptr->dev) &&
+ PTR_BUCKET(ca, ptr)->mark.copygc)
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static int issue_moving_gc_move(struct cache *ca,
+ struct moving_context *ctxt,
+ struct bkey_s_c k)
+{
+ struct cache_set *c = ca->set;
+ const struct bch_extent_ptr *ptr;
+ int ret;
+
+ ptr = moving_pred(ca, k);
+ if (!ptr) /* We raced - bucket's been reused */
+ return 0;
+
+ ret = bch_data_move(c, ctxt, &ca->copygc_write_point, k, ptr);
+ if (!ret)
+ trace_bcache_gc_copy(k.k);
+ else
+ trace_bcache_moving_gc_alloc_fail(c, k.k->size);
+ return ret;
+}
+
+static void read_moving(struct cache *ca, size_t buckets_to_move,
+ u64 sectors_to_move)
+{
+ struct cache_set *c = ca->set;
+ struct bucket *g;
+ struct moving_context ctxt;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 sectors_not_moved = 0;
+ size_t buckets_not_moved = 0;
+
+ bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+ bch_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
+ SECTORS_IN_FLIGHT_PER_DEVICE);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (1) {
+ if (kthread_should_stop())
+ goto out;
+ if (bch_move_ctxt_wait(&ctxt))
+ goto out;
+ k = bch_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+ if (btree_iter_err(k))
+ goto out;
+
+ if (!moving_pred(ca, k))
+ goto next;
+
+ if (issue_moving_gc_move(ca, &ctxt, k)) {
+ bch_btree_iter_unlock(&iter);
+
+ /* memory allocation failure, wait for some IO to finish */
+ bch_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+next:
+ bch_btree_iter_advance_pos(&iter);
+ //bch_btree_iter_cond_resched(&iter);
+
+		/* unlock before calling bch_move_ctxt_wait() again */
+ bch_btree_iter_unlock(&iter);
+ cond_resched();
+ }
+
+ bch_btree_iter_unlock(&iter);
+ bch_move_ctxt_exit(&ctxt);
+ trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
+ buckets_to_move);
+
+ /* don't check this if we bailed out early: */
+ for_each_bucket(g, ca)
+ if (g->mark.copygc && bucket_sectors_used(g)) {
+ sectors_not_moved += bucket_sectors_used(g);
+ buckets_not_moved++;
+ }
+
+ if (sectors_not_moved)
+ bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
+ sectors_not_moved, sectors_to_move,
+ buckets_not_moved, buckets_to_move);
+ return;
+out:
+ bch_btree_iter_unlock(&iter);
+ bch_move_ctxt_exit(&ctxt);
+ trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
+ buckets_to_move);
+}
+
+static bool have_copygc_reserve(struct cache *ca)
+{
+ bool ret;
+
+ spin_lock(&ca->freelist_lock);
+ ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
+ COPYGC_BUCKETS_PER_ITER(ca);
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static void bch_moving_gc(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct bucket *g;
+ struct bucket_mark new;
+ u64 sectors_to_move;
+ size_t buckets_to_move, buckets_unused = 0;
+ struct bucket_heap_entry e;
+ unsigned sectors_used, i;
+ int reserve_sectors;
+
+ if (!have_copygc_reserve(ca)) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ while (1) {
+ closure_wait(&c->freelist_wait, &cl);
+ if (have_copygc_reserve(ca))
+ break;
+ closure_sync(&cl);
+ }
+ closure_wake_up(&c->freelist_wait);
+ }
+
+ reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
+
+ trace_bcache_moving_gc_start(ca);
+
+ /*
+ * Find buckets with lowest sector counts, skipping completely
+ * empty buckets, by building a maxheap sorted by sector count,
+ * and repeatedly replacing the maximum element until all
+ * buckets have been visited.
+ */
+
+ /*
+ * We need bucket marks to be up to date, so gc can't be recalculating
+ * them, and we don't want the allocator invalidating a bucket after
+ * we've decided to evacuate it but before we set copygc:
+ */
+ down_read(&c->gc_lock);
+ mutex_lock(&ca->heap_lock);
+ mutex_lock(&ca->set->bucket_lock);
+
+ ca->heap.used = 0;
+ for_each_bucket(g, ca) {
+ bucket_cmpxchg(g, new, new.copygc = 0);
+
+ if (bucket_unused(g)) {
+ buckets_unused++;
+ continue;
+ }
+
+ if (g->mark.owned_by_allocator ||
+ g->mark.is_metadata)
+ continue;
+
+ sectors_used = bucket_sectors_used(g);
+
+ if (sectors_used >= ca->mi.bucket_size)
+ continue;
+
+ bucket_heap_push(ca, g, sectors_used);
+ }
+
+ sectors_to_move = 0;
+ for (i = 0; i < ca->heap.used; i++)
+ sectors_to_move += ca->heap.data[i].val;
+
+ while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
+ BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
+ sectors_to_move -= e.val;
+ }
+
+ for (i = 0; i < ca->heap.used; i++)
+ bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1);
+
+ buckets_to_move = ca->heap.used;
+
+ mutex_unlock(&ca->set->bucket_lock);
+ mutex_unlock(&ca->heap_lock);
+ up_read(&c->gc_lock);
+
+ read_moving(ca, buckets_to_move, sectors_to_move);
+}
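For instance (numbers assumed): if the candidate buckets pushed onto the heap sum to 200,000 used sectors while COPYGC_SECTORS_PER_ITER(ca) is 131,072, the trimming loop above pops the fullest candidates off the heap until the remaining total fits under the limit; only the buckets still left in the heap are then marked copygc and evacuated by read_moving().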
+
+static int bch_moving_gc_thread(void *arg)
+{
+ struct cache *ca = arg;
+ struct cache_set *c = ca->set;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last;
+ u64 available, want, next;
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->copy_gc_enabled))
+ break;
+
+ last = atomic_long_read(&clock->now);
+ /*
+ * don't start copygc until less than half the gc reserve is
+ * available:
+ */
+ available = buckets_available_cache(ca);
+ want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
+ c->opts.gc_reserve_percent, 200);
+ if (available > want) {
+ next = last + (available - want) *
+ ca->mi.bucket_size;
+ bch_kthread_io_clock_wait(clock, next);
+ continue;
+ }
+
+ bch_moving_gc(ca);
+ }
+
+ return 0;
+}
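To make the trigger condition above concrete with assumed numbers: with 100,000 usable buckets and gc_reserve_percent = 10, want = 100,000 * 10 / 200 = 5,000 buckets. If 8,000 buckets are currently available, copygc is not started; the thread instead sleeps on the write I/O clock (which advances as sectors are written) until roughly (8,000 - 5,000) * bucket_size further sectors have been written, then re-checks.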
+
+void bch_moving_init_cache(struct cache *ca)
+{
+ bch_pd_controller_init(&ca->moving_gc_pd);
+ ca->moving_gc_pd.d_term = 0;
+}
+
+int bch_moving_gc_thread_start(struct cache *ca)
+{
+ struct task_struct *t;
+
+ /* The moving gc read thread must be stopped */
+ BUG_ON(ca->moving_gc_read != NULL);
+
+ if (cache_set_init_fault("moving_gc_start"))
+ return -ENOMEM;
+
+ t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read");
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ ca->moving_gc_read = t;
+ wake_up_process(ca->moving_gc_read);
+
+ return 0;
+}
+
+void bch_moving_gc_stop(struct cache *ca)
+{
+ ca->moving_gc_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+
+ if (ca->moving_gc_read)
+ kthread_stop(ca->moving_gc_read);
+ ca->moving_gc_read = NULL;
+}
diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h
new file mode 100644
index 0000000..5f15308
--- /dev/null
+++ b/libbcache/movinggc.h
@@ -0,0 +1,30 @@
+#ifndef _BCACHE_MOVINGGC_H
+#define _BCACHE_MOVINGGC_H
+
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca) \
+ ((ca)->free[RESERVE_MOVINGGC].size / 2)
+
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca) \
+ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
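As a concrete example with assumed numbers: if the RESERVE_MOVINGGC freelist holds 128 buckets and buckets are 2048 sectors (1 MiB), then COPYGC_BUCKETS_PER_ITER(ca) is 64 and COPYGC_SECTORS_PER_ITER(ca) is 64 * 2048 = 131,072 sectors, so each copygc iteration moves at most 64 MiB before letting the reserve refill.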
+
+void bch_moving_init_cache(struct cache *);
+void bch_moving_gc_stop(struct cache *);
+int bch_moving_gc_thread_start(struct cache *);
+
+#endif
diff --git a/libbcache/notify.c b/libbcache/notify.c
new file mode 100644
index 0000000..e9b5568
--- /dev/null
+++ b/libbcache/notify.c
@@ -0,0 +1,133 @@
+/*
+ * Code for sending uevent notifications to user-space.
+ *
+ * Copyright 2015 Datera, Inc.
+ */
+
+#include "bcache.h"
+#include "notify.h"
+
+#include <linux/kobject.h>
+
+#define notify_var(c, format, ...) \
+({ \
+ int ret; \
+ lockdep_assert_held(&(c)->uevent_lock); \
+ ret = add_uevent_var(&(c)->uevent_env, format, ##__VA_ARGS__); \
+ WARN_ON_ONCE(ret); \
+})
+
+static void notify_get(struct cache_set *c)
+{
+ struct kobj_uevent_env *env = &c->uevent_env;
+
+ mutex_lock(&c->uevent_lock);
+ env->envp_idx = 0;
+ env->buflen = 0;
+
+ notify_var(c, "SET_UUID=%pU", c->disk_sb.user_uuid.b);
+}
+
+static void notify_get_cache(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ char buf[BDEVNAME_SIZE];
+
+ notify_get(c);
+ notify_var(c, "UUID=%pU", ca->disk_sb.sb->disk_uuid.b);
+ notify_var(c, "BLOCKDEV=%s", bdevname(ca->disk_sb.bdev, buf));
+}
+
+static void notify_put(struct cache_set *c)
+{
+ struct kobj_uevent_env *env = &c->uevent_env;
+
+ env->envp[env->envp_idx] = NULL;
+ kobject_uevent_env(&c->kobj, KOBJ_CHANGE, env->envp);
+ mutex_unlock(&c->uevent_lock);
+}
+
+void bch_notify_cache_set_read_write(struct cache_set *c)
+{
+ notify_get(c);
+ notify_var(c, "STATE=active");
+ notify_put(c);
+}
+
+void bch_notify_cache_set_read_only(struct cache_set *c)
+{
+ notify_get(c);
+ notify_var(c, "STATE=readonly");
+ notify_put(c);
+}
+
+void bch_notify_cache_set_stopped(struct cache_set *c)
+{
+ notify_get(c);
+ notify_var(c, "STATE=stopped");
+ notify_put(c);
+}
+
+void bch_notify_cache_read_write(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=active");
+ notify_put(c);
+}
+
+void bch_notify_cache_read_only(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=readonly");
+ notify_put(c);
+}
+
+void bch_notify_cache_added(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+	notify_var(c, "STATE=added");
+ notify_put(c);
+}
+
+void bch_notify_cache_removing(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=removing");
+ notify_put(c);
+}
+
+void bch_notify_cache_remove_failed(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=remove_failed");
+ notify_put(c);
+}
+
+void bch_notify_cache_removed(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=removed");
+ notify_put(c);
+}
+
+void bch_notify_cache_error(struct cache *ca, bool fatal)
+{
+ struct cache_set *c = ca->set;
+
+ notify_get_cache(ca);
+ notify_var(c, "STATE=error");
+ notify_var(c, "FATAL=%d", fatal);
+ notify_put(c);
+}
diff --git a/libbcache/notify.h b/libbcache/notify.h
new file mode 100644
index 0000000..80d6587
--- /dev/null
+++ b/libbcache/notify.h
@@ -0,0 +1,22 @@
+/*
+ * Code for sending uevent notifications to user-space.
+ *
+ * Copyright 2015 Datera, Inc.
+ */
+
+#ifndef _NOTIFY_H
+#define _NOTIFY_H
+
+void bch_notify_cache_set_read_write(struct cache_set *);
+void bch_notify_cache_set_read_only(struct cache_set *);
+void bch_notify_cache_set_stopped(struct cache_set *);
+
+void bch_notify_cache_read_write(struct cache *);
+void bch_notify_cache_read_only(struct cache *);
+void bch_notify_cache_added(struct cache *);
+void bch_notify_cache_removing(struct cache *);
+void bch_notify_cache_removed(struct cache *);
+void bch_notify_cache_remove_failed(struct cache *);
+void bch_notify_cache_error(struct cache *, bool);
+
+#endif /* _NOTIFY_H */
diff --git a/libbcache/opts.c b/libbcache/opts.c
new file mode 100644
index 0000000..249dd5d
--- /dev/null
+++ b/libbcache/opts.c
@@ -0,0 +1,179 @@
+
+#include <linux/kernel.h>
+
+#include "opts.h"
+#include "util.h"
+
+const char * const bch_bool_opt[] = {
+ "0",
+ "1",
+ NULL
+};
+
+const char * const bch_uint_opt[] = {
+ NULL
+};
+
+const char * const bch_error_actions[] = {
+ "continue",
+ "remount-ro",
+ "panic",
+ NULL
+};
+
+const char * const bch_csum_types[] = {
+ "none",
+ "crc32c",
+ "crc64",
+ NULL
+};
+
+const char * const bch_compression_types[] = {
+ "none",
+ "lz4",
+ "gzip",
+ NULL
+};
+
+const char * const bch_str_hash_types[] = {
+ "crc32c",
+ "crc64",
+ "siphash",
+ "sha1",
+ NULL
+};
+
+enum bch_opts {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ Opt_##_name,
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ Opt_bad_opt,
+};
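Both this enum and the opt_table[] in parse_one_opt() below are generated from the same CACHE_SET_OPT() X-macro list in opts.h. Roughly, a single entry such as the posix_acl option from CACHE_SET_VISIBLE_OPTS() expands to:

	Opt_posix_acl,			/* in enum bch_opts */

	[Opt_posix_acl] = {		/* in opt_table[] below */
		.name	= "posix_acl",
		.opts	= bch_bool_opt,
		.min	= 0,
		.max	= 2,
	},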
+
+struct bch_option {
+ const char *name;
+ const char * const *opts;
+ unsigned long min, max;
+};
+
+struct bch_opt_result {
+ enum bch_opts opt;
+ unsigned val;
+};
+
+static int parse_bool_opt(const struct bch_option *opt, const char *s)
+{
+ if (!strcmp(opt->name, s))
+ return true;
+
+ if (!strncmp("no", s, 2) && !strcmp(opt->name, s + 2))
+ return false;
+
+ return -1;
+}
+
+static int parse_uint_opt(const struct bch_option *opt, const char *s)
+{
+ unsigned long v;
+ int ret;
+
+ if (strncmp(opt->name, s, strlen(opt->name)))
+ return -1;
+
+ s += strlen(opt->name);
+
+ if (*s != '=')
+ return -1;
+
+ s++;
+
+ ret = kstrtoul(s, 10, &v);
+ if (ret)
+ return ret;
+
+ if (v < opt->min || v >= opt->max)
+ return -ERANGE;
+
+	return v;
+}
+
+static int parse_string_opt(const struct bch_option *opt, const char *s)
+{
+ if (strncmp(opt->name, s, strlen(opt->name)))
+ return -1;
+
+ s += strlen(opt->name);
+
+ if (*s != '=')
+ return -1;
+
+ s++;
+
+ return bch_read_string_list(s, opt->opts);
+}
+
+static struct bch_opt_result parse_one_opt(const char *opt)
+{
+ static const struct bch_option opt_table[] = {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ [Opt_##_name] = { \
+ .name = #_name, \
+ .opts = _choices, \
+ .min = _min, \
+ .max = _max, \
+ },
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+ }, *i;
+
+ for (i = opt_table;
+ i < opt_table + ARRAY_SIZE(opt_table);
+ i++) {
+ int res = i->opts == bch_bool_opt ? parse_bool_opt(i, opt)
+ : i->opts == bch_uint_opt ? parse_uint_opt(i, opt)
+ : parse_string_opt(i, opt);
+
+ if (res >= 0)
+ return (struct bch_opt_result) {
+ i - opt_table, res
+ };
+ }
+
+ return (struct bch_opt_result) { Opt_bad_opt };
+}
+
+int bch_parse_options(struct cache_set_opts *opts, int flags, char *options)
+{
+ char *p;
+
+ *opts = cache_set_opts_empty();
+
+ opts->read_only = (flags & MS_RDONLY) != 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ struct bch_opt_result res = parse_one_opt(p);
+
+ switch (res.opt) {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ case Opt_##_name: \
+ opts->_name = res.val; \
+ break;
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ case Opt_bad_opt:
+ return -EINVAL;
+ default:
+ BUG();
+ }
+ }
+
+ return 0;
+}
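A minimal usage sketch for bch_parse_options(), assuming only boolean options from CACHE_SET_VISIBLE_OPTS(); booleans are spelled "name" or "noname" (see parse_bool_opt() above), and the options buffer must be writable because strsep() modifies it:

static int example_parse(struct cache_set_opts *opts)
{
	char options[] = "posix_acl,nofsck";
	int ret;

	ret = bch_parse_options(opts, MS_RDONLY, options);
	if (ret)
		return ret;

	/* opts->read_only == 1 (from MS_RDONLY), opts->posix_acl == 1,
	 * opts->nofsck == 1; all other options stay at -1 (undefined) */
	return 0;
}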
diff --git a/libbcache/opts.h b/libbcache/opts.h
new file mode 100644
index 0000000..1d19ac6
--- /dev/null
+++ b/libbcache/opts.h
@@ -0,0 +1,100 @@
+#ifndef _BCACHE_OPTS_H
+#define _BCACHE_OPTS_H
+
+#include <linux/bcache.h>
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * We store options as signed integers, where -1 means undefined. This means we
+ * can pass the mount options to cache_set_alloc() as a whole struct, and then
+ * only apply the options from that struct that are defined.
+ */
+
+extern const char * const bch_bool_opt[];
+extern const char * const bch_uint_opt[];
+extern const char * const bch_error_actions[];
+extern const char * const bch_csum_types[];
+extern const char * const bch_compression_types[];
+extern const char * const bch_str_hash_types[];
+
+/* dummy option, for options that aren't stored in the superblock */
+LE64_BITMASK(NO_SB_OPT, struct cache_sb, flags, 0, 0);
+
+#define CACHE_SET_VISIBLE_OPTS() \
+ CACHE_SET_OPT(verbose_recovery, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, false) \
+ CACHE_SET_OPT(posix_acl, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, false) \
+ CACHE_SET_OPT(journal_flush_disabled, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ CACHE_SET_OPT(nofsck, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ CACHE_SET_OPT(fix_errors, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ CACHE_SET_SB_OPTS()
+
+#define CACHE_SET_OPTS() \
+ CACHE_SET_OPT(read_only, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, 0) \
+ CACHE_SET_VISIBLE_OPTS()
+
+struct cache_set_opts {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+ s8 _name;
+
+ CACHE_SET_OPTS()
+#undef CACHE_SET_OPT
+};
+
+static inline struct cache_set_opts cache_set_opts_empty(void)
+{
+ struct cache_set_opts ret;
+
+ memset(&ret, 255, sizeof(ret));
+ return ret;
+}
+
+/*
+ * Initial options from superblock - here we don't want any options undefined;
+ * any options the superblock doesn't specify are set to 0:
+ */
+static inline struct cache_set_opts cache_superblock_opts(struct cache_sb *sb)
+{
+ return (struct cache_set_opts) {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+ ._name = _sb_opt##_BITS ? _sb_opt(sb) : 0,
+
+ CACHE_SET_OPTS()
+#undef CACHE_SET_OPT
+ };
+}
+
+static inline void cache_set_opts_apply(struct cache_set_opts *dst,
+ struct cache_set_opts src)
+{
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+ BUILD_BUG_ON(_max > S8_MAX); \
+ if (src._name >= 0) \
+ dst->_name = src._name;
+
+ CACHE_SET_OPTS()
+#undef CACHE_SET_OPT
+}
+
+int bch_parse_options(struct cache_set_opts *, int, char *);
+
+#endif /* _BCACHE_OPTS_H */
diff --git a/libbcache/request.c b/libbcache/request.c
new file mode 100644
index 0000000..b41d472
--- /dev/null
+++ b/libbcache/request.c
@@ -0,0 +1,825 @@
+/*
+ * Handle a read or a write request and decide what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ *
+ * Main pieces here:
+ *
+ * 1) Data insert path, via bch_data_insert() -- writes data to cache and
+ * updates extents btree
+ * 2) Read path, via bch_read() -- for now only used by bcachefs and ioctl
+ * interface
+ * 3) Read path, via cache_lookup() and struct search -- used by block device
+ * make_request functions
+ * 4) Cache promotion -- used by bch_read() and cache_lookup() to copy data to
+ * the cache, either from a backing device or a cache device in a higher tier
+ *
+ * One tricky thing that comes up is a race condition where a bucket may be
+ * re-used while reads from it are still in flight. To guard against this, we
+ * save the ptr that is being read and check if it is stale once the read
+ * completes. If the ptr is stale, the read is retried.
+ *
+ * #2 and #3 will be unified further in the future.
+ */
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keybuf.h"
+#include "request.h"
+#include "writeback.h"
+#include "stats.h"
+
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+#include <linux/backing-dev.h>
+
+#include <trace/events/bcache.h>
+
+#define CUTOFF_CACHE_ADD 10
+#define CUTOFF_CACHE_READA 15
+
+/* Congested? */
+
+unsigned bch_get_congested(struct cache_set *c)
+{
+ int i;
+ long rand;
+
+ if (!c->congested_read_threshold_us &&
+ !c->congested_write_threshold_us)
+ return 0;
+
+ i = (local_clock_us() - c->congested_last_us) / 1024;
+ if (i < 0)
+ return 0;
+
+ i += atomic_read(&c->congested);
+ if (i >= 0)
+ return 0;
+
+ i += CONGESTED_MAX;
+
+ if (i > 0)
+ i = fract_exp_two(i, 6);
+
+ rand = get_random_int();
+ i -= bitmap_weight(&rand, BITS_PER_LONG);
+
+ return i > 0 ? i : 1;
+}
+
+static void add_sequential(struct task_struct *t)
+{
+ t->sequential_io_avg = ewma_add(t->sequential_io_avg,
+ t->sequential_io, 3);
+ t->sequential_io = 0;
+}
+
+static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
+{
+ return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
+}
+
+static bool check_should_bypass(struct cached_dev *dc, struct bio *bio, int rw)
+{
+ struct cache_set *c = dc->disk.c;
+ unsigned mode = BDEV_CACHE_MODE(dc->disk_sb.sb);
+ unsigned sectors, congested = bch_get_congested(c);
+ struct task_struct *task = current;
+ struct io *i;
+
+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+ sectors_available(c) * 100 < c->capacity * CUTOFF_CACHE_ADD ||
+ (bio_op(bio) == REQ_OP_DISCARD))
+ goto skip;
+
+ if (mode == CACHE_MODE_NONE ||
+ (mode == CACHE_MODE_WRITEAROUND &&
+ op_is_write(bio_op(bio))))
+ goto skip;
+
+ if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
+ bio_sectors(bio) & (c->sb.block_size - 1)) {
+ pr_debug("skipping unaligned io");
+ goto skip;
+ }
+
+ if (bypass_torture_test(dc)) {
+ if ((get_random_int() & 3) == 3)
+ goto skip;
+ else
+ goto rescale;
+ }
+
+ if (!congested && !dc->sequential_cutoff)
+ goto rescale;
+
+ if (!congested &&
+ mode == CACHE_MODE_WRITEBACK &&
+ op_is_write(bio_op(bio)) &&
+ (bio->bi_opf & REQ_SYNC))
+ goto rescale;
+
+ spin_lock(&dc->io_lock);
+
+ hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
+ if (i->last == bio->bi_iter.bi_sector &&
+ time_before(jiffies, i->last_io))
+ goto found;
+
+ i = list_first_entry(&dc->io_lru, struct io, lru);
+
+ add_sequential(task);
+ i->sequential = 0;
+found:
+ if (i->sequential + bio->bi_iter.bi_size > i->sequential)
+ i->sequential += bio->bi_iter.bi_size;
+
+ i->last = bio_end_sector(bio);
+ i->last_io = jiffies + msecs_to_jiffies(5000);
+ task->sequential_io = i->sequential;
+
+ hlist_del(&i->hash);
+ hlist_add_head(&i->hash, iohash(dc, i->last));
+ list_move_tail(&i->lru, &dc->io_lru);
+
+ spin_unlock(&dc->io_lock);
+
+ sectors = max(task->sequential_io,
+ task->sequential_io_avg) >> 9;
+
+ if (dc->sequential_cutoff &&
+ sectors >= dc->sequential_cutoff >> 9) {
+ trace_bcache_bypass_sequential(bio);
+ goto skip;
+ }
+
+ if (congested && sectors >= congested) {
+ trace_bcache_bypass_congested(bio);
+ goto skip;
+ }
+
+rescale:
+ return false;
+skip:
+ bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
+ return true;
+}
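As a worked example of the sequential cutoff above (numbers assumed): with dc->sequential_cutoff set to 4 MiB, a stream of back-to-back reads keeps matching the same io_hash entry, so i->sequential (and with it task->sequential_io) grows by bi_size on every bio; once max(sequential_io, sequential_io_avg) reaches 8192 sectors (4 MiB), trace_bcache_bypass_sequential() fires and the rest of the stream bypasses the cache.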
+
+/* Common code for the make_request functions */
+
+/**
+ * request_endio - endio function for backing device bios
+ */
+static void request_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+
+ if (bio->bi_error) {
+ struct search *s = container_of(cl, struct search, cl);
+ s->iop.error = bio->bi_error;
+ /* Only cache read errors are recoverable */
+ s->recoverable = false;
+ }
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
+static void bio_complete(struct search *s)
+{
+ if (s->orig_bio) {
+ generic_end_io_acct(bio_data_dir(s->orig_bio),
+ &s->d->disk->part0, s->start_time);
+
+ trace_bcache_request_end(s->d, s->orig_bio);
+ s->orig_bio->bi_error = s->iop.error;
+ bio_endio(s->orig_bio);
+ s->orig_bio = NULL;
+ }
+}
+
+static void do_bio_hook(struct search *s, struct bio *orig_bio)
+{
+ int rw = bio_data_dir(orig_bio);
+ struct bio *bio = rw ? &s->wbio.bio : &s->rbio.bio;
+
+ bio_init(bio);
+ __bio_clone_fast(bio, orig_bio);
+ bio->bi_end_io = request_endio;
+ bio->bi_private = &s->cl;
+
+ bio_cnt_set(bio, 3);
+}
+
+static void search_free(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+
+ bio_complete(s);
+
+ if (s->iop.bio)
+ bio_put(&s->iop.bio->bio);
+
+ closure_debug_destroy(cl);
+ mempool_free(s, &s->d->c->search);
+}
+
+static inline struct search *search_alloc(struct bio *bio,
+ struct bcache_device *d)
+{
+ struct search *s;
+
+ s = mempool_alloc(&d->c->search, GFP_NOIO);
+
+ closure_init(&s->cl, NULL);
+ do_bio_hook(s, bio);
+
+ s->orig_bio = bio;
+ s->d = d;
+ s->recoverable = 1;
+ s->bypass = 0;
+ s->write = op_is_write(bio_op(bio));
+ s->read_dirty_data = 0;
+ s->cache_miss = 0;
+ s->start_time = jiffies;
+ s->inode = bcache_dev_inum(d);
+
+ s->iop.c = d->c;
+ s->iop.bio = NULL;
+ s->iop.error = 0;
+
+ return s;
+}
+
+/* Cached devices */
+
+static void cached_dev_bio_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ search_free(cl);
+ cached_dev_put(dc);
+}
+
+/* Process reads */
+
+static void cached_dev_read_error(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bio *bio = &s->rbio.bio;
+
+ if (s->recoverable) {
+ /* Read bucket invalidate races are handled here, also plain
+ * old IO errors from the cache that can be retried from the
+ * backing device (reads of clean data) */
+ trace_bcache_read_retry(s->orig_bio);
+
+ s->iop.error = 0;
+ do_bio_hook(s, s->orig_bio);
+
+ /* XXX: invalidate cache, don't count twice */
+
+ closure_bio_submit(bio, cl);
+ }
+
+ continue_at(cl, cached_dev_bio_complete, NULL);
+}
+
+static void cached_dev_read_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ if (dc->verify && s->recoverable && !s->read_dirty_data)
+ bch_data_verify(dc, s->orig_bio);
+
+ continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
+}
+
+static void cached_dev_read_done_bh(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ bch_mark_cache_accounting(s->iop.c, dc, !s->cache_miss, s->bypass);
+ trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass);
+
+ if (s->iop.error)
+ continue_at_nobarrier(cl, cached_dev_read_error, s->iop.c->wq);
+ else if (dc->verify)
+ continue_at_nobarrier(cl, cached_dev_read_done, s->iop.c->wq);
+ else
+ continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
+}
+
+/**
+ * __cache_promote -- insert result of read bio into cache
+ *
+ * Used for backing devices and flash-only volumes.
+ *
+ * @orig_bio must actually be a bbio with a valid key.
+ */
+void __cache_promote(struct cache_set *c, struct bch_read_bio *orig_bio,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
+ unsigned write_flags)
+{
+#if 0
+ struct cache_promote_op *op;
+ struct bio *bio;
+ unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
+
+ /* XXX: readahead? */
+
+ op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ if (!op)
+ goto out_submit;
+
+ /* clone the bbio */
+ memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
+
+ bio = &op->bio.bio.bio;
+ bio_init(bio);
+ bio_get(bio);
+ bio->bi_bdev = orig_bio->bio.bi_bdev;
+ bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
+ bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
+ bio->bi_end_io = cache_promote_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+
+ if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
+ goto out_free;
+
+ orig_bio->ca = NULL;
+
+ closure_init(&op->cl, &c->cl);
+ op->orig_bio = &orig_bio->bio;
+ op->stale = 0;
+
+ bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point,
+ new, old,
+ BCH_WRITE_ALLOC_NOWAIT|write_flags);
+ op->iop.nr_replicas = 1;
+
+ //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
+ //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
+
+ trace_bcache_promote(&orig_bio->bio);
+
+ op->bio.bio.submit_time_us = local_clock_us();
+ closure_bio_submit(bio, &op->cl);
+
+ continue_at(&op->cl, cache_promote_write, c->wq);
+out_free:
+ kfree(op);
+out_submit:
+ generic_make_request(&orig_bio->bio);
+#endif
+}
+
+/**
+ * cached_dev_cache_miss - populate cache with data from backing device
+ *
+ * We don't write to the cache if s->bypass is set.
+ */
+static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
+ struct bio *bio, unsigned sectors)
+{
+ int ret;
+ unsigned reada = 0;
+ struct bio *miss;
+ BKEY_PADDED(key) replace;
+
+ s->cache_miss = 1;
+
+ if (s->bypass)
+ goto nopromote;
+#if 0
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ /* XXX: broken */
+ if (!(bio->bi_opf & REQ_RAHEAD) &&
+ !(bio->bi_opf & REQ_META) &&
+ ((u64) sectors_available(dc->disk.c) * 100 <
+ (u64) iter->c->capacity * CUTOFF_CACHE_READA))
+ reada = min_t(sector_t, dc->readahead >> 9,
+ bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
+#endif
+ sectors = min(sectors, bio_sectors(bio) + reada);
+
+ replace.key.k = KEY(s->inode,
+ bio->bi_iter.bi_sector + sectors,
+ sectors);
+
+ ret = bch_btree_insert_check_key(iter, &replace.key);
+ if (ret == -EINTR)
+ return ret;
+
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
+
+ miss->bi_end_io = request_endio;
+ miss->bi_private = &s->cl;
+
+ //to_bbio(miss)->key.k = KEY(s->inode,
+ // bio_end_sector(miss),
+ // bio_sectors(miss));
+ to_rbio(miss)->ca = NULL;
+
+ closure_get(&s->cl);
+ __cache_promote(s->iop.c, to_rbio(miss),
+ bkey_i_to_s_c(&replace.key),
+ bkey_to_s_c(&KEY(replace.key.k.p.inode,
+ replace.key.k.p.offset,
+ replace.key.k.size)),
+ BCH_WRITE_CACHED);
+
+ return 0;
+nopromote:
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
+
+ miss->bi_end_io = request_endio;
+ miss->bi_private = &s->cl;
+ closure_bio_submit(miss, &s->cl);
+
+ return 0;
+}
+
+static void cached_dev_read(struct cached_dev *dc, struct search *s)
+{
+ struct cache_set *c = s->iop.c;
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->rbio.bio;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch_increment_clock(c, bio_sectors(bio), READ);
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+ POS(s->inode, bio->bi_iter.bi_sector), k) {
+ BKEY_PADDED(k) tmp;
+ struct extent_pick_ptr pick;
+ unsigned sectors, bytes;
+ bool is_last;
+retry:
+ bkey_reassemble(&tmp.k, k);
+ bch_btree_iter_unlock(&iter);
+ k = bkey_i_to_s_c(&tmp.k);
+
+ bch_extent_pick_ptr(c, k, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, bio, "no device to read from");
+ goto out;
+ }
+
+ sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
+ bio->bi_iter.bi_sector;
+ bytes = sectors << 9;
+ is_last = bytes == bio->bi_iter.bi_size;
+ swap(bio->bi_iter.bi_size, bytes);
+
+ if (pick.ca) {
+ PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ c->prio_clock[READ].hand;
+
+ if (!bkey_extent_is_cached(k.k))
+ s->read_dirty_data = true;
+
+ bch_read_extent(c, &s->rbio, k, &pick,
+ BCH_READ_FORCE_BOUNCE|
+ BCH_READ_RETRY_IF_STALE|
+ (!s->bypass ? BCH_READ_PROMOTE : 0)|
+ (is_last ? BCH_READ_IS_LAST : 0));
+ } else {
+ /* not present (hole), or stale cached data */
+ if (cached_dev_cache_miss(&iter, s, bio, sectors)) {
+ k = bch_btree_iter_peek_with_holes(&iter);
+ if (btree_iter_err(k))
+ break;
+ goto retry;
+ }
+ }
+
+ swap(bio->bi_iter.bi_size, bytes);
+ bio_advance(bio, bytes);
+
+ if (is_last) {
+ bch_btree_iter_unlock(&iter);
+ goto out;
+ }
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ bcache_io_error(c, bio, "btree IO error %i", ret);
+out:
+ continue_at(cl, cached_dev_read_done_bh, NULL);
+}
+
+/* Process writes */
+
+static void cached_dev_write_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ up_read_non_owner(&dc->writeback_lock);
+ cached_dev_bio_complete(cl);
+}
+
+static void cached_dev_write(struct cached_dev *dc, struct search *s)
+{
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->wbio.bio;
+ bool writeback = false;
+ bool bypass = s->bypass;
+ struct bkey insert_key = KEY(s->inode,
+ bio_end_sector(bio),
+ bio_sectors(bio));
+ unsigned flags = BCH_WRITE_DISCARD_ON_ERROR;
+
+ down_read_non_owner(&dc->writeback_lock);
+ if (bch_keybuf_check_overlapping(&dc->writeback_keys,
+ bkey_start_pos(&insert_key),
+ insert_key.p)) {
+ /*
+ * We overlap with some dirty data undergoing background
+ * writeback, force this write to writeback
+ */
+ bypass = false;
+ writeback = true;
+ }
+
+ /*
+ * Discards aren't _required_ to do anything, so skipping if
+ * check_overlapping returned true is ok
+ *
+ * But check_overlapping drops dirty keys for which io hasn't started,
+ * so we still want to call it.
+ */
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ bypass = true;
+
+ if (should_writeback(dc, bio, BDEV_CACHE_MODE(dc->disk_sb.sb),
+ bypass)) {
+ bypass = false;
+ writeback = true;
+ }
+
+ if (bypass) {
+ /*
+ * If this is a bypass-write (as opposed to a discard), send
+ * it down to the backing device. If this is a discard, only
+ * send it to the backing device if the backing device
+ * supports discards. Otherwise, we simply discard the key
+ * range from the cache and don't touch the backing device.
+ */
+ if ((bio_op(bio) != REQ_OP_DISCARD) ||
+ blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
+ closure_bio_submit(s->orig_bio, cl);
+ } else if (writeback) {
+ bch_writeback_add(dc);
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ /* Also need to send a flush to the backing device */
+ struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
+ &dc->disk.bio_split);
+
+ flush->bi_bdev = bio->bi_bdev;
+ flush->bi_end_io = request_endio;
+ flush->bi_private = cl;
+ bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH);
+
+ closure_bio_submit(flush, cl);
+ }
+ } else {
+ struct bio *writethrough =
+ bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
+
+ closure_bio_submit(writethrough, cl);
+
+ flags |= BCH_WRITE_CACHED;
+ flags |= BCH_WRITE_ALLOC_NOWAIT;
+ }
+
+ if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ flags |= BCH_WRITE_FLUSH;
+ if (bypass)
+ flags |= BCH_WRITE_DISCARD;
+
+ bch_write_op_init(&s->iop, dc->disk.c, &s->wbio,
+ (struct disk_reservation) { 0 },
+ foreground_write_point(dc->disk.c,
+ (unsigned long) current),
+ bkey_start_pos(&insert_key),
+ NULL, flags);
+
+ closure_call(&s->iop.cl, bch_write, NULL, cl);
+ continue_at(cl, cached_dev_write_complete, NULL);
+}
+
+/* Cached devices - read & write stuff */
+
+static void __cached_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct search *s;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ int rw = bio_data_dir(bio);
+
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
+
+ bio->bi_bdev = dc->disk_sb.bdev;
+ bio->bi_iter.bi_sector += le64_to_cpu(dc->disk_sb.sb->data_offset);
+
+ if (cached_dev_get(dc)) {
+ struct bio *clone;
+
+ s = search_alloc(bio, d);
+ trace_bcache_request_start(s->d, bio);
+
+ clone = rw ? &s->wbio.bio : &s->rbio.bio;
+
+ if (!bio->bi_iter.bi_size) {
+ if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ bch_journal_flush_async(&s->iop.c->journal,
+ &s->cl);
+
+ /*
+ * If it's a flush, we send the flush to the backing
+ * device too
+ */
+ closure_bio_submit(clone, &s->cl);
+
+ continue_at(&s->cl, cached_dev_bio_complete, NULL);
+ } else {
+ s->bypass = check_should_bypass(dc, bio, rw);
+
+ if (rw)
+ cached_dev_write(dc, s);
+ else
+ cached_dev_read(dc, s);
+ }
+ } else {
+ if ((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ }
+}
+
+static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ struct bio *bio)
+{
+ __cached_dev_make_request(q, bio);
+ return BLK_QC_T_NONE;
+}
+
+static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ return __blkdev_driver_ioctl(dc->disk_sb.bdev, mode, cmd, arg);
+}
+
+static int cached_dev_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
+ int ret = 0;
+
+ if (bdi_congested(&q->backing_dev_info, bits))
+ return 1;
+
+ if (cached_dev_get(dc)) {
+ unsigned i;
+ struct cache *ca;
+
+ for_each_cache(ca, d->c, i) {
+ q = bdev_get_queue(ca->disk_sb.bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ cached_dev_put(dc);
+ }
+
+ return ret;
+}
+
+void bch_cached_dev_request_init(struct cached_dev *dc)
+{
+ struct gendisk *g = dc->disk.disk;
+
+ g->queue->make_request_fn = cached_dev_make_request;
+ g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+ dc->disk.ioctl = cached_dev_ioctl;
+}
+
+/* Blockdev volumes */
+
+static void __blockdev_volume_make_request(struct request_queue *q,
+ struct bio *bio)
+{
+ struct search *s;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+ int rw = bio_data_dir(bio);
+
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
+
+ trace_bcache_request_start(d, bio);
+
+ s = search_alloc(bio, d);
+
+ if (!bio->bi_iter.bi_size) {
+ if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ bch_journal_flush_async(&s->iop.c->journal,
+ &s->cl);
+
+ continue_at(&s->cl, search_free, NULL);
+ } else if (rw) {
+ struct disk_reservation res = { 0 };
+ unsigned flags = 0;
+
+ if (bio_op(bio) != REQ_OP_DISCARD &&
+ bch_disk_reservation_get(d->c, &res, bio_sectors(bio), 0)) {
+ s->iop.error = -ENOSPC;
+ continue_at(&s->cl, search_free, NULL);
+ return;
+ }
+
+ if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
+ flags |= BCH_WRITE_FLUSH;
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ flags |= BCH_WRITE_DISCARD;
+
+ bch_write_op_init(&s->iop, d->c, &s->wbio, res,
+ foreground_write_point(d->c,
+ (unsigned long) current),
+ POS(s->inode, bio->bi_iter.bi_sector),
+ NULL, flags);
+
+ closure_call(&s->iop.cl, bch_write, NULL, &s->cl);
+ } else {
+ closure_get(&s->cl);
+ bch_read(d->c, &s->rbio, bcache_dev_inum(d));
+ }
+ continue_at(&s->cl, search_free, NULL);
+}
+
+static blk_qc_t blockdev_volume_make_request(struct request_queue *q,
+ struct bio *bio)
+{
+ __blockdev_volume_make_request(q, bio);
+ return BLK_QC_T_NONE;
+}
+
+static int blockdev_volume_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return -ENOTTY;
+}
+
+static int blockdev_volume_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct request_queue *q;
+ struct cache *ca;
+ unsigned i;
+ int ret = 0;
+
+ for_each_cache(ca, d->c, i) {
+ q = bdev_get_queue(ca->disk_sb.bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ return ret;
+}
+
+void bch_blockdev_volume_request_init(struct bcache_device *d)
+{
+ struct gendisk *g = d->disk;
+
+ g->queue->make_request_fn = blockdev_volume_make_request;
+ g->queue->backing_dev_info.congested_fn = blockdev_volume_congested;
+ d->ioctl = blockdev_volume_ioctl;
+}
diff --git a/libbcache/request.h b/libbcache/request.h
new file mode 100644
index 0000000..cd3fe12
--- /dev/null
+++ b/libbcache/request.h
@@ -0,0 +1,16 @@
+#ifndef _BCACHE_REQUEST_H_
+#define _BCACHE_REQUEST_H_
+
+#include "stats.h"
+
+struct cache_set;
+struct cached_dev;
+struct bcache_device;
+struct kmem_cache;
+
+unsigned bch_get_congested(struct cache_set *);
+
+void bch_cached_dev_request_init(struct cached_dev *dc);
+void bch_blockdev_volume_request_init(struct bcache_device *d);
+
+#endif /* _BCACHE_REQUEST_H_ */
diff --git a/libbcache/siphash.c b/libbcache/siphash.c
new file mode 100644
index 0000000..5ba80b5
--- /dev/null
+++ b/libbcache/siphash.c
@@ -0,0 +1,185 @@
+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound. Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
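+
+/*
+ * Illustrative sketch (buf, len and the key constants below are
+ * placeholders): with this interface the round counts are passed
+ * explicitly, so SipHash-2-4 corresponds to rc = 2, rf = 4, and a one-shot
+ * hash of a buffer is
+ *
+ *	SIPHASH_KEY key = { .k0 = cpu_to_le64(0x0706050403020100ULL),
+ *			    .k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL) };
+ *	u64 hash = SipHash(&key, 2, 4, buf, len);
+ *
+ * The SipHash24_*()/SipHash48_*() macros in siphash.h wrap the same calls
+ * with the round counts filled in; the streaming Init/Update/End interface
+ * below produces the same result as the one-shot SipHash().
+ */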
+
+//#include <sys/param.h>
+//#include <sys/systm.h>
+
+#include <asm/byteorder.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+static void SipHash_CRounds(SIPHASH_CTX *, int);
+static void SipHash_Rounds(SIPHASH_CTX *, int);
+
+void
+SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+ u64 k0, k1;
+
+ k0 = le64_to_cpu(key->k0);
+ k1 = le64_to_cpu(key->k1);
+
+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+ ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+ memset(ctx->buf, 0, sizeof(ctx->buf));
+ ctx->bytes = 0;
+}
+
+void
+SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
+{
+ const u8 *ptr = src;
+ size_t left, used;
+
+ if (len == 0)
+ return;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ ctx->bytes += len;
+
+ if (used > 0) {
+ left = sizeof(ctx->buf) - used;
+
+ if (len >= left) {
+ memcpy(&ctx->buf[used], ptr, left);
+ SipHash_CRounds(ctx, rc);
+ len -= left;
+ ptr += left;
+ } else {
+ memcpy(&ctx->buf[used], ptr, len);
+ return;
+ }
+ }
+
+ while (len >= sizeof(ctx->buf)) {
+ memcpy(ctx->buf, ptr, sizeof(ctx->buf));
+ SipHash_CRounds(ctx, rc);
+ len -= sizeof(ctx->buf);
+ ptr += sizeof(ctx->buf);
+ }
+
+ if (len > 0)
+ memcpy(&ctx->buf[used], ptr, len);
+}
+
+void
+SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+
+ r = SipHash_End(ctx, rc, rf);
+
+ *((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64
+SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+ size_t left, used;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ left = sizeof(ctx->buf) - used;
+ memset(&ctx->buf[used], 0, left - 1);
+ ctx->buf[7] = ctx->bytes;
+
+ SipHash_CRounds(ctx, rc);
+ ctx->v[2] ^= 0xff;
+ SipHash_Rounds(ctx, rf);
+
+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+ memset(ctx, 0, sizeof(*ctx));
+ return (r);
+}
+
+u64
+SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+ SIPHASH_CTX ctx;
+
+ SipHash_Init(&ctx, key);
+ SipHash_Update(&ctx, rc, rf, src, len);
+ return (SipHash_End(&ctx, rc, rf));
+}
+
+#define SIP_ROTL(x, b) (((x) << (b)) | ((x) >> (64 - (b))))
+
+static void
+SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+ while (rounds--) {
+ ctx->v[0] += ctx->v[1];
+ ctx->v[2] += ctx->v[3];
+ ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
+ ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
+
+ ctx->v[1] ^= ctx->v[0];
+ ctx->v[3] ^= ctx->v[2];
+ ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
+
+ ctx->v[2] += ctx->v[1];
+ ctx->v[0] += ctx->v[3];
+ ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
+ ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
+
+ ctx->v[1] ^= ctx->v[2];
+ ctx->v[3] ^= ctx->v[0];
+ ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
+ }
+}
+
+static void
+SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
+{
+ u64 m = le64_to_cpu(*((__le64 *)ctx->buf));
+
+ ctx->v[3] ^= m;
+ SipHash_Rounds(ctx, rounds);
+ ctx->v[0] ^= m;
+}
diff --git a/libbcache/siphash.h b/libbcache/siphash.h
new file mode 100644
index 0000000..7a4b224
--- /dev/null
+++ b/libbcache/siphash.h
@@ -0,0 +1,86 @@
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is chosen via the wrapper macros below:
+ *  SipHash24_*() for the fast and reasonably strong version
+ *  SipHash48_*() for the stronger version (half as fast)
+ *
+ * SIPHASH_CTX ctx;
+ * SIPHASH_KEY key;			(the 16 byte key)
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH 8
+#define SIPHASH_KEY_LENGTH 16
+#define SIPHASH_DIGEST_LENGTH 8
+
+typedef struct _SIPHASH_CTX {
+ u64 v[4];
+ u8 buf[SIPHASH_BLOCK_LENGTH];
+ u32 bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+ __le64 k0;
+ __le64 k1;
+} SIPHASH_KEY;
+
+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64 SipHash_End(SIPHASH_CTX *, int, int);
+void SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/libbcache/six.c b/libbcache/six.c
new file mode 100644
index 0000000..1bb8bfc
--- /dev/null
+++ b/libbcache/six.c
@@ -0,0 +1,396 @@
+
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include "six.h"
+
+#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
+#define six_release(l) lock_release(l, 0, _RET_IP_)
+
+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
+
+struct six_lock_vals {
+ /* Value we add to the lock in order to take the lock: */
+ u64 lock_val;
+
+ /* If the lock has this value (used as a mask), taking the lock fails: */
+ u64 lock_fail;
+
+ /* Value we add to the lock in order to release the lock: */
+ u64 unlock_val;
+
+ /* Mask that indicates lock is held for this type: */
+ u64 held_mask;
+
+ /* Waitlist we wakeup when releasing the lock: */
+ enum six_lock_type unlock_wakeup;
+};
+
+#define LOCK_VALS { \
+ [SIX_LOCK_read] = { \
+ .lock_val = __SIX_VAL(read_lock, 1), \
+ .lock_fail = __SIX_LOCK_HELD_write, \
+ .unlock_val = -__SIX_VAL(read_lock, 1), \
+ .held_mask = __SIX_LOCK_HELD_read, \
+ .unlock_wakeup = SIX_LOCK_write, \
+ }, \
+ [SIX_LOCK_intent] = { \
+ .lock_val = __SIX_VAL(intent_lock, 1), \
+ .lock_fail = __SIX_LOCK_HELD_intent, \
+ .unlock_val = -__SIX_VAL(intent_lock, 1), \
+ .held_mask = __SIX_LOCK_HELD_intent, \
+ .unlock_wakeup = SIX_LOCK_intent, \
+ }, \
+ [SIX_LOCK_write] = { \
+ .lock_val = __SIX_VAL(seq, 1), \
+ .lock_fail = __SIX_LOCK_HELD_read, \
+ .unlock_val = __SIX_VAL(seq, 1), \
+ .held_mask = __SIX_LOCK_HELD_write, \
+ .unlock_wakeup = SIX_LOCK_read, \
+ }, \
+}
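+
+/*
+ * Reading the table above: taking a read lock fails only while a write lock
+ * is held, taking an intent lock fails only while another intent lock is
+ * held, and taking a write lock fails while any read locks are held (the
+ * caller must already hold the intent lock). Both taking and releasing a
+ * write lock add to seq, so seq is odd exactly while a write lock is held.
+ * On unlock we wake the waitlist that could now make progress: read unlocks
+ * wake write waiters, write unlocks wake readers, and intent unlocks wake
+ * other intent waiters.
+ */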
+
+static void six_set_owner(struct six_lock *lock, enum six_lock_type type)
+{
+ if (type == SIX_LOCK_intent)
+ lock->owner = current;
+}
+
+static void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
+{
+ if (type == SIX_LOCK_intent)
+ lock->owner = NULL;
+}
+
+static inline bool __six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ old.v = v;
+
+ EBUG_ON(type == SIX_LOCK_write &&
+ ((old.v & __SIX_LOCK_HELD_write) ||
+ !(old.v & __SIX_LOCK_HELD_intent)));
+
+ if (old.v & l[type].lock_fail)
+ return false;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v,
+ old.v + l[type].lock_val)) != old.v);
+ return true;
+}
+
+bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ bool ret = __six_trylock_type(lock, type);
+
+ if (ret) {
+ six_acquire(&lock->dep_map, 1);
+ six_set_owner(lock, type);
+ }
+
+ return ret;
+}
+
+bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ old.v = v;
+
+ if (old.seq != seq || old.v & l[type].lock_fail)
+ return false;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v,
+ old.v + l[type].lock_val)) != old.v);
+
+ six_acquire(&lock->dep_map, 1);
+ six_set_owner(lock, type);
+ return true;
+}
+
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+};
+
+/* This is probably up there with the more evil things I've done */
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+
+static inline int six_can_spin_on_owner(struct six_lock *lock)
+{
+ struct task_struct *owner;
+ int retval = 1;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = READ_ONCE(lock->owner);
+ if (owner)
+ retval = owner->on_cpu;
+ rcu_read_unlock();
+ /*
+	 * If lock->owner is not set, the owner may have just acquired the
+	 * lock and not yet set the owner field, or the lock may have been
+	 * released.
+ */
+ return retval;
+}
+
+static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner)
+{
+ bool ret = true;
+
+ rcu_read_lock();
+ while (lock->owner == owner) {
+ /*
+		 * Ensure we emit the owner->on_cpu dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ cpu_relax_lowlatency();
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ struct task_struct *task = current;
+
+ if (type == SIX_LOCK_write)
+ return false;
+
+ preempt_disable();
+ if (!six_can_spin_on_owner(lock))
+ goto fail;
+
+ if (!osq_lock(&lock->osq))
+ goto fail;
+
+ while (1) {
+ struct task_struct *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = READ_ONCE(lock->owner);
+ if (owner && !six_spin_on_owner(lock, owner))
+ break;
+
+ if (__six_trylock_type(lock, type)) {
+ osq_unlock(&lock->osq);
+ preempt_enable();
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+		 * owner acquiring the lock and setting the owner field. If
+		 * we're an RT task, we'll live-lock because we won't let
+		 * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+
+ osq_unlock(&lock->osq);
+fail:
+ preempt_enable();
+
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock again. This avoids getting
+ * scheduled out right after we obtained the lock.
+ */
+ if (need_resched())
+ schedule();
+
+ return false;
+}
+
+void six_lock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old, new;
+ struct six_lock_waiter wait;
+ u64 v;
+
+ six_acquire(&lock->dep_map, 0);
+
+ if (__six_trylock_type(lock, type))
+ goto done;
+
+ if (six_optimistic_spin(lock, type))
+ goto done;
+
+ lock_contended(&lock->dep_map, _RET_IP_);
+
+ INIT_LIST_HEAD(&wait.list);
+ wait.task = current;
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (list_empty_careful(&wait.list)) {
+ raw_spin_lock(&lock->wait_lock);
+ list_add_tail(&wait.list, &lock->wait_list[type]);
+ raw_spin_unlock(&lock->wait_lock);
+ }
+
+ v = READ_ONCE(lock->state.v);
+ do {
+ new.v = old.v = v;
+
+ if (!(old.v & l[type].lock_fail))
+ new.v += l[type].lock_val;
+ else if (!(new.waiters & (1 << type)))
+ new.waiters |= 1 << type;
+ else
+ break; /* waiting bit already set */
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v, new.v)) != old.v);
+
+ if (!(old.v & l[type].lock_fail))
+ break;
+
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait.list)) {
+ raw_spin_lock(&lock->wait_lock);
+ list_del_init(&wait.list);
+ raw_spin_unlock(&lock->wait_lock);
+ }
+done:
+ lock_acquired(&lock->dep_map, _RET_IP_);
+ six_set_owner(lock, type);
+}
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+ union six_lock_state state,
+ unsigned waitlist_id)
+{
+ struct list_head *wait_list = &lock->wait_list[waitlist_id];
+ struct six_lock_waiter *w, *next;
+
+ if (waitlist_id == SIX_LOCK_write && state.read_lock)
+ return;
+
+ if (!(state.waiters & (1 << waitlist_id)))
+ return;
+
+ clear_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, wait_list, list) {
+ list_del_init(&w->list);
+
+ if (wake_up_process(w->task) &&
+ waitlist_id != SIX_LOCK_read) {
+ if (!list_empty(wait_list))
+ set_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+ break;
+ }
+ }
+
+ raw_spin_unlock(&lock->wait_lock);
+}
+
+void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state state;
+
+ six_clear_owner(lock, type);
+
+ EBUG_ON(!(lock->state.v & l[type].held_mask));
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(lock->state.v & __SIX_LOCK_HELD_intent));
+
+ state.v = atomic64_add_return_release(l[type].unlock_val,
+ &lock->state.counter);
+ six_release(&lock->dep_map);
+ six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+bool six_trylock_convert(struct six_lock *lock,
+ enum six_lock_type from,
+ enum six_lock_type to)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+ union six_lock_state old, new;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ new.v = old.v = v;
+ new.v += l[from].unlock_val;
+
+ if (new.v & l[to].lock_fail)
+ return false;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v,
+ new.v + l[to].lock_val)) != old.v);
+
+ six_clear_owner(lock, from);
+ six_set_owner(lock, to);
+
+ six_lock_wakeup(lock, new, l[from].unlock_wakeup);
+
+ return true;
+}
+
+/*
+ * Increment read/intent lock count, assuming we already have it read or intent
+ * locked:
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+ const struct six_lock_vals l[] = LOCK_VALS;
+
+ EBUG_ON(type == SIX_LOCK_write);
+ six_acquire(&lock->dep_map, 0);
+
+ /* XXX: assert already locked, and that we don't overflow: */
+
+ atomic64_add(l[type].lock_val, &lock->state.counter);
+}
+
+/* Convert from intent to read: */
+void six_lock_downgrade(struct six_lock *lock)
+{
+ six_lock_increment(lock, SIX_LOCK_read);
+ six_unlock_intent(lock);
+}
diff --git a/libbcache/six.h b/libbcache/six.h
new file mode 100644
index 0000000..01ed338
--- /dev/null
+++ b/libbcache/six.h
@@ -0,0 +1,136 @@
+
+#ifndef _BCACHE_SIX_H
+#define _BCACHE_SIX_H
+
+#include <linux/lockdep.h>
+#include <linux/osq_lock.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include "util.h"
+
+/*
+ * LOCK STATES:
+ *
+ * read, intent, write (i.e. shared/intent/exclusive, hence the name)
+ *
+ * read and write work as with normal read/write locks - a lock can have
+ * multiple readers, but write excludes reads and other write locks.
+ *
+ * Intent does not block read, but it does block other intent locks. The idea
+ * is that by taking an intent lock, you can then later upgrade to a write lock
+ * without dropping your read lock and without deadlocking - because no other
+ * thread has the intent lock and thus no other thread could be trying to take
+ * the write lock.
+ */
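+
+/*
+ * Illustrative sketch of the intended pattern (the lock name is a
+ * placeholder):
+ *
+ *	six_lock_intent(&b->lock);	// readers may still proceed
+ *	// ... traverse, decide whether to modify ...
+ *	six_lock_write(&b->lock);	// excludes readers; intent still held
+ *	// ... modify ...
+ *	six_unlock_write(&b->lock);
+ *	six_unlock_intent(&b->lock);
+ */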
+
+union six_lock_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ /* for waitlist_bitnr() */
+ unsigned long l;
+ };
+
+ struct {
+ unsigned read_lock:26;
+ unsigned intent_lock:3;
+ unsigned waiters:3;
+ /*
+ * seq works much like in seqlocks: it's incremented every time
+ * we lock and unlock for write.
+ *
+		 * If it's odd, the lock is held for write; if even, it's unlocked.
+ *
+ * Thus readers can unlock, and then lock again later iff it
+ * hasn't been modified in the meantime.
+ */
+ u32 seq;
+ };
+};
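+
+/*
+ * Illustrative sketch of the seq-based retry pattern described above (the
+ * names are placeholders): a reader can drop its lock, do other work, and
+ * later verify that nothing was modified in the meantime:
+ *
+ *	u32 seq = b->lock.state.seq;
+ *	six_unlock_read(&b->lock);
+ *	// ... work that doesn't need the lock ...
+ *	if (!six_relock_read(&b->lock, seq))
+ *		goto retry;	// someone took a write lock; start over
+ */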
+
+#define SIX_LOCK_MAX_RECURSE ((1 << 3) - 1)
+
+enum six_lock_type {
+ SIX_LOCK_read,
+ SIX_LOCK_intent,
+ SIX_LOCK_write,
+};
+
+struct six_lock {
+ union six_lock_state state;
+ struct task_struct *owner;
+ struct optimistic_spin_queue osq;
+
+ raw_spinlock_t wait_lock;
+ struct list_head wait_list[3];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+static __always_inline void __six_lock_init(struct six_lock *lock,
+ const char *name,
+ struct lock_class_key *key)
+{
+ atomic64_set(&lock->state.counter, 0);
+ raw_spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_write]);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+
+#define six_lock_init(lock) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __six_lock_init((lock), #lock, &__key); \
+} while (0)
+
+bool six_trylock_type(struct six_lock *, enum six_lock_type);
+bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned);
+void six_lock_type(struct six_lock *, enum six_lock_type);
+void six_unlock_type(struct six_lock *, enum six_lock_type);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+ enum six_lock_type);
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+void six_lock_downgrade(struct six_lock *);
+
+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
+
+#define __SIX_LOCK(type) \
+static __always_inline bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return six_trylock_type(lock, SIX_LOCK_##type); \
+} \
+ \
+static __always_inline bool six_relock_##type(struct six_lock *lock, u32 seq)\
+{ \
+ return six_relock_type(lock, SIX_LOCK_##type, seq); \
+} \
+ \
+static __always_inline void six_lock_##type(struct six_lock *lock) \
+{ \
+ six_lock_type(lock, SIX_LOCK_##type); \
+} \
+ \
+static __always_inline void six_unlock_##type(struct six_lock *lock) \
+{ \
+ six_unlock_type(lock, SIX_LOCK_##type); \
+}
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+
+#endif /* _BCACHE_SIX_H */
diff --git a/libbcache/stats.c b/libbcache/stats.c
new file mode 100644
index 0000000..a8a4eb3
--- /dev/null
+++ b/libbcache/stats.c
@@ -0,0 +1,219 @@
+/*
+ * bcache stats code
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "stats.h"
+#include "sysfs.h"
+
+/*
+ * We keep absolute totals of various statistics, and additionally a set of
+ * three rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * FIVE_MINUTE_RESCALE, HOUR_RESCALE and DAY_RESCALE are how many times the
+ * timer has to go off before we rescale each set of numbers; that gets us
+ * half-lives of 5 minutes, one hour, and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t in acc->collector, and when
+ * the rescale function runs it resets the atomic counter to 0 and adds its
+ * old value to each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_stats are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
+ */
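+
+/*
+ * Working through the arithmetic (assuming the shift-based ewma_add() from
+ * util.h): scale_stat() computes stat = ewma_add(stat, 0, 5), i.e. roughly
+ * stat - (stat >> 5) = stat * 31/32 per decay. The timer fires every
+ * 5 min / 22; FIVE_MINUTE_RESCALE = 1 decays on every tick, so after 22
+ * ticks the five minute numbers have been scaled by (31/32)^22 ~= 0.497 -
+ * a five minute half-life. HOUR_RESCALE = 12 and DAY_RESCALE = 288 only
+ * decay every 12th/288th tick, stretching the half-life to one hour and
+ * one day respectively.
+ */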
+
+static const unsigned DAY_RESCALE = 288;
+static const unsigned HOUR_RESCALE = 12;
+static const unsigned FIVE_MINUTE_RESCALE = 1;
+static const unsigned accounting_delay = (HZ * 300) / 22;
+static const unsigned accounting_weight = 5;
+
+/* sysfs reading/writing */
+
+read_attribute(cache_hits);
+read_attribute(cache_misses);
+read_attribute(cache_bypass_hits);
+read_attribute(cache_bypass_misses);
+read_attribute(cache_hit_ratio);
+read_attribute(cache_readaheads);
+read_attribute(cache_miss_collisions);
+read_attribute(bypassed);
+read_attribute(foreground_write_ratio);
+read_attribute(foreground_writes);
+read_attribute(gc_writes);
+read_attribute(discards);
+
+SHOW(bch_stats)
+{
+ struct cache_stats *s =
+ container_of(kobj, struct cache_stats, kobj);
+#define var(stat) (s->stat >> 16)
+ var_print(cache_hits);
+ var_print(cache_misses);
+ var_print(cache_bypass_hits);
+ var_print(cache_bypass_misses);
+
+ sysfs_print(cache_hit_ratio,
+ DIV_SAFE(var(cache_hits) * 100,
+ var(cache_hits) + var(cache_misses)));
+
+ var_print(cache_readaheads);
+ var_print(cache_miss_collisions);
+
+ sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
+ sysfs_hprint(foreground_writes, var(foreground_write_sectors) << 9);
+ sysfs_hprint(gc_writes, var(gc_write_sectors) << 9);
+ sysfs_hprint(discards, var(discard_sectors) << 9);
+
+ sysfs_print(foreground_write_ratio,
+ DIV_SAFE(var(foreground_write_sectors) * 100,
+ var(foreground_write_sectors) +
+ var(gc_write_sectors)));
+#undef var
+ return 0;
+}
+
+STORE(bch_stats)
+{
+ return size;
+}
+
+static void bch_stats_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_stats_files[] = {
+ &sysfs_cache_hits,
+ &sysfs_cache_misses,
+ &sysfs_cache_bypass_hits,
+ &sysfs_cache_bypass_misses,
+ &sysfs_cache_hit_ratio,
+ &sysfs_cache_readaheads,
+ &sysfs_cache_miss_collisions,
+ &sysfs_bypassed,
+ &sysfs_foreground_write_ratio,
+ &sysfs_foreground_writes,
+ &sysfs_gc_writes,
+ &sysfs_discards,
+ NULL
+};
+static KTYPE(bch_stats);
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+ struct kobject *parent)
+{
+ int ret = kobject_add(&acc->total.kobj, parent,
+ "stats_total");
+ ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
+ "stats_five_minute");
+ ret = ret ?: kobject_add(&acc->hour.kobj, parent,
+ "stats_hour");
+ ret = ret ?: kobject_add(&acc->day.kobj, parent,
+ "stats_day");
+ return ret;
+}
+
+void bch_cache_accounting_clear(struct cache_accounting *acc)
+{
+ memset(&acc->total.cache_hits,
+ 0,
+ sizeof(unsigned long) * 9);
+}
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc)
+{
+ kobject_put(&acc->total.kobj);
+ kobject_put(&acc->five_minute.kobj);
+ kobject_put(&acc->hour.kobj);
+ kobject_put(&acc->day.kobj);
+
+ atomic_set(&acc->closing, 1);
+ if (del_timer_sync(&acc->timer))
+ closure_return(&acc->cl);
+}
+
+/* EWMA scaling */
+
+static void scale_stat(unsigned long *stat)
+{
+ *stat = ewma_add(*stat, 0, accounting_weight);
+}
+
+static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
+{
+ if (++stats->rescale == rescale_at) {
+ stats->rescale = 0;
+ scale_stat(&stats->cache_hits);
+ scale_stat(&stats->cache_misses);
+ scale_stat(&stats->cache_bypass_hits);
+ scale_stat(&stats->cache_bypass_misses);
+ scale_stat(&stats->cache_readaheads);
+ scale_stat(&stats->cache_miss_collisions);
+ scale_stat(&stats->sectors_bypassed);
+ scale_stat(&stats->foreground_write_sectors);
+ scale_stat(&stats->gc_write_sectors);
+ scale_stat(&stats->discard_sectors);
+ }
+}
+
+static void scale_accounting(unsigned long data)
+{
+ struct cache_accounting *acc = (struct cache_accounting *) data;
+
+#define move_stat(name) do { \
+ unsigned t = atomic_xchg(&acc->collector.name, 0); \
+ t <<= 16; \
+ acc->five_minute.name += t; \
+ acc->hour.name += t; \
+ acc->day.name += t; \
+ acc->total.name += t; \
+} while (0)
+
+ move_stat(cache_hits);
+ move_stat(cache_misses);
+ move_stat(cache_bypass_hits);
+ move_stat(cache_bypass_misses);
+ move_stat(cache_readaheads);
+ move_stat(cache_miss_collisions);
+ move_stat(sectors_bypassed);
+ move_stat(foreground_write_sectors);
+ move_stat(gc_write_sectors);
+ move_stat(discard_sectors);
+
+ scale_stats(&acc->total, 0);
+ scale_stats(&acc->day, DAY_RESCALE);
+ scale_stats(&acc->hour, HOUR_RESCALE);
+ scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
+
+ acc->timer.expires += accounting_delay;
+
+ if (!atomic_read(&acc->closing))
+ add_timer(&acc->timer);
+ else
+ closure_return(&acc->cl);
+}
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+ struct closure *parent)
+{
+ kobject_init(&acc->total.kobj, &bch_stats_ktype);
+ kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
+ kobject_init(&acc->hour.kobj, &bch_stats_ktype);
+ kobject_init(&acc->day.kobj, &bch_stats_ktype);
+
+ closure_init(&acc->cl, parent);
+ init_timer(&acc->timer);
+ acc->timer.expires = jiffies + accounting_delay;
+ acc->timer.data = (unsigned long) acc;
+ acc->timer.function = scale_accounting;
+ add_timer(&acc->timer);
+}
diff --git a/libbcache/stats.h b/libbcache/stats.h
new file mode 100644
index 0000000..39877f9
--- /dev/null
+++ b/libbcache/stats.h
@@ -0,0 +1,52 @@
+#ifndef _BCACHE_STATS_H_
+#define _BCACHE_STATS_H_
+
+#include "stats_types.h"
+
+struct cache_set;
+struct cached_dev;
+struct bcache_device;
+
+void bch_cache_accounting_init(struct cache_accounting *, struct closure *);
+int bch_cache_accounting_add_kobjs(struct cache_accounting *, struct kobject *);
+void bch_cache_accounting_clear(struct cache_accounting *);
+void bch_cache_accounting_destroy(struct cache_accounting *);
+
+static inline void mark_cache_stats(struct cache_stat_collector *stats,
+ bool hit, bool bypass)
+{
+ atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
+}
+
+static inline void bch_mark_cache_accounting(struct cache_set *c,
+ struct cached_dev *dc,
+ bool hit, bool bypass)
+{
+ mark_cache_stats(&dc->accounting.collector, hit, bypass);
+ mark_cache_stats(&c->accounting.collector, hit, bypass);
+}
+
+static inline void bch_mark_sectors_bypassed(struct cache_set *c,
+ struct cached_dev *dc,
+ unsigned sectors)
+{
+ atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+ atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
+}
+
+static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
+}
+
+static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
+}
+
+static inline void bch_mark_discard(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.discard_sectors);
+}
+
+#endif /* _BCACHE_STATS_H_ */
diff --git a/libbcache/stats_types.h b/libbcache/stats_types.h
new file mode 100644
index 0000000..28e4c69
--- /dev/null
+++ b/libbcache/stats_types.h
@@ -0,0 +1,56 @@
+#ifndef _BCACHE_STATS_TYPES_H_
+#define _BCACHE_STATS_TYPES_H_
+
+struct cache_stat_collector {
+ union {
+ struct {
+ atomic_t cache_hits;
+ atomic_t cache_misses;
+ atomic_t cache_bypass_hits;
+ atomic_t cache_bypass_misses;
+ };
+
+ /* cache_hit_array[!bypass][!hit]: */
+ atomic_t cache_hit_array[2][2];
+ };
+
+ atomic_t cache_readaheads;
+ atomic_t cache_miss_collisions;
+ atomic_t sectors_bypassed;
+ atomic_t foreground_write_sectors;
+ atomic_t gc_write_sectors;
+ atomic_t discard_sectors;
+};
+
+struct cache_stats {
+ struct kobject kobj;
+
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long cache_bypass_hits;
+ unsigned long cache_bypass_misses;
+ unsigned long cache_readaheads;
+ unsigned long cache_miss_collisions;
+ unsigned long sectors_bypassed;
+ unsigned long foreground_write_sectors;
+ unsigned long gc_write_sectors;
+ unsigned long discard_sectors;
+
+ unsigned rescale;
+};
+
+struct cache_accounting {
+ struct closure cl;
+ struct timer_list timer;
+ atomic_t closing;
+
+ struct cache_stat_collector collector;
+
+ struct cache_stats total;
+ struct cache_stats five_minute;
+ struct cache_stats hour;
+ struct cache_stats day;
+};
+
+#endif /* _BCACHE_STATS_TYPES_H_ */
diff --git a/libbcache/str_hash.h b/libbcache/str_hash.h
new file mode 100644
index 0000000..9a718a8
--- /dev/null
+++ b/libbcache/str_hash.h
@@ -0,0 +1,352 @@
+#ifndef _BCACHE_STR_HASH_H
+#define _BCACHE_STR_HASH_H
+
+#include "btree_iter.h"
+#include "checksum.h"
+#include "siphash.h"
+#include "super.h"
+
+#include <crypto/sha1_base.h>
+#include <linux/crc32c.h>
+
+static const SIPHASH_KEY bch_siphash_key = {
+ .k0 = cpu_to_le64(0x5a9585fd80087730ULL),
+	.k1 = cpu_to_le64(0xc8de666d50b45664ULL),
+};
+
+struct bch_str_hash_ctx {
+ union {
+ u32 crc32c;
+ u64 crc64;
+ SIPHASH_CTX siphash;
+ };
+};
+
+static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx,
+ enum bch_str_hash_type type)
+{
+ switch (type) {
+ case BCH_STR_HASH_CRC32C:
+ ctx->crc32c = ~0;
+ break;
+ case BCH_STR_HASH_CRC64:
+ ctx->crc64 = ~0;
+ break;
+ case BCH_STR_HASH_SIPHASH:
+ SipHash24_Init(&ctx->siphash, &bch_siphash_key);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx,
+ enum bch_str_hash_type type,
+ const void *data, size_t len)
+{
+ switch (type) {
+ case BCH_STR_HASH_CRC32C:
+ ctx->crc32c = crc32c(ctx->crc32c, data, len);
+ break;
+ case BCH_STR_HASH_CRC64:
+ ctx->crc64 = bch_crc64_update(ctx->crc64, data, len);
+ break;
+ case BCH_STR_HASH_SIPHASH:
+ SipHash24_Update(&ctx->siphash, data, len);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx,
+ enum bch_str_hash_type type)
+{
+ switch (type) {
+ case BCH_STR_HASH_CRC32C:
+ return ctx->crc32c;
+ case BCH_STR_HASH_CRC64:
+ return ctx->crc64 >> 1;
+ case BCH_STR_HASH_SIPHASH:
+ return SipHash24_End(&ctx->siphash) >> 1;
+ default:
+ BUG();
+ }
+}
+
+struct bch_hash_info {
+ u64 seed;
+ u8 type;
+};
+
+struct bch_hash_desc {
+ enum btree_id btree_id;
+ u8 key_type;
+ u8 whiteout_type;
+
+ u64 (*hash_key)(const struct bch_hash_info *, const void *);
+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
+ bool (*cmp_key)(struct bkey_s_c, const void *);
+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+};
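+
+/*
+ * A bch_hash_desc describes one hashed btree namespace (dirents, for
+ * instance): which btree it lives in, which key types represent live
+ * entries and whiteouts, and how to hash and compare lookups. Purely
+ * illustrative sketch - the btree id, key types and helpers here are
+ * hypothetical:
+ *
+ *	static const struct bch_hash_desc foo_hash_desc = {
+ *		.btree_id	= BTREE_ID_FOO,
+ *		.key_type	= BCH_FOO,
+ *		.whiteout_type	= BCH_FOO_WHITEOUT,
+ *		.hash_key	= foo_hash_key,
+ *		.hash_bkey	= foo_hash_bkey,
+ *		.cmp_key	= foo_cmp_key,
+ *		.cmp_bkey	= foo_cmp_bkey,
+ *	};
+ *
+ * Lookups then go through bch_hash_lookup(foo_hash_desc, info, c, inode,
+ * &iter, key), which probes linearly from the hashed position, skipping
+ * whiteouts, until it finds a matching key or hits a hole.
+ */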
+
+static inline struct bkey_s_c
+bch_hash_lookup_at(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter, const void *search)
+{
+ u64 inode = iter->pos.inode;
+
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
+
+ if (k.k->type == desc.key_type) {
+ if (!desc.cmp_key(k, search))
+ return k;
+ } else if (k.k->type == desc.whiteout_type) {
+ ;
+ } else {
+ /* hole, not found */
+ break;
+ }
+
+ bch_btree_iter_advance_pos(iter);
+ } while (iter->pos.inode == inode);
+
+ return bkey_s_c_err(-ENOENT);
+}
+
+static inline struct bkey_s_c
+bch_hash_lookup_bkey_at(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter, struct bkey_s_c search)
+{
+ u64 inode = iter->pos.inode;
+
+ do {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
+
+ if (k.k->type == desc.key_type) {
+ if (!desc.cmp_bkey(k, search))
+ return k;
+ } else if (k.k->type == desc.whiteout_type) {
+ ;
+ } else {
+ /* hole, not found */
+ break;
+ }
+
+ bch_btree_iter_advance_pos(iter);
+ } while (iter->pos.inode == inode);
+
+ return bkey_s_c_err(-ENOENT);
+}
+
+static inline struct bkey_s_c
+bch_hash_lookup(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ struct btree_iter *iter, const void *key)
+{
+ bch_btree_iter_init(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+
+ return bch_hash_lookup_at(desc, info, iter, key);
+}
+
+static inline struct bkey_s_c
+bch_hash_lookup_intent(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ struct btree_iter *iter, const void *key)
+{
+ bch_btree_iter_init_intent(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+
+ return bch_hash_lookup_at(desc, info, iter, key);
+}
+
+static inline struct bkey_s_c
+bch_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter)
+{
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+
+ if (btree_iter_err(k))
+ return k;
+
+ if (k.k->type != desc.key_type)
+ return k;
+
+ /* hash collision, keep going */
+ bch_btree_iter_advance_pos(iter);
+ if (iter->pos.inode != k.k->p.inode)
+ return bkey_s_c_err(-ENOENT);
+ }
+}
+
+static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ struct btree_iter *iter,
+ const void *key)
+{
+ bch_btree_iter_init_intent(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+
+ return bch_hash_hole_at(desc, iter);
+}
+
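+/*
+ * When deleting an entry we may only replace it with a hole if no later key
+ * in the same collision chain hashed to a slot at or before the one being
+ * deleted - otherwise a lookup for that key would stop at the hole and miss
+ * it. bch_hash_needs_whiteout() scans forward from just past @start and
+ * returns true if a whiteout must be left behind, false if a plain hole is
+ * safe (or an error from the btree iterator).
+ */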
+static inline int bch_hash_needs_whiteout(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter,
+ struct btree_iter *start)
+{
+ bch_btree_iter_set_pos(iter,
+ btree_type_successor(start->btree_id, start->pos));
+
+ while (1) {
+ struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
+ int ret = btree_iter_err(k);
+
+ if (ret)
+ return ret;
+
+ if (k.k->type != desc.key_type &&
+ k.k->type != desc.whiteout_type)
+ return false;
+
+ if (k.k->type == desc.key_type &&
+ desc.hash_bkey(info, k) <= start->pos.offset)
+ return true;
+
+ bch_btree_iter_advance_pos(iter);
+ }
+}
+
+#define BCH_HASH_SET_MUST_CREATE 1
+#define BCH_HASH_SET_MUST_REPLACE 2
+
+static inline int bch_hash_set(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ u64 *journal_seq,
+ struct bkey_i *insert, int flags)
+{
+ struct btree_iter iter, hashed_slot;
+ struct bkey_s_c k;
+ int ret;
+
+ bch_btree_iter_init_intent(&hashed_slot, c, desc.btree_id,
+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))));
+ bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos);
+ bch_btree_iter_link(&hashed_slot, &iter);
+retry:
+ /*
+ * On hash collision, we have to keep the slot we hashed to locked while
+ * we do the insert - to avoid racing with another thread deleting
+ * whatever's in the slot we hashed to:
+ */
+ ret = bch_btree_iter_traverse(&hashed_slot);
+ if (ret)
+ goto err;
+
+ /*
+ * On -EINTR/retry, we dropped locks - always restart from the slot we
+ * hashed to:
+ */
+ bch_btree_iter_copy(&iter, &hashed_slot);
+
+ k = bch_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert));
+
+ ret = btree_iter_err(k);
+ if (ret == -ENOENT) {
+ if (flags & BCH_HASH_SET_MUST_REPLACE) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /*
+ * Not found, so we're now looking for any open
+ * slot - we might have skipped over a whiteout
+ * that we could have used, so restart from the
+ * slot we hashed to:
+ */
+ bch_btree_iter_copy(&iter, &hashed_slot);
+ k = bch_hash_hole_at(desc, &iter);
+ if ((ret = btree_iter_err(k)))
+ goto err;
+ } else if (!ret) {
+ if (flags & BCH_HASH_SET_MUST_CREATE) {
+ ret = -EEXIST;
+ goto err;
+ }
+ } else {
+ goto err;
+ }
+
+ insert->k.p = iter.pos;
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, insert));
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ /*
+ * On successful insert, we don't want to clobber ret with error from
+ * iter:
+ */
+ bch_btree_iter_unlock(&iter);
+ bch_btree_iter_unlock(&hashed_slot);
+ return ret;
+}
+
+static inline int bch_hash_delete(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct cache_set *c, u64 inode,
+ u64 *journal_seq, const void *key)
+{
+ struct btree_iter iter, whiteout_iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ int ret = -ENOENT;
+
+ bch_btree_iter_init_intent(&iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+ bch_btree_iter_init(&whiteout_iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)));
+ bch_btree_iter_link(&iter, &whiteout_iter);
+retry:
+ k = bch_hash_lookup_at(desc, info, &iter, key);
+ if ((ret = btree_iter_err(k)))
+ goto err;
+
+ ret = bch_hash_needs_whiteout(desc, info, &whiteout_iter, &iter);
+ if (ret < 0)
+ goto err;
+
+ bkey_init(&delete.k);
+ delete.k.p = k.k->p;
+ delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+ ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(&iter, &delete));
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch_btree_iter_unlock(&whiteout_iter);
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+#endif /* _BCACHE_STR_HASH_H */
diff --git a/libbcache/super.c b/libbcache/super.c
new file mode 100644
index 0000000..5f6a85e
--- /dev/null
+++ b/libbcache/super.c
@@ -0,0 +1,2503 @@
+/*
+ * bcache setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "alloc.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_io.h"
+#include "chardev.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-gc.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "migrate.h"
+#include "movinggc.h"
+#include "notify.h"
+#include "stats.h"
+#include "super.h"
+#include "tier.h"
+#include "writeback.h"
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/sysfs.h>
+#include <crypto/hash.h>
+
+#include <trace/events/bcache.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+static const uuid_le invalid_uuid = {
+ .b = {
+ 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
+ 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
+ }
+};
+
+static struct kset *bcache_kset;
+struct mutex bch_register_lock;
+LIST_HEAD(bch_cache_sets);
+
+static int bch_chardev_major;
+static struct class *bch_chardev_class;
+static struct device *bch_chardev;
+static DEFINE_IDR(bch_chardev_minor);
+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
+struct workqueue_struct *bcache_io_wq;
+struct crypto_shash *bch_sha1;
+
+static void bch_cache_stop(struct cache *);
+static int bch_cache_online(struct cache *);
+
+static bool bch_is_open_cache(struct block_device *bdev)
+{
+ struct cache_set *c;
+ struct cache *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ list_for_each_entry(c, &bch_cache_sets, list)
+ for_each_cache_rcu(ca, c, i)
+ if (ca->disk_sb.bdev == bdev) {
+ rcu_read_unlock();
+ return true;
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+static bool bch_is_open(struct block_device *bdev)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+}
+
+static const char *bch_blkdev_open(const char *path, void *holder,
+ struct block_device **ret)
+{
+ struct block_device *bdev;
+ const char *err;
+
+ *ret = NULL;
+ bdev = blkdev_get_by_path(path, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ holder);
+
+ if (bdev == ERR_PTR(-EBUSY)) {
+ bdev = lookup_bdev(path);
+ if (IS_ERR(bdev))
+ return "device busy";
+
+ err = bch_is_open(bdev)
+ ? "device already registered"
+ : "device busy";
+
+ bdput(bdev);
+ return err;
+ }
+
+ if (IS_ERR(bdev))
+ return "failed to open device";
+
+ bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+
+ *ret = bdev;
+ return NULL;
+}
+
+static int bch_congested_fn(void *data, int bdi_bits)
+{
+ struct backing_dev_info *bdi;
+ struct cache_set *c = data;
+ struct cache *ca;
+ unsigned i;
+ int ret = 0;
+
+ rcu_read_lock();
+ if (bdi_bits & (1 << WB_sync_congested)) {
+ /* Reads - check all devices: */
+ for_each_cache_rcu(ca, c, i) {
+ bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ if (bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ } else {
+ /* Writes only go to tier 0: */
+ group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
+ bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ if (bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* Superblock */
+
+static struct cache_member_cpu cache_mi_to_cpu_mi(struct cache_member *mi)
+{
+ return (struct cache_member_cpu) {
+ .nbuckets = le64_to_cpu(mi->nbuckets),
+ .first_bucket = le16_to_cpu(mi->first_bucket),
+ .bucket_size = le16_to_cpu(mi->bucket_size),
+ .state = CACHE_STATE(mi),
+ .tier = CACHE_TIER(mi),
+ .replication_set= CACHE_REPLICATION_SET(mi),
+ .has_metadata = CACHE_HAS_METADATA(mi),
+ .has_data = CACHE_HAS_DATA(mi),
+ .replacement = CACHE_REPLACEMENT(mi),
+ .discard = CACHE_DISCARD(mi),
+ .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
+ };
+}
+
+static const char *validate_cache_super(struct bcache_superblock *disk_sb)
+{
+ struct cache_sb *sb = disk_sb->sb;
+ struct cache_member_cpu mi;
+ u16 block_size;
+ unsigned i;
+
+ switch (le64_to_cpu(sb->version)) {
+ case BCACHE_SB_VERSION_CDEV_V0:
+ case BCACHE_SB_VERSION_CDEV_WITH_UUID:
+ case BCACHE_SB_VERSION_CDEV_V2:
+ case BCACHE_SB_VERSION_CDEV_V3:
+ break;
+ default:
+		return "Unsupported superblock version";
+ }
+
+ if (CACHE_SET_SYNC(sb) &&
+ le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V3)
+ return "Unsupported superblock version";
+
+ block_size = le16_to_cpu(sb->block_size);
+
+ if (!is_power_of_2(block_size) ||
+ block_size > PAGE_SECTORS)
+ return "Bad block size";
+
+ if (bch_is_zero(sb->disk_uuid.b, sizeof(uuid_le)))
+ return "Bad disk UUID";
+
+ if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
+ return "Bad user UUID";
+
+ if (bch_is_zero(sb->set_uuid.b, sizeof(uuid_le)))
+ return "Bad set UUID";
+
+ if (!sb->nr_in_set ||
+ sb->nr_in_set <= sb->nr_this_dev ||
+ sb->nr_in_set > MAX_CACHES_PER_SET)
+ return "Bad cache device number in set";
+
+ if (!CACHE_SET_META_REPLICAS_WANT(sb) ||
+ CACHE_SET_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ return "Invalid number of metadata replicas";
+
+ if (!CACHE_SET_META_REPLICAS_HAVE(sb) ||
+ CACHE_SET_META_REPLICAS_HAVE(sb) >
+ CACHE_SET_META_REPLICAS_WANT(sb))
+ return "Invalid number of metadata replicas";
+
+ if (!CACHE_SET_DATA_REPLICAS_WANT(sb) ||
+ CACHE_SET_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ return "Invalid number of data replicas";
+
+ if (!CACHE_SET_DATA_REPLICAS_HAVE(sb) ||
+ CACHE_SET_DATA_REPLICAS_HAVE(sb) >
+ CACHE_SET_DATA_REPLICAS_WANT(sb))
+ return "Invalid number of data replicas";
+
+ if (CACHE_SB_CSUM_TYPE(sb) >= BCH_CSUM_NR)
+ return "Invalid checksum type";
+
+ if (!CACHE_SET_BTREE_NODE_SIZE(sb))
+ return "Btree node size not set";
+
+ if (!is_power_of_2(CACHE_SET_BTREE_NODE_SIZE(sb)))
+ return "Btree node size not a power of two";
+
+ if (CACHE_SET_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
+ return "Btree node size too large";
+
+ /* Default value, for old filesystems: */
+ if (!CACHE_SET_GC_RESERVE(sb))
+ SET_CACHE_SET_GC_RESERVE(sb, 10);
+
+ if (CACHE_SET_GC_RESERVE(sb) < 5)
+ return "gc reserve percentage too small";
+
+ if (!CACHE_SET_JOURNAL_ENTRY_SIZE(sb))
+ SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, 9);
+
+ /* 4 mb max: */
+ if (512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
+ return "max journal entry size too big";
+
+ if (le16_to_cpu(sb->u64s) < bch_journal_buckets_offset(sb))
+ return "Invalid superblock: member info area missing";
+
+ mi = cache_mi_to_cpu_mi(sb->members + sb->nr_this_dev);
+
+ if (mi.nbuckets > LONG_MAX)
+ return "Too many buckets";
+
+ if (mi.nbuckets < 1 << 8)
+ return "Not enough buckets";
+
+ if (!is_power_of_2(mi.bucket_size) ||
+ mi.bucket_size < PAGE_SECTORS ||
+ mi.bucket_size < block_size)
+ return "Bad bucket size";
+
+ if (get_capacity(disk_sb->bdev->bd_disk) <
+ mi.bucket_size * mi.nbuckets)
+ return "Invalid superblock: device too small";
+
+ if (le64_to_cpu(sb->offset) +
+ (__set_blocks(sb, le16_to_cpu(sb->u64s),
+ block_size << 9) * block_size) >
+ mi.first_bucket * mi.bucket_size)
+ return "Invalid superblock: first bucket comes before end of super";
+
+ for (i = 0; i < bch_nr_journal_buckets(sb); i++)
+ if (journal_bucket(sb, i) < mi.first_bucket ||
+ journal_bucket(sb, i) >= mi.nbuckets)
+ return "bad journal bucket";
+
+ return NULL;
+}
+
+void free_super(struct bcache_superblock *sb)
+{
+ if (sb->bio)
+ bio_put(sb->bio);
+ if (!IS_ERR_OR_NULL(sb->bdev))
+ blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+ free_pages((unsigned long) sb->sb, sb->page_order);
+ memset(sb, 0, sizeof(*sb));
+}
+
+static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
+{
+ struct cache_sb *new_sb;
+ struct bio *bio;
+
+ if (sb->page_order >= order && sb->sb)
+ return 0;
+
+ new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
+ if (!new_sb)
+ return -ENOMEM;
+
+ bio = (dynamic_fault("bcache:add:super_realloc")
+ ? NULL
+ : bio_kmalloc(GFP_KERNEL, 1 << order));
+ if (!bio) {
+ free_pages((unsigned long) new_sb, order);
+ return -ENOMEM;
+ }
+
+ if (sb->sb)
+ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
+
+ free_pages((unsigned long) sb->sb, sb->page_order);
+ sb->sb = new_sb;
+
+ if (sb->bio)
+ bio_put(sb->bio);
+ sb->bio = bio;
+
+ sb->page_order = order;
+
+ return 0;
+}
+
+int bch_super_realloc(struct bcache_superblock *sb, unsigned u64s)
+{
+ struct cache_member *mi = sb->sb->members + sb->sb->nr_this_dev;
+ char buf[BDEVNAME_SIZE];
+ size_t bytes = __set_bytes((struct cache_sb *) NULL, u64s);
+ u64 want = bytes + (SB_SECTOR << 9);
+
+ u64 first_bucket_offset = (u64) le16_to_cpu(mi->first_bucket) *
+ ((u64) le16_to_cpu(mi->bucket_size) << 9);
+
+ if (want > first_bucket_offset) {
+ pr_err("%s: superblock too big: want %llu but have %llu",
+ bdevname(sb->bdev, buf), want, first_bucket_offset);
+ return -ENOSPC;
+ }
+
+ return __bch_super_realloc(sb, get_order(bytes));
+}
+
+static const char *read_super(struct bcache_superblock *sb,
+ const char *path)
+{
+ const char *err;
+ unsigned order = 0;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ memset(sb, 0, sizeof(*sb));
+
+ err = bch_blkdev_open(path, &sb, &sb->bdev);
+ if (err)
+ return err;
+retry:
+ err = "cannot allocate memory";
+ if (__bch_super_realloc(sb, order))
+ goto err;
+
+ err = "dynamic fault";
+ if (cache_set_init_fault("read_super"))
+ goto err;
+
+ bio_reset(sb->bio);
+ sb->bio->bi_bdev = sb->bdev;
+ sb->bio->bi_iter.bi_sector = SB_SECTOR;
+ sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+ bch_bio_map(sb->bio, sb->sb);
+
+ err = "IO error";
+ if (submit_bio_wait(sb->bio))
+ goto err;
+
+ err = "Not a bcache superblock";
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
+ goto err;
+
+ err = "Superblock has incorrect offset";
+ if (le64_to_cpu(sb->sb->offset) != SB_SECTOR)
+ goto err;
+
+ pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ le64_to_cpu(sb->sb->version),
+ le64_to_cpu(sb->sb->flags),
+ le64_to_cpu(sb->sb->seq),
+ le16_to_cpu(sb->sb->u64s));
+
+ err = "Superblock block size smaller than device block size";
+ if (le16_to_cpu(sb->sb->block_size) << 9 <
+ bdev_logical_block_size(sb->bdev))
+ goto err;
+
+ order = get_order(__set_bytes(sb->sb, le16_to_cpu(sb->sb->u64s)));
+ if (order > sb->page_order)
+ goto retry;
+
+ err = "bad checksum reading superblock";
+ if (le64_to_cpu(sb->sb->csum) !=
+ __csum_set(sb->sb, le16_to_cpu(sb->sb->u64s),
+ le64_to_cpu(sb->sb->version) <
+ BCACHE_SB_VERSION_CDEV_V3
+ ? BCH_CSUM_CRC64
+ : CACHE_SB_CSUM_TYPE(sb->sb)))
+ goto err;
+
+ return NULL;
+err:
+ free_super(sb);
+ return err;
+}
+
+void __write_super(struct cache_set *c, struct bcache_superblock *disk_sb)
+{
+ struct cache_sb *sb = disk_sb->sb;
+ struct bio *bio = disk_sb->bio;
+
+ bio->bi_bdev = disk_sb->bdev;
+ bio->bi_iter.bi_sector = SB_SECTOR;
+ bio->bi_iter.bi_size =
+ roundup(__set_bytes(sb, le16_to_cpu(sb->u64s)),
+ bdev_logical_block_size(disk_sb->bdev));
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
+ bch_bio_map(bio, sb);
+
+ pr_debug("ver %llu, flags %llu, seq %llu",
+ le64_to_cpu(sb->version),
+ le64_to_cpu(sb->flags),
+ le64_to_cpu(sb->seq));
+
+ bch_generic_make_request(bio, c);
+}
+
+static void write_super_endio(struct bio *bio)
+{
+ struct cache *ca = bio->bi_private;
+
+ /* XXX: return errors directly */
+
+ cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
+
+ bch_account_io_completion(ca);
+
+ closure_put(&ca->set->sb_write);
+ percpu_ref_put(&ca->ref);
+}
+
+static void bcache_write_super_unlock(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+
+ up(&c->sb_write_mutex);
+}
+
+/* Update cached mi: */
+static int cache_set_mi_update(struct cache_set *c,
+ struct cache_member *mi,
+ unsigned nr_in_set)
+{
+ struct cache_member_rcu *new, *old;
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&c->mi_lock);
+
+ new = kzalloc(sizeof(struct cache_member_rcu) +
+ sizeof(struct cache_member_cpu) * nr_in_set,
+ GFP_KERNEL);
+ if (!new) {
+ mutex_unlock(&c->mi_lock);
+ return -ENOMEM;
+ }
+
+ new->nr_in_set = nr_in_set;
+
+ for (i = 0; i < nr_in_set; i++)
+ new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
+
+ rcu_read_lock();
+ for_each_cache(ca, c, i)
+ ca->mi = new->m[i];
+ rcu_read_unlock();
+
+ old = rcu_dereference_protected(c->members,
+ lockdep_is_held(&c->mi_lock));
+
+ rcu_assign_pointer(c->members, new);
+ if (old)
+ kfree_rcu(old, rcu);
+
+ mutex_unlock(&c->mi_lock);
+ return 0;
+}
+
+/* doesn't copy member info */
+static void __copy_super(struct cache_sb *dst, struct cache_sb *src)
+{
+ dst->version = src->version;
+ dst->seq = src->seq;
+ dst->user_uuid = src->user_uuid;
+ dst->set_uuid = src->set_uuid;
+ memcpy(dst->label, src->label, SB_LABEL_SIZE);
+ dst->flags = src->flags;
+ dst->flags2 = src->flags2;
+ dst->nr_in_set = src->nr_in_set;
+ dst->block_size = src->block_size;
+}
+
+static int cache_sb_to_cache_set(struct cache_set *c, struct cache_sb *src)
+{
+ struct cache_member *new;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ new = kzalloc(sizeof(struct cache_member) * src->nr_in_set,
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(new, src->members,
+ src->nr_in_set * sizeof(struct cache_member));
+
+ if (cache_set_mi_update(c, new, src->nr_in_set)) {
+ kfree(new);
+ return -ENOMEM;
+ }
+
+ kfree(c->disk_mi);
+ c->disk_mi = new;
+
+ __copy_super(&c->disk_sb, src);
+
+ c->sb.block_size = le16_to_cpu(src->block_size);
+ c->sb.btree_node_size = CACHE_SET_BTREE_NODE_SIZE(src);
+ c->sb.nr_in_set = src->nr_in_set;
+ c->sb.clean = CACHE_SET_CLEAN(src);
+ c->sb.meta_replicas_have= CACHE_SET_META_REPLICAS_HAVE(src);
+ c->sb.data_replicas_have= CACHE_SET_DATA_REPLICAS_HAVE(src);
+ c->sb.str_hash_type = CACHE_SET_STR_HASH_TYPE(src);
+
+ return 0;
+}
+
+static int cache_sb_from_cache_set(struct cache_set *c, struct cache *ca)
+{
+ struct cache_sb *src = &c->disk_sb, *dst = ca->disk_sb.sb;
+
+ if (src->nr_in_set != dst->nr_in_set) {
+ /*
+ * We have to preserve the list of journal buckets on the
+ * cache's superblock:
+ */
+ unsigned old_offset = bch_journal_buckets_offset(dst);
+ unsigned u64s = bch_journal_buckets_offset(src)
+ + bch_nr_journal_buckets(dst);
+ int ret = bch_super_realloc(&ca->disk_sb, u64s);
+
+ if (ret)
+ return ret;
+
+ dst->nr_in_set = src->nr_in_set;
+ dst->u64s = cpu_to_le16(u64s);
+
+ memmove(dst->_data + bch_journal_buckets_offset(dst),
+ dst->_data + old_offset,
+ bch_nr_journal_buckets(dst) * sizeof(u64));
+ }
+
+ memcpy(dst->_data,
+ c->disk_mi,
+ src->nr_in_set * sizeof(struct cache_member));
+
+ __copy_super(dst, src);
+
+ return 0;
+}
+
+static void __bcache_write_super(struct cache_set *c)
+{
+ struct closure *cl = &c->sb_write;
+ struct cache *ca;
+ unsigned i;
+
+ cache_set_mi_update(c, c->disk_mi, c->sb.nr_in_set);
+
+ closure_init(cl, &c->cl);
+
+ le64_add_cpu(&c->disk_sb.seq, 1);
+
+ for_each_cache(ca, c, i) {
+ struct cache_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ cache_sb_from_cache_set(c, ca);
+
+ SET_CACHE_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+ sb->csum = cpu_to_le64(__csum_set(sb,
+ le16_to_cpu(sb->u64s),
+ CACHE_SB_CSUM_TYPE(sb)));
+
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+
+ closure_get(cl);
+ percpu_ref_get(&ca->ref);
+ __write_super(c, &ca->disk_sb);
+ }
+
+ closure_return_with_destructor(cl, bcache_write_super_unlock);
+}
+
+void bcache_write_super(struct cache_set *c)
+{
+ down(&c->sb_write_mutex);
+ __bcache_write_super(c);
+}
+
+void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
+ bool meta)
+{
+ struct cache_member *mi;
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+
+ if (!CACHE_SET_SYNC(&c->disk_sb))
+ return;
+
+ down(&c->sb_write_mutex);
+
+ /* recheck, might have raced */
+ if (bch_check_super_marked(c, k, meta)) {
+ up(&c->sb_write_mutex);
+ return;
+ }
+
+ mi = c->disk_mi;
+
+ extent_for_each_ptr(e, ptr)
+ if (bch_extent_ptr_is_dirty(c, e, ptr))
+ (meta
+ ? SET_CACHE_HAS_METADATA
+ : SET_CACHE_HAS_DATA)(mi + ptr->dev, true);
+
+ __bcache_write_super(c);
+}
+
+/* Cache set RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and tiering (to free up space)
+ *
+ * - copygc and tiering depend on mark and sweep gc (they actually probably
+ * don't because they either reserve ahead of time or don't block if
+ * allocations fail, but allocations can require mark and sweep gc to run
+ * because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
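+
+/*
+ * __bch_cache_set_read_only() below therefore tears things down in roughly
+ * the reverse order: stop tiering and moving GC, then btree GC, flush the
+ * btree, stop the allocator threads, and finally flush and quiesce the
+ * journal.
+ */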
+
+static void __bch_cache_set_read_only(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ c->tiering_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&c->tiering_pd.rate);
+ bch_tiering_read_stop(c);
+
+ for_each_cache(ca, c, i)
+ bch_moving_gc_stop(ca);
+
+ bch_gc_thread_stop(c);
+
+ bch_btree_flush(c);
+
+ for_each_cache(ca, c, i)
+ bch_cache_allocator_stop(ca);
+
+ /*
+ * Write a journal entry after flushing the btree, so we don't end up
+ * replaying everything we just flushed:
+ */
+ if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ int ret;
+
+ bch_journal_flush_async(&c->journal, NULL);
+ ret = bch_journal_meta(&c->journal);
+ BUG_ON(ret && !bch_journal_error(&c->journal));
+ }
+
+ cancel_delayed_work_sync(&c->journal.write_work);
+ cancel_delayed_work_sync(&c->journal.reclaim_work);
+}
+
+static void bch_writes_disabled(struct percpu_ref *writes)
+{
+ struct cache_set *c = container_of(writes, struct cache_set, writes);
+
+ set_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch_read_only_wait);
+}
+
+static void bch_cache_set_read_only_work(struct work_struct *work)
+{
+ struct cache_set *c =
+ container_of(work, struct cache_set, read_only_work);
+
+ percpu_ref_put(&c->writes);
+
+ del_timer(&c->foreground_write_wakeup);
+ cancel_delayed_work(&c->pd_controllers_update);
+
+ c->foreground_write_pd.rate.rate = UINT_MAX;
+ bch_wake_delayed_writes((unsigned long) c);
+
+ if (!test_bit(CACHE_SET_EMERGENCY_RO, &c->flags)) {
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious
+ * errors due to shutting down the allocator:
+ */
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ __bch_cache_set_read_only(c);
+
+ if (!bch_journal_error(&c->journal) &&
+ !test_bit(CACHE_SET_ERROR, &c->flags)) {
+ SET_CACHE_SET_CLEAN(&c->disk_sb, true);
+ bcache_write_super(c);
+ }
+ } else {
+ /*
+ * If we are doing an emergency shutdown, outstanding writes may
+ * hang until we shut down the allocator, so we don't want to wait
+ * on outstanding writes before shutting everything down - but
+ * we do need to wait on them before returning and signalling
+ * that going RO is complete:
+ */
+ __bch_cache_set_read_only(c);
+
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+ }
+
+ bch_notify_cache_set_read_only(c);
+ trace_bcache_cache_set_read_only_done(c);
+
+ set_bit(CACHE_SET_RO_COMPLETE, &c->flags);
+ wake_up(&bch_read_only_wait);
+}
+
+bool bch_cache_set_read_only(struct cache_set *c)
+{
+ if (test_and_set_bit(CACHE_SET_RO, &c->flags))
+ return false;
+
+ trace_bcache_cache_set_read_only(c);
+
+ percpu_ref_get(&c->writes);
+
+ /*
+ * Block new foreground-end write operations from starting - any new
+ * writes will return -EROFS:
+ *
+ * (This really blocks new _allocations_; writes to previously
+ * allocated space can still happen until the allocator is stopped in
+ * bch_cache_allocator_stop()).
+ */
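+ /*
+ * (A sketch, not verbatim from the write path: a foreground write is
+ * assumed to do roughly
+ *
+ *	if (!percpu_ref_tryget(&c->writes))
+ *		return -EROFS;
+ *
+ * so once the ref is killed below, new writes fail fast.)
+ */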
+ percpu_ref_kill(&c->writes);
+
+ queue_work(system_freezable_wq, &c->read_only_work);
+ return true;
+}
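+
+/*
+ * The RO handshake, as implemented above: bch_cache_set_read_only() kills
+ * c->writes and queues read_only_work; once the last writer drops its ref,
+ * bch_writes_disabled() sets CACHE_SET_WRITE_DISABLE_COMPLETE; the worker
+ * then quiesces the cache set and sets CACHE_SET_RO_COMPLETE.
+ * bch_cache_set_read_only_sync() waits for both bits.
+ */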
+
+bool bch_cache_set_emergency_read_only(struct cache_set *c)
+{
+ bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
+
+ bch_cache_set_read_only(c);
+ bch_journal_halt(&c->journal);
+
+ wake_up(&bch_read_only_wait);
+ return ret;
+}
+
+void bch_cache_set_read_only_sync(struct cache_set *c)
+{
+ /* so we don't race with bch_cache_set_read_write() */
+ lockdep_assert_held(&bch_register_lock);
+
+ bch_cache_set_read_only(c);
+
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_RO_COMPLETE, &c->flags) &&
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+}
+
+static const char *__bch_cache_set_read_write(struct cache_set *c)
+{
+ struct cache *ca;
+ const char *err;
+ unsigned i;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ bch_cache_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ err = "error starting btree GC thread";
+ if (bch_gc_thread_start(c))
+ goto err;
+
+ for_each_cache(ca, c, i) {
+ if (ca->mi.state != CACHE_ACTIVE)
+ continue;
+
+ err = "error starting moving GC thread";
+ if (bch_moving_gc_thread_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+ }
+
+ err = "error starting tiering thread";
+ if (bch_tiering_read_start(c))
+ goto err;
+
+ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
+
+ return NULL;
+err:
+ __bch_cache_set_read_only(c);
+ return err;
+}
+
+const char *bch_cache_set_read_write(struct cache_set *c)
+{
+ const char *err;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags))
+ return NULL;
+
+ err = __bch_cache_set_read_write(c);
+ if (err)
+ return err;
+
+ percpu_ref_reinit(&c->writes);
+
+ clear_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+ clear_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
+ clear_bit(CACHE_SET_RO_COMPLETE, &c->flags);
+ clear_bit(CACHE_SET_RO, &c->flags);
+ return NULL;
+}
+
+/* Cache set startup/shutdown: */
+
+static void cache_set_free(struct cache_set *c)
+{
+ del_timer_sync(&c->foreground_write_wakeup);
+ cancel_delayed_work_sync(&c->pd_controllers_update);
+ cancel_work_sync(&c->read_only_work);
+ cancel_work_sync(&c->bio_submit_work);
+ cancel_work_sync(&c->read_retry_work);
+
+ bch_btree_cache_free(c);
+ bch_journal_free(&c->journal);
+ bch_io_clock_exit(&c->io_clock[WRITE]);
+ bch_io_clock_exit(&c->io_clock[READ]);
+ bch_compress_free(c);
+ bdi_destroy(&c->bdi);
+ lg_lock_free(&c->bucket_stats_lock);
+ free_percpu(c->bucket_stats_percpu);
+ mempool_exit(&c->btree_bounce_pool);
+ mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->bio_write);
+ bioset_exit(&c->bio_read_split);
+ bioset_exit(&c->bio_read);
+ bioset_exit(&c->btree_read_bio);
+ mempool_exit(&c->btree_interior_update_pool);
+ mempool_exit(&c->btree_reserve_pool);
+ mempool_exit(&c->fill_iter);
+ mempool_exit(&c->search);
+ percpu_ref_exit(&c->writes);
+
+ if (c->copygc_wq)
+ destroy_workqueue(c->copygc_wq);
+ if (c->wq)
+ destroy_workqueue(c->wq);
+
+ kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
+ kfree(c->disk_mi);
+ kfree(c);
+ module_put(THIS_MODULE);
+}
+
+/*
+ * should be __cache_set_stop4 - block devices are closed, now we can finally
+ * free it
+ */
+void bch_cache_set_release(struct kobject *kobj)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+ struct completion *stop_completion = c->stop_completion;
+
+ bch_notify_cache_set_stopped(c);
+ bch_info(c, "stopped");
+
+ cache_set_free(c);
+
+ if (stop_completion)
+ complete(stop_completion);
+}
+
+/*
+ * All activity on the cache_set should have stopped now - close devices:
+ */
+static void __cache_set_stop3(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, cl);
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&bch_register_lock);
+ for_each_cache(ca, c, i)
+ bch_cache_stop(ca);
+ mutex_unlock(&bch_register_lock);
+
+ mutex_lock(&bch_register_lock);
+ list_del(&c->list);
+ if (c->minor >= 0)
+ idr_remove(&bch_chardev_minor, c->minor);
+ mutex_unlock(&bch_register_lock);
+
+ closure_debug_destroy(&c->cl);
+ kobject_put(&c->kobj);
+}
+
+/*
+ * Openers (i.e. block devices) should have exited, shutdown all userspace
+ * interfaces and wait for &c->cl to hit 0
+ */
+static void __cache_set_stop2(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+
+ bch_debug_exit_cache_set(c);
+
+ if (!IS_ERR_OR_NULL(c->chardev))
+ device_unregister(c->chardev);
+
+ if (c->kobj.state_in_sysfs)
+ kobject_del(&c->kobj);
+
+ bch_cache_accounting_destroy(&c->accounting);
+
+ kobject_put(&c->time_stats);
+ kobject_put(&c->opts_dir);
+ kobject_put(&c->internal);
+
+ mutex_lock(&bch_register_lock);
+ bch_cache_set_read_only_sync(c);
+ mutex_unlock(&bch_register_lock);
+
+ closure_return(cl);
+}
+
+/*
+ * First phase of the shutdown process that's kicked off by cache_set_stop(); we
+ * haven't waited for anything to stop yet, we're just punting to process
+ * context to shut down block devices:
+ */
+static void __cache_set_stop1(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+
+ bch_blockdevs_stop(c);
+
+ continue_at(cl, __cache_set_stop2, system_wq);
+}
+
+void bch_cache_set_stop(struct cache_set *c)
+{
+ if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+ closure_queue(&c->caching);
+}
+
+void bch_cache_set_unregister(struct cache_set *c)
+{
+ if (!test_and_set_bit(CACHE_SET_UNREGISTERING, &c->flags))
+ bch_cache_set_stop(c);
+}
+
+static unsigned cache_set_nr_devices(struct cache_set *c)
+{
+ unsigned i, nr = 0;
+ struct cache_member *mi = c->disk_mi;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ for (i = 0; i < c->disk_sb.nr_in_set; i++)
+ if (!bch_is_zero(mi[i].uuid.b, sizeof(uuid_le)))
+ nr++;
+
+ return nr;
+}
+
+static unsigned cache_set_nr_online_devices(struct cache_set *c)
+{
+ unsigned i, nr = 0;
+
+ for (i = 0; i < c->sb.nr_in_set; i++)
+ if (c->cache[i])
+ nr++;
+
+ return nr;
+}
+
+#define alloc_bucket_pages(gfp, ca) \
+ ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
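+/*
+ * (Presumably one bucket's worth of zeroed pages - bucket_pages(ca) of them -
+ * used below by cache_alloc() for ca->disk_buckets.)
+ */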
+
+static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
+ struct cache_set_opts opts)
+{
+ struct cache_set *c;
+ unsigned iter_size, journal_entry_bytes;
+
+ c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ __module_get(THIS_MODULE);
+
+ c->minor = -1;
+
+ sema_init(&c->sb_write_mutex, 1);
+ INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
+ mutex_init(&c->btree_cache_lock);
+ mutex_init(&c->bucket_lock);
+ mutex_init(&c->btree_root_lock);
+ INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
+ mutex_init(&c->mi_lock);
+
+ init_rwsem(&c->gc_lock);
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ spin_lock_init(&c->name##_time.lock);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ bch_open_buckets_init(c);
+ bch_tiering_init_cache_set(c);
+
+ INIT_LIST_HEAD(&c->list);
+ INIT_LIST_HEAD(&c->cached_devs);
+ INIT_LIST_HEAD(&c->btree_cache);
+ INIT_LIST_HEAD(&c->btree_cache_freeable);
+ INIT_LIST_HEAD(&c->btree_cache_freed);
+
+ INIT_LIST_HEAD(&c->btree_interior_update_list);
+ mutex_init(&c->btree_reserve_cache_lock);
+ mutex_init(&c->btree_interior_update_lock);
+
+ mutex_init(&c->bio_bounce_pages_lock);
+ INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
+ spin_lock_init(&c->bio_submit_lock);
+ bio_list_init(&c->read_retry_list);
+ spin_lock_init(&c->read_retry_lock);
+ INIT_WORK(&c->read_retry_work, bch_read_retry_work);
+ mutex_init(&c->zlib_workspace_lock);
+
+ seqcount_init(&c->gc_pos_lock);
+
+ c->prio_clock[READ].hand = 1;
+ c->prio_clock[READ].min_prio = 0;
+ c->prio_clock[WRITE].hand = 1;
+ c->prio_clock[WRITE].min_prio = 0;
+
+ c->congested_read_threshold_us = 2000;
+ c->congested_write_threshold_us = 20000;
+ c->error_limit = 16 << IO_ERROR_SHIFT;
+ init_waitqueue_head(&c->writeback_wait);
+
+ c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
+
+ c->copy_gc_enabled = 1;
+ c->tiering_enabled = 1;
+ c->tiering_percent = 10;
+
+ c->foreground_target_percent = 20;
+
+ c->journal.write_time = &c->journal_write_time;
+ c->journal.delay_time = &c->journal_delay_time;
+ c->journal.blocked_time = &c->journal_blocked_time;
+ c->journal.flush_seq_time = &c->journal_flush_seq_time;
+
+ mutex_init(&c->uevent_lock);
+
+ if (cache_sb_to_cache_set(c, sb))
+ goto err;
+
+ scnprintf(c->name, sizeof(c->name), "%pU", &c->disk_sb.user_uuid);
+
+ c->opts = cache_superblock_opts(sb);
+ cache_set_opts_apply(&c->opts, opts);
+
+ c->block_bits = ilog2(c->sb.block_size);
+
+ if (cache_set_init_fault("cache_set_alloc"))
+ goto err;
+
+ iter_size = (btree_blocks(c) + 1) * 2 *
+ sizeof(struct btree_node_iter_set);
+
+ journal_entry_bytes = 512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb);
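+ /*
+ * (E.g. a journal entry size field of 11 - 2048 sectors - gives
+ * 512 << 11 = 1MiB buffers.)
+ */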
+
+ if (!(c->wq = alloc_workqueue("bcache",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+ !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+ percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
+ mempool_init_slab_pool(&c->search, 1, bch_search_cache) ||
+ mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
+ sizeof(struct btree_reserve)) ||
+ mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+ sizeof(struct btree_interior_update)) ||
+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+ bioset_init(&c->btree_read_bio, 1, 0) ||
+ bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+ mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->sb.btree_node_size,
+ CRC32_EXTENT_SIZE_MAX) /
+ PAGE_SECTORS, 0) ||
+ !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
+ lg_lock_init(&c->bucket_stats_lock) ||
+ mempool_init_page_pool(&c->btree_bounce_pool, 1,
+ ilog2(btree_pages(c))) ||
+ bdi_setup_and_register(&c->bdi, "bcache") ||
+ bch_io_clock_init(&c->io_clock[READ]) ||
+ bch_io_clock_init(&c->io_clock[WRITE]) ||
+ bch_journal_alloc(&c->journal, journal_entry_bytes) ||
+ bch_btree_cache_alloc(c) ||
+ bch_compress_init(c))
+ goto err;
+
+ c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ c->bdi.congested_fn = bch_congested_fn;
+ c->bdi.congested_data = c;
+
+ /*
+ * Now that all allocations have succeeded, init various refcounty
+ * things that let us shutdown:
+ */
+ closure_init(&c->cl, NULL);
+
+ c->kobj.kset = bcache_kset;
+ kobject_init(&c->kobj, &bch_cache_set_ktype);
+ kobject_init(&c->internal, &bch_cache_set_internal_ktype);
+ kobject_init(&c->opts_dir, &bch_cache_set_opts_dir_ktype);
+ kobject_init(&c->time_stats, &bch_cache_set_time_stats_ktype);
+
+ bch_cache_accounting_init(&c->accounting, &c->cl);
+
+ closure_init(&c->caching, &c->cl);
+ set_closure_fn(&c->caching, __cache_set_stop1, system_wq);
+
+ continue_at_noreturn(&c->cl, __cache_set_stop3, system_wq);
+ return c;
+err:
+ cache_set_free(c);
+ return NULL;
+}
+
+static int bch_cache_set_online(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (c->kobj.state_in_sysfs)
+ return 0;
+
+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
+ if (c->minor < 0)
+ return c->minor;
+
+ c->chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, c->minor), NULL,
+ "bcache%u-ctl", c->minor);
+ if (IS_ERR(c->chardev))
+ return PTR_ERR(c->chardev);
+
+ if (kobject_add(&c->kobj, NULL, "%pU", c->disk_sb.user_uuid.b) ||
+ kobject_add(&c->internal, &c->kobj, "internal") ||
+ kobject_add(&c->opts_dir, &c->kobj, "options") ||
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
+ bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
+ return -1;
+
+ for_each_cache(ca, c, i)
+ if (bch_cache_online(ca)) {
+ percpu_ref_put(&ca->ref);
+ return -1;
+ }
+
+ list_add(&c->list, &bch_cache_sets);
+ return 0;
+}
+
+static const char *run_cache_set(struct cache_set *c)
+{
+ const char *err = "cannot allocate memory";
+ struct cache *ca;
+ unsigned i, id;
+ time64_t now;
+ LIST_HEAD(journal);
+ struct jset *j;
+ int ret = -EINVAL;
+
+ lockdep_assert_held(&bch_register_lock);
+ BUG_ON(test_bit(CACHE_SET_RUNNING, &c->flags));
+
+ /* We don't want bch_fatal_error() to free underneath us */
+ closure_get(&c->caching);
+
+ /*
+ * Make sure that each cache object's mi is up to date before
+ * we start testing it.
+ */
+ for_each_cache(ca, c, i)
+ cache_sb_from_cache_set(c, ca);
+
+ /*
+ * CACHE_SET_SYNC is true if the cache set has already been run
+ * and potentially has data; it is false the first time the cache
+ * set is run.
+ */
+
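+ /*
+ * Recovery, in order: read the journal, read per-device prios, read
+ * the btree roots recorded in the newest journal entry, run initial
+ * (mark and sweep) gc, start the journal and allocator threads, replay
+ * the journal, then run fs gc and (unless disabled) fsck:
+ */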
+ if (CACHE_SET_SYNC(&c->disk_sb)) {
+ ret = bch_journal_read(c, &journal);
+ if (ret)
+ goto err;
+
+ pr_debug("btree_journal_read() done");
+
+ j = &list_entry(journal.prev, struct journal_replay, list)->j;
+
+ err = "error reading priorities";
+ for_each_cache(ca, c, i) {
+ ret = bch_prio_read(ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+ }
+
+ c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
+ c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+
+ for_each_cache(ca, c, i) {
+ bch_recalc_min_prio(ca, READ);
+ bch_recalc_min_prio(ca, WRITE);
+ }
+
+ /*
+ * If bch_prio_read() fails it'll call cache_set_error and we'll
+ * tear everything down right away, but if we checked for the error
+ * sooner we could avoid journal replay.
+ */
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ unsigned level;
+ struct bkey_i *k;
+
+ err = "bad btree root";
+ k = bch_journal_find_btree_root(c, j, id, &level);
+ if (!k && id == BTREE_ID_EXTENTS)
+ goto err;
+ if (!k) {
+ pr_debug("missing btree root: %d", id);
+ continue;
+ }
+
+ err = "error reading btree root";
+ if (bch_btree_root_read(c, id, k, level))
+ goto err;
+ }
+
+ bch_verbose(c, "starting mark and sweep:");
+
+ err = "error in recovery";
+ if (bch_initial_gc(c, &journal))
+ goto err;
+
+ bch_verbose(c, "mark and sweep done");
+
+ /*
+ * bch_journal_start() can't happen sooner, or btree_gc_finish()
+ * will give spurious errors about oldest_gen > bucket_gen -
+ * this is a hack but oh well.
+ */
+ bch_journal_start(c);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ bch_cache_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ bch_verbose(c, "starting journal replay:");
+
+ err = "journal replay failed";
+ ret = bch_journal_replay(c, &journal);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "journal replay done");
+
+ /*
+ * Write a new journal entry _before_ we start journalling new
+ * data - otherwise, we could end up with btree node bsets with
+ * journal seqs arbitrarily far in the future vs. the most
+ * recently written journal entry on disk, if we crash before
+ * writing the next journal entry:
+ */
+ err = "error writing journal entry";
+ if (bch_journal_meta(&c->journal))
+ goto err;
+
+ bch_verbose(c, "starting fs gc:");
+ err = "error in fs gc";
+ ret = bch_gc_inode_nlinks(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "fs gc done");
+
+ if (!c->opts.nofsck) {
+ bch_verbose(c, "starting fsck:");
+ err = "error in fsck";
+ ret = bch_fsck(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "fsck done");
+ }
+ } else {
+ struct bkey_i_inode inode;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ bch_notice(c, "initializing new filesystem");
+
+ err = "unable to allocate journal buckets";
+ for_each_cache(ca, c, i)
+ if (bch_cache_journal_alloc(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ bch_initial_gc(c, NULL);
+
+ /*
+ * journal_res_get() will crash if called before bch_journal_start()
+ * has set up the journal.pin FIFO and journal.cur pointer:
+ */
+ bch_journal_start(c);
+ bch_journal_set_replay_done(&c->journal);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ bch_cache_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
+ err = "cannot allocate new btree root";
+ for (id = 0; id < BTREE_ID_NR; id++)
+ if (bch_btree_root_alloc(c, id, &cl)) {
+ closure_sync(&cl);
+ goto err;
+ }
+
+ /* Wait for new btree roots to be written: */
+ closure_sync(&cl);
+
+ bkey_inode_init(&inode.k_i);
+ inode.k.p.inode = BCACHE_ROOT_INO;
+ inode.v.i_mode = cpu_to_le16(S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO);
+ inode.v.i_nlink = cpu_to_le32(2);
+ get_random_bytes(&inode.v.i_hash_seed, sizeof(inode.v.i_hash_seed));
+ SET_INODE_STR_HASH_TYPE(&inode.v, c->sb.str_hash_type);
+
+ err = "error creating root directory";
+ if (bch_btree_insert(c, BTREE_ID_INODES, &inode.k_i,
+ NULL, NULL, NULL, 0))
+ goto err;
+
+ err = "error writing first journal entry";
+ if (bch_journal_meta(&c->journal))
+ goto err;
+ }
+
+ if (c->opts.read_only) {
+ bch_cache_set_read_only_sync(c);
+ } else {
+ err = __bch_cache_set_read_write(c);
+ if (err)
+ goto err;
+ }
+
+ now = ktime_get_seconds();
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i)
+ c->disk_mi[ca->sb.nr_this_dev].last_mount = cpu_to_le64(now);
+ rcu_read_unlock();
+
+ /* Mark cache set as initialized: */
+ SET_CACHE_SET_SYNC(&c->disk_sb, true);
+ SET_CACHE_SET_CLEAN(&c->disk_sb, false);
+ bcache_write_super(c);
+
+ err = "dynamic fault";
+ if (cache_set_init_fault("run_cache_set"))
+ goto err;
+
+ err = "error creating kobject";
+ if (bch_cache_set_online(c))
+ goto err;
+
+ err = "can't bring up blockdev volumes";
+ if (bch_blockdev_volumes_start(c))
+ goto err;
+
+ bch_debug_init_cache_set(c);
+ set_bit(CACHE_SET_RUNNING, &c->flags);
+ bch_attach_backing_devs(c);
+
+ closure_put(&c->caching);
+
+ bch_notify_cache_set_read_write(c);
+
+ BUG_ON(!list_empty(&journal));
+ return NULL;
+err:
+ switch (ret) {
+ case BCH_FSCK_ERRORS_NOT_FIXED:
+ bch_err(c, "filesystem contains errors: please report this to the developers");
+ pr_cont("mount with -o fix_errors to repair");
+ err = "fsck error";
+ break;
+ case BCH_FSCK_REPAIR_UNIMPLEMENTED:
+ bch_err(c, "filesystem contains errors: please report this to the developers");
+ pr_cont("repair unimplemented: inform the developers so that it can be added");
+ err = "fsck error";
+ break;
+ case BCH_FSCK_REPAIR_IMPOSSIBLE:
+ bch_err(c, "filesystem contains errors, but repair impossible");
+ err = "fsck error";
+ break;
+ case BCH_FSCK_UNKNOWN_VERSION:
+ err = "unknown metadata version";;
+ break;
+ case -ENOMEM:
+ err = "cannot allocate memory";
+ break;
+ case -EIO:
+ err = "IO error";
+ break;
+ }
+
+ BUG_ON(!err);
+
+ bch_journal_entries_free(&journal);
+ set_bit(CACHE_SET_ERROR, &c->flags);
+ bch_cache_set_unregister(c);
+ closure_put(&c->caching);
+ return err;
+}
+
+static const char *can_add_cache(struct cache_sb *sb,
+ struct cache_set *c)
+{
+ if (le16_to_cpu(sb->block_size) != c->sb.block_size)
+ return "mismatched block size";
+
+ if (le16_to_cpu(sb->members[sb->nr_this_dev].bucket_size) <
+ CACHE_SET_BTREE_NODE_SIZE(&c->disk_sb))
+ return "new cache bucket_size is too small";
+
+ return NULL;
+}
+
+static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
+{
+ const char *err;
+ bool match;
+
+ err = can_add_cache(sb, c);
+ if (err)
+ return err;
+
+ /*
+ * When attaching an existing device, the cache set superblock must
+ * already contain member_info with a matching UUID
+ */
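+ /*
+ * Whichever superblock has the newer seq wins: if the cache set's copy
+ * is at least as new (sb->seq <= c->disk_sb.seq), consult c->disk_mi,
+ * otherwise consult the member list in the device's own superblock.
+ */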
+ match = le64_to_cpu(sb->seq) <= le64_to_cpu(c->disk_sb.seq)
+ ? (sb->nr_this_dev < c->disk_sb.nr_in_set &&
+ !memcmp(&c->disk_mi[sb->nr_this_dev].uuid,
+ &sb->disk_uuid, sizeof(uuid_le)))
+ : (sb->nr_this_dev < sb->nr_in_set &&
+ !memcmp(&sb->members[sb->nr_this_dev].uuid,
+ &sb->disk_uuid, sizeof(uuid_le)));
+
+ if (!match)
+ return "cache sb does not match set";
+
+ return NULL;
+}
+
+/* Cache device */
+
+bool bch_cache_read_only(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ char buf[BDEVNAME_SIZE];
+
+ bdevname(ca->disk_sb.bdev, buf);
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (ca->mi.state != CACHE_ACTIVE)
+ return false;
+
+ if (!bch_cache_may_remove(ca)) {
+ bch_err(c, "required member %s going RO, forcing fs RO", buf);
+ bch_cache_set_read_only_sync(c);
+ }
+
+ trace_bcache_cache_read_only(ca);
+
+ bch_moving_gc_stop(ca);
+
+ /*
+ * This stops new data writes (e.g. to existing open data
+ * buckets) and then waits for all existing writes to
+ * complete.
+ */
+ bch_cache_allocator_stop(ca);
+
+ bch_cache_group_remove_cache(&c->journal.devs, ca);
+
+ /*
+ * Device data write barrier -- no non-meta-data writes should
+ * occur after this point. However, writes to btree buckets,
+ * journal buckets, and the superblock can still occur.
+ */
+ trace_bcache_cache_read_only_done(ca);
+
+ bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
+ bch_notify_cache_read_only(ca);
+
+ SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
+ bcache_write_super(c);
+ return true;
+}
+
+static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (ca->mi.state == CACHE_ACTIVE)
+ return NULL;
+
+ if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
+ return "removing";
+
+ trace_bcache_cache_read_write(ca);
+
+ if (bch_cache_allocator_start(ca))
+ return "error starting allocator thread";
+
+ if (bch_moving_gc_thread_start(ca))
+ return "error starting moving GC thread";
+
+ bch_cache_group_add_cache(&c->journal.devs, ca);
+
+ wake_up_process(c->tiering_read);
+
+ bch_notify_cache_read_write(ca);
+ trace_bcache_cache_read_write_done(ca);
+
+ return NULL;
+}
+
+const char *bch_cache_read_write(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ const char *err;
+
+ err = __bch_cache_read_write(c, ca);
+ if (err)
+ return err;
+
+ SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_ACTIVE);
+ bcache_write_super(c);
+
+ return NULL;
+}
+
+/*
+ * bch_cache_stop has already returned, so we no longer hold the register
+ * lock at the point this is called.
+ */
+
+void bch_cache_release(struct kobject *kobj)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+
+ percpu_ref_exit(&ca->ref);
+ kfree(ca);
+}
+
+static void bch_cache_free_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, free_work);
+ struct cache_set *c = ca->set;
+ unsigned i;
+
+ cancel_work_sync(&ca->io_error_work);
+
+ if (c && c->kobj.state_in_sysfs) {
+ char buf[12];
+
+ sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+ sysfs_remove_link(&c->kobj, buf);
+ }
+
+ if (ca->kobj.state_in_sysfs)
+ kobject_del(&ca->kobj);
+
+ free_super(&ca->disk_sb);
+
+ /*
+ * bch_cache_stop can be called in the middle of initialization
+ * of the struct cache object.
+ * As such, not all the sub-structures may be initialized.
+ * However, they were zeroed when the object was allocated.
+ */
+
+ free_percpu(ca->sectors_written);
+ bioset_exit(&ca->replica_set);
+ free_percpu(ca->bucket_stats_percpu);
+ kfree(ca->journal.bucket_seq);
+ free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+ kfree(ca->prio_buckets);
+ kfree(ca->bio_prio);
+ kfree(ca->journal.bio);
+ vfree(ca->buckets);
+ vfree(ca->oldest_gens);
+ free_heap(&ca->heap);
+ free_fifo(&ca->free_inc);
+
+ for (i = 0; i < RESERVE_NR; i++)
+ free_fifo(&ca->free[i]);
+
+ kobject_put(&ca->kobj);
+
+ if (c)
+ kobject_put(&c->kobj);
+}
+
+static void bch_cache_percpu_ref_release(struct percpu_ref *ref)
+{
+ struct cache *ca = container_of(ref, struct cache, ref);
+
+ schedule_work(&ca->free_work);
+}
+
+static void bch_cache_free_rcu(struct rcu_head *rcu)
+{
+ struct cache *ca = container_of(rcu, struct cache, free_rcu);
+
+ /*
+ * This decrements the ref count on ca, and once the ref count
+ * is 0 (outstanding bios to the ca also hold a reference and
+ * drop it on completion/error), bch_cache_percpu_ref_release
+ * is called, and that eventually results in bch_cache_free_work
+ * being called, which in turn results in bch_cache_release being
+ * called.
+ *
+ * In particular, these functions won't be called until there are no
+ * bios outstanding (the per-cpu ref counts are all 0), so it
+ * is safe to remove the actual sysfs device at that point,
+ * and that can indicate success to the user.
+ */
+
+ percpu_ref_kill(&ca->ref);
+}
+
+static void bch_cache_stop(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (c) {
+ BUG_ON(rcu_access_pointer(c->cache[ca->sb.nr_this_dev]) != ca);
+ rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], NULL);
+ }
+
+ call_rcu(&ca->free_rcu, bch_cache_free_rcu);
+}
+
+static void bch_cache_remove_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, remove_work);
+ struct cache_set *c = ca->set;
+ char name[BDEVNAME_SIZE];
+ bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
+ unsigned dev = ca->sb.nr_this_dev;
+
+ bdevname(ca->disk_sb.bdev, name);
+
+ /*
+ * Device should already be RO, now migrate data off:
+ *
+ * XXX: locking is sketchy, bch_cache_read_write() has to check
+ * CACHE_DEV_REMOVING bit
+ */
+ if (!ca->mi.has_data) {
+ /* Nothing to do: */
+ } else if (!bch_move_data_off_device(ca)) {
+ lockdep_assert_held(&bch_register_lock);
+ SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+
+ bcache_write_super(c);
+ } else if (force) {
+ bch_flag_data_bad(ca);
+
+ lockdep_assert_held(&bch_register_lock);
+ SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+
+ bcache_write_super(c);
+ } else {
+ bch_err(c, "Remove of %s failed, unable to migrate data off",
+ name);
+ clear_bit(CACHE_DEV_REMOVING, &ca->flags);
+ return;
+ }
+
+ /* Now metadata: */
+
+ if (!ca->mi.has_metadata) {
+ /* Nothing to do: */
+ } else if (!bch_move_meta_data_off_device(ca)) {
+ lockdep_assert_held(&bch_register_lock);
+ SET_CACHE_HAS_METADATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+
+ bcache_write_super(c);
+ } else {
+ bch_err(c, "Remove of %s failed, unable to migrate metadata off",
+ name);
+ clear_bit(CACHE_DEV_REMOVING, &ca->flags);
+ return;
+ }
+
+ /*
+ * Ok, really doing the remove:
+ * Drop device's prio pointer before removing it from superblock:
+ */
+ bch_notify_cache_removed(ca);
+
+ spin_lock(&c->journal.lock);
+ c->journal.prio_buckets[dev] = 0;
+ spin_unlock(&c->journal.lock);
+
+ bch_journal_meta(&c->journal);
+
+ /*
+ * Stop device before removing it from the cache set's list of devices -
+ * and get our own ref on cache set since ca is going away:
+ */
+ closure_get(&c->cl);
+
+ mutex_lock(&bch_register_lock);
+ bch_cache_stop(ca);
+
+ /*
+ * RCU barrier between dropping from c->cache and dropping from
+ * member info:
+ */
+ synchronize_rcu();
+
+ lockdep_assert_held(&bch_register_lock);
+
+ /*
+ * Free this device's slot in the cache_member array - all pointers to
+ * this device must be gone:
+ */
+ memset(&c->disk_mi[dev].uuid, 0, sizeof(c->disk_mi[dev].uuid));
+
+ bcache_write_super(c);
+ mutex_unlock(&bch_register_lock);
+
+ closure_put(&c->cl);
+}
+
+bool bch_cache_remove(struct cache *ca, bool force)
+{
+ mutex_lock(&bch_register_lock);
+
+ if (test_bit(CACHE_DEV_REMOVING, &ca->flags)) {
+ mutex_unlock(&bch_register_lock);
+ return false;
+ }
+
+ if (!bch_cache_may_remove(ca)) {
+ bch_err(ca->set, "Can't remove last device in tier %u",
+ ca->mi.tier);
+ bch_notify_cache_remove_failed(ca);
+ mutex_unlock(&bch_register_lock);
+ return false;
+ }
+
+ /* First, go RO before we try to migrate data off: */
+ bch_cache_read_only(ca);
+
+ if (force)
+ set_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
+ set_bit(CACHE_DEV_REMOVING, &ca->flags);
+ bch_notify_cache_removing(ca);
+
+ mutex_unlock(&bch_register_lock);
+
+ /* Migrate the data and finish removal asynchronously: */
+
+ queue_work(system_long_wq, &ca->remove_work);
+ return true;
+}
+
+static int bch_cache_online(struct cache *ca)
+{
+ char buf[12];
+
+ lockdep_assert_held(&bch_register_lock);
+
+ sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+
+ if (kobject_add(&ca->kobj,
+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
+ "bcache") ||
+ sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") ||
+ sysfs_create_link(&ca->set->kobj, &ca->kobj, buf))
+ return -1;
+
+ return 0;
+}
+
+static const char *cache_alloc(struct bcache_superblock *sb,
+ struct cache_set *c,
+ struct cache **ret)
+{
+ size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
+ size_t heap_size;
+ unsigned i, journal_entry_pages;
+ const char *err = "cannot allocate memory";
+ struct cache *ca;
+
+ if (c->sb.nr_in_set == 1)
+ bdevname(sb->bdev, c->name);
+
+ if (cache_set_init_fault("cache_alloc"))
+ return err;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ return err;
+
+ if (percpu_ref_init(&ca->ref, bch_cache_percpu_ref_release,
+ 0, GFP_KERNEL)) {
+ kfree(ca);
+ return err;
+ }
+
+ kobject_init(&ca->kobj, &bch_cache_ktype);
+
+ spin_lock_init(&ca->self.lock);
+ ca->self.nr_devices = 1;
+ rcu_assign_pointer(ca->self.d[0].dev, ca);
+ ca->sb.nr_this_dev = sb->sb->nr_this_dev;
+
+ INIT_WORK(&ca->free_work, bch_cache_free_work);
+ INIT_WORK(&ca->remove_work, bch_cache_remove_work);
+ spin_lock_init(&ca->freelist_lock);
+ spin_lock_init(&ca->prio_buckets_lock);
+ mutex_init(&ca->heap_lock);
+ bch_moving_init_cache(ca);
+
+ ca->disk_sb = *sb;
+ ca->disk_sb.bdev->bd_holder = ca;
+ memset(sb, 0, sizeof(*sb));
+
+ INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
+
+ err = "dynamic fault";
+ if (cache_set_init_fault("cache_alloc"))
+ goto err;
+
+ ca->mi = cache_mi_to_cpu_mi(ca->disk_sb.sb->members +
+ ca->disk_sb.sb->nr_this_dev);
+ ca->bucket_bits = ilog2(ca->mi.bucket_size);
+
+ /* XXX: tune these */
+ movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
+ reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
+ /*
+ * free_inc must be smaller than the copygc reserve: if it was bigger,
+ * one copygc iteration might not make enough buckets available to fill
+ * up free_inc and allow the allocator to make forward progress
+ */
+ free_inc_reserve = movinggc_reserve / 2;
+ heap_size = movinggc_reserve * 8;
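+ /*
+ * (E.g. a device with 2^20 buckets gets movinggc_reserve = 8192,
+ * reserve_none = 2048, free_inc_reserve = 4096 and a 65536-entry heap.)
+ */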
+
+ journal_entry_pages =
+ DIV_ROUND_UP(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+ PAGE_SECTORS);
+
+ if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_MOVINGGC],
+ movinggc_reserve, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
+ !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
+ !init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
+ !(ca->oldest_gens = vzalloc(sizeof(u8) *
+ ca->mi.nbuckets)) ||
+ !(ca->buckets = vzalloc(sizeof(struct bucket) *
+ ca->mi.nbuckets)) ||
+ !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
+ 2, GFP_KERNEL)) ||
+ !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
+ !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
+ !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb),
+ sizeof(u64), GFP_KERNEL)) ||
+ !(ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages)) ||
+ !(ca->bio_prio = bio_kmalloc(GFP_KERNEL, bucket_pages(ca))) ||
+ bioset_init(&ca->replica_set, 4,
+ offsetof(struct bch_write_bio, bio)) ||
+ !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
+ goto err;
+
+ ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
+
+ total_reserve = ca->free_inc.size;
+ for (i = 0; i < RESERVE_NR; i++)
+ total_reserve += ca->free[i].size;
+ pr_debug("%zu buckets reserved", total_reserve);
+
+ ca->copygc_write_point.group = &ca->self;
+ ca->tiering_write_point.group = &ca->self;
+
+ kobject_get(&c->kobj);
+ ca->set = c;
+
+ kobject_get(&ca->kobj);
+ rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], ca);
+
+ if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb.seq))
+ cache_sb_to_cache_set(c, ca->disk_sb.sb);
+
+ /*
+ * Increase journal write timeout if flushes to this device are
+ * expensive:
+ */
+ if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
+ journal_flushes_device(ca))
+ c->journal.write_delay_ms =
+ max(c->journal.write_delay_ms, 1000U);
+
+ err = "error creating kobject";
+ if (c->kobj.state_in_sysfs &&
+ bch_cache_online(ca))
+ goto err;
+
+ if (ret)
+ *ret = ca;
+ else
+ kobject_put(&ca->kobj);
+ return NULL;
+err:
+ bch_cache_stop(ca);
+ return err;
+}
+
+static struct cache_set *cache_set_lookup(uuid_le uuid)
+{
+ struct cache_set *c;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ if (!memcmp(&c->disk_sb.set_uuid, &uuid, sizeof(uuid_le)))
+ return c;
+
+ return NULL;
+}
+
+static const char *register_cache(struct bcache_superblock *sb,
+ struct cache_set_opts opts)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err = "cannot allocate memory";
+ struct cache_set *c;
+
+ err = validate_cache_super(sb);
+ if (err)
+ return err;
+
+ bdevname(sb->bdev, name);
+
+ c = cache_set_lookup(sb->sb->set_uuid);
+ if (c) {
+ if ((err = (can_attach_cache(sb->sb, c) ?:
+ cache_alloc(sb, c, NULL))))
+ return err;
+
+ if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
+ err = run_cache_set(c);
+ if (err)
+ return err;
+ }
+ goto out;
+ }
+
+ c = bch_cache_set_alloc(sb->sb, opts);
+ if (!c)
+ return err;
+
+ err = cache_alloc(sb, c, NULL);
+ if (err)
+ goto err_stop;
+
+ if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
+ err = run_cache_set(c);
+ if (err)
+ goto err_stop;
+ }
+
+ err = "error creating kobject";
+ if (bch_cache_set_online(c))
+ goto err_stop;
+out:
+
+ bch_info(c, "started");
+ return NULL;
+err_stop:
+ bch_cache_set_stop(c);
+ return err;
+}
+
+int bch_cache_set_add_cache(struct cache_set *c, const char *path)
+{
+ struct bcache_superblock sb;
+ const char *err;
+ struct cache *ca;
+ struct cache_member *new_mi = NULL;
+ struct cache_member mi;
+ unsigned nr_this_dev, nr_in_set, u64s;
+ int ret = -EINVAL;
+
+ mutex_lock(&bch_register_lock);
+
+ err = read_super(&sb, path);
+ if (err)
+ goto err_unlock;
+
+ err = validate_cache_super(&sb);
+ if (err)
+ goto err_unlock;
+
+ err = can_add_cache(sb.sb, c);
+ if (err)
+ goto err_unlock;
+
+ /*
+ * Preserve the old cache member information (esp. tier)
+ * before we start bashing the disk stuff.
+ */
+ mi = sb.sb->members[sb.sb->nr_this_dev];
+ mi.last_mount = cpu_to_le64(ktime_get_seconds());
+
+ down_read(&c->gc_lock);
+
+ if (dynamic_fault("bcache:add:no_slot"))
+ goto no_slot;
+
+ if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
+ goto no_slot;
+
+ for (nr_this_dev = 0; nr_this_dev < MAX_CACHES_PER_SET; nr_this_dev++)
+ if (nr_this_dev >= c->sb.nr_in_set ||
+ bch_is_zero(c->disk_mi[nr_this_dev].uuid.b,
+ sizeof(uuid_le)))
+ goto have_slot;
+no_slot:
+ up_read(&c->gc_lock);
+
+ err = "no slots available in superblock";
+ ret = -ENOSPC;
+ goto err_unlock;
+
+have_slot:
+ nr_in_set = max_t(unsigned, nr_this_dev + 1, c->sb.nr_in_set);
+ up_read(&c->gc_lock);
+
+ u64s = nr_in_set * (sizeof(struct cache_member) / sizeof(u64));
+ err = "no space in superblock for member info";
+ if (bch_super_realloc(&sb, u64s))
+ goto err_unlock;
+
+ new_mi = dynamic_fault("bcache:add:member_info_realloc")
+ ? NULL
+ : kmalloc(sizeof(struct cache_member) * nr_in_set,
+ GFP_KERNEL);
+ if (!new_mi) {
+ err = "cannot allocate memory";
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ memcpy(new_mi, c->disk_mi,
+ sizeof(struct cache_member) * nr_in_set);
+ new_mi[nr_this_dev] = mi;
+
+ sb.sb->nr_this_dev = nr_this_dev;
+ sb.sb->nr_in_set = nr_in_set;
+ sb.sb->u64s = cpu_to_le16(u64s);
+ memcpy(sb.sb->members, new_mi,
+ sizeof(struct cache_member) * nr_in_set);
+
+ if (cache_set_mi_update(c, new_mi, nr_in_set)) {
+ err = "cannot allocate memory";
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ /* commit new member info */
+ swap(c->disk_mi, new_mi);
+ kfree(new_mi);
+ new_mi = NULL;
+ c->disk_sb.nr_in_set = nr_in_set;
+ c->sb.nr_in_set = nr_in_set;
+
+ err = cache_alloc(&sb, c, &ca);
+ if (err)
+ goto err_unlock;
+
+ bcache_write_super(c);
+
+ err = "journal alloc failed";
+ if (bch_cache_journal_alloc(ca))
+ goto err_put;
+
+ bch_notify_cache_added(ca);
+
+ if (ca->mi.state == CACHE_ACTIVE) {
+ err = __bch_cache_read_write(c, ca);
+ if (err)
+ goto err_put;
+ }
+
+ kobject_put(&ca->kobj);
+ mutex_unlock(&bch_register_lock);
+ return 0;
+err_put:
+ bch_cache_stop(ca);
+err_unlock:
+ kfree(new_mi);
+ free_super(&sb);
+ mutex_unlock(&bch_register_lock);
+
+ bch_err(c, "Unable to add device: %s", err);
+ return ret ?: -EINVAL;
+}
+
+const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
+ struct cache_set_opts opts,
+ struct cache_set **ret)
+{
+ const char *err;
+ struct cache_set *c = NULL;
+ struct bcache_superblock *sb;
+ uuid_le uuid;
+ unsigned i;
+
+ memset(&uuid, 0, sizeof(uuid_le));
+
+ if (!nr_devices)
+ return "need at least one device";
+
+ if (!try_module_get(THIS_MODULE))
+ return "module unloading";
+
+ err = "cannot allocate memory";
+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
+ if (!sb)
+ goto err;
+
+ /*
+ * read_super() needs to happen under register_lock, so that the
+ * exclusive open is atomic with adding the new cache set to the list of
+ * cache sets:
+ */
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < nr_devices; i++) {
+ err = read_super(&sb[i], devices[i]);
+ if (err)
+ goto err_unlock;
+
+ err = "attempting to register backing device";
+ if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
+ goto err_unlock;
+
+ err = validate_cache_super(&sb[i]);
+ if (err)
+ goto err_unlock;
+ }
+
+ err = "cache set already registered";
+ if (cache_set_lookup(sb->sb->set_uuid))
+ goto err_unlock;
+
+ err = "cannot allocate memory";
+ c = bch_cache_set_alloc(sb[0].sb, opts);
+ if (!c)
+ goto err_unlock;
+
+ for (i = 0; i < nr_devices; i++) {
+ err = cache_alloc(&sb[i], c, NULL);
+ if (err)
+ goto err_unlock;
+ }
+
+ err = "insufficient devices";
+ if (cache_set_nr_online_devices(c) != cache_set_nr_devices(c))
+ goto err_unlock;
+
+ err = run_cache_set(c);
+ if (err)
+ goto err_unlock;
+
+ err = "error creating kobject";
+ if (bch_cache_set_online(c))
+ goto err_unlock;
+
+ if (ret) {
+ closure_get(&c->cl);
+ *ret = c;
+ }
+
+ mutex_unlock(&bch_register_lock);
+
+ err = NULL;
+out:
+ kfree(sb);
+ module_put(THIS_MODULE);
+ return err;
+err_unlock:
+ if (c)
+ bch_cache_set_stop(c);
+ mutex_unlock(&bch_register_lock);
+err:
+ for (i = 0; i < nr_devices; i++)
+ free_super(&sb[i]);
+ goto out;
+}
+
+const char *bch_register_one(const char *path)
+{
+ struct bcache_superblock sb;
+ const char *err;
+
+ mutex_lock(&bch_register_lock);
+
+ err = read_super(&sb, path);
+ if (err)
+ goto err;
+
+ if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
+ err = bch_backing_dev_register(&sb);
+ else
+ err = register_cache(&sb, cache_set_opts_empty());
+
+ free_super(&sb);
+err:
+ mutex_unlock(&bch_register_lock);
+ return err;
+}
+
+/* Global interfaces/init */
+
+#define kobj_attribute_write(n, fn) \
+ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
+
+#define kobj_attribute_rw(n, show, store) \
+ static struct kobj_attribute ksysfs_##n = \
+ __ATTR(n, S_IWUSR|S_IRUSR, show, store)
+
+static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
+ const char *, size_t);
+
+kobj_attribute_write(register, register_bcache);
+kobj_attribute_write(register_quiet, register_bcache);
+
+static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+{
+ ssize_t ret = -EINVAL;
+ const char *err = "cannot allocate memory";
+ char *path = NULL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
+ goto err;
+
+ err = bch_register_one(strim(path));
+ if (err)
+ goto err;
+
+ ret = size;
+out:
+ kfree(path);
+ module_put(THIS_MODULE);
+ return ret;
+err:
+ pr_err("error opening %s: %s", path, err);
+ goto out;
+}
+
+static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+ if (code == SYS_DOWN ||
+ code == SYS_HALT ||
+ code == SYS_POWER_OFF) {
+ struct cache_set *c;
+
+ mutex_lock(&bch_register_lock);
+
+ if (!list_empty(&bch_cache_sets))
+ pr_info("Setting all devices read only:");
+
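+ /*
+ * Two passes, so the asynchronous RO transitions are kicked off for
+ * every cache set before we start waiting on any of them:
+ */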
+ list_for_each_entry(c, &bch_cache_sets, list)
+ bch_cache_set_read_only(c);
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ bch_cache_set_read_only_sync(c);
+
+ mutex_unlock(&bch_register_lock);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block reboot = {
+ .notifier_call = bcache_reboot,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+{
+ bcache_reboot(NULL, SYS_DOWN, NULL);
+ return size;
+}
+
+kobj_attribute_write(reboot, reboot_test);
+
+static void bcache_exit(void)
+{
+ bch_debug_exit();
+ bch_fs_exit();
+ bch_blockdev_exit();
+ if (bcache_kset)
+ kset_unregister(bcache_kset);
+ if (bcache_io_wq)
+ destroy_workqueue(bcache_io_wq);
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ device_destroy(bch_chardev_class,
+ MKDEV(bch_chardev_major, 0));
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ class_destroy(bch_chardev_class);
+ if (bch_chardev_major > 0)
+ unregister_chrdev(bch_chardev_major, "bcache");
+ if (!IS_ERR_OR_NULL(bch_sha1))
+ crypto_free_shash(bch_sha1);
+ unregister_reboot_notifier(&reboot);
+}
+
+static int __init bcache_init(void)
+{
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
+ &ksysfs_reboot.attr,
+ NULL
+ };
+
+ mutex_init(&bch_register_lock);
+ register_reboot_notifier(&reboot);
+ closure_debug_init();
+ bkey_pack_test();
+
+ bch_sha1 = crypto_alloc_shash("sha1", 0, 0);
+ if (IS_ERR(bch_sha1))
+ goto err;
+
+ bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops);
+ if (bch_chardev_major < 0)
+ goto err;
+
+ bch_chardev_class = class_create(THIS_MODULE, "bcache");
+ if (IS_ERR(bch_chardev_class))
+ goto err;
+
+ bch_chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, 255),
+ NULL, "bcache-ctl");
+ if (IS_ERR(bch_chardev))
+ goto err;
+
+ if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
+ !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
+ sysfs_create_files(&bcache_kset->kobj, files) ||
+ bch_blockdev_init() ||
+ bch_fs_init() ||
+ bch_debug_init())
+ goto err;
+
+ return 0;
+err:
+ bcache_exit();
+ return -ENOMEM;
+}
+
+#define BCH_DEBUG_PARAM(name, description) \
+ bool bch_##name; \
+ module_param_named(name, bch_##name, bool, 0644); \
+ MODULE_PARM_DESC(name, description);
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+module_exit(bcache_exit);
+module_init(bcache_init);
diff --git a/libbcache/super.h b/libbcache/super.h
new file mode 100644
index 0000000..635e1a6
--- /dev/null
+++ b/libbcache/super.h
@@ -0,0 +1,160 @@
+#ifndef _BCACHE_SUPER_H
+#define _BCACHE_SUPER_H
+
+#include "extents.h"
+
+static inline size_t sector_to_bucket(const struct cache *ca, sector_t s)
+{
+ return s >> ca->bucket_bits;
+}
+
+static inline sector_t bucket_to_sector(const struct cache *ca, size_t b)
+{
+ return ((sector_t) b) << ca->bucket_bits;
+}
+
+static inline sector_t bucket_remainder(const struct cache *ca, sector_t s)
+{
+ return s & (ca->mi.bucket_size - 1);
+}
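+
+/*
+ * E.g. with 1024-sector buckets (bucket_bits == 10), sector 5000 maps to
+ * bucket 4 at offset 904; note bucket_remainder() assumes bucket_size is a
+ * power of two.
+ */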
+
+#define cache_member_info_get(_c) \
+ (rcu_read_lock(), rcu_dereference((_c)->members))
+
+#define cache_member_info_put() rcu_read_unlock()
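+
+/*
+ * Typical usage (cf. bch_check_super_marked() below):
+ *
+ *	mi = cache_member_info_get(c)->m;
+ *	... use mi; the RCU read lock is held ...
+ *	cache_member_info_put();
+ */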
+
+static inline struct cache *bch_next_cache_rcu(struct cache_set *c,
+ unsigned *iter)
+{
+ struct cache *ret = NULL;
+
+ while (*iter < c->sb.nr_in_set &&
+ !(ret = rcu_dereference(c->cache[*iter])))
+ (*iter)++;
+
+ return ret;
+}
+
+#define for_each_cache_rcu(ca, c, iter) \
+ for ((iter) = 0; ((ca) = bch_next_cache_rcu((c), &(iter))); (iter)++)
+
+static inline struct cache *bch_get_next_cache(struct cache_set *c,
+ unsigned *iter)
+{
+ struct cache *ret;
+
+ rcu_read_lock();
+ if ((ret = bch_next_cache_rcu(c, iter)))
+ percpu_ref_get(&ret->ref);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * If you break early, you must drop your ref on the current cache
+ */
+#define for_each_cache(ca, c, iter) \
+ for ((iter) = 0; \
+ (ca = bch_get_next_cache(c, &(iter))); \
+ percpu_ref_put(&ca->ref), (iter)++)
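+
+/*
+ * E.g. (mirroring the callers in super.c):
+ *
+ *	for_each_cache(ca, c, iter)
+ *		if (bch_cache_allocator_start(ca)) {
+ *			percpu_ref_put(&ca->ref);
+ *			goto err;
+ *		}
+ */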
+
+void bch_check_mark_super_slowpath(struct cache_set *,
+ const struct bkey_i *, bool);
+
+static inline bool bch_check_super_marked(struct cache_set *c,
+ const struct bkey_i *k, bool meta)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_cpu *mi = cache_member_info_get(c)->m;
+ bool ret = true;
+
+ extent_for_each_ptr(e, ptr)
+ if (!(meta
+ ? mi[ptr->dev].has_metadata
+ : mi[ptr->dev].has_data) &&
+ bch_extent_ptr_is_dirty(c, e, ptr)) {
+ ret = false;
+ break;
+ }
+
+ cache_member_info_put();
+
+ return ret;
+}
+
+static inline void bch_check_mark_super(struct cache_set *c,
+ const struct bkey_i *k, bool meta)
+{
+ if (bch_check_super_marked(c, k, meta))
+ return;
+
+ bch_check_mark_super_slowpath(c, k, meta);
+}
+
+static inline bool bch_cache_may_remove(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+
+ /*
+ * Right now, we can't remove the last device from a tier,
+ * - For tier 0, because all metadata lives in tier 0 and because
+ * there is no way to have foreground writes go directly to tier 1.
+ * - For tier 1, because the code doesn't completely support an
+ * empty tier 1.
+ */
+
+ /*
+ * Turning a device read-only removes it from the cache group, so
+ * the group may be down to a single read-write device while the
+ * device we are removing is a different member of the same tier -
+ * hence we check for identity, not just the device count.
+ * Removing the last RW device from a tier requires turning the
+ * whole cache set RO.
+ */
+
+ return tier->nr_devices != 1 ||
+ rcu_access_pointer(tier->d[0].dev) != ca;
+}
+
+void free_super(struct bcache_superblock *);
+int bch_super_realloc(struct bcache_superblock *, unsigned);
+void bcache_write_super(struct cache_set *);
+void __write_super(struct cache_set *, struct bcache_superblock *);
+
+void bch_cache_set_release(struct kobject *);
+void bch_cache_release(struct kobject *);
+
+void bch_cache_set_unregister(struct cache_set *);
+void bch_cache_set_stop(struct cache_set *);
+
+const char *bch_register_one(const char *path);
+const char *bch_register_cache_set(char * const *, unsigned,
+ struct cache_set_opts,
+ struct cache_set **);
+
+bool bch_cache_set_read_only(struct cache_set *);
+bool bch_cache_set_emergency_read_only(struct cache_set *);
+void bch_cache_set_read_only_sync(struct cache_set *);
+const char *bch_cache_set_read_write(struct cache_set *);
+
+bool bch_cache_read_only(struct cache *);
+const char *bch_cache_read_write(struct cache *);
+bool bch_cache_remove(struct cache *, bool force);
+int bch_cache_set_add_cache(struct cache_set *, const char *);
+
+extern struct mutex bch_register_lock;
+extern struct list_head bch_cache_sets;
+extern struct idr bch_cache_set_minor;
+extern struct workqueue_struct *bcache_io_wq;
+extern struct crypto_shash *bch_sha1;
+
+extern struct kobj_type bch_cache_set_ktype;
+extern struct kobj_type bch_cache_set_internal_ktype;
+extern struct kobj_type bch_cache_set_time_stats_ktype;
+extern struct kobj_type bch_cache_set_opts_dir_ktype;
+extern struct kobj_type bch_cache_ktype;
+
+#endif /* _BCACHE_SUPER_H */
diff --git a/libbcache/super_types.h b/libbcache/super_types.h
new file mode 100644
index 0000000..d89f780
--- /dev/null
+++ b/libbcache/super_types.h
@@ -0,0 +1,11 @@
+#ifndef _BCACHE_SUPER_TYPES_H
+#define _BCACHE_SUPER_TYPES_H
+
+struct bcache_superblock {
+ struct cache_sb *sb;
+ struct block_device *bdev;
+ struct bio *bio;
+ unsigned page_order;
+};
+
+#endif /* _BCACHE_SUPER_TYPES_H */
diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c
new file mode 100644
index 0000000..40d006b
--- /dev/null
+++ b/libbcache/sysfs.c
@@ -0,0 +1,1397 @@
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "alloc.h"
+#include "blockdev.h"
+#include "sysfs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "opts.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+
+static const char * const cache_replacement_policies[] = {
+ "lru",
+ "fifo",
+ "random",
+ NULL
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+static const char * const bch_cache_modes[] = {
+ "default",
+ "writethrough",
+ "writeback",
+ "writearound",
+ "none",
+ NULL
+};
+
+static const char * const bch_cache_state[] = {
+ "active",
+ "readonly",
+ "failed",
+ "spare",
+ NULL
+};
+
+write_attribute(attach);
+write_attribute(detach);
+write_attribute(unregister);
+write_attribute(stop);
+write_attribute(clear_stats);
+write_attribute(trigger_btree_coalesce);
+write_attribute(trigger_gc);
+write_attribute(prune_cache);
+write_attribute(blockdev_volume_create);
+write_attribute(add_device);
+
+read_attribute(uuid);
+read_attribute(minor);
+read_attribute(bucket_size);
+read_attribute(bucket_size_bytes);
+read_attribute(block_size);
+read_attribute(block_size_bytes);
+read_attribute(btree_node_size);
+read_attribute(btree_node_size_bytes);
+read_attribute(first_bucket);
+read_attribute(nbuckets);
+read_attribute(tree_depth);
+read_attribute(root_usage_percent);
+read_attribute(read_priority_stats);
+read_attribute(write_priority_stats);
+read_attribute(fragmentation_stats);
+read_attribute(oldest_gen_stats);
+read_attribute(reserve_stats);
+read_attribute(btree_cache_size);
+read_attribute(cache_available_percent);
+read_attribute(compression_stats);
+read_attribute(written);
+read_attribute(btree_written);
+read_attribute(metadata_written);
+read_attribute(journal_debug);
+write_attribute(journal_flush);
+read_attribute(internal_uuid);
+
+read_attribute(btree_gc_running);
+
+read_attribute(btree_nodes);
+read_attribute(btree_used_percent);
+read_attribute(average_key_size);
+read_attribute(available_buckets);
+read_attribute(free_buckets);
+read_attribute(dirty_data);
+read_attribute(dirty_bytes);
+read_attribute(dirty_buckets);
+read_attribute(cached_data);
+read_attribute(cached_bytes);
+read_attribute(cached_buckets);
+read_attribute(meta_buckets);
+read_attribute(alloc_buckets);
+read_attribute(has_data);
+read_attribute(has_metadata);
+read_attribute(bset_tree_stats);
+read_attribute(alloc_debug);
+
+read_attribute(state);
+read_attribute(cache_read_races);
+read_attribute(writeback_keys_done);
+read_attribute(writeback_keys_failed);
+read_attribute(io_errors);
+rw_attribute(io_error_limit);
+rw_attribute(io_error_halflife);
+read_attribute(congested);
+rw_attribute(congested_read_threshold_us);
+rw_attribute(congested_write_threshold_us);
+
+rw_attribute(sequential_cutoff);
+rw_attribute(cache_mode);
+rw_attribute(writeback_metadata);
+rw_attribute(writeback_running);
+rw_attribute(writeback_percent);
+sysfs_pd_controller_attribute(writeback);
+
+read_attribute(stripe_size);
+read_attribute(partial_stripes_expensive);
+
+rw_attribute(journal_write_delay_ms);
+rw_attribute(journal_reclaim_delay_ms);
+read_attribute(journal_entry_size_max);
+
+rw_attribute(discard);
+rw_attribute(running);
+rw_attribute(label);
+rw_attribute(readahead);
+rw_attribute(verify);
+rw_attribute(bypass_torture_test);
+rw_attribute(cache_replacement_policy);
+
+rw_attribute(foreground_write_ratelimit_enabled);
+rw_attribute(copy_gc_enabled);
+sysfs_pd_controller_attribute(copy_gc);
+rw_attribute(tiering_enabled);
+rw_attribute(tiering_percent);
+sysfs_pd_controller_attribute(tiering);
+
+sysfs_pd_controller_attribute(foreground_write);
+
+rw_attribute(pd_controllers_update_seconds);
+
+rw_attribute(foreground_target_percent);
+
+rw_attribute(size);
+read_attribute(meta_replicas_have);
+read_attribute(data_replicas_have);
+read_attribute(tier);
+
+#define BCH_DEBUG_PARAM(name, description) \
+ rw_attribute(name);
+
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ static struct attribute sysfs_opt_##_name = { \
+ .name = #_name, \
+ .mode = S_IRUGO|(_perm ? S_IWUSR : 0) \
+ };
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_time_stats_attribute(name, frequency_units, duration_units);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+static struct attribute sysfs_state_rw = {
+ .name = "state",
+ .mode = S_IRUGO|S_IWUSR
+};
+
+SHOW(bch_cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+
+#define var(stat) (dc->stat)
+
+ if (attr == &sysfs_cache_mode)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_cache_modes + 1,
+ BDEV_CACHE_MODE(dc->disk_sb.sb));
+
+ var_printf(verify, "%i");
+ var_printf(bypass_torture_test, "%i");
+ var_printf(writeback_metadata, "%i");
+ var_printf(writeback_running, "%i");
+ var_print(writeback_percent);
+ sysfs_pd_controller_show(writeback, &dc->writeback_pd);
+
+ sysfs_hprint(dirty_data,
+ bcache_dev_sectors_dirty(&dc->disk) << 9);
+ sysfs_print(dirty_bytes,
+ bcache_dev_sectors_dirty(&dc->disk) << 9);
+
+ sysfs_hprint(stripe_size, dc->disk.stripe_size << 9);
+ var_printf(partial_stripes_expensive, "%u");
+
+ var_hprint(sequential_cutoff);
+ var_hprint(readahead);
+
+ sysfs_print(running, atomic_read(&dc->running));
+ sysfs_print(state, states[BDEV_STATE(dc->disk_sb.sb)]);
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
+ buf[SB_LABEL_SIZE] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+#undef var
+ return 0;
+}
+
+STORE(__cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ unsigned v = size;
+ struct cache_set *c;
+ struct kobj_uevent_env *env;
+
+#define d_strtoul(var) sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
+#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
+
+ d_strtoul(verify);
+ d_strtoul(bypass_torture_test);
+ d_strtoul(writeback_metadata);
+ d_strtoul(writeback_running);
+ sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+ sysfs_pd_controller_store(writeback, &dc->writeback_pd);
+
+ d_strtoi_h(sequential_cutoff);
+ d_strtoi_h(readahead);
+
+ if (attr == &sysfs_clear_stats)
+ bch_cache_accounting_clear(&dc->accounting);
+
+ if (attr == &sysfs_running &&
+ strtoul_or_return(buf))
+ bch_cached_dev_run(dc);
+
+ if (attr == &sysfs_cache_mode) {
+ ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != BDEV_CACHE_MODE(dc->disk_sb.sb)) {
+ SET_BDEV_CACHE_MODE(dc->disk_sb.sb, v);
+ bch_write_bdev_super(dc, NULL);
+ }
+ }
+
+ if (attr == &sysfs_label) {
+ u64 journal_seq = 0;
+ int ret = 0;
+
+ if (size > SB_LABEL_SIZE)
+ return -EINVAL;
+
+ mutex_lock(&dc->disk.inode_lock);
+
+ memcpy(dc->disk_sb.sb->label, buf, size);
+ if (size < SB_LABEL_SIZE)
+ dc->disk_sb.sb->label[size] = '\0';
+ if (size && dc->disk_sb.sb->label[size - 1] == '\n')
+ dc->disk_sb.sb->label[size - 1] = '\0';
+
+ memcpy(dc->disk.inode.v.i_label,
+ dc->disk_sb.sb->label, SB_LABEL_SIZE);
+
+ bch_write_bdev_super(dc, NULL);
+
+ if (dc->disk.c)
+ ret = bch_inode_update(dc->disk.c, &dc->disk.inode.k_i,
+ &journal_seq);
+
+ mutex_unlock(&dc->disk.inode_lock);
+
+ if (ret)
+ return ret;
+
+ if (dc->disk.c)
+ ret = bch_journal_flush_seq(&dc->disk.c->journal,
+ journal_seq);
+ if (ret)
+ return ret;
+
+ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+ add_uevent_var(env, "DRIVER=bcache");
+	add_uevent_var(env, "CACHED_UUID=%pU", dc->disk_sb.sb->disk_uuid.b);
+ add_uevent_var(env, "CACHED_LABEL=%s", buf);
+ kobject_uevent_env(
+ &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
+ kfree(env);
+ }
+
+ if (attr == &sysfs_attach) {
+ if (uuid_parse(buf, &dc->disk_sb.sb->user_uuid))
+ return -EINVAL;
+
+ list_for_each_entry(c, &bch_cache_sets, list) {
+ v = bch_cached_dev_attach(dc, c);
+ if (!v)
+ return size;
+ }
+
+ pr_err("Can't attach %s: cache set not found", buf);
+ size = v;
+ }
+
+ if (attr == &sysfs_detach && dc->disk.c)
+ bch_cached_dev_detach(dc);
+
+ if (attr == &sysfs_stop)
+ bch_blockdev_stop(&dc->disk);
+
+ return size;
+}
+
+STORE(bch_cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+
+ mutex_lock(&bch_register_lock);
+ size = __cached_dev_store(kobj, attr, buf, size);
+
+ if (attr == &sysfs_writeback_running)
+ bch_writeback_queue(dc);
+
+ if (attr == &sysfs_writeback_percent)
+ schedule_delayed_work(&dc->writeback_pd_update,
+ dc->writeback_pd_update_seconds * HZ);
+
+ mutex_unlock(&bch_register_lock);
+ return size;
+}
+
+static struct attribute *bch_cached_dev_files[] = {
+ &sysfs_attach,
+ &sysfs_detach,
+ &sysfs_stop,
+ &sysfs_cache_mode,
+ &sysfs_writeback_metadata,
+ &sysfs_writeback_running,
+ &sysfs_writeback_percent,
+ sysfs_pd_controller_files(writeback),
+ &sysfs_dirty_data,
+ &sysfs_dirty_bytes,
+ &sysfs_stripe_size,
+ &sysfs_partial_stripes_expensive,
+ &sysfs_sequential_cutoff,
+ &sysfs_clear_stats,
+ &sysfs_running,
+ &sysfs_state,
+ &sysfs_label,
+ &sysfs_readahead,
+#ifdef CONFIG_BCACHE_DEBUG
+ &sysfs_verify,
+ &sysfs_bypass_torture_test,
+#endif
+ NULL
+};
+KTYPE(bch_cached_dev);
+
+SHOW(bch_blockdev_volume)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+
+ sysfs_hprint(size, le64_to_cpu(d->inode.v.i_size));
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, d->inode.v.i_label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+ return 0;
+}
+
+STORE(__bch_blockdev_volume)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+
+ if (attr == &sysfs_size) {
+ u64 journal_seq = 0;
+ u64 v = strtoi_h_or_return(buf);
+ int ret;
+
+ mutex_lock(&d->inode_lock);
+
+		if (v < le64_to_cpu(d->inode.v.i_size)) {
+ ret = bch_inode_truncate(d->c, d->inode.k.p.inode,
+ v >> 9, NULL, NULL);
+ if (ret) {
+ mutex_unlock(&d->inode_lock);
+ return ret;
+ }
+ }
+ d->inode.v.i_size = cpu_to_le64(v);
+ ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+
+ mutex_unlock(&d->inode_lock);
+
+ if (ret)
+ return ret;
+
+ ret = bch_journal_flush_seq(&d->c->journal, journal_seq);
+ if (ret)
+ return ret;
+
+ set_capacity(d->disk, v >> 9);
+ }
+
+ if (attr == &sysfs_label) {
+ u64 journal_seq = 0;
+ int ret;
+
+ mutex_lock(&d->inode_lock);
+
+ memcpy(d->inode.v.i_label, buf, SB_LABEL_SIZE);
+ ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+
+ mutex_unlock(&d->inode_lock);
+
+ return ret ?: bch_journal_flush_seq(&d->c->journal, journal_seq);
+ }
+
+ if (attr == &sysfs_unregister) {
+ set_bit(BCACHE_DEV_DETACHING, &d->flags);
+ bch_blockdev_stop(d);
+ }
+
+ return size;
+}
+STORE_LOCKED(bch_blockdev_volume)
+
+static struct attribute *bch_blockdev_volume_files[] = {
+ &sysfs_unregister,
+ &sysfs_label,
+ &sysfs_size,
+ NULL
+};
+KTYPE(bch_blockdev_volume);
+
+static int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+ struct bset_stats stats;
+ size_t nodes = 0;
+ struct btree *b;
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ unsigned iter;
+
+ memset(&stats, 0, sizeof(stats));
+
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, iter, pos) {
+ bch_btree_keys_stats(b, &stats);
+ nodes++;
+ }
+ rcu_read_unlock();
+
+ return snprintf(buf, PAGE_SIZE,
+ "btree nodes: %zu\n"
+ "written sets: %zu\n"
+ "written key bytes: %zu\n"
+ "unwritten sets: %zu\n"
+ "unwritten key bytes: %zu\n"
+ "no table sets: %zu\n"
+ "no table key bytes: %zu\n"
+ "floats: %zu\n"
+ "failed unpacked: %zu\n"
+ "failed prev: %zu\n"
+ "failed overflow: %zu\n",
+ nodes,
+ stats.sets[BSET_RO_AUX_TREE].nr,
+ stats.sets[BSET_RO_AUX_TREE].bytes,
+ stats.sets[BSET_RW_AUX_TREE].nr,
+ stats.sets[BSET_RW_AUX_TREE].bytes,
+ stats.sets[BSET_NO_AUX_TREE].nr,
+ stats.sets[BSET_NO_AUX_TREE].bytes,
+ stats.floats,
+ stats.failed_unpacked,
+ stats.failed_prev,
+ stats.failed_overflow);
+}
+
+static unsigned bch_root_usage(struct cache_set *c)
+{
+ unsigned bytes = 0;
+ struct bkey_packed *k;
+ struct btree *b;
+ struct btree_node_iter iter;
+
+ goto lock_root;
+
+ do {
+ six_unlock_read(&b->lock);
+lock_root:
+ b = c->btree_roots[BTREE_ID_EXTENTS].b;
+ six_lock_read(&b->lock);
+ } while (b != c->btree_roots[BTREE_ID_EXTENTS].b);
+
+ for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b))
+ bytes += bkey_bytes(k);
+
+ six_unlock_read(&b->lock);
+
+ return (bytes * 100) / btree_bytes(c);
+}
+
+static size_t bch_cache_size(struct cache_set *c)
+{
+ size_t ret = 0;
+ struct btree *b;
+
+ mutex_lock(&c->btree_cache_lock);
+ list_for_each_entry(b, &c->btree_cache, list)
+ ret += btree_bytes(c);
+
+ mutex_unlock(&c->btree_cache_lock);
+ return ret;
+}
+
+static unsigned bch_cache_available_percent(struct cache_set *c)
+{
+ return div64_u64((u64) sectors_available(c) * 100,
+ c->capacity ?: 1);
+}
+
+#if 0
+static unsigned bch_btree_used(struct cache_set *c)
+{
+ return div64_u64(c->gc_stats.key_bytes * 100,
+ (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
+
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+ return c->gc_stats.nkeys
+ ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+ : 0;
+}
+#endif
+
+static ssize_t show_cache_set_alloc_debug(struct cache_set *c, char *buf)
+{
+ struct bucket_stats_cache_set stats = bch_bucket_stats_read_cache_set(c);
+
+ return scnprintf(buf, PAGE_SIZE,
+ "capacity:\t\t%llu\n"
+ "compressed:\n"
+ "\tmeta:\t\t%llu\n"
+ "\tdirty:\t\t%llu\n"
+ "\tcached:\t\t%llu\n"
+ "uncompressed:\n"
+ "\tmeta:\t\t%llu\n"
+ "\tdirty:\t\t%llu\n"
+ "\tcached:\t\t%llu\n"
+ "persistent reserved sectors:\t%llu\n"
+ "online reserved sectors:\t%llu\n",
+ c->capacity,
+ stats.s[S_COMPRESSED][S_META],
+ stats.s[S_COMPRESSED][S_DIRTY],
+ stats.s[S_COMPRESSED][S_CACHED],
+ stats.s[S_UNCOMPRESSED][S_META],
+ stats.s[S_UNCOMPRESSED][S_DIRTY],
+ stats.s[S_UNCOMPRESSED][S_CACHED],
+ stats.persistent_reserved,
+ stats.online_reserved);
+}
+
+static ssize_t bch_compression_stats(struct cache_set *c, char *buf)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+ nr_compressed_extents = 0,
+ compressed_sectors_compressed = 0,
+ compressed_sectors_uncompressed = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k)
+ if (k.k->type == BCH_EXTENT) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ const union bch_extent_crc *crc;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) {
+ nr_uncompressed_extents++;
+ uncompressed_sectors += e.k->size;
+ } else {
+ nr_compressed_extents++;
+ compressed_sectors_compressed +=
+ crc_compressed_size(e.k, crc);
+ compressed_sectors_uncompressed +=
+ crc_uncompressed_size(e.k, crc);
+ }
+
+ /* only looking at the first ptr */
+ break;
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return snprintf(buf, PAGE_SIZE,
+ "uncompressed data:\n"
+ " nr extents: %llu\n"
+ " size (bytes): %llu\n"
+ "compressed data:\n"
+ " nr extents: %llu\n"
+ " compressed size (bytes): %llu\n"
+ " uncompressed size (bytes): %llu\n",
+ nr_uncompressed_extents,
+ uncompressed_sectors << 9,
+ nr_compressed_extents,
+ compressed_sectors_compressed << 9,
+ compressed_sectors_uncompressed << 9);
+}
+
+SHOW(bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ sysfs_print(minor, c->minor);
+
+ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
+ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ sysfs_hprint(journal_entry_size_max, c->journal.entry_size_max);
+
+ sysfs_hprint(block_size, block_bytes(c));
+ sysfs_print(block_size_bytes, block_bytes(c));
+ sysfs_hprint(btree_node_size, c->sb.btree_node_size << 9);
+ sysfs_print(btree_node_size_bytes, c->sb.btree_node_size << 9);
+
+ sysfs_hprint(btree_cache_size, bch_cache_size(c));
+ sysfs_print(cache_available_percent, bch_cache_available_percent(c));
+
+ sysfs_print(btree_gc_running, c->gc_pos.phase != GC_PHASE_DONE);
+
+#if 0
+ /* XXX: reimplement */
+ sysfs_print(btree_used_percent, bch_btree_used(c));
+ sysfs_print(btree_nodes, c->gc_stats.nodes);
+ sysfs_hprint(average_key_size, bch_average_key_size(c));
+#endif
+
+ sysfs_print(cache_read_races,
+ atomic_long_read(&c->cache_read_races));
+
+ sysfs_print(writeback_keys_done,
+ atomic_long_read(&c->writeback_keys_done));
+ sysfs_print(writeback_keys_failed,
+ atomic_long_read(&c->writeback_keys_failed));
+
+ /* See count_io_errors for why 88 */
+ sysfs_print(io_error_halflife, c->error_decay * 88);
+ sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
+
+ sysfs_hprint(congested,
+ ((uint64_t) bch_get_congested(c)) << 9);
+ sysfs_print(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_print(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ sysfs_printf(foreground_write_ratelimit_enabled, "%i",
+ c->foreground_write_ratelimit_enabled);
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd);
+
+ sysfs_print(pd_controllers_update_seconds,
+ c->pd_controllers_update_seconds);
+ sysfs_print(foreground_target_percent, c->foreground_target_percent);
+
+ sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
+ sysfs_print(tiering_percent, c->tiering_percent);
+ sysfs_pd_controller_show(tiering, &c->tiering_pd);
+
+ sysfs_printf(meta_replicas_have, "%llu",
+ CACHE_SET_META_REPLICAS_HAVE(&c->disk_sb));
+ sysfs_printf(data_replicas_have, "%llu",
+ CACHE_SET_DATA_REPLICAS_HAVE(&c->disk_sb));
+
+ /* Debugging: */
+
+ if (attr == &sysfs_journal_debug)
+ return bch_journal_print_debug(&c->journal, buf);
+
+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return -EPERM;
+
+ if (attr == &sysfs_bset_tree_stats)
+ return bch_bset_print_stats(c, buf);
+ if (attr == &sysfs_alloc_debug)
+ return show_cache_set_alloc_debug(c, buf);
+
+ sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level);
+ sysfs_print(root_usage_percent, bch_root_usage(c));
+
+ if (attr == &sysfs_compression_stats)
+ return bch_compression_stats(c, buf);
+
+ sysfs_printf(internal_uuid, "%pU", c->disk_sb.set_uuid.b);
+
+ return 0;
+}
+
+STORE(__bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ if (attr == &sysfs_unregister) {
+ bch_cache_set_unregister(c);
+ return size;
+ }
+
+ if (attr == &sysfs_stop) {
+ bch_cache_set_stop(c);
+ return size;
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ atomic_long_set(&c->writeback_keys_done, 0);
+ atomic_long_set(&c->writeback_keys_failed, 0);
+ bch_cache_accounting_clear(&c->accounting);
+
+ return size;
+ }
+
+ sysfs_strtoul(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_strtoul(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ if (attr == &sysfs_io_error_limit) {
+ c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+ return size;
+ }
+
+ /* See count_io_errors() for why 88 */
+ if (attr == &sysfs_io_error_halflife) {
+ c->error_decay = strtoul_or_return(buf) / 88;
+ return size;
+ }
+
+ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
+ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+
+ sysfs_strtoul(foreground_write_ratelimit_enabled,
+ c->foreground_write_ratelimit_enabled);
+
+ if (attr == &sysfs_copy_gc_enabled) {
+ struct cache *ca;
+ unsigned i;
+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
+ ?: (ssize_t) size;
+
+ for_each_cache(ca, c, i)
+ if (ca->moving_gc_read)
+ wake_up_process(ca->moving_gc_read);
+ return ret;
+ }
+
+ if (attr == &sysfs_tiering_enabled) {
+ ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
+ ?: (ssize_t) size;
+
+ if (c->tiering_read)
+ wake_up_process(c->tiering_read);
+ return ret;
+ }
+
+ sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
+
+ if (attr == &sysfs_journal_flush) {
+ bch_journal_meta_async(&c->journal, NULL);
+
+ return size;
+ }
+
+ sysfs_strtoul(pd_controllers_update_seconds,
+ c->pd_controllers_update_seconds);
+ sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
+
+ sysfs_strtoul(tiering_percent, c->tiering_percent);
+ sysfs_pd_controller_store(tiering, &c->tiering_pd);
+
+ /* Debugging: */
+
+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return -EPERM;
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags))
+ return -EINTR;
+
+ if (attr == &sysfs_blockdev_volume_create) {
+ u64 v = strtoi_h_or_return(buf);
+ int r = bch_blockdev_volume_create(c, v);
+
+ if (r)
+ return r;
+ }
+
+ if (attr == &sysfs_trigger_btree_coalesce)
+ bch_coalesce(c);
+
+ /* Debugging: */
+
+ if (attr == &sysfs_trigger_gc)
+ bch_gc(c);
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc);
+ }
+
+ return size;
+}
+
+STORE(bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ mutex_lock(&bch_register_lock);
+ size = __bch_cache_set_store(kobj, attr, buf, size);
+ mutex_unlock(&bch_register_lock);
+
+ if (attr == &sysfs_add_device) {
+ char *path = kstrdup(buf, GFP_KERNEL);
+ int r = bch_cache_set_add_cache(c, strim(path));
+
+ kfree(path);
+ if (r)
+ return r;
+ }
+
+ return size;
+}
+
+static struct attribute *bch_cache_set_files[] = {
+ &sysfs_unregister,
+ &sysfs_stop,
+ &sysfs_journal_write_delay_ms,
+ &sysfs_journal_reclaim_delay_ms,
+ &sysfs_journal_entry_size_max,
+ &sysfs_blockdev_volume_create,
+ &sysfs_add_device,
+
+ &sysfs_block_size,
+ &sysfs_block_size_bytes,
+ &sysfs_btree_node_size,
+ &sysfs_btree_node_size_bytes,
+ &sysfs_tree_depth,
+ &sysfs_root_usage_percent,
+ &sysfs_btree_cache_size,
+ &sysfs_cache_available_percent,
+ &sysfs_compression_stats,
+
+ &sysfs_average_key_size,
+
+ &sysfs_io_error_limit,
+ &sysfs_io_error_halflife,
+ &sysfs_congested,
+ &sysfs_congested_read_threshold_us,
+ &sysfs_congested_write_threshold_us,
+ &sysfs_clear_stats,
+
+ &sysfs_meta_replicas_have,
+ &sysfs_data_replicas_have,
+
+ &sysfs_foreground_target_percent,
+ &sysfs_tiering_percent,
+
+ &sysfs_journal_flush,
+ NULL
+};
+KTYPE(bch_cache_set);
+
+/* internal dir - just a wrapper */
+
+SHOW(bch_cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return bch_cache_set_show(&c->kobj, attr, buf);
+}
+
+STORE(bch_cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return bch_cache_set_store(&c->kobj, attr, buf, size);
+}
+
+static void bch_cache_set_internal_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_internal_files[] = {
+ &sysfs_journal_debug,
+
+ &sysfs_alloc_debug,
+
+ &sysfs_btree_gc_running,
+
+ &sysfs_btree_nodes,
+ &sysfs_btree_used_percent,
+
+ &sysfs_bset_tree_stats,
+ &sysfs_cache_read_races,
+ &sysfs_writeback_keys_done,
+ &sysfs_writeback_keys_failed,
+
+ &sysfs_trigger_btree_coalesce,
+ &sysfs_trigger_gc,
+ &sysfs_prune_cache,
+ &sysfs_foreground_write_ratelimit_enabled,
+ &sysfs_copy_gc_enabled,
+ &sysfs_tiering_enabled,
+ sysfs_pd_controller_files(tiering),
+ sysfs_pd_controller_files(foreground_write),
+ &sysfs_internal_uuid,
+
+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
+ BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+ NULL
+};
+KTYPE(bch_cache_set_internal);
+
+/* options */
+
+SHOW(bch_cache_set_opts_dir)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
+
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ if (attr == &sysfs_opt_##_name) \
+ return _choices == bch_bool_opt || _choices == bch_uint_opt\
+ ? snprintf(buf, PAGE_SIZE, "%i\n", c->opts._name)\
+ : bch_snprint_string_list(buf, PAGE_SIZE, \
+ _choices, c->opts._name);\
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ return 0;
+}
+
+STORE(bch_cache_set_opts_dir)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
+
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ if (attr == &sysfs_opt_##_name) { \
+ ssize_t v = (_choices == bch_bool_opt || \
+ _choices == bch_uint_opt) \
+ ? strtoul_restrict_or_return(buf, _min, _max - 1)\
+ : bch_read_string_list(buf, _choices); \
+ \
+ if (v < 0) \
+ return v; \
+ \
+ c->opts._name = v; \
+ \
+ if (_sb_opt##_BITS && v != _sb_opt(&c->disk_sb)) { \
+ SET_##_sb_opt(&c->disk_sb, v); \
+ bcache_write_super(c); \
+ } \
+ \
+ return size; \
+ }
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ return size;
+}
+
+static void bch_cache_set_opts_dir_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_opts_dir_files[] = {
+#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+ &sysfs_opt_##_name,
+
+ CACHE_SET_VISIBLE_OPTS()
+#undef CACHE_SET_OPT
+
+ NULL
+};
+KTYPE(bch_cache_set_opts_dir);
+
+/* time stats */
+
+SHOW(bch_cache_set_time_stats)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, time_stats);
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_print_time_stats(&c->name##_time, name, \
+ frequency_units, duration_units);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ return 0;
+}
+
+STORE(bch_cache_set_time_stats)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, time_stats);
+
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_clear_time_stats(&c->name##_time, name);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ return size;
+}
+
+static void bch_cache_set_time_stats_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_time_stats_files[] = {
+#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+ sysfs_time_stats_attribute_list(name, frequency_units, duration_units)
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
+ NULL
+};
+KTYPE(bch_cache_set_time_stats);
+
+typedef unsigned (bucket_map_fn)(struct cache *, struct bucket *, void *);
+
+static unsigned bucket_priority_fn(struct cache *ca, struct bucket *g,
+ void *private)
+{
+ int rw = (private ? 1 : 0);
+
+ return ca->set->prio_clock[rw].hand - g->prio[rw];
+}
+
+static unsigned bucket_sectors_used_fn(struct cache *ca, struct bucket *g,
+ void *private)
+{
+ return bucket_sectors_used(g);
+}
+
+static unsigned bucket_oldest_gen_fn(struct cache *ca, struct bucket *g,
+ void *private)
+{
+ return bucket_gc_gen(ca, g);
+}
+
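+/*
+ * Print 31 quantiles of fn(bucket) over the device's buckets as a space
+ * separated list; the per-bucket values are sorted in descending order and
+ * trailing zeroes are trimmed before the quantiles are taken.
+ */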
+static ssize_t show_quantiles(struct cache *ca, char *buf,
+ bucket_map_fn *fn, void *private)
+{
+ int cmp(const void *l, const void *r)
+ { return *((unsigned *) r) - *((unsigned *) l); }
+
+ size_t n = ca->mi.nbuckets, i;
+ /* Compute 31 quantiles */
+ unsigned q[31], *p;
+ ssize_t ret = 0;
+
+ p = vzalloc(ca->mi.nbuckets * sizeof(unsigned));
+ if (!p)
+ return -ENOMEM;
+
+ for (i = ca->mi.first_bucket; i < n; i++)
+ p[i] = fn(ca, &ca->buckets[i], private);
+
+ sort(p, n, sizeof(unsigned), cmp, NULL);
+
+ while (n &&
+ !p[n - 1])
+ --n;
+
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
+
+ vfree(p);
+
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "%u ", q[i]);
+ buf[ret - 1] = '\n';
+
+ return ret;
+}
+
+static ssize_t show_reserve_stats(struct cache *ca, char *buf)
+{
+ enum alloc_reserve i;
+ ssize_t ret;
+
+ spin_lock(&ca->freelist_lock);
+
+ ret = scnprintf(buf, PAGE_SIZE,
+ "free_inc:\t%zu\t%zu\n",
+ fifo_used(&ca->free_inc),
+ ca->free_inc.size);
+
+ for (i = 0; i < RESERVE_NR; i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "free[%u]:\t%zu\t%zu\n", i,
+ fifo_used(&ca->free[i]),
+ ca->free[i].size);
+
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static ssize_t show_cache_alloc_debug(struct cache *ca, char *buf)
+{
+ struct cache_set *c = ca->set;
+ struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
+
+ return scnprintf(buf, PAGE_SIZE,
+ "free_inc: %zu/%zu\n"
+ "free[RESERVE_PRIO]: %zu/%zu\n"
+ "free[RESERVE_BTREE]: %zu/%zu\n"
+ "free[RESERVE_MOVINGGC]: %zu/%zu\n"
+ "free[RESERVE_NONE]: %zu/%zu\n"
+ "alloc: %llu/%llu\n"
+ "meta: %llu/%llu\n"
+ "dirty: %llu/%llu\n"
+ "available: %llu/%llu\n"
+ "freelist_wait: %s\n"
+ "open buckets: %u/%u (reserved %u)\n"
+ "open_buckets_wait: %s\n",
+ fifo_used(&ca->free_inc), ca->free_inc.size,
+ fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size,
+ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
+ stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
+ stats.buckets_meta, ca->mi.nbuckets - ca->mi.first_bucket,
+ stats.buckets_dirty, ca->mi.nbuckets - ca->mi.first_bucket,
+ __buckets_available_cache(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
+ c->freelist_wait.list.first ? "waiting" : "empty",
+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
+ c->open_buckets_wait.list.first ? "waiting" : "empty");
+}
+
+static u64 sectors_written(struct cache *ca)
+{
+ u64 ret = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ ret += *per_cpu_ptr(ca->sectors_written, cpu);
+
+ return ret;
+}
+
+SHOW(bch_cache)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+ struct cache_set *c = ca->set;
+ struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
+
+ sysfs_printf(uuid, "%pU\n", ca->disk_sb.sb->disk_uuid.b);
+
+ sysfs_hprint(bucket_size, bucket_bytes(ca));
+ sysfs_print(bucket_size_bytes, bucket_bytes(ca));
+ sysfs_hprint(block_size, block_bytes(c));
+ sysfs_print(block_size_bytes, block_bytes(c));
+ sysfs_print(first_bucket, ca->mi.first_bucket);
+ sysfs_print(nbuckets, ca->mi.nbuckets);
+ sysfs_print(discard, ca->mi.discard);
+ sysfs_hprint(written, sectors_written(ca) << 9);
+ sysfs_hprint(btree_written,
+ atomic64_read(&ca->btree_sectors_written) << 9);
+ sysfs_hprint(metadata_written,
+ (atomic64_read(&ca->meta_sectors_written) +
+ atomic64_read(&ca->btree_sectors_written)) << 9);
+
+ sysfs_print(io_errors,
+ atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
+
+ sysfs_hprint(dirty_data, stats.sectors_dirty << 9);
+ sysfs_print(dirty_bytes, stats.sectors_dirty << 9);
+ sysfs_print(dirty_buckets, stats.buckets_dirty);
+ sysfs_hprint(cached_data, stats.sectors_cached << 9);
+ sysfs_print(cached_bytes, stats.sectors_cached << 9);
+ sysfs_print(cached_buckets, stats.buckets_cached);
+ sysfs_print(meta_buckets, stats.buckets_meta);
+ sysfs_print(alloc_buckets, stats.buckets_alloc);
+ sysfs_print(available_buckets, buckets_available_cache(ca));
+ sysfs_print(free_buckets, buckets_free_cache(ca));
+ sysfs_print(has_data, ca->mi.has_data);
+ sysfs_print(has_metadata, ca->mi.has_metadata);
+
+ sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
+
+ if (attr == &sysfs_cache_replacement_policy)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ cache_replacement_policies,
+ ca->mi.replacement);
+
+ sysfs_print(tier, ca->mi.tier);
+
+ if (attr == &sysfs_state_rw)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_cache_state,
+ ca->mi.state);
+
+ if (attr == &sysfs_read_priority_stats)
+ return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
+ if (attr == &sysfs_write_priority_stats)
+ return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1);
+ if (attr == &sysfs_fragmentation_stats)
+ return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL);
+ if (attr == &sysfs_oldest_gen_stats)
+ return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL);
+ if (attr == &sysfs_reserve_stats)
+ return show_reserve_stats(ca, buf);
+ if (attr == &sysfs_alloc_debug)
+ return show_cache_alloc_debug(ca, buf);
+
+ return 0;
+}
+
+STORE(__bch_cache)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+ struct cache_set *c = ca->set;
+ struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev];
+
+ sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
+
+ if (attr == &sysfs_discard) {
+ bool v = strtoul_or_return(buf);
+
+ if (v != CACHE_DISCARD(mi)) {
+ SET_CACHE_DISCARD(mi, v);
+ bcache_write_super(c);
+ }
+ }
+
+ if (attr == &sysfs_cache_replacement_policy) {
+ ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != CACHE_REPLACEMENT(mi)) {
+ SET_CACHE_REPLACEMENT(mi, v);
+ bcache_write_super(c);
+ }
+ }
+
+ if (attr == &sysfs_state_rw) {
+ char name[BDEVNAME_SIZE];
+ const char *err = NULL;
+ ssize_t v = bch_read_string_list(buf, bch_cache_state);
+
+ if (v < 0)
+ return v;
+
+ if (v == ca->mi.state)
+ return size;
+
+ switch (v) {
+ case CACHE_ACTIVE:
+ err = bch_cache_read_write(ca);
+ break;
+ case CACHE_RO:
+ bch_cache_read_only(ca);
+ break;
+ case CACHE_FAILED:
+ case CACHE_SPARE:
+ /*
+ * XXX: need to migrate data off and set correct state
+ */
+ pr_err("can't set %s %s: not supported",
+ bdevname(ca->disk_sb.bdev, name),
+ bch_cache_state[v]);
+ return -EINVAL;
+ }
+
+ if (err) {
+ pr_err("can't set %s %s: %s",
+ bdevname(ca->disk_sb.bdev, name),
+ bch_cache_state[v], err);
+ return -EINVAL;
+ }
+ }
+
+ if (attr == &sysfs_unregister) {
+ bool force = false;
+
+ if (!strncmp(buf, "force", 5) &&
+ (buf[5] == '\0' || buf[5] == '\n'))
+ force = true;
+ bch_cache_remove(ca, force);
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(ca->sectors_written, cpu) = 0;
+
+ atomic64_set(&ca->btree_sectors_written, 0);
+ atomic64_set(&ca->meta_sectors_written, 0);
+ atomic_set(&ca->io_count, 0);
+ atomic_set(&ca->io_errors, 0);
+ }
+
+ return size;
+}
+STORE_LOCKED(bch_cache)
+
+static struct attribute *bch_cache_files[] = {
+ &sysfs_uuid,
+ &sysfs_unregister,
+ &sysfs_bucket_size,
+ &sysfs_bucket_size_bytes,
+ &sysfs_block_size,
+ &sysfs_block_size_bytes,
+ &sysfs_first_bucket,
+ &sysfs_nbuckets,
+ &sysfs_read_priority_stats,
+ &sysfs_write_priority_stats,
+ &sysfs_fragmentation_stats,
+ &sysfs_oldest_gen_stats,
+ &sysfs_reserve_stats,
+ &sysfs_available_buckets,
+ &sysfs_free_buckets,
+ &sysfs_dirty_data,
+ &sysfs_dirty_bytes,
+ &sysfs_dirty_buckets,
+ &sysfs_cached_data,
+ &sysfs_cached_bytes,
+ &sysfs_cached_buckets,
+ &sysfs_meta_buckets,
+ &sysfs_alloc_buckets,
+ &sysfs_has_data,
+ &sysfs_has_metadata,
+ &sysfs_discard,
+ &sysfs_written,
+ &sysfs_btree_written,
+ &sysfs_metadata_written,
+ &sysfs_io_errors,
+ &sysfs_clear_stats,
+ &sysfs_cache_replacement_policy,
+ &sysfs_tier,
+ &sysfs_state_rw,
+ &sysfs_alloc_debug,
+
+ sysfs_pd_controller_files(copy_gc),
+ NULL
+};
+KTYPE(bch_cache);
diff --git a/libbcache/sysfs.h b/libbcache/sysfs.h
new file mode 100644
index 0000000..9d58458
--- /dev/null
+++ b/libbcache/sysfs.h
@@ -0,0 +1,113 @@
+#ifndef _BCACHE_SYSFS_H_
+#define _BCACHE_SYSFS_H_
+
+#include "util.h"
+
+#define KTYPE(type) \
+struct kobj_type type ## _ktype = { \
+ .release = type ## _release, \
+ .sysfs_ops = &((const struct sysfs_ops) { \
+ .show = type ## _show, \
+ .store = type ## _store \
+ }), \
+ .default_attrs = type ## _files \
+}
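+
+/*
+ * KTYPE(foo) defines foo_ktype, wiring up foo_show(), foo_store(),
+ * foo_release() and the foo_files[] attribute list, all of which are expected
+ * to be defined by the user (typically via the SHOW()/STORE() macros below).
+ */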
+
+#define SHOW(fn) \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+ char *buf) \
+
+#define STORE(fn) \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size) \
+
+#define STORE_LOCKED(fn) \
+STORE(fn) \
+{ \
+ ssize_t ret; \
+ mutex_lock(&bch_register_lock); \
+ ret = __ ## fn ## _store(kobj, attr, buf, size); \
+ mutex_unlock(&bch_register_lock); \
+ return ret; \
+}
+
+#define __sysfs_attribute(_name, _mode) \
+ static struct attribute sysfs_##_name = \
+ { .name = #_name, .mode = _mode }
+
+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+#define sysfs_printf(file, fmt, ...) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
+} while (0)
+
+#define sysfs_print(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return snprint(buf, PAGE_SIZE, var); \
+} while (0)
+
+#define sysfs_hprint(file, val) \
+do { \
+ if (attr == &sysfs_ ## file) { \
+ ssize_t ret = bch_hprint(buf, val); \
+ strcat(buf, "\n"); \
+ return ret + 1; \
+ } \
+} while (0)
+
+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var) sysfs_print(_var, var(_var))
+#define var_hprint(_var) sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe(buf, var) ?: (ssize_t) size; \
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe_clamp(buf, var, min, max) \
+ ?: (ssize_t) size; \
+} while (0)
+
+#define strtoul_or_return(cp) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
+
+#define strtoul_restrict_or_return(cp, min, max) \
+({ \
+ unsigned long __v = 0; \
+ int _r = strtoul_safe_restrict(cp, __v, min, max); \
+ if (_r) \
+ return _r; \
+ __v; \
+})
+
+#define strtoi_h_or_return(cp) \
+({ \
+ u64 _v; \
+ int _r = strtoi_h(cp, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
+
+#define sysfs_hatoi(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoi_h(buf, &var) ?: (ssize_t) size; \
+} while (0)
+
+#endif /* _BCACHE_SYSFS_H_ */
diff --git a/libbcache/tier.c b/libbcache/tier.c
new file mode 100644
index 0000000..2b568e1
--- /dev/null
+++ b/libbcache/tier.c
@@ -0,0 +1,243 @@
+
+#include "bcache.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "tier.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+struct tiering_state {
+ struct cache_group *tier;
+ unsigned tier_idx;
+ unsigned sectors;
+ unsigned stripe_size;
+ unsigned dev_idx;
+ struct cache *ca;
+};
+
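+/*
+ * Decide whether an extent should be copied to this tier: returns true for
+ * data extents that do not yet have c->opts.data_replicas replicas on devices
+ * in this tier or above (and that have room for another pointer).
+ */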
+static bool tiering_pred(struct cache_set *c,
+ struct tiering_state *s,
+ struct bkey_s_c k)
+{
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_rcu *mi;
+ unsigned replicas = 0;
+
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+ BKEY_EXTENT_VAL_U64s_MAX)
+ return false;
+
+ mi = cache_member_info_get(c);
+ extent_for_each_ptr(e, ptr)
+ if (ptr->dev < mi->nr_in_set &&
+ mi->m[ptr->dev].tier >= s->tier_idx)
+ replicas++;
+ cache_member_info_put();
+
+ return replicas < c->opts.data_replicas;
+ }
+
+ return false;
+}
+
+static void tier_put_device(struct tiering_state *s)
+{
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ s->ca = NULL;
+}
+
+/**
+ * tier_next_device - move on to the next device in the tier once enough
+ * sectors have been written to the current one
+ */
+static void tier_next_device(struct cache_set *c, struct tiering_state *s)
+{
+ if (!s->ca || s->sectors > s->stripe_size) {
+ tier_put_device(s);
+ s->sectors = 0;
+ s->dev_idx++;
+
+ spin_lock(&s->tier->lock);
+ if (s->dev_idx >= s->tier->nr_devices)
+ s->dev_idx = 0;
+
+ if (s->tier->nr_devices) {
+ s->ca = s->tier->d[s->dev_idx].dev;
+ percpu_ref_get(&s->ca->ref);
+ }
+ spin_unlock(&s->tier->lock);
+ }
+}
+
+static int issue_tiering_move(struct cache_set *c,
+ struct tiering_state *s,
+ struct moving_context *ctxt,
+ struct bkey_s_c k)
+{
+ int ret;
+
+ ret = bch_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL);
+ if (!ret) {
+ trace_bcache_tiering_copy(k.k);
+ s->sectors += k.k->size;
+ } else {
+ trace_bcache_tiering_alloc_fail(c, k.k->size);
+ }
+
+ return ret;
+}
+
+/**
+ * read_tiering - walk the extents btree and copy extents that don't yet have
+ * enough replicas in this tier, writing to the tier's devices in round robin
+ * order
+ */
+static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+{
+ struct moving_context ctxt;
+ struct tiering_state s;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned nr_devices = READ_ONCE(tier->nr_devices);
+ int ret;
+
+ if (!nr_devices)
+ return 0;
+
+ trace_bcache_tiering_start(c);
+
+ memset(&s, 0, sizeof(s));
+ s.tier = tier;
+ s.tier_idx = tier - c->cache_tiers;
+ s.stripe_size = 2048; /* 1 mb for now */
+
+ bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+ nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (!kthread_should_stop() &&
+ !bch_move_ctxt_wait(&ctxt) &&
+ (k = bch_btree_iter_peek(&iter)).k &&
+ !btree_iter_err(k)) {
+ if (!tiering_pred(c, &s, k))
+ goto next;
+
+ tier_next_device(c, &s);
+ if (!s.ca)
+ break;
+
+ ret = issue_tiering_move(c, &s, &ctxt, k);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+
+ /* memory allocation failure, wait for some IO to finish */
+ bch_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+next:
+ bch_btree_iter_advance_pos(&iter);
+ //bch_btree_iter_cond_resched(&iter);
+
+		/* unlock before calling bch_move_ctxt_wait() */
+ bch_btree_iter_unlock(&iter);
+ cond_resched();
+ }
+
+ bch_btree_iter_unlock(&iter);
+ tier_put_device(&s);
+ bch_move_ctxt_exit(&ctxt);
+ trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
+
+ return ctxt.sectors_moved;
+}
+
+static int bch_tiering_thread(void *arg)
+{
+ struct cache_set *c = arg;
+ struct cache_group *tier = &c->cache_tiers[1];
+ struct io_clock *clock = &c->io_clock[WRITE];
+ struct cache *ca;
+ u64 tier_capacity, available_sectors;
+ unsigned long last;
+ unsigned i;
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->tiering_enabled &&
+ tier->nr_devices))
+ break;
+
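+		/*
+		 * Wait until the faster tier(s) above us are more than half
+		 * full - i.e. fewer than half of their sectors are still
+		 * available - before migrating data down to this tier:
+		 */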
+ while (1) {
+ struct cache_group *faster_tier;
+
+ last = atomic_long_read(&clock->now);
+
+ tier_capacity = available_sectors = 0;
+ rcu_read_lock();
+ for (faster_tier = c->cache_tiers;
+ faster_tier != tier;
+ faster_tier++) {
+ group_for_each_cache_rcu(ca, faster_tier, i) {
+ tier_capacity +=
+ (ca->mi.nbuckets -
+ ca->mi.first_bucket) << ca->bucket_bits;
+ available_sectors +=
+ buckets_available_cache(ca) << ca->bucket_bits;
+ }
+ }
+ rcu_read_unlock();
+
+ if (available_sectors < (tier_capacity >> 1))
+ break;
+
+ bch_kthread_io_clock_wait(clock,
+ last +
+ available_sectors -
+ (tier_capacity >> 1));
+ if (kthread_should_stop())
+ return 0;
+ }
+
+ read_tiering(c, tier);
+ }
+
+ return 0;
+}
+
+void bch_tiering_init_cache_set(struct cache_set *c)
+{
+ bch_pd_controller_init(&c->tiering_pd);
+}
+
+int bch_tiering_read_start(struct cache_set *c)
+{
+ struct task_struct *t;
+
+ t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ c->tiering_read = t;
+ wake_up_process(c->tiering_read);
+
+ return 0;
+}
+
+void bch_tiering_read_stop(struct cache_set *c)
+{
+ if (!IS_ERR_OR_NULL(c->tiering_read)) {
+ kthread_stop(c->tiering_read);
+ c->tiering_read = NULL;
+ }
+}
diff --git a/libbcache/tier.h b/libbcache/tier.h
new file mode 100644
index 0000000..89c2bff
--- /dev/null
+++ b/libbcache/tier.h
@@ -0,0 +1,8 @@
+#ifndef _BCACHE_TIER_H
+#define _BCACHE_TIER_H
+
+void bch_tiering_init_cache_set(struct cache_set *);
+int bch_tiering_read_start(struct cache_set *);
+void bch_tiering_read_stop(struct cache_set *);
+
+#endif
diff --git a/libbcache/trace.c b/libbcache/trace.c
new file mode 100644
index 0000000..def525d
--- /dev/null
+++ b/libbcache/trace.c
@@ -0,0 +1,11 @@
+#include "bcache.h"
+#include "alloc_types.h"
+#include "blockdev_types.h"
+#include "buckets.h"
+#include "btree_types.h"
+#include "keylist.h"
+
+#include <linux/blktrace_api.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcache.h>
diff --git a/libbcache/util.c b/libbcache/util.c
new file mode 100644
index 0000000..5f81659
--- /dev/null
+++ b/libbcache/util.c
@@ -0,0 +1,418 @@
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+#include "util.h"
+
+#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
+
+#define STRTO_H(name, type) \
+int bch_ ## name ## _h(const char *cp, type *res) \
+{ \
+ int u = 0; \
+ char *e; \
+ type i = simple_ ## name(cp, &e, 10); \
+ \
+ switch (tolower(*e)) { \
+ default: \
+ return -EINVAL; \
+ case 'y': \
+ case 'z': \
+ u++; \
+ case 'e': \
+ u++; \
+ case 'p': \
+ u++; \
+ case 't': \
+ u++; \
+ case 'g': \
+ u++; \
+ case 'm': \
+ u++; \
+ case 'k': \
+ u++; \
+ if (e++ == cp) \
+ return -EINVAL; \
+ case '\n': \
+ case '\0': \
+ if (*e == '\n') \
+ e++; \
+ } \
+ \
+ if (*e) \
+ return -EINVAL; \
+ \
+ while (u--) { \
+ if ((type) ~0 > 0 && \
+ (type) ~0 / 1024 <= i) \
+ return -EINVAL; \
+ if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
+ (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
+ return -EINVAL; \
+ i *= 1024; \
+ } \
+ \
+ *res = i; \
+ return 0; \
+} \
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
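+
+/*
+ * Example (illustrative): bch_strtoull_h("8k", &v) sets v to 8192; the
+ * suffixes k/m/g/t/p/e/z/y scale by successive factors of 1024, and a
+ * trailing newline is accepted.
+ */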
+
+ssize_t bch_hprint(char *buf, s64 v)
+{
+ static const char units[] = "?kMGTPEZY";
+ char dec[4] = "";
+ int u, t = 0;
+
+ for (u = 0; v >= 1024 || v <= -1024; u++) {
+ t = v & ~(~0 << 10);
+ v >>= 10;
+ }
+
+ if (!u)
+ return sprintf(buf, "%lli", v);
+
+ /*
+ * 103 is magic: t is in the range [-1023, 1023] and we want
+ * to turn it into [-9, 9]
+ */
+ if (v < 100 && v > -100)
+ snprintf(dec, sizeof(dec), ".%i", t / 103);
+
+ return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+}
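+/* e.g. bch_hprint(buf, 2048) writes "2.0k"; values below 1024 get no suffix */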
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected)
+{
+ char *out = buf;
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ out += snprintf(out, buf + size - out,
+ i == selected ? "[%s] " : "%s ", list[i]);
+
+ out[-1] = '\n';
+ return out - buf;
+}
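+/*
+ * bch_snprint_string_list() example (illustrative): with list == { "lru",
+ * "fifo", "random", NULL } and selected == 1 the buffer reads
+ * "lru [fifo] random\n".
+ */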
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[])
+{
+ size_t i;
+ char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ s = strim(d);
+
+ for (i = 0; list[i]; i++)
+ if (!strcmp(list[i], s))
+ break;
+
+ kfree(d);
+
+ if (!list[i])
+ return -EINVAL;
+
+ return i;
+}
+
+bool bch_is_zero(const void *_p, size_t n)
+{
+ const char *p = _p;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ if (p[i])
+ return false;
+ return true;
+}
+
+void bch_time_stats_clear(struct time_stats *stats)
+{
+ spin_lock(&stats->lock);
+
+ stats->count = 0;
+ stats->last_duration = 0;
+ stats->max_duration = 0;
+ stats->average_duration = 0;
+ stats->average_frequency = 0;
+ stats->last = 0;
+
+ spin_unlock(&stats->lock);
+}
+
+void __bch_time_stats_update(struct time_stats *stats, u64 start_time)
+{
+ u64 now, duration, last;
+
+ stats->count++;
+
+ now = local_clock();
+ duration = time_after64(now, start_time)
+ ? now - start_time : 0;
+ last = time_after64(now, stats->last)
+ ? now - stats->last : 0;
+
+ stats->last_duration = duration;
+ stats->max_duration = max(stats->max_duration, duration);
+
+ if (stats->last) {
+ stats->average_duration = ewma_add(stats->average_duration,
+ duration << 8, 3);
+
+ if (stats->average_frequency)
+ stats->average_frequency =
+ ewma_add(stats->average_frequency,
+ last << 8, 3);
+ else
+ stats->average_frequency = last << 8;
+ } else {
+ stats->average_duration = duration << 8;
+ }
+
+ stats->last = now ?: 1;
+}
+
+void bch_time_stats_update(struct time_stats *stats, u64 start_time)
+{
+ spin_lock(&stats->lock);
+ __bch_time_stats_update(stats, start_time);
+ spin_unlock(&stats->lock);
+}
+
+/**
+ * bch_ratelimit_delay() - return how long to delay until the next time to do
+ * some work
+ *
+ * @d - the struct bch_ratelimit to update
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+u64 bch_ratelimit_delay(struct bch_ratelimit *d)
+{
+ u64 now = local_clock();
+
+ return time_after64(d->next, now)
+ ? nsecs_to_jiffies(d->next - now)
+ : 0;
+}
+
+/**
+ * bch_ratelimit_increment() - increment @d by the amount of work done
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ */
+void bch_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+ u64 now = local_clock();
+
+ d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+ if (time_before64(now + NSEC_PER_SEC, d->next))
+ d->next = now + NSEC_PER_SEC;
+
+ if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+ d->next = now - NSEC_PER_SEC * 2;
+}
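+
+/*
+ * Typical usage (illustrative): a worker calls bch_ratelimit_increment()
+ * after each chunk of work, then sleeps for bch_ratelimit_delay() jiffies -
+ * see bch_ratelimit_wait_freezable_stoppable().
+ */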
+
+int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
+{
+ while (1) {
+ u64 delay = bch_ratelimit_delay(d);
+
+ if (delay)
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop())
+ return 1;
+
+ if (!delay)
+ return 0;
+
+ schedule_timeout(delay);
+ try_to_freeze();
+ }
+}
+
+/*
+ * Updates the pd_controller; attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ */
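+/*
+ * Illustrative: with sign == 1 (raising the rate raises @actual), actual above
+ * target yields a negative change and the rate is backed off; actual below
+ * target speeds it back up.
+ */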
+void bch_pd_controller_update(struct bch_pd_controller *pd,
+ s64 target, s64 actual, int sign)
+{
+ s64 proportional, derivative, change;
+
+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
+
+ if (seconds_since_update == 0)
+ return;
+
+ pd->last_update = jiffies;
+
+ proportional = actual - target;
+ proportional *= seconds_since_update;
+ proportional = div_s64(proportional, pd->p_term_inverse);
+
+ derivative = actual - pd->last_actual;
+ derivative = div_s64(derivative, seconds_since_update);
+ derivative = ewma_add(pd->smoothed_derivative, derivative,
+ (pd->d_term / seconds_since_update) ?: 1);
+ derivative = derivative * pd->d_term;
+ derivative = div_s64(derivative, pd->p_term_inverse);
+
+ change = proportional + derivative;
+
+ /* Don't increase rate if not keeping up */
+ if (change > 0 &&
+ pd->backpressure &&
+ time_after64(local_clock(),
+ pd->rate.next + NSEC_PER_MSEC))
+ change = 0;
+
+ change *= (sign * -1);
+
+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
+ 1, UINT_MAX);
+
+ pd->last_actual = actual;
+ pd->last_derivative = derivative;
+ pd->last_proportional = proportional;
+ pd->last_change = change;
+ pd->last_target = target;
+}
+
+void bch_pd_controller_init(struct bch_pd_controller *pd)
+{
+ pd->rate.rate = 1024;
+ pd->last_update = jiffies;
+ pd->p_term_inverse = 6000;
+ pd->d_term = 30;
+ pd->d_smooth = pd->d_term;
+ pd->backpressure = 1;
+}
+
+size_t bch_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+{
+ /* 2^64 - 1 is 20 digits, plus null byte */
+ char rate[21];
+ char actual[21];
+ char target[21];
+ char proportional[21];
+ char derivative[21];
+ char change[21];
+ s64 next_io;
+
+ bch_hprint(rate, pd->rate.rate);
+ bch_hprint(actual, pd->last_actual);
+ bch_hprint(target, pd->last_target);
+ bch_hprint(proportional, pd->last_proportional);
+ bch_hprint(derivative, pd->last_derivative);
+ bch_hprint(change, pd->last_change);
+
+ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
+
+ return sprintf(buf,
+ "rate:\t\t%s/sec\n"
+ "target:\t\t%s\n"
+ "actual:\t\t%s\n"
+ "proportional:\t%s\n"
+ "derivative:\t%s\n"
+ "change:\t\t%s/sec\n"
+ "next io:\t%llims\n",
+ rate, target, actual, proportional,
+ derivative, change, next_io);
+}
+
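+/*
+ * Fill in @bio's bio_vec array to map the buffer at @base (kmalloc or vmalloc
+ * memory); bio->bi_iter.bi_size must already be set and bi_vcnt must be zero.
+ * If @base is NULL only the lengths and offsets are filled in.
+ */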
+void bch_bio_map(struct bio *bio, void *base)
+{
+ size_t size = bio->bi_iter.bi_size;
+ struct bio_vec *bv = bio->bi_io_vec;
+
+ BUG_ON(!bio->bi_iter.bi_size);
+ BUG_ON(bio->bi_vcnt);
+
+ bv->bv_offset = base ? offset_in_page(base) : 0;
+ goto start;
+
+ for (; size; bio->bi_vcnt++, bv++) {
+ bv->bv_offset = 0;
+start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
+ size);
+ BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
+ if (base) {
+ bv->bv_page = is_vmalloc_addr(base)
+ ? vmalloc_to_page(base)
+ : virt_to_page(base);
+
+ base += bv->bv_len;
+ }
+
+ size -= bv->bv_len;
+ }
+}
+
+size_t bch_rand_range(size_t max)
+{
+ size_t rand;
+
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ rand &= roundup_pow_of_two(max) - 1;
+ } while (rand >= max);
+
+ return rand;
+}
+
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, dst, iter, dst_iter) {
+ void *dstp = kmap_atomic(bv.bv_page);
+ memcpy(dstp + bv.bv_offset, src, bv.bv_len);
+ kunmap_atomic(dstp);
+
+ src += bv.bv_len;
+ }
+}
+
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, src, iter, src_iter) {
+ void *srcp = kmap_atomic(bv.bv_page);
+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
+ kunmap_atomic(srcp);
+
+ dst += bv.bv_len;
+ }
+}
diff --git a/libbcache/util.h b/libbcache/util.h
new file mode 100644
index 0000000..2b171a1
--- /dev/null
+++ b/libbcache/util.h
@@ -0,0 +1,725 @@
+#ifndef _BCACHE_UTIL_H
+#define _BCACHE_UTIL_H
+
+#include <linux/blkdev.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/llist.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "closure.h"
+
+#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
+#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT)
+
+struct closure;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define EBUG_ON(cond) BUG_ON(cond)
+#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
+#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0)
+#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0)
+#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0)
+#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0)
+#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0)
+#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i)
+#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
+#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
+
+#define memcpy(_dst, _src, _len) \
+do { \
+ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
+ (void *) (_dst) + (_len) <= (void *) (_src))); \
+ memcpy(_dst, _src, _len); \
+} while (0)
+
+#else /* DEBUG */
+
+#define EBUG_ON(cond)
+#define atomic_dec_bug(v) atomic_dec(v)
+#define atomic_inc_bug(v, i) atomic_inc(v)
+#define atomic_sub_bug(i, v) atomic_sub(i, v)
+#define atomic_add_bug(i, v) atomic_add(i, v)
+#define atomic_long_dec_bug(v) atomic_long_dec(v)
+#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v)
+#define atomic64_dec_bug(v) atomic64_dec(v)
+#define atomic64_inc_bug(v, i) atomic64_inc(v)
+#define atomic64_sub_bug(i, v) atomic64_sub(i, v)
+#define atomic64_add_bug(i, v) atomic64_add(i, v)
+
+#endif
+
+#ifndef __CHECKER__
+#define __flatten __attribute__((flatten))
+#else
+/* sparse doesn't know about attribute((flatten)) */
+#define __flatten
+#endif
+
+#ifdef __LITTLE_ENDIAN
+#define CPU_BIG_ENDIAN 0
+#else
+#define CPU_BIG_ENDIAN 1
+#endif
+
+/* type hackery */
+
+#define type_is_exact(_val, _type) \
+ __builtin_types_compatible_p(typeof(_val), _type)
+
+#define type_is(_val, _type) \
+ (__builtin_types_compatible_p(typeof(_val), _type) || \
+ __builtin_types_compatible_p(typeof(_val), const _type))
+
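+/*
+ * Allocate with kmalloc() for small requests or non-GFP_KERNEL allocations,
+ * falling back to vmalloc() for larger GFP_KERNEL allocations; free with
+ * kvfree().
+ */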
+static inline void *kvmalloc(size_t bytes, gfp_t gfp)
+{
+ if (bytes <= PAGE_SIZE ||
+ !(gfp & GFP_KERNEL))
+ return kmalloc(bytes, gfp);
+
+ return ((bytes <= KMALLOC_MAX_SIZE)
+ ? kmalloc(bytes, gfp|__GFP_NOWARN)
+ : NULL) ?:
+ vmalloc(bytes);
+}
+
+#define DECLARE_HEAP(type, name) \
+ struct { \
+ size_t size, used; \
+ type *data; \
+ } name
+
+#define init_heap(heap, _size, gfp) \
+({ \
+ size_t _bytes; \
+ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ _bytes = (heap)->size * sizeof(*(heap)->data); \
+ (heap)->data = kvmalloc(_bytes, (gfp)); \
+ (heap)->data; \
+})
+
+#define free_heap(heap) \
+do { \
+ kvfree((heap)->data); \
+ (heap)->data = NULL; \
+} while (0)
+
+#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
+
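+/*
+ * Note: heap_sift() moves the element at @i down towards the leaves, while
+ * heap_sift_down() moves it up towards the root.
+ */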
+#define heap_sift(h, i, cmp) \
+do { \
+ size_t _r, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
+ _r = _j * 2 + 1; \
+ if (_r + 1 < (h)->used && \
+ cmp((h)->data[_r], (h)->data[_r + 1])) \
+ _r++; \
+ \
+ if (cmp((h)->data[_r], (h)->data[_j])) \
+ break; \
+ heap_swap(h, _r, _j); \
+ } \
+} while (0)
+
+#define heap_sift_down(h, i, cmp) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp((h)->data[i], (h)->data[p])) \
+ break; \
+ heap_swap(h, i, p); \
+ i = p; \
+ } \
+} while (0)
+
+#define heap_add(h, d, cmp) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) { \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_del(h, i, cmp) \
+do { \
+ size_t _i = (i); \
+ \
+ BUG_ON(_i >= (h)->used); \
+ (h)->used--; \
+ heap_swap(h, _i, (h)->used); \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+} while (0)
+
+#define heap_pop(h, d, cmp) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ heap_del(h, 0, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_peek(h) \
+({ \
+ EBUG_ON(!(h)->used); \
+ (h)->data[0]; \
+})
+
+#define heap_full(h) ((h)->used == (h)->size)
+
+#define heap_resort(heap, cmp) \
+do { \
+ ssize_t _i; \
+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
+ heap_sift(heap, _i, cmp); \
+} while (0)
+
+/*
+ * Simple array based allocator - preallocates a fixed number of elements; you
+ * can never allocate more than that, and there is no locking.
+ *
+ * Handy because if you know you only need a fixed number of elements you don't
+ * have to worry about memory allocation failure, and sometimes a mempool isn't
+ * what you want.
+ *
+ * We treat the free elements as entries in a singly linked list, and the
+ * freelist as a stack - allocating and freeing push and pop off the freelist.
+ */
+
+#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
+ struct { \
+ type *freelist; \
+ type data[size]; \
+ } name
+
+#define array_alloc(array) \
+({ \
+ typeof((array)->freelist) _ret = (array)->freelist; \
+ \
+ if (_ret) \
+ (array)->freelist = *((typeof((array)->freelist) *) _ret);\
+ \
+ _ret; \
+})
+
+#define array_free(array, ptr) \
+do { \
+ typeof((array)->freelist) _ptr = ptr; \
+ \
+ *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
+ (array)->freelist = _ptr; \
+} while (0)
+
+#define array_allocator_init(array) \
+do { \
+ typeof((array)->freelist) _i; \
+ \
+ BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
+ (array)->freelist = NULL; \
+ \
+ for (_i = (array)->data; \
+ _i < (array)->data + ARRAY_SIZE((array)->data); \
+ _i++) \
+ array_free(array, _i); \
+} while (0)
+
+#define array_freelist_empty(array) ((array)->freelist == NULL)
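+
+/*
+ * Usage sketch (names are illustrative):
+ *
+ *	DECLARE_ARRAY_ALLOCATOR(struct foo, foo_pool, 16);
+ *
+ *	array_allocator_init(&foo_pool);
+ *	f = array_alloc(&foo_pool);	/* NULL when the freelist is empty */
+ *	if (f)
+ *		array_free(&foo_pool, f);
+ */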
+
+#define ANYSINT_MAX(t) \
+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+int bch_strtoint_h(const char *, int *);
+int bch_strtouint_h(const char *, unsigned int *);
+int bch_strtoll_h(const char *, long long *);
+int bch_strtoull_h(const char *, unsigned long long *);
+
+static inline int bch_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch_strtoint_h(cp, (int *) res);
+#else
+ return bch_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch_strtouint_h(cp, (unsigned int *) res);
+#else
+ return bch_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res) \
+ ( type_is(*res, int) ? bch_strtoint_h(cp, (void *) res)\
+ : type_is(*res, long) ? bch_strtol_h(cp, (void *) res)\
+ : type_is(*res, long long) ? bch_strtoll_h(cp, (void *) res)\
+ : type_is(*res, unsigned) ? bch_strtouint_h(cp, (void *) res)\
+ : type_is(*res, unsigned long) ? bch_strtoul_h(cp, (void *) res)\
+ : type_is(*res, unsigned long long) ? bch_strtoull_h(cp, (void *) res)\
+ : -EINVAL)
+
+#define strtoul_safe(cp, var) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = _v; \
+ _r; \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = clamp_t(typeof(var), _v, min, max); \
+ _r; \
+})
+
+#define strtoul_safe_restrict(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r && _v >= min && _v <= max) \
+ var = _v; \
+ else \
+ _r = -EINVAL; \
+ _r; \
+})
+
+#define snprint(buf, size, var) \
+ snprintf(buf, size, \
+ type_is(var, int) ? "%i\n" \
+ : type_is(var, unsigned) ? "%u\n" \
+ : type_is(var, long) ? "%li\n" \
+ : type_is(var, unsigned long) ? "%lu\n" \
+ : type_is(var, s64) ? "%lli\n" \
+ : type_is(var, u64) ? "%llu\n" \
+ : type_is(var, char *) ? "%s\n" \
+ : "%i\n", var)
+
+ssize_t bch_hprint(char *buf, s64 v);
+
+bool bch_is_zero(const void *, size_t);
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected);
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[]);
+
+struct time_stats {
+ spinlock_t lock;
+ u64 count;
+ /*
+ * all fields are in nanoseconds, averages are ewmas stored left shifted
+ * by 8
+ */
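+	/* (shift right by 8 to recover nanoseconds; cf. __print_time_stat()) */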
+ u64 last_duration;
+ u64 max_duration;
+ u64 average_duration;
+ u64 average_frequency;
+ u64 last;
+};
+
+void bch_time_stats_clear(struct time_stats *stats);
+void __bch_time_stats_update(struct time_stats *stats, u64 time);
+void bch_time_stats_update(struct time_stats *stats, u64 time);
+
+static inline unsigned local_clock_us(void)
+{
+ return local_clock() >> 10;
+}
+
+#define NSEC_PER_ns 1L
+#define NSEC_PER_us NSEC_PER_USEC
+#define NSEC_PER_ms NSEC_PER_MSEC
+#define NSEC_PER_sec NSEC_PER_SEC
+
+#define __print_time_stat(stats, name, stat, units) \
+ sysfs_print(name ## _ ## stat ## _ ## units, \
+ div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
+
+#define sysfs_print_time_stats(stats, name, \
+ frequency_units, \
+ duration_units) \
+do { \
+ __print_time_stat(stats, name, \
+ average_frequency, frequency_units); \
+ __print_time_stat(stats, name, \
+ average_duration, duration_units); \
+ sysfs_print(name ## _ ##count, (stats)->count); \
+ sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \
+ div_u64((stats)->last_duration, \
+ NSEC_PER_ ## duration_units)); \
+ sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
+ div_u64((stats)->max_duration, \
+ NSEC_PER_ ## duration_units)); \
+ \
+ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
+ ? div_s64(local_clock() - (stats)->last, \
+ NSEC_PER_ ## frequency_units) \
+ : -1LL); \
+} while (0)
+
+#define sysfs_clear_time_stats(stats, name) \
+do { \
+ if (attr == &sysfs_ ## name ## _clear) \
+ bch_time_stats_clear(stats); \
+} while (0)
+
+#define sysfs_time_stats_attribute(name, \
+ frequency_units, \
+ duration_units) \
+write_attribute(name ## _clear); \
+read_attribute(name ## _count); \
+read_attribute(name ## _average_frequency_ ## frequency_units); \
+read_attribute(name ## _average_duration_ ## duration_units); \
+read_attribute(name ## _last_duration_ ## duration_units); \
+read_attribute(name ## _max_duration_ ## duration_units); \
+read_attribute(name ## _last_ ## frequency_units)
+
+#define sysfs_time_stats_attribute_list(name, \
+ frequency_units, \
+ duration_units) \
+&sysfs_ ## name ## _clear, \
+&sysfs_ ## name ## _count, \
+&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
+&sysfs_ ## name ## _average_duration_ ## duration_units, \
+&sysfs_ ## name ## _last_duration_ ## duration_units, \
+&sysfs_ ## name ## _max_duration_ ## duration_units, \
+&sysfs_ ## name ## _last_ ## frequency_units,
+
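+/*
+ * Exponentially weighted moving average: decays the old average by 1/2^weight
+ * and mixes in the new sample, i.e. new = old + (val - old) / 2^weight,
+ * computed without a divide.
+ */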
+#define ewma_add(ewma, val, weight) \
+({ \
+ typeof(ewma) _ewma = (ewma); \
+ typeof(weight) _weight = (weight); \
+ \
+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
+})
+
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
+ u64 next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond
+ * The units here correspond to the units passed to
+ * bch_ratelimit_increment()
+ */
+ unsigned rate;
+};
+
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
+{
+ d->next = local_clock();
+}
+
+u64 bch_ratelimit_delay(struct bch_ratelimit *);
+void bch_ratelimit_increment(struct bch_ratelimit *, u64);
+int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *);
+
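+/*
+ * Proportional-differential rate controller: bch_pd_controller_update() takes
+ * a target and a measured value and adjusts rate.rate so the measured value
+ * is steered towards the target; p_term_inverse and d_term scale the
+ * proportional and derivative terms, d_smooth the derivative smoothing.
+ */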
+struct bch_pd_controller {
+ struct bch_ratelimit rate;
+ unsigned long last_update;
+
+ s64 last_actual;
+ s64 smoothed_derivative;
+
+ unsigned p_term_inverse;
+ unsigned d_smooth;
+ unsigned d_term;
+
+ /* for exporting to sysfs (no effect on behavior) */
+ s64 last_derivative;
+ s64 last_proportional;
+ s64 last_change;
+ s64 last_target;
+
+	/*
+	 * If true, the rate will not increase if bch_ratelimit_delay()
+	 * is not being called often enough.
+	 */
+ bool backpressure;
+};
+
+void bch_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
+void bch_pd_controller_init(struct bch_pd_controller *);
+size_t bch_pd_controller_print_debug(struct bch_pd_controller *, char *);
+
+#define sysfs_pd_controller_attribute(name) \
+ rw_attribute(name##_rate); \
+ rw_attribute(name##_rate_bytes); \
+ rw_attribute(name##_rate_d_term); \
+ rw_attribute(name##_rate_p_term_inverse); \
+ read_attribute(name##_rate_debug)
+
+#define sysfs_pd_controller_files(name) \
+ &sysfs_##name##_rate, \
+ &sysfs_##name##_rate_bytes, \
+ &sysfs_##name##_rate_d_term, \
+ &sysfs_##name##_rate_p_term_inverse, \
+ &sysfs_##name##_rate_debug
+
+#define sysfs_pd_controller_show(name, var) \
+do { \
+ sysfs_hprint(name##_rate, (var)->rate.rate); \
+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \
+ sysfs_print(name##_rate_d_term, (var)->d_term); \
+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
+ \
+ if (attr == &sysfs_##name##_rate_debug) \
+ return bch_pd_controller_print_debug(var, buf); \
+} while (0)
+
+#define sysfs_pd_controller_store(name, var) \
+do { \
+ sysfs_strtoul_clamp(name##_rate, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul_clamp(name##_rate_bytes, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
+ (var)->p_term_inverse, 1, INT_MAX); \
+} while (0)
+
+#define __DIV_SAFE(n, d, zero) \
+({ \
+ typeof(n) _n = (n); \
+ typeof(d) _d = (d); \
+ _d ? _n / _d : zero; \
+})
+
+#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
+
+#define container_of_or_null(ptr, type, member) \
+({ \
+ typeof(ptr) _ptr = ptr; \
+ _ptr ? container_of(_ptr, type, member) : NULL; \
+})
+
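+/*
+ * rb tree helpers: RB_INSERT() evaluates to 0 on success and -1 if a node
+ * comparing equal already exists (nothing is inserted in that case);
+ * RB_SEARCH() returns an exact match or NULL, RB_GREATER() the smallest node
+ * strictly greater than the search key or NULL.
+ */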
+#define RB_INSERT(root, new, member, cmp) \
+({ \
+ __label__ dup; \
+ struct rb_node **n = &(root)->rb_node, *parent = NULL; \
+ typeof(new) this; \
+ int res, ret = -1; \
+ \
+ while (*n) { \
+ parent = *n; \
+ this = container_of(*n, typeof(*(new)), member); \
+ res = cmp(new, this); \
+ if (!res) \
+ goto dup; \
+ n = res < 0 \
+ ? &(*n)->rb_left \
+ : &(*n)->rb_right; \
+ } \
+ \
+ rb_link_node(&(new)->member, parent, n); \
+ rb_insert_color(&(new)->member, root); \
+ ret = 0; \
+dup: \
+ ret; \
+})
+
+#define RB_SEARCH(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (!res) { \
+ ret = this; \
+ break; \
+ } \
+ n = res < 0 \
+ ? n->rb_left \
+ : n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_GREATER(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (res < 0) { \
+ ret = this; \
+ n = n->rb_left; \
+ } else \
+ n = n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_FIRST(root, type, member) \
+ container_of_or_null(rb_first(root), type, member)
+
+#define RB_LAST(root, type, member) \
+ container_of_or_null(rb_last(root), type, member)
+
+#define RB_NEXT(ptr, member) \
+ container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+
+#define RB_PREV(ptr, member) \
+ container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+
+/* Does linear interpolation between powers of two */
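+/*
+ * i.e. treating the low fract_bits of x as a fraction f and the high bits as
+ * an integer n, this returns approximately 2^n * (1 + f)
+ */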
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+ unsigned fract = x & ~(~0 << fract_bits);
+
+ x >>= fract_bits;
+ x = 1 << x;
+ x += (x * fract) >> fract_bits;
+
+ return x;
+}
+
+void bch_bio_map(struct bio *bio, void *base);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl) \
+do { \
+ closure_get(cl); \
+ generic_make_request(bio); \
+} while (0)
+
+#define closure_bio_submit_punt(bio, cl, c) \
+do { \
+ closure_get(cl); \
+ bch_generic_make_request(bio, c); \
+} while (0)
+
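+/*
+ * Sleep (freezably) until cond becomes true or kthread_should_stop();
+ * evaluates to 0 if cond became true, -1 if the thread was asked to stop.
+ */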
+#define kthread_wait_freezable(cond) \
+({ \
+ int _ret = 0; \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ try_to_freeze(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
+size_t bch_rand_range(size_t);
+
+void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+
+static inline void __memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+ asm volatile("rep ; movsq"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+#endif
+}
+
+static inline void memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
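+	/* memcpy() semantics: the ranges must not overlap */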
+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
+ dst + u64s * sizeof(u64) <= src));
+
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down(dst, src, u64s);
+}
+
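+/*
+ * Copies starting from the highest u64, so overlapping moves with dst > src
+ * are safe; the x86-64 version sets the direction flag to run movsq backwards.
+ */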
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s - 1;
+ u64 *src = (u64 *) _src + u64s - 1;
+
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+ asm volatile("std ;\n"
+ "rep ; movsq\n"
+ "cld ;\n"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ while (u64s--)
+ *dst-- = *src--;
+#endif
+}
+
+static inline void memmove_u64s_up(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+static inline void memmove_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+ if (dst < src)
+ __memmove_u64s_down(dst, src, u64s);
+ else
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+#endif /* _BCACHE_UTIL_H */
diff --git a/libbcache/writeback.c b/libbcache/writeback.c
new file mode 100644
index 0000000..600bfbf
--- /dev/null
+++ b/libbcache/writeback.c
@@ -0,0 +1,657 @@
+/*
+ * background writeback - scan btree for dirty data and write it to the backing
+ * device
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "keybuf.h"
+#include "keylist.h"
+#include "writeback.h"
+
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+/* Rate limiting */
+
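+/*
+ * Each backing device gets a share of the cache set's dirty budget
+ * (capacity * writeback_percent / 100) proportional to its size; the PD
+ * controller is then fed that target and the device's current dirty sector
+ * count (both in bytes) to steer the writeback rate.
+ */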
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+ struct cache_set *c = dc->disk.c;
+ u64 cache_dirty_target =
+ div_u64(c->capacity * dc->writeback_percent, 100);
+ s64 target = div64_u64(cache_dirty_target *
+ bdev_sectors(dc->disk_sb.bdev),
+ c->cached_dev_sectors);
+ s64 dirty = bcache_dev_sectors_dirty(&dc->disk);
+
+ bch_pd_controller_update(&dc->writeback_pd, target << 9,
+ dirty << 9, -1);
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+ struct cached_dev *dc = container_of(to_delayed_work(work),
+ struct cached_dev,
+ writeback_pd_update);
+
+ down_read(&dc->writeback_lock);
+
+ if (atomic_read(&dc->has_dirty) &&
+ dc->writeback_percent &&
+ !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ __update_writeback_rate(dc);
+ else
+ dc->writeback_pd.rate.rate = UINT_MAX;
+
+ up_read(&dc->writeback_lock);
+
+ schedule_delayed_work(&dc->writeback_pd_update,
+ dc->writeback_pd_update_seconds * HZ);
+}
+
+struct dirty_io {
+ struct closure cl;
+ struct bch_replace_info replace;
+ struct cached_dev *dc;
+ struct cache *ca;
+ struct keybuf_key *w;
+ struct bch_extent_ptr ptr;
+ int error;
+ bool from_mempool;
+ /* Must be last */
+ struct bio bio;
+};
+
+#define DIRTY_IO_MEMPOOL_BVECS 64
+#define DIRTY_IO_MEMPOOL_SECTORS (DIRTY_IO_MEMPOOL_BVECS * PAGE_SECTORS)
+
+static void dirty_init(struct dirty_io *io)
+{
+ struct bio *bio = &io->bio;
+
+ bio_init(bio);
+ if (!io->dc->writeback_percent)
+ bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ bio->bi_iter.bi_size = io->replace.key.k.size << 9;
+ bio->bi_max_vecs =
+ DIV_ROUND_UP(io->replace.key.k.size, PAGE_SECTORS);
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+}
+
+static void dirty_io_destructor(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+ if (io->from_mempool)
+ mempool_free(io, &io->dc->writeback_io_pool);
+ else
+ kfree(io);
+}
+
+static void write_dirty_finish(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ struct cached_dev *dc = io->dc;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, &io->bio, i)
+ mempool_free(bv->bv_page, &dc->writeback_page_pool);
+
+ if (!io->error) {
+ BKEY_PADDED(k) tmp;
+ int ret;
+
+ bkey_copy(&tmp.k, &io->replace.key);
+ io->replace.hook.fn = bch_extent_cmpxchg;
+ bkey_extent_set_cached(&tmp.k.k, true);
+
+ ret = bch_btree_insert(dc->disk.c, BTREE_ID_EXTENTS, &tmp.k,
+ NULL, &io->replace.hook, NULL, 0);
+ if (io->replace.successes == 0)
+ trace_bcache_writeback_collision(&io->replace.key.k);
+
+ atomic_long_inc(ret
+ ? &dc->disk.c->writeback_keys_failed
+ : &dc->disk.c->writeback_keys_done);
+ }
+
+ bch_keybuf_put(&dc->writeback_keys, io->w);
+
+ closure_return_with_destructor(cl, dirty_io_destructor);
+}
+
+static void dirty_endio(struct bio *bio)
+{
+ struct dirty_io *io = container_of(bio, struct dirty_io, bio);
+
+ if (bio->bi_error) {
+ trace_bcache_writeback_error(&io->replace.key.k,
+ op_is_write(bio_op(&io->bio)),
+ bio->bi_error);
+ io->error = bio->bi_error;
+ }
+
+ closure_put(&io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+ if (!io->error) {
+ dirty_init(io);
+ bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+ io->bio.bi_iter.bi_sector =
+ bkey_start_offset(&io->replace.key.k);
+ io->bio.bi_bdev = io->dc->disk_sb.bdev;
+ io->bio.bi_end_io = dirty_endio;
+
+ closure_bio_submit(&io->bio, cl);
+ }
+
+ continue_at(cl, write_dirty_finish, io->dc->disk.c->wq);
+}
+
+static void read_dirty_endio(struct bio *bio)
+{
+ struct dirty_io *io = container_of(bio, struct dirty_io, bio);
+
+ cache_nonfatal_io_err_on(bio->bi_error, io->ca, "writeback read");
+
+ bch_account_io_completion(io->ca);
+
+ if (ptr_stale(io->ca, &io->ptr))
+ bio->bi_error = -EINTR;
+
+ dirty_endio(bio);
+}
+
+static void read_dirty_submit(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+ closure_bio_submit(&io->bio, cl);
+
+ continue_at(cl, write_dirty, system_freezable_wq);
+}
+
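+/*
+ * The writeback IO path: rate limited; for each dirty extent in the keybuf,
+ * read the data from the cache device, write it out to the backing device
+ * (write_dirty()), then replace the dirty extent with a cached one via a
+ * cmpxchg btree update (write_dirty_finish()).
+ */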
+static u64 read_dirty(struct cached_dev *dc)
+{
+ struct keybuf_key *w;
+ struct dirty_io *io;
+ struct closure cl;
+ unsigned i;
+ struct bio_vec *bv;
+ u64 sectors_written = 0;
+ BKEY_PADDED(k) tmp;
+
+ closure_init_stack(&cl);
+
+ while (!bch_ratelimit_wait_freezable_stoppable(&dc->writeback_pd.rate)) {
+ w = bch_keybuf_next(&dc->writeback_keys);
+ if (!w)
+ break;
+
+ sectors_written += w->key.k.size;
+ bkey_copy(&tmp.k, &w->key);
+
+ while (tmp.k.k.size) {
+ struct extent_pick_ptr pick;
+
+ bch_extent_pick_ptr(dc->disk.c,
+ bkey_i_to_s_c(&tmp.k),
+ &pick);
+ if (IS_ERR_OR_NULL(pick.ca))
+ break;
+
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(tmp.k.k.size,
+ PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io) {
+ trace_bcache_writeback_alloc_fail(pick.ca->set,
+ tmp.k.k.size);
+ io = mempool_alloc(&dc->writeback_io_pool,
+ GFP_KERNEL);
+ memset(io, 0, sizeof(*io) +
+ sizeof(struct bio_vec) *
+ DIRTY_IO_MEMPOOL_BVECS);
+ io->from_mempool = true;
+
+ bkey_copy(&io->replace.key, &tmp.k);
+
+ if (DIRTY_IO_MEMPOOL_SECTORS <
+ io->replace.key.k.size)
+ bch_key_resize(&io->replace.key.k,
+ DIRTY_IO_MEMPOOL_SECTORS);
+ } else {
+ bkey_copy(&io->replace.key, &tmp.k);
+ }
+
+ io->dc = dc;
+ io->ca = pick.ca;
+ io->w = w;
+ io->ptr = pick.ptr;
+ atomic_inc(&w->ref);
+
+ dirty_init(io);
+ bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_iter.bi_sector = pick.ptr.offset;
+ io->bio.bi_bdev = pick.ca->disk_sb.bdev;
+ io->bio.bi_end_io = read_dirty_endio;
+
+ bio_for_each_segment_all(bv, &io->bio, i) {
+ bv->bv_page =
+ mempool_alloc(&dc->writeback_page_pool,
+ i ? GFP_NOWAIT
+ : GFP_KERNEL);
+ if (!bv->bv_page) {
+ BUG_ON(!i);
+ io->bio.bi_vcnt = i;
+
+ io->bio.bi_iter.bi_size =
+ io->bio.bi_vcnt * PAGE_SIZE;
+
+ bch_key_resize(&io->replace.key.k,
+ bio_sectors(&io->bio));
+ break;
+ }
+ }
+
+ bch_cut_front(io->replace.key.k.p, &tmp.k);
+ trace_bcache_writeback(&io->replace.key.k);
+
+ bch_ratelimit_increment(&dc->writeback_pd.rate,
+ io->replace.key.k.size << 9);
+
+ closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ }
+
+ bch_keybuf_put(&dc->writeback_keys, w);
+ }
+
+ /*
+ * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+ * freed) before refilling again
+ */
+ closure_sync(&cl);
+
+ return sectors_written;
+}
+
+/* Scan for dirty data */
+
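+/*
+ * Track dirty sectors per stripe; nr_sectors may be negative when sectors go
+ * clean. A stripe whose count reaches stripe_size is marked in
+ * full_dirty_stripes so refill_full_stripes() can find it cheaply.
+ */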
+static void __bcache_dev_sectors_dirty_add(struct bcache_device *d,
+ u64 offset, int nr_sectors)
+{
+ unsigned stripe_offset, stripe, sectors_dirty;
+
+ if (!d)
+ return;
+
+ if (!d->stripe_sectors_dirty)
+ return;
+
+ stripe = offset_to_stripe(d, offset);
+ stripe_offset = offset & (d->stripe_size - 1);
+
+ while (nr_sectors) {
+ int s = min_t(unsigned, abs(nr_sectors),
+ d->stripe_size - stripe_offset);
+
+ if (nr_sectors < 0)
+ s = -s;
+
+ if (stripe >= d->nr_stripes)
+ return;
+
+ sectors_dirty = atomic_add_return(s,
+ d->stripe_sectors_dirty + stripe);
+ if (sectors_dirty == d->stripe_size)
+ set_bit(stripe, d->full_dirty_stripes);
+ else
+ clear_bit(stripe, d->full_dirty_stripes);
+
+ nr_sectors -= s;
+ stripe_offset = 0;
+ stripe++;
+ }
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+ u64 offset, int nr_sectors)
+{
+ struct bcache_device *d;
+
+ rcu_read_lock();
+ d = bch_dev_find(c, inode);
+ if (d)
+ __bcache_dev_sectors_dirty_add(d, offset, nr_sectors);
+ rcu_read_unlock();
+}
+
+static bool dirty_pred(struct keybuf *buf, struct bkey_s_c k)
+{
+ struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
+
+ BUG_ON(k.k->p.inode != bcache_dev_inum(&dc->disk));
+
+ return bkey_extent_is_data(k.k) &&
+ !bkey_extent_is_cached(k.k);
+}
+
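+/*
+ * For devices where partial stripe writes are expensive: scan the
+ * full_dirty_stripes bitmap and refill the keybuf from completely dirty
+ * stripes first, wrapping around from where the last scan left off.
+ */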
+static void refill_full_stripes(struct cached_dev *dc)
+{
+ struct keybuf *buf = &dc->writeback_keys;
+ unsigned inode = bcache_dev_inum(&dc->disk);
+ unsigned start_stripe, stripe, next_stripe;
+ bool wrapped = false;
+
+ stripe = offset_to_stripe(&dc->disk, buf->last_scanned.offset);
+
+ if (stripe >= dc->disk.nr_stripes)
+ stripe = 0;
+
+ start_stripe = stripe;
+
+ while (1) {
+ stripe = find_next_bit(dc->disk.full_dirty_stripes,
+ dc->disk.nr_stripes, stripe);
+
+ if (stripe == dc->disk.nr_stripes)
+ goto next;
+
+ next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
+ dc->disk.nr_stripes, stripe);
+
+ buf->last_scanned = POS(inode,
+ stripe * dc->disk.stripe_size);
+
+ bch_refill_keybuf(dc->disk.c, buf,
+ POS(inode,
+ next_stripe * dc->disk.stripe_size),
+ dirty_pred);
+
+ if (array_freelist_empty(&buf->freelist))
+ return;
+
+ stripe = next_stripe;
+next:
+ if (wrapped && stripe > start_stripe)
+ return;
+
+ if (stripe == dc->disk.nr_stripes) {
+ stripe = 0;
+ wrapped = true;
+ }
+ }
+}
+
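+/*
+ * One writeback pass: repeatedly refill the keybuf with dirty extents for this
+ * device (full stripes first when partial stripe writes are expensive) and
+ * write them back; once no dirty keys remain, clear has_dirty and mark the
+ * backing device clean in its superblock.
+ */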
+static u64 bch_writeback(struct cached_dev *dc)
+{
+ struct keybuf *buf = &dc->writeback_keys;
+ unsigned inode = bcache_dev_inum(&dc->disk);
+ struct bpos start = POS(inode, 0);
+ struct bpos end = POS(inode, KEY_OFFSET_MAX);
+ struct bpos start_pos;
+ u64 sectors_written = 0;
+
+ buf->last_scanned = POS(inode, 0);
+
+ while (bkey_cmp(buf->last_scanned, end) < 0 &&
+ !kthread_should_stop()) {
+ down_write(&dc->writeback_lock);
+
+ if (!atomic_read(&dc->has_dirty)) {
+ up_write(&dc->writeback_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop())
+ return sectors_written;
+
+ schedule();
+ try_to_freeze();
+ return sectors_written;
+ }
+
+ if (bkey_cmp(buf->last_scanned, end) >= 0)
+ buf->last_scanned = POS(inode, 0);
+
+ if (dc->partial_stripes_expensive) {
+ refill_full_stripes(dc);
+ if (array_freelist_empty(&buf->freelist))
+ goto refill_done;
+ }
+
+ start_pos = buf->last_scanned;
+ bch_refill_keybuf(dc->disk.c, buf, end, dirty_pred);
+
+ if (bkey_cmp(buf->last_scanned, end) >= 0) {
+ /*
+			 * If we get to the end, start scanning again from the
+ * beginning, and only scan up to where we initially
+ * started scanning from:
+ */
+ buf->last_scanned = start;
+ bch_refill_keybuf(dc->disk.c, buf, start_pos,
+ dirty_pred);
+ }
+
+ if (RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+ atomic_set(&dc->has_dirty, 0);
+ cached_dev_put(dc);
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
+ bch_write_bdev_super(dc, NULL);
+ }
+
+refill_done:
+ up_write(&dc->writeback_lock);
+
+ bch_ratelimit_reset(&dc->writeback_pd.rate);
+ sectors_written += read_dirty(dc);
+ }
+
+ return sectors_written;
+}
+
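+/*
+ * Writeback thread: each iteration runs one bch_writeback() pass; if the pass
+ * wrote back less than 1/16th of the cache's capacity, wait on the write IO
+ * clock until another 1/32nd of the capacity has been written before starting
+ * the next pass.
+ */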
+static int bch_writeback_thread(void *arg)
+{
+ struct cached_dev *dc = arg;
+ struct cache_set *c = dc->disk.c;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last;
+ u64 sectors_written;
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(dc->writeback_running ||
+ test_bit(BCACHE_DEV_DETACHING,
+ &dc->disk.flags)))
+ break;
+
+ last = atomic_long_read(&clock->now);
+
+ sectors_written = bch_writeback(dc);
+
+ if (sectors_written < c->capacity >> 4)
+ bch_kthread_io_clock_wait(clock,
+ last + (c->capacity >> 5));
+ }
+
+ return 0;
+}
+
+/**
+ * bch_writeback_recalc_oldest_gens - update oldest_gen pointers from writeback keys
+ *
+ * This prevents us from wrapping around gens for a bucket only referenced from
+ * writeback keybufs. We don't actually care that the data in those buckets is
+ * marked live, only that we don't wrap the gens.
+ */
+void bch_writeback_recalc_oldest_gens(struct cache_set *c)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
+ struct bcache_device *d;
+ struct cached_dev *dc;
+
+ d = radix_tree_deref_slot(slot);
+
+ if (!CACHED_DEV(&d->inode.v))
+ continue;
+ dc = container_of(d, struct cached_dev, disk);
+
+ bch_keybuf_recalc_oldest_gens(c, &dc->writeback_keys);
+ }
+
+ rcu_read_unlock();
+}
+
+/* Init */
+
+void bch_sectors_dirty_init(struct cached_dev *dc, struct cache_set *c)
+{
+ struct bcache_device *d = &dc->disk;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ /*
+ * We have to do this before the disk is added to the radix tree or we
+ * race with moving GC
+ */
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(bcache_dev_inum(d), 0), k) {
+ if (k.k->p.inode > bcache_dev_inum(d))
+ break;
+
+ if (bkey_extent_is_data(k.k) &&
+ !bkey_extent_is_cached(k.k))
+ __bcache_dev_sectors_dirty_add(d,
+ bkey_start_offset(k.k),
+ k.k->size);
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ bch_btree_iter_unlock(&iter);
+
+ dc->writeback_pd.last_actual = bcache_dev_sectors_dirty(d);
+}
+
+void bch_cached_dev_writeback_stop(struct cached_dev *dc)
+{
+ cancel_delayed_work_sync(&dc->writeback_pd_update);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+ kthread_stop(dc->writeback_thread);
+ dc->writeback_thread = NULL;
+ }
+}
+
+void bch_cached_dev_writeback_free(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+
+ mempool_exit(&dc->writeback_page_pool);
+ mempool_exit(&dc->writeback_io_pool);
+ kvfree(d->full_dirty_stripes);
+ kvfree(d->stripe_sectors_dirty);
+}
+
+int bch_cached_dev_writeback_init(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+ sector_t sectors;
+ size_t n;
+
+ sectors = get_capacity(dc->disk.disk);
+
+ if (!d->stripe_size) {
+#ifdef CONFIG_BCACHE_DEBUG
+ d->stripe_size = 1 << 0;
+#else
+ d->stripe_size = 1 << 31;
+#endif
+ }
+
+ pr_debug("stripe size: %d sectors", d->stripe_size);
+ d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+
+ if (!d->nr_stripes ||
+ d->nr_stripes > INT_MAX ||
+ d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+ pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
+ (unsigned)d->nr_stripes);
+ return -ENOMEM;
+ }
+
+ n = d->nr_stripes * sizeof(atomic_t);
+ d->stripe_sectors_dirty = n < PAGE_SIZE << 6
+ ? kzalloc(n, GFP_KERNEL)
+ : vzalloc(n);
+ if (!d->stripe_sectors_dirty) {
+ pr_err("cannot allocate stripe_sectors_dirty");
+ return -ENOMEM;
+ }
+
+ n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
+ d->full_dirty_stripes = n < PAGE_SIZE << 6
+ ? kzalloc(n, GFP_KERNEL)
+ : vzalloc(n);
+ if (!d->full_dirty_stripes) {
+ pr_err("cannot allocate full_dirty_stripes");
+ return -ENOMEM;
+ }
+
+ if (mempool_init_kmalloc_pool(&dc->writeback_io_pool, 4,
+ sizeof(struct dirty_io) +
+ sizeof(struct bio_vec) *
+ DIRTY_IO_MEMPOOL_BVECS) ||
+ mempool_init_page_pool(&dc->writeback_page_pool,
+ (64 << 10) / PAGE_SIZE, 0))
+ return -ENOMEM;
+
+ init_rwsem(&dc->writeback_lock);
+ bch_keybuf_init(&dc->writeback_keys);
+
+ dc->writeback_metadata = true;
+ dc->writeback_running = true;
+ dc->writeback_percent = 10;
+ dc->writeback_pd_update_seconds = 5;
+
+ bch_pd_controller_init(&dc->writeback_pd);
+ INIT_DELAYED_WORK(&dc->writeback_pd_update, update_writeback_rate);
+
+ return 0;
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
+ dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+ "bcache_writeback");
+ if (IS_ERR(dc->writeback_thread))
+ return PTR_ERR(dc->writeback_thread);
+
+ schedule_delayed_work(&dc->writeback_pd_update,
+ dc->writeback_pd_update_seconds * HZ);
+
+ bch_writeback_queue(dc);
+
+ return 0;
+}
diff --git a/libbcache/writeback.h b/libbcache/writeback.h
new file mode 100644
index 0000000..77e5965
--- /dev/null
+++ b/libbcache/writeback.h
@@ -0,0 +1,100 @@
+#ifndef _BCACHE_WRITEBACK_H
+#define _BCACHE_WRITEBACK_H
+
+#include "blockdev.h"
+#include "buckets.h"
+
+#define CUTOFF_WRITEBACK 60
+#define CUTOFF_WRITEBACK_SYNC 30
+
+static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
+{
+ uint64_t i, ret = 0;
+
+ for (i = 0; i < d->nr_stripes; i++)
+ ret += atomic_read(d->stripe_sectors_dirty + i);
+
+ return ret;
+}
+
+static inline unsigned offset_to_stripe(struct bcache_device *d,
+ uint64_t offset)
+{
+ do_div(offset, d->stripe_size);
+ return offset;
+}
+
+static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
+ uint64_t offset,
+ unsigned nr_sectors)
+{
+ unsigned stripe = offset_to_stripe(&dc->disk, offset);
+
+ while (1) {
+ if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
+ return true;
+
+ if (nr_sectors <= dc->disk.stripe_size)
+ return false;
+
+ nr_sectors -= dc->disk.stripe_size;
+ stripe++;
+ }
+}
+
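+/*
+ * Decide whether a write goes through writeback caching: refuse when the cache
+ * mode isn't writeback, the device is detaching, or available sectors are
+ * below CUTOFF_WRITEBACK_SYNC percent of capacity; accept writes to
+ * already-dirty stripes on partial-stripe-expensive devices; refuse bios we
+ * would otherwise skip; accept REQ_SYNC bios, or anything else while available
+ * sectors are below CUTOFF_WRITEBACK percent of capacity.
+ */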
+static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
+ unsigned cache_mode, bool would_skip)
+{
+ struct cache_set *c = dc->disk.c;
+ u64 available = sectors_available(c);
+
+ if (cache_mode != CACHE_MODE_WRITEBACK ||
+ test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+ available * 100 < c->capacity * CUTOFF_WRITEBACK_SYNC)
+ return false;
+
+ if (dc->partial_stripes_expensive &&
+ bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
+ bio_sectors(bio)))
+ return true;
+
+ if (would_skip)
+ return false;
+
+ return bio->bi_opf & REQ_SYNC ||
+ available * 100 < c->capacity * CUTOFF_WRITEBACK;
+}
+
+static inline void bch_writeback_queue(struct cached_dev *dc)
+{
+ if (!IS_ERR_OR_NULL(dc->writeback_thread))
+ wake_up_process(dc->writeback_thread);
+}
+
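+/*
+ * Called when dirty data is added: on the 0 -> 1 transition of has_dirty, take
+ * a ref on the cached_dev, mark the backing device dirty in its superblock and
+ * wake the writeback thread.
+ */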
+static inline void bch_writeback_add(struct cached_dev *dc)
+{
+ if (!atomic_read(&dc->has_dirty) &&
+ !atomic_xchg(&dc->has_dirty, 1)) {
+ atomic_inc(&dc->count);
+
+ if (BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_DIRTY) {
+ SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_DIRTY);
+ /* XXX: should do this synchronously */
+ bch_write_bdev_super(dc, NULL);
+ }
+
+ bch_writeback_queue(dc);
+ }
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, u64, int);
+
+void bch_writeback_recalc_oldest_gens(struct cache_set *);
+void bch_sectors_dirty_init(struct cached_dev *, struct cache_set *c);
+
+void bch_cached_dev_writeback_stop(struct cached_dev *);
+void bch_cached_dev_writeback_free(struct cached_dev *);
+int bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
+
+#endif
diff --git a/libbcache/xattr.c b/libbcache/xattr.c
new file mode 100644
index 0000000..e9e0a9a
--- /dev/null
+++ b/libbcache/xattr.c
@@ -0,0 +1,379 @@
+
+#include "bcache.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "str_hash.h"
+#include "xattr.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+#include <crypto/hash.h>
+
+struct xattr_search_key {
+ u8 type;
+ struct qstr name;
+};
+
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
+ { .type = _type, .name = QSTR_INIT(_name, _len) })
+
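+/*
+ * Xattrs are indexed by a hash of the xattr type byte plus the name, seeded
+ * with the inode's string hash seed; SHA1 is special cased to go through the
+ * crypto API, the other hash types use the bch_str_hash_*() helpers.
+ */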
+static u64 bch_xattr_hash(const struct bch_hash_info *info,
+ const struct xattr_search_key *key)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_SHA1: {
+ SHASH_DESC_ON_STACK(desc, bch_sha1);
+ u8 digest[SHA1_DIGEST_SIZE];
+ u64 ret;
+
+ desc->tfm = bch_sha1;
+ desc->flags = 0;
+ crypto_shash_init(desc);
+
+ crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
+
+ crypto_shash_update(desc, (void *) &key->type, sizeof(key->type));
+ crypto_shash_update(desc, (void *) key->name.name, key->name.len);
+
+ crypto_shash_final(desc, digest);
+ memcpy(&ret, &digest, sizeof(ret));
+ return ret >> 1;
+ }
+ default: {
+ struct bch_str_hash_ctx ctx;
+
+ bch_str_hash_init(&ctx, info->type);
+ bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
+
+ bch_str_hash_update(&ctx, info->type, &key->type, sizeof(key->type));
+ bch_str_hash_update(&ctx, info->type, key->name.name, key->name.len);
+
+ return bch_str_hash_end(&ctx, info->type);
+ }
+ }
+}
+
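+/* xattr values are stored inline, immediately after the (unterminated) name */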
+#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
+
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch_xattr_hash(info, key);
+}
+
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
+
+ return bch_xattr_hash(info,
+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+}
+
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ const struct xattr_search_key *r = _r;
+
+ return l.v->x_type != r->type ||
+ l.v->x_name_len != r->name.len ||
+ memcmp(l.v->x_name, r->name.name, r->name.len);
+}
+
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
+
+ return l.v->x_type != r.v->x_type ||
+ l.v->x_name_len != r.v->x_name_len ||
+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+}
+
+static const struct bch_hash_desc xattr_hash_desc = {
+ .btree_id = BTREE_ID_XATTRS,
+ .key_type = BCH_XATTR,
+ .whiteout_type = BCH_XATTR_WHITEOUT,
+ .hash_key = xattr_hash_key,
+ .hash_bkey = xattr_hash_bkey,
+ .cmp_key = xattr_cmp_key,
+ .cmp_bkey = xattr_cmp_bkey,
+};
+
+static const char *bch_xattr_invalid(const struct cache_set *c,
+ struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_XATTR:
+ return bkey_val_bytes(k.k) < sizeof(struct bch_xattr)
+ ? "value too small"
+ : NULL;
+
+ case BCH_XATTR_WHITEOUT:
+ return bkey_val_bytes(k.k) != 0
+ ? "value size should be zero"
+ : NULL;
+
+ default:
+ return "invalid type";
+ }
+}
+
+static void bch_xattr_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_xattr xattr;
+ int n;
+
+ switch (k.k->type) {
+ case BCH_XATTR:
+ xattr = bkey_s_c_to_xattr(k);
+
+ if (size) {
+ n = min_t(unsigned, size, xattr.v->x_name_len);
+ memcpy(buf, xattr.v->x_name, n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ n = scnprintf(buf, size, " -> ");
+ buf += n;
+ size -= n;
+
+ if (size) {
+ n = min_t(unsigned, size,
+ le16_to_cpu(xattr.v->x_val_len));
+ memcpy(buf, xattr_val(xattr.v), n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ break;
+ case BCH_XATTR_WHITEOUT:
+ scnprintf(buf, size, "whiteout");
+ break;
+ }
+}
+
+const struct bkey_ops bch_bkey_xattr_ops = {
+ .key_invalid = bch_xattr_invalid,
+ .val_to_text = bch_xattr_to_text,
+};
+
+int bch_xattr_get(struct cache_set *c, struct inode *inode,
+ const char *name, void *buffer, size_t size, int type)
+{
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_xattr xattr;
+ int ret;
+
+ k = bch_hash_lookup(xattr_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &iter,
+ &X_SEARCH(type, name, strlen(name)));
+ if (IS_ERR(k.k))
+ return bch_btree_iter_unlock(&iter) ?: -ENODATA;
+
+ xattr = bkey_s_c_to_xattr(k);
+ ret = le16_to_cpu(xattr.v->x_val_len);
+ if (buffer) {
+ if (ret > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, xattr_val(xattr.v), ret);
+ }
+
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+int bch_xattr_set(struct cache_set *c, struct inode *inode,
+ const char *name, const void *value, size_t size,
+ int flags, int type)
+{
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
+ int ret;
+
+ if (!value) {
+ ret = bch_hash_delete(xattr_hash_desc, &ei->str_hash,
+ c, ei->vfs_inode.i_ino,
+ &ei->journal_seq, &search);
+ } else {
+ struct bkey_i_xattr *xattr;
+ unsigned u64s = BKEY_U64s +
+ DIV_ROUND_UP(sizeof(struct bch_xattr) +
+ search.name.len + size,
+ sizeof(u64));
+
+ if (u64s > U8_MAX)
+ return -ERANGE;
+
+ xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+ if (!xattr)
+ return -ENOMEM;
+
+ bkey_xattr_init(&xattr->k_i);
+ xattr->k.u64s = u64s;
+ xattr->v.x_type = type;
+ xattr->v.x_name_len = search.name.len;
+ xattr->v.x_val_len = cpu_to_le16(size);
+ memcpy(xattr->v.x_name, search.name.name, search.name.len);
+ memcpy(xattr_val(&xattr->v), value, size);
+
+ ret = bch_hash_set(xattr_hash_desc, &ei->str_hash, c,
+ ei->vfs_inode.i_ino, &ei->journal_seq,
+ &xattr->k_i,
+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ kfree(xattr);
+ }
+
+ if (ret == -ENOENT)
+ ret = flags & XATTR_REPLACE ? -ENODATA : 0;
+
+ return ret;
+}
+
+static const struct xattr_handler *bch_xattr_type_to_handler(unsigned);
+
+static size_t bch_xattr_emit(struct dentry *dentry,
+ const struct bch_xattr *xattr,
+ char *buffer, size_t buffer_size)
+{
+ const struct xattr_handler *handler =
+ bch_xattr_type_to_handler(xattr->x_type);
+
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ const size_t prefix_len = strlen(prefix);
+ const size_t total_len = prefix_len + xattr->x_name_len + 1;
+
+ if (buffer && total_len <= buffer_size) {
+ memcpy(buffer, prefix, prefix_len);
+ memcpy(buffer + prefix_len,
+ xattr->x_name, xattr->x_name_len);
+ buffer[prefix_len + xattr->x_name_len] = '\0';
+ }
+
+ return total_len;
+ } else {
+ return 0;
+ }
+}
+
+ssize_t bch_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct cache_set *c = dentry->d_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_xattr *xattr;
+ u64 inum = dentry->d_inode->i_ino;
+ ssize_t ret = 0;
+ size_t len;
+
+ for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) {
+ BUG_ON(k.k->p.inode < inum);
+
+ if (k.k->p.inode > inum)
+ break;
+
+ if (k.k->type != BCH_XATTR)
+ continue;
+
+ xattr = bkey_s_c_to_xattr(k).v;
+
+ len = bch_xattr_emit(dentry, xattr, buffer, buffer_size);
+ if (buffer) {
+ if (len > buffer_size) {
+ bch_btree_iter_unlock(&iter);
+ return -ERANGE;
+ }
+
+ buffer += len;
+ buffer_size -= len;
+ }
+
+ ret += len;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+static int bch_xattr_get_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return bch_xattr_get(c, inode, name, buffer, size, handler->flags);
+}
+
+static int bch_xattr_set_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ return bch_xattr_set(c, inode, name, value, size, flags,
+ handler->flags);
+}
+
+static const struct xattr_handler bch_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_USER,
+};
+
+static bool bch_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = bch_xattr_trusted_list,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_SECURITY,
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &posix_acl_access_xattr_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
+ [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
+const struct xattr_handler *bch_xattr_handlers[] = {
+ &bch_xattr_user_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &bch_xattr_trusted_handler,
+ &bch_xattr_security_handler,
+ NULL
+};
+
+static const struct xattr_handler *bch_xattr_type_to_handler(unsigned type)
+{
+ return type < ARRAY_SIZE(bch_xattr_handler_map)
+ ? bch_xattr_handler_map[type]
+ : NULL;
+}
diff --git a/libbcache/xattr.h b/libbcache/xattr.h
new file mode 100644
index 0000000..54eb920
--- /dev/null
+++ b/libbcache/xattr.h
@@ -0,0 +1,17 @@
+#ifndef _BCACHE_XATTR_H
+#define _BCACHE_XATTR_H
+
+extern const struct bkey_ops bch_bkey_xattr_ops;
+
+struct dentry;
+struct xattr_handler;
+
+int bch_xattr_get(struct cache_set *, struct inode *,
+ const char *, void *, size_t, int);
+int bch_xattr_set(struct cache_set *, struct inode *,
+ const char *, const void *, size_t, int, int);
+ssize_t bch_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch_xattr_handlers[];
+
+#endif /* _BCACHE_XATTR_H */