bcachefs: Initial commit

Forked from drivers/md/bcache, now a full blown COW multi device filesystem with a long list of features - https://bcachefs.org Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
author: Kent Overstreet <kent.overstreet@gmail.com> 2017-03-16 22:18:50 -0800
committer: Kent Overstreet <kent.overstreet@gmail.com> 2021-04-27 12:17:53 -0400
commit: afb402e0dfe759cfb4bf1c594d4fbbcbe6a30c14 (patch)
tree: f4bae59d80d056eb6dfbb536678b211c9e383b6f /fs/bcachefs
parent: b7faa92b19192fbb7b9a4211bbebeacdd3134efe (diff)
129 files changed, 63241 insertions, 0 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
new file mode 100644
index 000000000000..5594af719b2a
--- /dev/null
+++ b/fs/bcachefs/Kconfig
@@ -0,0 +1,50 @@
+
+config BCACHEFS_FS
+	tristate "bcachefs filesystem support"
+	depends on BLOCK
+	select EXPORTFS
+	select CLOSURES
+	select LIBCRC32C
+	select CRC64
+	select FS_POSIX_ACL
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
+	select ZLIB_DEFLATE
+	select ZLIB_INFLATE
+	select ZSTD_COMPRESS
+	select ZSTD_DECOMPRESS
+	select CRYPTO_SHA256
+	select CRYPTO_CHACHA20
+	select CRYPTO_POLY1305
+	select KEYS
+	select SIXLOCKS
+	select RAID6_PQ
+	select XOR_BLOCKS
+	help
+	The bcachefs filesystem - a modern, copy on write filesystem, with
+	support for multiple devices, compression, checksumming, etc.
+
+config BCACHEFS_QUOTA
+	bool "bcachefs quota support"
+	depends on BCACHEFS_FS
+	select QUOTACTL
+
+config BCACHEFS_POSIX_ACL
+	bool "bcachefs POSIX ACL support"
+	depends on BCACHEFS_FS
+	select FS_POSIX_ACL
+
+config BCACHEFS_DEBUG
+	bool "bcachefs debugging"
+	depends on BCACHEFS_FS
+	help
+	Enables many extra debugging checks and assertions.
+
+	The resulting code will be significantly slower than normal; you
+	probably shouldn't select this option unless you're a developer.
+
+config BCACHEFS_TESTS
+	bool "bcachefs unit and performance tests"
+	depends on BCACHEFS_FS
+	help
+	Include some unit and performance tests for the core btree code.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
new file mode 100644
index 000000000000..b199da94f311
--- /dev/null
+++ b/fs/bcachefs/Makefile
@@ -0,0 +1,57 @@
+
+obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs.o
+
+bcachefs-y		:=	\
+	acl.o			\
+	alloc_background.o	\
+	alloc_foreground.o	\
+	bkey.o			\
+	bkey_methods.o		\
+	bkey_sort.o		\
+	bset.o			\
+	btree_cache.o		\
+	btree_gc.o		\
+	btree_io.o		\
+	btree_iter.o		\
+	btree_update_interior.o	\
+	btree_update_leaf.o	\
+	buckets.o		\
+	chardev.o		\
+	checksum.o		\
+	clock.o			\
+	compress.o		\
+	debug.o			\
+	dirent.o		\
+	disk_groups.o		\
+	ec.o			\
+	error.o			\
+	extents.o		\
+	fs.o			\
+	fs-common.o		\
+	fs-ioctl.o		\
+	fs-io.o			\
+	fsck.o			\
+	inode.o			\
+	io.o			\
+	journal.o		\
+	journal_io.o		\
+	journal_reclaim.o	\
+	journal_seq_blacklist.o	\
+	keylist.o		\
+	migrate.o		\
+	move.o			\
+	movinggc.o		\
+	opts.o			\
+	quota.o			\
+	rebalance.o		\
+	recovery.o		\
+	reflink.o		\
+	replicas.o		\
+	siphash.o		\
+	super.o			\
+	super-io.o		\
+	sysfs.o			\
+	tests.o			\
+	trace.o			\
+	util.o			\
+	xattr.o
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
new file mode 100644
index 000000000000..dcd0dfe87b51
--- /dev/null
+++ b/fs/bcachefs/acl.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "bcachefs.h"
+
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "acl.h"
+#include "fs.h"
+#include "xattr.h"
+
+/*
+ * Size in bytes of an on-disk ACL: one header followed by @nr_short entries
+ * without an id and @nr_long entries carrying an id.
+ */
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{
+	size_t bytes = sizeof(bch_acl_header);
+
+	bytes += sizeof(bch_acl_entry_short) * nr_short;
+	bytes += sizeof(bch_acl_entry) * nr_long;
+
+	return bytes;
+}
+
+/*
+ * Translate a POSIX ACL type (ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT) into the
+ * matching xattr index; any other value hits BUG().
+ */
+static inline int acl_to_xattr_type(int type)
+{
+	if (type == ACL_TYPE_ACCESS)
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
+
+	if (type == ACL_TYPE_DEFAULT)
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
+
+	BUG();
+}
+
+/*
+ * Convert from filesystem to in-memory representation.
+ *
+ * @value/@size describe the raw ACL xattr value. Returns NULL when @value is
+ * NULL or the ACL contains no entries, a newly allocated posix_acl on success,
+ * ERR_PTR(-ENOMEM) if the allocation fails, or ERR_PTR(-EINVAL) if the header
+ * or any entry is malformed.
+ */
+static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
+{
+	const void *p, *end = value + size;
+	struct posix_acl *acl;
+	struct posix_acl_entry *out;
+	unsigned count = 0;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(bch_acl_header))
+		goto invalid;
+	if (((bch_acl_header *)value)->a_version !=
+	    cpu_to_le32(BCH_ACL_VERSION))
+		goto invalid;
+
+	/* First pass: validate the entry tags and count the entries */
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *entry = p;
+
+		/* Every entry is at least as large as a short entry */
+		if (p + sizeof(bch_acl_entry_short) > end)
+			goto invalid;
+
+		switch (le16_to_cpu(entry->e_tag)) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			p += sizeof(bch_acl_entry);
+			break;
+		default:
+			goto invalid;
+		}
+
+		count++;
+	}
+
+	/* A trailing long entry that was truncated leaves p past end */
+	if (p > end)
+		goto invalid;
+
+	if (!count)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	out = acl->a_entries;
+
+	/* Second pass: the data was validated above, now fill in the entries */
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *in = p;
+
+		out->e_tag  = le16_to_cpu(in->e_tag);
+		out->e_perm = le16_to_cpu(in->e_perm);
+
+		switch (out->e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+			out->e_uid = make_kuid(&init_user_ns,
+					       le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			out->e_gid = make_kgid(&init_user_ns,
+					       le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		}
+
+		out++;
+	}
+
+	/* Both passes must have seen the same number of entries */
+	BUG_ON(out != acl->a_entries + acl->a_count);
+
+	return acl;
+invalid:
+	pr_err("invalid acl entry");
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Iterate @acl_e over every entry of @acl, from a_entries[0] up to (but not
+ * including) a_entries[a_count]. Arguments are parenthesised so that
+ * expressions such as "*aclp" or "&local_acl" expand correctly.
+ */
+#define acl_for_each_entry(acl, acl_e)				\
+	for ((acl_e) = (acl)->a_entries;			\
+	     (acl_e) < (acl)->a_entries + (acl)->a_count;	\
+	     (acl_e)++)
+
+/*
+ * Convert from in-memory to filesystem representation.
+ *
+ * Builds an xattr key holding @acl in the on-disk ACL format, with the xattr
+ * type derived from @type and an empty name. The key is allocated with
+ * bch2_trans_kmalloc() from @trans.
+ *
+ * Returns the new key, ERR_PTR(-EINVAL) if @acl contains an unknown tag,
+ * ERR_PTR(-E2BIG) if the encoded key would not fit in a u8 count of u64s, or
+ * the error pointer returned by bch2_trans_kmalloc().
+ */
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+		  const struct posix_acl *acl,
+		  int type)
+{
+	struct bkey_i_xattr *xattr;
+	bch_acl_header *acl_header;
+	const struct posix_acl_entry *acl_e;
+	void *outptr;
+	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+
+	/* First pass: validate the tags and count short vs. long entries */
+	acl_for_each_entry(acl, acl_e) {
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			nr_long++;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			nr_short++;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	acl_len = bch2_acl_size(nr_short, nr_long);
+	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+	if (u64s > U8_MAX)
+		return ERR_PTR(-E2BIG);
+
+	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+	if (IS_ERR(xattr))
+		return xattr;
+
+	bkey_xattr_init(&xattr->k_i);
+	xattr->k.u64s		= u64s;
+	xattr->v.x_type		= acl_to_xattr_type(type);
+	xattr->v.x_name_len	= 0;
+	xattr->v.x_val_len	= cpu_to_le16(acl_len);
+
+	acl_header = xattr_val(&xattr->v);
+	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+	outptr = (void *) acl_header + sizeof(*acl_header);
+
+	/* Second pass: emit the entries; tags were validated above */
+	acl_for_each_entry(acl, acl_e) {
+		bch_acl_entry *entry = outptr;
+
+		entry->e_tag = cpu_to_le16(acl_e->e_tag);
+		entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns, acl_e->e_uid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns, acl_e->e_gid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			/* Short entries carry no id */
+			outptr += sizeof(bch_acl_entry_short);
+			break;
+		}
+	}
+
+	/* We must have written exactly the number of bytes computed above */
+	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+
+	return xattr;
+}
+
+/*
+ * Look up the @type ACL of @vinode in the xattr hash table and convert it to
+ * a posix_acl.
+ *
+ * Returns the ACL (also stored via set_cached_acl()), NULL if no such xattr
+ * exists or it holds no entries, or an ERR_PTR on lookup/conversion failure.
+ * The lookup is restarted whenever it fails with -EINTR.
+ */
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c_xattr xattr;
+	struct posix_acl *acl = NULL;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	/* ACL xattrs are keyed by type with an empty name */
+	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+			&inode->ei_str_hash, inode->v.i_ino,
+			&X_SEARCH(acl_to_xattr_type(type), "", 0),
+			0);
+	if (IS_ERR(iter)) {
+		if (PTR_ERR(iter) == -EINTR)
+			goto retry;
+
+		/* -ENOENT is not an error: acl stays NULL */
+		if (PTR_ERR(iter) != -ENOENT)
+			acl = ERR_CAST(iter);
+		goto out;
+	}
+
+	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+
+	if (!IS_ERR(acl))
+		set_cached_acl(&inode->v, type, acl);
+out:
+	bch2_trans_exit(&trans);
+	return acl;
+}
+
+/*
+ * Within @trans, store @acl as the @type ACL xattr of the inode described by
+ * @inode_u, or delete that xattr when @acl is NULL.
+ *
+ * Setting a default ACL on a non-directory fails with -EACCES, while clearing
+ * one is a no-op. -ENOENT from the hash table update is reported as success.
+ */
+int bch2_set_acl_trans(struct btree_trans *trans,
+		       struct bch_inode_unpacked *inode_u,
+		       const struct bch_hash_info *hash_info,
+		       struct posix_acl *acl, int type)
+{
+	int ret;
+
+	if (type == ACL_TYPE_DEFAULT &&
+	    !S_ISDIR(inode_u->bi_mode)) {
+		if (acl)
+			return -EACCES;
+		return 0;
+	}
+
+	if (!acl) {
+		struct xattr_search_key search =
+			X_SEARCH(acl_to_xattr_type(type), "", 0);
+
+		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
+				       inode_u->bi_inum, &search);
+	} else {
+		struct bkey_i_xattr *xattr =
+			bch2_acl_to_xattr(trans, acl, type);
+
+		if (IS_ERR(xattr))
+			return PTR_ERR(xattr);
+
+		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+				    inode_u->bi_inum, &xattr->k_i, 0);
+	}
+
+	if (ret == -ENOENT)
+		return 0;
+
+	return ret;
+}
+
+/*
+ * Set (or, with a NULL @_acl, remove) the @type ACL of @vinode.
+ *
+ * Runs under inode->ei_update_lock. In a single transaction it updates the
+ * ACL xattr and rewrites the inode with a new ctime and, for access ACLs, the
+ * mode computed by posix_acl_update_mode(). On success the in-memory inode
+ * and the cached ACL are updated as well. Returns 0 or a negative error.
+ */
+int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *inode_iter;
+	struct bch_inode_unpacked inode_u;
+	struct posix_acl *acl;
+	umode_t mode;
+	int ret;
+
+	mutex_lock(&inode->ei_update_lock);
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+	/*
+	 * Start every attempt from the caller's ACL: acl is passed by
+	 * reference to posix_acl_update_mode() below, so it may have been
+	 * changed by a previous pass.
+	 */
+	acl = _acl;
+
+	inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
+				     BTREE_ITER_INTENT);
+	ret = PTR_ERR_OR_ZERO(inode_iter);
+	if (ret)
+		goto btree_err;
+
+	mode = inode_u.bi_mode;
+
+	if (type == ACL_TYPE_ACCESS) {
+		ret = posix_acl_update_mode(&inode->v, &mode, &acl);
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_set_acl_trans(&trans, &inode_u,
+				 &inode->ei_str_hash,
+				 acl, type);
+	if (ret)
+		goto btree_err;
+
+	inode_u.bi_ctime	= bch2_current_time(c);
+	inode_u.bi_mode		= mode;
+
+	ret =   bch2_inode_write(&trans, inode_iter, &inode_u) ?:
+		bch2_trans_commit(&trans, NULL,
+				  &inode->ei_journal_seq,
+				  BTREE_INSERT_ATOMIC|
+				  BTREE_INSERT_NOUNLOCK);
+btree_err:
+	/* -EINTR means the transaction has to be restarted */
+	if (ret == -EINTR)
+		goto retry;
+	if (unlikely(ret))
+		goto err;
+
+	bch2_inode_update_after_write(c, inode, &inode_u,
+				      ATTR_CTIME|ATTR_MODE);
+
+	set_cached_acl(&inode->v, type, acl);
+err:
+	bch2_trans_exit(&trans);
+	mutex_unlock(&inode->ei_update_lock);
+
+	return ret;
+}
+
+/*
+ * Adjust the access ACL of @inode for a chmod to @mode, as part of @trans.
+ *
+ * If the inode has no access ACL xattr (or it is empty) this returns 0
+ * without touching *@new_acl. Otherwise the ACL is read, updated with
+ * __posix_acl_chmod(), re-encoded and queued as an update at the position of
+ * the existing xattr; the updated in-memory ACL is handed to the caller via
+ * *@new_acl. The transaction is not committed here.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_acl_chmod(struct btree_trans *trans,
+		   struct bch_inode_info *inode,
+		   umode_t mode,
+		   struct posix_acl **new_acl)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c_xattr xattr;
+	struct bkey_i_xattr *new;
+	struct posix_acl *acl;
+	int ret = 0;
+
+	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+			&inode->ei_str_hash, inode->v.i_ino,
+			&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+			BTREE_ITER_INTENT);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+
+	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+	/* For a NULL acl, PTR_ERR() evaluates to 0, i.e. success */
+	if (IS_ERR_OR_NULL(acl))
+		return PTR_ERR(acl);
+
+	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+	if (ret)
+		goto err;
+
+	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto err;
+	}
+
+	new->k.p = iter->pos;
+	bch2_trans_update(trans, iter, &new->k_i);
+	/* Ownership of acl moves to the caller; don't free it below */
+	*new_acl = acl;
+	acl = NULL;
+err:
+	kfree(acl);
+	return ret;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
new file mode 100644
index 000000000000..cb62d502a7ff
--- /dev/null
+++ b/fs/bcachefs/acl.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ACL_H
+#define _BCACHEFS_ACL_H
+
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#define BCH_ACL_VERSION	0x0001
+
+/* On-disk ACL entry that carries an id (used for ACL_USER and ACL_GROUP) */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} bch_acl_entry;
+
+/* On-disk ACL entry without an id (all other tags) */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} bch_acl_entry_short;
+
+/* Header preceding the entries; a_version is expected to be BCH_ACL_VERSION */
+typedef struct {
+	__le32		a_version;
+} bch_acl_header;
+
+struct posix_acl *bch2_get_acl(struct inode *, int);
+
+int bch2_set_acl_trans(struct btree_trans *,
+		       struct bch_inode_unpacked *,
+		       const struct bch_hash_info *,
+		       struct posix_acl *, int);
+int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+		   umode_t, struct posix_acl **);
+
+#else
+
+/* Stub used when CONFIG_BCACHEFS_POSIX_ACL is disabled: does nothing */
+static inline int bch2_set_acl_trans(struct btree_trans *trans,
+				     struct bch_inode_unpacked *inode_u,
+				     const struct bch_hash_info *hash_info,
+				     struct posix_acl *acl, int type)
+{
+	return 0;
+}
+
+/*
+ * Stub used when CONFIG_BCACHEFS_POSIX_ACL is disabled: does nothing and
+ * leaves *new_acl untouched.
+ */
+static inline int bch2_acl_chmod(struct btree_trans *trans,
+				 struct bch_inode_info *inode,
+				 umode_t mode,
+				 struct posix_acl **new_acl)
+{
+	return 0;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+
+#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
new file mode 100644
index 000000000000..e252a039dc2b
--- /dev/null
+++ b/fs/bcachefs/alloc_background.c
@@ -0,0 +1,1656 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "recovery.h"
+
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+#include <trace/events/bcachefs.h>
+
+/*
+ * Names of the alloc key fields, generated from BCH_ALLOC_FIELDS() and
+ * NULL-terminated; indexed by field number when printing alloc keys.
+ */
+static const char * const bch2_alloc_field_names[] = {
+#define x(name, bytes) #name,
+	BCH_ALLOC_FIELDS()
+#undef x
+	NULL
+};
+
+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
+
+/* Ratelimiting/PD controllers */
+
+/*
+ * Delayed-work handler: for every member device, feed the device's copygc PD
+ * controller with its current free space and fragmentation (both in bytes),
+ * then re-arm itself to run again in c->pd_controllers_update_seconds.
+ */
+static void pd_controllers_update(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(to_delayed_work(work),
+					   struct bch_fs,
+					   pd_controllers_update);
+	struct bch_dev *ca;
+	unsigned i;
+
+	for_each_member_device(ca, c, i) {
+		struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+
+		/* << 9 converts 512-byte sectors to bytes */
+		u64 free = bucket_to_sector(ca,
+				__dev_buckets_free(ca, stats)) << 9;
+		/*
+		 * Bytes of internal fragmentation, which can be
+		 * reclaimed by copy GC
+		 */
+		s64 fragmented = (bucket_to_sector(ca,
+					stats.buckets[BCH_DATA_USER] +
+					stats.buckets[BCH_DATA_CACHED]) -
+				  (stats.sectors[BCH_DATA_USER] +
+				   stats.sectors[BCH_DATA_CACHED])) << 9;
+
+		/* Never report negative fragmentation */
+		fragmented = max(0LL, fragmented);
+
+		bch2_pd_controller_update(&ca->copygc_pd,
+					 free, fragmented, -1);
+	}
+
+	schedule_delayed_work(&c->pd_controllers_update,
+			      c->pd_controllers_update_seconds * HZ);
+}
+
+/* Persistent alloc info: */
+
+/*
+ * Read field number @field of alloc value @a from the packed data at *@p.
+ *
+ * Fields that are not flagged in a->fields are not stored: 0 is returned and
+ * *@p is left alone. Otherwise the little-endian value is decoded according
+ * to the field's width and *@p is advanced past it.
+ */
+static inline u64 get_alloc_field(const struct bch_alloc *a,
+				  const void **p, unsigned field)
+{
+	const unsigned nr_bytes = BCH_ALLOC_FIELD_BYTES[field];
+	const void *src = *p;
+	u64 val;
+
+	if (!(a->fields & (1 << field)))
+		return 0;
+
+	switch (nr_bytes) {
+	case 1:
+		val = *((const u8 *) src);
+		break;
+	case 2:
+		val = le16_to_cpup(src);
+		break;
+	case 4:
+		val = le32_to_cpup(src);
+		break;
+	case 8:
+		val = le64_to_cpup(src);
+		break;
+	default:
+		BUG();
+	}
+
+	*p = src + nr_bytes;
+	return val;
+}
+
+/*
+ * Append field number @field with value @v to the packed data at *@p.
+ *
+ * Zero values are not stored at all. Otherwise the field is flagged in
+ * a->v.fields, the value is written little-endian using the field's width and
+ * *@p is advanced past it.
+ */
+static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
+				   unsigned field, u64 v)
+{
+	const unsigned nr_bytes = BCH_ALLOC_FIELD_BYTES[field];
+	void *dst = *p;
+
+	if (!v)
+		return;
+
+	a->v.fields |= 1 << field;
+
+	switch (nr_bytes) {
+	case 1:
+		*((u8 *) dst) = v;
+		break;
+	case 2:
+		*((__le16 *) dst) = cpu_to_le16(v);
+		break;
+	case 4:
+		*((__le32 *) dst) = cpu_to_le32(v);
+		break;
+	case 8:
+		*((__le64 *) dst) = cpu_to_le64(v);
+		break;
+	default:
+		BUG();
+	}
+
+	*p = dst + nr_bytes;
+}
+
+/*
+ * Decode alloc key @k into its unpacked form. Keys that are not of type
+ * KEY_TYPE_alloc yield a result with gen == 0 and no fields filled in.
+ */
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+	struct bkey_alloc_unpacked ret = { .gen = 0 };
+
+	if (k.k->type == KEY_TYPE_alloc) {
+		const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
+		const void *d = a->data;
+		unsigned idx = 0;
+
+		ret.gen = a->gen;
+
+		/* Expands to one get_alloc_field() call per field, in order */
+#define x(_name, _bits)	ret._name = get_alloc_field(a, &d, idx++);
+		BCH_ALLOC_FIELDS()
+#undef  x
+	}
+	return ret;
+}
+
+/*
+ * Encode @src into the alloc key @dst: store the generation, pack every
+ * field via put_alloc_field(), then set the key's value size to the number of
+ * bytes used.
+ */
+void bch2_alloc_pack(struct bkey_i_alloc *dst,
+		     const struct bkey_alloc_unpacked src)
+{
+	unsigned idx = 0;
+	void *d = dst->v.data;
+	unsigned bytes;
+
+	dst->v.fields	= 0;
+	dst->v.gen	= src.gen;
+
+	/* Expands to one put_alloc_field() call per field, in order */
+#define x(_name, _bits)	put_alloc_field(dst, &d, idx++, src._name);
+	BCH_ALLOC_FIELDS()
+#undef  x
+
+	bytes = (void *) d - (void *) &dst->v;
+	set_bkey_val_bytes(&dst->k, bytes);
+	/* presumably zeroes the padding up to the next u64 boundary - confirm */
+	memset_u64s_tail(&dst->v, 0, bytes);
+}
+
+/*
+ * Number of u64s needed for alloc value @a: the fixed part of struct
+ * bch_alloc plus the width of every field flagged in a->fields, rounded up.
+ */
+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
+{
+	unsigned nr_bytes = offsetof(struct bch_alloc, data);
+	unsigned field;
+
+	for (field = 0; field < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); field++) {
+		if (!(a->fields & (1 << field)))
+			continue;
+
+		nr_bytes += BCH_ALLOC_FIELD_BYTES[field];
+	}
+
+	return DIV_ROUND_UP(nr_bytes, sizeof(u64));
+}
+
+/*
+ * Validate alloc key @k. Returns NULL if the key is acceptable, otherwise a
+ * string describing what is wrong with it.
+ */
+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+	/* The key's inode field is the device index */
+	if (k.k->p.inode >= c->sb.nr_devices)
+		return "invalid device";
+
+	if (!c->devs[k.k->p.inode])
+		return "invalid device";
+
+	/* A larger value is fine: allow for unknown fields */
+	if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
+		return "incorrect value size";
+
+	return NULL;
+}
+
+/*
+ * Print alloc key @k to @out: the generation followed by the name and value
+ * of every field present in the key.
+ */
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+			struct bkey_s_c k)
+{
+	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+	const void *pos = a.v->data;
+	unsigned field;
+
+	pr_buf(out, "gen %u", a.v->gen);
+
+	for (field = 0; field < BCH_ALLOC_FIELD_NR; field++) {
+		if (!(a.v->fields & (1 << field)))
+			continue;
+
+		pr_buf(out, " %s %llu",
+		       bch2_alloc_field_names[field],
+		       get_alloc_field(a.v, &pos, field));
+	}
+}
+
+/*
+ * Load allocation information at startup.
+ *
+ * Marks every key in the alloc btree, then every alloc key found in
+ * @journal_keys, recomputes device usage from the buckets and finally
+ * recalculates the oldest read and write IO times for every member device.
+ * Returns 0 on success or the error hit while walking the btree.
+ */
+int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	struct journal_key *j;
+	unsigned i;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
+		bch2_mark_key(c, k, 0, 0, NULL, 0,
+			      BCH_BUCKET_MARK_ALLOC_READ|
+			      BCH_BUCKET_MARK_NOATOMIC);
+
+	/* An error from bch2_trans_exit() takes precedence over the loop's */
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret) {
+		bch_err(c, "error reading alloc info: %i", ret);
+		return ret;
+	}
+
+	/* Apply alloc keys from the journal on top of what the btree held */
+	for_each_journal_key(*journal_keys, j)
+		if (j->btree_id == BTREE_ID_ALLOC)
+			bch2_mark_key(c, bkey_i_to_s_c(j->k),
+				      0, 0, NULL, 0,
+				      BCH_BUCKET_MARK_ALLOC_READ|
+				      BCH_BUCKET_MARK_NOATOMIC);
+
+	percpu_down_write(&c->mark_lock);
+	bch2_dev_usage_from_buckets(c);
+	percpu_up_write(&c->mark_lock);
+
+	mutex_lock(&c->bucket_clock[READ].lock);
+	for_each_member_device(ca, c, i) {
+		down_read(&ca->bucket_lock);
+		bch2_recalc_oldest_io(c, ca, READ);
+		up_read(&ca->bucket_lock);
+	}
+	mutex_unlock(&c->bucket_clock[READ].lock);
+
+	mutex_lock(&c->bucket_clock[WRITE].lock);
+	for_each_member_device(ca, c, i) {
+		down_read(&ca->bucket_lock);
+		bch2_recalc_oldest_io(c, ca, WRITE);
+		up_read(&ca->bucket_lock);
+	}
+	mutex_unlock(&c->bucket_clock[WRITE].lock);
+
+	return 0;
+}
+
+/*
+ * Non-negative results of bch2_alloc_write_key(). ALLOC_WROTE is 0, so it
+ * coincides with a successful transaction commit.
+ */
+enum alloc_write_ret {
+	ALLOC_WROTE,	/* the key was updated */
+	ALLOC_NOWROTE,	/* the key on disk already matched; nothing written */
+	ALLOC_END,	/* iterator is past the last device or bucket */
+};
+
+/*
+ * Write the in-memory state of the bucket at @iter's position to the alloc
+ * btree if it differs from the key currently there.
+ *
+ * Returns ALLOC_END if the position does not name an existing device/bucket,
+ * ALLOC_NOWROTE if the existing key already matches, the result of the
+ * commit otherwise (0 == ALLOC_WROTE on success), or a negative error.
+ * -EINTR causes the whole operation to be retried.
+ */
+static int bch2_alloc_write_key(struct btree_trans *trans,
+				struct btree_iter *iter,
+				unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	struct bucket_array *ba;
+	struct bucket *g;
+	struct bucket_mark m;
+	struct bkey_alloc_unpacked old_u, new_u;
+	__BKEY_PADDED(k, 8) alloc_key; /* hack: */
+	struct bkey_i_alloc *a;
+	int ret;
+retry:
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	old_u = bch2_alloc_unpack(k);
+
+	if (iter->pos.inode >= c->sb.nr_devices ||
+	    !c->devs[iter->pos.inode])
+		return ALLOC_END;
+
+	/* Snapshot the bucket's in-memory state under mark_lock */
+	percpu_down_read(&c->mark_lock);
+	ca	= bch_dev_bkey_exists(c, iter->pos.inode);
+	ba	= bucket_array(ca);
+
+	if (iter->pos.offset >= ba->nbuckets) {
+		percpu_up_read(&c->mark_lock);
+		return ALLOC_END;
+	}
+
+	g	= &ba->b[iter->pos.offset];
+	m	= READ_ONCE(g->mark);
+	new_u	= alloc_mem_to_key(g, m);
+	percpu_up_read(&c->mark_lock);
+
+	/* Nothing to do if the btree already holds the current state */
+	if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+		return ALLOC_NOWROTE;
+
+	a = bkey_alloc_init(&alloc_key.k);
+	a->k.p = iter->pos;
+	bch2_alloc_pack(a, new_u);
+
+	bch2_trans_update(trans, iter, &a->k_i);
+	ret = bch2_trans_commit(trans, NULL, NULL,
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_NOMARK|
+				flags);
+err:
+	if (ret == -EINTR)
+		goto retry;
+	return ret;
+}
+
+/*
+ * Write out the alloc keys of every bucket on every read-write member device.
+ *
+ * @flags is passed on to the btree commit. *@wrote is set to true if at
+ * least one key was written; it is never cleared here. Returns 0 on success
+ * or the first negative error encountered.
+ */
+int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bch_dev *ca;
+	unsigned i;
+	int ret = 0;
+
+	/* bch2_alloc_write_key() builds its key in an 8-u64 padded buffer */
+	BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	for_each_rw_member(ca, c, i) {
+		unsigned first_bucket;
+
+		percpu_down_read(&c->mark_lock);
+		first_bucket = bucket_array(ca)->first_bucket;
+		percpu_up_read(&c->mark_lock);
+
+		bch2_btree_iter_set_pos(iter, POS(i, first_bucket));
+
+		/* Walk this device's buckets until ALLOC_END or an error */
+		while (1) {
+			ret = bch2_alloc_write_key(&trans, iter, flags);
+			if (ret < 0 || ret == ALLOC_END)
+				break;
+			if (ret == ALLOC_WROTE)
+				*wrote = true;
+			bch2_btree_iter_next_slot(iter);
+		}
+
+		if (ret < 0) {
+			/* Leaving the loop early: drop the device's io_ref */
+			percpu_ref_put(&ca->io_ref);
+			break;
+		}
+	}
+
+	bch2_trans_exit(&trans);
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Journal replay of an alloc key: rewrite the alloc key at @k's position from
+ * the in-memory bucket state. Returns 0 on success (whether or not anything
+ * had to be written) or a negative error.
+ */
+int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
+{
+	const unsigned insert_flags =
+		BTREE_INSERT_NOFAIL|
+		BTREE_INSERT_LAZY_RW|
+		BTREE_INSERT_JOURNAL_REPLAY|
+		BTREE_INSERT_NOMARK;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	ret = bch2_alloc_write_key(&trans, iter, insert_flags);
+
+	bch2_trans_exit(&trans);
+
+	/* Positive values are enum alloc_write_ret codes, not errors */
+	if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+/* Bucket IO clocks: */
+
+/*
+ * Recompute the largest bucket_last_io() value over all buckets of @ca for
+ * direction @rw, then refresh the filesystem-wide maximum from all member
+ * devices. The caller must hold c->bucket_clock[rw].lock.
+ */
+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
+{
+	struct bucket_clock *clock = &c->bucket_clock[rw];
+	struct bucket_array *buckets = bucket_array(ca);
+	struct bucket *g;
+	u16 dev_max = 0, fs_max = 0;
+	unsigned i;
+
+	lockdep_assert_held(&clock->lock);
+
+	/* Per-device maximum: */
+	for_each_bucket(g, buckets)
+		dev_max = max(dev_max, bucket_last_io(c, g, rw));
+
+	ca->max_last_bucket_io[rw] = dev_max;
+
+	/* Global maximum (ca is reused as the loop cursor from here on): */
+	for_each_member_device(ca, c, i)
+		fs_max = max(fs_max, ca->max_last_bucket_io[rw]);
+
+	clock->max_last_io = fs_max;
+}
+
+/*
+ * Rescale the @rw IO time of every bucket on every member device so the
+ * clock hand can keep advancing, then recompute the per-device and global
+ * maxima. Called from bch2_inc_clock_hand() with the clock's lock held.
+ */
+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
+{
+	struct bucket_clock *clock = &c->bucket_clock[rw];
+	struct bucket_array *buckets;
+	struct bch_dev *ca;
+	struct bucket *g;
+	unsigned i;
+
+	trace_rescale_prios(c);
+
+	for_each_member_device(ca, c, i) {
+		down_read(&ca->bucket_lock);
+		buckets = bucket_array(ca);
+
+		/*
+		 * Note the precedence: this is hand - (last_io / 2).
+		 * NOTE(review): presumably bucket_last_io() is hand - io_time,
+		 * so this halves each bucket's age - confirm.
+		 */
+		for_each_bucket(g, buckets)
+			g->io_time[rw] = clock->hand -
+			bucket_last_io(c, g, rw) / 2;
+
+		bch2_recalc_oldest_io(c, ca, rw);
+
+		up_read(&ca->bucket_lock);
+	}
+}
+
+/*
+ * Interval between bucket clock ticks for a filesystem of the given
+ * @capacity: capacity / 1024, but never less than 2028.
+ */
+static inline u64 bucket_clock_freq(u64 capacity)
+{
+	u64 freq = capacity >> 10;
+
+	return freq > 2028ULL ? freq : 2028ULL;
+}
+
+/*
+ * IO timer callback: advance the bucket clock embedding @timer by one tick,
+ * rescaling all bucket IO times first if the clock is about to overflow, and
+ * re-arm the timer unless the filesystem capacity is currently 0.
+ */
+static void bch2_inc_clock_hand(struct io_timer *timer)
+{
+	struct bucket_clock *clock = container_of(timer,
+						struct bucket_clock, rescale);
+	struct bch_fs *c = container_of(clock,
+					struct bch_fs, bucket_clock[clock->rw]);
+	struct bch_dev *ca;
+	u64 capacity;
+	unsigned i;
+
+	mutex_lock(&clock->lock);
+
+	/* if clock cannot be advanced more, rescale prio */
+	if (clock->max_last_io >= U16_MAX - 2)
+		bch2_rescale_bucket_io_times(c, clock->rw);
+
+	BUG_ON(clock->max_last_io >= U16_MAX - 2);
+
+	/* Advancing the hand makes every bucket one tick older */
+	for_each_member_device(ca, c, i)
+		ca->max_last_bucket_io[clock->rw]++;
+	clock->max_last_io++;
+	clock->hand++;
+
+	mutex_unlock(&clock->lock);
+
+	capacity = READ_ONCE(c->capacity);
+
+	/* With no capacity the timer is not re-armed */
+	if (!capacity)
+		return;
+
+	/*
+	 * we only increment when 0.1% of the filesystem capacity has been read
+	 * or written to, this determines if it's time
+	 *
+	 * XXX: we shouldn't really be going off of the capacity of devices in
+	 * RW mode (that will be 0 when we're RO, yet we can still service
+	 * reads)
+	 */
+	timer->expire += bucket_clock_freq(capacity);
+
+	bch2_io_timer_add(&c->io_clock[clock->rw], timer);
+}
+
+/*
+ * Initialise the @rw bucket clock of @c: the hand starts at 1 and the
+ * rescale timer is set up to call bch2_inc_clock_hand().
+ */
+static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
+{
+	struct bucket_clock *bc = &c->bucket_clock[rw];
+
+	bc->rw			= rw;
+	bc->hand		= 1;
+
+	bc->rescale.fn		= bch2_inc_clock_hand;
+	bc->rescale.expire	= bucket_clock_freq(c->capacity);
+
+	mutex_init(&bc->lock);
+}
+
+/* Background allocator thread: */
+
+/*
+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
+ * (marking them as invalidated on disk), then optionally issues discard
+ * commands to the newly free buckets, then puts them on the various freelists.
+ */
+
+#define BUCKET_GC_GEN_MAX	96U
+
+/**
+ * wait_buckets_available - wait on reclaimable buckets
+ *
+ * If there aren't enough available buckets to fill up free_inc, wait until
+ * there are.
+ *
+ * While waiting, the allocator is marked ALLOCATOR_BLOCKED and c->gc_lock is
+ * dropped around each sleep and retaken afterwards, so the caller is expected
+ * to hold it for read.
+ *
+ * Returns 0 once enough buckets are available, or 1 if the thread was asked
+ * to stop.
+ */
+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
+{
+	unsigned long gc_count = c->gc_count;
+	int ret = 0;
+
+	ca->allocator_state = ALLOCATOR_BLOCKED;
+	closure_wake_up(&c->freelist_wait);
+
+	while (1) {
+		/* Set the task state before checking the wakeup conditions */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			ret = 1;
+			break;
+		}
+
+		/* gc has run since we started waiting: reset the counter */
+		if (gc_count != c->gc_count)
+			ca->inc_gen_really_needs_gc = 0;
+
+		if ((ssize_t) (dev_buckets_available(c, ca) -
+			       ca->inc_gen_really_needs_gc) >=
+		    (ssize_t) fifo_free(&ca->free_inc))
+			break;
+
+		up_read(&c->gc_lock);
+		schedule();
+		try_to_freeze();
+		down_read(&c->gc_lock);
+	}
+
+	__set_current_state(TASK_RUNNING);
+	ca->allocator_state = ALLOCATOR_RUNNING;
+	closure_wake_up(&c->freelist_wait);
+
+	return ret;
+}
+
+/*
+ * Decide whether @bucket on @ca (with mark @mark) may be invalidated.
+ *
+ * Buckets that are not available or are flagged in ca->buckets_nouse are
+ * rejected outright. Otherwise the decision depends on the bucket's gc gen:
+ * it must be below BUCKET_GC_GEN_MAX. As a side effect this counts buckets
+ * whose gc gen has reached half the limit (inc_gen_needs_gc) and the limit
+ * itself (inc_gen_really_needs_gc).
+ */
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
+				       size_t bucket,
+				       struct bucket_mark mark)
+{
+	u8 gen;
+
+	if (!is_available_bucket(mark))
+		return false;
+
+	if (ca->buckets_nouse &&
+	    test_bit(bucket, ca->buckets_nouse))
+		return false;
+
+	gen = bucket_gc_gen(ca, bucket);
+
+	if (gen >= BUCKET_GC_GEN_MAX / 2)
+		ca->inc_gen_needs_gc++;
+
+	if (gen < BUCKET_GC_GEN_MAX)
+		return true;
+
+	ca->inc_gen_really_needs_gc++;
+	return false;
+}
+
+/*
+ * Determines what order we're going to reuse buckets, smallest
+ * bucket_sort_key() first.
+ *
+ *
+ * - We take into account the read prio of the bucket, which gives us an
+ *   indication of how hot the data is -- we scale the prio so that the prio
+ *   farthest from the clock is worth 1/8th of the closest.
+ *
+ * - The number of sectors of cached data in the bucket, which gives us an
+ *   indication of the cost in cache misses this eviction will cause.
+ *
+ * - If hotness * sectors used compares equal, we pick the bucket with the
+ *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
+ *   number repeatedly forces us to run mark and sweep gc to avoid generation
+ *   number wraparound.
+ *
+ * The key is laid out as (data_wantness << 9) | (needs_journal_commit << 8) |
+ * (gc gen / 16).
+ */
+
+static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
+				     size_t b, struct bucket_mark m)
+{
+	unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
+	unsigned max_last_io = ca->max_last_bucket_io[READ];
+
+	/*
+	 * Time since last read, scaled to [0, 8) where larger value indicates
+	 * more recently read data.
+	 *
+	 * max_last_io is 0 when no bucket on this device has any read age; in
+	 * that case there is nothing to scale by (and we must not divide by
+	 * zero), so every bucket gets the same hotness:
+	 */
+	unsigned long hotness = max_last_io
+		? (max_last_io - last_io) * 7 / max_last_io
+		: 0;
+
+	/* How much we want to keep the data in this bucket: */
+	unsigned long data_wantness =
+		(hotness + 1) * bucket_sectors_used(m);
+
+	unsigned long needs_journal_commit =
+		bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+
+	return  (data_wantness << 9) |
+		(needs_journal_commit << 8) |
+		(bucket_gc_gen(ca, b) / 16);
+}
+
+/*
+ * Heap ordering for allocation candidates: by ascending key, then by
+ * descending run length (nr), then by ascending bucket index.
+ */
+static inline int bucket_alloc_cmp(alloc_heap *h,
+				   struct alloc_heap_entry l,
+				   struct alloc_heap_entry r)
+{
+	int ret;
+
+	ret = cmp_int(l.key, r.key);
+	if (ret)
+		return ret;
+
+	/* Operands swapped on purpose: longer runs sort first */
+	ret = cmp_int(r.nr, l.nr);
+	if (ret)
+		return ret;
+
+	return cmp_int(l.bucket, r.bucket);
+}
+
+/* sort() comparison callback: order alloc heap entries by bucket index */
+static inline int bucket_idx_cmp(const void *_l, const void *_r)
+{
+	const struct alloc_heap_entry *lhs = _l;
+	const struct alloc_heap_entry *rhs = _r;
+
+	return cmp_int(lhs->bucket, rhs->bucket);
+}
+
+/*
+ * LRU replacement policy: fill ca->alloc_heap with the invalidatable buckets
+ * that have the smallest bucket_sort_key().
+ *
+ * Adjacent buckets with the same key are merged into a single heap entry
+ * covering a run of nr buckets. Afterwards the largest entries are dropped
+ * for as long as the remaining entries still cover at least
+ * ALLOC_SCAN_BATCH(ca) buckets.
+ *
+ * Takes c->bucket_clock[READ].lock and ca->bucket_lock (read) for the scan.
+ */
+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bucket_array *buckets;
+	struct alloc_heap_entry e = { 0 };
+	size_t b, i, nr = 0;
+
+	ca->alloc_heap.used = 0;
+
+	mutex_lock(&c->bucket_clock[READ].lock);
+	down_read(&ca->bucket_lock);
+
+	buckets = bucket_array(ca);
+
+	bch2_recalc_oldest_io(c, ca, READ);
+
+	/*
+	 * Find buckets with lowest read priority, by building a maxheap sorted
+	 * by read priority and repeatedly replacing the maximum element until
+	 * all buckets have been visited.
+	 */
+	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
+		struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+		unsigned long key = bucket_sort_key(c, ca, b, m);
+
+		if (!bch2_can_invalidate_bucket(ca, b, m))
+			continue;
+
+		/* Extend the current run if b is adjacent with the same key */
+		if (e.nr && e.bucket + e.nr == b && e.key == key) {
+			e.nr++;
+		} else {
+			/* Flush the previous run and start a new one at b */
+			if (e.nr)
+				heap_add_or_replace(&ca->alloc_heap, e,
+					-bucket_alloc_cmp, NULL);
+
+			e = (struct alloc_heap_entry) {
+				.bucket = b,
+				.nr	= 1,
+				.key	= key,
+			};
+		}
+
+		cond_resched();
+	}
+
+	/* Flush the final run */
+	if (e.nr)
+		heap_add_or_replace(&ca->alloc_heap, e,
+				-bucket_alloc_cmp, NULL);
+
+	for (i = 0; i < ca->alloc_heap.used; i++)
+		nr += ca->alloc_heap.data[i].nr;
+
+	/*
+	 * Trim the heap down to roughly one scan batch. An empty heap must be
+	 * skipped: data[0] would be stale and nr - data[0].nr could underflow.
+	 */
+	while (ca->alloc_heap.used &&
+	       nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
+		nr -= ca->alloc_heap.data[0].nr;
+		heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
+	}
+
+	up_read(&ca->bucket_lock);
+	mutex_unlock(&c->bucket_clock[READ].lock);
+}
+
+/*
+ * FIFO replacement policy: starting after ca->fifo_last_bucket, walk the
+ * device's buckets in order (wrapping at ca->mi.nbuckets) and add every
+ * invalidatable bucket to ca->alloc_heap until the heap is full or we are
+ * back at the starting bucket.
+ */
+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bucket_array *buckets = bucket_array(ca);
+	struct bucket_mark m;
+	size_t b, start;
+
+	/* Reset the cursor if it is outside the device's valid bucket range */
+	if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
+	    ca->fifo_last_bucket >= ca->mi.nbuckets)
+		ca->fifo_last_bucket = ca->mi.first_bucket;
+
+	start = ca->fifo_last_bucket;
+
+	do {
+		ca->fifo_last_bucket++;
+		if (ca->fifo_last_bucket == ca->mi.nbuckets)
+			ca->fifo_last_bucket = ca->mi.first_bucket;
+
+		b = ca->fifo_last_bucket;
+		m = READ_ONCE(buckets->b[b].mark);
+
+		if (bch2_can_invalidate_bucket(ca, b, m)) {
+			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
+
+			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+			if (heap_full(&ca->alloc_heap))
+				break;
+		}
+
+		cond_resched();
+	} while (ca->fifo_last_bucket != start);
+}
+
+/*
+ * Random replacement policy: probe up to ca->mi.nbuckets / 2 randomly chosen
+ * buckets and add the invalidatable ones to ca->alloc_heap, stopping early
+ * when the heap is full. The same bucket may be picked more than once, so
+ * the entries are then sorted by bucket index and duplicates are neutralised
+ * by setting their nr to 0.
+ */
+static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bucket_array *buckets = bucket_array(ca);
+	struct bucket_mark m;
+	size_t checked, i;
+
+	for (checked = 0;
+	     checked < ca->mi.nbuckets / 2;
+	     checked++) {
+		/* Random bucket index offset by first_bucket */
+		size_t b = bch2_rand_range(ca->mi.nbuckets -
+					   ca->mi.first_bucket) +
+			ca->mi.first_bucket;
+
+		m = READ_ONCE(buckets->b[b].mark);
+
+		if (bch2_can_invalidate_bucket(ca, b, m)) {
+			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
+
+			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+			if (heap_full(&ca->alloc_heap))
+				break;
+		}
+
+		cond_resched();
+	}
+
+	sort(ca->alloc_heap.data,
+	     ca->alloc_heap.used,
+	     sizeof(ca->alloc_heap.data[0]),
+	     bucket_idx_cmp, NULL);
+
+	/* remove duplicates: */
+	for (i = 0; i + 1 < ca->alloc_heap.used; i++)
+		if (ca->alloc_heap.data[i].bucket ==
+		    ca->alloc_heap.data[i + 1].bucket)
+			ca->alloc_heap.data[i].nr = 0;
+}
+
+/*
+ * Refill ca->alloc_heap according to the device's replacement policy
+ * (ca->mi.replacement), restore heap order with bucket_alloc_cmp, and return
+ * the total number of buckets the heap entries describe (the sum of their nr
+ * fields).
+ *
+ * ca->inc_gen_needs_gc is reset before the scan.
+ */
+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct alloc_heap_entry *ent;
+	size_t total = 0;
+
+	ca->inc_gen_needs_gc = 0;
+
+	switch (ca->mi.replacement) {
+	case CACHE_REPLACEMENT_LRU:
+		find_reclaimable_buckets_lru(c, ca);
+		break;
+	case CACHE_REPLACEMENT_FIFO:
+		find_reclaimable_buckets_fifo(c, ca);
+		break;
+	case CACHE_REPLACEMENT_RANDOM:
+		find_reclaimable_buckets_random(c, ca);
+		break;
+	}
+
+	heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
+
+	for (ent = ca->alloc_heap.data;
+	     ent < ca->alloc_heap.data + ca->alloc_heap.used;
+	     ent++)
+		total += ent->nr;
+
+	return total;
+}
+
+/*
+ * Take the next bucket from the entry at the top of ca->alloc_heap, popping
+ * entries whose count has dropped to zero along the way.
+ *
+ * Returns the bucket index, or -1 once the heap is empty.
+ */
+static inline long next_alloc_bucket(struct bch_dev *ca)
+{
+	struct alloc_heap_entry popped, *head = ca->alloc_heap.data;
+
+	while (ca->alloc_heap.used) {
+		size_t bucket;
+
+		if (!head->nr) {
+			/* exhausted entry: drop it and look at the new top */
+			heap_pop(&ca->alloc_heap, popped, bucket_alloc_cmp, NULL);
+			continue;
+		}
+
+		bucket = head->bucket++;
+		head->nr--;
+		return bucket;
+	}
+
+	return -1;
+}
+
+/*
+ * Returns the sequence number of the most recent journal entry that updated
+ * this bucket, or 0 if the mark has no valid journal sequence number.
+ *
+ * The result is the journal's current sequence number with its low 16 bits
+ * replaced by m.journal_seq; if that would lie past the current sequence
+ * number, it is moved back by 1 << 16.
+ */
+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
+{
+	u64 cur_seq, seq;
+
+	if (!m.journal_seq_valid)
+		return 0;
+
+	cur_seq	= atomic64_read(&c->journal.seq);
+	seq	= (cur_seq & ~((u64) U16_MAX)) | m.journal_seq;
+
+	if (seq > cur_seq)
+		seq -= 1 << 16;
+
+	return seq;
+}
+
+/*
+ * Invalidate the first bucket of the entry at the top of ca->alloc_heap.
+ *
+ * The bucket is pushed onto ca->free_inc and marked as owned by the allocator
+ * up front; then a new alloc key (gen bumped, data type and sector counts
+ * cleared, io times set to the current bucket clock hands) is built from the
+ * in memory bucket and committed through @trans/@iter, with @flags added to
+ * the commit flags.
+ *
+ * On success the bucket is consumed from alloc_heap and *@journal_seq is
+ * raised to at least bucket_journal_seq() of the mark read from the in memory
+ * bucket. On any failure - including a failed lookup of the existing key - the
+ * bucket is taken back off free_inc and its allocator mark is cleared before
+ * the error is returned, so that a bucket which was not invalidated is never
+ * left queued for reuse.
+ *
+ * Returns 0 on success or a nonzero error code.
+ */
+static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
+				       struct bch_dev *ca,
+				       struct btree_iter *iter,
+				       u64 *journal_seq, unsigned flags)
+{
+#if 0
+	__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
+#else
+	/* hack: */
+	__BKEY_PADDED(k, 8) alloc_key;
+#endif
+	struct bch_fs *c = trans->c;
+	struct alloc_heap_entry e, *top;
+	struct bkey_i_alloc *a;
+	struct bkey_alloc_unpacked u;
+	struct bucket *g;
+	struct bucket_mark m;
+	struct bkey_s_c k;
+	bool invalidating_cached_data;
+	size_t b, b2;
+	int ret;
+
+	BUG_ON(!ca->alloc_heap.used ||
+	       !ca->alloc_heap.data[0].nr);
+	b = ca->alloc_heap.data[0].bucket;
+
+	/* first, put on free_inc and mark as owned by allocator: */
+	percpu_down_read(&c->mark_lock);
+	spin_lock(&c->freelist_lock);
+
+	verify_not_on_freelist(c, ca, b);
+
+	BUG_ON(!fifo_push(&ca->free_inc, b));
+
+	bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+
+	spin_unlock(&c->freelist_lock);
+	percpu_up_read(&c->mark_lock);
+
+	BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
+
+	bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
+retry:
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret) {
+		/*
+		 * The bucket is already on free_inc and marked as ours: it
+		 * must be taken back off, or the caller would hand out a
+		 * bucket that was never invalidated.
+		 */
+		goto err;
+	}
+
+	/*
+	 * The allocator has to start before journal replay is finished - thus,
+	 * we have to trust the in memory bucket @m, not the version in the
+	 * btree:
+	 */
+	percpu_down_read(&c->mark_lock);
+	g = bucket(ca, b);
+	m = READ_ONCE(g->mark);
+	u = alloc_mem_to_key(g, m);
+	percpu_up_read(&c->mark_lock);
+
+	invalidating_cached_data = m.cached_sectors != 0;
+
+	u.gen++;
+	u.data_type	= 0;
+	u.dirty_sectors	= 0;
+	u.cached_sectors = 0;
+	u.read_time	= c->bucket_clock[READ].hand;
+	u.write_time	= c->bucket_clock[WRITE].hand;
+
+	a = bkey_alloc_init(&alloc_key.k);
+	a->k.p = iter->pos;
+	bch2_alloc_pack(a, u);
+
+	bch2_trans_update(trans, iter, &a->k_i);
+
+	/*
+	 * XXX:
+	 * when using deferred btree updates, we have journal reclaim doing
+	 * btree updates and thus requiring the allocator to make forward
+	 * progress, and here the allocator is requiring space in the journal -
+	 * so we need a journal pre-reservation:
+	 */
+	ret = bch2_trans_commit(trans, NULL,
+				invalidating_cached_data ? journal_seq : NULL,
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_NOUNLOCK|
+				BTREE_INSERT_NOCHECK_RW|
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_USE_RESERVE|
+				BTREE_INSERT_USE_ALLOC_RESERVE|
+				BTREE_INSERT_BUCKET_INVALIDATE|
+				flags);
+	if (ret == -EINTR)
+		goto retry;
+	if (ret)
+		goto err;
+
+	/* remove from alloc_heap: */
+	top = ca->alloc_heap.data;
+
+	top->bucket++;
+	top->nr--;
+
+	if (!top->nr)
+		heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+
+	/*
+	 * Make sure we flush the last journal entry that updated this
+	 * bucket (i.e. deleting the last reference) before writing to
+	 * this bucket again:
+	 */
+	*journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+
+	return 0;
+err:
+	/* remove from free_inc: */
+	percpu_down_read(&c->mark_lock);
+	spin_lock(&c->freelist_lock);
+
+	bch2_mark_alloc_bucket(c, ca, b, false,
+			       gc_pos_alloc(c, NULL), 0);
+
+	/* we pushed @b last, so it must be what comes off the back: */
+	BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
+	BUG_ON(b != b2);
+
+	spin_unlock(&c->freelist_lock);
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
+
+/*
+ * Invalidate @bucket via bch2_invalidate_bucket() and queue it on
+ * ca->free_inc.
+ *
+ * *@flush_seq is raised to at least bucket_journal_seq() of the mark that
+ * bch2_invalidate_bucket() stored in @m (presumably the bucket's mark from
+ * before it was invalidated - confirm against bch2_invalidate_bucket()).
+ *
+ * Returns true if that mark had cached sectors.
+ */
+static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
+				       size_t bucket, u64 *flush_seq)
+{
+	struct bucket_mark m;
+
+	percpu_down_read(&c->mark_lock);
+	spin_lock(&c->freelist_lock);
+
+	bch2_invalidate_bucket(c, ca, bucket, &m);
+
+	verify_not_on_freelist(c, ca, bucket);
+	/* the caller must have made sure free_inc has room: */
+	BUG_ON(!fifo_push(&ca->free_inc, bucket));
+
+	spin_unlock(&c->freelist_lock);
+
+	/* io clocks are reset outside freelist_lock, but still under mark_lock: */
+	bucket_io_clock_reset(c, ca, bucket, READ);
+	bucket_io_clock_reset(c, ca, bucket, WRITE);
+
+	percpu_up_read(&c->mark_lock);
+
+	*flush_seq = max(*flush_seq, bucket_journal_seq(c, m));
+
+	return m.cached_sectors != 0;
+}
+
+/*
+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
+ *
+ * Keeps going until an invalidation fails, free_inc is full, or alloc_heap is
+ * empty. An invalidation error is only reported if free_inc ended up empty;
+ * otherwise it is dropped. If any invalidation recorded a journal sequence
+ * number, the journal is then flushed up to it.
+ *
+ * Returns 0 on success, or the nonzero error from the invalidation or from
+ * the journal flush.
+ */
+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	u64 journal_seq = 0;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	/* one iterator is reused for every bucket; it is repositioned per bucket */
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
+				   POS(ca->dev_idx, 0),
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	/* Only use nowait if we've already invalidated at least one bucket: */
+	while (!ret &&
+	       !fifo_full(&ca->free_inc) &&
+	       ca->alloc_heap.used)
+		ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq,
+				BTREE_INSERT_GC_LOCK_HELD|
+				(!fifo_empty(&ca->free_inc)
+				 ? BTREE_INSERT_NOWAIT : 0));
+
+	bch2_trans_exit(&trans);
+
+	/* If we used NOWAIT, don't return the error: */
+	if (!fifo_empty(&ca->free_inc))
+		ret = 0;
+	if (ret) {
+		bch_err(ca, "error invalidating buckets: %i", ret);
+		return ret;
+	}
+
+	/* journal_seq stays 0 if no invalidation recorded a sequence number */
+	if (journal_seq)
+		ret = bch2_journal_flush_seq(&c->journal, journal_seq);
+	if (ret) {
+		bch_err(ca, "journal error: %i", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Move @bucket from ca->free_inc onto one of the ca->free[] reserves, sleeping
+ * until one of them has room.
+ *
+ * The reserves are tried in index order under c->freelist_lock. The first
+ * successful push also pops the head of free_inc, so @bucket is expected to be
+ * that head (the caller in this file passes fifo_peek(&ca->free_inc)).
+ *
+ * Returns 0 once the bucket was moved, or 1 if the calling kthread was asked
+ * to stop while waiting.
+ */
+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
+{
+	unsigned i;
+	int ret = 0;
+
+	while (1) {
+		/*
+		 * The task state is set before the freelists are checked, so
+		 * the check and the schedule() below form one wait cycle:
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_lock(&c->freelist_lock);
+		for (i = 0; i < RESERVE_NR; i++)
+			if (fifo_push(&ca->free[i], bucket)) {
+				/* now on a reserve - drop it from free_inc: */
+				fifo_pop(&ca->free_inc, bucket);
+
+				closure_wake_up(&c->freelist_wait);
+				ca->allocator_state = ALLOCATOR_RUNNING;
+
+				spin_unlock(&c->freelist_lock);
+				goto out;
+			}
+
+		/* every reserve is full; wake waiters only on the state change: */
+		if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) {
+			ca->allocator_state = ALLOCATOR_BLOCKED_FULL;
+			closure_wake_up(&c->freelist_wait);
+		}
+
+		spin_unlock(&c->freelist_lock);
+
+		/* kthread_should_stop() is only consulted for kernel threads: */
+		if ((current->flags & PF_KTHREAD) &&
+		    kthread_should_stop()) {
+			ret = 1;
+			break;
+		}
+
+		schedule();
+		try_to_freeze();
+	}
+out:
+	__set_current_state(TASK_RUNNING);
+	return ret;
+}
+
+/*
+ * Drain ca->free_inc: each bucket at its head is discarded (when the device
+ * has discards enabled and its queue supports them) and then handed to
+ * push_invalidated_bucket(), which waits for room on the freelists.
+ *
+ * Returns 0 once free_inc is empty, or 1 as soon as
+ * push_invalidated_bucket() reports that we should stop.
+ */
+static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+	for (;;) {
+		size_t b;
+
+		if (fifo_empty(&ca->free_inc))
+			return 0;
+
+		b = fifo_peek(&ca->free_inc);
+
+		if (ca->mi.discard &&
+		    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+			blkdev_issue_discard(ca->disk_sb.bdev,
+					     bucket_to_sector(ca, b),
+					     ca->mi.bucket_size, GFP_NOIO, 0);
+
+		if (push_invalidated_bucket(c, ca, b))
+			return 1;
+	}
+}
+
+/**
+ * bch2_allocator_thread - move buckets from free_inc to reserves
+ * @arg: the struct bch_dev this allocator thread serves
+ *
+ * The reserves are depleted by bucket allocation. Each pass first hands the
+ * buckets on free_inc to the ca->free[] reserves via
+ * discard_invalidated_buckets(), then refills free_inc from alloc_heap with
+ * bch2_invalidate_buckets(). Only when free_inc is still empty after that is
+ * alloc_heap repopulated by find_reclaimable_buckets().
+ *
+ * Always returns 0; on exit allocator_state is set to ALLOCATOR_STOPPED and
+ * waiters on c->freelist_wait are woken.
+ */
+static int bch2_allocator_thread(void *arg)
+{
+	struct bch_dev *ca = arg;
+	struct bch_fs *c = ca->fs;
+	size_t nr;
+	int ret;
+
+	set_freezable();
+	ca->allocator_state = ALLOCATOR_RUNNING;
+
+	while (1) {
+		cond_resched();
+
+		pr_debug("discarding %zu invalidated buckets",
+			 fifo_used(&ca->free_inc));
+
+		ret = discard_invalidated_buckets(c, ca);
+		if (ret)
+			goto stop;
+
+		/* gc_lock is held for reading across invalidation and scanning: */
+		down_read(&c->gc_lock);
+
+		ret = bch2_invalidate_buckets(c, ca);
+		if (ret) {
+			up_read(&c->gc_lock);
+			goto stop;
+		}
+
+		/* got some buckets: go back and move them to the reserves */
+		if (!fifo_empty(&ca->free_inc)) {
+			up_read(&c->gc_lock);
+			continue;
+		}
+
+		pr_debug("free_inc now empty");
+
+		do {
+			/*
+			 * Find some buckets that we can invalidate, either
+			 * they're completely unused, or only contain clean data
+			 * that's been written back to the backing device or
+			 * another cache tier
+			 */
+
+			pr_debug("scanning for reclaimable buckets");
+
+			nr = find_reclaimable_buckets(c, ca);
+
+			pr_debug("found %zu buckets", nr);
+
+			trace_alloc_batch(ca, nr, ca->alloc_heap.size);
+
+			/* kick the gc thread, if there is one, when gc is needed: */
+			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
+			     ca->inc_gen_really_needs_gc) &&
+			    c->gc_thread) {
+				atomic_inc(&c->kick_gc);
+				wake_up_process(c->gc_thread);
+			}
+
+			/*
+			 * If we found any buckets, we have to invalidate them
+			 * before we scan for more - but if we didn't find very
+			 * many we may want to wait on more buckets being
+			 * available so we don't spin:
+			 */
+			if (!nr ||
+			    (nr < ALLOC_SCAN_BATCH(ca) &&
+			     !fifo_empty(&ca->free[RESERVE_NONE]))) {
+				ret = wait_buckets_available(c, ca);
+				if (ret) {
+					up_read(&c->gc_lock);
+					goto stop;
+				}
+			}
+		} while (!nr);
+
+		up_read(&c->gc_lock);
+
+		pr_debug("%zu buckets to invalidate", nr);
+
+		/*
+		 * alloc_heap now holds buckets that can be invalidated: the
+		 * next iteration of this loop invalidates them and moves them
+		 * to free_inc via bch2_invalidate_buckets().
+		 */
+	}
+
+stop:
+	pr_debug("alloc thread stopping (ret %i)", ret);
+	ca->allocator_state = ALLOCATOR_STOPPED;
+	closure_wake_up(&c->freelist_wait);
+	return 0;
+}
+
+/* Startup/shutdown (ro/rw): */
+
+/*
+ * Recompute the filesystem wide figures that depend on which devices are
+ * online/rw: readahead pages, c->capacity, c->bucket_size_max and each rw
+ * device's copygc_threshold.
+ *
+ * c->capacity is the summed size of all rw members minus a reserve, which is
+ * the larger of the gc reserve and twice the per device reserves (and never
+ * more than the total). The bucket clock rescale timers are added when the
+ * resulting capacity is nonzero and removed otherwise.
+ *
+ * Must be called with c->state_lock held.
+ */
+void bch2_recalc_capacity(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+	unsigned bucket_size_max = 0;
+	unsigned long ra_pages = 0;
+	unsigned i, j;
+
+	lockdep_assert_held(&c->state_lock);
+
+	/* readahead is the sum over all online members: */
+	for_each_online_member(ca, c, i) {
+		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
+
+		ra_pages += bdi->ra_pages;
+	}
+
+	bch2_set_ra_pages(c, ra_pages);
+
+	for_each_rw_member(ca, c, i) {
+		u64 dev_reserve = 0;
+
+		/*
+		 * We need to reserve buckets (from the number
+		 * of currently available buckets) against
+		 * foreground writes so that mainly copygc can
+		 * make forward progress.
+		 *
+		 * We need enough to refill the various reserves
+		 * from scratch - copygc will use its entire
+		 * reserve all at once, then run again when
+		 * its reserve is refilled (from the formerly
+		 * available buckets).
+		 *
+		 * This reserve is just used when considering if
+		 * allocations for foreground writes must wait -
+		 * not -ENOSPC calculations.
+		 */
+		for (j = 0; j < RESERVE_NONE; j++)
+			dev_reserve += ca->free[j].size;
+
+		dev_reserve += 1;	/* btree write point */
+		dev_reserve += 1;	/* copygc write point */
+		dev_reserve += 1;	/* rebalance write point */
+
+		/* convert from buckets to sectors: */
+		dev_reserve *= ca->mi.bucket_size;
+
+		ca->copygc_threshold = dev_reserve;
+
+		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+					     ca->mi.first_bucket);
+
+		reserved_sectors += dev_reserve * 2;
+
+		bucket_size_max = max_t(unsigned, bucket_size_max,
+					ca->mi.bucket_size);
+	}
+
+	/* gc reserve: an explicit byte count if set, else a percentage: */
+	gc_reserve = c->opts.gc_reserve_bytes
+		? c->opts.gc_reserve_bytes >> 9
+		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
+
+	reserved_sectors = max(gc_reserve, reserved_sectors);
+
+	/* never reserve more than there is: */
+	reserved_sectors = min(reserved_sectors, capacity);
+
+	c->capacity = capacity - reserved_sectors;
+
+	c->bucket_size_max = bucket_size_max;
+
+	if (c->capacity) {
+		bch2_io_timer_add(&c->io_clock[READ],
+				 &c->bucket_clock[READ].rescale);
+		bch2_io_timer_add(&c->io_clock[WRITE],
+				 &c->bucket_clock[WRITE].rescale);
+	} else {
+		bch2_io_timer_del(&c->io_clock[READ],
+				 &c->bucket_clock[READ].rescale);
+		bch2_io_timer_del(&c->io_clock[WRITE],
+				 &c->bucket_clock[WRITE].rescale);
+	}
+
+	/* Wake up in case someone was waiting for buckets */
+	closure_wake_up(&c->freelist_wait);
+}
+
+/*
+ * Returns true if any entry of c->open_buckets is valid, not on a partial
+ * list, and points at device @ca. Every entry is inspected under its own
+ * lock.
+ */
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
+{
+	bool found = false;
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) {
+		struct open_bucket *ob = &c->open_buckets[i];
+
+		spin_lock(&ob->lock);
+		found |= ob->valid &&
+			!ob->on_partial_list &&
+			ob->ptr.dev == ca->dev_idx;
+		spin_unlock(&ob->lock);
+	}
+
+	return found;
+}
+
+/*
+ * device goes ro:
+ *
+ * Takes @ca out of every allocation group, recalculates capacity, and then
+ * releases what the allocator still holds: the write points are stopped for
+ * this device, the btree reserve cache is emptied (all entries, not only this
+ * device's), the device's partially used open buckets are put, and
+ * bch2_ec_stop_dev() is called. Finally waits until no valid open bucket that
+ * is off the partial list points at the device.
+ *
+ * ca->alloc_thread must already be NULL (see the BUG_ON below).
+ */
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
+{
+	unsigned i;
+
+	BUG_ON(ca->alloc_thread);
+
+	/* First, remove device from allocation groups: */
+
+	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+		clear_bit(ca->dev_idx, c->rw_devs[i].d);
+
+	/*
+	 * Capacity is calculated based off of devices in allocation groups:
+	 */
+	bch2_recalc_capacity(c);
+
+	/* Next, close write points that point to this device... */
+	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+		bch2_writepoint_stop(c, ca, &c->write_points[i]);
+
+	bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
+	bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
+	bch2_writepoint_stop(c, ca, &c->btree_write_point);
+
+	/* drop every open bucket held in the btree reserve cache: */
+	mutex_lock(&c->btree_reserve_cache_lock);
+	while (c->btree_reserve_cache_nr) {
+		struct btree_alloc *a =
+			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+		bch2_open_buckets_put(c, &a->ob);
+	}
+	mutex_unlock(&c->btree_reserve_cache_lock);
+
+	/*
+	 * Put the device's partial open buckets one at a time; freelist_lock
+	 * is dropped before each bch2_open_bucket_put():
+	 */
+	while (1) {
+		struct open_bucket *ob;
+
+		spin_lock(&c->freelist_lock);
+		if (!ca->open_buckets_partial_nr) {
+			spin_unlock(&c->freelist_lock);
+			break;
+		}
+		ob = c->open_buckets +
+			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+		ob->on_partial_list = false;
+		spin_unlock(&c->freelist_lock);
+
+		bch2_open_bucket_put(c, ob);
+	}
+
+	bch2_ec_stop_dev(c, ca);
+
+	/*
+	 * Wake up threads that were blocked on allocation, so they can notice
+	 * the device can no longer be removed and the capacity has changed:
+	 */
+	closure_wake_up(&c->freelist_wait);
+
+	/*
+	 * journal_res_get() can block waiting for free space in the journal -
+	 * it needs to notice there may not be devices to allocate from anymore:
+	 */
+	wake_up(&c->journal.wait);
+
+	/* Now wait for any in flight writes: */
+
+	closure_wait_event(&c->open_buckets_wait,
+			   !bch2_dev_has_open_write_point(c, ca));
+}
+
+/*
+ * device goes rw:
+ *
+ * Set @ca's bit in c->rw_devs[i] for every index i whose bit is set in
+ * ca->mi.data_allowed.
+ */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
+{
+	unsigned type;
+
+	for (type = 0; type < ARRAY_SIZE(c->rw_devs); type++) {
+		if (!(ca->mi.data_allowed & (1 << type)))
+			continue;
+
+		set_bit(ca->dev_idx, c->rw_devs[type].d);
+	}
+}
+
+/*
+ * Wait on c->freelist_wait until @ca's allocator thread has left the
+ * ALLOCATOR_RUNNING state. Returns immediately if the device has no
+ * allocator thread.
+ */
+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
+{
+	if (!ca->alloc_thread)
+		return;
+
+	closure_wait_event(&c->freelist_wait,
+			   ca->allocator_state != ALLOCATOR_RUNNING);
+}
+
+/*
+ * stop allocator thread:
+ *
+ * Clears ca->alloc_thread, waits for an RCU grace period, and only then stops
+ * the kthread and drops a reference on its task (bch2_dev_allocator_start()
+ * takes one when it creates the thread). If no thread was set, only the
+ * grace period wait happens.
+ */
+void bch2_dev_allocator_stop(struct bch_dev *ca)
+{
+	struct task_struct *p;
+
+	p = rcu_dereference_protected(ca->alloc_thread, 1);
+	ca->alloc_thread = NULL;
+
+	/*
+	 * We need an rcu barrier between setting ca->alloc_thread = NULL and
+	 * the thread shutting down to avoid bch2_wake_allocator() racing:
+	 *
+	 * XXX: it would be better to have the rcu barrier be asynchronous
+	 * instead of blocking us here
+	 */
+	synchronize_rcu();
+
+	if (p) {
+		kthread_stop(p);
+		put_task_struct(p);
+	}
+}
+
+/*
+ * start allocator thread:
+ *
+ * No-op if ca->alloc_thread is already set. Otherwise creates the
+ * "bch_alloc[<name>]" kthread running bch2_allocator_thread(), takes a
+ * reference on the task, publishes it in ca->alloc_thread and wakes it.
+ *
+ * Returns 0 on success or the error from kthread_create().
+ */
+int bch2_dev_allocator_start(struct bch_dev *ca)
+{
+	struct task_struct *p;
+
+	/*
+	 * allocator thread already started?
+	 */
+	if (ca->alloc_thread)
+		return 0;
+
+	p = kthread_create(bch2_allocator_thread, ca,
+			   "bch_alloc[%s]", ca->name);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	/* bch2_dev_allocator_stop() calls put_task_struct() on this task: */
+	get_task_struct(p);
+	rcu_assign_pointer(ca->alloc_thread, p);
+	wake_up_process(p);
+	return 0;
+}
+
+/*
+ * Condition function for the closure_wait_event() at the end of
+ * bch2_fs_allocator_start(): issue writes for cached btree nodes that need
+ * one.
+ *
+ * A node that needs a write and may be written is written immediately, and
+ * the scan then restarts from the top; nodes that need a write but may not
+ * be written yet are only noted. If btree roots are dirty,
+ * bch2_journal_meta() is called and the scan restarts as well.
+ *
+ * Returns true if the journal is in an error state, or if the last scan
+ * found no unwritten nodes and no interior btree updates are pending.
+ */
+static bool flush_held_btree_writes(struct bch_fs *c)
+{
+	struct bucket_table *tbl;
+	struct rhash_head *pos;
+	struct btree *b;
+	bool nodes_unwritten;
+	size_t i;
+again:
+	cond_resched();
+	nodes_unwritten = false;
+
+	/* on journal error, report "done" so the waiter is released: */
+	if (bch2_journal_error(&c->journal))
+		return true;
+
+	rcu_read_lock();
+	for_each_cached_btree(b, c, tbl, i, pos)
+		if (btree_node_need_write(b)) {
+			if (btree_node_may_write(b)) {
+				/*
+				 * The RCU read lock is dropped before the node
+				 * is locked and written; the walk is not
+				 * resumed but restarted from the top:
+				 */
+				rcu_read_unlock();
+				btree_node_lock_type(c, b, SIX_LOCK_read);
+				bch2_btree_node_write(c, b, SIX_LOCK_read);
+				six_unlock_read(&b->lock);
+				goto again;
+			} else {
+				nodes_unwritten = true;
+			}
+		}
+	rcu_read_unlock();
+
+	if (c->btree_roots_dirty) {
+		bch2_journal_meta(&c->journal);
+		goto again;
+	}
+
+	return !nodes_unwritten &&
+		!bch2_btree_interior_updates_nr_pending(c);
+}
+
+/*
+ * Empty free_inc on every rw member, discarding each bucket that comes off
+ * it.
+ *
+ * As in discard_invalidated_buckets(), a discard is only issued when the
+ * device has discards enabled (ca->mi.discard) and its queue supports them.
+ * The entries are popped regardless, so free_inc always ends up empty.
+ */
+static void allocator_start_issue_discards(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_iter;
+	size_t bu;
+
+	for_each_rw_member(ca, c, dev_iter) {
+		bool discard = ca->mi.discard &&
+			blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev));
+
+		while (fifo_pop(&ca->free_inc, bu))
+			if (discard)
+				blkdev_issue_discard(ca->disk_sb.bdev,
+						     bucket_to_sector(ca, bu),
+						     ca->mi.bucket_size,
+						     GFP_NOIO, 0);
+	}
+}
+
+/*
+ * If ca->free_inc is full, replace it with a fifo of twice the size holding
+ * the same entries, and free the old one.
+ *
+ * Returns 0 on success (including when nothing had to be done), -ENOMEM if
+ * the larger fifo could not be allocated.
+ */
+static int resize_free_inc(struct bch_dev *ca)
+{
+	alloc_fifo bigger;
+
+	if (fifo_full(&ca->free_inc)) {
+		if (!init_fifo(&bigger, ca->free_inc.size * 2, GFP_KERNEL))
+			return -ENOMEM;
+
+		fifo_move(&bigger, &ca->free_inc);
+		/* after the swap, @bigger holds the old, smaller fifo: */
+		swap(bigger, ca->free_inc);
+		free_fifo(&bigger);
+	}
+
+	return 0;
+}
+
+/*
+ * Fast path for allocator startup: look for buckets that need no
+ * invalidation and move them straight to the freelists.
+ *
+ * For every rw member, buckets with a valid gen that are available, hold no
+ * cached sectors and are not flagged in ca->buckets_nouse are marked as
+ * allocator owned, pushed to free_inc and handed on with
+ * discard_invalidated_buckets(), until free[RESERVE_BTREE] is full.
+ *
+ * Returns true only if free[RESERVE_BTREE] ended up full on every rw member;
+ * always returns false when test_alloc_startup() is set.
+ */
+static bool bch2_fs_allocator_start_fast(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_iter;
+	bool ret = true;
+
+	if (test_alloc_startup(c))
+		return false;
+
+	down_read(&c->gc_lock);
+
+	/* Scan for buckets that are already invalidated: */
+	for_each_rw_member(ca, c, dev_iter) {
+		struct bucket_array *buckets;
+		struct bucket_mark m;
+		long bu;
+
+		down_read(&ca->bucket_lock);
+		buckets = bucket_array(ca);
+
+		for (bu = buckets->first_bucket;
+		     bu < buckets->nbuckets; bu++) {
+			m = READ_ONCE(buckets->b[bu].mark);
+
+			/* skip anything that would first need invalidating: */
+			if (!buckets->b[bu].gen_valid ||
+			    !is_available_bucket(m) ||
+			    m.cached_sectors ||
+			    (ca->buckets_nouse &&
+			     test_bit(bu, ca->buckets_nouse)))
+				continue;
+
+			percpu_down_read(&c->mark_lock);
+			bch2_mark_alloc_bucket(c, ca, bu, true,
+					gc_pos_alloc(c, NULL), 0);
+			percpu_up_read(&c->mark_lock);
+
+			/*
+			 * NOTE(review): the results of fifo_push() and
+			 * discard_invalidated_buckets() are not checked here.
+			 */
+			fifo_push(&ca->free_inc, bu);
+
+			discard_invalidated_buckets(c, ca);
+
+			if (fifo_full(&ca->free[RESERVE_BTREE]))
+				break;
+		}
+		up_read(&ca->bucket_lock);
+	}
+
+	up_read(&c->gc_lock);
+
+	/* did we find enough buckets? */
+	for_each_rw_member(ca, c, dev_iter)
+		if (!fifo_full(&ca->free[RESERVE_BTREE]))
+			ret = false;
+
+	return ret;
+}
+
+/*
+ * Fill free[RESERVE_BTREE] on every rw member at startup.
+ *
+ * Tries bch2_fs_allocator_start_fast() first (unless test_alloc_startup() is
+ * set). Failing that, with BCH_FS_HOLD_BTREE_WRITES set, it repeatedly scans
+ * for reclaimable buckets, invalidates them in memory and puts them directly
+ * on free[RESERVE_BTREE], then calls bch2_alloc_write() - looping for as long
+ * as that call reports it wrote something. Afterwards the journal is flushed
+ * and the buckets collected on free_inc are discarded.
+ *
+ * Whether the slow path succeeds or fails, the hold on btree writes is
+ * lifted and held writes are flushed before returning.
+ *
+ * Returns 0 on success or a nonzero error code.
+ */
+int bch2_fs_allocator_start(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_iter;
+	u64 journal_seq = 0;
+	bool wrote;
+	long bu;
+	int ret = 0;
+
+	if (!test_alloc_startup(c) &&
+	    bch2_fs_allocator_start_fast(c))
+		return 0;
+
+	pr_debug("not enough empty buckets; scanning for reclaimable buckets");
+
+	/*
+	 * We're moving buckets to freelists _before_ they've been marked as
+	 * invalidated on disk - we have to so that we can allocate new btree
+	 * nodes to mark them as invalidated on disk.
+	 *
+	 * However, we can't _write_ to any of these buckets yet - they might
+	 * have cached data in them, which is live until they're marked as
+	 * invalidated on disk:
+	 */
+	set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+
+	down_read(&c->gc_lock);
+	do {
+		wrote = false;
+
+		for_each_rw_member(ca, c, dev_iter) {
+			find_reclaimable_buckets(c, ca);
+
+			while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
+			       (bu = next_alloc_bucket(ca)) >= 0) {
+				/* make room on free_inc before pushing to it: */
+				ret = resize_free_inc(ca);
+				if (ret) {
+					/*
+					 * Leaving for_each_rw_member() early -
+					 * presumably this drops the ref the
+					 * iterator holds on @ca:
+					 */
+					percpu_ref_put(&ca->io_ref);
+					up_read(&c->gc_lock);
+					goto err;
+				}
+
+				bch2_invalidate_one_bucket(c, ca, bu,
+							   &journal_seq);
+
+				fifo_push(&ca->free[RESERVE_BTREE], bu);
+			}
+		}
+
+		pr_debug("done scanning for reclaimable buckets");
+
+		/*
+		 * XXX: it's possible for this to deadlock waiting on journal reclaim,
+		 * since we're holding btree writes. What then?
+		 */
+		ret = bch2_alloc_write(c,
+				       BTREE_INSERT_NOCHECK_RW|
+				       BTREE_INSERT_USE_ALLOC_RESERVE|
+				       BTREE_INSERT_NOWAIT, &wrote);
+
+		/*
+		 * If bch2_alloc_write() did anything, it may have used some
+		 * buckets, and we need the RESERVE_BTREE freelist full - so we
+		 * need to loop and scan again.
+		 * And if it errored, it may have been because there weren't
+		 * enough buckets, so just scan and loop again as long as it
+		 * made some progress:
+		 */
+	} while (wrote);
+	up_read(&c->gc_lock);
+
+	/* only the error of the final bch2_alloc_write() call is acted on: */
+	if (ret)
+		goto err;
+
+	pr_debug("flushing journal");
+
+	ret = bch2_journal_flush(&c->journal);
+	if (ret)
+		goto err;
+
+	pr_debug("issuing discards");
+	allocator_start_issue_discards(c);
+err:
+	/* reached on success as well: release held btree writes and flush them */
+	clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+	closure_wait_event(&c->btree_interior_update_wait,
+			   flush_held_btree_writes(c));
+
+	return ret;
+}
+
+/*
+ * Initialise the background allocator state of @c: the freelist lock, the
+ * READ and WRITE bucket clocks, and the pd_controllers_update delayed work
+ * together with its interval field (set to 5).
+ */
+void bch2_fs_allocator_background_init(struct bch_fs *c)
+{
+	spin_lock_init(&c->freelist_lock);
+	bch2_bucket_clock_init(c, READ);
+	bch2_bucket_clock_init(c, WRITE);
+
+	c->pd_controllers_update_seconds = 5;
+	INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
+}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
new file mode 100644
index 000000000000..501c444353fb
--- /dev/null
+++ b/fs/bcachefs/alloc_background.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
+#define _BCACHEFS_ALLOC_BACKGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "debug.h"
+
+/*
+ * Unpacked form of an alloc key's value (see bch2_alloc_unpack() and
+ * bch2_alloc_pack()): the bucket gen plus one integer member per entry of
+ * BCH_ALLOC_FIELDS(), each as wide as that entry's _bits.
+ */
+struct bkey_alloc_unpacked {
+	u8		gen;
+/* x(_name, _bits) expands to the member declaration "u<_bits> _name;" */
+#define x(_name, _bits)	u##_bits _name;
+	BCH_ALLOC_FIELDS()
+#undef  x
+};
+
+/*
+ * Field-wise comparison of two unpacked alloc keys: gen and every member
+ * generated from BCH_ALLOC_FIELDS().
+ *
+ * returns true if not equal
+ */
+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
+					   struct bkey_alloc_unpacked r)
+{
+	return l.gen != r.gen
+/* each field appends another "|| l.<field> != r.<field>" term: */
+#define x(_name, _bits)	|| l._name != r._name
+	BCH_ALLOC_FIELDS()
+#undef  x
+	;
+}
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
+void bch2_alloc_pack(struct bkey_i_alloc *,
+		     const struct bkey_alloc_unpacked);
+
+/*
+ * Build an unpacked alloc key from the in memory state of a bucket: gen, data
+ * type and sector counts come from the mark @m; oldest_gen and the io times
+ * come from the bucket @g.
+ */
+static inline struct bkey_alloc_unpacked
+alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+{
+	struct bkey_alloc_unpacked u = {
+		/* from the mark: */
+		.gen		= m.gen,
+		.data_type	= m.data_type,
+		.dirty_sectors	= m.dirty_sectors,
+		.cached_sectors	= m.cached_sectors,
+		/* from the bucket: */
+		.oldest_gen	= g->oldest_gen,
+		.read_time	= g->io_time[READ],
+		.write_time	= g->io_time[WRITE],
+	};
+
+	return u;
+}
+
+/* Scan batch size for device @ca: nbuckets / 512, but never less than 1 */
+#define ALLOC_SCAN_BATCH(ca)		max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
+
+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+/* bkey_ops for alloc keys: the validation and value-to-text hooks */
+#define bch2_bkey_ops_alloc (struct bkey_ops) {		\
+	.key_invalid	= bch2_alloc_invalid,		\
+	.val_to_text	= bch2_alloc_to_text,		\
+}
+
+struct journal_keys;
+int bch2_alloc_read(struct bch_fs *, struct journal_keys *);
+int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *);
+
+/*
+ * Wake @ca's allocator thread, if it has one. ca->alloc_thread is read under
+ * the RCU read lock, and the wakeup is issued before that lock is dropped.
+ */
+static inline void bch2_wake_allocator(struct bch_dev *ca)
+{
+	struct task_struct *thread;
+
+	rcu_read_lock();
+
+	thread = rcu_dereference(ca->alloc_thread);
+	if (thread)
+		wake_up_process(thread);
+
+	rcu_read_unlock();
+}
+
+/*
+ * Debug check: BUG if @bucket is found on any of ca->free[] or on
+ * ca->free_inc. Only active when expensive debug checks are enabled and
+ * BCH_FS_ALLOCATOR_STARTED is set.
+ */
+static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
+					  size_t bucket)
+{
+	size_t iter;
+	long i;
+	unsigned j;
+
+	if (!expensive_debug_checks(c) ||
+	    !test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
+		return;
+
+	for (j = 0; j < RESERVE_NR; j++)
+		fifo_for_each_entry(i, &ca->free[j], iter)
+			BUG_ON(i == bucket);
+
+	fifo_for_each_entry(i, &ca->free_inc, iter)
+		BUG_ON(i == bucket);
+}
+
+void bch2_recalc_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_stop(struct bch_dev *);
+int bch2_dev_allocator_start(struct bch_dev *);
+
+int bch2_alloc_write(struct bch_fs *, unsigned, bool *);
+int bch2_fs_allocator_start(struct bch_fs *);
+void bch2_fs_allocator_background_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
new file mode 100644
index 000000000000..697d576802b6
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.c
@@ -0,0 +1,1044 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Primary bucket allocation code
+ *
+ * Copyright 2012 Google, Inc.
+ *
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens that matter, but
+ * the code is named as if it's the priorities) are written in an arbitrary list
+ * of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write(), and when prio_write() finishes we pull buckets off the
+ * free_inc list and optionally discard them.
+ *
+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * It's important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyways - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * bch2_bucket_alloc() allocates a single bucket from a specific device.
+ *
+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices
+ * in a given filesystem.
+ *
+ * invalidate_buckets() drives all the processes described above. It's called
+ * from bch2_bucket_alloc() and a few other places that need to make sure free
+ * buckets are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "io.h"
+
+#include <linux/math64.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcachefs.h>
+
+/*
+ * Internal result codes of bch2_bucket_alloc().  Failures are handed back
+ * negated and wrapped in ERR_PTR(); bch2_bucket_alloc_set() decodes them again
+ * with -PTR_ERR().
+ */
+enum bucket_alloc_ret {
+	ALLOC_SUCCESS,
+	OPEN_BUCKETS_EMPTY,	/* Too few free open_buckets for this reserve */
+	FREELIST_EMPTY,		/* Allocator thread not keeping up */
+};
+
+/*
+ * Open buckets represent a bucket that's currently being allocated from.  They
+ * serve two purposes:
+ *
+ *  - They track buckets that have been partially allocated, allowing for
+ *    sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ *  - They provide a reference to the buckets they own that mark and sweep GC
+ *    can find, until the new allocation has a pointer to it inserted into the
+ *    btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
+
+/*
+ * Release @ob; bch2_open_bucket_put() calls this once the last pin on @ob has
+ * been dropped.
+ *
+ * Buckets that belong to a stripe (ob->ec set) are handed to
+ * bch2_ec_bucket_written() and nothing else is done here.  Otherwise
+ * bch2_mark_alloc_bucket() is called with a false flag (presumably clearing
+ * the "owned by allocator" mark - TODO confirm against buckets.c), @ob is
+ * marked invalid and pushed back onto c->open_buckets_freelist, and waiters on
+ * c->open_buckets_wait are woken.
+ */
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+	if (ob->ec) {
+		bch2_ec_bucket_written(c, ob);
+		return;
+	}
+
+	percpu_down_read(&c->mark_lock);
+	spin_lock(&ob->lock);
+
+	bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
+			       false, gc_pos_alloc(c, ob), 0);
+	ob->valid = false;
+	ob->type = 0;
+
+	spin_unlock(&ob->lock);
+	percpu_up_read(&c->mark_lock);
+
+	/* Push @ob onto the index-linked freelist of open_buckets: */
+	spin_lock(&c->freelist_lock);
+	ob->freelist = c->open_buckets_freelist;
+	c->open_buckets_freelist = ob - c->open_buckets;
+	c->open_buckets_nr_free++;
+	spin_unlock(&c->freelist_lock);
+
+	closure_wake_up(&c->open_buckets_wait);
+}
+
+/*
+ * For every open bucket in @obs that lives on device index @dev and belongs
+ * to a stripe (ob->ec set), call bch2_ec_bucket_cancel() on it.
+ */
+void bch2_open_bucket_write_error(struct bch_fs *c,
+				  struct open_buckets *obs,
+				  unsigned dev)
+{
+	struct open_bucket *ob;
+	unsigned idx;
+
+	open_bucket_for_each(c, obs, ob, idx) {
+		if (ob->ptr.dev != dev || !ob->ec)
+			continue;
+
+		bch2_ec_bucket_cancel(c, ob);
+	}
+}
+
+/*
+ * Pop an open_bucket off c->open_buckets_freelist and return it with its pin
+ * count set to 1 and its type reset to 0.
+ *
+ * The freelist is threaded through ob->freelist as indices into
+ * c->open_buckets, and index 0 is the sentinel, so an empty freelist is a
+ * BUG() here.  The caller in this file, bch2_bucket_alloc(), calls this with
+ * c->freelist_lock held.
+ */
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
+{
+	struct open_bucket *ob;
+
+	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
+
+	ob = c->open_buckets + c->open_buckets_freelist;
+	c->open_buckets_freelist = ob->freelist;
+	atomic_set(&ob->pin, 1);
+	ob->type = 0;
+
+	c->open_buckets_nr_free--;
+	return ob;
+}
+
+/*
+ * Dispose of an open bucket the caller no longer wants.
+ *
+ * If @may_realloc is set, @ob is stashed on its device's
+ * open_buckets_partial[] array, where bch2_bucket_alloc() can hand it out
+ * again, and waiters on c->open_buckets_wait and c->freelist_wait are woken.
+ * Otherwise the caller's reference is dropped with bch2_open_bucket_put().
+ */
+static void open_bucket_free_unused(struct bch_fs *c,
+				    struct open_bucket *ob,
+				    bool may_realloc)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+	BUG_ON(ca->open_buckets_partial_nr >=
+	       ARRAY_SIZE(ca->open_buckets_partial));
+
+	/*
+	 * The bound check below repeats the BUG_ON() above, so in practice
+	 * only @may_realloc decides which branch is taken.
+	 *
+	 * NOTE(review): open_buckets_partial_nr is read here without
+	 * c->freelist_lock, although it is only modified under that lock -
+	 * confirm this is safe.
+	 */
+	if (ca->open_buckets_partial_nr <
+	    ARRAY_SIZE(ca->open_buckets_partial) &&
+	    may_realloc) {
+		spin_lock(&c->freelist_lock);
+		ob->on_partial_list = true;
+		ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
+			ob - c->open_buckets;
+		spin_unlock(&c->freelist_lock);
+
+		closure_wake_up(&c->open_buckets_wait);
+		closure_wake_up(&c->freelist_wait);
+	} else {
+		bch2_open_bucket_put(c, ob);
+	}
+}
+
+/*
+ * Debug check, compiled in only with CONFIG_BCACHEFS_DEBUG: BUG() if any open
+ * bucket in @obs has a pointer that ptr_stale() reports as stale for its
+ * device.  Without the config option this is a no-op.
+ */
+static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, obs, ob, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+		BUG_ON(ptr_stale(ca, &ob->ptr));
+	}
+#endif
+}
+
+/*
+ * _only_ for allocating the journal on a new device:
+ *
+ * Scans @ca's buckets from mi.first_bucket up to mi.nbuckets, under the RCU
+ * read lock, and returns the index of the first one whose mark satisfies
+ * is_available_bucket(), or -1 if there is none.
+ */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
+{
+	struct bucket_array *buckets;
+	ssize_t b, found = -1;
+
+	rcu_read_lock();
+	buckets = bucket_array(ca);
+
+	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
+		if (is_available_bucket(buckets->b[b].mark)) {
+			found = b;
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+	return found;
+}
+
+/*
+ * Number of open_buckets that must be left free when allocating from
+ * @reserve: none for RESERVE_ALLOC, one BTREE_NODE_OPEN_BUCKET_RESERVE for
+ * RESERVE_BTREE, and twice that for every other reserve.
+ */
+static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
+{
+	if (reserve == RESERVE_ALLOC)
+		return 0;
+
+	if (reserve == RESERVE_BTREE)
+		return BTREE_NODE_OPEN_BUCKET_RESERVE;
+
+	return BTREE_NODE_OPEN_BUCKET_RESERVE * 2;
+}
+
+/**
+ * bch2_bucket_alloc - allocate a single bucket from a specific device
+ * @c:			filesystem
+ * @ca:			device to allocate from
+ * @reserve:		reserve the caller is allowed to dip into
+ * @may_alloc_partial:	if true, an open bucket parked on @ca's
+ *			open_buckets_partial[] array may be returned instead of
+ *			a fresh one
+ * @cl:			if non-NULL, added to the matching wait list when the
+ *			allocation fails
+ *
+ * Returns the open_bucket that now refers to the bucket on success.  On
+ * failure returns ERR_PTR(-OPEN_BUCKETS_EMPTY) if too few open_buckets are
+ * free for @reserve, or ERR_PTR(-FREELIST_EMPTY) if no bucket could be popped
+ * off the freelists @reserve may use.
+ */
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+				      enum alloc_reserve reserve,
+				      bool may_alloc_partial,
+				      struct closure *cl)
+{
+	struct bucket_array *buckets;
+	struct open_bucket *ob;
+	long bucket = 0;
+
+	spin_lock(&c->freelist_lock);
+
+	/* Reuse a parked, partially used open bucket if allowed: */
+	if (may_alloc_partial &&
+	    ca->open_buckets_partial_nr) {
+		ob = c->open_buckets +
+			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+		ob->on_partial_list = false;
+		spin_unlock(&c->freelist_lock);
+		return ob;
+	}
+
+	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
+		if (cl)
+			closure_wait(&c->open_buckets_wait, cl);
+
+		/* Start of the blocked interval, for the time stats below: */
+		if (!c->blocked_allocate_open_bucket)
+			c->blocked_allocate_open_bucket = local_clock();
+
+		spin_unlock(&c->freelist_lock);
+		trace_open_bucket_alloc_fail(ca, reserve);
+		return ERR_PTR(-OPEN_BUCKETS_EMPTY);
+	}
+
+	/* Every reserve may take from the RESERVE_NONE freelist: */
+	if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
+		goto out;
+
+	/* Otherwise fall back to the freelist(s) this reserve may use: */
+	switch (reserve) {
+	case RESERVE_ALLOC:
+		if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
+			goto out;
+		break;
+	case RESERVE_BTREE:
+		/* Only while the btree freelist is at least half full: */
+		if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
+		    ca->free[RESERVE_BTREE].size &&
+		    fifo_pop(&ca->free[RESERVE_BTREE], bucket))
+			goto out;
+		break;
+	case RESERVE_MOVINGGC:
+		if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
+			goto out;
+		break;
+	default:
+		break;
+	}
+
+	if (cl)
+		closure_wait(&c->freelist_wait, cl);
+
+	if (!c->blocked_allocate)
+		c->blocked_allocate = local_clock();
+
+	spin_unlock(&c->freelist_lock);
+
+	trace_bucket_alloc_fail(ca, reserve);
+	return ERR_PTR(-FREELIST_EMPTY);
+out:
+	/* c->freelist_lock is still held here */
+	verify_not_on_freelist(c, ca, bucket);
+
+	ob = bch2_open_bucket_alloc(c);
+
+	spin_lock(&ob->lock);
+	buckets = bucket_array(ca);
+
+	ob->valid	= true;
+	ob->sectors_free = ca->mi.bucket_size;
+	ob->ptr		= (struct bch_extent_ptr) {
+		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
+		.gen	= buckets->b[bucket].mark.gen,
+		.offset	= bucket_to_sector(ca, bucket),
+		.dev	= ca->dev_idx,
+	};
+
+	bucket_io_clock_reset(c, ca, bucket, READ);
+	bucket_io_clock_reset(c, ca, bucket, WRITE);
+	spin_unlock(&ob->lock);
+
+	/* We succeeded: account for any time spent blocked, and reset: */
+	if (c->blocked_allocate_open_bucket) {
+		bch2_time_stats_update(
+			&c->times[BCH_TIME_blocked_allocate_open_bucket],
+			c->blocked_allocate_open_bucket);
+		c->blocked_allocate_open_bucket = 0;
+	}
+
+	if (c->blocked_allocate) {
+		bch2_time_stats_update(
+			&c->times[BCH_TIME_blocked_allocate],
+			c->blocked_allocate);
+		c->blocked_allocate = 0;
+	}
+
+	spin_unlock(&c->freelist_lock);
+
+	bch2_wake_allocator(ca);
+
+	trace_bucket_alloc(ca, reserve);
+	return ob;
+}
+
+/*
+ * Three-way comparison of device indices @l and @r by their next_alloc[]
+ * value in @stripe: returns -1, 0 or 1 as @l's value is smaller than, equal
+ * to or greater than @r's.
+ */
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+			    unsigned l, unsigned r)
+{
+	u64 lhs = stripe->next_alloc[l];
+	u64 rhs = stripe->next_alloc[r];
+
+	if (lhs < rhs)
+		return -1;
+
+	return lhs > rhs;
+}
+
+/* Expects a variable named "stripe" to be in scope where it is expanded: */
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
+
+/*
+ * Build the list of member device indices selected by @devs, sorted with
+ * dev_stripe_cmp(), i.e. by their stripe->next_alloc[] values.
+ *
+ * Iterates with for_each_member_device_rcu(); all callers in this file hold
+ * the RCU read lock around the call.
+ */
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+					  struct dev_stripe_state *stripe,
+					  struct bch_devs_mask *devs)
+{
+	struct dev_alloc_list ret = { .nr = 0 };
+	struct bch_dev *ca;
+	unsigned i;
+
+	for_each_member_device_rcu(ca, c, i, devs)
+		ret.devs[ret.nr++] = i;
+
+	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
+	return ret;
+}
+
+/*
+ * Account for an allocation from @ca in @stripe.
+ *
+ * @ca's next_alloc counter is advanced by 2^48 divided by the value of
+ * dev_buckets_free() (or by 2^48 if that is zero), saturating at U64_MAX, so
+ * the less free space a device reports the faster its counter grows.
+ * Afterwards every counter in @stripe is lowered by a quarter of @ca's
+ * pre-increment value, clamping at 0 - presumably to keep the counters from
+ * growing without bound.
+ */
+void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
+			       struct dev_stripe_state *stripe)
+{
+	u64 *v = stripe->next_alloc + ca->dev_idx;
+	u64 free_space = dev_buckets_free(c, ca);
+	u64 free_space_inv = free_space
+		? div64_u64(1ULL << 48, free_space)
+		: 1ULL << 48;
+	u64 scale = *v / 4;
+
+	/* Saturating add: the sum wrapped if it is smaller than *v */
+	if (*v + free_space_inv >= *v)
+		*v += free_space_inv;
+	else
+		*v = U64_MAX;
+
+	for (v = stripe->next_alloc;
+	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+		*v = *v < scale ? 0 : *v - scale;
+}
+
+/* Flags understood by the bucket allocation helpers below: */
+
+/* bch2_bucket_alloc() may hand out a parked, partially used open bucket: */
+#define BUCKET_MAY_ALLOC_PARTIAL	(1 << 0)
+/* Count each bucket as its device's durability instead of as one replica: */
+#define BUCKET_ALLOC_USE_DURABILITY	(1 << 1)
+
+/*
+ * Record @ob as part of the allocation being assembled in @ptrs:
+ *
+ *  - its device is removed from @devs_may_alloc, so it is not picked twice
+ *  - *@nr_effective grows by the device's durability if
+ *    BUCKET_ALLOC_USE_DURABILITY is set in @flags, otherwise by 1
+ *  - *@have_cache is set if the device has zero durability
+ */
+static void add_new_bucket(struct bch_fs *c,
+			   struct open_buckets *ptrs,
+			   struct bch_devs_mask *devs_may_alloc,
+			   unsigned *nr_effective,
+			   bool *have_cache,
+			   unsigned flags,
+			   struct open_bucket *ob)
+{
+	unsigned durability =
+		bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability;
+
+	__clear_bit(ob->ptr.dev, devs_may_alloc->d);
+	*nr_effective	+= (flags & BUCKET_ALLOC_USE_DURABILITY)
+		? durability : 1;
+	*have_cache	|= !durability;
+
+	ob_push(c, ptrs, ob);
+}
+
+/*
+ * Allocate buckets from the devices in @devs_may_alloc, at most one per
+ * device, until *@nr_effective reaches @nr_replicas.  New buckets are
+ * appended to @ptrs.
+ *
+ * Devices are tried in the order produced by bch2_dev_alloc_list(); devices
+ * with zero durability are skipped once *@have_cache is set.
+ *
+ * Returns:
+ *   0        enough replicas were allocated
+ *   -EAGAIN  an allocation failed and @cl was supplied (bch2_bucket_alloc()
+ *            has put @cl on a wait list)
+ *   -ENOSPC  no @cl, and either we ran out of open_buckets or some device's
+ *            freelist was empty
+ *   -EROFS   we ran out of devices without any allocation failing
+ *
+ * Uses rcu_dereference() on c->devs[]; both callers in this file hold
+ * c->mark_lock for read and the RCU read lock.
+ */
+static int bch2_bucket_alloc_set(struct bch_fs *c,
+				 struct open_buckets *ptrs,
+				 struct dev_stripe_state *stripe,
+				 struct bch_devs_mask *devs_may_alloc,
+				 unsigned nr_replicas,
+				 unsigned *nr_effective,
+				 bool *have_cache,
+				 enum alloc_reserve reserve,
+				 unsigned flags,
+				 struct closure *cl)
+{
+	struct dev_alloc_list devs_sorted =
+		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+	struct bch_dev *ca;
+	bool alloc_failure = false;
+	unsigned i;
+
+	BUG_ON(*nr_effective >= nr_replicas);
+
+	for (i = 0; i < devs_sorted.nr; i++) {
+		struct open_bucket *ob;
+
+		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+		if (!ca)
+			continue;
+
+		/* At most one zero-durability bucket per allocation: */
+		if (!ca->mi.durability && *have_cache)
+			continue;
+
+		ob = bch2_bucket_alloc(c, ca, reserve,
+				flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
+		if (IS_ERR(ob)) {
+			enum bucket_alloc_ret ret = -PTR_ERR(ob);
+
+			WARN_ON(reserve == RESERVE_MOVINGGC &&
+				ret != OPEN_BUCKETS_EMPTY);
+
+			if (cl)
+				return -EAGAIN;
+			if (ret == OPEN_BUCKETS_EMPTY)
+				return -ENOSPC;
+			/* Freelist empty on this device: try the next one */
+			alloc_failure = true;
+			continue;
+		}
+
+		add_new_bucket(c, ptrs, devs_may_alloc,
+			       nr_effective, have_cache, flags, ob);
+
+		bch2_dev_stripe_increment(c, ca, stripe);
+
+		if (*nr_effective >= nr_replicas)
+			return 0;
+	}
+
+	return alloc_failure ? -ENOSPC : -EROFS;
+}
+
+/* Allocate from stripes: */
+
+/*
+ * Allocate the buckets for a new stripe on @h: tops up h->parity to
+ * h->redundancy buckets and h->blocks to nr_data buckets, never using a device
+ * that already backs one of the stripe's buckets, then calls
+ * bch2_ec_stripe_new_alloc().
+ *
+ * Returns the result of bch2_ec_stripe_new_alloc() if all buckets could be
+ * allocated, or -1 if a bucket allocation failed (the underlying error code
+ * is discarded; the caller in this file only checks for non-zero).
+ *
+ * XXX: use a higher watermark for allocating open buckets here:
+ */
+static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct bch_devs_mask devs;
+	struct open_bucket *ob;
+	unsigned i, nr_have = 0, nr_data =
+		min_t(unsigned, h->nr_active_devs,
+		      EC_STRIPE_MAX) - h->redundancy;
+	/* Starts out true, so zero-durability devices are never used here: */
+	bool have_cache = true;
+	int ret = 0;
+
+	BUG_ON(h->blocks.nr > nr_data);
+	BUG_ON(h->parity.nr > h->redundancy);
+
+	devs = h->devs;
+
+	/* Exclude devices the stripe already has buckets on: */
+	open_bucket_for_each(c, &h->parity, ob, i)
+		__clear_bit(ob->ptr.dev, devs.d);
+	open_bucket_for_each(c, &h->blocks, ob, i)
+		__clear_bit(ob->ptr.dev, devs.d);
+
+	percpu_down_read(&c->mark_lock);
+	rcu_read_lock();
+
+	if (h->parity.nr < h->redundancy) {
+		nr_have = h->parity.nr;
+
+		ret = bch2_bucket_alloc_set(c, &h->parity,
+					    &h->parity_stripe,
+					    &devs,
+					    h->redundancy,
+					    &nr_have,
+					    &have_cache,
+					    RESERVE_NONE,
+					    0,
+					    NULL);
+		if (ret)
+			goto err;
+	}
+
+	if (h->blocks.nr < nr_data) {
+		nr_have = h->blocks.nr;
+
+		ret = bch2_bucket_alloc_set(c, &h->blocks,
+					    &h->block_stripe,
+					    &devs,
+					    nr_data,
+					    &nr_have,
+					    &have_cache,
+					    RESERVE_NONE,
+					    0,
+					    NULL);
+		if (ret)
+			goto err;
+	}
+
+	rcu_read_unlock();
+	percpu_up_read(&c->mark_lock);
+
+	return bch2_ec_stripe_new_alloc(c, h);
+err:
+	rcu_read_unlock();
+	percpu_up_read(&c->mark_lock);
+	return -1;
+}
+
+/*
+ * Try to add one bucket from an erasure coded stripe to @ptrs.
+ *
+ * Does nothing if @erasure_code is 0, if @nr_replicas < 2, if @ptrs already
+ * contains a stripe bucket, or if no stripe head / stripe can be obtained.
+ * Otherwise the first not yet allocated block of the stripe that lives on a
+ * device in @devs_may_alloc (devices tried in bch2_dev_alloc_list() order) is
+ * claimed, tagged with its stripe and block index, and added to @ptrs; the
+ * stripe's pin count is raised for it.
+ *
+ * Failure is not reported: the caller checks *@nr_effective afterwards.
+ *
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static void bucket_alloc_from_stripe(struct bch_fs *c,
+				     struct open_buckets *ptrs,
+				     struct write_point *wp,
+				     struct bch_devs_mask *devs_may_alloc,
+				     u16 target,
+				     unsigned erasure_code,
+				     unsigned nr_replicas,
+				     unsigned *nr_effective,
+				     bool *have_cache,
+				     unsigned flags)
+{
+	struct dev_alloc_list devs_sorted;
+	struct ec_stripe_head *h;
+	struct open_bucket *ob;
+	struct bch_dev *ca;
+	unsigned i, ec_idx;
+
+	if (!erasure_code)
+		return;
+
+	if (nr_replicas < 2)
+		return;
+
+	if (ec_open_bucket(c, ptrs))
+		return;
+
+	h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
+	if (!h)
+		return;
+
+	/* No stripe in progress on this head yet: allocate one */
+	if (!h->s && ec_stripe_alloc(c, h))
+		goto out_put_head;
+
+	rcu_read_lock();
+	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+	rcu_read_unlock();
+
+	/* test_and_set_bit() claims the block, so each is handed out once: */
+	for (i = 0; i < devs_sorted.nr; i++)
+		open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+			if (ob->ptr.dev == devs_sorted.devs[i] &&
+			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+				goto got_bucket;
+	goto out_put_head;
+got_bucket:
+	/* NOTE(review): ca is assigned here but not used afterwards */
+	ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+	ob->ec_idx	= ec_idx;
+	ob->ec		= h->s;
+
+	add_new_bucket(c, ptrs, devs_may_alloc,
+		       nr_effective, have_cache, flags, ob);
+	atomic_inc(&h->s->pin);
+out_put_head:
+	bch2_ec_stripe_head_put(h);
+}
+
+/* Sector allocator */
+
+/*
+ * Move suitable open buckets from @wp->ptrs over to @ptrs.
+ *
+ * A bucket is taken if all of the following hold:
+ *  - *@nr_effective is still below @nr_replicas
+ *  - its device is set in @devs_may_alloc
+ *  - its device has non-zero durability, or @wp holds user data and
+ *    *@have_cache is not yet set
+ *  - it belongs to a stripe, or @need_ec is false
+ *
+ * Buckets that are not taken stay on @wp->ptrs.
+ */
+static void get_buckets_from_writepoint(struct bch_fs *c,
+					struct open_buckets *ptrs,
+					struct write_point *wp,
+					struct bch_devs_mask *devs_may_alloc,
+					unsigned nr_replicas,
+					unsigned *nr_effective,
+					bool *have_cache,
+					unsigned flags,
+					bool need_ec)
+{
+	struct open_buckets ptrs_skip = { .nr = 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+		if (*nr_effective < nr_replicas &&
+		    test_bit(ob->ptr.dev, devs_may_alloc->d) &&
+		    (ca->mi.durability ||
+		     (wp->type == BCH_DATA_USER && !*have_cache)) &&
+		    (ob->ec || !need_ec)) {
+			add_new_bucket(c, ptrs, devs_may_alloc,
+				       nr_effective, have_cache,
+				       flags, ob);
+		} else {
+			ob_push(c, &ptrs_skip, ob);
+		}
+	}
+	/* Whatever we did not take remains with the write point: */
+	wp->ptrs = ptrs_skip;
+}
+
+/*
+ * Add open buckets to @ptrs until *@nr_effective reaches @nr_replicas,
+ * considering only the devices target_rw_devs() returns for @target, minus
+ * those in @devs_have and those already backing a bucket in @ptrs.
+ *
+ * Sources are tried in this order:
+ *  1. if @erasure_code: stripe buckets already on @wp, then a bucket from a
+ *     stripe (bucket_alloc_from_stripe())
+ *  2. any suitable bucket already on @wp
+ *  3. freshly allocated buckets (bch2_bucket_alloc_set()), first without a
+ *     closure and then, if that failed with anything but -EROFS and @_cl was
+ *     supplied, once more with @_cl
+ *
+ * Returns 0 on success, otherwise the error from bch2_bucket_alloc_set().
+ */
+static int open_bucket_add_buckets(struct bch_fs *c,
+				   struct open_buckets *ptrs,
+				   struct write_point *wp,
+				   struct bch_devs_list *devs_have,
+				   u16 target,
+				   unsigned erasure_code,
+				   unsigned nr_replicas,
+				   unsigned *nr_effective,
+				   bool *have_cache,
+				   enum alloc_reserve reserve,
+				   unsigned flags,
+				   struct closure *_cl)
+{
+	struct bch_devs_mask devs;
+	struct open_bucket *ob;
+	struct closure *cl = NULL;
+	unsigned i;
+	int ret;
+
+	rcu_read_lock();
+	devs = target_rw_devs(c, wp->type, target);
+	rcu_read_unlock();
+
+	/* Don't allocate from devices we already have pointers to: */
+	for (i = 0; i < devs_have->nr; i++)
+		__clear_bit(devs_have->devs[i], devs.d);
+
+	open_bucket_for_each(c, ptrs, ob, i)
+		__clear_bit(ob->ptr.dev, devs.d);
+
+	if (erasure_code) {
+		get_buckets_from_writepoint(c, ptrs, wp, &devs,
+					    nr_replicas, nr_effective,
+					    have_cache, flags, true);
+		if (*nr_effective >= nr_replicas)
+			return 0;
+
+		bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+					 target, erasure_code,
+					 nr_replicas, nr_effective,
+					 have_cache, flags);
+		if (*nr_effective >= nr_replicas)
+			return 0;
+	}
+
+	get_buckets_from_writepoint(c, ptrs, wp, &devs,
+				    nr_replicas, nr_effective,
+				    have_cache, flags, false);
+	if (*nr_effective >= nr_replicas)
+		return 0;
+
+	percpu_down_read(&c->mark_lock);
+	rcu_read_lock();
+
+retry_blocking:
+	/*
+	 * Try nonblocking first, so that if one device is full we'll try from
+	 * other devices:
+	 */
+	ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
+				nr_replicas, nr_effective, have_cache,
+				reserve, flags, cl);
+	if (ret && ret != -EROFS && !cl && _cl) {
+		cl = _cl;
+		goto retry_blocking;
+	}
+
+	rcu_read_unlock();
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
+
+/*
+ * Drop from @obs every open bucket that must go away because device @ca is
+ * being stopped; the remaining buckets are left in @obs.
+ *
+ * A bucket is dropped (its reference put) if @ca is NULL (drop everything),
+ * if it lives on @ca, or if it belongs to a stripe that has any block or
+ * parity bucket on @ca.
+ */
+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
+				struct open_buckets *obs)
+{
+	struct open_buckets ptrs = { .nr = 0 };
+	struct open_bucket *ob, *ob2;
+	unsigned i, j;
+
+	open_bucket_for_each(c, obs, ob, i) {
+		bool drop = !ca || ob->ptr.dev == ca->dev_idx;
+
+		/* !drop implies ca != NULL, so ca->dev_idx is safe below */
+		if (!drop && ob->ec) {
+			mutex_lock(&ob->ec->lock);
+			open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
+				drop |= ob2->ptr.dev == ca->dev_idx;
+			open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+				drop |= ob2->ptr.dev == ca->dev_idx;
+			mutex_unlock(&ob->ec->lock);
+		}
+
+		if (drop)
+			bch2_open_bucket_put(c, ob);
+		else
+			ob_push(c, &ptrs, ob);
+	}
+
+	*obs = ptrs;
+}
+
+/*
+ * Run bch2_open_buckets_stop_dev() on @wp's open buckets with wp->lock held.
+ * As there, a NULL @ca drops all of them.
+ */
+void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+			  struct write_point *wp)
+{
+	mutex_lock(&wp->lock);
+	bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
+	mutex_unlock(&wp->lock);
+}
+
+/*
+ * Returns the bucket of c->write_points_hash[] that @write_point hashes to,
+ * using hash_long() with as many bits as the table size requires.
+ */
+static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
+						 unsigned long write_point)
+{
+	unsigned bits = ilog2(ARRAY_SIZE(c->write_points_hash));
+	unsigned slot = hash_long(write_point, bits);
+
+	return c->write_points_hash + slot;
+}
+
+/*
+ * Walk hash chain @head (RCU list traversal) and return the write point whose
+ * ->write_point equals @write_point, or NULL if there is none.
+ */
+static struct write_point *__writepoint_find(struct hlist_head *head,
+					     unsigned long write_point)
+{
+	struct write_point *wp, *found = NULL;
+
+	hlist_for_each_entry_rcu(wp, head, node) {
+		if (wp->write_point == write_point) {
+			found = wp;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * True if the space that could be stranded in write points - one maximum
+ * sized bucket per active write point - multiplied by @factor exceeds the
+ * free space reported by bch2_fs_usage_read_short().
+ */
+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
+{
+	u64 stranded = c->write_points_nr * c->bucket_size_max;
+	u64 avail = bch2_fs_usage_read_short(c).free;
+
+	if (stranded * factor > avail)
+		return true;
+
+	return false;
+}
+
+/*
+ * Activate one more write point from c->write_points[] and add it to the hash
+ * table.
+ *
+ * Returns false, doing nothing, if every write point is already active or if
+ * too_many_writepoints(c, 32); true otherwise.  The caller in this file,
+ * writepoint_find(), holds c->write_points_hash_lock.
+ */
+static bool try_increase_writepoints(struct bch_fs *c)
+{
+	struct write_point *wp;
+
+	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
+	    too_many_writepoints(c, 32))
+		return false;
+
+	wp = c->write_points + c->write_points_nr++;
+	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
+	return true;
+}
+
+/*
+ * Try to retire the last active write point, to free up the open buckets it
+ * is holding.
+ *
+ * @old_nr is the value of c->write_points_nr the caller sampled earlier.
+ *
+ * Returns true if the number of write points went down - either someone else
+ * already brought it below @old_nr, or we retired one here - and false if
+ * only one write point is left or too_many_writepoints(c, 8) does not hold.
+ */
+static bool try_decrease_writepoints(struct bch_fs *c,
+				     unsigned old_nr)
+{
+	struct write_point *wp;
+
+	mutex_lock(&c->write_points_hash_lock);
+	if (c->write_points_nr < old_nr) {
+		mutex_unlock(&c->write_points_hash_lock);
+		return true;
+	}
+
+	if (c->write_points_nr == 1 ||
+	    !too_many_writepoints(c, 8)) {
+		mutex_unlock(&c->write_points_hash_lock);
+		return false;
+	}
+
+	wp = c->write_points + --c->write_points_nr;
+
+	hlist_del_rcu(&wp->node);
+	mutex_unlock(&c->write_points_hash_lock);
+
+	/* NULL device: release every open bucket the write point holds */
+	bch2_writepoint_stop(c, NULL, wp);
+	return true;
+}
+
+/*
+ * Resolve write point specifier @write_point to a write point and return it
+ * with wp->lock held.
+ *
+ * If the low bit of @write_point is clear, it is a pointer to a struct
+ * write_point (see writepoint_ptr()); that write point is locked and returned
+ * directly, without updating its last_used.
+ *
+ * Otherwise it is a hashed value (see writepoint_hashed()): the hash table is
+ * searched for a write point currently assigned to it.  If none is found,
+ * the least recently used active write point is reassigned to @write_point -
+ * unless the number of write points could be increased first, in which case
+ * the search for the oldest one is restarted.
+ */
+static struct write_point *writepoint_find(struct bch_fs *c,
+					   unsigned long write_point)
+{
+	struct write_point *wp, *oldest;
+	struct hlist_head *head;
+
+	if (!(write_point & 1UL)) {
+		wp = (struct write_point *) write_point;
+		mutex_lock(&wp->lock);
+		return wp;
+	}
+
+	head = writepoint_hash(c, write_point);
+restart_find:
+	wp = __writepoint_find(head, write_point);
+	if (wp) {
+lock_wp:
+		mutex_lock(&wp->lock);
+		/* Recheck under the lock: it may have been reassigned */
+		if (wp->write_point == write_point)
+			goto out;
+		mutex_unlock(&wp->lock);
+		goto restart_find;
+	}
+restart_find_oldest:
+	oldest = NULL;
+	for (wp = c->write_points;
+	     wp < c->write_points + c->write_points_nr; wp++)
+		if (!oldest || time_before64(wp->last_used, oldest->last_used))
+			oldest = wp;
+
+	mutex_lock(&oldest->lock);
+	mutex_lock(&c->write_points_hash_lock);
+	/*
+	 * Start over if @oldest was retired in the meantime (it now lies past
+	 * the active range) or if we managed to activate another write point:
+	 */
+	if (oldest >= c->write_points + c->write_points_nr ||
+	    try_increase_writepoints(c)) {
+		mutex_unlock(&c->write_points_hash_lock);
+		mutex_unlock(&oldest->lock);
+		goto restart_find_oldest;
+	}
+
+	/* Someone else may have set up a write point for us meanwhile: */
+	wp = __writepoint_find(head, write_point);
+	if (wp && wp != oldest) {
+		mutex_unlock(&c->write_points_hash_lock);
+		mutex_unlock(&oldest->lock);
+		goto lock_wp;
+	}
+
+	/* Steal @oldest: rehash it under the new write point value */
+	wp = oldest;
+	hlist_del_rcu(&wp->node);
+	wp->write_point = write_point;
+	hlist_add_head_rcu(&wp->node, head);
+	mutex_unlock(&c->write_points_hash_lock);
+out:
+	wp->last_used = sched_clock();
+	return wp;
+}
+
+/*
+ * Get us a write point we can allocate from, return with it locked:
+ *
+ * @target:		devices to prefer; 0 for no target
+ * @erasure_code:	if non-zero, try to get a bucket from a stripe
+ * @write_point:	which write point to use, resolved by writepoint_find()
+ * @devs_have:		devices we already have pointers to, which must not be
+ *			allocated from
+ * @nr_replicas:	number of replicas wanted
+ * @nr_replicas_required: number of replicas we can settle for when the
+ *			allocation fails with -EROFS
+ * @reserve:		reserve to allocate from
+ * @flags:		only BCH_WRITE_ONLY_SPECIFIED_DEVS is looked at here
+ * @cl:			closure to wait on if we have to block, or NULL
+ *
+ * With a @target and without BCH_WRITE_ONLY_SPECIFIED_DEVS, the target's
+ * devices are tried first without blocking, then all devices.
+ *
+ * On success returns the write point with wp->lock held, wp->ptrs holding the
+ * open buckets to write to and wp->sectors_free the smallest amount free in
+ * any of them; bch2_alloc_sectors_done() drops the lock again.  On failure
+ * returns an ERR_PTR() with the lock already dropped.
+ */
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+				unsigned target,
+				unsigned erasure_code,
+				struct write_point_specifier write_point,
+				struct bch_devs_list *devs_have,
+				unsigned nr_replicas,
+				unsigned nr_replicas_required,
+				enum alloc_reserve reserve,
+				unsigned flags,
+				struct closure *cl)
+{
+	struct write_point *wp;
+	struct open_bucket *ob;
+	struct open_buckets ptrs;
+	unsigned nr_effective, write_points_nr;
+	unsigned ob_flags = 0;
+	bool have_cache;
+	int ret, i;
+
+	if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
+		ob_flags |= BUCKET_ALLOC_USE_DURABILITY;
+
+	BUG_ON(!nr_replicas || !nr_replicas_required);
+retry:
+	ptrs.nr		= 0;
+	nr_effective	= 0;
+	/* Sampled so try_decrease_writepoints() can tell if it changed: */
+	write_points_nr = c->write_points_nr;
+	have_cache	= false;
+
+	wp = writepoint_find(c, write_point.v);
+
+	if (wp->type == BCH_DATA_USER)
+		ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
+
+	/* metadata may not allocate on cache devices: */
+	if (wp->type != BCH_DATA_USER)
+		have_cache = true;
+
+	if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+					      target, erasure_code,
+					      nr_replicas, &nr_effective,
+					      &have_cache, reserve,
+					      ob_flags, cl);
+	} else {
+		/* Preferred devices first, without blocking ... */
+		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+					      target, erasure_code,
+					      nr_replicas, &nr_effective,
+					      &have_cache, reserve,
+					      ob_flags, NULL);
+		if (!ret)
+			goto alloc_done;
+
+		/* ... then fall back to any device: */
+		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+					      0, erasure_code,
+					      nr_replicas, &nr_effective,
+					      &have_cache, reserve,
+					      ob_flags, cl);
+	}
+alloc_done:
+	BUG_ON(!ret && nr_effective < nr_replicas);
+
+	/* ret is a signed errno, so it must be printed with %d: */
+	if (erasure_code && !ec_open_bucket(c, &ptrs))
+		pr_debug("failed to get ec bucket: ret %d", ret);
+
+	/* Ran out of devices, but we have as many replicas as required: */
+	if (ret == -EROFS &&
+	    nr_effective >= nr_replicas_required)
+		ret = 0;
+
+	if (ret)
+		goto err;
+
+	/* Free buckets we didn't use: */
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
+
+	wp->ptrs = ptrs;
+
+	wp->sectors_free = UINT_MAX;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+
+	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+
+	verify_not_stale(c, &wp->ptrs);
+
+	return wp;
+err:
+	/*
+	 * Hand everything back to the write point: the buckets we gathered in
+	 * ptrs plus, as far as they fit, the ones still on wp->ptrs.
+	 */
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
+			ob_push(c, &ptrs, ob);
+		else
+			open_bucket_free_unused(c, ob,
+					wp->type == BCH_DATA_USER);
+	wp->ptrs = ptrs;
+
+	mutex_unlock(&wp->lock);
+
+	/* Out of space: retiring a write point may free up open buckets */
+	if (ret == -ENOSPC &&
+	    try_decrease_writepoints(c, write_points_nr))
+		goto retry;
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of each of @wp's open buckets.
+ *
+ * One pointer is appended per open bucket in @wp->ptrs, pointing at the first
+ * unused sector of that bucket.  @sectors must not exceed wp->sectors_free.
+ */
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
+				    struct bkey_i *k, unsigned sectors)
+
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	BUG_ON(sectors > wp->sectors_free);
+	wp->sectors_free -= sectors;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+		struct bch_extent_ptr tmp = ob->ptr;
+
+		/* User data on a zero-durability device is a cached copy: */
+		tmp.cached = !ca->mi.durability &&
+			wp->type == BCH_DATA_USER;
+
+		/* Skip over the part of the bucket already handed out: */
+		tmp.offset += ca->mi.bucket_size - ob->sectors_free;
+		bch2_bkey_append_ptr(k, tmp);
+
+		BUG_ON(sectors > ob->sectors_free);
+		ob->sectors_free -= sectors;
+	}
+}
+
+/*
+ * Finish an allocation from @wp: open buckets with no sectors left are
+ * removed from the write point and have a reference put, the rest stay on
+ * the write point, and wp->lock is released.
+ */
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
+{
+	struct open_buckets full = { .nr = 0 };
+	struct open_buckets keep = { .nr = 0 };
+	struct open_bucket *ob;
+	unsigned idx;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, idx) {
+		if (ob->sectors_free)
+			ob_push(c, &keep, ob);
+		else
+			ob_push(c, &full, ob);
+	}
+
+	wp->ptrs = keep;
+	mutex_unlock(&wp->lock);
+
+	/* Drop the references only after the write point is unlocked: */
+	bch2_open_buckets_put(c, &full);
+}
+
+/*
+ * Set up the foreground allocator state of @c: the write point hash lock, the
+ * freelist of open_buckets, the btree and rebalance write points, and the
+ * array of user data write points - all of which start out active and hashed
+ * under their own address.
+ */
+void bch2_fs_allocator_foreground_init(struct bch_fs *c)
+{
+	struct open_bucket *ob;
+	struct write_point *wp;
+
+	mutex_init(&c->write_points_hash_lock);
+	c->write_points_nr = ARRAY_SIZE(c->write_points);
+
+	/* open bucket 0 is a sentinel NULL: */
+	spin_lock_init(&c->open_buckets[0].lock);
+
+	/* Chain every other open_bucket onto the index-linked freelist: */
+	for (ob = c->open_buckets + 1;
+	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+		spin_lock_init(&ob->lock);
+		c->open_buckets_nr_free++;
+
+		ob->freelist = c->open_buckets_freelist;
+		c->open_buckets_freelist = ob - c->open_buckets;
+	}
+
+	writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
+	writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
+
+	for (wp = c->write_points;
+	     wp < c->write_points + c->write_points_nr; wp++) {
+		writepoint_init(wp, BCH_DATA_USER);
+
+		wp->last_used	= sched_clock();
+		wp->write_point	= (unsigned long) wp;
+		hlist_add_head_rcu(&wp->node,
+				   writepoint_hash(c, wp->write_point));
+	}
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
new file mode 100644
index 000000000000..687f973e4b3a
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
+#define _BCACHEFS_ALLOC_FOREGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+
+#include <linux/hash.h>
+
+/* Forward declarations for types only used through pointers below: */
+struct bkey;
+struct bch_dev;
+struct bch_fs;
+struct bch_devs_list;
+
+/*
+ * List of device indices in the order allocation should try them; built by
+ * bch2_dev_alloc_list().
+ */
+struct dev_alloc_list {
+	unsigned	nr;	/* number of valid entries in devs[] */
+	u8		devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
+					  struct dev_stripe_state *,
+					  struct bch_devs_mask *);
+void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *,
+			       struct dev_stripe_state *);
+
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
+				      enum alloc_reserve, bool,
+				      struct closure *);
+
+/*
+ * Append @ob to @obs, which stores open buckets as indices into
+ * c->open_buckets.  It is a BUG() to push onto a full list.
+ */
+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
+			   struct open_bucket *ob)
+{
+	BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
+
+	obs->v[obs->nr++] = ob - c->open_buckets;
+}
+
+/*
+ * Iterate over the open buckets in @_obs: @_i runs over the list positions
+ * and @_ob is set to the corresponding entry of (_c)->open_buckets.  Expands
+ * to a plain for loop, so break and continue work as usual.
+ */
+#define open_bucket_for_each(_c, _obs, _ob, _i)				\
+	for ((_i) = 0;							\
+	     (_i) < (_obs)->nr &&					\
+	     ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true);	\
+	     (_i)++)
+
+/*
+ * Returns the first open bucket in @obs that belongs to a stripe (ob->ec
+ * set), or NULL if there is none.
+ */
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
+						 struct open_buckets *obs)
+{
+	struct open_bucket *ob, *found = NULL;
+	unsigned idx;
+
+	open_bucket_for_each(c, obs, ob, idx) {
+		if (ob->ec) {
+			found = ob;
+			break;
+		}
+	}
+
+	return found;
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *,
+			struct open_buckets *, unsigned);
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+/*
+ * Drop one pin on @ob; when the last pin goes away the open bucket is
+ * released through __bch2_open_bucket_put().
+ */
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+	if (atomic_dec_and_test(&ob->pin))
+		__bch2_open_bucket_put(c, ob);
+}
+
+/* Drop one pin on every open bucket in @ptrs and empty the list. */
+static inline void bch2_open_buckets_put(struct bch_fs *c,
+					 struct open_buckets *ptrs)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, ptrs, ob, i)
+		bch2_open_bucket_put(c, ob);
+	ptrs->nr = 0;
+}
+
+/*
+ * Take an extra pin on every open bucket of @wp and append them to @ptrs;
+ * each bucket's type is set to the write point's type on the way.
+ */
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+					struct write_point *wp,
+					struct open_buckets *ptrs)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		ob->type = wp->type;
+		atomic_inc(&ob->pin);
+		ob_push(c, ptrs, ob);
+	}
+}
+
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
+					     unsigned, unsigned,
+					     struct write_point_specifier,
+					     struct bch_devs_list *,
+					     unsigned, unsigned,
+					     enum alloc_reserve,
+					     unsigned,
+					     struct closure *);
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+				    struct bkey_i *, unsigned);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
+				struct open_buckets *);
+
+void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
+			  struct write_point *);
+
+/*
+ * Build a write point specifier from an arbitrary value @v.  The low bit is
+ * forced to 1, which is how writepoint_find() tells a hashed specifier from
+ * a pointer (see writepoint_ptr()).
+ */
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+	return (struct write_point_specifier) { .v = v | 1 };
+}
+
+/*
+ * Build a write point specifier that names @wp directly.  The address is
+ * stored as is; writepoint_find() treats any specifier with a clear low bit
+ * as such a pointer.
+ */
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+	return (struct write_point_specifier) { .v = (unsigned long) wp };
+}
+
+/* Initialize @wp's lock and set the type of data it is used for. */
+static inline void writepoint_init(struct write_point *wp,
+				   enum bch_data_type type)
+{
+	mutex_init(&wp->lock);
+	wp->type = type;
+}
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
new file mode 100644
index 000000000000..832568dc9551
--- /dev/null
+++ b/fs/bcachefs/alloc_types.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_TYPES_H
+#define _BCACHEFS_ALLOC_TYPES_H
+
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include "clock_types.h"
+#include "fifo.h"
+
+struct ec_bucket_buf;
+
+/* There's two of these clocks, one for reads and one for writes: */
+struct bucket_clock {
+	/*
+	 * "now" in (read/write) IO time - incremented whenever we do X amount
+	 * of reads or writes.
+	 *
+	 * Goes with the bucket read/write prios: when we read or write to a
+	 * bucket we reset the bucket's prio to the current hand; thus hand -
+	 * prio = time since bucket was last read/written.
+	 *
+	 * The units are some amount (bytes/sectors) of data read/written, and
+	 * the units can change on the fly if we need to rescale to fit
+	 * everything in a u16 - your only guarantee is that the units are
+	 * consistent.
+	 */
+	u16			hand;
+	u16			max_last_io;
+
+	int			rw;
+
+	struct io_timer		rescale;
+	struct mutex		lock;
+};
+
+/* One reserve for btree nodes, one for moving GC, and RESERVE_NONE for
+ * everything else (presumably); RESERVE_NR sizes bch_dev's free[] array */
+enum alloc_reserve {
+	RESERVE_ALLOC		= -1,
+	RESERVE_BTREE		= 0,
+	RESERVE_MOVINGGC	= 1,
+	RESERVE_NONE		= 2,
+	RESERVE_NR		= 3,
+};
+
+typedef FIFO(long)	alloc_fifo;
+
+/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
+#define OPEN_BUCKETS_COUNT	256
+
+#define WRITE_POINT_HASH_NR	32
+#define WRITE_POINT_MAX		32
+
+struct open_bucket {
+	spinlock_t		lock;
+	atomic_t		pin;
+	u8			freelist;
+	u8			ec_idx;
+	u8			type;
+	unsigned		valid:1;
+	unsigned		on_partial_list:1;
+	unsigned		sectors_free;
+	struct bch_extent_ptr	ptr;
+	struct ec_stripe_new	*ec;
+};
+
+#define OPEN_BUCKET_LIST_MAX	15
+
+struct open_buckets {
+	u8			nr;
+	u8			v[OPEN_BUCKET_LIST_MAX];
+};
+
+struct dev_stripe_state {
+	u64			next_alloc[BCH_SB_MEMBERS_MAX];
+};
+
+struct write_point {
+	struct hlist_node	node;
+	struct mutex		lock;
+	u64			last_used;
+	unsigned long		write_point;
+	enum bch_data_type	type;
+	bool			is_ec;
+
+	/* calculated based on how many pointers we're actually going to use: */
+	unsigned		sectors_free;
+
+	struct open_buckets	ptrs;
+	struct dev_stripe_state	stripe;
+};
+
+struct write_point_specifier {
+	unsigned long		v;
+};
+
+struct alloc_heap_entry {
+	size_t			bucket;
+	size_t			nr;
+	unsigned long		key;
+};
+
+typedef HEAP(struct alloc_heap_entry) alloc_heap;
+
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
new file mode 100644
index 000000000000..496e1b6824b4
--- /dev/null
+++ b/fs/bcachefs/bcachefs.h
@@ -0,0 +1,854 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_H
+#define _BCACHEFS_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the
+ * priority of each bucket. It could be used to implement something more
+ * sophisticated, if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+
+#include <linux/bug.h>
+#include <linux/bio.h>
+#include <linux/closure.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/mutex.h>
+#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rhashtable.h>
+#include <linux/rwsem.h>
+#include <linux/seqlock.h>
+#include <linux/shrinker.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/zstd.h>
+
+#include "bcachefs_format.h"
+#include "fifo.h"
+#include "opts.h"
+#include "util.h"
+
+#define dynamic_fault(...)		0
+#define race_fault(...)			0
+
+#define bch2_fs_init_fault(name)					\
+	dynamic_fault("bcachefs:bch_fs_init:" name)
+#define bch2_meta_read_fault(name)					\
+	 dynamic_fault("bcachefs:meta:read:" name)
+#define bch2_meta_write_fault(name)					\
+	 dynamic_fault("bcachefs:meta:write:" name)
+
+#ifdef __KERNEL__
+#define bch2_fmt(_c, fmt)	"bcachefs (%s): " fmt "\n", ((_c)->name)
+#else
+#define bch2_fmt(_c, fmt)	fmt "\n"
+#endif
+
+#define bch_info(c, fmt, ...) \
+	printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_notice(c, fmt, ...) \
+	printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn(c, fmt, ...) \
+	printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+	printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err(c, fmt, ...) \
+	printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_ratelimited(c, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+
+#define bch_verbose(c, fmt, ...)					\
+do {									\
+	if ((c)->opts.verbose)						\
+		bch_info(c, fmt, ##__VA_ARGS__);			\
+} while (0)
+
+#define pr_verbose_init(opts, fmt, ...)					\
+do {									\
+	if (opt_get(opts, verbose))					\
+		pr_info(fmt, ##__VA_ARGS__);				\
+} while (0)
+
+/* Parameters that are useful for debugging, but should always be compiled in: */
+#define BCH_DEBUG_PARAMS_ALWAYS()					\
+	BCH_DEBUG_PARAM(key_merging_disabled,				\
+		"Disables merging of extents")				\
+	BCH_DEBUG_PARAM(btree_gc_always_rewrite,			\
+		"Causes mark and sweep to compact and rewrite every "	\
+		"btree node it traverses")				\
+	BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,			\
+		"Disables rewriting of btree nodes during mark and sweep")\
+	BCH_DEBUG_PARAM(btree_shrinker_disabled,			\
+		"Disables the shrinker callback for the btree node cache")
+
+/* Parameters that should only be compiled in in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG()					\
+	BCH_DEBUG_PARAM(expensive_debug_checks,				\
+		"Enables various runtime debugging checks that "	\
+		"significantly affect performance")			\
+	BCH_DEBUG_PARAM(debug_check_iterators,				\
+		"Enables extra verification for btree iterators")	\
+	BCH_DEBUG_PARAM(debug_check_bkeys,				\
+		"Run bkey_debugcheck (primarily checking GC/allocation "\
+		"information) when iterating over keys")		\
+	BCH_DEBUG_PARAM(verify_btree_ondisk,				\
+		"Reread btree nodes at various points to verify the "	\
+		"mergesort in the read path against modifications "	\
+		"done in memory")					\
+	BCH_DEBUG_PARAM(journal_seq_verify,				\
+		"Store the journal sequence number in the version "	\
+		"number of every btree key, and verify that btree "	\
+		"update ordering is preserved during recovery")		\
+	BCH_DEBUG_PARAM(inject_invalid_keys,				\
+		"Store the journal sequence number in the version "	\
+		"number of every btree key, and verify that btree "	\
+		"update ordering is preserved during recovery")		\
+	BCH_DEBUG_PARAM(test_alloc_startup,				\
+		"Force allocator startup to use the slowpath where it "	\
+		"can't find enough free buckets without invalidating "	\
+		"cached data")						\
+	BCH_DEBUG_PARAM(force_reconstruct_read,				\
+		"Force reads to use the reconstruct path, when reading"	\
+		" from erasure coded extents")				\
+	BCH_DEBUG_PARAM(test_restart_gc,				\
+		"Test restarting mark and sweep gc when bucket gens change")
+
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
+#else
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+#endif
+
+#define BCH_TIME_STATS()			\
+	x(btree_node_mem_alloc)			\
+	x(btree_node_split)			\
+	x(btree_node_sort)			\
+	x(btree_node_read)			\
+	x(btree_gc)				\
+	x(btree_lock_contended_read)		\
+	x(btree_lock_contended_intent)		\
+	x(btree_lock_contended_write)		\
+	x(data_write)				\
+	x(data_read)				\
+	x(data_promote)				\
+	x(journal_write)			\
+	x(journal_delay)			\
+	x(journal_flush_seq)			\
+	x(blocked_journal)			\
+	x(blocked_allocate)			\
+	x(blocked_allocate_open_bucket)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+	BCH_TIME_STATS()
+#undef x
+	BCH_TIME_STAT_NR
+};
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "clock_types.h"
+#include "ec_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "replicas_types.h"
+#include "super_types.h"
+
+/* Number of nodes btree coalesce will try to coalesce at once */
+#define GC_MERGE_NODES		4U
+
+/* Maximum number of nodes we might need to allocate atomically: */
+#define BTREE_RESERVE_MAX	(BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
+
+/* Size of the freelist we allocate btree nodes from: */
+#define BTREE_NODE_RESERVE	BTREE_RESERVE_MAX
+
+#define BTREE_NODE_OPEN_BUCKET_RESERVE	(BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
+struct btree;
+
+enum gc_phase {
+	GC_PHASE_NOT_RUNNING,
+	GC_PHASE_START,
+	GC_PHASE_SB,
+
+	GC_PHASE_BTREE_EC,
+	GC_PHASE_BTREE_EXTENTS,
+	GC_PHASE_BTREE_INODES,
+	GC_PHASE_BTREE_DIRENTS,
+	GC_PHASE_BTREE_XATTRS,
+	GC_PHASE_BTREE_ALLOC,
+	GC_PHASE_BTREE_QUOTAS,
+	GC_PHASE_BTREE_REFLINK,
+
+	GC_PHASE_PENDING_DELETE,
+	GC_PHASE_ALLOC,
+};
+
+struct gc_pos {
+	enum gc_phase		phase;
+	struct bpos		pos;
+	unsigned		level;
+};
+
+struct io_count {
+	u64			sectors[2][BCH_DATA_NR];
+};
+
+struct bch_dev {
+	struct kobject		kobj;
+	struct percpu_ref	ref;
+	struct completion	ref_completion;
+	struct percpu_ref	io_ref;
+	struct completion	io_ref_completion;
+
+	struct bch_fs		*fs;
+
+	u8			dev_idx;
+	/*
+	 * Cached version of this device's member info from superblock
+	 * Committed by bch2_write_super() -> bch_fs_mi_update()
+	 */
+	struct bch_member_cpu	mi;
+	uuid_le			uuid;
+	char			name[BDEVNAME_SIZE];
+
+	struct bch_sb_handle	disk_sb;
+	struct bch_sb		*sb_read_scratch;
+	int			sb_write_error;
+
+	struct bch_devs_mask	self;
+
+	/* biosets used in cloned bios for writing multiple replicas */
+	struct bio_set		replica_set;
+
+	/*
+	 * Buckets:
+	 * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
+	 * gc_lock, for device resize - holding any is sufficient for access:
+	 * Or rcu_read_lock(), but only for ptr_stale():
+	 */
+	struct bucket_array __rcu *buckets[2];
+	unsigned long		*buckets_nouse;
+	struct rw_semaphore	bucket_lock;
+
+	struct bch_dev_usage __percpu *usage[2];
+
+	/* Allocator: */
+	struct task_struct __rcu *alloc_thread;
+
+	/*
+	 * free: Buckets that are ready to be used
+	 *
+	 * free_inc: Incoming buckets - these are buckets that currently have
+	 * cached data in them, and we can't reuse them until after we write
+	 * their new gen to disk. After prio_write() finishes writing the new
+	 * gens/prios, they'll be moved to the free list (and possibly discarded
+	 * in the process)
+	 */
+	alloc_fifo		free[RESERVE_NR];
+	alloc_fifo		free_inc;
+	spinlock_t		freelist_lock;
+
+	u8			open_buckets_partial[OPEN_BUCKETS_COUNT];
+	unsigned		open_buckets_partial_nr;
+
+	size_t			fifo_last_bucket;
+
+	/* last calculated minimum prio */
+	u16			max_last_bucket_io[2];
+
+	size_t			inc_gen_needs_gc;
+	size_t			inc_gen_really_needs_gc;
+
+	/*
+	 * XXX: this should be an enum for allocator state, so as to include
+	 * error state
+	 */
+	enum {
+		ALLOCATOR_STOPPED,
+		ALLOCATOR_RUNNING,
+		ALLOCATOR_BLOCKED,
+		ALLOCATOR_BLOCKED_FULL,
+	}			allocator_state;
+
+	alloc_heap		alloc_heap;
+
+	/* Copying GC: */
+	struct task_struct	*copygc_thread;
+	copygc_heap		copygc_heap;
+	struct bch_pd_controller copygc_pd;
+	struct write_point	copygc_write_point;
+	u64			copygc_threshold;
+
+	atomic64_t		rebalance_work;
+
+	struct journal_device	journal;
+
+	struct work_struct	io_error_work;
+
+	/* The rest of this all shows up in sysfs */
+	atomic64_t		cur_latency[2];
+	struct time_stats	io_latency[2];
+
+#define CONGESTED_MAX		1024
+	atomic_t		congested;
+	u64			congested_last;
+
+	struct io_count __percpu *io_done;
+};
+
+enum {
+	/* startup: */
+	BCH_FS_ALLOC_READ_DONE,
+	BCH_FS_ALLOCATOR_STARTED,
+	BCH_FS_ALLOCATOR_RUNNING,
+	BCH_FS_INITIAL_GC_DONE,
+	BCH_FS_FSCK_DONE,
+	BCH_FS_STARTED,
+	BCH_FS_RW,
+
+	/* shutdown: */
+	BCH_FS_STOPPING,
+	BCH_FS_EMERGENCY_RO,
+	BCH_FS_WRITE_DISABLE_COMPLETE,
+
+	/* errors: */
+	BCH_FS_ERROR,
+	BCH_FS_ERRORS_FIXED,
+
+	/* misc: */
+	BCH_FS_BDEV_MOUNTED,
+	BCH_FS_FIXED_GENS,
+	BCH_FS_ALLOC_WRITTEN,
+	BCH_FS_REBUILD_REPLICAS,
+	BCH_FS_HOLD_BTREE_WRITES,
+};
+
+struct btree_debug {
+	unsigned		id;
+	struct dentry		*btree;
+	struct dentry		*btree_format;
+	struct dentry		*failed;
+};
+
+struct bch_fs_pcpu {
+	u64			sectors_available;
+};
+
+struct journal_seq_blacklist_table {
+	size_t			nr;	/* presumably # of entries[] */
+	struct journal_seq_blacklist_table_entry {
+		u64		start;
+		u64		end;
+		bool		dirty;
+	}			entries[];	/* flexible array member */
+};
+
+struct bch_fs {
+	struct closure		cl;
+
+	struct list_head	list;
+	struct kobject		kobj;
+	struct kobject		internal;
+	struct kobject		opts_dir;
+	struct kobject		time_stats;
+	unsigned long		flags;
+
+	int			minor;
+	struct device		*chardev;
+	struct super_block	*vfs_sb;
+	char			name[40];
+
+	/* ro/rw, add/remove devices: */
+	struct mutex		state_lock;
+
+	/* Counts outstanding writes, for clean transition to read-only */
+	struct percpu_ref	writes;
+	struct work_struct	read_only_work;
+
+	struct bch_dev __rcu	*devs[BCH_SB_MEMBERS_MAX];
+
+	struct bch_replicas_cpu replicas;
+	struct bch_replicas_cpu replicas_gc;
+	struct mutex		replicas_gc_lock;
+
+	struct journal_entry_res replicas_journal_res;
+
+	struct bch_disk_groups_cpu __rcu *disk_groups;
+
+	struct bch_opts		opts;
+
+	/* Updated by bch2_sb_update():*/
+	struct {
+		uuid_le		uuid;
+		uuid_le		user_uuid;
+
+		u16		version;
+		u16		encoded_extent_max;
+
+		u8		nr_devices;
+		u8		clean;
+
+		u8		encryption_type;
+
+		u64		time_base_lo;
+		u32		time_base_hi;
+		u32		time_precision;
+		u64		features;
+		u64		compat;
+	}			sb;
+
+	struct bch_sb_handle	disk_sb;
+
+	unsigned short		block_bits;	/* ilog2(block_size) */
+
+	u16			btree_foreground_merge_threshold;
+
+	struct closure		sb_write;
+	struct mutex		sb_lock;
+
+	/* BTREE CACHE */
+	struct bio_set		btree_bio;
+
+	struct btree_root	btree_roots[BTREE_ID_NR];
+	bool			btree_roots_dirty;
+	struct mutex		btree_root_lock;
+
+	struct btree_cache	btree_cache;
+
+	mempool_t		btree_reserve_pool;
+
+	/*
+	 * Cache of allocated btree nodes - if we allocate a btree node and
+	 * don't use it, if we free it that space can't be reused until going
+	 * _all_ the way through the allocator (which exposes us to a livelock
+	 * when allocating btree reserves fail halfway through) - instead, we
+	 * can stick them here:
+	 */
+	struct btree_alloc	btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+	unsigned		btree_reserve_cache_nr;
+	struct mutex		btree_reserve_cache_lock;
+
+	mempool_t		btree_interior_update_pool;
+	struct list_head	btree_interior_update_list;
+	struct mutex		btree_interior_update_lock;
+	struct closure_waitlist	btree_interior_update_wait;
+
+	mempool_t		btree_iters_pool;
+
+	struct workqueue_struct	*wq;
+	/* copygc needs its own workqueue for index updates.. */
+	struct workqueue_struct	*copygc_wq;
+	struct workqueue_struct	*journal_reclaim_wq;
+
+	/* ALLOCATION */
+	struct delayed_work	pd_controllers_update;
+	unsigned		pd_controllers_update_seconds;
+
+	struct bch_devs_mask	rw_devs[BCH_DATA_NR];
+
+	u64			capacity; /* sectors */
+
+	/*
+	 * When capacity _decreases_ (due to a disk being removed), we
+	 * increment capacity_gen - this invalidates outstanding reservations
+	 * and forces them to be revalidated
+	 */
+	u32			capacity_gen;
+	unsigned		bucket_size_max;
+
+	atomic64_t		sectors_available;
+
+	struct bch_fs_pcpu __percpu	*pcpu;
+
+	struct percpu_rw_semaphore	mark_lock;
+
+	seqcount_t			usage_lock;
+	struct bch_fs_usage		*usage_base;
+	struct bch_fs_usage __percpu	*usage[2];
+	struct bch_fs_usage __percpu	*usage_gc;
+
+	/* single element mempool: */
+	struct mutex		usage_scratch_lock;
+	struct bch_fs_usage	*usage_scratch;
+
+	/*
+	 * When we invalidate buckets, we use both the priority and the amount
+	 * of good data to determine which buckets to reuse first - to weight
+	 * those together consistently we keep track of the smallest nonzero
+	 * priority of any bucket.
+	 */
+	struct bucket_clock	bucket_clock[2];
+
+	struct io_clock		io_clock[2];
+
+	/* JOURNAL SEQ BLACKLIST */
+	struct journal_seq_blacklist_table *
+				journal_seq_blacklist_table;
+	struct work_struct	journal_seq_blacklist_gc_work;
+
+	/* ALLOCATOR */
+	spinlock_t		freelist_lock;
+	struct closure_waitlist	freelist_wait;
+	u64			blocked_allocate;
+	u64			blocked_allocate_open_bucket;
+	u8			open_buckets_freelist;
+	u8			open_buckets_nr_free;
+	struct closure_waitlist	open_buckets_wait;
+	struct open_bucket	open_buckets[OPEN_BUCKETS_COUNT];
+
+	struct write_point	btree_write_point;
+	struct write_point	rebalance_write_point;
+
+	struct write_point	write_points[WRITE_POINT_MAX];
+	struct hlist_head	write_points_hash[WRITE_POINT_HASH_NR];
+	struct mutex		write_points_hash_lock;
+	unsigned		write_points_nr;
+
+	/* GARBAGE COLLECTION */
+	struct task_struct	*gc_thread;
+	atomic_t		kick_gc;
+	unsigned long		gc_count;
+
+	/*
+	 * Tracks GC's progress - everything in the range [ZERO_KEY..gc_pos]
+	 * has been marked by GC.
+	 *
+	 * gc_pos.phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
+	 *
+	 * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
+	 * can read without a lock.
+	 */
+	seqcount_t		gc_pos_lock;
+	struct gc_pos		gc_pos;
+
+	/*
+	 * The allocation code needs gc_mark in struct bucket to be correct, but
+	 * it's not while a gc is in progress.
+	 */
+	struct rw_semaphore	gc_lock;
+
+	/* IO PATH */
+	struct bio_set		bio_read;
+	struct bio_set		bio_read_split;
+	struct bio_set		bio_write;
+	struct mutex		bio_bounce_pages_lock;
+	mempool_t		bio_bounce_pages;
+	struct rhashtable	promote_table;
+
+	mempool_t		compression_bounce[2];
+	mempool_t		compress_workspace[BCH_COMPRESSION_NR];
+	mempool_t		decompress_workspace;
+	ZSTD_parameters		zstd_params;
+
+	struct crypto_shash	*sha256;
+	struct crypto_sync_skcipher *chacha20;
+	struct crypto_shash	*poly1305;
+
+	atomic64_t		key_version;
+
+	/* REBALANCE */
+	struct bch_fs_rebalance	rebalance;
+
+	/* STRIPES: */
+	GENRADIX(struct stripe) stripes[2];
+	struct mutex		ec_stripe_create_lock;
+
+	ec_stripes_heap		ec_stripes_heap;
+	spinlock_t		ec_stripes_heap_lock;
+
+	/* ERASURE CODING */
+	struct list_head	ec_new_stripe_list;
+	struct mutex		ec_new_stripe_lock;
+	u64			ec_stripe_hint;
+
+	struct bio_set		ec_bioset;
+
+	struct work_struct	ec_stripe_delete_work;
+	struct llist_head	ec_stripe_delete_list;
+
+	/* REFLINK */
+	u64			reflink_hint;
+
+	/* VFS IO PATH - fs-io.c */
+	struct bio_set		writepage_bioset;
+	struct bio_set		dio_write_bioset;
+	struct bio_set		dio_read_bioset;
+
+	struct bio_list		btree_write_error_list;
+	struct work_struct	btree_write_error_work;
+	spinlock_t		btree_write_error_lock;
+
+	/* ERRORS */
+	struct list_head	fsck_errors;
+	struct mutex		fsck_error_lock;
+	bool			fsck_alloc_err;
+
+	/* QUOTAS */
+	struct bch_memquota_type quotas[QTYP_NR];
+
+	/* DEBUG JUNK */
+	struct dentry		*debug;
+	struct btree_debug	btree_debug[BTREE_ID_NR];
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct btree		*verify_data;
+	struct btree_node	*verify_ondisk;
+	struct mutex		verify_lock;
+#endif
+
+	u64			unused_inode_hint;
+
+	/*
+	 * A btree node on disk could have too many bsets for an iterator to fit
+	 * on the stack - have to dynamically allocate them
+	 */
+	mempool_t		fill_iter;
+
+	mempool_t		btree_bounce_pool;
+
+	struct journal		journal;
+
+	u64			last_bucket_seq_cleanup;
+
+	/* The rest of this all shows up in sysfs */
+	atomic_long_t		read_realloc_races;
+	atomic_long_t		extent_migrate_done;
+	atomic_long_t		extent_migrate_raced;
+
+	unsigned		btree_gc_periodic:1;
+	unsigned		copy_gc_enabled:1;
+	bool			promote_whole_extents;
+
+#define BCH_DEBUG_PARAM(name, description) bool name;
+	BCH_DEBUG_PARAMS_ALL()
+#undef BCH_DEBUG_PARAM
+
+	struct time_stats	times[BCH_TIME_STAT_NR];
+};
+
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+	if (c->vfs_sb)
+		c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
+static inline unsigned bucket_bytes(const struct bch_dev *ca)
+{
+	return ca->mi.bucket_size << 9;
+}
+
+static inline unsigned block_bytes(const struct bch_fs *c)
+{
+	return c->opts.block_size << 9;
+}
+
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
+{
+	return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+{
+	s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
+
+	if (c->sb.time_precision == 1)	/* dividing by 1 would be a no-op */
+		return ns;
+
+	return div_s64(ns, c->sb.time_precision);
+}
+
+static inline s64 bch2_current_time(struct bch_fs *c)
+{
+	struct timespec64 now;
+
+	ktime_get_coarse_real_ts64(&now);
+	return timespec_to_bch2_time(c, now);
+}
+
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+	return dev < c->sb.nr_devices && c->devs[dev];
+}
+
+#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
new file mode 100644
index 000000000000..d619e5caf09b
--- /dev/null
+++ b/fs/bcachefs/bcachefs_format.h
@@ -0,0 +1,1604 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FORMAT_H
+#define _BCACHEFS_FORMAT_H
+
+/*
+ * bcachefs on disk data structures
+ *
+ * OVERVIEW:
+ *
+ * There are three main types of on disk data structures in bcachefs (this is
+ * reduced from 5 in bcache)
+ *
+ *  - superblock
+ *  - journal
+ *  - btree
+ *
+ * The btree is the primary structure; most metadata exists as keys in the
+ * various btrees. There are only a small number of btrees, they're not
+ * sharded - we have one btree for extents, another for inodes, et cetera.
+ *
+ * SUPERBLOCK:
+ *
+ * The superblock contains the location of the journal, the list of devices in
+ * the filesystem, and in general any metadata we need in order to decide
+ * whether we can start a filesystem, or that we need prior to reading the
+ * journal/btree roots.
+ *
+ * The superblock is extensible, and most of the contents of the superblock are
+ * in variable length, type tagged fields; see struct bch_sb_field.
+ *
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
+ * not have a fixed size. To locate backup superblocks we have struct
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
+ * before the first superblock.
+ *
+ * JOURNAL:
+ *
+ * The journal primarily records btree updates in the order they occurred;
+ * journal replay consists of just iterating over all the keys in the open
+ * journal entries and re-inserting them into the btrees.
+ *
+ * The journal also contains entry types for the btree roots, and blacklisted
+ * journal sequence numbers (see journal_seq_blacklist.c).
+ *
+ * BTREE:
+ *
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
+ * entry in a given node (offset 0), and struct btree_node_entry for all
+ * subsequent writes.
+ *
+ * After the header, btree node entries contain a list of keys in sorted order.
+ * Values are stored inline with the keys; since values are variable length (and
+ * keys effectively are variable length too, due to packing) we can't do random
+ * access without building up additional in memory tables in the btree node read
+ * path.
+ *
+ * BTREE KEYS (struct bkey):
+ *
+ * The various btrees share a common format for the key - so as to avoid
+ * switching in fastpath lookup/comparison code - but define their own
+ * structures for the key values.
+ *
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
+ * size is just under 2k. The common part also contains a type tag for the
+ * value, and a format field indicating whether the key is packed or not (and
+ * also meant to allow adding new key fields in the future, if desired).
+ *
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
+ * be generous with field sizes in the common part of the key format (64 bit
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
+ */
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/uuid.h>
+
+#define LE_BITMASK(_bits, name, type, field, offset, end)		\
+static const unsigned	name##_OFFSET = (offset);			\
+static const unsigned	name##_BITS = ((end) - (offset));		\
+static const __u##_bits	name##_MAX = (1ULL << ((end) - (offset))) - 1;	\
+/* name(k): bits [offset, end) of k->field; needs end - offset < 64 */	\
+static inline __u64 name(const type *k)					\
+{									\
+	return (__le##_bits##_to_cpu(k->field) >> (offset)) &		\
+		~(~0ULL << ((end) - (offset)));				\
+}									\
+/* SET_name(k, v): store the low bits of v there, keep the rest */	\
+static inline void SET_##name(type *k, __u64 v)				\
+{									\
+	__u##_bits new = __le##_bits##_to_cpu(k->field);		\
+									\
+	new &= ~(~(~0ULL << ((end) - (offset))) << (offset));		\
+	new |= (v & ~(~0ULL << ((end) - (offset)))) << (offset);	\
+	k->field = __cpu_to_le##_bits(new);				\
+}
+
+#define LE16_BITMASK(n, t, f, o, e)	LE_BITMASK(16, n, t, f, o, e)
+#define LE32_BITMASK(n, t, f, o, e)	LE_BITMASK(32, n, t, f, o, e)
+#define LE64_BITMASK(n, t, f, o, e)	LE_BITMASK(64, n, t, f, o, e)
+
+struct bkey_format {
+	__u8		key_u64s;
+	__u8		nr_fields;
+	/* One slot per enum bch_bkey_fields entry (BKEY_NR_FIELDS is 6): */
+	__u8		bits_per_field[6];
+	__le64		field_offset[6];
+};
+
+/* Btree keys - all units are in sectors */
+
+struct bpos {
+	/*
+	 * Word order matches machine byte order - btree code treats a bpos as a
+	 * single large integer, for search/comparison purposes
+	 *
+	 * Note that wherever a bpos is embedded in another on disk data
+	 * structure, it has to be byte swabbed when reading in metadata that
+	 * wasn't written in native endian order:
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u32		snapshot;	/* least significant word */
+	__u64		offset;		/* Points to end of extent - sectors */
+	__u64		inode;		/* most significant word */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	__u64		inode;		/* most significant word */
+	__u64		offset;		/* Points to end of extent - sectors */
+	__u32		snapshot;	/* least significant word */
+#else
+#error edit for your odd byteorder.
+#endif
+} __attribute__((packed, aligned(4)));
+
+#define KEY_INODE_MAX			((__u64)~0ULL)
+#define KEY_OFFSET_MAX			((__u64)~0ULL)
+#define KEY_SNAPSHOT_MAX		((__u32)~0U)
+#define KEY_SIZE_MAX			((__u32)~0U)
+
+/* Position for (inode, offset); the unnamed snapshot member is zeroed */
+static inline struct bpos POS(__u64 inode, __u64 offset)
+{
+	struct bpos pos = {
+		.inode		= inode,
+		.offset		= offset,
+	};
+
+	return pos;
+}
+
+#define POS_MIN				POS(0, 0)
+#define POS_MAX				POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+
+/* Empty placeholder struct, for container_of() */
+struct bch_val {
+	__u64		__nothing[0];
+};
+
+struct bversion {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u64		lo;
+	__u32		hi;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	__u32		hi;
+	__u64		lo;
+#endif
+} __attribute__((packed, aligned(4)));
+
+struct bkey {
+	/* Size of combined key and value, in u64s */
+	__u8		u64s;
+
+	/* Format of key (0 for format local to btree node) */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8		format:7,
+			needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u8		needs_whiteout:1,
+			format:7;
+#else
+#error edit for your odd byteorder.
+#endif
+
+	/* Type of the value */
+	__u8		type;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u8		pad[1];
+
+	struct bversion	version;
+	__u32		size;		/* extent size, in sectors */
+	struct bpos	p;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	struct bpos	p;
+	__u32		size;		/* extent size, in sectors */
+	struct bversion	version;
+
+	__u8		pad[1];
+#endif
+} __attribute__((packed, aligned(8)));
+
+struct bkey_packed {
+	__u64		_data[0];
+
+	/* Size of combined key and value, in u64s */
+	__u8		u64s;
+
+	/* Format of key (0 for format local to btree node) */
+
+	/*
+	 * XXX: next incompat on disk format change, switch format and
+	 * needs_whiteout - bkey_packed() will be cheaper if format is the high
+	 * bits of the bitfield
+	 */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8		format:7,
+			needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u8		needs_whiteout:1,
+			format:7;
+#endif
+
+	/* Type of the value */
+	__u8		type;
+	__u8		key_start[0];
+
+	/*
+	 * We copy bkeys with struct assignment in various places, and while
+	 * that shouldn't be done with packed bkeys we can't disallow it in C,
+	 * and it's legal to cast a bkey to a bkey_packed  - so padding it out
+	 * to the same size as struct bkey should hopefully be safest.
+	 */
+	__u8		pad[sizeof(struct bkey) - 3];
+} __attribute__((packed, aligned(8)));
+
+#define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
+#define BKEY_U64s_MAX			U8_MAX
+#define BKEY_VAL_U64s_MAX		(BKEY_U64s_MAX - BKEY_U64s)
+
+#define KEY_PACKED_BITS_START		24
+
+#define KEY_FORMAT_LOCAL_BTREE		0
+#define KEY_FORMAT_CURRENT		1
+
+enum bch_bkey_fields {
+	BKEY_FIELD_INODE,
+	BKEY_FIELD_OFFSET,
+	BKEY_FIELD_SNAPSHOT,
+	BKEY_FIELD_SIZE,
+	BKEY_FIELD_VERSION_HI,
+	BKEY_FIELD_VERSION_LO,
+	BKEY_NR_FIELDS,
+};
+
+#define bkey_format_field(name, field)					\
+	[BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
+
+#define BKEY_FORMAT_CURRENT						\
+((struct bkey_format) {							\
+	.key_u64s	= BKEY_U64s,					\
+	.nr_fields	= BKEY_NR_FIELDS,				\
+	.bits_per_field = {						\
+		bkey_format_field(INODE,	p.inode),		\
+		bkey_format_field(OFFSET,	p.offset),		\
+		bkey_format_field(SNAPSHOT,	p.snapshot),		\
+		bkey_format_field(SIZE,		size),			\
+		bkey_format_field(VERSION_HI,	version.hi),		\
+		bkey_format_field(VERSION_LO,	version.lo),		\
+	},								\
+})
+
+/* bkey with inline value */
+struct bkey_i {
+	__u64			_data[0];
+
+	union {
+	struct {
+		/* Size of combined key and value, in u64s */
+		__u8		u64s;
+	};
+	struct {
+		struct bkey	k;
+		struct bch_val	v;
+	};
+	};
+};
+
+#define KEY(_inode, _offset, _size)					\
+((struct bkey) {							\
+	.u64s		= BKEY_U64s,					\
+	.format		= KEY_FORMAT_CURRENT,				\
+	.p		= POS(_inode, _offset),				\
+	.size		= _size,					\
+})
+
+static inline void bkey_init(struct bkey *k)
+{
+	*k = KEY(0, 0, 0);	/* fields KEY() doesn't name are zeroed */
+}
+
+#define bkey_bytes(_k)		((_k)->u64s * sizeof(__u64))
+
+#define __BKEY_PADDED(key, pad)					\
+	struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+
+/*
+ * - DELETED keys are used internally to mark keys that should be ignored but
+ *   override keys in composition order.  Their version number is ignored.
+ *
+ * - DISCARDED keys indicate that the data is all 0s because it has been
+ *   discarded. DISCARDs may have a version; if the version is nonzero the key
+ *   will be persistent, otherwise the key will be dropped whenever the btree
+ *   node is rewritten (like DELETED keys).
+ *
+ * - ERROR: any read of the data returns a read error, as the data was lost due
+ *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
+ *   by new writes or cluster-wide GC. Node repair can also overwrite them with
+ *   the same or a more recent version number, but not with an older version
+ *   number.
+ *
+ * - WHITEOUT: for hash table btrees
+ */
+#define BCH_BKEY_TYPES()				\
+	x(deleted,		0)			\
+	x(discard,		1)			\
+	x(error,		2)			\
+	x(cookie,		3)			\
+	x(whiteout,		4)			\
+	x(btree_ptr,		5)			\
+	x(extent,		6)			\
+	x(reservation,		7)			\
+	x(inode,		8)			\
+	x(inode_generation,	9)			\
+	x(dirent,		10)			\
+	x(xattr,		11)			\
+	x(alloc,		12)			\
+	x(quota,		13)			\
+	x(stripe,		14)			\
+	x(reflink_p,		15)			\
+	x(reflink_v,		16)
+
+enum bch_bkey_type {
+#define x(name, nr) KEY_TYPE_##name	= nr,
+	BCH_BKEY_TYPES()
+#undef x
+	KEY_TYPE_MAX,
+};
+
+struct bch_cookie {
+	struct bch_val		v;
+	__le64			cookie;
+};
+
+/* Extents */
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_ptr	- 0b1
+ * bch_extent_crc32	- 0b10
+ * bch_extent_crc64	- 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+/* 128 bits, sufficient for cryptographic MACs: */
+struct bch_csum {
+	__le64			lo;
+	__le64			hi;
+} __attribute__((packed, aligned(8)));
+
+enum bch_csum_type {
+	BCH_CSUM_NONE			= 0,
+	BCH_CSUM_CRC32C_NONZERO		= 1,
+	BCH_CSUM_CRC64_NONZERO		= 2,
+	BCH_CSUM_CHACHA20_POLY1305_80	= 3,
+	BCH_CSUM_CHACHA20_POLY1305_128	= 4,
+	BCH_CSUM_CRC32C			= 5,
+	BCH_CSUM_CRC64			= 6,
+	BCH_CSUM_NR			= 7,
+};
+
+static const unsigned bch_crc_bytes[] = {
+	[BCH_CSUM_NONE]				= 0,
+	[BCH_CSUM_CRC32C_NONZERO]		= 4,
+	[BCH_CSUM_CRC32C]			= 4,
+	[BCH_CSUM_CRC64_NONZERO]		= 8,
+	[BCH_CSUM_CRC64]			= 8,
+	[BCH_CSUM_CHACHA20_POLY1305_80]		= 10,	/* 80 bits */
+	[BCH_CSUM_CHACHA20_POLY1305_128]	= 16,	/* 128 bits */
+};
+
+/* True only for the two BCH_CSUM_CHACHA20_POLY1305_* checksum types */
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+	if (type == BCH_CSUM_CHACHA20_POLY1305_80)
+		return true;
+	if (type == BCH_CSUM_CHACHA20_POLY1305_128)
+		return true;
+
+	return false;
+}
+
+enum bch_compression_type {
+	BCH_COMPRESSION_NONE		= 0,
+	BCH_COMPRESSION_LZ4_OLD		= 1,
+	BCH_COMPRESSION_GZIP		= 2,
+	BCH_COMPRESSION_LZ4		= 3,
+	BCH_COMPRESSION_ZSTD		= 4,
+	BCH_COMPRESSION_NR		= 5,
+};
+
+#define BCH_EXTENT_ENTRY_TYPES()		\
+	x(ptr,			0)		\
+	x(crc32,		1)		\
+	x(crc64,		2)		\
+	x(crc128,		3)		\
+	x(stripe_ptr,		4)
+#define BCH_EXTENT_ENTRY_MAX	5	/* number of entry types listed above */
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+	BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32			type:2,
+				_compressed_size:7,
+				_uncompressed_size:7,
+				offset:7,
+				_unused:1,
+				csum_type:4,
+				compression_type:4;
+	__u32			csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u32			csum;
+	__u32			compression_type:4,
+				csum_type:4,
+				_unused:1,
+				offset:7,
+				_uncompressed_size:7,
+				_compressed_size:7,
+				type:2;
+#endif
+} __attribute__((packed, aligned(8)));
+
+#define CRC32_SIZE_MAX		(1U << 7)	/* 7 bit size fields, biased by 1 */
+#define CRC32_NONCE_MAX		0	/* crc32 has no nonce field */
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:3,
+				_compressed_size:9,
+				_uncompressed_size:9,
+				offset:9,
+				nonce:10,
+				csum_type:4,
+				compression_type:4,
+				csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			csum_hi:16,
+				compression_type:4,
+				csum_type:4,
+				nonce:10,
+				offset:9,
+				_uncompressed_size:9,
+				_compressed_size:9,
+				type:3;
+#endif
+	__u64			csum_lo;
+} __attribute__((packed, aligned(8)));
+
+#define CRC64_SIZE_MAX		(1U << 9)	/* 9 bit size fields, biased by 1 */
+#define CRC64_NONCE_MAX		((1U << 10) - 1)	/* 10 bit nonce field */
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:4,
+				_compressed_size:13,
+				_uncompressed_size:13,
+				offset:13,
+				nonce:13,
+				csum_type:4,
+				compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			compression_type:4,
+				csum_type:4,
+				nonce:13,
+				offset:13,
+				_uncompressed_size:13,
+				_compressed_size:13,
+				type:4;
+#endif
+	struct bch_csum		csum;
+} __attribute__((packed, aligned(8)));
+
+#define CRC128_SIZE_MAX		(1U << 13)	/* 13 bit size fields, biased by 1 */
+#define CRC128_NONCE_MAX	((1U << 13) - 1)	/* 13 bit nonce field */
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:1,
+				cached:1,
+				unused:1,
+				reservation:1,
+				offset:44, /* 8 petabytes */
+				dev:8,
+				gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			gen:8,
+				dev:8,
+				offset:44,
+				reservation:1,
+				unused:1,
+				cached:1,
+				type:1;
+#endif
+} __attribute__((packed, aligned(8)));
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:5,
+				block:8,
+				idx:51;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			idx:51,
+				block:8,
+				type:5;
+#endif
+};
+
+struct bch_extent_reservation {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:6,
+				unused:22,
+				replicas:4,
+				generation:32;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			generation:32,
+				replicas:4,
+				unused:22,
+				type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
+	unsigned long			type;
+#elif __BITS_PER_LONG == 32
+	struct {
+		unsigned long		pad;
+		unsigned long		type;
+	};
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f	f;
+	BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+	struct bch_val		v;
+
+	struct bch_extent_ptr	start[0];
+	__u64			_data[0];
+} __attribute__((packed, aligned(8)));
+
+struct bch_extent {
+	struct bch_val		v;
+
+	union bch_extent_entry	start[0];
+	__u64			_data[0];
+} __attribute__((packed, aligned(8)));
+
+struct bch_reservation {
+	struct bch_val		v;
+
+	__le32			generation;
+	__u8			nr_replicas;
+	__u8			pad[3];
+} __attribute__((packed, aligned(8)));
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX				\
+	((sizeof(struct bch_extent_crc128) +			\
+	  sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX				\
+	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+#define BKEY_PADDED(key)	__BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX		(BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX				\
+	((sizeof(struct bch_extent_ptr)) / sizeof(__u64) * BCH_REPLICAS_MAX)
+#define BKEY_BTREE_PTR_U64s_MAX					\
+	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+/* Inodes */
+
+#define BLOCKDEV_INODE_MAX	4096
+
+#define BCACHEFS_ROOT_INO	4096
+
+struct bch_inode {
+	struct bch_val		v;
+
+	__le64			bi_hash_seed;
+	__le32			bi_flags;
+	__le16			bi_mode;
+	__u8			fields[0];
+} __attribute__((packed, aligned(8)));
+
+struct bch_inode_generation {
+	struct bch_val		v;
+
+	__le32			bi_generation;
+	__le32			pad;
+} __attribute__((packed, aligned(8)));
+
+#define BCH_INODE_FIELDS()			\
+	x(bi_atime,			64)	\
+	x(bi_ctime,			64)	\
+	x(bi_mtime,			64)	\
+	x(bi_otime,			64)	\
+	x(bi_size,			64)	\
+	x(bi_sectors,			64)	\
+	x(bi_uid,			32)	\
+	x(bi_gid,			32)	\
+	x(bi_nlink,			32)	\
+	x(bi_generation,		32)	\
+	x(bi_dev,			32)	\
+	x(bi_data_checksum,		8)	\
+	x(bi_compression,		8)	\
+	x(bi_project,			32)	\
+	x(bi_background_compression,	8)	\
+	x(bi_data_replicas,		8)	\
+	x(bi_promote_target,		16)	\
+	x(bi_foreground_target,		16)	\
+	x(bi_background_target,		16)	\
+	x(bi_erasure_code,		16)	\
+	x(bi_fields_set,		16)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS()			\
+	x(data_checksum,		8)	\
+	x(compression,			8)	\
+	x(project,			32)	\
+	x(background_compression,	8)	\
+	x(data_replicas,		8)	\
+	x(promote_target,		16)	\
+	x(foreground_target,		16)	\
+	x(background_target,		16)	\
+	x(erasure_code,			16)
+
+enum inode_opt_id {
+#define x(name, ...)				\
+	Inode_opt_##name,
+	BCH_INODE_OPTS()
+#undef  x
+	Inode_opt_nr,
+};
+
+enum {
+	/*
+	 * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
+	 * flags)
+	 */
+	__BCH_INODE_SYNC	= 0,
+	__BCH_INODE_IMMUTABLE	= 1,
+	__BCH_INODE_APPEND	= 2,
+	__BCH_INODE_NODUMP	= 3,
+	__BCH_INODE_NOATIME	= 4,
+
+	__BCH_INODE_I_SIZE_DIRTY= 5,
+	__BCH_INODE_I_SECTORS_DIRTY= 6,
+	__BCH_INODE_UNLINKED	= 7,
+
+	/* bits 20+ reserved for packed fields below: */
+};
+
+#define BCH_INODE_SYNC		(1 << __BCH_INODE_SYNC)
+#define BCH_INODE_IMMUTABLE	(1 << __BCH_INODE_IMMUTABLE)
+#define BCH_INODE_APPEND	(1 << __BCH_INODE_APPEND)
+#define BCH_INODE_NODUMP	(1 << __BCH_INODE_NODUMP)
+#define BCH_INODE_NOATIME	(1 << __BCH_INODE_NOATIME)
+#define BCH_INODE_I_SIZE_DIRTY	(1 << __BCH_INODE_I_SIZE_DIRTY)
+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
+#define BCH_INODE_UNLINKED	(1 << __BCH_INODE_UNLINKED)
+
+LE32_BITMASK(INODE_STR_HASH,	struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS,	struct bch_inode, bi_flags, 24, 32);
+
+/* Dirents */
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+	struct bch_val		v;
+
+	/* Target inode number: */
+	__le64			d_inum;
+
+	/*
+	 * Copy of mode bits 12-15 from the target inode - so userspace can get
+	 * the filetype without having to do a stat()
+	 */
+	__u8			d_type;
+
+	__u8			d_name[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_NAME_MAX	(U8_MAX * sizeof(__u64) -			\
+			 sizeof(struct bkey) -				\
+			 offsetof(struct bch_dirent, d_name))
+
+
+/* Xattrs */
+
+#define KEY_TYPE_XATTR_INDEX_USER			0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS	1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT	2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED			3
+#define KEY_TYPE_XATTR_INDEX_SECURITY	        4
+
+struct bch_xattr {
+	struct bch_val		v;
+	__u8			x_type;
+	__u8			x_name_len;
+	__le16			x_val_len;
+	__u8			x_name[];
+} __attribute__((packed, aligned(8)));
+
+/* Bucket/allocation information: */
+
+struct bch_alloc {
+	struct bch_val		v;
+	__u8			fields;
+	__u8			gen;
+	__u8			data[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_FIELDS()			\
+	x(read_time,		16)		\
+	x(write_time,		16)		\
+	x(data_type,		8)		\
+	x(dirty_sectors,	16)		\
+	x(cached_sectors,	16)		\
+	x(oldest_gen,		8)
+
+enum {
+#define x(name, bits) BCH_ALLOC_FIELD_##name,
+	BCH_ALLOC_FIELDS()
+#undef x
+	BCH_ALLOC_FIELD_NR	/* number of fields in BCH_ALLOC_FIELDS() */
+};
+
+static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
+	BCH_ALLOC_FIELDS()
+#undef x
+};
+
+#define x(name, bits) + (bits / 8)	/* appends each field's size in bytes */
+static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
+	DIV_ROUND_UP(offsetof(struct bch_alloc, data)
+		     BCH_ALLOC_FIELDS(), sizeof(__u64));
+#undef x
+
+#define BKEY_ALLOC_U64s_MAX	(BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
+
+/* Quotas: */
+
+enum quota_types {
+	QTYP_USR		= 0,
+	QTYP_GRP		= 1,
+	QTYP_PRJ		= 2,
+	QTYP_NR			= 3,
+};
+
+enum quota_counters {
+	Q_SPC			= 0,
+	Q_INO			= 1,
+	Q_COUNTERS		= 2,
+};
+
+struct bch_quota_counter {
+	__le64			hardlimit;
+	__le64			softlimit;
+};
+
+struct bch_quota {
+	struct bch_val		v;
+	struct bch_quota_counter c[Q_COUNTERS];
+} __attribute__((packed, aligned(8)));
+
+/* Erasure coding */
+
+struct bch_stripe {
+	struct bch_val		v;
+	__le16			sectors;
+	__u8			algorithm;
+	__u8			nr_blocks;
+	__u8			nr_redundant;
+
+	__u8			csum_granularity_bits;
+	__u8			csum_type;
+	__u8			pad;
+
+	struct bch_extent_ptr	ptrs[0];
+} __attribute__((packed, aligned(8)));
+
+/* Reflink: */
+
+struct bch_reflink_p {
+	struct bch_val		v;
+	__le64			idx;
+
+	__le32			reservation_generation;
+	__u8			nr_replicas;
+	__u8			pad[3];
+};
+
+struct bch_reflink_v {
+	struct bch_val		v;
+	__le64			refcount;
+	union bch_extent_entry	start[0];
+	__u64			_data[0];
+};
+
+/* Optional/variable size superblock sections: */
+
+struct bch_sb_field {
+	__u64			_data[0];
+	__le32			u64s;
+	__le32			type;
+};
+
+#define BCH_SB_FIELDS()		\
+	x(journal,	0)	\
+	x(members,	1)	\
+	x(crypt,	2)	\
+	x(replicas_v0,	3)	\
+	x(quota,	4)	\
+	x(disk_groups,	5)	\
+	x(clean,	6)	\
+	x(replicas,	7)	\
+	x(journal_seq_blacklist, 8)
+
+enum bch_sb_field_type {
+#define x(f, nr)	BCH_SB_FIELD_##f = nr,
+	BCH_SB_FIELDS()
+#undef x
+	BCH_SB_FIELD_NR
+};
+
+/* BCH_SB_FIELD_journal: */
+
+struct bch_sb_field_journal {
+	struct bch_sb_field	field;
+	__le64			buckets[0];
+};
+
+/* BCH_SB_FIELD_members: */
+
+#define BCH_MIN_NR_NBUCKETS	(1 << 6)
+
+struct bch_member {
+	uuid_le			uuid;
+	__le64			nbuckets;	/* device size */
+	__le16			first_bucket;   /* index of first bucket used */
+	__le16			bucket_size;	/* sectors */
+	__le32			pad;
+	__le64			last_mount;	/* time_t */
+
+	__le64			flags[2];
+};
+
+LE64_BITMASK(BCH_MEMBER_STATE,		struct bch_member, flags[0],  0,  4)
+/* 4-10 unused, was TIER, HAS_(META)DATA */
+LE64_BITMASK(BCH_MEMBER_REPLACEMENT,	struct bch_member, flags[0], 10, 14)
+LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags[0], 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags[0], 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP,		struct bch_member, flags[0], 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY,	struct bch_member, flags[0], 28, 30)
+
+#define BCH_TIER_MAX			4U
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0,  20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+enum bch_member_state {
+	BCH_MEMBER_STATE_RW		= 0,
+	BCH_MEMBER_STATE_RO		= 1,
+	BCH_MEMBER_STATE_FAILED		= 2,
+	BCH_MEMBER_STATE_SPARE		= 3,
+	BCH_MEMBER_STATE_NR		= 4,
+};
+
+enum cache_replacement {
+	CACHE_REPLACEMENT_LRU		= 0,
+	CACHE_REPLACEMENT_FIFO		= 1,
+	CACHE_REPLACEMENT_RANDOM	= 2,
+	CACHE_REPLACEMENT_NR		= 3,
+};
+
+struct bch_sb_field_members {
+	struct bch_sb_field	field;
+	struct bch_member	members[0];
+};
+
+/* BCH_SB_FIELD_crypt: */
+
+struct nonce {
+	__le32			d[4];
+};
+
+struct bch_key {
+	__le64			key[4];
+};
+
+#define BCH_KEY_MAGIC					\
+	(((__u64) 'b' <<  0)|((__u64) 'c' <<  8)|	\
+	 ((__u64) 'h' << 16)|((__u64) '*' << 24)|	\
+	 ((__u64) '*' << 32)|((__u64) 'k' << 40)|	\
+	 ((__u64) 'e' << 48)|((__u64) 'y' << 56))
+
+struct bch_encrypted_key {
+	__le64			magic;
+	struct bch_key		key;
+};
+
+/*
+ * If this field is present in the superblock, it stores an encryption key which
+ * is used to encrypt all other data/metadata. The key will normally be
+ * encrypted with the key userspace provides, but if encryption has been turned
+ * off we'll just store the master key unencrypted in the superblock so we can
+ * access the previously encrypted data.
+ */
+struct bch_sb_field_crypt {
+	struct bch_sb_field	field;
+
+	__le64			flags;
+	__le64			kdf_flags;
+	struct bch_encrypted_key key;
+};
+
+LE64_BITMASK(BCH_CRYPT_KDF_TYPE,	struct bch_sb_field_crypt, flags, 0, 4);
+
+enum bch_kdf_types {
+	BCH_KDF_SCRYPT		= 0,
+	BCH_KDF_NR		= 1,
+};
+
+/* stored as base 2 log of scrypt params: */
+LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
+LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
+LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);
+
+/* BCH_SB_FIELD_replicas: */
+
+enum bch_data_type {
+	BCH_DATA_NONE		= 0,
+	BCH_DATA_SB		= 1,
+	BCH_DATA_JOURNAL	= 2,
+	BCH_DATA_BTREE		= 3,
+	BCH_DATA_USER		= 4,
+	BCH_DATA_CACHED		= 5,
+	BCH_DATA_NR		= 6,
+};
+
+struct bch_replicas_entry_v0 {
+	__u8			data_type;
+	__u8			nr_devs;
+	__u8			devs[0];
+} __attribute__((packed));
+
+struct bch_sb_field_replicas_v0 {
+	struct bch_sb_field	field;
+	struct bch_replicas_entry_v0 entries[0];
+} __attribute__((packed, aligned(8)));
+
+struct bch_replicas_entry {
+	__u8			data_type;
+	__u8			nr_devs;
+	__u8			nr_required;
+	__u8			devs[0];
+} __attribute__((packed));
+
+struct bch_sb_field_replicas {
+	struct bch_sb_field	field;
+	struct bch_replicas_entry entries[0];
+} __attribute__((packed, aligned(8)));
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+	__le32				timelimit;
+	__le32				warnlimit;
+};
+
+struct bch_sb_quota_type {
+	__le64				flags;
+	struct bch_sb_quota_counter	c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+	struct bch_sb_field		field;
+	struct bch_sb_quota_type	q[QTYP_NR];
+} __attribute__((packed, aligned(8)));
+
+/* BCH_SB_FIELD_disk_groups: */
+
+#define BCH_SB_LABEL_SIZE		32
+
+struct bch_disk_group {
+	__u8			label[BCH_SB_LABEL_SIZE];
+	__le64			flags[2];
+} __attribute__((packed, aligned(8)));
+
+LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0,  1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1,  6)
+LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+	struct bch_sb_field	field;
+	struct bch_disk_group	entries[0];
+} __attribute__((packed, aligned(8)));
+
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+	__le16			u64s;
+	__u8			btree_id;
+	__u8			level;
+	__u8			type; /* designates what this jset holds */
+	__u8			pad[3];
+
+	union {
+		struct bkey_i	start[0];
+		__u64		_data[0];
+	};
+};
+
+struct bch_sb_field_clean {
+	struct bch_sb_field	field;
+
+	__le32			flags;
+	__le16			read_clock;
+	__le16			write_clock;
+	__le64			journal_seq;
+
+	union {
+		struct jset_entry start[0];
+		__u64		_data[0];
+	};
+};
+
+struct journal_seq_blacklist_entry {
+	__le64			start;
+	__le64			end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+	struct bch_sb_field	field;
+
+	union {
+		struct journal_seq_blacklist_entry start[0];
+		__u64		_data[0];
+	};
+};
+
+/* Superblock: */
+
+/*
+ * New versioning scheme:
+ * One common version number for all on disk data structures - superblock, btree
+ * nodes, journal entries
+ */
+#define BCH_JSET_VERSION_OLD			2
+#define BCH_BSET_VERSION_OLD			3
+
+enum bcachefs_metadata_version {
+	bcachefs_metadata_version_min			= 9,
+	bcachefs_metadata_version_new_versioning	= 10,
+	bcachefs_metadata_version_bkey_renumber		= 10,
+	bcachefs_metadata_version_max			= 11,
+};
+
+#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)
+
+#define BCH_SB_SECTOR			8
+#define BCH_SB_MEMBERS_MAX		64 /* XXX kill */
+
+struct bch_sb_layout {
+	uuid_le			magic;	/* bcachefs superblock UUID */
+	__u8			layout_type;
+	__u8			sb_max_size_bits; /* base 2 log of 512 byte sectors */
+	__u8			nr_superblocks;
+	__u8			pad[5];
+	__le64			sb_offset[61];	/* struct totals 512 bytes */
+} __attribute__((packed, aligned(8)));
+
+#define BCH_SB_LAYOUT_SECTOR	7
+
+/*
+ * On disk superblock:
+ * @offset	- sector where this sb was written
+ * @version	- on disk format version
+ * @version_min	- Oldest metadata version this filesystem contains; so we can
+ *		  safely drop compatibility code and refuse to mount filesystems
+ *		  we'd need it for
+ * @magic	- identifies as a bcachefs superblock (BCACHE_MAGIC)
+ * @uuid	- used for generating various magic numbers and identifying
+ *		  member devices, never changes
+ * @user_uuid	- user visible UUID, may be changed
+ * @label	- filesystem label
+ * @seq		- identifies most recent superblock, incremented each time
+ *		  superblock is written
+ * @features	- enabled incompatible features
+ */
+struct bch_sb {
+	struct bch_csum		csum;
+	__le16			version;
+	__le16			version_min;
+	__le16			pad[2];
+	uuid_le			magic;
+	uuid_le			uuid;
+	uuid_le			user_uuid;
+	__u8			label[BCH_SB_LABEL_SIZE];
+	__le64			offset;
+	__le64			seq;
+
+	__le16			block_size;
+	__u8			dev_idx;
+	__u8			nr_devices;
+	__le32			u64s;
+
+	__le64			time_base_lo;
+	__le32			time_base_hi;
+	__le32			time_precision;
+
+	__le64			flags[8];
+	__le64			features[2];
+	__le64			compat[2];
+
+	struct bch_sb_layout	layout;
+
+	union {
+		struct bch_sb_field start[0];
+		__le64		_data[0];
+	};
+} __attribute__((packed, aligned(8)));
+
+/*
+ * Flags:
+ * BCH_SB_INITIALIZED	- set on first mount
+ * BCH_SB_CLEAN		- did we shut down cleanly? Just a hint, doesn't affect
+ *			  behaviour of mount/recovery path:
+ * BCH_SB_INODE_32BIT	- limit inode numbers to 32 bits
+ * BCH_SB_128_BIT_MACS	- 128 bit macs instead of 80
+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
+ *			   DATA/META_CSUM_TYPE. Also indicates encryption
+ *			   algorithm in use, if/when we get more than one
+ */
+
+LE16_BITMASK(BCH_SB_BLOCK_SIZE,		struct bch_sb, block_size, 0, 16);
+
+LE64_BITMASK(BCH_SB_INITIALIZED,	struct bch_sb, flags[0],  0,  1);
+LE64_BITMASK(BCH_SB_CLEAN,		struct bch_sb, flags[0],  1,  2);
+LE64_BITMASK(BCH_SB_CSUM_TYPE,		struct bch_sb, flags[0],  2,  8);
+LE64_BITMASK(BCH_SB_ERROR_ACTION,	struct bch_sb, flags[0],  8, 12);
+
+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,	struct bch_sb, flags[0], 12, 28);
+
+LE64_BITMASK(BCH_SB_GC_RESERVE,		struct bch_sb, flags[0], 28, 33);
+LE64_BITMASK(BCH_SB_ROOT_RESERVE,	struct bch_sb, flags[0], 33, 40);
+
+LE64_BITMASK(BCH_SB_META_CSUM_TYPE,	struct bch_sb, flags[0], 40, 44);
+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,	struct bch_sb, flags[0], 44, 48);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,	struct bch_sb, flags[0], 48, 52);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,	struct bch_sb, flags[0], 52, 56);
+
+LE64_BITMASK(BCH_SB_POSIX_ACL,		struct bch_sb, flags[0], 56, 57);
+LE64_BITMASK(BCH_SB_USRQUOTA,		struct bch_sb, flags[0], 57, 58);
+LE64_BITMASK(BCH_SB_GRPQUOTA,		struct bch_sb, flags[0], 58, 59);
+LE64_BITMASK(BCH_SB_PRJQUOTA,		struct bch_sb, flags[0], 59, 60);
+
+LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
+
+/* 61-64 unused */
+
+LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE,	struct bch_sb, flags[1],  4,  8);
+LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);
+
+LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);
+
+/*
+ * Max size of an extent that may require bouncing to read or write
+ * (checksummed, compressed): 64k
+ */
+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
+					struct bch_sb, flags[1], 14, 20);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);
+
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET,	struct bch_sb, flags[1], 28, 40);
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,	struct bch_sb, flags[1], 40, 52);
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,	struct bch_sb, flags[1], 52, 64);
+
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
+					struct bch_sb, flags[2],  0,  4);
+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);
+
+LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
+
+/* Features: */
+enum bch_sb_features {
+	BCH_FEATURE_LZ4			= 0,
+	BCH_FEATURE_GZIP		= 1,
+	BCH_FEATURE_ZSTD		= 2,
+	BCH_FEATURE_ATOMIC_NLINK	= 3, /* should have gone under compat */
+	BCH_FEATURE_EC			= 4,
+	BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
+	BCH_FEATURE_REFLINK		= 6,
+	BCH_FEATURE_NEW_SIPHASH		= 7,
+	BCH_FEATURE_NR,
+};
+
+enum bch_sb_compat {
+	BCH_COMPAT_FEAT_ALLOC_INFO	= 0,
+	BCH_COMPAT_FEAT_ALLOC_METADATA	= 1,
+};
+
+/* options: */
+
+#define BCH_REPLICAS_MAX		4U
+
+enum bch_error_actions {
+	BCH_ON_ERROR_CONTINUE		= 0,
+	BCH_ON_ERROR_RO			= 1,
+	BCH_ON_ERROR_PANIC		= 2,
+	BCH_NR_ERROR_ACTIONS		= 3,
+};
+
+enum bch_csum_opts {
+	BCH_CSUM_OPT_NONE		= 0,
+	BCH_CSUM_OPT_CRC32C		= 1,
+	BCH_CSUM_OPT_CRC64		= 2,
+	BCH_CSUM_OPT_NR			= 3,
+};
+
+enum bch_str_hash_type {
+	BCH_STR_HASH_CRC32C		= 0,
+	BCH_STR_HASH_CRC64		= 1,
+	BCH_STR_HASH_SIPHASH_OLD	= 2,
+	BCH_STR_HASH_SIPHASH		= 3,
+	BCH_STR_HASH_NR			= 4,
+};
+
+enum bch_str_hash_opts {
+	BCH_STR_HASH_OPT_CRC32C		= 0,
+	BCH_STR_HASH_OPT_CRC64		= 1,
+	BCH_STR_HASH_OPT_SIPHASH	= 2,
+	BCH_STR_HASH_OPT_NR		= 3,
+};
+
+#define BCH_COMPRESSION_TYPES()		\
+	x(NONE)				\
+	x(LZ4)				\
+	x(GZIP)				\
+	x(ZSTD)
+
+enum bch_compression_opts {
+#define x(t) BCH_COMPRESSION_OPT_##t,
+	BCH_COMPRESSION_TYPES()
+#undef x
+	BCH_COMPRESSION_OPT_NR
+};
+
+/*
+ * Magic numbers
+ *
+ * The various other data structures have their own magic numbers, which are
+ * xored with the first part of the cache set's UUID
+ */
+
+#define BCACHE_MAGIC							\
+	UUID_LE(0xf67385c6, 0x1a4e, 0xca45,				\
+		0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+
+#define BCACHEFS_STATFS_MAGIC		0xca451a4e
+
+#define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
+#define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)
+
+static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
+{
+	__le64 ret;
+	memcpy(&ret, &sb->uuid, sizeof(ret));
+	return ret;
+}
+
+static inline __u64 __jset_magic(struct bch_sb *sb)
+{
+	return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
+}
+
+static inline __u64 __bset_magic(struct bch_sb *sb)
+{
+	return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
+}
+
+/* Journal */
+
+#define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))
+
+#define BCH_JSET_ENTRY_TYPES()			\
+	x(btree_keys,		0)		\
+	x(btree_root,		1)		\
+	x(prio_ptrs,		2)		\
+	x(blacklist,		3)		\
+	x(blacklist_v2,		4)		\
+	x(usage,		5)		\
+	x(data_usage,		6)
+
+enum {
+#define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
+	BCH_JSET_ENTRY_TYPES()
+#undef x
+	BCH_JSET_ENTRY_NR
+};
+
+/*
+ * Journal sequence numbers can be blacklisted: bsets record the max sequence
+ * number of all the journal entries they contain updates for, so that on
+ * recovery we can ignore those bsets that contain index updates newer than what
+ * made it into the journal.
+ *
+ * This means that we can't reuse that journal_seq - we have to skip it, and
+ * then record that we skipped it so that the next time we crash and recover we
+ * don't think there was a missing journal entry.
+ */
+struct jset_entry_blacklist {
+	struct jset_entry	entry;
+	__le64			seq;
+};
+
+struct jset_entry_blacklist_v2 {
+	struct jset_entry	entry;
+	__le64			start;
+	__le64			end;
+};
+
+enum {
+	FS_USAGE_RESERVED		= 0,
+	FS_USAGE_INODES			= 1,
+	FS_USAGE_KEY_VERSION		= 2,
+	FS_USAGE_NR			= 3
+};
+
+struct jset_entry_usage {
+	struct jset_entry	entry;
+	__le64			v;
+} __attribute__((packed));
+
+struct jset_entry_data_usage {
+	struct jset_entry	entry;
+	__le64			v;
+	struct bch_replicas_entry r;
+} __attribute__((packed));
+
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+struct jset {
+	struct bch_csum		csum;
+
+	__le64			magic;
+	__le64			seq;
+	__le32			version;
+	__le32			flags;	/* JSET_CSUM_TYPE, JSET_BIG_ENDIAN */
+
+	__le32			u64s; /* size of _data[] in u64s */
+
+	__u8			encrypted_start[0];
+
+	__le16			read_clock;
+	__le16			write_clock;
+
+	/* Sequence number of oldest dirty journal entry */
+	__le64			last_seq;
+
+	/* Payload, addressable as jset_entry records or as raw u64s: */
+	union {
+		struct jset_entry start[0];
+		__u64		_data[0];
+	};
+} __attribute__((packed, aligned(8)));
+
+LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
+LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
+
+#define BCH_JOURNAL_BUCKETS_MIN		8
+
+/* Btree: */
+
+#define BCH_BTREE_IDS()					\
+	x(EXTENTS,	0, "extents")			\
+	x(INODES,	1, "inodes")			\
+	x(DIRENTS,	2, "dirents")			\
+	x(XATTRS,	3, "xattrs")			\
+	x(ALLOC,	4, "alloc")			\
+	x(QUOTAS,	5, "quotas")			\
+	x(EC,		6, "stripes")			\
+	x(REFLINK,	7, "reflink")
+
+enum btree_id {
+#define x(kwd, val, name) BTREE_ID_##kwd = val,
+	BCH_BTREE_IDS()
+#undef x
+	BTREE_ID_NR
+};
+
+#define BTREE_MAX_DEPTH		4U
+
+/* Btree nodes */
+
+/*
+ * Btree nodes
+ *
+ * On disk a btree node is a list/log of these; within each set the keys are
+ * sorted
+ */
+struct bset {
+	__le64			seq;
+
+	/*
+	 * Highest journal entry this bset contains keys for.
+	 * If on recovery we don't see that journal entry, this bset is ignored:
+	 * this allows us to preserve the order of all index updates after a
+	 * crash, since the journal records a total order of all index updates
+	 * and anything that didn't make it to the journal doesn't get used.
+	 */
+	__le64			journal_seq;
+
+	__le32			flags;	/* BSET_* bitmask fields */
+	__le16			version;
+	__le16			u64s; /* count of _data[] in u64s */
+
+	union {
+		struct bkey_packed start[0];
+		__u64		_data[0];
+	};
+} __attribute__((packed, aligned(8)));
+
+LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);
+
+LE32_BITMASK(BSET_BIG_ENDIAN,	struct bset, flags, 4, 5);
+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
+				struct bset, flags, 5, 6);
+
+struct btree_node {
+	struct bch_csum		csum;
+	__le64			magic;
+
+	/* this flags field is encrypted, unlike bset->flags: */
+	__le64			flags;
+
+	/* Closed interval: */
+	struct bpos		min_key;
+	struct bpos		max_key;
+	struct bch_extent_ptr	ptr;
+	struct bkey_format	format;
+
+	union {
+	struct bset		keys;
+	struct {
+		__u8		pad[22];	/* keys.seq .. keys.version */
+		__le16		u64s;		/* overlays keys.u64s */
+		__u64		_data[0];	/* overlays keys._data */
+
+	};
+	};
+} __attribute__((packed, aligned(8)));
+
+LE64_BITMASK(BTREE_NODE_ID,	struct btree_node, flags,  0,  4);
+LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
+/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);
+
+struct btree_node_entry {
+	struct bch_csum		csum;
+
+	union {
+	struct bset		keys;
+	struct {
+		__u8		pad[22];	/* keys.seq .. keys.version */
+		__le16		u64s;		/* overlays keys.u64s */
+		__u64		_data[0];	/* overlays keys._data */
+
+	};
+	};
+} __attribute__((packed, aligned(8)));
+
+#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
new file mode 100644
index 000000000000..d668ede5491a
--- /dev/null
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
+
+#include <linux/uuid.h>
+#include <asm/ioctl.h>
+#include "bcachefs_format.h"
+
+/*
+ * Flags common to multiple ioctls:
+ */
+#define BCH_FORCE_IF_DATA_LOST		(1 << 0)
+#define BCH_FORCE_IF_METADATA_LOST	(1 << 1)
+#define BCH_FORCE_IF_DATA_DEGRADED	(1 << 2)
+#define BCH_FORCE_IF_METADATA_DEGRADED	(1 << 3)
+
+#define BCH_FORCE_IF_DEGRADED			\
+	(BCH_FORCE_IF_DATA_DEGRADED|		\
+	 BCH_FORCE_IF_METADATA_DEGRADED)
+
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
+#define BCH_BY_INDEX			(1 << 4)
+
+/*
+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
+ * wide superblock:
+ */
+#define BCH_READ_DEV			(1 << 5)
+
+/* global control dev: */
+
+/* These are currently broken, and probably unnecessary: */
+#if 0
+#define BCH_IOCTL_ASSEMBLE	_IOW(0xbc, 1, struct bch_ioctl_assemble)
+#define BCH_IOCTL_INCREMENTAL	_IOW(0xbc, 2, struct bch_ioctl_incremental)
+
+struct bch_ioctl_assemble {
+	__u32			flags;
+	__u32			nr_devs;
+	__u64			pad;
+	__u64			devs[];
+};
+
+struct bch_ioctl_incremental {
+	__u32			flags;
+	__u64			pad;
+	__u64			dev;
+};
+#endif
+
+/* filesystem ioctls: */
+
+#define BCH_IOCTL_QUERY_UUID	_IOR(0xbc,	1,  struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
+#define BCH_IOCTL_START		_IOW(0xbc,	2,  struct bch_ioctl_start)
+#define BCH_IOCTL_STOP		_IO(0xbc,	3)
+#endif
+
+#define BCH_IOCTL_DISK_ADD	_IOW(0xbc,	4,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE	_IOW(0xbc,	5,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE	_IOW(0xbc,	6,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE	_IOW(0xbc,	7,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,	8,  struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DATA		_IOW(0xbc,	10, struct bch_ioctl_data)
+#define BCH_IOCTL_USAGE		_IOWR(0xbc,	11, struct bch_ioctl_usage)
+#define BCH_IOCTL_READ_SUPER	_IOW(0xbc,	12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX	_IOW(0xbc,	13,  struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE	_IOW(0xbc,	14,  struct bch_ioctl_disk_resize)
+
+/* ioctls below act on a particular file, not the filesystem as a whole: */
+
+#define BCHFS_IOC_REINHERIT_ATTRS	_IOR(0xbc, 64, const char __user *)
+
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
+struct bch_ioctl_query_uuid {
+	uuid_le			uuid;
+};
+
+#if 0
+struct bch_ioctl_start {
+	__u32			flags;
+	__u32			pad;
+};
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either online or offline.
+ *
+ * Will fail if removing @dev would leave us with insufficient read write
+ * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
+ * flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * all existing data on @dev will be available once the device is online,
+ * exactly as if @dev was present when the filesystem was first mounted
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+
+struct bch_ioctl_disk {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state		- one of the bch_member_state states (rw, ro, failed,
+ *			  spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+	__u32			flags;
+	__u8			new_state;
+	__u8			pad[3];
+	__u64			dev;
+};
+
+enum bch_data_ops {
+	BCH_DATA_OP_SCRUB	= 0,
+	BCH_DATA_OP_REREPLICATE	= 1,
+	BCH_DATA_OP_MIGRATE	= 2,
+	BCH_DATA_OP_NR		= 3,
+};
+
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
+struct bch_ioctl_data {
+	__u32			op;
+	__u32			flags;
+
+	struct bpos		start;
+	struct bpos		end;
+
+	union {
+	struct {
+		__u32		dev;
+		__u32		pad;
+	}			migrate;
+	struct {
+		__u64		pad[8];
+	};
+	};
+} __attribute__((packed, aligned(8)));
+
+enum bch_data_event {
+	BCH_DATA_EVENT_PROGRESS	= 0,
+	/* XXX: add an event for reporting errors */
+	BCH_DATA_EVENT_NR	= 1,
+};
+
+struct bch_ioctl_data_progress {
+	__u8			data_type;
+	__u8			btree_id;
+	__u8			pad[2];
+	struct bpos		pos;
+
+	__u64			sectors_done;
+	__u64			sectors_total;
+} __attribute__((packed, aligned(8)));
+
+struct bch_ioctl_data_event {
+	__u8			type;
+	__u8			pad[7];
+	union {
+	struct bch_ioctl_data_progress p;
+	__u64			pad2[15];
+	};
+} __attribute__((packed, aligned(8)));
+
+struct bch_ioctl_dev_usage {
+	__u8			state;
+	__u8			alive;
+	__u8			pad[6];
+	__u32			dev;
+
+	__u32			bucket_size;
+	__u64			nr_buckets;
+
+	__u64			buckets[BCH_DATA_NR];
+	__u64			sectors[BCH_DATA_NR];
+};
+
+struct bch_ioctl_fs_usage {
+	__u64			capacity;
+	__u64			used;
+	__u64			online_reserved;
+	__u64			persistent_reserved[BCH_REPLICAS_MAX];
+	__u64			sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
+};
+
+/*
+ * BCH_IOCTL_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @nr_devices	- number of devices userspace allocated space for in @devs
+ *
+ * On success, @fs and @devs will be filled out appropriately and devs[i].alive
+ * will indicate if a device was present in that slot
+ *
+ * Returns -ERANGE if @nr_devices was too small
+ */
+struct bch_ioctl_usage {
+	__u16			nr_devices;
+	__u16			pad[3];
+
+	struct bch_ioctl_fs_usage fs;
+	struct bch_ioctl_dev_usage devs[0];
+};
+
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb		- buffer to read into
+ * @size	- size of userspace allocated buffer
+ * @dev		- device to read superblock for, if BCH_READ_DEV flag is
+ *		  specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
+struct bch_ioctl_read_super {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+	__u64			size;
+	__u64			sb;
+};
+
+/*
+ * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query filesystem to
+ * determine if disk is an (online) member - if so, returns device's index
+ *
+ * Returns -ENOENT if not found
+ */
+struct bch_ioctl_disk_get_idx {
+	__u64			dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev		- member to resize
+ * @nbuckets	- new number of buckets
+ */
+struct bch_ioctl_disk_resize {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+	__u64			nbuckets;
+};
+
+#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
new file mode 100644
index 000000000000..ed7ca5b0636d
--- /dev/null
+++ b/fs/bcachefs/bkey.c
@@ -0,0 +1,1160 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "bset.h"
+#include "util.h"
+
+#undef EBUG_ON
+
+#ifdef DEBUG_BKEYS
+#define EBUG_ON(cond)		BUG_ON(cond)
+#else
+#define EBUG_ON(cond)
+#endif
+
+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
+			      const struct bkey_packed *);
+
+/*
+ * Write @nr_bits bits at @p (first bit is high_bit_offset below the top of *p)
+ * to @out as '0'/'1' with a space between groups of 8; 0 bits writes just "".
+ */
+void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits)
+{
+	unsigned bit = high_bit_offset, done = 0;
+
+	while (done < nr_bits) {
+		if (bit >= 64) {
+			p = next_word(p);
+			bit = 0;
+		}
+		if (done && !(done % 8))
+			*out++ = ' ';
+		*out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
+		bit++;
+		done++;
+	}
+	*out = '\0';
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+				 const struct bkey *unpacked,
+				 const struct bkey_format *format)
+{
+	struct bkey tmp;
+
+	BUG_ON(bkeyp_val_u64s(format, packed) !=
+	       bkey_val_u64s(unpacked));
+
+	BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
+
+	tmp = __bch2_bkey_unpack_key(format, packed);
+
+	if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
+		char buf1[160], buf2[160];
+		char buf3[160], buf4[160];
+
+		bch2_bkey_to_text(&PBUF(buf1), unpacked);
+		bch2_bkey_to_text(&PBUF(buf2), &tmp);
+		bch2_to_binary(buf3, (void *) unpacked, 80);
+		bch2_to_binary(buf4, high_word(format, packed), 80);
+
+		panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+		      format->key_u64s,
+		      format->bits_per_field[0],
+		      format->bits_per_field[1],
+		      format->bits_per_field[2],
+		      format->bits_per_field[3],
+		      format->bits_per_field[4],
+		      buf1, buf2, buf3, buf4);
+	}
+}
+
+#else
+static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+					const struct bkey *unpacked,
+					const struct bkey_format *format) {}
+#endif
+
+struct pack_state {
+	const struct bkey_format *format;
+	unsigned		bits;	/* bits remaining in current word */
+	u64			w;	/* current word */
+	u64			*p;	/* pointer to next word */
+};
+
+__always_inline
+static struct pack_state pack_state_init(const struct bkey_format *format,
+					 struct bkey_packed *k)
+{
+	u64 *p = high_word(format, k);
+
+	return (struct pack_state) {
+		.format	= format,
+		.bits	= 64 - high_bit_offset,
+		.w	= 0,
+		.p	= p,
+	};
+}
+
+__always_inline
+static void pack_state_finish(struct pack_state *state,
+			      struct bkey_packed *k)
+{
+	EBUG_ON(state->p <  k->_data);
+	EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+
+	*state->p = state->w;
+}
+
+struct unpack_state {
+	const struct bkey_format *format;
+	unsigned		bits;	/* bits remaining in current word */
+	u64			w;	/* current word */
+	const u64		*p;	/* pointer to next word */
+};
+
+__always_inline
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
+					     const struct bkey_packed *k)
+{
+	const u64 *p = high_word(format, k);
+
+	return (struct unpack_state) {
+		.format	= format,
+		.bits	= 64 - high_bit_offset,
+		.w	= *p << high_bit_offset,
+		.p	= p,
+	};
+}
+
+__always_inline
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
+{
+	unsigned bits = state->format->bits_per_field[field];
+	u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
+
+	if (bits >= state->bits) {
+		v = state->w >> (64 - bits);
+		bits -= state->bits;
+
+		state->p = next_word(state->p);
+		state->w = *state->p;
+		state->bits = 64;
+	}
+
+	/* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+	v |= (state->w >> 1) >> (63 - bits);
+	state->w <<= bits;
+	state->bits -= bits;
+
+	return v + offset;
+}
+
+__always_inline
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+	unsigned bits = state->format->bits_per_field[field];
+	u64 offset = le64_to_cpu(state->format->field_offset[field]);
+
+	if (v < offset)
+		return false;
+
+	v -= offset;
+
+	if (fls64(v) > bits)
+		return false;
+
+	if (bits > state->bits) {
+		bits -= state->bits;
+		/* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+		state->w |= (v >> 1) >> (bits - 1);
+
+		*state->p = state->w;
+		state->p = next_word(state->p);
+		state->w = 0;
+		state->bits = 64;
+	}
+
+	state->bits -= bits;
+	state->w |= v << state->bits;
+
+	return true;
+}
+
+/*
+ * Note: does NOT set out->format (we don't know what it should be here!)
+ *
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
+ * if k is packed bkey_start_pos(k) will successfully pack
+ */
+static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
+				   struct bkey_packed *out,
+				   const struct bkey_format *in_f,
+				   const struct bkey_packed *in)
+{
+	struct pack_state out_s = pack_state_init(out_f, out);
+	struct unpack_state in_s = unpack_state_init(in_f, in);
+	unsigned i;
+
+	out->_data[0] = 0;
+
+	for (i = 0; i < BKEY_NR_FIELDS; i++)
+		if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
+			return false;
+
+	/* Can't happen because the val would be too big to unpack: */
+	EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
+	pack_state_finish(&out_s, out);
+	out->u64s	= out_f->key_u64s + in->u64s - in_f->key_u64s;
+	out->needs_whiteout = in->needs_whiteout;
+	out->type	= in->type;
+
+	return true;
+}
+
+bool bch2_bkey_transform(const struct bkey_format *out_f,
+			struct bkey_packed *out,
+			const struct bkey_format *in_f,
+			const struct bkey_packed *in)
+{
+	if (!bch2_bkey_transform_key(out_f, out, in_f, in))
+		return false;
+
+	memcpy_u64s((u64 *) out + out_f->key_u64s,
+		    (u64 *) in + in_f->key_u64s,
+		    (in->u64s - in_f->key_u64s));
+	return true;
+}
+
+#define bkey_fields()							\
+	x(BKEY_FIELD_INODE,		p.inode)			\
+	x(BKEY_FIELD_OFFSET,		p.offset)			\
+	x(BKEY_FIELD_SNAPSHOT,		p.snapshot)			\
+	x(BKEY_FIELD_SIZE,		size)				\
+	x(BKEY_FIELD_VERSION_HI,	version.hi)			\
+	x(BKEY_FIELD_VERSION_LO,	version.lo)
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
+			      const struct bkey_packed *in)
+{
+	struct unpack_state state = unpack_state_init(format, in);
+	struct bkey out;
+
+	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->u64s < format->key_u64s);
+	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+	EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+
+	out.u64s	= BKEY_U64s + in->u64s - format->key_u64s;
+	out.format	= KEY_FORMAT_CURRENT;
+	out.needs_whiteout = in->needs_whiteout;
+	out.type	= in->type;
+	out.pad[0]	= 0;
+
+#define x(id, field)	out.field = get_inc_field(&state, id);
+	bkey_fields()
+#undef x
+
+	return out;
+}
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
+				     const struct bkey_packed *in)
+{
+	struct unpack_state state = unpack_state_init(format, in);
+	struct bpos out;
+
+	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->u64s < format->key_u64s);
+	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+	out.inode	= get_inc_field(&state, BKEY_FIELD_INODE);
+	out.offset	= get_inc_field(&state, BKEY_FIELD_OFFSET);
+	out.snapshot	= get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+
+	return out;
+}
+#endif
+
+/**
+ * bch2_bkey_pack_key -- pack just the key, not the value
+ */
+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+		   const struct bkey_format *format)
+{
+	struct pack_state state = pack_state_init(format, out);
+
+	EBUG_ON((void *) in == (void *) out);
+	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+	out->_data[0] = 0;
+
+#define x(id, field)	if (!set_inc_field(&state, id, in->field)) return false;
+	bkey_fields()
+#undef x
+
+	/*
+	 * Extents - we have to guarantee that if an extent is packed, a trimmed
+	 * version will also pack:
+	 */
+	if (bkey_start_offset(in) <
+	    le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
+		return false;
+
+	pack_state_finish(&state, out);
+	out->u64s	= format->key_u64s + in->u64s - BKEY_U64s;
+	out->format	= KEY_FORMAT_LOCAL_BTREE;
+	out->needs_whiteout = in->needs_whiteout;
+	out->type	= in->type;
+
+	bch2_bkey_pack_verify(out, in, format);
+	return true;
+}
+
+/**
+ * bch2_bkey_unpack -- unpack the key and the value
+ */
+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
+		 const struct bkey_packed *src)
+{
+	__bkey_unpack_key(b, &dst->k, src);
+
+	memcpy_u64s(&dst->v,
+		    bkeyp_val(&b->format, src),
+		    bkeyp_val_u64s(&b->format, src));
+}
+
+/**
+ * bch2_bkey_pack -- pack the key and the value
+ */
+bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
+	       const struct bkey_format *format)
+{
+	struct bkey_packed tmp;
+
+	if (!bch2_bkey_pack_key(&tmp, &in->k, format))
+		return false;
+
+	memmove_u64s((u64 *) out + format->key_u64s,
+		     &in->v,
+		     bkey_val_u64s(&in->k));
+	memcpy_u64s(out, &tmp, format->key_u64s);
+
+	return true;
+}
+
+__always_inline
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
+{
+	unsigned bits = state->format->bits_per_field[field];
+	u64 offset = le64_to_cpu(state->format->field_offset[field]);
+	bool ret = true;	/* false once v had to be clamped */
+
+	EBUG_ON(v < offset);
+	v -= offset;
+
+	if (fls64(v) > bits) {
+		v = ~(~0ULL << bits);	/* too wide: store the field's max */
+		ret = false;
+	}
+
+	if (bits > state->bits) {
+		bits -= state->bits;
+		state->w |= (v >> 1) >> (bits - 1);	/* v >> bits, safe at 64 */
+
+		*state->p = state->w;
+		state->p = next_word(state->p);
+		state->w = 0;
+		state->bits = 64;
+	}
+
+	state->bits -= bits;
+	state->w |= v << state->bits;
+
+	return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool bkey_packed_successor(struct bkey_packed *out,
+				  const struct btree *b,
+				  struct bkey_packed k)
+{
+	const struct bkey_format *f = &b->format;
+	unsigned nr_key_bits = b->nr_key_bits;
+	unsigned first_bit, offset;
+	u64 *p;
+
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+	if (!nr_key_bits)
+		return false;
+
+	*out = k;
+
+	first_bit = high_bit_offset + nr_key_bits - 1;
+	p = nth_word(high_word(f, out), first_bit >> 6);
+	offset = 63 - (first_bit & 63);
+
+	while (nr_key_bits) {
+		unsigned bits = min(64 - offset, nr_key_bits);
+		u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+		if ((*p & mask) != mask) {
+			*p += 1ULL << offset;
+			EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+			return true;
+		}
+
+		*p &= ~mask;
+		p = prev_word(p);
+		nr_key_bits -= bits;
+		offset = 0;
+	}
+
+	return false;
+}
+#endif
+
+/*
+ * Packs position @in into @out using the key format of btree node @b, rounding
+ * down to a smaller position when @in can't be represented exactly.
+ *
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
+ * able to compare against the keys in the auxiliary search tree - and it's
+ * legal to use a packed pos that isn't equivalent to the original pos,
+ * _provided_ it compares <= to the original pos.
+ *
+ * Returns BKEY_PACK_POS_EXACT if @out is equivalent to @in,
+ * BKEY_PACK_POS_SMALLER if @out compares strictly less than @in, or
+ * BKEY_PACK_POS_FAIL if no position <= @in can be packed (in which case @out
+ * has not been fully initialized).
+ */
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
+					   struct bpos in,
+					   const struct btree *b)
+{
+	const struct bkey_format *f = &b->format;
+	struct pack_state state = pack_state_init(f, out);
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bpos orig = in;
+#endif
+	bool exact = true;
+
+	out->_data[0] = 0;
+
+	/*
+	 * A field smaller than the format's offset for that field can't be
+	 * stored. Round down instead: borrow one from the next more significant
+	 * field and saturate the less significant ones.
+	 */
+
+	/* snapshot too small: step offset back by one, borrowing from inode */
+	if (unlikely(in.snapshot <
+		     le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
+		if (!in.offset-- &&
+		    !in.inode--)
+			return BKEY_PACK_POS_FAIL;
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	/* offset too small: step inode back by one */
+	if (unlikely(in.offset <
+		     le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
+		if (!in.inode--)
+			return BKEY_PACK_POS_FAIL;
+		in.offset	= KEY_OFFSET_MAX;
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	/* inode is the most significant field: nothing left to borrow from */
+	if (unlikely(in.inode <
+		     le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
+		return BKEY_PACK_POS_FAIL;
+
+	/*
+	 * A false return from set_inc_field_lossy() is treated as "this field
+	 * was not stored exactly"; every less significant field is then raised
+	 * to its maximum.
+	 * NOTE(review): set_inc_field_lossy() is defined outside this chunk -
+	 * presumably it stores the largest representable value in that case;
+	 * confirm.
+	 */
+	if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
+		in.offset	= KEY_OFFSET_MAX;
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
+		exact = false;
+
+	pack_state_finish(&state, out);
+	out->u64s	= f->key_u64s;
+	out->format	= KEY_FORMAT_LOCAL_BTREE;
+	out->type	= KEY_TYPE_deleted;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	if (exact) {
+		/* an exact result must compare equal to the original */
+		BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+	} else {
+		struct bkey_packed successor;
+
+		/*
+		 * A lossy result must be strictly smaller than the original,
+		 * and its packed successor (if there is one) must not also be
+		 * smaller than the original.
+		 */
+		BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+		BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+		       bkey_cmp_left_packed(b, &successor, &orig) < 0);
+	}
+#endif
+
+	return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
+}
+
+/*
+ * Resets @s to the empty range for every field (min = U64_MAX, max = 0), so
+ * that the first value added to a field sets both its min and its max.
+ */
+void bch2_bkey_format_init(struct bkey_format_state *s)
+{
+	unsigned idx;
+
+	idx = ARRAY_SIZE(s->field_min);
+	while (idx--)
+		s->field_min[idx] = U64_MAX;
+
+	idx = ARRAY_SIZE(s->field_max);
+	while (idx--)
+		s->field_max[idx] = 0;
+
+	/* A size of 0 must always be representable: */
+	s->field_min[BKEY_FIELD_SIZE] = 0;
+}
+
+/* Widens the recorded [min, max] range of @field in @s to include @v */
+static void __bkey_format_add(struct bkey_format_state *s,
+			      unsigned field, u64 v)
+{
+	if (v < s->field_min[field])
+		s->field_min[field] = v;
+
+	if (v > s->field_max[field])
+		s->field_max[field] = v;
+}
+
+/*
+ * Widens the field ranges in @s so that @k can be successfully packed with a
+ * format generated from @s by bch2_bkey_format_done().
+ */
+void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+	/* bkey_fields() expands x() once per key field: add each one to @s */
+#define x(id, field) __bkey_format_add(s, id, k->field);
+	bkey_fields()
+#undef x
+	/* also cover the key's start offset (p.offset - size), not just its end */
+	__bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
+}
+
+/*
+ * Widens the ranges of the first three fields in @s (indices 0, 1 and 2) so
+ * that @p's inode, offset and snapshot, in that order, are representable.
+ */
+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
+{
+	const u64 pos_fields[] = { p.inode, p.offset, p.snapshot };
+	unsigned field;
+
+	for (field = 0; field < ARRAY_SIZE(pos_fields); field++)
+		__bkey_format_add(s, field, pos_fields[field]);
+}
+
+/*
+ * Sets the width and offset of field @i in @f.
+ *
+ * The offset is clamped so that offset + (2^bits - 1) never exceeds U64_MAX:
+ * we don't want the packed format to be able to represent values bigger than a
+ * u64, since that causes confusion and issues (like with
+ * bkey_packed_successor()). A full 64 bit field therefore always gets offset 0.
+ */
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
+			     unsigned bits, u64 offset)
+{
+	if (bits == 64) {
+		offset = 0;
+	} else {
+		u64 max_offset = U64_MAX - ((1ULL << bits) - 1);
+
+		if (offset > max_offset)
+			offset = max_offset;
+	}
+
+	f->field_offset[i]	= cpu_to_le64(offset);
+	f->bits_per_field[i]	= bits;
+}
+
+/*
+ * Builds a bkey_format from the per-field minimums and maximums accumulated in
+ * @s: each field gets field_min as its offset and just enough bits to span
+ * field_max - field_min, then spare bits in the last key word are used to round
+ * field widths up to whole bytes.
+ *
+ * Note that @s is modified: field_min is clamped to field_max.
+ */
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
+{
+	unsigned i, bits = KEY_PACKED_BITS_START;
+	struct bkey_format ret = {
+		.nr_fields = BKEY_NR_FIELDS,
+	};
+
+	for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
+		/*
+		 * A field nothing was added to still has min > max (see
+		 * bch2_bkey_format_init()); clamp so the subtraction below
+		 * can't wrap.
+		 */
+		s->field_min[i] = min(s->field_min[i], s->field_max[i]);
+
+		/* fls64(range) is the number of bits needed to store range */
+		set_format_field(&ret, i,
+				 fls64(s->field_max[i] - s->field_min[i]),
+				 s->field_min[i]);
+
+		bits += ret.bits_per_field[i];
+	}
+
+	/* allow for extent merging: */
+	if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
+		ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
+		bits += 4;
+	}
+
+	ret.key_u64s = DIV_ROUND_UP(bits, 64);
+
+	/* if we have enough spare bits, round fields up to nearest byte */
+	bits = ret.key_u64s * 64 - bits;
+
+	for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
+		/* r: extra bits needed to make this field a multiple of 8 */
+		unsigned r = round_up(ret.bits_per_field[i], 8) -
+			ret.bits_per_field[i];
+
+		if (r <= bits) {
+			/* going through set_format_field() re-clamps the offset */
+			set_format_field(&ret, i,
+					 ret.bits_per_field[i] + r,
+					 le64_to_cpu(ret.field_offset[i]));
+			bits -= r;
+		}
+	}
+
+	EBUG_ON(bch2_bkey_format_validate(&ret));
+	return ret;
+}
+
+/*
+ * Sanity checks @f: the field count must match, no field may be wider than 64
+ * bits or able to represent a value that overflows a u64, and key_u64s must
+ * match the total number of bits.
+ *
+ * Returns NULL if @f is valid, otherwise a string describing the problem.
+ */
+const char *bch2_bkey_format_validate(struct bkey_format *f)
+{
+	unsigned total_bits = KEY_PACKED_BITS_START;
+	unsigned idx;
+
+	if (f->nr_fields != BKEY_NR_FIELDS)
+		return "incorrect number of fields";
+
+	for (idx = 0; idx < f->nr_fields; idx++) {
+		unsigned field_bits = f->bits_per_field[idx];
+		u64 base = le64_to_cpu(f->field_offset[idx]);
+
+		if (field_bits > 64)
+			return "field too large";
+
+		if (base) {
+			/* a full width field leaves no room for an offset */
+			if (field_bits == 64)
+				return "offset + bits overflow";
+
+			/* largest representable value must not wrap */
+			if (base + ((1ULL << field_bits) - 1) < base)
+				return "offset + bits overflow";
+		}
+
+		total_bits += field_bits;
+	}
+
+	if (f->key_u64s != DIV_ROUND_UP(total_bits, 64))
+		return "incorrect key_u64s";
+
+	return NULL;
+}
+
+/*
+ * Most significant differing bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ *
+ * Compares the key bits of @l_k and @r_k word by word, starting at the most
+ * significant word. Also returns 0 if no key bit differs, so a return of 0 by
+ * itself doesn't tell the caller that bit 0 actually differs.
+ */
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
+					  const struct bkey_packed *l_k,
+					  const struct bkey_packed *r_k)
+{
+	const u64 *l = high_word(&b->format, l_k);
+	const u64 *r = high_word(&b->format, r_k);
+	unsigned nr_key_bits = b->nr_key_bits;
+	/* number of key bits in the current word (first word may hold a header) */
+	unsigned word_bits = 64 - high_bit_offset;
+	u64 l_v, r_v;
+
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+	/* for big endian, skip past header */
+	l_v = *l & (~0ULL >> high_bit_offset);
+	r_v = *r & (~0ULL >> high_bit_offset);
+
+	while (nr_key_bits) {
+		if (nr_key_bits < word_bits) {
+			/* last word: shift off the low bits that aren't key bits */
+			l_v >>= word_bits - nr_key_bits;
+			r_v >>= word_bits - nr_key_bits;
+			nr_key_bits = 0;
+		} else {
+			nr_key_bits -= word_bits;
+		}
+
+		/*
+		 * nr_key_bits is now the number of key bits in less significant
+		 * words, i.e. the index of this word's lowest key bit.
+		 */
+		if (l_v != r_v)
+			return fls64(l_v ^ r_v) - 1 + nr_key_bits;
+
+		l = next_word(l);
+		r = next_word(r);
+
+		l_v = *l;
+		r_v = *r;
+		word_bits = 64;
+	}
+
+	return 0;
+}
+
+/*
+ * First set bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ *
+ * Scans the key bits of @k from least significant to most significant and
+ * returns the index of the first one that is set. Also returns 0 if no key bit
+ * is set.
+ */
+__pure
+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
+{
+	const u64 *p = high_word(&b->format, k);
+	unsigned nr_key_bits = b->nr_key_bits;
+	unsigned ret = 0, offset;
+
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+	/* advance from the high word to the word holding the lowest key bits */
+	offset = nr_key_bits;
+	while (offset > 64) {
+		p = next_word(p);
+		offset -= 64;
+	}
+
+	/* offset: bit position within *p of the least significant key bit */
+	offset = 64 - offset;
+
+	/* walk back towards the high word, one word's worth of key bits at a time */
+	while (nr_key_bits) {
+		unsigned bits = nr_key_bits + offset < 64
+			? nr_key_bits
+			: 64 - offset;
+
+		/* mask selecting the key bits of this word */
+		u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+		if (*p & mask)
+			return ret + __ffs64(*p & mask) - offset;
+
+		p = prev_word(p);
+		nr_key_bits -= bits;
+		ret += bits;
+		offset = 0;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Compares the top @nr_key_bits bits of two packed keys, given pointers @l and
+ * @r to their most significant words; successive words are read from
+ * decreasing addresses.
+ *
+ * Returns 1 if l > r, -1 if l < r, 0 if the key bits are equal.
+ */
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+				  unsigned nr_key_bits)
+{
+	long d0, d1, d2, d3;
+	int cmp;
+
+	/* we shouldn't need asm for this, but gcc generates poor code for it: */
+
+	/*
+	 * rdi = l, rsi = r, ecx = remaining key bits, eax/edx = result scratch.
+	 *
+	 * 1: load one word from each key; if at least 64 key bits remain,
+	 *    compare whole words and either finish (3:) or step to the next
+	 *    lower word.
+	 * 2: fewer than 64 bits remain. ecx now holds remaining - 64, so
+	 *    "not ecx" gives 63 - remaining; shifting right by 1 and then by
+	 *    cl shifts by 64 - remaining in total, leaving only the key bits.
+	 * 3: convert the unsigned comparison flags to 1/0/-1.
+	 */
+	asm(".intel_syntax noprefix;"
+	    "xor eax, eax;"
+	    "xor edx, edx;"
+	    "1:;"
+	    "mov r8, [rdi];"
+	    "mov r9, [rsi];"
+	    "sub ecx, 64;"
+	    "jl 2f;"
+
+	    "cmp r8, r9;"
+	    "jnz 3f;"
+
+	    "lea rdi, [rdi - 8];"
+	    "lea rsi, [rsi - 8];"
+	    "jmp 1b;"
+
+	    "2:;"
+	    "not ecx;"
+	    "shr r8, 1;"
+	    "shr r9, 1;"
+	    "shr r8, cl;"
+	    "shr r9, cl;"
+	    "cmp r8, r9;"
+
+	    "3:\n"
+	    "seta al;"
+	    "setb dl;"
+	    "sub eax, edx;"
+	    ".att_syntax prefix;"
+	    : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+	    : "0" (l), "1" (r), "3" (nr_key_bits)
+	    : "r8", "r9", "cc", "memory");
+
+	return cmp;
+}
+
+/*
+ * Helpers for emitting machine code: I() stores one value through the cursor
+ * `out` and advances it, I1()..I5() emit that many values in order. They expect
+ * a variable named `out` to be in scope at the point of use.
+ */
+#define I(_x)			(*(out)++ = (_x))
+#define I1(i0)						I(i0)
+#define I2(i0, i1)		(I1(i0),		I(i1))
+#define I3(i0, i1, i2)		(I2(i0, i1),		I(i2))
+#define I4(i0, i1, i2, i3)	(I3(i0, i1, i2),	I(i3))
+#define I5(i0, i1, i2, i3, i4)	(I4(i0, i1, i2, i3),	I(i4))
+
+/*
+ * Emits x86-64 machine code at @out that unpacks one field of a packed key.
+ *
+ * The generated code reads field @field from the packed key pointed to by rsi,
+ * adds the format's offset for that field, and stores the result to
+ * [rdi + @dst_offset] as a @dst_size byte value (4 or 8; anything else BUG()s).
+ * @dst_offset and the source byte offset are emitted as single byte
+ * displacements.
+ *
+ * @eax_zeroed tracks whether the generated code is known to leave eax == 0 at
+ * this point, so that consecutive fields with no bits and no offset can skip
+ * zeroing it again.
+ *
+ * Returns the output cursor advanced past the emitted code.
+ */
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
+			      enum bch_bkey_fields field,
+			      unsigned dst_offset, unsigned dst_size,
+			      bool *eax_zeroed)
+{
+	unsigned bits = format->bits_per_field[field];
+	u64 offset = le64_to_cpu(format->field_offset[field]);
+	unsigned i, byte, bit_offset, align, shl, shr;
+
+	/* field is always 0: just store a zeroed eax/rax */
+	if (!bits && !offset) {
+		if (!*eax_zeroed) {
+			/* xor eax, eax */
+			I2(0x31, 0xc0);
+		}
+
+		*eax_zeroed = true;
+		goto set_field;
+	}
+
+	/* field is a constant: store the offset as an immediate */
+	if (!bits) {
+		/* just return offset: */
+
+		switch (dst_size) {
+		case 8:
+			if (offset > S32_MAX) {
+				/* doesn't fit a sign extended imm32: two 32 bit stores */
+
+				/* mov [rdi + dst_offset], offset */
+				I3(0xc7, 0x47, dst_offset);
+				memcpy(out, &offset, 4);
+				out += 4;
+
+				/* mov [rdi + dst_offset + 4], offset >> 32 */
+				I3(0xc7, 0x47, dst_offset + 4);
+				memcpy(out, (void *) &offset + 4, 4);
+				out += 4;
+			} else {
+				/* mov [rdi + dst_offset], offset */
+				/* sign extended */
+				I4(0x48, 0xc7, 0x47, dst_offset);
+				memcpy(out, &offset, 4);
+				out += 4;
+			}
+			break;
+		case 4:
+			/* mov [rdi + dst_offset], offset */
+			I3(0xc7, 0x47, dst_offset);
+			memcpy(out, &offset, 4);
+			out += 4;
+			break;
+		default:
+			BUG();
+		}
+
+		return out;
+	}
+
+	/*
+	 * Fields are laid out from the top of the key downwards: the position
+	 * of this field's lowest bit is the key size in bits minus the widths
+	 * of all fields up to and including this one.
+	 */
+	bit_offset = format->key_u64s * 64;
+	for (i = 0; i <= field; i++)
+		bit_offset -= format->bits_per_field[i];
+
+	/* split into a byte offset from rsi and a bit offset within that byte */
+	byte = bit_offset / 8;
+	bit_offset -= byte * 8;
+
+	*eax_zeroed = false;
+
+	if (bit_offset == 0 && bits == 8) {
+		/* movzx eax, BYTE PTR [rsi + imm8] */
+		I4(0x0f, 0xb6, 0x46, byte);
+	} else if (bit_offset == 0 && bits == 16) {
+		/* movzx eax, WORD PTR [rsi + imm8] */
+		I4(0x0f, 0xb7, 0x46, byte);
+	} else if (bit_offset + bits <= 32) {
+		/* field fits in one 32 bit load: move the load down towards 4 byte alignment */
+		align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+		byte -= align;
+		bit_offset += align * 8;
+
+		BUG_ON(bit_offset + bits > 32);
+
+		/* mov eax, [rsi + imm8] */
+		I3(0x8b, 0x46, byte);
+
+		if (bit_offset) {
+			/* shr eax, imm8 */
+			I3(0xc1, 0xe8, bit_offset);
+		}
+
+		/* mask off the bits above the field, if there are any */
+		if (bit_offset + bits < 32) {
+			unsigned mask = ~0U >> (32 - bits);
+
+			/* and eax, imm32 */
+			I1(0x25);
+			memcpy(out, &mask, 4);
+			out += 4;
+		}
+	} else if (bit_offset + bits <= 64) {
+		/* field fits in one 64 bit load: move the load down towards 8 byte alignment */
+		align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
+		byte -= align;
+		bit_offset += align * 8;
+
+		BUG_ON(bit_offset + bits > 64);
+
+		/* mov rax, [rsi + imm8] */
+		I4(0x48, 0x8b, 0x46, byte);
+
+		/* shift left to drop the bits above the field, then right to drop those below */
+		shl = 64 - bit_offset - bits;
+		shr = bit_offset + shl;
+
+		if (shl) {
+			/* shl rax, imm8 */
+			I4(0x48, 0xc1, 0xe0, shl);
+		}
+
+		if (shr) {
+			/* shr rax, imm8 */
+			I4(0x48, 0xc1, 0xe8, shr);
+		}
+	} else {
+		/* field straddles a 64 bit load: combine rax with bits from the next word */
+		align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+		byte -= align;
+		bit_offset += align * 8;
+
+		BUG_ON(bit_offset + bits > 96);
+
+		/* mov rax, [rsi + byte] */
+		I4(0x48, 0x8b, 0x46, byte);
+
+		/* mov edx, [rsi + byte + 8] */
+		I3(0x8b, 0x56, byte + 8);
+
+		/* bits from next word: */
+		shr = bit_offset + bits - 64;
+		BUG_ON(shr > bit_offset);
+
+		/* shr rax, imm8 (make room at the top for the bits from rdx) */
+		I4(0x48, 0xc1, 0xe8, shr);
+
+		/* shl rdx, imm8 */
+		I4(0x48, 0xc1, 0xe2, 64 - shr);
+
+		/* or rax, rdx */
+		I3(0x48, 0x09, 0xd0);
+
+		/* whatever is left of bit_offset still has to be shifted out */
+		shr = bit_offset - shr;
+
+		if (shr) {
+			/* shr rax, imm8 */
+			I4(0x48, 0xc1, 0xe8, shr);
+		}
+	}
+
+	/* rax += offset: */
+	if (offset > S32_MAX) {
+		/* mov rdx, imm64 */
+		I2(0x48, 0xba);
+		memcpy(out, &offset, 8);
+		out += 8;
+		/* add rax, rdx */
+		I3(0x48, 0x01, 0xd0);
+	} else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
+		/* result may not fit in 32 bits: 64 bit add */
+
+		/* add rax, imm32 */
+		I2(0x48, 0x05);
+		memcpy(out, &offset, 4);
+		out += 4;
+	} else if (offset) {
+		/* add eax, imm32 */
+		I1(0x05);
+		memcpy(out, &offset, 4);
+		out += 4;
+	}
+set_field:
+	switch (dst_size) {
+	case 8:
+		/* mov [rdi + dst_offset], rax */
+		I4(0x48, 0x89, 0x47, dst_offset);
+		break;
+	case 4:
+		/* mov [rdi + dst_offset], eax */
+		I3(0x89, 0x47, dst_offset);
+		break;
+	default:
+		BUG();
+	}
+
+	return out;
+}
+
+/*
+ * Generates an x86-64 function at @_out that unpacks a key packed with @format:
+ * it fixes up the key header, then runs the code emitted by
+ * compile_bkey_field() for every key field, then returns.
+ *
+ * Returns the number of bytes of code written to @_out. Nothing here bounds the
+ * output, so the caller has to provide a large enough buffer.
+ */
+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
+{
+	bool eax_zeroed = false;
+	u8 *out = _out;
+
+	/*
+	 * rdi: dst - unpacked key
+	 * rsi: src - packed key
+	 */
+
+	/* k->u64s, k->format, k->type */
+
+	/* mov eax, [rsi] */
+	I2(0x8b, 0x06);
+
+	/*
+	 * The immediate's low byte grows u64s by the difference between the
+	 * unpacked and packed key sizes; its second byte adds
+	 * KEY_FORMAT_CURRENT.
+	 * NOTE(review): presumably that second byte lands on k->format, which
+	 * would be 0 for a packed key - confirm against the header layout.
+	 */
+	/* add eax, BKEY_U64s - format->key_u64s */
+	I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
+
+	/* and eax, imm32: mask out k->pad: */
+	I5(0x25, 0xff, 0xff, 0xff, 0);
+
+	/* mov [rdi], eax */
+	I2(0x89, 0x07);
+
+	/* one compile_bkey_field() call per field listed by bkey_fields() */
+#define x(id, field)							\
+	out = compile_bkey_field(format, out, id,			\
+				 offsetof(struct bkey, field),		\
+				 sizeof(((struct bkey *) NULL)->field),	\
+				 &eax_zeroed);
+	bkey_fields()
+#undef x
+
+	/* retq */
+	I1(0xc3);
+
+	return (void *) out - _out;
+}
+
+#else
+/*
+ * Portable version: compares the top @nr_key_bits key bits of two packed keys,
+ * given pointers @l and @r to their most significant words.
+ *
+ * Returns the result of cmp_int() on the first pair of words that differ, or
+ * 0 if all key bits are equal.
+ */
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+				  unsigned nr_key_bits)
+{
+	u64 l_v, r_v;
+
+	if (!nr_key_bits)
+		return 0;
+
+	/* for big endian, skip past header */
+	nr_key_bits += high_bit_offset;
+	l_v = *l & (~0ULL >> high_bit_offset);
+	r_v = *r & (~0ULL >> high_bit_offset);
+
+	while (1) {
+		if (nr_key_bits < 64) {
+			/* last word: shift off the low bits that aren't key bits */
+			l_v >>= 64 - nr_key_bits;
+			r_v >>= 64 - nr_key_bits;
+			nr_key_bits = 0;
+		} else {
+			nr_key_bits -= 64;
+		}
+
+		/* stop on the first difference, or once every key bit was compared */
+		if (!nr_key_bits || l_v != r_v)
+			break;
+
+		l = next_word(l);
+		r = next_word(r);
+
+		l_v = *l;
+		r_v = *r;
+	}
+
+	return cmp_int(l_v, r_v);
+}
+#endif
+
+/*
+ * Compares two keys that are both packed in @b's format by comparing their
+ * packed key bits directly, without unpacking. Same sign convention as
+ * bkey_cmp(), which debug builds check the result against.
+ */
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
+					  const struct bkey_packed *r,
+					  const struct btree *b)
+{
+	const struct bkey_format *fmt = &b->format;
+	const u64 *l_high, *r_high;
+	int cmp;
+
+	EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(fmt));
+
+	l_high = high_word(fmt, l);
+	r_high = high_word(fmt, r);
+
+	cmp = __bkey_cmp_bits(l_high, r_high, b->nr_key_bits);
+
+	EBUG_ON(cmp != bkey_cmp(bkey_unpack_pos(b, l),
+				bkey_unpack_pos(b, r)));
+	return cmp;
+}
+
+/*
+ * Compares packed key @l against unpacked position @r by unpacking @l's
+ * position first.
+ */
+__pure __flatten
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
+					       const struct bkey_packed *l,
+					       const struct bpos *r)
+{
+	struct bpos l_pos = bkey_unpack_pos_format_checked(b, l);
+
+	return bkey_cmp(l_pos, *r);
+}
+
+/*
+ * Compares two keys, each of which may or may not actually be packed: checks
+ * the format field of both at runtime and dispatches to the matching
+ * comparison.
+ */
+__pure __flatten
+int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
+			   const struct bkey_packed *r,
+			   const struct btree *b)
+{
+	int which = bkey_lr_packed(l, r);
+
+	/* common case: both keys are in the btree node's packed format */
+	if (likely(which == BKEY_PACKED_BOTH))
+		return __bch2_bkey_cmp_packed_format_checked(l, r, b);
+
+	if (which == BKEY_PACKED_NONE)
+		return bkey_cmp(((struct bkey *) l)->p,
+				((struct bkey *) r)->p);
+
+	if (which == BKEY_PACKED_LEFT)
+		return __bch2_bkey_cmp_left_packed_format_checked(b,
+				  (struct bkey_packed *) l,
+				  &((struct bkey *) r)->p);
+
+	/* only the right key is packed: compare the other way round and negate */
+	if (which == BKEY_PACKED_RIGHT)
+		return -__bch2_bkey_cmp_left_packed_format_checked(b,
+				  (struct bkey_packed *) r,
+				  &((struct bkey *) l)->p);
+
+	unreachable();
+}
+
+/*
+ * Compares @l, which may or may not actually be packed, against unpacked
+ * position @r.
+ */
+__pure __flatten
+int __bch2_bkey_cmp_left_packed(const struct btree *b,
+				const struct bkey_packed *l,
+				const struct bpos *r)
+{
+	const struct bkey *l_unpacked = packed_to_bkey_c(l);
+
+	/* @l turned out to be an unpacked key: compare positions directly */
+	if (unlikely(l_unpacked))
+		return bkey_cmp(l_unpacked->p, *r);
+
+	return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+/* Reverses the order of all the bytes of @p, in place */
+void bch2_bpos_swab(struct bpos *p)
+{
+	u8 *lo, *hi;
+
+	for (lo = (u8 *) p, hi = ((u8 *) &p[1]) - 1;
+	     lo < hi;
+	     lo++, --hi)
+		swap(*lo, *hi);
+}
+
+/*
+ * Reverses, in place, the bytes of @k from key_start up to the end of the key
+ * as sized by its format: @_f if @k is packed, the current unpacked format
+ * otherwise.
+ */
+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
+{
+	const struct bkey_format *fmt;
+	u8 *lo, *hi;
+
+	if (bkey_packed(k))
+		fmt = _f;
+	else
+		fmt = &bch2_bkey_format_current;
+
+	lo = k->key_start;
+	hi = (u8 *) (k->_data + fmt->key_u64s) - 1;
+
+	for (; lo < hi; lo++, --hi)
+		swap(*lo, *hi);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/*
+ * Debug-only self test: reads every field of a fixed test key back through the
+ * generic unpack path, checks it against the struct member, repacks it into a
+ * test format with a 13 bit and a 64 bit field, and finally checks that
+ * bch2_bkey_pack_key() succeeds for that key. Panics or BUG()s on any mismatch.
+ */
+void bch2_bkey_pack_test(void)
+{
+	struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
+	struct bkey_packed p;
+
+	/* only the first two fields get any bits; the rest are 0 bits wide */
+	struct bkey_format test_format = {
+		.key_u64s	= 2,
+		.nr_fields	= BKEY_NR_FIELDS,
+		.bits_per_field = {
+			13,
+			64,
+		},
+	};
+
+	/* treat the unpacked key as a key packed in the current format */
+	struct unpack_state in_s =
+		unpack_state_init(&bch2_bkey_format_current, (void *) &t);
+	struct pack_state out_s = pack_state_init(&test_format, &p);
+	unsigned i;
+
+	for (i = 0; i < out_s.format->nr_fields; i++) {
+		u64 a, v = get_inc_field(&in_s, i);
+
+		/* a: the expected value, read directly from the struct member */
+		switch (i) {
+#define x(id, field)	case id: a = t.field; break;
+	bkey_fields()
+#undef x
+		default:
+			BUG();
+		}
+
+		if (a != v)
+			panic("got %llu actual %llu i %u\n", v, a, i);
+
+		if (!set_inc_field(&out_s, i, v))
+			panic("failed at %u\n", i);
+	}
+
+	BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
+}
+#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
new file mode 100644
index 000000000000..b26f4934b264
--- /dev/null
+++ b/fs/bcachefs/bkey.h
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_H
+#define _BCACHEFS_BKEY_H
+
+#include <linux/bug.h>
+#include "bcachefs_format.h"
+
+#include "util.h"
+#include "vstructs.h"
+
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHEFS_COMPILED_UNPACK	1
+#endif
+
+void bch2_to_binary(char *, const u64 *, unsigned);
+
+/*
+ * bkey with split value, const: the key and its value are referenced through
+ * two separate pointers, neither of which may be written through.
+ */
+struct bkey_s_c {
+	const struct bkey	*k;
+	const struct bch_val	*v;
+};
+
+/*
+ * bkey with split value: like struct bkey_s_c but writable. The union lets a
+ * bkey_s be used as a bkey_s_c (via .s_c) without a conversion function.
+ */
+struct bkey_s {
+	union {
+	struct {
+		struct bkey	*k;
+		struct bch_val	*v;
+	};
+	struct bkey_s_c		s_c;
+	};
+};
+
+/* The key following @_k in a buffer of keys (see vstruct_next()) */
+#define bkey_next(_k)		vstruct_next(_k)
+
+/* Size of @_k's value in u64s: total size minus the unpacked key header */
+#define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
+
+/* Size of @k's value in bytes */
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+	size_t val_u64s = bkey_val_u64s(k);
+
+	return val_u64s * sizeof(u64);
+}
+
+/* Sets @k's total size for a value of @val_u64s u64s */
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+	k->u64s = val_u64s + BKEY_U64s;
+}
+
+/* Sets @k's total size for a value of @bytes bytes, rounded up to whole u64s */
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+	unsigned val_u64s = DIV_ROUND_UP(bytes, sizeof(u64));
+
+	k->u64s = BKEY_U64s + val_u64s;
+}
+
+/* Pointer just past the end of the value of split key @_k (a bkey_s or bkey_s_c) */
+#define bkey_val_end(_k)	((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+/* True if @_k is of type KEY_TYPE_deleted */
+#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
+
+/* True if @_k is either a deleted key or a discard key */
+#define bkey_whiteout(_k)				\
+	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
+
+/*
+ * Fails the build unless @_k is a struct bkey * or a struct bkey_packed *;
+ * evaluates to true if it's a struct bkey_packed *.
+ */
+#define bkey_packed_typecheck(_k)					\
+({									\
+	BUILD_BUG_ON(!type_is(_k, struct bkey *) &&			\
+		     !type_is(_k, struct bkey_packed *));		\
+	type_is(_k, struct bkey_packed *);				\
+})
+
+/*
+ * Which of a (left, right) pair of keys are packed. The values are chosen so
+ * that bit 0 is set when the left key is NOT packed and bit 1 is set when the
+ * right key is NOT packed, matching the arithmetic in the two macros below.
+ */
+enum bkey_lr_packed {
+	BKEY_PACKED_BOTH,
+	BKEY_PACKED_RIGHT,
+	BKEY_PACKED_LEFT,
+	BKEY_PACKED_NONE,
+};
+
+/* Classifies a pair of keys by their static (pointer) types */
+#define bkey_lr_packed_typecheck(_l, _r)				\
+	(!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
+
+/*
+ * Classifies a pair of keys by their format field.
+ * NOTE(review): this only matches the enum if ->format is 0 for packed keys
+ * and 1 for unpacked ones - confirm against the KEY_FORMAT_* definitions.
+ */
+#define bkey_lr_packed(_l, _r)						\
+	((_l)->format + ((_r)->format << 1))
+
+/*
+ * Copies key @_src (header and value, ->u64s u64s in total) to @_dst. Both must
+ * be struct bkey_i * or struct bkey_packed *. Debug builds assert that @_dst
+ * doesn't start inside @_src.
+ */
+#define bkey_copy(_dst, _src)					\
+do {								\
+	BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) &&		\
+		     !type_is(_dst, struct bkey_packed *));	\
+	BUILD_BUG_ON(!type_is(_src, struct bkey_i *) &&		\
+		     !type_is(_src, struct bkey_packed *));	\
+	EBUG_ON((u64 *) (_dst) > (u64 *) (_src) &&		\
+		(u64 *) (_dst) < (u64 *) (_src) +		\
+		((struct bkey *) (_src))->u64s);		\
+								\
+	memcpy_u64s_small((_dst), (_src),			\
+			  ((struct bkey *) (_src))->u64s);	\
+} while (0)
+
+struct btree;
+
+/*
+ * Accumulates the smallest and largest value seen for each key field, from
+ * which bch2_bkey_format_done() generates a format.
+ */
+struct bkey_format_state {
+	u64 field_min[BKEY_NR_FIELDS];
+	u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+const char *bch2_bkey_format_validate(struct bkey_format *);
+
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
+					  const struct bkey_packed *,
+					  const struct bkey_packed *);
+__pure
+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
+				     const struct bkey_packed *,
+				     const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
+					  const struct bkey_packed *,
+					  const struct bpos *);
+
+__pure
+int __bch2_bkey_cmp_packed(const struct bkey_packed *,
+			   const struct bkey_packed *,
+			   const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed(const struct btree *,
+				const struct bkey_packed *,
+				const struct bpos *);
+
+/* Compares (possibly packed) key @l against unpacked position @r */
+static inline __pure
+int bkey_cmp_left_packed(const struct btree *b,
+			 const struct bkey_packed *l, const struct bpos *r)
+{
+	int cmp = __bch2_bkey_cmp_left_packed(b, l, r);
+
+	return cmp;
+}
+
+/*
+ * Same as bkey_cmp_left_packed(), but takes the position by value: we prefer to
+ * pass bpos by reference, but often enough it's terribly convenient to pass it
+ * by value (a C++ style const reference would be nice here).
+ */
+__pure __flatten
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
+					     const struct bkey_packed *l,
+					     struct bpos r)
+{
+	const struct bpos *r_pos = &r;
+
+	return bkey_cmp_left_packed(b, l, r_pos);
+}
+
+/*
+ * Compares two keys, either of which may be a struct bkey * or a
+ * struct bkey_packed *.
+ *
+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
+ * skip dispatching on k->format. When both are struct bkey_packed * the format
+ * fields still have to be checked at runtime, which __bch2_bkey_cmp_packed()
+ * does.
+ */
+#define bkey_cmp_packed(_b, _l, _r)					\
+({									\
+	int _cmp;							\
+									\
+	switch (bkey_lr_packed_typecheck(_l, _r)) {			\
+	case BKEY_PACKED_NONE:						\
+		_cmp = bkey_cmp(((struct bkey *) (_l))->p,		\
+				((struct bkey *) (_r))->p);		\
+		break;							\
+	case BKEY_PACKED_LEFT:						\
+		_cmp = bkey_cmp_left_packed((_b),			\
+				  (struct bkey_packed *) (_l),		\
+				  &((struct bkey *) (_r))->p);		\
+		break;							\
+	case BKEY_PACKED_RIGHT:						\
+		_cmp = -bkey_cmp_left_packed((_b),			\
+				  (struct bkey_packed *) (_r),		\
+				  &((struct bkey *) (_l))->p);		\
+		break;							\
+	case BKEY_PACKED_BOTH:						\
+		_cmp = __bch2_bkey_cmp_packed((void *) (_l),		\
+					 (void *) (_r), (_b));		\
+		break;							\
+	}								\
+	_cmp;								\
+})
+
+#if 1
+/*
+ * Three way comparison of two positions, ordered by inode, then offset, then
+ * snapshot. Returns -1, 0 or 1.
+ */
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{
+	if (l.inode < r.inode)
+		return -1;
+	if (l.inode > r.inode)
+		return 1;
+
+	if (l.offset < r.offset)
+		return -1;
+	if (l.offset > r.offset)
+		return 1;
+
+	if (l.snapshot < r.snapshot)
+		return -1;
+	if (l.snapshot > r.snapshot)
+		return 1;
+
+	return 0;
+}
+#else
+int bkey_cmp(struct bpos l, struct bpos r);
+#endif
+
+/* Returns the smaller of two positions (@r if they compare equal) */
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{
+	if (bkey_cmp(l, r) < 0)
+		return l;
+
+	return r;
+}
+
+void bch2_bpos_swab(struct bpos *);
+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+
+/* Three way comparison of two versions: high part first, then low part */
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
+{
+	int hi_cmp = cmp_int(l.hi, r.hi);
+
+	if (hi_cmp)
+		return hi_cmp;
+
+	return cmp_int(l.lo, r.lo);
+}
+
+/* The all-zeroes and all-ones versions */
+#define ZERO_VERSION	((struct bversion) { .hi = 0, .lo = 0 })
+#define MAX_VERSION	((struct bversion) { .hi = ~0, .lo = ~0ULL })
+
+/* Returns nonzero if @v is equal to ZERO_VERSION */
+static __always_inline int bversion_zero(struct bversion v)
+{
+	return bversion_cmp(v, ZERO_VERSION) == 0;
+}
+
+/*
+ * True if @_k is in a packed format, i.e. its format field is anything other
+ * than KEY_FORMAT_CURRENT. Debug builds additionally assert that the format
+ * field isn't greater than KEY_FORMAT_CURRENT.
+ */
+#ifdef CONFIG_BCACHEFS_DEBUG
+/* statement expressions confusing unlikely()? */
+#define bkey_packed(_k)							\
+	({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT);			\
+	 (_k)->format != KEY_FORMAT_CURRENT; })
+#else
+#define bkey_packed(_k)		((_k)->format != KEY_FORMAT_CURRENT)
+#endif
+
+/*
+ * Views an unpacked key as a struct bkey_packed. This direction is always
+ * safe; going from packed to unpacked is not, so packed_to_bkey() checks first.
+ */
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
+{
+	struct bkey_packed *packed = (struct bkey_packed *) k;
+
+	return packed;
+}
+
+/* Const version of bkey_to_packed() */
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
+{
+	const struct bkey_packed *packed = (const struct bkey_packed *) k;
+
+	return packed;
+}
+
+/* Returns @k as an unpacked key, or NULL if @k is actually packed */
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
+{
+	if (bkey_packed(k))
+		return NULL;
+
+	return (struct bkey_i *) k;
+}
+
+/* Returns @k as a const unpacked key header, or NULL if @k is actually packed */
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
+{
+	if (bkey_packed(k))
+		return NULL;
+
+	return (const struct bkey *) k;
+}
+
+/*
+ * Number of bits @format uses for the position part of a key: the inode,
+ * offset and snapshot fields.
+ */
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
+{
+	unsigned key_bits = format->bits_per_field[BKEY_FIELD_INODE];
+
+	key_bits += format->bits_per_field[BKEY_FIELD_OFFSET];
+	key_bits += format->bits_per_field[BKEY_FIELD_SNAPSHOT];
+
+	return key_bits;
+}
+
+/*
+ * Returns the position following @p, treating (inode, offset) as one big
+ * integer: offset is incremented, carrying into inode if it wraps around. The
+ * rest of @p is copied unchanged.
+ *
+ * BUG()s if (inode, offset) is already at its maximum.
+ */
+static inline struct bpos bkey_successor(struct bpos p)
+{
+	struct bpos ret = p;
+
+	/*
+	 * The carry into inode is deliberately not done inside BUG_ON(): an
+	 * assertion's argument should be free of side effects, so that the
+	 * result doesn't depend on how BUG_ON() is defined in a given build.
+	 */
+	if (!++ret.offset &&
+	    !++ret.inode)
+		BUG();
+
+	return ret;
+}
+
+/*
+ * Returns the position preceding @p, treating (inode, offset) as one big
+ * integer: offset is decremented, borrowing from inode if it was 0. The rest of
+ * @p is copied unchanged.
+ *
+ * BUG()s if both inode and offset are already 0.
+ */
+static inline struct bpos bkey_predecessor(struct bpos p)
+{
+	struct bpos ret = p;
+
+	/*
+	 * The borrow from inode is deliberately not done inside BUG_ON(): an
+	 * assertion's argument should be free of side effects, so that the
+	 * result doesn't depend on how BUG_ON() is defined in a given build.
+	 */
+	if (!ret.offset-- &&
+	    !ret.inode--)
+		BUG();
+
+	return ret;
+}
+
+/* Offset at which @k starts: its position's offset minus its size */
+static inline u64 bkey_start_offset(const struct bkey *k)
+{
+	u64 end_offset = k->p.offset;
+
+	return end_offset - k->size;
+}
+
+/*
+ * Position at which @k starts: same inode and snapshot as @k's position, with
+ * the offset moved back by @k's size.
+ */
+static inline struct bpos bkey_start_pos(const struct bkey *k)
+{
+	struct bpos start = {
+		.inode		= k->p.inode,
+		.offset		= bkey_start_offset(k),
+		.snapshot	= k->p.snapshot,
+	};
+
+	return start;
+}
+
+/* Packed helpers */
+
+/*
+ * Size in u64s of the key part of @k: @format's key size if @k is packed, the
+ * unpacked key size otherwise.
+ */
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
+				      const struct bkey_packed *k)
+{
+	unsigned key_u64s;
+
+	if (bkey_packed(k))
+		key_u64s = format->key_u64s;
+	else
+		key_u64s = BKEY_U64s;
+
+	EBUG_ON(k->u64s < key_u64s);
+	return key_u64s;
+}
+
+/* Size in bytes of the key part of @k */
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
+				       const struct bkey_packed *k)
+{
+	unsigned key_u64s = bkeyp_key_u64s(format, k);
+
+	return key_u64s * sizeof(u64);
+}
+
+/* Size in u64s of @k's value: its total size minus the key part */
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
+				      const struct bkey_packed *k)
+{
+	unsigned key_u64s = bkeyp_key_u64s(format, k);
+
+	return k->u64s - key_u64s;
+}
+
+/* Size in bytes of @k's value */
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
+				     const struct bkey_packed *k)
+{
+	size_t val_u64s = bkeyp_val_u64s(format, k);
+
+	return val_u64s * sizeof(u64);
+}
+
+/* Sets @k's total size for a value of @val_u64s u64s */
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+				      struct bkey_packed *k, unsigned val_u64s)
+{
+	unsigned key_u64s = bkeyp_key_u64s(format, k);
+
+	k->u64s = key_u64s + val_u64s;
+}
+
+/* Pointer to the value of (possibly packed) key @_k: it follows the key part */
+#define bkeyp_val(_format, _k)						\
+	 ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+
+extern const struct bkey_format bch2_bkey_format_current;
+
+bool bch2_bkey_transform(const struct bkey_format *,
+			 struct bkey_packed *,
+			 const struct bkey_format *,
+			 const struct bkey_packed *);
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
+				   const struct bkey_packed *);
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
+			      const struct bkey_packed *);
+#endif
+
+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
+		   const struct bkey_format *);
+
+/* Result of bch2_bkey_pack_pos_lossy() */
+enum bkey_pack_pos_ret {
+	/* the packed position is equivalent to the original */
+	BKEY_PACK_POS_EXACT,
+	/* the packed position compares less than the original */
+	BKEY_PACK_POS_SMALLER,
+	/* no position <= the original could be packed */
+	BKEY_PACK_POS_FAIL,
+};
+
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
+					   const struct btree *);
+
+/*
+ * Packs position @in into @out in @b's format. Returns true only if the packed
+ * position is exactly equivalent to @in.
+ */
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
+				 const struct btree *b)
+{
+	enum bkey_pack_pos_ret ret = bch2_bkey_pack_pos_lossy(out, in, b);
+
+	return ret == BKEY_PACK_POS_EXACT;
+}
+
+void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
+		 const struct bkey_packed *);
+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
+	       const struct bkey_format *);
+
+/*
+ * Largest value field @nr can hold in format @f: the field's offset plus the
+ * largest value that fits in its bits, or U64_MAX for a full 64 bit field.
+ */
+static inline u64 bkey_field_max(const struct bkey_format *f,
+				 enum bch_bkey_fields nr)
+{
+	if (f->bits_per_field[nr] < 64)
+		return le64_to_cpu(f->field_offset[nr]) +
+			~(~0ULL << f->bits_per_field[nr]);
+
+	return U64_MAX;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+int bch2_compile_bkey_format(const struct bkey_format *, void *);
+
+#else
+
+/* No compiled unpack on this architecture: emits no code and returns 0 */
+static inline int bch2_compile_bkey_format(const struct bkey_format *format,
+					  void *out) { return 0; }
+
+#endif
+
+/*
+ * Copies split key @src (key and value referenced separately) into the
+ * contiguous key @dst. @src's key must be unpacked.
+ */
+static inline void bkey_reassemble(struct bkey_i *dst,
+				   struct bkey_s_c src)
+{
+	const struct bkey *src_key = src.k;
+
+	BUG_ON(bkey_packed(src_key));
+
+	dst->k = *src_key;
+	memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+}
+
+/* Split keys whose key pointer is NULL */
+#define bkey_s_null		((struct bkey_s)   { .k = NULL })
+#define bkey_s_c_null		((struct bkey_s_c) { .k = NULL })
+
+/* Split keys carrying error code @err in the key pointer, via ERR_PTR() */
+#define bkey_s_err(err)		((struct bkey_s)   { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err)	((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+/* Wraps a bare key header in a split key with no value (v == NULL) */
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+	struct bkey_s ret = { .k = k, .v = NULL };
+
+	return ret;
+}
+
+/* Const version of bkey_to_s() */
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+	struct bkey_s_c ret = { .k = k, .v = NULL };
+
+	return ret;
+}
+
+/* Splits a contiguous key into pointers to its header and to its value */
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+	struct bkey_s ret = { .k = &k->k, .v = &k->v };
+
+	return ret;
+}
+
+/* Const version of bkey_i_to_s() */
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+	struct bkey_s_c ret = { .k = &k->k, .v = &k->v };
+
+	return ret;
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from e.g. a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ *
+ * For BKEY_VAL_ACCESSORS(name) this generates:
+ *  - struct bkey_i_name, struct bkey_s_name and struct bkey_s_c_name
+ *  - downcasts from the generic types, which assert (EBUG_ON) that the key's
+ *    type is KEY_TYPE_name: bkey_i_to_name(), bkey_i_to_name_c(),
+ *    bkey_s_to_name(), bkey_s_c_to_name(), bkey_i_to_s_name(),
+ *    bkey_i_to_s_c_name()
+ *  - name_i_to_s() and name_i_to_s_c(), from the typed inline key to the typed
+ *    split keys
+ *  - bkey_name_init(), which initializes a key in place with a zeroed value of
+ *    this type and the key size set to match
+ *
+ * It requires struct bch_name and KEY_TYPE_name to exist.
+ */
+#define BKEY_VAL_ACCESSORS(name)					\
+struct bkey_i_##name {							\
+	union {								\
+		struct bkey		k;				\
+		struct bkey_i		k_i;				\
+	};								\
+	struct bch_##name		v;				\
+};									\
+									\
+struct bkey_s_c_##name {						\
+	union {								\
+	struct {							\
+		const struct bkey	*k;				\
+		const struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+struct bkey_s_##name {							\
+	union {								\
+	struct {							\
+		struct bkey		*k;				\
+		struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c_##name		c;				\
+	struct bkey_s			s;				\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline const struct bkey_i_##name *				\
+bkey_i_to_##name##_c(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
+{									\
+	EBUG_ON(k.k->type != KEY_TYPE_##name);				\
+	return (struct bkey_s_##name) {					\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{									\
+	EBUG_ON(k.k->type != KEY_TYPE_##name);				\
+	return (struct bkey_s_c_##name) {				\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{									\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+name##_i_to_s_c(const struct bkey_i_##name *k)				\
+{									\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+bkey_i_to_s_c_##name(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{									\
+	struct bkey_i_##name *k =					\
+		container_of(&_k->k, struct bkey_i_##name, k);		\
+									\
+	bkey_init(&k->k);						\
+	memset(&k->v, 0, sizeof(k->v));					\
+	k->k.type = KEY_TYPE_##name;					\
+	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
+									\
+	return k;							\
+}
+
+/*
+ * Instantiate the typed key structs and conversion helpers, once per value
+ * type:
+ */
+BKEY_VAL_ACCESSORS(cookie);
+BKEY_VAL_ACCESSORS(btree_ptr);
+BKEY_VAL_ACCESSORS(extent);
+BKEY_VAL_ACCESSORS(reservation);
+BKEY_VAL_ACCESSORS(inode);
+BKEY_VAL_ACCESSORS(inode_generation);
+BKEY_VAL_ACCESSORS(dirent);
+BKEY_VAL_ACCESSORS(xattr);
+BKEY_VAL_ACCESSORS(alloc);
+BKEY_VAL_ACCESSORS(quota);
+BKEY_VAL_ACCESSORS(stripe);
+BKEY_VAL_ACCESSORS(reflink_p);
+BKEY_VAL_ACCESSORS(reflink_v);
+
+/*
+ * byte order helpers
+ *
+ * These let the packed key code walk a key's u64 words in order of
+ * significance regardless of host byte order:
+ *
+ *  high_word_offset(f)	index in ->_data of the most significant word
+ *  high_bit_offset	number of bits at the top of the most significant word
+ *			that are not key bits
+ *  nth_word(p, n)	the word n steps less significant than p
+ *  next_word(p)	one step towards less significant words
+ *  prev_word(p)	one step towards more significant words
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+
+/* little endian: the most significant word is the last word of the key */
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+	return f->key_u64s - 1;
+}
+
+#define high_bit_offset		0
+#define nth_word(p, n)		((p) - (n))
+
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+/* big endian: the most significant word comes first, and starts with the header */
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+	return 0;
+}
+
+#define high_bit_offset		KEY_PACKED_BITS_START
+#define nth_word(p, n)		((p) + (n))
+
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define high_word(f, k)		((k)->_data + high_word_offset(f))
+#define next_word(p)		nth_word(p, 1)
+#define prev_word(p)		nth_word(p, -1)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void);
+#else
+static inline void bch2_bkey_pack_test(void) {}
+#endif
+
+#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
new file mode 100644
index 000000000000..f01405dd502b
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "alloc_background.h"
+#include "dirent.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "quota.h"
+#include "reflink.h"
+#include "xattr.h"
+
+/*
+ * Key type names, generated from the BCH_BKEY_TYPES() x-macro list and
+ * terminated by a NULL entry. bch2_bkey_to_text() indexes this by k->type.
+ */
+const char * const bch2_bkey_types[] = {
+#define x(name, nr) #name,
+	BCH_BKEY_TYPES()
+#undef x
+	NULL
+};
+
+/* ->key_invalid hook that accepts every key: always returns NULL (valid). */
+static const char *deleted_key_invalid(const struct bch_fs *c,
+					struct bkey_s_c k)
+{
+	return NULL;
+}
+
+/* The deleted and discard key types both use the accept-everything hook: */
+#define bch2_bkey_ops_deleted (struct bkey_ops) {	\
+	.key_invalid = deleted_key_invalid,		\
+}
+
+#define bch2_bkey_ops_discard (struct bkey_ops) {	\
+	.key_invalid = deleted_key_invalid,		\
+}
+
+/*
+ * ->key_invalid hook for key types that must not carry a value: any nonzero
+ * value size is reported as invalid.
+ */
+static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	return bkey_val_bytes(k.k)
+		? "value size should be zero"
+		: NULL;
+}
+
+#define bch2_bkey_ops_error (struct bkey_ops) {		\
+	.key_invalid = empty_val_key_invalid,		\
+}
+
+/*
+ * ->key_invalid hook for cookie keys: the value must be exactly one
+ * struct bch_cookie.
+ */
+static const char *key_type_cookie_invalid(const struct bch_fs *c,
+					   struct bkey_s_c k)
+{
+	return bkey_val_bytes(k.k) == sizeof(struct bch_cookie)
+		? NULL
+		: "incorrect value size";
+}
+
+#define bch2_bkey_ops_cookie (struct bkey_ops) {	\
+	.key_invalid = key_type_cookie_invalid,		\
+}
+
+/* Whiteouts, like error keys, must have an empty value: */
+#define bch2_bkey_ops_whiteout (struct bkey_ops) {	\
+	.key_invalid = empty_val_key_invalid,		\
+}
+
+/*
+ * Method table indexed by KEY_TYPE_*: one entry per type listed in
+ * BCH_BKEY_TYPES(), each initialized from the matching bch2_bkey_ops_<name>
+ * compound literal.
+ */
+static const struct bkey_ops bch2_bkey_ops[] = {
+#define x(name, nr) [KEY_TYPE_##name]	= bch2_bkey_ops_##name,
+	BCH_BKEY_TYPES()
+#undef x
+};
+
+/*
+ * Run the type specific ->key_invalid hook on @k.
+ *
+ * Returns a string describing why the key is invalid (including an out of
+ * range type), or NULL if the hook accepts it.
+ */
+const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
+{
+	const struct bkey_ops *ops;
+
+	if (k.k->type >= KEY_TYPE_MAX)
+		return "invalid type";
+
+	ops = &bch2_bkey_ops[k.k->type];
+
+	return ops->key_invalid(c, k);
+}
+
+/*
+ * Checks on the key itself (not its value) that depend only on which kind of
+ * btree node the key lives in.
+ *
+ * Returns the reason for the first failed check, or NULL if all pass. The
+ * checks run in a fixed order, so the reported reason is deterministic.
+ */
+const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+				enum btree_node_type type)
+{
+	const struct bkey *key = k.k;
+	bool is_extents = btree_node_type_is_extents(type);
+	bool is_btree = type == BKEY_TYPE_BTREE;
+
+	if (key->u64s < BKEY_U64s)
+		return "u64s too small";
+
+	/* Only extent and btree pointer nodes have a bounded value size: */
+	if ((is_extents || is_btree) &&
+	    bkey_val_u64s(key) > BKEY_EXTENT_VAL_U64s_MAX)
+		return "value too big";
+
+	if (!is_extents) {
+		if (key->size)
+			return "nonzero size field";
+	} else {
+		/* An extent has zero size if and only if it is deleted: */
+		if ((key->size == 0) != bkey_deleted(key))
+			return "bad size field";
+
+		if (key->size > key->p.offset)
+			return "size greater than offset";
+	}
+
+	if (key->p.snapshot)
+		return "nonzero snapshot";
+
+	if (!is_btree && !bkey_cmp(key->p, POS_MAX))
+		return "POS_MAX key";
+
+	return NULL;
+}
+
+/*
+ * Full validity check: the generic key checks first, then the type specific
+ * value check. Returns the first failure reason, or NULL if the key is valid.
+ */
+const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+			      enum btree_node_type type)
+{
+	const char *reason = __bch2_bkey_invalid(c, k, type);
+
+	if (reason)
+		return reason;
+
+	return bch2_bkey_val_invalid(c, k);
+}
+
+/*
+ * Check that @k lies within the range of node @b: its start position must not
+ * be before b->data->min_key and its end position must not be past
+ * b->data->max_key. Returns a reason string, or NULL if the key is in range.
+ */
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
+{
+	const char *reason = NULL;
+
+	if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+		reason = "key before start of btree node";
+	else if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+		reason = "key past end of btree node";
+
+	return reason;
+}
+
+/*
+ * Debug check of a key found in node @b: validates the key and its position
+ * in the node, reports a filesystem bug with a textual dump of the key if
+ * either fails, and otherwise runs the type specific ->key_debugcheck hook if
+ * there is one.
+ */
+void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
+{
+	const struct bkey_ops *ops;
+	const char *invalid;
+
+	BUG_ON(!k.k->u64s);
+
+	invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
+		bch2_bkey_in_btree_node(b, k);
+	if (invalid) {
+		char buf[160];
+
+		bch2_bkey_val_to_text(&PBUF(buf), c, k);
+		bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
+		return;
+	}
+
+	/*
+	 * Only look up the ops entry once bch2_bkey_invalid() has confirmed
+	 * that the type is in range, so no out of bounds pointer is formed
+	 * for a corrupt key:
+	 */
+	ops = &bch2_bkey_ops[k.k->type];
+
+	if (ops->key_debugcheck)
+		ops->key_debugcheck(c, k);
+}
+
+/*
+ * Print a position to @out: the symbolic names POS_MIN/POS_MAX for those two
+ * positions, otherwise "inode:offset".
+ */
+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
+{
+	if (!bkey_cmp(pos, POS_MIN)) {
+		pr_buf(out, "POS_MIN");
+		return;
+	}
+
+	if (!bkey_cmp(pos, POS_MAX)) {
+		pr_buf(out, "POS_MAX");
+		return;
+	}
+
+	pr_buf(out, "%llu:%llu", pos.inode, pos.offset);
+}
+
+/*
+ * Print the key header fields (size in u64s, type, position, snapshot, length
+ * and version) to @out.
+ *
+ * This is used to describe keys that failed validation (see
+ * bch2_bkey_debugcheck()), so the type may be out of range: in that case the
+ * raw type number is printed instead of indexing past the end of
+ * bch2_bkey_types[].
+ */
+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
+{
+	pr_buf(out, "u64s %u type ", k->u64s);
+
+	if (k->type < KEY_TYPE_MAX)
+		pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+	else
+		pr_buf(out, "%u ", k->type);
+
+	bch2_bpos_to_text(out, k->p);
+
+	pr_buf(out, " snap %u len %u ver %llu",
+	       k->p.snapshot, k->size, k->version.lo);
+}
+
+/*
+ * Print the value of @k to @out using the type specific ->val_to_text hook,
+ * if the type has one.
+ *
+ * Like bch2_bkey_to_text() this can be reached for keys that failed
+ * validation, so an out of range type is reported as such rather than being
+ * used to index bch2_bkey_ops[].
+ */
+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
+		      struct bkey_s_c k)
+{
+	if (k.k->type < KEY_TYPE_MAX) {
+		const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+
+		if (likely(ops->val_to_text))
+			ops->val_to_text(out, c, k);
+	} else {
+		pr_buf(out, "(invalid type %u)", k.k->type);
+	}
+}
+
+/* Print "<key header>: <value>" for @k to @out. */
+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
+			   struct bkey_s_c k)
+{
+	bch2_bkey_to_text(out, k.k);
+	pr_buf(out, ": ");
+	bch2_val_to_text(out, c, k);
+}
+
+/*
+ * Byte swap a packed key: the key portion via bch2_bkey_swab_key(), then the
+ * value via the type specific ->swab hook if the type has one.
+ *
+ * The type is bounds checked before indexing bch2_bkey_ops[]: for a type we
+ * don't know about there is no hook to run, so only the key portion is
+ * swabbed.
+ * NOTE(review): presumably this can run on keys that have not been validated
+ * yet - confirm against the callers.
+ */
+void bch2_bkey_swab(const struct bkey_format *f,
+		    struct bkey_packed *k)
+{
+	const struct bkey_ops *ops = k->type < KEY_TYPE_MAX
+		? &bch2_bkey_ops[k->type]
+		: NULL;
+
+	bch2_bkey_swab_key(f, k);
+
+	if (ops && ops->swab)
+		ops->swab(f, k);
+}
+
+/*
+ * Run the type specific ->key_normalize hook on @k and return its result;
+ * types without the hook always return false.
+ */
+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
+{
+	const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+
+	if (!ops->key_normalize)
+		return false;
+
+	return ops->key_normalize(c, k);
+}
+
+/*
+ * Try to merge @r into @l using the type specific ->key_merge hook.
+ *
+ * The hook is only called when merging is enabled, the type supports merging,
+ * both keys have the same type and version, and @l ends exactly where @r
+ * starts. Whenever the hook reports anything other than BCH_MERGE_NOMERGE, @l
+ * also inherits @r's needs_whiteout flag.
+ */
+enum merge_result bch2_bkey_merge(struct bch_fs *c,
+				  struct bkey_s l, struct bkey_s r)
+{
+	const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
+	enum merge_result ret;
+
+	if (key_merging_disabled(c) || !ops->key_merge)
+		return BCH_MERGE_NOMERGE;
+
+	if (l.k->type != r.k->type)
+		return BCH_MERGE_NOMERGE;
+
+	if (bversion_cmp(l.k->version, r.k->version) ||
+	    bkey_cmp(l.k->p, bkey_start_pos(r.k)))
+		return BCH_MERGE_NOMERGE;
+
+	ret = ops->key_merge(c, l, r);
+	if (ret == BCH_MERGE_NOMERGE)
+		return ret;
+
+	l.k->needs_whiteout |= r.k->needs_whiteout;
+	return ret;
+}
+
+/*
+ * Translation table used by bch2_bkey_renumber(): for a given btree node
+ * type, maps an "old" key type number to the "new" KEY_TYPE_* value and back
+ * (presumably the old numbers are those of an earlier on disk format - TODO
+ * confirm).
+ *
+ * Lookups stop at the first match, so where two old numbers map to the same
+ * new type (extents 128 and 129) the new -> old direction yields the first
+ * one listed.
+ */
+static const struct old_bkey_type {
+	u8		btree_node_type;
+	u8		old;
+	u8		new;
+} bkey_renumber_table[] = {
+	{BKEY_TYPE_BTREE,	128, KEY_TYPE_btree_ptr		},
+	{BKEY_TYPE_EXTENTS,	128, KEY_TYPE_extent		},
+	{BKEY_TYPE_EXTENTS,	129, KEY_TYPE_extent		},
+	{BKEY_TYPE_EXTENTS,	130, KEY_TYPE_reservation	},
+	{BKEY_TYPE_INODES,	128, KEY_TYPE_inode		},
+	{BKEY_TYPE_INODES,	130, KEY_TYPE_inode_generation	},
+	{BKEY_TYPE_DIRENTS,	128, KEY_TYPE_dirent		},
+	{BKEY_TYPE_DIRENTS,	129, KEY_TYPE_whiteout		},
+	{BKEY_TYPE_XATTRS,	128, KEY_TYPE_xattr		},
+	{BKEY_TYPE_XATTRS,	129, KEY_TYPE_whiteout		},
+	{BKEY_TYPE_ALLOC,	128, KEY_TYPE_alloc		},
+	{BKEY_TYPE_QUOTAS,	128, KEY_TYPE_quota		},
+};
+
+/*
+ * Translate the type of @k between the old and new numbering, using the first
+ * matching entry of bkey_renumber_table[] for @btree_node_type.
+ *
+ * @write: nonzero translates new -> old, zero translates old -> new.
+ *
+ * Keys with no matching table entry are left untouched.
+ */
+void bch2_bkey_renumber(enum btree_node_type btree_node_type,
+			struct bkey_packed *k,
+			int write)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(bkey_renumber_table); idx++) {
+		const struct old_bkey_type *e = &bkey_renumber_table[idx];
+		u8 from	= write ? e->new : e->old;
+		u8 to	= write ? e->old : e->new;
+
+		if (btree_node_type != e->btree_node_type ||
+		    k->type != from)
+			continue;
+
+		k->type = to;
+		break;
+	}
+}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
new file mode 100644
index 000000000000..8568b65c1ed2
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_METHODS_H
+#define _BCACHEFS_BKEY_METHODS_H
+
+#include "bkey.h"
+
+struct bch_fs;
+struct btree;
+struct bkey;
+enum btree_node_type;
+
+extern const char * const bch2_bkey_types[];
+
+/* Result of bch2_bkey_merge() / the ->key_merge hook, merging r into l: */
+enum merge_result {
+	/* The keys were not merged: */
+	BCH_MERGE_NOMERGE,
+
+	/*
+	 * The keys were mergeable, but would have overflowed size - so instead
+	 * l was changed to the maximum size, and both keys were modified:
+	 */
+	BCH_MERGE_PARTIAL,
+	/* Reported when the merge completed (extent_sort_append() then drops r): */
+	BCH_MERGE_MERGE,
+};
+
+/*
+ * Per key type method table. ->key_invalid is called unconditionally by
+ * bch2_bkey_val_invalid(); the other hooks are optional and are NULL checked
+ * by their callers in bkey_methods.c.
+ */
+struct bkey_ops {
+	/* Returns reason for being invalid if invalid, else NULL: */
+	const char *	(*key_invalid)(const struct bch_fs *,
+				       struct bkey_s_c);
+	/* Extra debug mode checks, run on keys that passed validation: */
+	void		(*key_debugcheck)(struct bch_fs *, struct bkey_s_c);
+	/* Print the value of a key: */
+	void		(*val_to_text)(struct printbuf *, struct bch_fs *,
+				       struct bkey_s_c);
+	/* Type specific byte swapping, run after the key itself is swabbed: */
+	void		(*swab)(const struct bkey_format *, struct bkey_packed *);
+	/* Result is passed through by bch2_bkey_normalize(): */
+	bool		(*key_normalize)(struct bch_fs *, struct bkey_s);
+	/* Merge the second key into the first, see enum merge_result: */
+	enum merge_result (*key_merge)(struct bch_fs *,
+				       struct bkey_s, struct bkey_s);
+};
+
+const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+				enum btree_node_type);
+const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+			      enum btree_node_type);
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
+
+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+
+void bch2_bpos_to_text(struct printbuf *, struct bpos);
+void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
+void bch2_val_to_text(struct printbuf *, struct bch_fs *,
+		      struct bkey_s_c);
+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
+			   struct bkey_s_c);
+
+void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *);
+
+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
+
+enum merge_result bch2_bkey_merge(struct bch_fs *,
+				  struct bkey_s, struct bkey_s);
+
+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
+
+#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
new file mode 100644
index 000000000000..e32fad5a91ac
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.c
@@ -0,0 +1,630 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_sort.h"
+#include "bset.h"
+#include "extents.h"
+
+/* too many iterators, need to clean this up */
+
+/* btree_node_iter_large: */
+
+/*
+ * Comparison adapter for the heap helpers: the heap argument @h is ignored,
+ * and the btree node is taken from a variable named b in the expanding scope.
+ */
+#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r)
+
+/* True once the iterator has no sets left to return keys from. */
+static inline bool
+bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
+{
+	return iter->used == 0;
+}
+
+/*
+ * Return the key the first set in the iterator currently points at, without
+ * advancing; NULL if the iterator is exhausted.
+ */
+static inline struct bkey_packed *
+bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
+				    struct btree *b)
+{
+	if (bch2_btree_node_iter_large_end(iter))
+		return NULL;
+
+	return __btree_node_offset_to_key(b, iter->data->k);
+}
+
+/*
+ * Step the first set in iter->data[] past its current key. If that set is now
+ * exhausted it is removed with heap_del(), otherwise heap_sift_down() is run
+ * from the root (presumably to restore heap order - the heap helpers are
+ * defined elsewhere).
+ *
+ * btree_node_iter_cmp_heap() picks up @b from this function's scope.
+ */
+static void
+bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
+				   struct btree *b)
+{
+	/* offsets are in units of u64s, so adding the key's u64s skips it: */
+	iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
+
+	EBUG_ON(!iter->used);
+	EBUG_ON(iter->data->k > iter->data->end);
+
+	if (iter->data->k == iter->data->end)
+		heap_del(iter, 0, btree_node_iter_cmp_heap, NULL);
+	else
+		heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL);
+}
+
+/*
+ * Return the current key and advance the iterator past it; NULL (with no
+ * advance) if the iterator is exhausted.
+ */
+static inline struct bkey_packed *
+bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
+				    struct btree *b)
+{
+	struct bkey_packed *k = bch2_btree_node_iter_large_peek_all(iter, b);
+
+	if (!k)
+		return NULL;
+
+	bch2_btree_node_iter_large_advance(iter, b);
+	return k;
+}
+
+/*
+ * Add the range of keys [@k, @end) in node @b to the iterator, stored as a
+ * pair of node offsets. Empty ranges are ignored.
+ */
+void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
+				     struct btree *b,
+				     const struct bkey_packed *k,
+				     const struct bkey_packed *end)
+{
+	struct btree_node_iter_set n;
+
+	if (k == end)
+		return;
+
+	n = (struct btree_node_iter_set) {
+		__btree_node_key_to_offset(b, k),
+		__btree_node_key_to_offset(b, end)
+	};
+
+	__heap_add(iter, n, btree_node_iter_cmp_heap, NULL);
+}
+
+/*
+ * Advance set @i to its next key. If the set is then exhausted, it is
+ * replaced by the last set in iter->data[] and iter->used shrinks by one.
+ *
+ * This does not reorder iter->data[] itself: the callers in this file follow
+ * it with a heap_sift_down() from the affected slot.
+ */
+static void sort_key_next(struct btree_node_iter_large *iter,
+			  struct btree *b,
+			  struct btree_node_iter_set *i)
+{
+	i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+
+	if (i->k == i->end)
+		*i = iter->data[--iter->used];
+}
+
+/* regular sort_iters */
+
+/* Three-way comparison of two packed keys in the given btree node: */
+typedef int (*sort_cmp_fn)(struct btree *,
+			   struct bkey_packed *,
+			   struct bkey_packed *);
+
+/*
+ * Move the set at index @from towards the end of iter->data[] by swapping it
+ * with its successor, for as long as its current key compares greater than
+ * the successor's current key.
+ */
+static inline void __sort_iter_sift(struct sort_iter *iter,
+				    unsigned from,
+				    sort_cmp_fn cmp)
+{
+	unsigned pos = from;
+
+	while (pos + 1 < iter->used &&
+	       cmp(iter->b, iter->data[pos].k, iter->data[pos + 1].k) > 0) {
+		swap(iter->data[pos], iter->data[pos + 1]);
+		pos++;
+	}
+}
+
+/* Re-position the first set after its current key has changed. */
+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+
+	__sort_iter_sift(iter, 0, cmp);
+}
+
+/*
+ * Order the sets in iter->data[] by their current keys: each set, starting
+ * from the last, is sifted into place among the sets after it.
+ */
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+	unsigned i;
+
+	for (i = iter->used; i > 0; i--)
+		__sort_iter_sift(iter, i - 1, cmp);
+}
+
+/* Current key of the first set, or NULL if the iterator has no sets left. */
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+	if (!iter->used)
+		return NULL;
+
+	return iter->data->k;
+}
+
+/*
+ * Step the first set past its current key. An exhausted set is removed from
+ * iter->data[]; otherwise the set is sifted back into position according to
+ * @cmp.
+ */
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+	struct sort_iter_set *head = iter->data;
+
+	head->k = bkey_next(head->k);
+
+	BUG_ON(head->k > head->end);
+
+	if (head->k != head->end) {
+		sort_iter_sift(iter, cmp);
+		return;
+	}
+
+	array_remove_item(iter->data, iter->used, 0);
+}
+
+/*
+ * Return the current key and advance past it; NULL (with no advance) once the
+ * iterator is exhausted.
+ */
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+						 sort_cmp_fn cmp)
+{
+	struct bkey_packed *k = sort_iter_peek(iter);
+
+	if (!k)
+		return NULL;
+
+	sort_iter_advance(iter, cmp);
+	return k;
+}
+
+/*
+ * Three-way comparison of two iterator sets by their current keys, for the
+ * heap in bch2_key_sort_fix_overlapping(): the result of bkey_cmp_packed(),
+ * or if the keys compare equal, the difference of their node offsets - so of
+ * two equal keys the one at the lower offset sorts first.
+ *
+ * should_drop_next_key() relies on that tie break: it treats the key at the
+ * lower offset as the older one and drops it when an equal key follows.
+ *
+ * The heap argument @h is unused; the btree node is taken from a variable
+ * named b in the expanding scope.
+ */
+#define key_sort_cmp(h, l, r)						\
+({									\
+	bkey_cmp_packed(b,						\
+			__btree_node_offset_to_key(b, (l).k),		\
+			__btree_node_offset_to_key(b, (r).k))		\
+									\
+	?: (l).k - (r).k;						\
+})
+
+/*
+ * Decide whether the key the first set (iter->data[0]) points at should be
+ * left out of the output: whiteouts always are, and so is a key that compares
+ * equal to the next key in sort order.
+ *
+ * The next key is found among iter->data[1] and iter->data[2] (the children
+ * of the root, assuming the usual array heap layout): whichever of the two
+ * sorts first according to key_sort_cmp().
+ */
+static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
+					struct btree *b)
+{
+	struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
+	struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
+
+	if (bkey_whiteout(k))
+		return true;
+
+	if (iter->used < 2)
+		return false;
+
+	if (iter->used > 2 &&
+	    key_sort_cmp(iter, r[0], r[1]) >= 0)
+		r++;
+
+	/*
+	 * key_sort_cmp() ensures that when keys compare equal the older key
+	 * comes first; so if l->k compares equal to r->k then l->k is older and
+	 * should be dropped.
+	 */
+	return !bkey_cmp_packed(b,
+				__btree_node_offset_to_key(b, l->k),
+				__btree_node_offset_to_key(b, r->k));
+}
+
+/*
+ * Merge the key ranges in @iter (all within node @b) into @dst in sorted
+ * order, leaving out whiteouts and keys that are followed by an equal key
+ * (see should_drop_next_key()).
+ *
+ * Consumes @iter. Sets dst->u64s to the amount written and returns the key
+ * accounting for the keys that were copied.
+ */
+struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
+					struct btree *b,
+					struct btree_node_iter_large *iter)
+{
+	struct bkey_packed *out = dst->start;
+	struct btree_nr_keys nr;
+
+	memset(&nr, 0, sizeof(nr));
+
+	/* the iterator was built with a different comparison function: */
+	heap_resort(iter, key_sort_cmp, NULL);
+
+	while (!bch2_btree_node_iter_large_end(iter)) {
+		if (!should_drop_next_key(iter, b)) {
+			struct bkey_packed *k =
+				__btree_node_offset_to_key(b, iter->data->k);
+
+			bkey_copy(out, k);
+			btree_keys_account_key_add(&nr, 0, out);
+			out = bkey_next(out);
+		}
+
+		/* step past the key just handled, then fix up the heap: */
+		sort_key_next(iter, b, iter->data);
+		heap_sift_down(iter, 0, key_sort_cmp, NULL);
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+/*
+ * Three-way comparison of two iterator sets by the start position of their
+ * current extents.
+ *
+ * If keys compare equal, compare by pointer order:
+ *
+ * Necessary for sort_fix_overlapping() - if there are multiple keys that
+ * compare equal in different sets, we have to process them newest to oldest.
+ *
+ * (On a tie the key at the higher node offset sorts first.) The heap argument
+ * @h is unused; the btree node is taken from a variable named b in the
+ * expanding scope.
+ */
+#define extent_sort_cmp(h, l, r)					\
+({									\
+	struct bkey _ul = bkey_unpack_key(b,				\
+				__btree_node_offset_to_key(b, (l).k));	\
+	struct bkey _ur = bkey_unpack_key(b,				\
+				__btree_node_offset_to_key(b, (r).k));	\
+									\
+	bkey_cmp(bkey_start_pos(&_ul),					\
+		 bkey_start_pos(&_ur)) ?: (r).k - (l).k;		\
+})
+
+/*
+ * Sift the set at heap index @i down, ordering by extent_sort_cmp() (which
+ * uses @b implicitly).
+ */
+static inline void extent_sort_sift(struct btree_node_iter_large *iter,
+				    struct btree *b, size_t i)
+{
+	heap_sift_down(iter, i, extent_sort_cmp, NULL);
+}
+
+/*
+ * Advance set @i to its next key (dropping the set if exhausted, see
+ * sort_key_next()), then sift down from @i's slot to fix up the heap.
+ */
+static inline void extent_sort_next(struct btree_node_iter_large *iter,
+				    struct btree *b,
+				    struct btree_node_iter_set *i)
+{
+	sort_key_next(iter, b, i);
+	heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL);
+}
+
+/*
+ * Finish the key at *@prev and move *@prev on to the next output slot.
+ *
+ * On the first call (*@prev == NULL) there is nothing to finish and *@prev is
+ * just set to @start. Otherwise the key at *@prev is packed in place with
+ * format @f, accounted in @nr, and *@prev advanced past it.
+ */
+static void extent_sort_advance_prev(struct bkey_format *f,
+				     struct btree_nr_keys *nr,
+				     struct bkey_packed *start,
+				     struct bkey_packed **prev)
+{
+	if (!*prev) {
+		*prev = start;
+		return;
+	}
+
+	bch2_bkey_pack(*prev, (void *) *prev, f);
+
+	btree_keys_account_key_add(nr, 0, *prev);
+	*prev = bkey_next(*prev);
+}
+
+/*
+ * Append @k to the output being built at *@prev, unless it is a whiteout or
+ * bch2_bkey_merge() reports that it was merged into the previous output key.
+ *
+ * The key at *@prev is kept unpacked so that later keys can be merged into
+ * it; it only gets packed when extent_sort_advance_prev() moves past it.
+ */
+static void extent_sort_append(struct bch_fs *c,
+			       struct bkey_format *f,
+			       struct btree_nr_keys *nr,
+			       struct bkey_packed *start,
+			       struct bkey_packed **prev,
+			       struct bkey_s k)
+{
+	if (bkey_whiteout(k.k))
+		return;
+
+	if (*prev) {
+		enum merge_result merged =
+			bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k);
+
+		if (merged == BCH_MERGE_MERGE)
+			return;
+	}
+
+	/* finish the previous key and claim the next slot for k: */
+	extent_sort_advance_prev(f, nr, start, prev);
+
+	bkey_reassemble((void *) *prev, k.s_c);
+}
+
+/*
+ * Merge the extent ranges in @iter (all within node @b) into @dst, resolving
+ * overlaps between extents from different sets: where two extents overlap,
+ * the one at the higher node offset (the newer one, see below) wins and the
+ * other is trimmed, split or skipped. Surviving extents go through
+ * extent_sort_append(), which also drops whiteouts and attempts to merge with
+ * the previously emitted key.
+ *
+ * Trimmed keys are stored back with extent_save(), i.e. the keys in @b are
+ * modified as a side effect (presumably in place - extent_save() is defined
+ * elsewhere).
+ *
+ * Consumes @iter. Sets dst->u64s to the amount written and returns the key
+ * accounting for the output.
+ */
+struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
+					struct bset *dst,
+					struct btree *b,
+					struct btree_node_iter_large *iter)
+{
+	struct bkey_format *f = &b->format;
+	struct btree_node_iter_set *_l = iter->data, *_r;
+	struct bkey_packed *prev = NULL, *lk, *rk;
+	struct bkey l_unpacked, r_unpacked;
+	struct bkey_s l, r;
+	struct btree_nr_keys nr;
+
+	memset(&nr, 0, sizeof(nr));
+
+	heap_resort(iter, extent_sort_cmp, NULL);
+
+	while (!bch2_btree_node_iter_large_end(iter)) {
+		/* l: the extent with the lowest start position */
+		lk = __btree_node_offset_to_key(b, _l->k);
+		l = __bkey_disassemble(b, lk, &l_unpacked);
+
+		if (iter->used == 1) {
+			extent_sort_append(c, f, &nr, dst->start, &prev, l);
+			extent_sort_next(iter, b, _l);
+			continue;
+		}
+
+		/* r: next in sort order, whichever of data[1]/data[2] sorts first */
+		_r = iter->data + 1;
+		if (iter->used > 2 &&
+		    extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
+			_r++;
+
+		rk = __btree_node_offset_to_key(b, _r->k);
+		r = __bkey_disassemble(b, rk, &r_unpacked);
+
+		/* If current key and next key don't overlap, just append */
+		if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+			extent_sort_append(c, f, &nr, dst->start, &prev, l);
+			extent_sort_next(iter, b, _l);
+			continue;
+		}
+
+		/* Skip 0 size keys */
+		if (!r.k->size) {
+			extent_sort_next(iter, b, _r);
+			continue;
+		}
+
+		/*
+		 * overlap: keep the newer key and trim the older key so they
+		 * don't overlap. comparing pointers tells us which one is
+		 * newer, since the bsets are appended one after the other.
+		 */
+
+		/* can't happen because of comparison func */
+		BUG_ON(_l->k < _r->k &&
+		       !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+		if (_l->k > _r->k) {
+			/* l wins, trim r */
+			if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+				/* r is entirely covered by l: skip it */
+				sort_key_next(iter, b, _r);
+			} else {
+				__bch2_cut_front(l.k->p, r);
+				extent_save(b, rk, r.k);
+			}
+
+			/* r's set has a new current key either way: */
+			extent_sort_sift(iter, b, _r - iter->data);
+		} else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+			BKEY_PADDED(k) tmp;
+
+			/*
+			 * r wins, but it overlaps in the middle of l - split l:
+			 */
+			bkey_reassemble(&tmp.k, l.s_c);
+			bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
+
+			/* the part of l past the end of r stays in the heap: */
+			__bch2_cut_front(r.k->p, l);
+			extent_save(b, lk, l.k);
+
+			extent_sort_sift(iter, b, 0);
+
+			/* the part of l before r is emitted now: */
+			extent_sort_append(c, f, &nr, dst->start,
+					   &prev, bkey_i_to_s(&tmp.k));
+		} else {
+			/* r wins and reaches at least to the end of l: trim l's tail */
+			bch2_cut_back(bkey_start_pos(r.k), l.k);
+			extent_save(b, lk, l.k);
+		}
+	}
+
+	/* finish the last key, leaving prev pointing at the end of the output: */
+	extent_sort_advance_prev(f, &nr, dst->start, &prev);
+
+	dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+	return nr;
+}
+
+/*
+ * Sort + repack in a new format:
+ *
+ * Walks @src in iterator order and appends each key to the end of @dst,
+ * converted to format @out_f. Keys that bch2_bkey_transform() cannot convert
+ * are stored unpacked instead. With @filter_whiteouts set, whiteouts are left
+ * out.
+ *
+ * Sets dst->u64s to the new total and returns the key accounting for the
+ * appended keys.
+ */
+struct btree_nr_keys
+bch2_sort_repack(struct bset *dst, struct btree *src,
+		 struct btree_node_iter *src_iter,
+		 struct bkey_format *out_f,
+		 bool filter_whiteouts)
+{
+	struct bkey_format *in_f = &src->format;
+	struct bkey_packed *out = vstruct_last(dst);
+	struct bkey_packed *in;
+	struct btree_nr_keys nr;
+
+	memset(&nr, 0, sizeof(nr));
+
+	for (;;) {
+		in = bch2_btree_node_iter_next_all(src_iter, src);
+		if (!in)
+			break;
+
+		if (filter_whiteouts && bkey_whiteout(in))
+			continue;
+
+		/* unpacked input keys are in the current in-memory format: */
+		if (!bch2_bkey_transform(out_f, out, bkey_packed(in)
+					? in_f : &bch2_bkey_format_current, in))
+			bch2_bkey_unpack(src, (void *) out, in);
+		else
+			out->format = KEY_FORMAT_LOCAL_BTREE;
+
+		btree_keys_account_key_add(&nr, 0, out);
+		out = bkey_next(out);
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+/*
+ * Sort, repack, and merge:
+ *
+ * Like bch2_sort_repack(), but every key is unpacked first and appended via
+ * extent_sort_append(), so adjacent keys may be merged before being packed in
+ * format @out_f. With @filter_whiteouts set, whiteouts are left out, and so
+ * are keys for which bch2_bkey_normalize() returns true.
+ *
+ * Sets dst->u64s to the new total and returns the key accounting for the
+ * appended keys.
+ */
+struct btree_nr_keys
+bch2_sort_repack_merge(struct bch_fs *c,
+		       struct bset *dst, struct btree *src,
+		       struct btree_node_iter *iter,
+		       struct bkey_format *out_f,
+		       bool filter_whiteouts)
+{
+	struct bkey_packed *prev = NULL, *k_packed;
+	struct bkey_s k;
+	struct btree_nr_keys nr;
+	BKEY_PADDED(k) tmp;
+
+	memset(&nr, 0, sizeof(nr));
+
+	while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
+		if (filter_whiteouts && bkey_whiteout(k_packed))
+			continue;
+
+		/* tmp must be big enough for the unpacked key: */
+		EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) >
+			BKEY_EXTENT_VAL_U64s_MAX);
+
+		bch2_bkey_unpack(src, &tmp.k, k_packed);
+		k = bkey_i_to_s(&tmp.k);
+
+		if (filter_whiteouts &&
+		    bch2_bkey_normalize(c, k))
+			continue;
+
+		extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k);
+	}
+
+	/* finish the last key, leaving prev pointing at the end of the output: */
+	extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
+
+	dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+	return nr;
+}
+
+/*
+ * Ordering for bch2_sort_keys(): by key; among equal keys whiteouts sort
+ * before non-whiteouts, and then keys without needs_whiteout sort before
+ * keys with it.
+ */
+static inline int sort_keys_cmp(struct btree *b,
+				struct bkey_packed *l,
+				struct bkey_packed *r)
+{
+	int ret = bkey_cmp_packed(b, l, r);
+
+	if (ret)
+		return ret;
+
+	ret = (int) bkey_whiteout(r) - (int) bkey_whiteout(l);
+	if (ret)
+		return ret;
+
+	return (int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+/*
+ * Merge the key ranges in @iter into @dst in sort_keys_cmp() order.
+ *
+ * Whiteouts are dropped if @filter_whiteouts is set or they don't have
+ * needs_whiteout set. A remaining whiteout that is immediately followed by a
+ * key at the same position is dropped too, after passing its needs_whiteout
+ * flag on to that key. Whiteouts that are kept are written without a value.
+ *
+ * Returns the number of u64s written to @dst.
+ */
+unsigned bch2_sort_keys(struct bkey_packed *dst,
+			struct sort_iter *iter,
+			bool filter_whiteouts)
+{
+	const struct bkey_format *f = &iter->b->format;
+	struct bkey_packed *in, *next, *out = dst;
+
+	sort_iter_sort(iter, sort_keys_cmp);
+
+	while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+		if (bkey_whiteout(in) &&
+		    (filter_whiteouts || !in->needs_whiteout))
+			continue;
+
+		if (bkey_whiteout(in) &&
+		    (next = sort_iter_peek(iter)) &&
+		    !bkey_cmp_packed(iter->b, in, next)) {
+			BUG_ON(in->needs_whiteout &&
+			       next->needs_whiteout);
+			/*
+			 * XXX racy, called with read lock from write path
+			 *
+			 * leads to spurious BUG_ON() in bkey_unpack_key() in
+			 * debug mode
+			 */
+			next->needs_whiteout |= in->needs_whiteout;
+			continue;
+		}
+
+		if (bkey_whiteout(in)) {
+			/* copy only the key portion and zero the value size: */
+			memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
+			set_bkeyp_val_u64s(f, out, 0);
+		} else {
+			bkey_copy(out, in);
+		}
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+/*
+ * Ordering for bch2_sort_extents(): by key; among equal keys non-deleted
+ * keys sort before deleted ones.
+ */
+static inline int sort_extents_cmp(struct btree *b,
+				   struct bkey_packed *l,
+				   struct bkey_packed *r)
+{
+	int ret = bkey_cmp_packed(b, l, r);
+
+	if (ret)
+		return ret;
+
+	return (int) bkey_deleted(l) - (int) bkey_deleted(r);
+}
+
+/*
+ * Merge the extent ranges in @iter into @dst in sort_extents_cmp() order.
+ *
+ * Deleted keys are always dropped; whiteouts are dropped if @filter_whiteouts
+ * is set or they don't have needs_whiteout set.
+ *
+ * Returns the number of u64s written to @dst.
+ */
+unsigned bch2_sort_extents(struct bkey_packed *dst,
+			   struct sort_iter *iter,
+			   bool filter_whiteouts)
+{
+	struct bkey_packed *out = dst;
+	struct bkey_packed *in;
+
+	sort_iter_sort(iter, sort_extents_cmp);
+
+	for (in = sort_iter_next(iter, sort_extents_cmp);
+	     in;
+	     in = sort_iter_next(iter, sort_extents_cmp)) {
+		if (bkey_deleted(in) ||
+		    (bkey_whiteout(in) &&
+		     (filter_whiteouts || !in->needs_whiteout)))
+			continue;
+
+		bkey_copy(out, in);
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+/* Ordering for bch2_sort_key_whiteouts(): plain key comparison, no tie break. */
+static inline int sort_key_whiteouts_cmp(struct btree *b,
+					 struct bkey_packed *l,
+					 struct bkey_packed *r)
+{
+	return bkey_cmp_packed(b, l, r);
+}
+
+/*
+ * Merge the key ranges in @iter into @dst in key order, copying every key
+ * unchanged. Returns the number of u64s written to @dst.
+ */
+unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst,
+				 struct sort_iter *iter)
+{
+	struct bkey_packed *out = dst;
+	struct bkey_packed *in;
+
+	sort_iter_sort(iter, sort_key_whiteouts_cmp);
+
+	for (in = sort_iter_next(iter, sort_key_whiteouts_cmp);
+	     in;
+	     in = sort_iter_next(iter, sort_key_whiteouts_cmp)) {
+		bkey_copy(out, in);
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+/*
+ * Ordering for bch2_sort_extent_whiteouts(): by the start position of the
+ * extents rather than by their key (end) position.
+ */
+static inline int sort_extent_whiteouts_cmp(struct btree *b,
+					    struct bkey_packed *l,
+					    struct bkey_packed *r)
+{
+	struct bkey l_unpacked = bkey_unpack_key(b, l);
+	struct bkey r_unpacked = bkey_unpack_key(b, r);
+	struct bpos l_start = bkey_start_pos(&l_unpacked);
+	struct bpos r_start = bkey_start_pos(&r_unpacked);
+
+	return bkey_cmp(l_start, r_start);
+}
+
+/*
+ * Merge the extent whiteouts in @iter into @dst, sorted by start position and
+ * with overlapping or touching whiteouts coalesced.
+ *
+ * Input keys are expected to be valueless KEY_TYPE_discard keys (checked in
+ * debug builds); deleted keys are skipped. l is the whiteout currently being
+ * accumulated and r the one just read; l is only written out once a whiteout
+ * that cannot be folded into it shows up, or at the end.
+ *
+ * Returns the number of u64s written to @dst.
+ */
+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst,
+				    struct sort_iter *iter)
+{
+	const struct bkey_format *f = &iter->b->format;
+	struct bkey_packed *in, *out = dst;
+	struct bkey_i l, r;
+	bool prev = false, l_packed = false;
+	u64 max_packed_size	= bkey_field_max(f, BKEY_FIELD_SIZE);
+	u64 max_packed_offset	= bkey_field_max(f, BKEY_FIELD_OFFSET);
+	u64 new_size;
+
+	max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
+
+	sort_iter_sort(iter, sort_extent_whiteouts_cmp);
+
+	while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+		if (bkey_deleted(in))
+			continue;
+
+		EBUG_ON(bkeyp_val_u64s(f, in));
+		EBUG_ON(in->type != KEY_TYPE_discard);
+
+		r.k = bkey_unpack_key(iter->b, in);
+
+		/* l reaches at least to the start of r: try to fold r into l */
+		if (prev &&
+		    bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
+			/* r is entirely covered by l: */
+			if (bkey_cmp(l.k.p, r.k.p) >= 0)
+				continue;
+
+			/*
+			 * Grow l towards the end of r. If l came from a packed
+			 * key, cap the size so that the size and offset still
+			 * fit the packed format:
+			 */
+			new_size = l_packed
+				? min(max_packed_size, max_packed_offset -
+				      bkey_start_offset(&l.k))
+				: KEY_SIZE_MAX;
+
+			new_size = min(new_size, r.k.p.offset -
+				       bkey_start_offset(&l.k));
+
+			BUG_ON(new_size < l.k.size);
+
+			bch2_key_resize(&l.k, new_size);
+
+			/* l now covers all of r: */
+			if (bkey_cmp(l.k.p, r.k.p) >= 0)
+				continue;
+
+			/* l hit a size limit: r keeps the part past l */
+			bch2_cut_front(l.k.p, &r);
+		}
+
+		/* emit the finished l, packed if possible: */
+		if (prev) {
+			if (!bch2_bkey_pack(out, &l, f)) {
+				BUG_ON(l_packed);
+				bkey_copy(out, &l);
+			}
+			out = bkey_next(out);
+		}
+
+		l = r;
+		prev = true;
+		l_packed = bkey_packed(in);
+	}
+
+	/* emit the last whiteout: */
+	if (prev) {
+		if (!bch2_bkey_pack(out, &l, f)) {
+			BUG_ON(l_packed);
+			bkey_copy(out, &l);
+		}
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
new file mode 100644
index 000000000000..397009181eae
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_SORT_H
+#define _BCACHEFS_BKEY_SORT_H
+
+/*
+ * Iterator over up to MAX_BSETS ranges of keys within one btree node; the
+ * code in bkey_sort.c maintains data[0..used) with the heap helpers.
+ */
+struct btree_node_iter_large {
+	/* number of entries of data[] in use */
+	u16		used;
+
+	struct btree_node_iter_set data[MAX_BSETS];
+};
+
+void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
+				     struct btree *,
+				     const struct bkey_packed *,
+				     const struct bkey_packed *);
+
+/*
+ * Iterator over up to MAX_BSETS + 1 ranges of packed keys, addressed by
+ * pointer. The sort routines in bkey_sort.c keep data[0..used) ordered by
+ * each range's current key.
+ */
+struct sort_iter {
+	/* btree node passed to the comparison functions */
+	struct btree	*b;
+	/* number of entries of data[] in use */
+	unsigned		used;
+
+	/* one range of keys: current position k, up to but excluding end */
+	struct sort_iter_set {
+		struct bkey_packed *k, *end;
+	} data[MAX_BSETS + 1];
+};
+
+/* Reset @iter to an empty iterator over keys in node @b. */
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
+{
+	memset(iter, 0, sizeof(*iter));
+	iter->b = b;
+}
+
+/*
+ * Add the range of keys [@k, @end) to @iter; empty ranges are ignored.
+ *
+ * The capacity check runs before the empty range check, so it triggers on a
+ * full iterator even if nothing would have been added.
+ */
+static inline void sort_iter_add(struct sort_iter *iter,
+				 struct bkey_packed *k,
+				 struct bkey_packed *end)
+{
+	BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
+
+	if (k == end)
+		return;
+
+	iter->data[iter->used].k	= k;
+	iter->data[iter->used].end	= end;
+	iter->used++;
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bset *, struct btree *,
+			      struct btree_node_iter_large *);
+struct btree_nr_keys
+bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *,
+				 struct btree *,
+				 struct btree_node_iter_large *);
+
+struct btree_nr_keys
+bch2_sort_repack(struct bset *, struct btree *,
+		 struct btree_node_iter *,
+		 struct bkey_format *, bool);
+struct btree_nr_keys
+bch2_sort_repack_merge(struct bch_fs *,
+		       struct bset *, struct btree *,
+		       struct btree_node_iter *,
+		       struct bkey_format *, bool);
+
+unsigned bch2_sort_keys(struct bkey_packed *,
+			struct sort_iter *, bool);
+unsigned bch2_sort_extents(struct bkey_packed *,
+			   struct sort_iter *, bool);
+
+unsigned bch2_sort_key_whiteouts(struct bkey_packed *,
+				 struct sort_iter *);
+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *,
+				    struct sort_iter *);
+
+#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
new file mode 100644
index 000000000000..ff9465750528
--- /dev/null
+++ b/fs/bcachefs/bset.c
@@ -0,0 +1,1876 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "bset.h"
+#include "eytzinger.h"
+#include "util.h"
+
+#include <asm/unaligned.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+/* hack.. */
+#include "alloc_types.h"
+#include <trace/events/bcachefs.h>
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
+						  struct btree *);
+
+/*
+ * Number of entries of iter->data in use: the array length minus the trailing
+ * run of entries for which __btree_node_iter_set_end() is true.
+ */
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+	unsigned nr;
+
+	for (nr = ARRAY_SIZE(iter->data); nr; nr--)
+		if (!__btree_node_iter_set_end(iter, nr - 1))
+			break;
+
+	return nr;
+}
+
+/*
+ * Return the bset_tree of @b that contains @k: the first bset whose end offset
+ * is not below @k's offset within the node. BUGs if no bset matches.
+ */
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+	unsigned k_offset = __btree_node_key_to_offset(b, k);
+	struct bset_tree *t;
+
+	for_each_bset(b, t) {
+		if (k_offset > t->end_offset)
+			continue;
+
+		EBUG_ON(k_offset < btree_bkey_first_offset(t));
+		return t;
+	}
+
+	BUG();
+}
+
+/*
+ * There are never duplicate live keys in the btree - but including keys that
+ * have been flagged as deleted (and will be cleaned up later) we _will_ see
+ * duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
+ */
+
+/*
+ * Print every key of bset @i (belonging to btree node @b) to the kernel log,
+ * tagged with @set, and flag adjacent keys that go backwards or that look
+ * like duplicates. Does nothing for an empty bset.
+ */
+void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
+{
+	struct bkey_packed *_k, *_n;
+	struct bkey k, n;
+	char buf[120];
+
+	if (!i->u64s)
+		return;
+
+	for (_k = i->start, k = bkey_unpack_key(b, _k);
+	     _k < vstruct_last(i);
+	     _k = _n, k = n) {
+		_n = bkey_next(_k);
+
+		bch2_bkey_to_text(&PBUF(buf), &k);
+		printk(KERN_ERR "block %u key %5u: %s\n", set,
+		       __btree_node_key_to_offset(b, _k), buf);
+
+		/*
+		 * Last key: stop here rather than 'continue', so that the loop
+		 * step doesn't copy @n, which hasn't been unpacked for this
+		 * iteration (and is uninitialized for a single-key bset).
+		 */
+		if (_n == vstruct_last(i))
+			break;
+
+		n = bkey_unpack_key(b, _n);
+
+		if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
+			printk(KERN_ERR "Key skipped backwards\n");
+			continue;
+		}
+
+		/*
+		 * Weird check for duplicate non extent keys: extents are
+		 * deleted iff they have 0 size, so if it has zero size and it's
+		 * not deleted these aren't extents:
+		 */
+		if (((!k.size && !bkey_deleted(&k)) ||
+		     (!n.size && !bkey_deleted(&n))) &&
+		    !bkey_deleted(&k) &&
+		    !bkey_cmp(n.p, k.p))
+			printk(KERN_ERR "Duplicate keys\n");
+	}
+}
+
+/* Dump every bset of @b via bch2_dump_bset(), holding the console lock. */
+void bch2_dump_btree_node(struct btree *b)
+{
+	struct bset_tree *t;
+
+	console_lock();
+
+	for_each_bset(b, t) {
+		unsigned idx = t - b->set;
+
+		bch2_dump_bset(b, bset(b, t), idx);
+	}
+
+	console_unlock();
+}
+
+/*
+ * Print the state of @iter: how many sets it holds out of @b's total, and for
+ * each set the bset index, key offset and text form of its current key.
+ */
+void bch2_dump_btree_node_iter(struct btree *b,
+			      struct btree_node_iter *iter)
+{
+	struct btree_node_iter_set *set;
+
+	printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+	       __btree_node_iter_used(iter), b->nsets);
+
+	btree_node_iter_for_each(iter, set) {
+		struct bkey_packed *packed = __btree_node_offset_to_key(b, set->k);
+		struct bset_tree *owner = bch2_bkey_to_bset(b, packed);
+		struct bkey unpacked = bkey_unpack_key(b, packed);
+		char text[100];
+
+		bch2_bkey_to_text(&PBUF(text), &unpacked);
+		printk(KERN_ERR "set %zu key %u: %s\n",
+		       owner - b->set, set->k, text);
+	}
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+/*
+ * Debug check: recount all non-whiteout keys in every bset of @b and BUG if
+ * the result differs bytewise from the cached counters in b->nr.
+ */
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+	struct btree_nr_keys counted = { 0 };
+	struct bset_tree *t;
+	struct bkey_packed *k;
+
+	for_each_bset(b, t) {
+		for (k = btree_bkey_first(b, t);
+		     k != btree_bkey_last(b, t);
+		     k = bkey_next(k)) {
+			if (bkey_whiteout(k))
+				continue;
+
+			btree_keys_account_key_add(&counted, t - b->set, k);
+		}
+	}
+
+	BUG_ON(memcmp(&counted, &b->nr, sizeof(counted)));
+}
+
+/*
+ * Debug check: working on a copy of @_iter, compare the current key with the
+ * key the iterator would yield after one advance. If the current key sorts
+ * after the next one, dump the node and both keys, print the original
+ * iterator's sets and panic. @_iter itself is not modified.
+ */
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+					    struct btree *b)
+{
+	struct btree_node_iter iter = *_iter;
+	const struct bkey_packed *k, *n;
+
+	k = bch2_btree_node_iter_peek_all(&iter, b);
+	__bch2_btree_node_iter_advance(&iter, b);
+	n = bch2_btree_node_iter_peek_all(&iter, b);
+
+	/*
+	 * NOTE(review): result is discarded - presumably called only for any
+	 * debug assertions inside bkey_unpack_key(); confirm.
+	 */
+	bkey_unpack_key(b, k);
+
+	if (n &&
+	    bkey_iter_cmp(b, k, n) > 0) {
+		struct btree_node_iter_set *set;
+		struct bkey ku = bkey_unpack_key(b, k);
+		struct bkey nu = bkey_unpack_key(b, n);
+		char buf1[80], buf2[80];
+
+		bch2_dump_btree_node(b);
+		bch2_bkey_to_text(&PBUF(buf1), &ku);
+		bch2_bkey_to_text(&PBUF(buf2), &nu);
+		printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
+		       buf1, buf2);
+		printk(KERN_ERR "iter was:");
+
+		/* one "[bset index, u64 offset within that bset]" pair per set */
+		btree_node_iter_for_each(_iter, set) {
+			/* note: this @k shadows the outer @k */
+			struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+			struct bset_tree *t = bch2_bkey_to_bset(b, k);
+			printk(" [%zi %zi]", t - b->set,
+			       k->_data - bset(b, t)->_data);
+		}
+		panic("\n");
+	}
+}
+
+/*
+ * Debug check of @iter's internal consistency against btree node @b; BUGs on
+ * the first violation. Returns immediately if the iterator is at its end.
+ */
+void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+				 struct btree *b)
+{
+	struct btree_node_iter_set *set, *s2;
+	struct bkey_packed *k, *p;
+	struct bset_tree *t;
+
+	if (bch2_btree_node_iter_end(iter))
+		return;
+
+	/* Verify no duplicates: */
+	btree_node_iter_for_each(iter, set)
+		btree_node_iter_for_each(iter, s2)
+			BUG_ON(set != s2 && set->end == s2->end);
+
+	/* Verify that set->end is correct: */
+	btree_node_iter_for_each(iter, set) {
+		for_each_bset(b, t)
+			if (set->end == t->end_offset)
+				goto found;
+		BUG();
+found:
+		/* @t is the bset matching this set; ->k must lie inside it */
+		BUG_ON(set->k < btree_bkey_first_offset(t) ||
+		       set->k >= t->end_offset);
+	}
+
+	/* Verify iterator is sorted: */
+	btree_node_iter_for_each(iter, set)
+		BUG_ON(set != iter->data &&
+		       btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+	k = bch2_btree_node_iter_peek_all(iter, b);
+
+	/*
+	 * For every bset other than the one the current key comes from, the
+	 * key just before the iterator's position in that bset must not sort
+	 * after the current key:
+	 */
+	for_each_bset(b, t) {
+		if (iter->data[0].end == t->end_offset)
+			continue;
+
+		p = bch2_bkey_prev_all(b, t,
+			bch2_btree_node_iter_bset_pos(iter, b, t));
+
+		BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+	}
+}
+
+/*
+ * Debug check that inserting @insert at @where - overwriting @clobber_u64s
+ * u64s of existing keys there - keeps the bset sorted: the key before @where
+ * must not compare greater than @insert, and @insert must not compare greater
+ * than the first key past the clobbered range. On violation the node is
+ * dumped and the kernel panics.
+ */
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+			    struct bkey_packed *insert, unsigned clobber_u64s)
+{
+	struct bset_tree *t = bch2_bkey_to_bset(b, where);
+	struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
+	struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+	/* the "#if 0" variants are the terse form of the verbose checks */
+#if 0
+	BUG_ON(prev &&
+	       bkey_iter_cmp(b, prev, insert) > 0);
+#else
+	if (prev &&
+	    bkey_iter_cmp(b, prev, insert) > 0) {
+		struct bkey k1 = bkey_unpack_key(b, prev);
+		struct bkey k2 = bkey_unpack_key(b, insert);
+		char buf1[100];
+		char buf2[100];
+
+		bch2_dump_btree_node(b);
+		bch2_bkey_to_text(&PBUF(buf1), &k1);
+		bch2_bkey_to_text(&PBUF(buf2), &k2);
+
+		panic("prev > insert:\n"
+		      "prev    key %5u %s\n"
+		      "insert  key %5u %s\n",
+		       __btree_node_key_to_offset(b, prev), buf1,
+		       __btree_node_key_to_offset(b, insert), buf2);
+	}
+#endif
+#if 0
+	BUG_ON(next != btree_bkey_last(b, t) &&
+	       bkey_iter_cmp(b, insert, next) > 0);
+#else
+	if (next != btree_bkey_last(b, t) &&
+	    bkey_iter_cmp(b, insert, next) > 0) {
+		struct bkey k1 = bkey_unpack_key(b, insert);
+		struct bkey k2 = bkey_unpack_key(b, next);
+		char buf1[100];
+		char buf2[100];
+
+		bch2_dump_btree_node(b);
+		bch2_bkey_to_text(&PBUF(buf1), &k1);
+		bch2_bkey_to_text(&PBUF(buf2), &k2);
+
+		panic("insert > next:\n"
+		      "insert  key %5u %s\n"
+		      "next    key %5u %s\n",
+		       __btree_node_key_to_offset(b, insert), buf1,
+		       __btree_node_key_to_offset(b, next), buf2);
+	}
+#endif
+}
+
+#else
+
+/* No-op stand-in used when CONFIG_BCACHEFS_DEBUG is not set. */
+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
+						   struct btree *b) {}
+
+#endif
+
+/* Auxiliary search trees */
+
+/*
+ * Special values stored in bkey_float->exponent by make_bfloat() when a
+ * bfloat could not be built. BFLOAT_FAILED is the smallest of them (it has
+ * the same value as BFLOAT_FAILED_OVERFLOW); make_bfloat() asserts that real
+ * shift values stay below it.
+ */
+#define BFLOAT_FAILED_UNPACKED	(U8_MAX - 0)
+#define BFLOAT_FAILED_PREV	(U8_MAX - 1)
+#define BFLOAT_FAILED_OVERFLOW	(U8_MAX - 2)
+#define BFLOAT_FAILED		(U8_MAX - 2)
+
+/* Number of longs needed to hold (1 << BKEY_EXPONENT_BITS) bits. */
+#define KEY_WORDS		BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+
+/*
+ * One node of the read-only auxiliary search tree.
+ *
+ * @exponent:	 bit shift from the start of the packed key to the mantissa
+ *		 bits (see make_bfloat()/bkey_mantissa()), or one of the
+ *		 BFLOAT_FAILED_* codes
+ * @key_offset:	 position of the node's key within its cacheline, in units
+ *		 of 8 bytes (see tree_to_bkey())
+ * @mantissa32:	 mantissa for nodes with index < BFLOAT_32BIT_NR
+ * @mantissa16:	 mantissa for all other nodes
+ */
+struct bkey_float {
+	u8		exponent;
+	u8		key_offset;
+	union {
+		u32	mantissa32;
+	struct {
+		u16	mantissa16;
+		u16	_pad;
+	};
+	};
+} __packed;
+
+/* Tree nodes with an index below this use the 32 bit mantissa: */
+#define BFLOAT_32BIT_NR		32U
+
+/*
+ * Byte offset of bkey_float number @idx from the start of the tree: the
+ * first BFLOAT_32BIT_NR entries take 6 bytes each, every later entry is 2
+ * bytes shorter.
+ */
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+	/* bytes saved by the short entries; negative while idx < 32 */
+	int saved = (idx - BFLOAT_32BIT_NR) << 1;
+
+	return idx * 6 - (saved < 0 ? 0 : saved);
+}
+
+/*
+ * Read-only auxiliary search tree: a variable-length run of bkey_floats.
+ * Entries are not uniformly sized, so they are located with
+ * bkey_float_get() rather than by indexing _d[] directly.
+ */
+struct ro_aux_tree {
+	struct bkey_float	_d[0];
+};
+
+/*
+ * One entry of the lookup table kept for a bset with a read-write aux tree:
+ * @offset is a key's offset within the btree node and @k is that key's
+ * unpacked position (see rw_aux_tree_set()).
+ */
+struct rw_aux_tree {
+	u16		offset;
+	struct bpos	k;
+};
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE		128
+
+/* Bytes of key space in a btree node: 2^page_order pages. */
+static inline size_t btree_keys_bytes(struct btree *b)
+{
+	size_t bytes = PAGE_SIZE << b->page_order;
+
+	return bytes;
+}
+
+/* Number of BSET_CACHELINE sized chunks in the node's key space. */
+static inline size_t btree_keys_cachelines(struct btree *b)
+{
+	size_t bytes = btree_keys_bytes(b);
+
+	return bytes / BSET_CACHELINE;
+}
+
+/* Size of the auxiliary data buffer: 8 bytes per cacheline of keys. */
+static inline size_t btree_aux_data_bytes(struct btree *b)
+{
+	size_t cachelines = btree_keys_cachelines(b);
+
+	return cachelines * 8;
+}
+
+/* Size of the auxiliary data buffer in u64s. */
+static inline size_t btree_aux_data_u64s(struct btree *b)
+{
+	size_t bytes = btree_aux_data_bytes(b);
+
+	return bytes / sizeof(u64);
+}
+
+/*
+ * Offset, in u64s from the start of the aux data buffer, one past the end of
+ * @t's auxiliary tree. @t must have an aux data offset assigned.
+ */
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{
+	size_t tree_bytes;
+
+	BUG_ON(t->aux_data_offset == U16_MAX);
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_NO_AUX_TREE:
+		tree_bytes = 0;
+		break;
+	case BSET_RO_AUX_TREE:
+		/* the bkey_floats, followed by one byte per node */
+		tree_bytes = bkey_float_byte_offset(t->size) +
+			sizeof(u8) * t->size;
+		break;
+	case BSET_RW_AUX_TREE:
+		tree_bytes = sizeof(struct rw_aux_tree) * t->size;
+		break;
+	default:
+		BUG();
+	}
+
+	return t->aux_data_offset + DIV_ROUND_UP(tree_bytes, 8);
+}
+
+/*
+ * Lowest offset (in u64s) at which @t's auxiliary tree may start: right
+ * after the previous bset's tree, or - for the first bset - after the first
+ * b->unpack_fn_len bytes of the buffer.
+ */
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+					const struct bset_tree *t)
+{
+	if (t != b->set)
+		return bset_aux_tree_buf_end(t - 1);
+
+	return DIV_ROUND_UP(b->unpack_fn_len, 8);
+}
+
+/*
+ * Start of @t's auxiliary tree inside b->aux_data; t->aux_data_offset is
+ * scaled by 8 (it is kept in u64s, see bset_aux_tree_buf_end()).
+ */
+static void *__aux_tree_base(const struct btree *b,
+			     const struct bset_tree *t)
+{
+	return b->aux_data + t->aux_data_offset * 8;
+}
+
+/* Typed accessor for @t's aux data; @t must have a read-only aux tree. */
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
+					    const struct bset_tree *t)
+{
+	struct ro_aux_tree *tree;
+
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+	tree = __aux_tree_base(b, t);
+	return tree;
+}
+
+/*
+ * Per-node byte array that follows the t->size bkey_floats of a read-only
+ * aux tree; entry j holds the u64s of the key preceding node j's key (see
+ * tree_to_prev_bkey()).
+ */
+static u8 *ro_aux_tree_prev(const struct btree *b,
+			    const struct bset_tree *t)
+{
+	u8 *base;
+
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+	base = __aux_tree_base(b, t);
+	return base + bkey_float_byte_offset(t->size);
+}
+
+/* Locate bkey_float number @idx inside read-only aux tree @b. */
+static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
+					 unsigned idx)
+{
+	u8 *bytes = (u8 *) b;
+
+	return (struct bkey_float *) (bytes + bkey_float_byte_offset(idx));
+}
+
+/* bkey_float number @idx of @t's read-only aux tree. */
+static struct bkey_float *bkey_float(const struct btree *b,
+				     const struct bset_tree *t,
+				     unsigned idx)
+{
+	struct ro_aux_tree *tree = ro_aux_tree_base(b, t);
+
+	return bkey_float_get(tree, idx);
+}
+
+/*
+ * Debug check (compiled out without CONFIG_BCACHEFS_DEBUG) of the aux data
+ * layout of @b: every bset that has an offset assigned must come after a bset
+ * that also has one, must not start before the end of the previous tree, and
+ * must lie entirely within the aux data buffer.
+ */
+static void bset_aux_tree_verify(struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bset_tree *t;
+
+	for_each_bset(b, t) {
+		/* U16_MAX: no aux data offset assigned to this bset */
+		if (t->aux_data_offset == U16_MAX)
+			continue;
+
+		BUG_ON(t != b->set &&
+		       t[-1].aux_data_offset == U16_MAX);
+
+		BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
+		BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
+		BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
+	}
+#endif
+}
+
+/* Memory allocation */
+
+/*
+ * Free the aux data buffer of @b and clear the pointer, so that a second call
+ * is harmless.
+ */
+void bch2_btree_keys_free(struct btree *b)
+{
+	vfree(b->aux_data);
+	b->aux_data = NULL;
+}
+
+/*
+ * Record @page_order in @b and allocate the aux data buffer sized for it.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation failed.
+ */
+int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
+{
+	/* must be set first: btree_aux_data_bytes() reads b->page_order */
+	b->page_order	= page_order;
+	b->aux_data	= vmalloc_exec(btree_aux_data_bytes(b), gfp);
+
+	return b->aux_data ? 0 : -ENOMEM;
+}
+
+/*
+ * Reset @b to hold no bsets: zero the set count and key counters, mark every
+ * bset_tree slot's data offset as unset (U16_MAX) and give the first slot no
+ * aux tree. In debug builds, also remember @expensive_debug_checks.
+ */
+void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+{
+	struct bset_tree *t;
+
+	b->nsets		= 0;
+	memset(&b->nr, 0, sizeof(b->nr));
+#ifdef CONFIG_BCACHEFS_DEBUG
+	b->expensive_debug_checks = expensive_debug_checks;
+#endif
+	for (t = b->set; t < b->set + MAX_BSETS; t++)
+		t->data_offset = U16_MAX;
+
+	bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+/* Binary tree stuff for auxiliary search trees */
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
+ * then bkey_float->key_offset gives us the offset within that cacheline, in
+ * units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
+ */
+
+/*
+ * Address of cacheline number @cacheline of bset @t. Cacheline 0 starts at
+ * the first key's address rounded down to L1_CACHE_BYTES; each cacheline is
+ * BSET_CACHELINE bytes.
+ */
+static inline void *bset_cacheline(const struct btree *b,
+				   const struct bset_tree *t,
+				   unsigned cacheline)
+{
+	unsigned long first = (unsigned long) btree_bkey_first(b, t);
+	void *base = (void *) round_down(first, L1_CACHE_BYTES);
+
+	return base + cacheline * BSET_CACHELINE;
+}
+
+/* Key located @offset units of 8 bytes into cacheline @cacheline of @t. */
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
+					     const struct bset_tree *t,
+					     unsigned cacheline,
+					     unsigned offset)
+{
+	void *line = bset_cacheline(b, t, cacheline);
+
+	return line + offset * 8;
+}
+
+/* Index of the cacheline of @t that @k starts in. */
+static unsigned bkey_to_cacheline(const struct btree *b,
+				  const struct bset_tree *t,
+				  const struct bkey_packed *k)
+{
+	void *first_line = bset_cacheline(b, t, 0);
+
+	return ((void *) k - first_line) / BSET_CACHELINE;
+}
+
+/* Signed distance, in u64s, from the start of @cacheline to @k. */
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
+					  const struct bset_tree *t,
+					  unsigned cacheline,
+					  const struct bkey_packed *k)
+{
+	u64 *line = (u64 *) bset_cacheline(b, t, cacheline);
+	u64 *key = (u64 *) k;
+
+	return key - line;
+}
+
+/*
+ * Distance in u64s from the start of @cacheline to @k, for storing in
+ * bkey_float->key_offset; asserts (debug builds) that it fits in a u8.
+ */
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
+					 const struct bset_tree *t,
+					 unsigned cacheline,
+					 const struct bkey_packed *k)
+{
+	size_t u64s = __bkey_to_cacheline_offset(b, t, cacheline, k);
+
+	EBUG_ON(u64s > U8_MAX);
+
+	return u64s;
+}
+
+/*
+ * Key that node @j of @t's read-only aux tree refers to: the node's in-order
+ * position selects the cacheline, its key_offset the position within it.
+ */
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
+					       const struct bset_tree *t,
+					       unsigned j)
+{
+	unsigned cacheline = __eytzinger1_to_inorder(j, t->size, t->extra);
+	unsigned offset = bkey_float(b, t, j)->key_offset;
+
+	return cacheline_to_bkey(b, t, cacheline, offset);
+}
+
+/*
+ * Key immediately before node @j's key, found by stepping back over the
+ * previous key's size as recorded in the tree's prev array.
+ */
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
+					     const struct bset_tree *t,
+					     unsigned j)
+{
+	struct bkey_packed *k = tree_to_bkey(b, t, j);
+	unsigned back = ro_aux_tree_prev(b, t)[j];
+
+	return (void *) (k->_data - back);
+}
+
+/* Typed accessor for @t's aux data; @t must have a read-write aux tree. */
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
+				       const struct bset_tree *t)
+{
+	struct rw_aux_tree *table;
+
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+	table = __aux_tree_base(b, t);
+	return table;
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, just a flat table of struct rw_aux_tree
+ * entries. This returns the key that entry @j of that table points at.
+ */
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
+					  struct bset_tree *t,
+					  unsigned j)
+{
+	unsigned key_offset = rw_aux_tree(b, t)[j].offset;
+
+	return __btree_node_offset_to_key(b, key_offset);
+}
+
+/*
+ * Point entry @j of @t's rw aux table at @k: record both @k's offset in the
+ * node and its unpacked position. @k must lie before the end of the bset.
+ */
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
+			    unsigned j, struct bkey_packed *k)
+{
+	struct rw_aux_tree *entry;
+
+	EBUG_ON(k >= btree_bkey_last(b, t));
+
+	entry = &rw_aux_tree(b, t)[j];
+
+	entry->offset	= __btree_node_key_to_offset(b, k);
+	entry->k	= bkey_unpack_pos(b, k);
+}
+
+/*
+ * Expensive debug check of @t's rw aux table: the first entry must point at
+ * the first key of the bset, entry offsets must be strictly increasing, every
+ * entry must point at a key that is reached by walking the bset, and each
+ * entry's cached position must match the key it points at (the first entry's
+ * position is not compared - the loop is entered past that check).
+ * Does nothing unless expensive checks are enabled and @t has an rw aux tree.
+ */
+static void bch2_bset_verify_rw_aux_tree(struct btree *b,
+					struct bset_tree *t)
+{
+	struct bkey_packed *k = btree_bkey_first(b, t);
+	unsigned j = 0;
+
+	if (!btree_keys_expensive_checks(b))
+		return;
+
+	BUG_ON(bset_has_ro_aux_tree(t));
+
+	if (!bset_has_rw_aux_tree(t))
+		return;
+
+	BUG_ON(t->size < 1);
+	BUG_ON(rw_aux_to_bkey(b, t, j) != k);
+
+	/* entry 0 was just checked: jump in at the "next entry" step */
+	goto start;
+	while (1) {
+		if (rw_aux_to_bkey(b, t, j) == k) {
+			BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+					bkey_unpack_pos(b, k)));
+start:
+			if (++j == t->size)
+				break;
+
+			BUG_ON(rw_aux_tree(b, t)[j].offset <=
+			       rw_aux_tree(b, t)[j - 1].offset);
+		}
+
+		k = bkey_next(k);
+		BUG_ON(k >= btree_bkey_last(b, t));
+	}
+}
+
+/*
+ * Returns the index of the first rw aux table entry whose offset is >=
+ * @offset (t->size if there is none).
+ *
+ * The starting guess is interpolated from where @offset falls within the
+ * bset, then corrected by stepping linearly in either direction.
+ */
+static unsigned rw_aux_tree_bsearch(struct btree *b,
+				    struct bset_tree *t,
+				    unsigned offset)
+{
+	unsigned first = btree_bkey_first_offset(t);
+	unsigned bset_offs = offset - first;
+	unsigned bset_u64s = t->end_offset - first;
+	unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
+	struct rw_aux_tree *table;
+
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+	EBUG_ON(!t->size);
+	EBUG_ON(idx > t->size);
+
+	table = rw_aux_tree(b, t);
+
+	for (; idx < t->size && table[idx].offset < offset; idx++)
+		;
+
+	for (; idx && table[idx - 1].offset >= offset; idx--)
+		;
+
+	EBUG_ON(idx < t->size &&
+		table[idx].offset < offset);
+	EBUG_ON(idx && table[idx - 1].offset >= offset);
+	EBUG_ON(idx + 1 < t->size &&
+		table[idx].offset ==
+		table[idx + 1].offset);
+
+	return idx;
+}
+
+/* Read the mantissa of @f; the node index @idx selects the 32 or 16 bit field. */
+static inline unsigned bfloat_mantissa(const struct bkey_float *f,
+				       unsigned idx)
+{
+	if (idx < BFLOAT_32BIT_NR)
+		return f->mantissa32;
+
+	return f->mantissa16;
+}
+
+/*
+ * Store @mantissa in @f; the node index @idx selects the 32 or 16 bit field
+ * (the value is truncated to the field's width).
+ */
+static inline void bfloat_mantissa_set(struct bkey_float *f,
+				       unsigned idx, unsigned mantissa)
+{
+	if (idx >= BFLOAT_32BIT_NR) {
+		f->mantissa16 = mantissa;
+		return;
+	}
+
+	f->mantissa32 = mantissa;
+}
+
+/*
+ * Extract from packed key @k the bits that bkey_float @f compares on:
+ * f->exponent is a bit offset from the start of k->_data, and the node index
+ * @idx selects a 32 or 16 bit wide result.
+ */
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
+				     const struct bkey_float *f,
+				     unsigned idx)
+{
+	u64 v;
+
+	EBUG_ON(!bkey_packed(k));
+
+	/* load 64 bits starting at the byte the exponent points into */
+	v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
+
+	/*
+	 * In little endian, we're shifting off low bits (and then the bits we
+	 * want are at the low end), in big endian we're shifting off high bits
+	 * (and then the bits we want are at the high end, so we shift them
+	 * back down):
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	v >>= f->exponent & 7;
+#else
+	v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
+#endif
+	return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
+}
+
+/*
+ * (Re)build node @j of @t's read-only aux tree from the key it refers to.
+ *
+ * The node's left and right bounds are taken from other nodes' keys in the
+ * tree, or - at the edges of the tree - from @min_key/@max_key. Those two are
+ * caches owned by the caller: u64s == 0 means "not filled in yet", in which
+ * case they are initialized here from b->data->min_key and t->max_key.
+ *
+ * On success f->exponent holds the bit shift and the mantissa field the key
+ * bits; on failure f->exponent is set to one of the BFLOAT_FAILED_* codes.
+ */
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+			unsigned j,
+			struct bkey_packed *min_key,
+			struct bkey_packed *max_key)
+{
+	struct bkey_float *f = bkey_float(b, t, j);
+	struct bkey_packed *m = tree_to_bkey(b, t, j);
+	struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
+	struct bkey_packed *l, *r;
+	unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
+	unsigned mantissa;
+	int shift, exponent, high_bit;
+
+	EBUG_ON(bkey_next(p) != m);
+
+	if (is_power_of_2(j)) {
+		l = min_key;
+
+		if (!l->u64s) {
+			/* fall back to an unpacked key if packing fails */
+			if (!bkey_pack_pos(l, b->data->min_key, b)) {
+				struct bkey_i tmp;
+
+				bkey_init(&tmp.k);
+				tmp.k.p = b->data->min_key;
+				bkey_copy(l, &tmp);
+			}
+		}
+	} else {
+		l = tree_to_prev_bkey(b, t, j >> ffs(j));
+
+		EBUG_ON(m < l);
+	}
+
+	if (is_power_of_2(j + 1)) {
+		r = max_key;
+
+		if (!r->u64s) {
+			/* fall back to an unpacked key if packing fails */
+			if (!bkey_pack_pos(r, t->max_key, b)) {
+				struct bkey_i tmp;
+
+				bkey_init(&tmp.k);
+				tmp.k.p = t->max_key;
+				bkey_copy(r, &tmp);
+			}
+		}
+	} else {
+		r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+		EBUG_ON(m > r);
+	}
+
+	/*
+	 * for failed bfloats, the lookup code falls back to comparing against
+	 * the original key.
+	 */
+
+	if (!bkey_packed(l) || !bkey_packed(r) ||
+	    !bkey_packed(p) || !bkey_packed(m) ||
+	    !b->nr_key_bits) {
+		f->exponent = BFLOAT_FAILED_UNPACKED;
+		return;
+	}
+
+	/*
+	 * The greatest differing bit of l and r is the first bit we must
+	 * include in the bfloat mantissa we're creating in order to do
+	 * comparisons - that bit always becomes the high bit of
+	 * bfloat->mantissa, and thus the exponent we're calculating here is
+	 * the position of what will become the low bit in bfloat->mantissa:
+	 *
+	 * Note that this may be negative - we may be running off the low end
+	 * of the key: we handle this later:
+	 */
+	high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
+		       min_t(unsigned, bits, b->nr_key_bits) - 1);
+	exponent = high_bit - (bits - 1);
+
+	/*
+	 * Then we calculate the actual shift value, from the start of the key
+	 * (k->_data), to get the key bits starting at exponent:
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
+
+	EBUG_ON(shift + bits > b->format.key_u64s * 64);
+#else
+	shift = high_bit_offset +
+		b->nr_key_bits -
+		exponent -
+		bits;
+
+	EBUG_ON(shift < KEY_PACKED_BITS_START);
+#endif
+	EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+
+	f->exponent = shift;
+	mantissa = bkey_mantissa(m, f, j);
+
+	/*
+	 * If we've got garbage bits, set them to all 1s - it's legal for the
+	 * bfloat to compare larger than the original key, but not smaller:
+	 */
+	if (exponent < 0)
+		mantissa |= ~(~0U << -exponent);
+
+	bfloat_mantissa_set(f, j, mantissa);
+
+	/*
+	 * The bfloat must be able to tell its key apart from the previous key -
+	 * if its key and the previous key don't differ in the required bits,
+	 * flag as failed - unless the keys are actually equal, in which case
+	 * we aren't required to return a specific one:
+	 */
+	if (exponent > 0 &&
+	    bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
+	    bkey_cmp_packed(b, p, m)) {
+		f->exponent = BFLOAT_FAILED_PREV;
+		return;
+	}
+
+	/*
+	 * f->mantissa must compare >= the original key - for transitivity with
+	 * the comparison in bset_search_tree. If we're dropping set bits,
+	 * increment it:
+	 */
+	if (exponent > (int) bch2_bkey_ffs(b, m)) {
+		if (j < BFLOAT_32BIT_NR
+		    ? f->mantissa32 == U32_MAX
+		    : f->mantissa16 == U16_MAX)
+			f->exponent = BFLOAT_FAILED_OVERFLOW;
+
+		/*
+		 * In the overflow case the increment below still runs (and
+		 * wraps), but the node has already been flagged as failed:
+		 */
+		if (j < BFLOAT_32BIT_NR)
+			f->mantissa32++;
+		else
+			f->mantissa16++;
+	}
+}
+
+/*
+ * Bytes of aux data buffer from the start of @t's tree to the end of the
+ * buffer - only valid for last bset:
+ */
+static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+	size_t tree_start;
+
+	bset_aux_tree_verify(b);
+
+	tree_start = t->aux_data_offset * sizeof(u64);
+
+	return btree_aux_data_bytes(b) - tree_start;
+}
+
+/*
+ * How many read-only tree nodes fit in the space available to @t: the first
+ * BFLOAT_32BIT_NR nodes cost 7 bytes each (6 byte bkey_float plus one prev
+ * byte), later ones 5 bytes each.
+ */
+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+	unsigned avail = __bset_tree_capacity(b, t);
+	unsigned wide_bytes = 7 * BFLOAT_32BIT_NR;
+
+	return avail < wide_bytes
+		? avail / 7
+		: BFLOAT_32BIT_NR + (avail - wide_bytes) / 5;
+}
+
+/* How many rw aux table entries fit in the space available to @t. */
+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+{
+	unsigned avail = __bset_tree_capacity(b, t);
+
+	return avail / sizeof(struct rw_aux_tree);
+}
+
+/*
+ * Build the rw aux table for @t from scratch. Entry 0 points at the first
+ * key (only its offset is filled in); after that, a new entry is added for
+ * each key lying more than L1_CACHE_BYTES past the key of the most recent
+ * entry, until the table is full or the bset is exhausted.
+ */
+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+{
+	struct bkey_packed *k = btree_bkey_first(b, t);
+
+	t->size = 1;
+	t->extra = BSET_RW_AUX_TREE_VAL;
+	rw_aux_tree(b, t)[0].offset = __btree_node_key_to_offset(b, k);
+
+	for (; k != btree_bkey_last(b, t); k = bkey_next(k)) {
+		struct bkey_packed *newest;
+
+		if (t->size == bset_rw_tree_capacity(b, t))
+			break;
+
+		newest = rw_aux_to_bkey(b, t, t->size - 1);
+
+		if ((void *) k - (void *) newest > L1_CACHE_BYTES)
+			rw_aux_tree_set(b, t, t->size++, k);
+	}
+}
+
+/*
+ * Build the read-only aux search tree for @t.
+ *
+ * The tree gets one node per cacheline of keys (limited by the space left in
+ * the aux data buffer). First every node is pointed at the first key that
+ * starts in its cacheline or later, and the size of the key preceding it is
+ * recorded; then the bkey_floats are computed with make_bfloat().
+ *
+ * If fewer than 2 nodes would result, @t is left with no aux tree.
+ */
+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+{
+	struct bkey_packed *prev, *k;
+	struct bkey_packed min_key, max_key;
+	unsigned j, cacheline;
+
+	/* signal to make_bfloat() that they're uninitialized: */
+	min_key.u64s = max_key.u64s = 0;
+
+	t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
+		      bset_ro_tree_capacity(b, t));
+retry:
+	if (t->size < 2) {
+		t->size = 0;
+		t->extra = BSET_NO_AUX_TREE_VAL;
+		return;
+	}
+
+	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+	/*
+	 * Every attempt has to rescan the bset from the start: the previous
+	 * attempt left @k at the end of the bset and @cacheline at the one
+	 * that had no key, so without resetting them each retry would fail
+	 * immediately and we'd always end up with no aux tree at all.
+	 */
+	prev		= NULL;
+	k		= btree_bkey_first(b, t);
+	cacheline	= 1;
+
+	/* First we figure out where the first key in each cacheline is */
+	eytzinger1_for_each(j, t->size) {
+		while (bkey_to_cacheline(b, t, k) < cacheline)
+			prev = k, k = bkey_next(k);
+
+		if (k >= btree_bkey_last(b, t)) {
+			/*
+			 * No key starts in this cacheline or any later one:
+			 * try again with a smaller tree.
+			 *
+			 * XXX: this path sucks
+			 */
+			t->size--;
+			goto retry;
+		}
+
+		/*
+		 * The prev array's location depends on t->size, which is
+		 * another reason everything is rewritten on retry:
+		 */
+		ro_aux_tree_prev(b, t)[j] = prev->u64s;
+		bkey_float(b, t, j)->key_offset =
+			bkey_to_cacheline_offset(b, t, cacheline++, k);
+
+		EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+		EBUG_ON(tree_to_bkey(b, t, j) != k);
+	}
+
+	/* walk to the last key of the bset to find the max key: */
+	while (bkey_next(k) != btree_bkey_last(b, t))
+		k = bkey_next(k);
+
+	t->max_key = bkey_unpack_pos(b, k);
+
+	/* Then we build the tree */
+	eytzinger1_for_each(j, t->size)
+		make_bfloat(b, t, j, &min_key, &max_key);
+}
+
+/*
+ * Reserve the start of @t's aux tree in the aux data buffer. None of the
+ * bsets before @t may have an rw aux tree. @t is reset to "no aux tree" and
+ * its aux data offset is placed after the previous bset's tree.
+ */
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+	struct bset_tree *earlier;
+	unsigned start;
+
+	for (earlier = b->set; earlier != t; earlier++)
+		BUG_ON(bset_has_rw_aux_tree(earlier));
+
+	bch2_bset_set_no_aux_tree(b, t);
+
+	start = bset_aux_tree_buf_start(b, t);
+
+	/* round up to next cacheline: */
+	t->aux_data_offset = round_up(start, SMP_CACHE_BYTES / sizeof(u64));
+
+	bset_aux_tree_verify(b);
+}
+
+/*
+ * Make sure @t has an aux tree of the requested kind: read-write if
+ * @writeable, read-only otherwise. Nothing happens if it already has that
+ * kind; otherwise the tree is (re)allocated and built, unless there is no
+ * space left in the aux data buffer, in which case @t is left without one.
+ */
+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
+			     bool writeable)
+{
+	bool already_built = writeable
+		? bset_has_rw_aux_tree(t)
+		: bset_has_ro_aux_tree(t);
+
+	if (already_built)
+		return;
+
+	bset_alloc_tree(b, t);
+
+	if (!__bset_tree_capacity(b, t))
+		return;
+
+	if (!writeable)
+		__build_ro_aux_tree(b, t);
+	else
+		__build_rw_aux_tree(b, t);
+
+	bset_aux_tree_verify(b);
+}
+
+/*
+ * Initialize @i as the first bset of @b, which must not have any bsets yet:
+ * the header is zeroed, given a random sequence number and tagged with the
+ * CPU's endianness, then registered as bset 0 of the node.
+ */
+void bch2_bset_init_first(struct btree *b, struct bset *i)
+{
+	struct bset_tree *t;
+
+	BUG_ON(b->nsets);
+
+	memset(i, 0, sizeof(struct bset));
+	get_random_bytes(&i->seq, sizeof(i->seq));
+	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+	t = &b->set[b->nsets];
+	b->nsets++;
+
+	set_btree_bset(b, t, i);
+}
+
+/*
+ * Start a new bset in @b at node entry @bne. @bne must lie inside the node,
+ * not before the end of the current last bset, and the node must have room
+ * for another bset. The new bset is zeroed, inherits the sequence number of
+ * the node's first bset and is tagged with the CPU's endianness.
+ */
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
+			 struct btree_node_entry *bne)
+{
+	struct bset *new_bset = &bne->keys;
+	struct bset_tree *t;
+
+	BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+	BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
+	BUG_ON(b->nsets >= MAX_BSETS);
+
+	memset(new_bset, 0, sizeof(struct bset));
+	new_bset->seq = btree_bset_first(b)->seq;
+	SET_BSET_BIG_ENDIAN(new_bset, CPU_BIG_ENDIAN);
+
+	t = &b->set[b->nsets];
+	b->nsets++;
+
+	set_btree_bset(b, t, new_bset);
+}
+
+/*
+ * find _some_ key in the same bset as @k that precedes @k - not necessarily the
+ * immediate predecessor:
+ *
+ * Returns NULL if @k is the first key of the bset. Which key is returned
+ * depends on the kind of aux tree @t has; callers walk forward from it (see
+ * bch2_bkey_prev_filter()).
+ */
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
+				       struct bkey_packed *k)
+{
+	struct bkey_packed *p;
+	unsigned offset;
+	int j;
+
+	EBUG_ON(k < btree_bkey_first(b, t) ||
+		k > btree_bkey_last(b, t));
+
+	if (k == btree_bkey_first(b, t))
+		return NULL;
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_NO_AUX_TREE:
+		/* nothing to help us: start from the first key */
+		p = btree_bkey_first(b, t);
+		break;
+	case BSET_RO_AUX_TREE:
+		j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
+
+		/*
+		 * Step back one cacheline at a time until the tree's key for
+		 * that cacheline is before @k; cacheline 0 has no tree node,
+		 * so it maps to the first key:
+		 */
+		do {
+			p = j ? tree_to_bkey(b, t,
+					__inorder_to_eytzinger1(j--,
+							t->size, t->extra))
+			      : btree_bkey_first(b, t);
+		} while (p >= k);
+		break;
+	case BSET_RW_AUX_TREE:
+		/* the table entry just before the first one at or past @k */
+		offset = __btree_node_key_to_offset(b, k);
+		j = rw_aux_tree_bsearch(b, t, offset);
+		p = j ? rw_aux_to_bkey(b, t, j - 1)
+		      : btree_bkey_first(b, t);
+		break;
+	}
+
+	return p;
+}
+
+/*
+ * Find the closest key before @k in bset @t whose type is >= @min_key_type.
+ * Returns NULL if there is no such key.
+ *
+ * Works backwards in chunks: __bkey_prev() yields some earlier key @p, the
+ * range [p, k) is scanned forward keeping the last match, and if nothing
+ * matched the search continues with the chunk before @p.
+ */
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
+					  struct bset_tree *t,
+					  struct bkey_packed *k,
+					  unsigned min_key_type)
+{
+	struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
+
+	while ((p = __bkey_prev(b, t, k)) && !ret) {
+		for (i = p; i != k; i = bkey_next(i))
+			if (i->type >= min_key_type)
+				ret = i;
+
+		k = p;
+	}
+
+	if (btree_keys_expensive_checks(b)) {
+		BUG_ON(ret >= orig_k);
+
+		/* no key between the result and @orig_k may match */
+		for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
+		     i != orig_k;
+		     i = bkey_next(i))
+			BUG_ON(i->type >= min_key_type);
+	}
+
+	return ret;
+}
+
+/* Insert */
+
+/*
+ * @k was modified in place: if the rw aux table of @t has an entry pointing
+ * exactly at @k, refresh that entry (and with it the cached position).
+ */
+static void rw_aux_tree_fix_invalidated_key(struct btree *b,
+					    struct bset_tree *t,
+					    struct bkey_packed *k)
+{
+	unsigned k_offset = __btree_node_key_to_offset(b, k);
+	unsigned idx = rw_aux_tree_bsearch(b, t, k_offset);
+	bool has_entry = idx < t->size &&
+		rw_aux_tree(b, t)[idx].offset == k_offset;
+
+	if (has_entry)
+		rw_aux_tree_set(b, t, idx, k);
+
+	bch2_bset_verify_rw_aux_tree(b, t);
+}
+
+/*
+ * @k was modified in place: rebuild every node of @t's read-only aux tree
+ * whose bkey_float was derived from @k. That covers nodes bounded by the
+ * bset's max key (if @k is the last key), the node that points at @k, the
+ * node for which @k is the preceding key, and the descendants of those that
+ * use @k as a boundary.
+ */
+static void ro_aux_tree_fix_invalidated_key(struct btree *b,
+					    struct bset_tree *t,
+					    struct bkey_packed *k)
+{
+	struct bkey_packed min_key, max_key;
+	unsigned inorder, j;
+
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+	/* signal to make_bfloat() that they're uninitialized: */
+	min_key.u64s = max_key.u64s = 0;
+
+	if (bkey_next(k) == btree_bkey_last(b, t)) {
+		t->max_key = bkey_unpack_pos(b, k);
+
+		/* the rightmost path: nodes that use max_key as right bound */
+		for (j = 1; j < t->size; j = j * 2 + 1)
+			make_bfloat(b, t, j, &min_key, &max_key);
+	}
+
+	inorder = bkey_to_cacheline(b, t, k);
+
+	if (inorder &&
+	    inorder < t->size) {
+		j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
+
+		if (k == tree_to_bkey(b, t, j)) {
+			/* Fix the node this key corresponds to */
+			make_bfloat(b, t, j, &min_key, &max_key);
+
+			/* Children for which this key is the right boundary */
+			for (j = eytzinger1_left_child(j);
+			     j < t->size;
+			     j = eytzinger1_right_child(j))
+				make_bfloat(b, t, j, &min_key, &max_key);
+		}
+	}
+
+	if (inorder + 1 < t->size) {
+		j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
+
+		/* @k is the key just before the next cacheline's node */
+		if (k == tree_to_prev_bkey(b, t, j)) {
+			make_bfloat(b, t, j, &min_key, &max_key);
+
+			/* Children for which this key is the left boundary */
+			for (j = eytzinger1_right_child(j);
+			     j < t->size;
+			     j = eytzinger1_left_child(j))
+				make_bfloat(b, t, j, &min_key, &max_key);
+		}
+	}
+}
+
+/**
+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
+ * modified, fix any auxiliary search tree by remaking all the nodes in the
+ * auxiliary search tree that @k corresponds to
+ *
+ * The bset containing @k is looked up first; nothing needs doing if that
+ * bset has no auxiliary tree.
+ */
+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
+{
+	struct bset_tree *t = bch2_bkey_to_bset(b, k);
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_RW_AUX_TREE:
+		rw_aux_tree_fix_invalidated_key(b, t, k);
+		return;
+	case BSET_RO_AUX_TREE:
+		ro_aux_tree_fix_invalidated_key(b, t, k);
+		return;
+	case BSET_NO_AUX_TREE:
+		return;
+	}
+}
+
+/*
+ * Update @t's rw aux table after @clobber_u64s u64s of keys at @_where were
+ * replaced by @new_u64s u64s (the keys themselves have already been moved).
+ *
+ * Entries that pointed into the overwritten range are dropped, entries past
+ * it are shifted by the size difference, and if that leaves a gap of more
+ * than a cacheline between neighbouring entries a new entry is inserted.
+ * Does nothing if @t has no rw aux tree; @t must not have a read-only one.
+ */
+static void bch2_bset_fix_lookup_table(struct btree *b,
+				       struct bset_tree *t,
+				       struct bkey_packed *_where,
+				       unsigned clobber_u64s,
+				       unsigned new_u64s)
+{
+	int shift = new_u64s - clobber_u64s;
+	unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+
+	EBUG_ON(bset_has_ro_aux_tree(t));
+
+	if (!bset_has_rw_aux_tree(t))
+		return;
+
+	/* returns first entry >= where */
+	l = rw_aux_tree_bsearch(b, t, where);
+
+	if (!l) /* never delete first entry */
+		l++;
+	else if (l < t->size &&
+		 where < t->end_offset &&
+		 rw_aux_tree(b, t)[l].offset == where)
+		rw_aux_tree_set(b, t, l++, _where);
+
+	/* l now > where */
+
+	/* find j: the first entry at or past the end of the clobbered range */
+	for (j = l;
+	     j < t->size &&
+	     rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
+	     j++)
+		;
+
+	/* also drop entry j if, once shifted, it would duplicate entry l - 1 */
+	if (j < t->size &&
+	    rw_aux_tree(b, t)[j].offset + shift ==
+	    rw_aux_tree(b, t)[l - 1].offset)
+		j++;
+
+	/* close the gap: entries [l, j) are removed */
+	memmove(&rw_aux_tree(b, t)[l],
+		&rw_aux_tree(b, t)[j],
+		(void *) &rw_aux_tree(b, t)[t->size] -
+		(void *) &rw_aux_tree(b, t)[j]);
+	t->size -= j - l;
+
+	for (j = l; j < t->size; j++)
+	       rw_aux_tree(b, t)[j].offset += shift;
+
+	EBUG_ON(l < t->size &&
+		rw_aux_tree(b, t)[l].offset ==
+		rw_aux_tree(b, t)[l - 1].offset);
+
+	/*
+	 * If there's room in the table and the distance from entry l - 1 to
+	 * the next entry (or the end of the bset) exceeds a cacheline, add an
+	 * entry for the first key at least a cacheline past entry l - 1:
+	 */
+	if (t->size < bset_rw_tree_capacity(b, t) &&
+	    (l < t->size
+	     ? rw_aux_tree(b, t)[l].offset
+	     : t->end_offset) -
+	    rw_aux_tree(b, t)[l - 1].offset >
+	    L1_CACHE_BYTES / sizeof(u64)) {
+		struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
+		struct bkey_packed *end = l < t->size
+			? rw_aux_to_bkey(b, t, l)
+			: btree_bkey_last(b, t);
+		struct bkey_packed *k = start;
+
+		while (1) {
+			k = bkey_next(k);
+			if (k == end)
+				break;
+
+			if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+				memmove(&rw_aux_tree(b, t)[l + 1],
+					&rw_aux_tree(b, t)[l],
+					(void *) &rw_aux_tree(b, t)[t->size] -
+					(void *) &rw_aux_tree(b, t)[l]);
+				t->size++;
+				rw_aux_tree_set(b, t, l, k);
+				break;
+			}
+		}
+	}
+
+	bch2_bset_verify_rw_aux_tree(b, t);
+	bset_aux_tree_verify(b);
+}
+
+void bch2_bset_insert(struct btree *b,
+		      struct btree_node_iter *iter,
+		      struct bkey_packed *where,
+		      struct bkey_i *insert,
+		      unsigned clobber_u64s)
+{
+	struct bkey_format *f = &b->format;
+	struct bset_tree *t = bset_tree_last(b);
+	struct bkey_packed packed, *src = bkey_to_packed(insert);
+	/* Write @insert at @where in the node's last bset, replacing the clobber_u64s u64s there; @iter is not used. */
+	bch2_bset_verify_rw_aux_tree(b, t);
+	bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
+
+	if (bch2_bkey_pack_key(&packed, &insert->k, f))
+		src = &packed;	/* nonzero return (presumably: packing succeeded): store the packed key */
+
+	if (!bkey_whiteout(&insert->k))
+		btree_keys_account_key_add(&b->nr, t - b->set, src);	/* whiteouts are not added to b->nr */
+
+	if (src->u64s != clobber_u64s) {
+		u64 *src_p = where->_data + clobber_u64s;	/* start of the data following the clobbered range */
+		u64 *dst_p = where->_data + src->u64s;	/* where that data must start once src is in place */
+
+		EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+			(int) clobber_u64s - src->u64s);
+
+		memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+		le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
+		set_btree_bset_end(b, t);
+	}
+	/* The key part is copied from src (possibly packed); the value always comes from insert->v. */
+	memcpy_u64s(where, src,
+		    bkeyp_key_u64s(f, src));
+	memcpy_u64s(bkeyp_val(f, where), &insert->v,
+		    bkeyp_val_u64s(f, src));
+
+	bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+
+	bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_bset_delete(struct btree *b,
+		      struct bkey_packed *where,
+		      unsigned clobber_u64s)
+{
+	struct bset_tree *t = bset_tree_last(b);
+	u64 *src_p = where->_data + clobber_u64s;	/* first u64 past the removed range */
+	u64 *dst_p = where->_data;
+	/* Remove clobber_u64s u64s at @where from the node's last bset and close the gap. */
+	bch2_bset_verify_rw_aux_tree(b, t);
+
+	EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+
+	memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+	le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
+	set_btree_bset_end(b, t);
+
+	bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);	/* new_u64s == 0: nothing was written in its place */
+}
+
+/* Lookup */
+
+__flatten
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
+				struct bset_tree *t,
+				struct bpos *search,
+				const struct bkey_packed *packed_search)
+{
+	/* Binary search of the rw lookup table; packed_search is unused here */
+	unsigned lo = 0;
+	unsigned hi = t->size;
+
+	while (hi != lo + 1) {
+		unsigned mid = (lo + hi) / 2;
+		bool before = bkey_cmp(rw_aux_tree(b, t)[mid].k, *search) < 0;
+
+		lo = before ? mid : lo;
+		hi = before ? hi : mid;
+	}
+	return rw_aux_to_bkey(b, t, lo);
+}
+
+noinline
+static int bset_search_tree_slowpath(const struct btree *b,
+				struct bset_tree *t, struct bpos *search,
+				const struct bkey_packed *packed_search,
+				unsigned n)
+{
+	const struct bkey_packed *node_key = tree_to_bkey(b, t, n); /* real key behind tree node n */
+	return bkey_cmp_p_or_unp(b, node_key, packed_search, search) < 0;
+}
+
+__flatten
+static struct bkey_packed *bset_search_tree(const struct btree *b,
+				struct bset_tree *t,
+				struct bpos *search,
+				const struct bkey_packed *packed_search)
+{
+	struct ro_aux_tree *base = ro_aux_tree_base(b, t);
+	struct bkey_float *f = bkey_float_get(base, 1);
+	void *p;
+	unsigned inorder, n = 1;
+	/* Descend the ro aux tree from node 1; each step moves to child 2n or 2n + 1 until n >= t->size. */
+	do {
+		if (likely(n << 4 < t->size)) {
+			p = bkey_float_get(base, n << 4);	/* prefetch the tree node four levels down */
+			prefetch(p);
+		} else if (n << 3 < t->size) {
+			inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
+			p = bset_cacheline(b, t, inorder);	/* near the bottom: prefetch key data instead */
+#ifdef CONFIG_X86_64
+			asm(".intel_syntax noprefix;"
+			    "prefetcht0 [%0 - 127 + 64 * 0];"
+			    "prefetcht0 [%0 - 127 + 64 * 1];"
+			    "prefetcht0 [%0 - 127 + 64 * 2];"
+			    "prefetcht0 [%0 - 127 + 64 * 3];"
+			    ".att_syntax prefix;"
+			    :
+			    : "r" (p + 127));
+#else
+			prefetch(p + L1_CACHE_BYTES * 0);
+			prefetch(p + L1_CACHE_BYTES * 1);
+			prefetch(p + L1_CACHE_BYTES * 2);
+			prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+		} else if (n >= t->size)
+			break;	/* n is already past the last tree node */
+
+		f = bkey_float_get(base, n);
+
+		if (packed_search &&
+		    likely(f->exponent < BFLOAT_FAILED))
+			n = n * 2 + (bfloat_mantissa(f, n) <
+				     bkey_mantissa(packed_search, f, n));	/* go right (+1) if node's mantissa < search's */
+		else
+			n = n * 2 + bset_search_tree_slowpath(b, t,
+						search, packed_search, n);	/* failed bfloat or no packed key: compare real keys */
+	} while (n < t->size);
+
+	inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
+
+	/*
+	 * n would have been the node we recursed to - the low bit tells us if
+	 * we recursed left or recursed right.
+	 */
+	if (n & 1) {
+		return cacheline_to_bkey(b, t, inorder, f->key_offset);
+	} else {
+		if (--inorder) {
+			n = eytzinger1_prev(n >> 1, t->size);
+			f = bkey_float_get(base, n);
+			return cacheline_to_bkey(b, t, inorder, f->key_offset);
+		} else
+			return btree_bkey_first(b, t);	/* went left of the first tree node: start of the bset */
+	}
+}
+
+/*
+ * Returns the first key greater than or equal to @search
+ */
+__always_inline __flatten
+static struct bkey_packed *bch2_bset_search(struct btree *b,
+				struct bset_tree *t,
+				struct bpos *search,
+				struct bkey_packed *packed_search,
+				const struct bkey_packed *lossy_packed_search)
+{
+	struct bkey_packed *m;
+
+	/*
+	 * First, we search for a cacheline, then lastly we do a linear search
+	 * within that cacheline.
+	 *
+	 * To search for the cacheline, there's three different possibilities:
+	 *  * The set is too small to have a search tree, so we just do a linear
+	 *    search over the whole set.
+	 *  * The set is the one we're currently inserting into; keeping a full
+	 *    auxiliary search tree up to date would be too expensive, so we
+	 *    use a much simpler lookup table to do a binary search -
+	 *    bset_search_write_set().
+	 *  * Or we use the auxiliary search tree we constructed earlier -
+	 *    bset_search_tree()
+	 */
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_NO_AUX_TREE:
+		m = btree_bkey_first(b, t);	/* no index: scan from the first key */
+		break;
+	case BSET_RW_AUX_TREE:
+		m = bset_search_write_set(b, t, search, lossy_packed_search);
+		break;
+	case BSET_RO_AUX_TREE:
+		/*
+		 * Each node in the auxiliary search tree covers a certain range
+		 * of bits, and keys above and below the set it covers might
+		 * differ outside those bits - so we have to special case the
+		 * start and end - handle that here:
+		 */
+
+		if (bkey_cmp(*search, t->max_key) > 0)
+			return btree_bkey_last(b, t);	/* search is beyond t->max_key */
+
+		m = bset_search_tree(b, t, search, lossy_packed_search);
+		break;
+	}
+	/* Linear scan forward from m: first against the lossy packed key, if the caller supplied one ... */
+	if (lossy_packed_search)
+		while (m != btree_bkey_last(b, t) &&
+		       bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
+					      m) > 0)
+			m = bkey_next(m);
+	/* ... then, when there is no exact packed key, against the unpacked position. */
+	if (!packed_search)
+		while (m != btree_bkey_last(b, t) &&
+		       bkey_iter_pos_cmp(b, search, m) > 0)
+			m = bkey_next(m);
+
+	if (btree_keys_expensive_checks(b)) {	/* debug: the key before m must not compare >= search */
+		struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
+
+		BUG_ON(prev &&
+		       bkey_iter_cmp_p_or_unp(b, search, packed_search,
+					      prev) <= 0);
+	}
+
+	return m;
+}
+
+/* Btree node iterator */
+
+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
+			      struct btree *b,
+			      const struct bkey_packed *k,
+			      const struct bkey_packed *end)
+{
+	struct btree_node_iter_set *slot;
+
+	if (k == end)
+		return;
+	/* Walk past the sets already in use; slot ends up on the first free one */
+	btree_node_iter_for_each(iter, slot)
+		;
+	BUG_ON(slot >= iter->data + ARRAY_SIZE(iter->data));
+	*slot = (struct btree_node_iter_set) {
+		__btree_node_key_to_offset(b, k),
+		__btree_node_key_to_offset(b, end)
+	};
+}
+
+void bch2_btree_node_iter_push(struct btree_node_iter *iter,
+			       struct btree *b,
+			       const struct bkey_packed *k,
+			       const struct bkey_packed *end)
+{
+	__bch2_btree_node_iter_push(iter, b, k, end);	/* append the range [k, end) unless it is empty */
+	bch2_btree_node_iter_sort(iter, b);	/* then re-sort the iterator's sets */
+}
+
+noinline __flatten __attribute__((cold))
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
+			      struct btree *b, struct bpos *search)
+{
+	struct bset_tree *t;
+
+	trace_bkey_pack_pos_fail(search);
+	/* Fallback init: search every bset with no packed key at all (both packed arguments NULL). */
+	for_each_bset(b, t)
+		__bch2_btree_node_iter_push(iter, b,
+			bch2_bset_search(b, t, search, NULL, NULL),
+			btree_bkey_last(b, t));
+
+	bch2_btree_node_iter_sort(iter, b);
+}
+
+/**
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * given position
+ *
+ * Main entry point to the lookup code for individual btree nodes:
+ *
+ * NOTE:
+ *
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
+ * keys. This doesn't matter for most code, but it does matter for lookups.
+ *
+ * Some adjacent keys with a string of equal keys:
+ *	i j k k k k l m
+ *
+ * If you search for k, the lookup code isn't guaranteed to return you any
+ * specific k. The lookup code is conceptually doing a binary search and
+ * iterating backwards is very expensive so if the pivot happens to land at the
+ * last k that's what you'll get.
+ *
+ * This works out ok, but it's something to be aware of:
+ *
+ *  - For non extents, we guarantee that the live key comes last - see
+ *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
+ *    see will only be deleted keys you don't care about.
+ *
+ *  - For extents, deleted keys sort last (see the comment at the top of this
+ *    file). But when you're searching for extents, you actually want the first
+ *    key strictly greater than your search key - an extent that compares equal
+ *    to the search key is going to have 0 sectors after the search key.
+ *
+ *    But this does mean that we can't just search for
+ *    bkey_successor(start_of_range) to get the first extent that overlaps with
+ *    the range we want - if we're unlucky and there's an extent that ends
+ *    exactly where we searched, then there could be a deleted key at the same
+ *    position and we'd get that when we search instead of the preceding extent
+ *    we needed.
+ *
+ *    So we've got to search for start_of_range, then after the lookup iterate
+ *    past any extents that compare equal to the position we searched for.
+ */
+__flatten
+void bch2_btree_node_iter_init(struct btree_node_iter *iter,
+			       struct btree *b, struct bpos *search)
+{
+	struct bset_tree *t;
+	struct bkey_packed p, *packed_search = NULL;
+	struct btree_node_iter_set *pos = iter->data;	/* next slot of iter->data to fill */
+
+	EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
+	bset_aux_tree_verify(b);
+
+	memset(iter, 0, sizeof(*iter));
+	/* Pack the search position once; how well it packed decides which comparisons the per-bset search uses. */
+	switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
+	case BKEY_PACK_POS_EXACT:
+		packed_search = &p;	/* exact pack: p is passed as both the exact and the lossy key */
+		break;
+	case BKEY_PACK_POS_SMALLER:
+		packed_search = NULL;	/* p is still passed below, but only as the lossy key */
+		break;
+	case BKEY_PACK_POS_FAIL:
+		btree_node_iter_init_pack_failed(iter, b, search);	/* search without any packed key */
+		return;
+	}
+
+	for_each_bset(b, t) {
+		struct bkey_packed *k = bch2_bset_search(b, t, search,
+							 packed_search, &p);
+		struct bkey_packed *end = btree_bkey_last(b, t);
+
+		if (k != end)	/* a bset whose search hit its end gets no slot */
+			*pos++ = (struct btree_node_iter_set) {
+				__btree_node_key_to_offset(b, k),
+				__btree_node_key_to_offset(b, end)
+			};
+	}
+
+	bch2_btree_node_iter_sort(iter, b);
+}
+
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
+					  struct btree *b)
+{
+	struct bset_tree *t;
+
+	memset(iter, 0, sizeof(*iter));
+	/* Add each non-empty bset in full, from its first key to its end, then sort the sets. */
+	for_each_bset(b, t)
+		__bch2_btree_node_iter_push(iter, b,
+					   btree_bkey_first(b, t),
+					   btree_bkey_last(b, t));
+	bch2_btree_node_iter_sort(iter, b);
+}
+
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
+						  struct btree *b,
+						  struct bset_tree *t)
+{
+	struct btree_node_iter_set *set;
+	/* Return the iterator's current position within bset @t. */
+	btree_node_iter_for_each(iter, set)
+		if (set->end == t->end_offset)	/* a set is matched to its bset by end offset */
+			return __btree_node_offset_to_key(b, set->k);
+
+	return btree_bkey_last(b, t);	/* no in-use set for @t: report the end of the bset */
+}
+
+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
+					    struct btree *b,
+					    unsigned first)
+{
+	/* True when the sets at first and first + 1 are out of order */
+	bool misordered = btree_node_iter_cmp(b, iter->data[first],
+					      iter->data[first + 1]) > 0;
+
+	if (misordered)
+		swap(iter->data[first], iter->data[first + 1]);
+	return misordered;
+}
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
+			       struct btree *b)
+{
+	/* unrolled bubble sort: */
+
+	if (!__btree_node_iter_set_end(iter, 2)) {	/* slot 2 in use: first pass over slots 0..2 */
+		btree_node_iter_sort_two(iter, b, 0);
+		btree_node_iter_sort_two(iter, b, 1);
+	}
+
+	if (!__btree_node_iter_set_end(iter, 1))	/* slot 1 in use: final compare of slots 0 and 1 */
+		btree_node_iter_sort_two(iter, b, 0);
+}
+
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
+				   struct btree_node_iter_set *set)
+{
+	struct btree_node_iter_set *last =
+		iter->data + ARRAY_SIZE(iter->data) - 1;
+	/* Remove @set: slide the entries after it down one slot, then clear the freed last slot. */
+	memmove(&set[0], &set[1], (void *) last - (void *) set);
+	*last = (struct btree_node_iter_set) { 0, 0 };	/* both members equal, so the slot tests as ended */
+}
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+						  struct btree *b)
+{
+	iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;	/* move slot 0 forward by the peeked key's u64s */
+
+	EBUG_ON(iter->data->k > iter->data->end);
+
+	if (unlikely(__btree_node_iter_set_end(iter, 0))) {
+		bch2_btree_node_iter_set_drop(iter, iter->data);	/* slot 0 reached its end: remove it */
+		return;
+	}
+
+	if (__btree_node_iter_set_end(iter, 1))
+		return;	/* slot 1 unused: nothing to reorder against */
+	/* Bubble the advanced slot 0 back into order among slots 1 and 2. */
+	if (!btree_node_iter_sort_two(iter, b, 0))
+		return;
+
+	if (__btree_node_iter_set_end(iter, 2))
+		return;
+
+	btree_node_iter_sort_two(iter, b, 1);
+}
+
+void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+				  struct btree *b)
+{
+	if (btree_keys_expensive_checks(b)) {	/* extra verification, only when expensive checks are enabled */
+		bch2_btree_node_iter_verify(iter, b);
+		bch2_btree_node_iter_next_check(iter, b);
+	}
+
+	__bch2_btree_node_iter_advance(iter, b);
+}
+
+/*
+ * Expensive:
+ */
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+						  struct btree *b)
+{
+	struct bkey_packed *k, *prev = NULL;
+	struct btree_node_iter_set *set;
+	struct bset_tree *t;
+	unsigned end = 0;	/* end_offset of the bset that prev was taken from */
+
+	bch2_btree_node_iter_verify(iter, b);
+	/* Take each bset's key before the iterator's position there; keep the greatest per bkey_iter_cmp(). */
+	for_each_bset(b, t) {
+		k = bch2_bkey_prev_all(b, t,
+			bch2_btree_node_iter_bset_pos(iter, b, t));
+		if (k &&
+		    (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
+			prev = k;
+			end = t->end_offset;
+		}
+	}
+
+	if (!prev)
+		return NULL;	/* no bset yielded a previous key */
+
+	/*
+	 * We're manually memmoving instead of just calling sort() to ensure the
+	 * prev we picked ends up in slot 0 - sort won't necessarily put it
+	 * there because of duplicate deleted keys:
+	 */
+	btree_node_iter_for_each(iter, set)
+		if (set->end == end)
+			goto found;	/* prev's bset already has a slot in the iterator */
+
+	BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);	/* otherwise set must be the first unused slot */
+found:
+	BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
+
+	memmove(&iter->data[1],
+		&iter->data[0],
+		(void *) set - (void *) &iter->data[0]);	/* shift the slots before set up by one, overwriting set */
+
+	iter->data[0].k = __btree_node_key_to_offset(b, prev);
+	iter->data[0].end = end;
+
+	bch2_btree_node_iter_verify(iter, b);
+	return prev;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
+						     struct btree *b,
+						     unsigned min_key_type)
+{
+	struct bkey_packed *k;
+
+	/* Step back until a key of type >= min_key_type, or NULL when none is left */
+	while ((k = bch2_btree_node_iter_prev_all(iter, b)) &&
+	       k->type < min_key_type)
+		;
+	return k;
+}
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
+						 struct btree *b,
+						 struct bkey *u)
+{
+	struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
+	/* Peek the iterator; if it yields a key, unpack that key into caller-provided @u. */
+	return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;	/* NULL peek: return bkey_s_c_null, @u untouched */
+}
+
+/* Mergesort */
+
+void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
+{
+	struct bset_tree *t;
+	/* Add this node's bsets to the running totals in @stats; @stats is only added to, never reset here. */
+	for_each_bset(b, t) {
+		enum bset_aux_tree_type type = bset_aux_tree_type(t);
+		size_t j;
+
+		stats->sets[type].nr++;	/* counts and bytes are bucketed by aux tree type */
+		stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
+			sizeof(u64);
+
+		if (bset_has_ro_aux_tree(t)) {
+			stats->floats += t->size - 1;	/* tree nodes are numbered 1 .. size - 1 */
+
+			for (j = 1; j < t->size; j++)
+				switch (bkey_float(b, t, j)->exponent) {	/* tally failed nodes by failure kind */
+				case BFLOAT_FAILED_UNPACKED:
+					stats->failed_unpacked++;
+					break;
+				case BFLOAT_FAILED_PREV:
+					stats->failed_prev++;
+					break;
+				case BFLOAT_FAILED_OVERFLOW:
+					stats->failed_overflow++;
+					break;
+				}
+		}
+	}
+}
+
+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
+			 struct bkey_packed *k)
+{
+	struct bset_tree *t = bch2_bkey_to_bset(b, k);
+	struct bkey_packed *l, *r, *p;
+	struct bkey uk, up;
+	char buf1[200], buf2[200];
+	unsigned j, inorder;
+
+	if (out->pos != out->end)
+		*out->pos = '\0';	/* terminate the output in case nothing is printed below */
+
+	if (!bset_has_ro_aux_tree(t))
+		return;	/* only bsets with an ro aux tree are handled */
+
+	inorder = bkey_to_cacheline(b, t, k);
+	if (!inorder || inorder >= t->size)
+		return;
+
+	j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
+	if (k != tree_to_bkey(b, t, j))
+		return;	/* @k is not the key that tree node j refers to */
+	/* Describe why node j's bfloat is flagged as failed; any other exponent value prints nothing. */
+	switch (bkey_float(b, t, j)->exponent) {
+	case BFLOAT_FAILED_UNPACKED:
+		uk = bkey_unpack_key(b, k);
+		pr_buf(out,
+		       "    failed unpacked at depth %u\n"
+		       "\t%llu:%llu\n",
+		       ilog2(j),
+		       uk.p.inode, uk.p.offset);
+		break;
+	case BFLOAT_FAILED_PREV:
+		p = tree_to_prev_bkey(b, t, j);
+		l = is_power_of_2(j)
+			? btree_bkey_first(b, t)
+			: tree_to_prev_bkey(b, t, j >> ffs(j));
+		r = is_power_of_2(j + 1)
+			? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
+			: tree_to_bkey(b, t, j >> (ffz(j) + 1));
+
+		up = bkey_unpack_key(b, p);
+		uk = bkey_unpack_key(b, k);
+		bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);	/* buf1 renders p */
+		bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);	/* buf2 renders k */
+
+		pr_buf(out,
+		       "    failed prev at depth %u\n"
+		       "\tkey starts at bit %u but first differing bit at %u\n"
+		       "\t%llu:%llu\n"
+		       "\t%llu:%llu\n"
+		       "\t%s\n"
+		       "\t%s\n",
+		       ilog2(j),
+		       bch2_bkey_greatest_differing_bit(b, l, r),
+		       bch2_bkey_greatest_differing_bit(b, p, k),
+		       uk.p.inode, uk.p.offset,
+		       up.p.inode, up.p.offset,
+		       buf1, buf2);	/* NOTE(review): positions print k then p, binary strings p then k */
+		break;
+	case BFLOAT_FAILED_OVERFLOW:
+		uk = bkey_unpack_key(b, k);
+		pr_buf(out,
+		       "    failed overflow at depth %u\n"
+		       "\t%llu:%llu\n",
+		       ilog2(j),
+		       uk.p.inode, uk.p.offset);
+		break;
+	}
+}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
new file mode 100644
index 000000000000..643bd9e8bc4d
--- /dev/null
+++ b/fs/bcachefs/bset.h
@@ -0,0 +1,624 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BSET_H
+#define _BCACHEFS_BSET_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "bcachefs_format.h"
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "util.h" /* for time_stats */
+#include "vstructs.h"
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bkey_invalid and
+ * bkey_deleted().
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that,  we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So we know the key we're looking for is between a and b, and a and b don't
+ * differ higher than bit 50, we don't need to check anything higher than bit
+ * 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking.  Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n.  The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fallback to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+extern bool bch2_expensive_debug_checks;
+
+static inline bool btree_keys_expensive_checks(const struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	return bch2_expensive_debug_checks || *b->expensive_debug_checks;	/* global switch, or the flag this node points at */
+#else
+	return false;	/* always off without CONFIG_BCACHEFS_DEBUG */
+#endif
+}
+
+enum bset_aux_tree_type {
+	BSET_NO_AUX_TREE,	/* t->extra == BSET_NO_AUX_TREE_VAL */
+	BSET_RO_AUX_TREE,	/* t->extra holds any other value */
+	BSET_RW_AUX_TREE,	/* t->extra == BSET_RW_AUX_TREE_VAL */
+};
+
+#define BSET_TREE_NR_TYPES	3	/* equals the number of enumerators above */
+
+#define BSET_NO_AUX_TREE_VAL	(U16_MAX)	/* reserved t->extra value: no aux tree */
+#define BSET_RW_AUX_TREE_VAL	(U16_MAX - 1)	/* reserved t->extra value: rw aux tree */
+
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+	switch (t->extra) {	/* two reserved values of t->extra act as the type tag */
+	case BSET_NO_AUX_TREE_VAL:
+		EBUG_ON(t->size);	/* without an aux tree, size is expected to be 0 */
+		return BSET_NO_AUX_TREE;
+	case BSET_RW_AUX_TREE_VAL:
+		EBUG_ON(!t->size);	/* any aux tree is expected to have a nonzero size */
+		return BSET_RW_AUX_TREE;
+	default:
+		EBUG_ON(!t->size);
+		return BSET_RO_AUX_TREE;	/* every non-reserved value means an ro aux tree */
+	}
+}
+
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+			       struct bkey *dst,
+			       const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+	{
+		compiled_unpack_fn unpack_fn = b->aux_data;	/* b->aux_data is called as the unpack routine */
+		unpack_fn(dst, src);
+
+		if (btree_keys_expensive_checks(b)) {	/* debug: cross-check against __bch2_bkey_unpack_key() */
+			struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+			/*
+			 * hack around a harmless race when compacting whiteouts
+			 * for a write:
+			 */
+			dst2.needs_whiteout = dst->needs_whiteout;
+
+			BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+		}
+	}
+#else
+	*dst = __bch2_bkey_unpack_key(&b->format, src);	/* no compiled unpacker: unpack using b->format */
+#endif
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+			       const struct bkey_packed *src)
+{
+	struct bkey dst;
+	/* By-value wrapper around __bkey_unpack_key_format_checked(). */
+	__bkey_unpack_key_format_checked(b, &dst, src);
+	return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+				     struct bkey *dst,
+				     const struct bkey_packed *src)
+{
+	if (likely(bkey_packed(src)))
+		__bkey_unpack_key_format_checked(b, dst, src);	/* packed key: unpack it */
+	else
+		*dst = *packed_to_bkey_c(src);	/* key is not packed: copy it as a struct bkey */
+}
+
+/**
+ * bkey_unpack_key -- return an unpacked copy of @src's key; the value is not touched
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+					  const struct bkey_packed *src)
+{
+	if (unlikely(!bkey_packed(src)))
+		return *packed_to_bkey_c(src);
+	return bkey_unpack_key_format_checked(b, src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+			       const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+	return bkey_unpack_key_format_checked(b, src).p;	/* unpack the whole key, keep only its position */
+#else
+	return __bkey_unpack_pos(&b->format, src);	/* position-only unpack using b->format */
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+					  const struct bkey_packed *src)
+{
+	if (unlikely(!bkey_packed(src)))
+		return packed_to_bkey_c(src)->p;	/* key is not packed: read p directly */
+	return bkey_unpack_pos_format_checked(b, src);
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(struct btree *b,
+					       const struct bkey_packed *k,
+					       struct bkey *u)
+{
+	__bkey_unpack_key(b, u, k);	/* the unpacked key is written to caller-provided @u */
+
+	return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };	/* @u paired with what bkeyp_val() returns for @k */
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(struct btree *b,
+					       struct bkey_packed *k,
+					       struct bkey *u)
+{
+	__bkey_unpack_key(b, u, k);	/* the unpacked key is written to caller-provided @u */
+
+	return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };	/* .v is what bkeyp_val() returns for @k */
+}
+
+#define for_each_bset(_b, _t)					\
+	for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)	/* visits the first nsets entries of (_b)->set */
+
+static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
+{
+	return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;	/* true when t->extra is neither reserved value */
+}
+
+static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
+{
+	return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;	/* true when t->extra == BSET_RW_AUX_TREE_VAL */
+}
+
+static inline void bch2_bset_set_no_aux_tree(struct btree *b,
+					    struct bset_tree *t)
+{
+	BUG_ON(t < b->set);
+	while (t < b->set + ARRAY_SIZE(b->set)) {	/* resets @t and every later slot of b->set */
+		t->aux_data_offset = U16_MAX;
+		t->extra = BSET_NO_AUX_TREE_VAL;
+		t->size = 0;
+		t++;
+	}
+}
+
+static inline void btree_node_set_format(struct btree *b,
+					 struct bkey_format f)
+{
+	int len;
+
+	b->format	= f;
+	b->nr_key_bits	= bkey_format_key_bits(&f);
+	/* Compile the format into b->aux_data; the returned length must be non-negative and at most U8_MAX. */
+	len = bch2_compile_bkey_format(&b->format, b->aux_data);
+	BUG_ON(len < 0 || len > U8_MAX);
+
+	b->unpack_fn_len = len;
+
+	bch2_bset_set_no_aux_tree(b, b->set);	/* starting at b->set resets every bset_tree slot */
+}
+
+static inline struct bset *bset_next_set(struct btree *b,
+					 unsigned block_bytes)
+{
+	struct bset *i = btree_bset_last(b);
+
+	EBUG_ON(!is_power_of_2(block_bytes));	/* block_bytes is used as the round_up() alignment below */
+
+	return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);	/* last bset's start plus its size rounded up to block_bytes */
+}
+
+void bch2_btree_keys_free(struct btree *);
+int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
+void bch2_btree_keys_init(struct btree *, bool *);
+
+void bch2_bset_init_first(struct btree *, struct bset *);
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
+			 struct btree_node_entry *);
+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *);
+
+void bch2_bset_insert(struct btree *, struct btree_node_iter *,
+		     struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
+
+/* Bkey utility code */
+
+/* packed or unpacked */
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
+				    const struct bkey_packed *l,
+				    const struct bkey_packed *r_packed,
+				    struct bpos *r)
+{
+	EBUG_ON(r_packed && !bkey_packed(r_packed));	/* r_packed, when given, must really be packed */
+
+	if (unlikely(!bkey_packed(l)))
+		return bkey_cmp(packed_to_bkey_c(l)->p, *r);	/* l is not packed: compare positions directly */
+
+	if (likely(r_packed))
+		return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);	/* both sides packed */
+
+	return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);	/* packed l against unpacked position r */
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
+					  struct bkey_packed *, unsigned);
+
+static inline struct bkey_packed *
+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+	return bch2_bkey_prev_filter(b, t, k, 0);	/* last argument 0: presumably a minimum key type, so nothing is filtered - TODO confirm */
+}
+
+static inline struct bkey_packed *
+bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+	return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1);	/* presumably skips key types up to KEY_TYPE_discard - TODO confirm */
+}
+
+enum bch_extent_overlap {
+	BCH_EXTENT_OVERLAP_ALL		= 0,	/* k neither ends before m nor starts after m */
+	BCH_EXTENT_OVERLAP_BACK		= 1,	/* k starts after m's start only */
+	BCH_EXTENT_OVERLAP_FRONT	= 2,	/* k ends before m's end only */
+	BCH_EXTENT_OVERLAP_MIDDLE	= 3,	/* k starts after m's start and ends before m's end */
+};
+
+/* Classify how k overlaps m: bit 1 set = k ends before m, bit 0 set = k starts after m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+							  const struct bkey *m)
+{
+	int ends_before = bkey_cmp(k->p, m->p) < 0 ? 2 : 0;
+	int starts_after =
+		bkey_cmp(bkey_start_pos(k), bkey_start_pos(m)) > 0 ? 1 : 0;
+
+	return ends_before | starts_after;
+}
+
+/* Btree key iteration */
+
+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
+			      const struct bkey_packed *,
+			      const struct bkey_packed *);
+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
+			       struct bpos *);
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
+					  struct btree *);
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
+						 struct btree *,
+						 struct bset_tree *);
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
+				   struct btree_node_iter_set *);
+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
+
+/*
+ * Iterate @_set over the entries of (_iter)->data, stopping at the end of the
+ * array or at the first entry whose ->k equals its ->end.
+ *
+ * Note that @_set is evaluated multiple times and is not parenthesized in the
+ * initializer or the increment, so pass a plain lvalue.
+ */
+#define btree_node_iter_for_each(_iter, _set)				\
+	for (_set = (_iter)->data;					\
+	     _set < (_iter)->data + ARRAY_SIZE((_iter)->data) &&	\
+	     (_set)->k != (_set)->end;					\
+	     _set++)
+
+/* True if entry @i of @iter has reached its end (->k == ->end) */
+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
+					     unsigned i)
+{
+	const struct btree_node_iter_set *set = &iter->data[i];
+
+	return set->k == set->end;
+}
+
+/* The iterator is at the end when its first entry is at its end */
+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
+{
+	const unsigned first = 0;
+
+	return __btree_node_iter_set_end(iter, first);
+}
+
+/*
+ * Three level comparison used to order keys within a node iterator:
+ *
+ *  1) by packed key comparison,
+ *  2) on a tie, deleted keys compare first,
+ *  3) on a further tie, by pointer value.
+ *
+ * XXX: only need to compare pointers for keys that are both within a
+ * btree_node_iterator - we need to break ties for prev() to work correctly
+ */
+static inline int bkey_iter_cmp(struct btree *b,
+				const struct bkey_packed *l,
+				const struct bkey_packed *r)
+{
+	int ret;
+
+	ret = bkey_cmp_packed(b, l, r);
+	if (ret)
+		return ret;
+
+	ret = (int) bkey_deleted(r) - (int) bkey_deleted(l);
+	if (ret)
+		return ret;
+
+	return cmp_int(l, r);
+}
+
+/* Compare two iterator sets by the keys their ->k offsets refer to in @b */
+static inline int btree_node_iter_cmp(struct btree *b,
+				      struct btree_node_iter_set l,
+				      struct btree_node_iter_set r)
+{
+	const struct bkey_packed *l_key = __btree_node_offset_to_key(b, l.k);
+	const struct bkey_packed *r_key = __btree_node_offset_to_key(b, r.k);
+
+	return bkey_iter_cmp(b, l_key, r_key);
+}
+
+/*
+ * Compare search position @l against packed key @r. This and
+ * bkey_iter_cmp_p_or_unp() assume l (the search key) is not a deleted key:
+ * on equal positions the result is nonzero only if @r is deleted.
+ */
+static inline int bkey_iter_pos_cmp(struct btree *b,
+			struct bpos *l,
+			const struct bkey_packed *r)
+{
+	/* The helper compares (r, l); negate to get the (l, r) ordering */
+	int ret = -bkey_cmp_left_packed(b, r, l);
+
+	if (ret)
+		return ret;
+
+	return (int) bkey_deleted(r);
+}
+
+/*
+ * Like bkey_iter_pos_cmp(), but the search key may additionally be supplied
+ * in packed form (@l_packed).
+ */
+static inline int bkey_iter_cmp_p_or_unp(struct btree *b,
+			struct bpos *l,
+			const struct bkey_packed *l_packed,
+			const struct bkey_packed *r)
+{
+	/* The helper compares (r, l); negate to get the (l, r) ordering */
+	int ret = -bkey_cmp_p_or_unp(b, r, l_packed, l);
+
+	if (ret)
+		return ret;
+
+	return (int) bkey_deleted(r);
+}
+
+/*
+ * Key that the iterator's first entry points at. Performs no end-of-iterator
+ * check.
+ */
+static inline struct bkey_packed *
+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
+				struct btree *b)
+{
+	return __btree_node_offset_to_key(b, iter->data[0].k);
+}
+
+/*
+ * Advance @iter past keys whose type is below @min_key_type and return the
+ * first key that is not, or NULL if the iterator reaches its end.
+ */
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
+				 struct btree *b,
+				 unsigned min_key_type)
+{
+	struct bkey_packed *k;
+
+	for (;
+	     !bch2_btree_node_iter_end(iter);
+	     bch2_btree_node_iter_advance(iter, b)) {
+		k = __bch2_btree_node_iter_peek_all(iter, b);
+
+		if (k->type >= min_key_type)
+			return k;
+	}
+
+	return NULL;
+}
+
+/* Peek with a minimum key type of 0, i.e. no key is skipped by type */
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
+			      struct btree *b)
+{
+	const unsigned min_key_type = 0;
+
+	return bch2_btree_node_iter_peek_filter(iter, b, min_key_type);
+}
+
+/* Peek, skipping keys whose type is KEY_TYPE_discard or lower */
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
+{
+	const unsigned min_key_type = KEY_TYPE_discard + 1;
+
+	return bch2_btree_node_iter_peek_filter(iter, b, min_key_type);
+}
+
+/*
+ * Return the current key (no type filtering) and step the iterator past it;
+ * returns NULL without advancing when the iterator is at its end.
+ */
+static inline struct bkey_packed *
+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
+{
+	struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
+
+	if (!k)
+		return NULL;
+
+	bch2_btree_node_iter_advance(iter, b);
+	return k;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+						  struct btree *);
+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
+						     struct btree *, unsigned);
+
+/* bch2_btree_node_iter_prev_filter() with KEY_TYPE_discard + 1 as the filter */
+static inline struct bkey_packed *
+bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
+{
+	const unsigned filter = KEY_TYPE_discard + 1;
+
+	return bch2_btree_node_iter_prev_filter(iter, b, filter);
+}
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
+						struct btree *,
+						struct bkey *);
+
+/*
+ * Walk node @b from its start: (re)initializes @iter, assigns each result of
+ * bch2_btree_node_iter_peek_unpack() to @k, and stops when k.k is NULL.
+ *
+ * @iter and @b are evaluated on every iteration and are not parenthesized in
+ * the advance step, so pass simple expressions.
+ */
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked)		\
+	for (bch2_btree_node_iter_init_from_start((iter), (b));		\
+	     (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
+	     bch2_btree_node_iter_advance(iter, b))
+
+/* Accounting: */
+
+/*
+ * Adjust the key accounting in @n for key @k living in bset index @bset.
+ * @sign is +1 when the key is being added and -1 when it is being dropped
+ * (see the btree_*account_key_{add,drop} wrappers).
+ */
+static inline void btree_keys_account_key(struct btree_nr_keys *n,
+					  unsigned bset,
+					  struct bkey_packed *k,
+					  int sign)
+{
+	int delta = k->u64s * sign;
+
+	n->live_u64s		+= delta;
+	n->bset_u64s[bset]	+= delta;
+
+	if (!bkey_packed(k))
+		n->unpacked_keys += sign;
+	else
+		n->packed_keys	+= sign;
+}
+
+/* Account for adding/dropping @_k, with the bset index given explicitly: */
+#define btree_keys_account_key_add(_nr, _bset_idx, _k)		\
+	btree_keys_account_key(_nr, _bset_idx, _k, 1)
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k)	\
+	btree_keys_account_key(_nr, _bset_idx, _k, -1)
+
+/*
+ * Same, but operating on (_b)->nr and deriving the bset index from the key
+ * via bch2_bkey_to_bset(). @_b and @_k are evaluated more than once.
+ */
+#define btree_account_key_add(_b, _k)				\
+	btree_keys_account_key(&(_b)->nr,			\
+		bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
+#define btree_account_key_drop(_b, _k)				\
+	btree_keys_account_key(&(_b)->nr,			\
+		bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
+
+/*
+ * Statistics filled in by bch2_btree_keys_stats(); callers are expected to
+ * zero the struct first (bch2_btree_node_to_text() does so with memset()).
+ */
+struct bset_stats {
+	/* per bset tree type counters */
+	struct {
+		size_t nr, bytes;
+	} sets[BSET_TREE_NR_TYPES];
+
+	/* reported by bch2_btree_node_to_text() as "floats" / "failed ..." */
+	size_t floats;
+	size_t failed_unpacked;
+	size_t failed_prev;
+	size_t failed_overflow;
+};
+
+void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
+void bch2_bfloat_to_text(struct printbuf *, struct btree *,
+			 struct bkey_packed *);
+
+/* Debug stuff */
+
+void bch2_dump_bset(struct btree *, struct bset *, unsigned);
+void bch2_dump_btree_node(struct btree *);
+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
+
+/*
+ * Verification helpers: real implementations are only declared when
+ * CONFIG_BCACHEFS_DEBUG is set, otherwise they are empty inline stubs so
+ * that callers need no #ifdefs.
+ */
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *);
+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
+			    struct bkey_packed *, unsigned);
+
+#else
+
+/* No-op stubs for non-debug builds: */
+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+					      struct btree *b) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+					  struct bkey_packed *where,
+					  struct bkey_packed *insert,
+					  unsigned clobber_u64s) {}
+#endif
+
+/* Run the nr_keys verification only when expensive checks are enabled for @b */
+static inline void bch2_verify_btree_nr_keys(struct btree *b)
+{
+	if (!btree_keys_expensive_checks(b))
+		return;
+
+	__bch2_verify_btree_nr_keys(b);
+}
+
+#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
new file mode 100644
index 000000000000..416949512057
--- /dev/null
+++ b/fs/bcachefs/btree_cache.c
@@ -0,0 +1,934 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+
+#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
+#include <trace/events/bcachefs.h>
+
+/*
+ * NULL terminated table of btree names, generated from the BCH_BTREE_IDS()
+ * x-macro list (the third x() argument of each entry).
+ */
+const char * const bch2_btree_ids[] = {
+#define x(kwd, val, name) name,
+	BCH_BTREE_IDS()
+#undef x
+	NULL
+};
+
+/*
+ * Recompute c->btree_cache.reserve: a base of 16 nodes, 8 more if the first
+ * btree has no root yet, and 8 more for every btree whose root is not at
+ * level 0.
+ */
+void bch2_recalc_btree_reserve(struct bch_fs *c)
+{
+	unsigned i, reserve = c->btree_roots[0].b ? 16 : 24;
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct btree *root = c->btree_roots[i].b;
+
+		if (!root)
+			continue;
+
+		/* min(1, level) is 0 for a leaf root and 1 otherwise */
+		reserve += min_t(unsigned, 1, root->level) * 8;
+	}
+
+	c->btree_cache.reserve = reserve;
+}
+
+/* Number of cached nodes in excess of the reserve, clamped at zero */
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+{
+	int surplus = bc->used - bc->reserve;
+
+	return max_t(int, 0, surplus);
+}
+
+/*
+ * Free the node's data buffer and whatever bch2_btree_keys_free() releases;
+ * struct btree itself is kept. Does not touch the cache lists or the used
+ * count - see btree_node_data_free() for that.
+ */
+static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
+{
+	/* Caller must not free a node that still has a write in flight */
+	EBUG_ON(btree_node_write_in_flight(b));
+
+	kvpfree(b->data, btree_bytes(c));
+	b->data = NULL;
+	bch2_btree_keys_free(b);
+}
+
+/*
+ * Free the node's memory, drop it from the count of nodes with memory
+ * allocated (bc->used) and park the now data-less struct btree on the
+ * bc->freed list for later reuse.
+ */
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+{
+	struct btree_cache *bc = &c->btree_cache;
+
+	__btree_node_data_free(c, b);
+	bc->used--;
+	list_move(&b->list, &bc->freed);
+}
+
+/*
+ * rhashtable obj_cmpfn: returns 0 when the node's PTR_HASH() equals the u64
+ * lookup key, 1 otherwise.
+ */
+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+				   const void *obj)
+{
+	const struct btree *node = obj;
+	const u64 *wanted = arg->key;
+
+	return PTR_HASH(&node->key) != *wanted;
+}
+
+/*
+ * Hash table parameters for the btree node cache: nodes are linked through
+ * btree->hash and keyed on the start of the value of btree->key. Equality is
+ * decided by bch2_btree_cache_cmp_fn(), which compares PTR_HASH() values.
+ */
+static const struct rhashtable_params bch_btree_cache_params = {
+	.head_offset	= offsetof(struct btree, hash),
+	.key_offset	= offsetof(struct btree, key.v),
+	.key_len	= sizeof(struct bch_extent_ptr),
+	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
+};
+
+/*
+ * Allocate the data buffer and key structures for @b.
+ *
+ * On success the node is counted in bc->used and moved to bc->freeable; on
+ * failure b->data is left NULL and the node is moved to bc->freed. Callers
+ * detect failure by checking b->data.
+ */
+static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+{
+	struct btree_cache *bc = &c->btree_cache;
+
+	b->data = kvpmalloc(btree_bytes(c), gfp);
+
+	if (b->data &&
+	    !bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) {
+		bc->used++;
+		list_move(&b->list, &bc->freeable);
+		return;
+	}
+
+	/* Either allocation failed: undo and park the node on the freed list */
+	kvpfree(b->data, btree_bytes(c));
+	b->data = NULL;
+	list_move(&b->list, &bc->freed);
+}
+
+/*
+ * Allocate and initialize a new struct btree together with its data.
+ *
+ * Returns NULL if either allocation fails. Note that when only the data
+ * allocation fails the struct btree is not freed here: btree_node_data_alloc()
+ * has already put it on the bc->freed list.
+ */
+static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
+{
+	struct btree *b;
+
+	b = kzalloc(sizeof(*b), gfp);
+	if (!b)
+		return NULL;
+
+	bkey_btree_ptr_init(&b->key);
+	six_lock_init(&b->lock);
+	INIT_LIST_HEAD(&b->list);
+	INIT_LIST_HEAD(&b->write_blocked);
+
+	btree_node_data_alloc(c, b, gfp);
+	if (!b->data)
+		return NULL;
+
+	return b;
+}
+
+/* Btree in memory cache - hash table */
+
+/*
+ * Remove @b from the btree node hash table and zero its PTR_HASH(), which is
+ * what btree_node_hashed() and the post-lock check in bch2_btree_node_get()
+ * look at.
+ */
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+	rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+
+	/* Cause future lookups for this node to fail: */
+	PTR_HASH(&b->key) = 0;
+}
+
+/*
+ * Insert @b into the hash table only; does not touch the cache lists or take
+ * bc->lock. Returns the result of rhashtable_lookup_insert_fast() (0 on
+ * success).
+ */
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
+{
+	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+					     bch_btree_cache_params);
+}
+
+/*
+ * Set @b's level and btree id, then, under bc->lock, insert it into the hash
+ * table and - only if that succeeded - add it to the bc->live list.
+ *
+ * Returns 0 on success or the error from the hash table insert.
+ */
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
+				unsigned level, enum btree_id id)
+{
+	int err;
+
+	b->btree_id	= id;
+	b->level	= level;
+
+	mutex_lock(&bc->lock);
+
+	err = __bch2_btree_node_hash_insert(bc, b);
+	if (err == 0)
+		list_add(&b->list, &bc->live);
+
+	mutex_unlock(&bc->lock);
+
+	return err;
+}
+
+/*
+ * Look up the cached btree node for key @k, using PTR_HASH(k) as the hash
+ * table key. Returns NULL if no such node is hashed. No lock is taken on the
+ * returned node.
+ */
+__flatten
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
+				     const struct bkey_i *k)
+{
+	return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
+				      bch_btree_cache_params);
+}
+
+/*
+ * Try to take exclusive ownership of @b so that its memory can be freed or
+ * reused. Must be called with bc->lock held.
+ *
+ * Only trylocks are used on the node. Returns 0 with both the intent and the
+ * write lock on @b held (the caller must drop them), or -ENOMEM with no node
+ * locks held if the node could not be locked or may not be evicted right now.
+ *
+ * With @flush false, nodes that are dirty or have a read/write in flight are
+ * skipped. With @flush true, we wait for an in flight read, write the node
+ * out and wait for that IO before returning.
+ *
+ * This is also called on nodes from the freeable/freed lists, i.e. nodes that
+ * have already been freed (in which case we're not reaping a real btree
+ * node).
+ */
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	int ret = 0;
+
+	lockdep_assert_held(&bc->lock);
+
+	if (!six_trylock_intent(&b->lock))
+		return -ENOMEM;
+
+	if (!six_trylock_write(&b->lock))
+		goto out_unlock_intent;
+
+	if (btree_node_noevict(b))
+		goto out_unlock;
+
+	if (!btree_node_may_write(b))
+		goto out_unlock;
+
+	/* Dirty nodes can't be reclaimed while btree writes are being held: */
+	if (btree_node_dirty(b) &&
+	    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+		goto out_unlock;
+
+	if (btree_node_dirty(b) ||
+	    btree_node_write_in_flight(b) ||
+	    btree_node_read_in_flight(b)) {
+		if (!flush)
+			goto out_unlock;
+
+		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+			       TASK_UNINTERRUPTIBLE);
+
+		/*
+		 * Using the underscore version because we don't want to compact
+		 * bsets after the write, since this node is about to be evicted
+		 * - unless btree verify mode is enabled, since it runs out of
+		 * the post write cleanup:
+		 */
+		if (verify_btree_ondisk(c))
+			bch2_btree_node_write(c, b, SIX_LOCK_intent);
+		else
+			__bch2_btree_node_write(c, b, SIX_LOCK_read);
+
+		/* wait for any in flight btree write */
+		btree_node_wait_on_io(b);
+	}
+out:
+	/* Only trace successful reclaims of nodes that are still hashed: */
+	if (PTR_HASH(&b->key) && !ret)
+		trace_btree_node_reap(c, b);
+	return ret;
+out_unlock:
+	six_unlock_write(&b->lock);
+out_unlock_intent:
+	six_unlock_intent(&b->lock);
+	ret = -ENOMEM;
+	goto out;
+}
+
+/* Reclaim @b without flushing: fails for dirty nodes and nodes with IO in flight */
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+{
+	const bool flush = false;
+
+	return __btree_node_reclaim(c, b, flush);
+}
+
+/* Reclaim @b, writing it out and waiting for IO first if necessary */
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
+{
+	const bool flush = true;
+
+	return __btree_node_reclaim(c, b, flush);
+}
+
+/*
+ * Shrinker scan callback: free up to sc->nr_to_scan pages worth of cached
+ * btree nodes, never going below the reserve (see btree_cache_can_free()).
+ *
+ * Nodes on the freeable list are tried first (skipping the first three
+ * entries), then nodes on the live list that do not have the accessed bit
+ * set; live nodes that are skipped have their accessed bit cleared.
+ *
+ * Returns the number of pages freed, SHRINK_STOP if the shrinker is disabled,
+ * or -1 if bc->lock could not be taken without blocking.
+ */
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc)
+{
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_cache.shrink);
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b, *t;
+	unsigned long nr = sc->nr_to_scan;
+	unsigned long can_free;
+	/* NOTE(review): touched is incremented below but never read here */
+	unsigned long touched = 0;
+	unsigned long freed = 0;
+	unsigned i;
+
+	if (btree_shrinker_disabled(c))
+		return SHRINK_STOP;
+
+	/*
+	 * Return -1 if we can't do anything right now
+	 * NOTE(review): the return type is unsigned long, so this is ~0UL -
+	 * presumably the same value as SHRINK_STOP; confirm.
+	 */
+	if (sc->gfp_mask & __GFP_IO)
+		mutex_lock(&bc->lock);
+	else if (!mutex_trylock(&bc->lock))
+		return -1;
+
+	/*
+	 * It's _really_ critical that we don't free too many btree nodes - we
+	 * have to always leave ourselves a reserve. The reserve is how we
+	 * guarantee that allocating memory for a new btree node can always
+	 * succeed, so that inserting keys into the btree can always succeed and
+	 * IO can always make forward progress:
+	 */
+	/* Convert the request from pages to btree nodes: */
+	nr /= btree_pages(c);
+	can_free = btree_cache_can_free(bc);
+	nr = min_t(unsigned long, nr, can_free);
+
+	i = 0;
+	list_for_each_entry_safe(b, t, &bc->freeable, list) {
+		touched++;
+
+		if (freed >= nr)
+			break;
+
+		/* The first three freeable nodes are always left alone: */
+		if (++i > 3 &&
+		    !btree_node_reclaim(c, b)) {
+			btree_node_data_free(c, b);
+			six_unlock_write(&b->lock);
+			six_unlock_intent(&b->lock);
+			freed++;
+		}
+	}
+restart:
+	list_for_each_entry_safe(b, t, &bc->live, list) {
+		touched++;
+
+		if (freed >= nr) {
+			/* Save position */
+			if (&t->list != &bc->live)
+				list_move_tail(&bc->live, &t->list);
+			break;
+		}
+
+		if (!btree_node_accessed(b) &&
+		    !btree_node_reclaim(c, b)) {
+			/* can't call bch2_btree_node_hash_remove under lock  */
+			freed++;
+			if (&t->list != &bc->live)
+				list_move_tail(&bc->live, &t->list);
+
+			btree_node_data_free(c, b);
+			mutex_unlock(&bc->lock);
+
+			bch2_btree_node_hash_remove(bc, b);
+			six_unlock_write(&b->lock);
+			six_unlock_intent(&b->lock);
+
+			if (freed >= nr)
+				goto out;
+
+			/*
+			 * bc->lock was dropped, so the list may have changed:
+			 * retake it and start walking again from the saved
+			 * position.
+			 */
+			if (sc->gfp_mask & __GFP_IO)
+				mutex_lock(&bc->lock);
+			else if (!mutex_trylock(&bc->lock))
+				goto out;
+			goto restart;
+		} else
+			clear_btree_node_accessed(b);
+	}
+
+	mutex_unlock(&bc->lock);
+out:
+	/* Report in pages, the unit nr_to_scan was given in: */
+	return (unsigned long) freed * btree_pages(c);
+}
+
+/*
+ * Shrinker count callback: number of pages that could be freed, i.e. the
+ * nodes above the reserve times the pages per node; 0 when the shrinker is
+ * disabled.
+ */
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc)
+{
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_cache.shrink);
+	struct btree_cache *bc = &c->btree_cache;
+
+	return btree_shrinker_disabled(c)
+		? 0
+		: btree_cache_can_free(bc) * btree_pages(c);
+}
+
+/*
+ * Tear down the btree node cache: unregister the shrinker, free the memory
+ * of every node (including btree roots and the debug verify node), free all
+ * struct btree and destroy the hash table.
+ *
+ * Safe to call after a partially failed bch2_fs_btree_cache_init(): the
+ * shrinker and the hash table are only torn down if they were set up.
+ */
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	unsigned i;
+
+	/* list.next is only non NULL once the shrinker has been registered */
+	if (bc->shrink.list.next)
+		unregister_shrinker(&bc->shrink);
+
+	mutex_lock(&bc->lock);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	if (c->verify_data)
+		list_move(&c->verify_data->list, &bc->live);
+
+	kvpfree(c->verify_ondisk, btree_bytes(c));
+#endif
+
+	/* Gather everything that still has memory onto the live list: */
+	for (i = 0; i < BTREE_ID_NR; i++)
+		if (c->btree_roots[i].b)
+			list_add(&c->btree_roots[i].b->list, &bc->live);
+
+	list_splice(&bc->freeable, &bc->live);
+
+	while (!list_empty(&bc->live)) {
+		b = list_first_entry(&bc->live, struct btree, list);
+
+		BUG_ON(btree_node_read_in_flight(b) ||
+		       btree_node_write_in_flight(b));
+
+		if (btree_node_dirty(b))
+			bch2_btree_complete_write(c, b, btree_current_write(b));
+		clear_btree_node_dirty(b);
+
+		/* Moves b to bc->freed, so the loop makes progress: */
+		btree_node_data_free(c, b);
+	}
+
+	/* Now free the struct btrees themselves: */
+	while (!list_empty(&bc->freed)) {
+		b = list_first_entry(&bc->freed, struct btree, list);
+		list_del(&b->list);
+		kfree(b);
+	}
+
+	mutex_unlock(&bc->lock);
+
+	if (bc->table_init_done)
+		rhashtable_destroy(&bc->table);
+}
+
+/*
+ * Set up the btree node cache: the hash table, a preallocated reserve of
+ * nodes, the debug verify buffers (CONFIG_BCACHEFS_DEBUG only) and the
+ * shrinker.
+ *
+ * Returns 0 on success or a negative error code. On failure nothing is undone
+ * here; bch2_fs_btree_cache_exit() copes with a partially initialized cache.
+ */
+int bch2_fs_btree_cache_init(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	unsigned i;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
+
+	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
+	if (ret)
+		goto out;
+
+	/* Tells bch2_fs_btree_cache_exit() the table must be destroyed */
+	bc->table_init_done = true;
+
+	bch2_recalc_btree_reserve(c);
+
+	/* Preallocate the reserve; the nodes end up on the cache lists */
+	for (i = 0; i < bc->reserve; i++)
+		if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+	list_splice_init(&bc->live, &bc->freeable);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	mutex_init(&c->verify_lock);
+
+	c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+	if (!c->verify_ondisk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
+	if (!c->verify_data) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* The verify node is private: keep it off the cache lists */
+	list_del_init(&c->verify_data->list);
+#endif
+
+	bc->shrink.count_objects	= bch2_btree_cache_count;
+	bc->shrink.scan_objects		= bch2_btree_cache_scan;
+	bc->shrink.seeks		= 4;
+	bc->shrink.batch		= btree_pages(c) * 2;
+
+	/* register_shrinker() can fail; don't silently run without one */
+	ret = register_shrinker(&bc->shrink);
+out:
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
+}
+
+/*
+ * Minimal initialization of the cache's lists and lock, none of which can
+ * fail.
+ */
+void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
+{
+	INIT_LIST_HEAD(&bc->freed);
+	INIT_LIST_HEAD(&bc->freeable);
+	INIT_LIST_HEAD(&bc->live);
+
+	mutex_init(&bc->lock);
+}
+
+/*
+ * Only one thread at a time may cannibalize other cached btree nodes,
+ * otherwise we'd deadlock. That is enforced with an open coded mutex
+ * (bc->alloc_lock holds the owning task), which a cannibalize_bucket() will
+ * take. This means every time we unlock the root of the btree, we need to
+ * release this lock if we have it held.
+ *
+ * Releasing is a no-op unless the current task is the owner.
+ */
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+
+	if (bc->alloc_lock != current)
+		return;
+
+	trace_btree_node_cannibalize_unlock(c);
+	bc->alloc_lock = NULL;
+	closure_wake_up(&bc->alloc_wait);
+}
+
+/*
+ * Try to become the (single) task allowed to cannibalize cached btree nodes.
+ *
+ * Returns 0 if the lock was taken or is already held by the current task,
+ * -ENOMEM if it is held by someone else and @cl is NULL, and -EAGAIN if it is
+ * held by someone else and @cl has been added to bc->alloc_wait.
+ */
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct task_struct *old;
+
+	/* Claim the lock if it is free; old == current means we already own it */
+	old = cmpxchg(&bc->alloc_lock, NULL, current);
+	if (old == NULL || old == current)
+		goto success;
+
+	if (!cl) {
+		trace_btree_node_cannibalize_lock_fail(c);
+		return -ENOMEM;
+	}
+
+	closure_wait(&bc->alloc_wait, cl);
+
+	/* Try again, after adding ourselves to waitlist */
+	old = cmpxchg(&bc->alloc_lock, NULL, current);
+	if (old == NULL || old == current) {
+		/* We raced */
+		closure_wake_up(&bc->alloc_wait);
+		goto success;
+	}
+
+	trace_btree_node_cannibalize_lock_fail(c);
+	return -EAGAIN;
+
+success:
+	trace_btree_node_cannibalize_lock(c);
+	return 0;
+}
+
+/*
+ * Steal a node from the live list, walking it in reverse. Never fails: it
+ * loops until a node could be reclaimed.
+ *
+ * The returned node is locked as left by __btree_node_reclaim() (intent and
+ * write locks held) and is still on the live list and in the hash table.
+ * The reclaim helpers assert that bc->lock is held.
+ */
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+
+	/* First pass: only nodes that can be reclaimed without any IO */
+	list_for_each_entry_reverse(b, &bc->live, list)
+		if (!btree_node_reclaim(c, b))
+			return b;
+
+	while (1) {
+		/* Now also accept nodes that have to be written out first */
+		list_for_each_entry_reverse(b, &bc->live, list)
+			if (!btree_node_write_and_reclaim(c, b))
+				return b;
+
+		/*
+		 * Rare case: all nodes were intent-locked.
+		 * Just busy-wait.
+		 */
+		WARN_ONCE(1, "btree cache cannibalize failed\n");
+		cond_resched();
+	}
+}
+
+/*
+ * Get an in-memory btree node for new use.
+ *
+ * Tries, in order: a node from the freeable list, a struct btree from the
+ * freed list (allocating fresh data for it), a brand new allocation, and
+ * finally - only if the current task holds the cannibalize lock - stealing a
+ * node from the live list.
+ *
+ * Returns the node with its intent and write locks held, off all cache lists
+ * and not hashed, with its per-node state reset; or ERR_PTR(-ENOMEM).
+ *
+ * The whole function runs in a memalloc_nofs scope, which is left again on
+ * every return path.
+ */
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	u64 start_time = local_clock();
+	unsigned flags;
+
+	flags = memalloc_nofs_save();
+	mutex_lock(&bc->lock);
+
+	/*
+	 * btree_free() doesn't free memory; it sticks the node on the end of
+	 * the list. Check if there's any freed nodes there:
+	 */
+	list_for_each_entry(b, &bc->freeable, list)
+		if (!btree_node_reclaim(c, b))
+			goto out_unlock;
+
+	/*
+	 * We never free struct btree itself, just the memory that holds the on
+	 * disk node. Check the freed list before allocating a new one:
+	 */
+	list_for_each_entry(b, &bc->freed, list)
+		if (!btree_node_reclaim(c, b)) {
+			btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
+			if (b->data)
+				goto out_unlock;
+
+			six_unlock_write(&b->lock);
+			six_unlock_intent(&b->lock);
+			goto err;
+		}
+
+	b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
+	if (!b)
+		goto err;
+
+	/* Freshly allocated, nobody else can have it locked: */
+	BUG_ON(!six_trylock_intent(&b->lock));
+	BUG_ON(!six_trylock_write(&b->lock));
+out_unlock:
+	BUG_ON(btree_node_hashed(b));
+	BUG_ON(btree_node_write_in_flight(b));
+
+	list_del_init(&b->list);
+	mutex_unlock(&bc->lock);
+out:
+	/*
+	 * Reached both from above and from the cannibalize path below; leave
+	 * the NOFS scope here so that neither path leaks it:
+	 */
+	memalloc_nofs_restore(flags);
+
+	b->flags		= 0;
+	b->written		= 0;
+	b->nsets		= 0;
+	b->sib_u64s[0]		= 0;
+	b->sib_u64s[1]		= 0;
+	b->whiteout_u64s	= 0;
+	b->uncompacted_whiteout_u64s = 0;
+	bch2_btree_keys_init(b, &c->expensive_debug_checks);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+			       start_time);
+
+	return b;
+err:
+	/* Try to cannibalize another cached btree node: */
+	if (bc->alloc_lock == current) {
+		b = btree_node_cannibalize(c);
+		list_del_init(&b->list);
+		mutex_unlock(&bc->lock);
+
+		/* Must not be called with bc->lock held: */
+		bch2_btree_node_hash_remove(bc, b);
+
+		trace_btree_node_cannibalize(c);
+		goto out;
+	}
+
+	mutex_unlock(&bc->lock);
+	memalloc_nofs_restore(flags);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Allocate a node for key @k, insert it into the cache and start reading it.
+ *
+ * Slowpath, don't want it inlined into btree_iter_traverse()
+ *
+ * The caller's iterator must hold a lock on the parent (level + 1); a read
+ * lock on the parent is dropped here once the node is in the cache.
+ *
+ * Returns:
+ *  - an ERR_PTR if no node memory could be allocated,
+ *  - NULL if another thread inserted the same node first, or if @sync is
+ *    false (the node is then left unlocked),
+ *  - otherwise the node, locked with @lock_type.
+ */
+static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
+				struct btree_iter *iter,
+				const struct bkey_i *k,
+				unsigned level,
+				enum six_lock_type lock_type,
+				bool sync)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+
+	/*
+	 * Parent node must be locked, else we could read in a btree node that's
+	 * been freed:
+	 */
+	BUG_ON(!btree_node_locked(iter, level + 1));
+	BUG_ON(level >= BTREE_MAX_DEPTH);
+
+	/* Comes back with the intent and write locks held: */
+	b = bch2_btree_node_mem_alloc(c);
+	if (IS_ERR(b))
+		return b;
+
+	bkey_copy(&b->key, k);
+	if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+		/* raced with another fill: */
+
+		/* mark as unhashed... */
+		PTR_HASH(&b->key) = 0;
+
+		/* ...and hand the unused node back for reuse: */
+		mutex_lock(&bc->lock);
+		list_add(&b->list, &bc->freeable);
+		mutex_unlock(&bc->lock);
+
+		six_unlock_write(&b->lock);
+		six_unlock_intent(&b->lock);
+		return NULL;
+	}
+
+	/*
+	 * If the btree node wasn't cached, we can't drop our lock on
+	 * the parent until after it's added to the cache - because
+	 * otherwise we could race with a btree_split() freeing the node
+	 * we're trying to lock.
+	 *
+	 * But the deadlock described below doesn't exist in this case,
+	 * so it's safe to not drop the parent lock until here:
+	 */
+	if (btree_node_read_locked(iter, level + 1))
+		btree_node_unlock(iter, level + 1);
+
+	bch2_btree_node_read(c, b, sync);
+
+	six_unlock_write(&b->lock);
+
+	/* Prefetch: nothing is returned, so drop the remaining lock too */
+	if (!sync) {
+		six_unlock_intent(&b->lock);
+		return NULL;
+	}
+
+	/* We still hold the intent lock; downgrade if only read was asked for */
+	if (lock_type == SIX_LOCK_read)
+		six_lock_downgrade(&b->lock);
+
+	return b;
+}
+
+/**
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * @c:		filesystem
+ * @iter:	iterator, which must hold a lock on the parent (@level + 1)
+ * @k:		key pointing to the wanted node
+ * @level:	level of the wanted node
+ * @lock_type:	type of lock to take on the node
+ *
+ * On success the btree node is returned with a lock of type @lock_type held
+ * and marked as accessed.
+ *
+ * Returns an ERR_PTR on failure: -EINTR if the node could not be locked or
+ * was reused and the parent could not be relocked, -EIO if the node has a
+ * read error, or the error from bch2_btree_node_fill().
+ */
+struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
+				  const struct bkey_i *k, unsigned level,
+				  enum six_lock_type lock_type)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	struct bset_tree *t;
+
+	/*
+	 * XXX: locking optimization
+	 *
+	 * we can make the locking looser here - caller can drop lock on parent
+	 * node before locking child node (and potentially blocking): we just
+	 * have to have bch2_btree_node_fill() call relock on the parent and
+	 * return -EINTR if that fails
+	 */
+	EBUG_ON(!btree_node_locked(iter, level + 1));
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+retry:
+	b = btree_cache_find(bc, k);
+	if (unlikely(!b)) {
+		/*
+		 * We must have the parent locked to call bch2_btree_node_fill(),
+		 * else we could read in a btree node from disk that's been
+		 * freed:
+		 */
+		b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+
+		/* We raced and found the btree node in the cache */
+		if (!b)
+			goto retry;
+
+		if (IS_ERR(b))
+			return b;
+	} else {
+		/*
+		 * There's a potential deadlock with splits and insertions into
+		 * interior nodes we have to avoid:
+		 *
+		 * The other thread might be holding an intent lock on the node
+		 * we want, and they want to update its parent node so they're
+		 * going to upgrade their intent lock on the parent node to a
+		 * write lock.
+		 *
+		 * But if we're holding a read lock on the parent, and we're
+		 * trying to get the intent lock they're holding, we deadlock.
+		 *
+		 * So to avoid this we drop the read locks on parent nodes when
+		 * we're starting to take intent locks - and handle the race.
+		 *
+		 * The race is that they might be about to free the node we
+		 * want, and dropping our read lock on the parent node lets them
+		 * update the parent marking the node we want as freed, and then
+		 * free it:
+		 *
+		 * To guard against this, btree nodes are evicted from the cache
+		 * when they're freed - and PTR_HASH() is zeroed out, which we
+		 * check for after we lock the node.
+		 *
+		 * Then, bch2_btree_node_relock() on the parent will fail - because
+		 * the parent was modified, when the pointer to the node we want
+		 * was removed - and we'll bail out:
+		 */
+		if (btree_node_read_locked(iter, level + 1))
+			btree_node_unlock(iter, level + 1);
+
+		if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
+			return ERR_PTR(-EINTR);
+
+		/* Did the node get freed/reused while we were waiting? */
+		if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
+			     b->level != level ||
+			     race_fault())) {
+			six_unlock_type(&b->lock, lock_type);
+			if (bch2_btree_node_relock(iter, level + 1))
+				goto retry;
+
+			trace_trans_restart_btree_node_reused(iter->trans->ip);
+			return ERR_PTR(-EINTR);
+		}
+	}
+
+	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+
+	prefetch(b->aux_data);
+
+	for_each_bset(b, t) {
+		void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+		prefetch(p + L1_CACHE_BYTES * 0);
+		prefetch(p + L1_CACHE_BYTES * 1);
+		prefetch(p + L1_CACHE_BYTES * 2);
+	}
+
+	/*
+	 * Mark the node as recently used so the shrinker skips it once;
+	 * avoid atomic set bit if it's not needed:
+	 */
+	if (!btree_node_accessed(b))
+		set_btree_node_accessed(b);
+
+	if (unlikely(btree_node_read_error(b))) {
+		six_unlock_type(&b->lock, lock_type);
+		return ERR_PTR(-EIO);
+	}
+
+	EBUG_ON(b->btree_id != iter->btree_id ||
+		BTREE_NODE_LEVEL(b->data) != level ||
+		bkey_cmp(b->data->max_key, k->k.p));
+
+	return b;
+}
+
+/*
+ * Get and intent-lock the previous or next sibling (@sib) of node @b, found
+ * through the parent node that @iter has at b->level + 1.
+ *
+ * Returns NULL if @iter has no parent node at that level or the parent has
+ * no key in the requested direction, an ERR_PTR (e.g. -EINTR when locks could
+ * not be taken or retaken) on failure, or the sibling node with an intent
+ * lock held.
+ */
+struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
+					  struct btree_iter *iter,
+					  struct btree *b,
+					  enum btree_node_sibling sib)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree *parent;
+	struct btree_node_iter node_iter;
+	struct bkey_packed *k;
+	BKEY_PADDED(k) tmp;
+	struct btree *ret = NULL;
+	unsigned level = b->level;
+
+	parent = btree_iter_node(iter, level + 1);
+	if (!parent)
+		return NULL;
+
+	if (!bch2_btree_node_relock(iter, level + 1)) {
+		ret = ERR_PTR(-EINTR);
+		goto out;
+	}
+
+	/* Work on a copy so the iterator's own position is not disturbed */
+	node_iter = iter->l[parent->level].iter;
+
+	/* The parent iterator must currently point at the key for @b */
+	k = bch2_btree_node_iter_peek_all(&node_iter, parent);
+	BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
+
+	k = sib == btree_prev_sib
+		? bch2_btree_node_iter_prev(&node_iter, parent)
+		: (bch2_btree_node_iter_advance(&node_iter, parent),
+		   bch2_btree_node_iter_peek(&node_iter, parent));
+	if (!k)
+		goto out;
+
+	bch2_bkey_unpack(parent, &tmp.k, k);
+
+	ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+				  SIX_LOCK_intent);
+
+	/* Retry once, after dropping locks that may have made the first try fail */
+	if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
+		struct btree_iter *linked;
+
+		if (!bch2_btree_node_relock(iter, level + 1))
+			goto out;
+
+		/*
+		 * We might have got -EINTR because trylock failed, and we're
+		 * holding other locks that would cause us to deadlock:
+		 */
+		trans_for_each_iter(trans, linked)
+			if (btree_iter_cmp(iter, linked) < 0)
+				__bch2_btree_iter_unlock(linked);
+
+		if (sib == btree_prev_sib)
+			btree_node_unlock(iter, level);
+
+		ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+					  SIX_LOCK_intent);
+
+		/*
+		 * before btree_iter_relock() calls btree_iter_verify_locks():
+		 */
+		if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+			btree_node_unlock(iter, level + 1);
+
+		if (!bch2_btree_node_relock(iter, level)) {
+			btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+			/* Got the sibling but lost @b: give the sibling back */
+			if (!IS_ERR(ret)) {
+				six_unlock_intent(&ret->lock);
+				ret = ERR_PTR(-EINTR);
+			}
+		}
+
+		bch2_trans_relock(trans);
+	}
+out:
+	if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+		btree_node_unlock(iter, level + 1);
+
+	if (PTR_ERR_OR_ZERO(ret) == -EINTR)
+		bch2_btree_iter_upgrade(iter, level + 2);
+
+	BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));
+
+	if (!IS_ERR_OR_NULL(ret)) {
+		struct btree *n1 = ret, *n2 = b;
+
+		/* Order the pair so that n1 is the left node, n2 the right */
+		if (sib != btree_prev_sib)
+			swap(n1, n2);
+
+		/* Siblings must be adjacent: n2 starts right after n1 ends */
+		BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
+						     n1->key.k.p),
+				n2->data->min_key));
+	}
+
+	bch2_btree_trans_verify_locks(trans);
+
+	return ret;
+}
+
+/*
+ * Start an asynchronous read of the node that @k points to, unless it is
+ * already in the cache. The iterator must hold a lock on the parent
+ * (@level + 1). Nothing is returned and the node is not left locked.
+ */
+void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
+			      const struct bkey_i *k, unsigned level)
+{
+	struct btree_cache *bc = &c->btree_cache;
+
+	BUG_ON(!btree_node_locked(iter, level + 1));
+	BUG_ON(level >= BTREE_MAX_DEPTH);
+
+	/* Already cached: nothing to do */
+	if (btree_cache_find(bc, k))
+		return;
+
+	/* sync == false: kick off the read without waiting for it */
+	bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+}
+
+/*
+ * Print a human readable summary of btree node @b to @out: level, key range,
+ * pointers, key format, space usage, sibling sizes, key counts and the
+ * statistics gathered by bch2_btree_keys_stats().
+ */
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+			     struct btree *b)
+{
+	const struct bkey_format *f = &b->format;
+	struct bset_stats stats;
+
+	/* Zero the counters before bch2_btree_keys_stats() fills them in */
+	memset(&stats, 0, sizeof(stats));
+
+	bch2_btree_keys_stats(b, &stats);
+
+	pr_buf(out,
+	       "l %u %llu:%llu - %llu:%llu:\n"
+	       "    ptrs: ",
+	       b->level,
+	       b->data->min_key.inode,
+	       b->data->min_key.offset,
+	       b->data->max_key.inode,
+	       b->data->max_key.offset);
+	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+	/* The arguments below must stay in the order of the format string */
+	pr_buf(out, "\n"
+	       "    format: u64s %u fields %u %u %u %u %u\n"
+	       "    unpack fn len: %u\n"
+	       "    bytes used %zu/%zu (%zu%% full)\n"
+	       "    sib u64s: %u, %u (merge threshold %zu)\n"
+	       "    nr packed keys %u\n"
+	       "    nr unpacked keys %u\n"
+	       "    floats %zu\n"
+	       "    failed unpacked %zu\n"
+	       "    failed prev %zu\n"
+	       "    failed overflow %zu\n",
+	       f->key_u64s,
+	       f->bits_per_field[0],
+	       f->bits_per_field[1],
+	       f->bits_per_field[2],
+	       f->bits_per_field[3],
+	       f->bits_per_field[4],
+	       b->unpack_fn_len,
+	       b->nr.live_u64s * sizeof(u64),
+	       btree_bytes(c) - sizeof(struct btree_node),
+	       b->nr.live_u64s * 100 / btree_max_u64s(c),
+	       b->sib_u64s[0],
+	       b->sib_u64s[1],
+	       BTREE_FOREGROUND_MERGE_THRESHOLD(c),
+	       b->nr.packed_keys,
+	       b->nr.unpacked_keys,
+	       stats.floats,
+	       stats.failed_unpacked,
+	       stats.failed_prev,
+	       stats.failed_overflow);
+}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
new file mode 100644
index 000000000000..c5873c58439c
--- /dev/null
+++ b/fs/bcachefs/btree_cache.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_CACHE_H
+#define _BCACHEFS_BTREE_CACHE_H
+
+#include "bcachefs.h"
+#include "btree_types.h"
+
+struct btree_iter;
+
+extern const char * const bch2_btree_ids[];
+
+void bch2_recalc_btree_reserve(struct bch_fs *);
+
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
+				unsigned, enum btree_id);
+
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+
+struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
+				  const struct bkey_i *, unsigned,
+				  enum six_lock_type);
+
+struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
+				struct btree *, enum btree_node_sibling);
+
+void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
+			      const struct bkey_i *, unsigned);
+
+void bch2_fs_btree_cache_exit(struct bch_fs *);
+int bch2_fs_btree_cache_init(struct bch_fs *);
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
+
+/*
+ * The first 64 bits of a btree pointer key's value, accessed as a u64
+ * (presumably what the btree node cache hashes on - confirm against the
+ * rhashtable parameters).
+ *
+ * The whole expansion is parenthesized so that operators at the use site
+ * cannot regroup the dereference; the result is still an lvalue.
+ */
+#define PTR_HASH(_k)	(*((u64 *) &bkey_i_to_btree_ptr_c(_k)->v))
+
+/*
+ * Is this btree node in the hash table?
+ *
+ * True when the node's key is a btree pointer whose PTR_HASH() value is
+ * nonzero.
+ */
+static inline bool btree_node_hashed(struct btree *b)
+{
+	if (b->key.k.type != KEY_TYPE_btree_ptr)
+		return false;
+
+	return PTR_HASH(&b->key) != 0;
+}
+
+/*
+ * Iterate over every btree node in the btree cache's rhashtable:
+ * @_tbl is set to the current bucket table, @_iter walks its buckets and
+ * @_b/@_pos walk the entries chained off each bucket via the "hash" member.
+ *
+ * This uses the _rcu rhashtable accessors - NOTE(review): callers presumably
+ * have to be inside an RCU read side critical section; confirm at call sites.
+ */
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)		\
+	for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl,	\
+					  &(_c)->btree_cache.table),	\
+	     _iter = 0;	_iter < (_tbl)->size; _iter++)			\
+		rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
+
+/*
+ * Size of a btree node in bytes: the btree_node_size option shifted left by 9
+ * (so the option is presumably a count of 512 byte sectors).
+ */
+static inline size_t btree_bytes(struct bch_fs *c)
+{
+	size_t bytes = c->opts.btree_node_size << 9;
+
+	return bytes;
+}
+
+/*
+ * Number of u64s that fit in a btree node once the struct btree_node header
+ * has been subtracted from the node size.
+ */
+static inline size_t btree_max_u64s(struct bch_fs *c)
+{
+	size_t payload_bytes = btree_bytes(c) - sizeof(struct btree_node);
+
+	return payload_bytes / sizeof(u64);
+}
+
+/* Page allocation order (as computed by get_order()) for one btree node */
+static inline size_t btree_page_order(struct bch_fs *c)
+{
+	size_t order = get_order(btree_bytes(c));
+
+	return order;
+}
+
+/* Number of pages backing one btree node: 2^btree_page_order() */
+static inline size_t btree_pages(struct bch_fs *c)
+{
+	size_t order = btree_page_order(c);
+
+	return 1 << order;
+}
+
+/* Size of a btree node in filesystem blocks: btree_node_size >> block_bits */
+static inline unsigned btree_blocks(struct bch_fs *c)
+{
+	unsigned nr_blocks = c->opts.btree_node_size >> c->block_bits;
+
+	return nr_blocks;
+}
+
+/* Three quarters of the number of blocks in a btree node */
+#define BTREE_SPLIT_THRESHOLD(c)		(btree_blocks(c) * 3 / 4)
+
+/* One third of the u64s that fit in a btree node */
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c)	(btree_max_u64s(c) * 1 / 3)
+/* The merge threshold plus four times the merge threshold (threshold << 2) */
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)			\
+	(BTREE_FOREGROUND_MERGE_THRESHOLD(c) +			\
+	 (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
+
+/* The root node recorded in @_c for the btree that node @_b belongs to */
+#define btree_node_root(_c, _b)	((_c)->btree_roots[(_b)->btree_id].b)
+
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
+			     struct btree *);
+
+#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
new file mode 100644
index 000000000000..c4c2e1a3ee0e
--- /dev/null
+++ b/fs/bcachefs/btree_gc.c
@@ -0,0 +1,1230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright (C) 2014 Datera Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super-io.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <trace/events/bcachefs.h>
+
+/*
+ * Unconditionally set the current gc position to @new_pos. The store is
+ * bracketed by the write side of the gc_pos_lock seqcount.
+ */
+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+	write_seqcount_begin(&c->gc_pos_lock);
+	c->gc_pos = new_pos;
+	write_seqcount_end(&c->gc_pos_lock);
+}
+
+/*
+ * Advance the gc position to @new_pos, which must compare strictly greater
+ * than the current position - anything else is a BUG.
+ */
+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+	int cmp = gc_pos_cmp(new_pos, c->gc_pos);
+
+	BUG_ON(cmp <= 0);
+	__gc_pos_set(c, new_pos);
+}
+
+/* range_checks - for validating min/max pos of each btree node: */
+
+/*
+ * State carried across the nodes visited by one btree walk; see
+ * btree_node_range_checks() for how the fields are updated.
+ */
+struct range_checks {
+	/* Per btree level: the key range accumulated from nodes seen so far */
+	struct range_level {
+		struct bpos	min;
+		struct bpos	max;
+	}			l[BTREE_MAX_DEPTH];
+	/* Nodes at a level above this are also checked against their children */
+	unsigned		depth;
+};
+
+/*
+ * Reset @r for a new btree walk: every level's range starts out as
+ * [POS_MIN, POS_MIN], and @depth is recorded for the parent/child checks.
+ */
+static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
+{
+	struct range_level *l;
+
+	for (l = r->l; l < r->l + ARRAY_SIZE(r->l); l++) {
+		l->max = POS_MIN;
+		l->min = POS_MIN;
+	}
+
+	r->depth = depth;
+}
+
+/*
+ * Check @b's min/max keys against the range seen so far at its level and,
+ * for nodes above r->depth, against the range accumulated at the level below.
+ * Mismatches are reported via bch2_fs_inconsistent_on().
+ */
+static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
+				    struct range_checks *r)
+{
+	struct range_level *l = &r->l[b->level];
+
+	/*
+	 * If min != max at this level, this node is expected to start at the
+	 * successor of the level's max; otherwise at the level's max itself:
+	 */
+	struct bpos expected_min = bkey_cmp(l->min, l->max)
+		? btree_type_successor(b->btree_id, l->max)
+		: l->max;
+
+	bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
+		"btree node has incorrect min key: %llu:%llu != %llu:%llu",
+		b->data->min_key.inode,
+		b->data->min_key.offset,
+		expected_min.inode,
+		expected_min.offset);
+
+	/* This level now extends up to the end of this node */
+	l->max = b->data->max_key;
+
+	if (b->level > r->depth) {
+		/* Compare against the range accumulated one level down */
+		l = &r->l[b->level - 1];
+
+		bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
+			"btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
+			b->data->min_key.inode,
+			b->data->min_key.offset,
+			l->min.inode,
+			l->min.offset);
+
+		bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
+			"btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
+			b->data->max_key.inode,
+			b->data->max_key.offset,
+			l->max.inode,
+			l->max.offset);
+
+		/*
+		 * Restart the child level's range at the successor of this
+		 * node's max key, unless this node already ends at POS_MAX:
+		 */
+		if (bkey_cmp(b->data->max_key, POS_MAX))
+			l->min = l->max =
+				btree_type_successor(b->btree_id,
+						     b->data->max_key);
+	}
+}
+
+/* marking of btree keys/nodes: */
+
+/*
+ * Mark key @k for gc.
+ *
+ * @max_stale:	in/out - raised to the largest ptr_stale() value of @k's
+ *		pointers; the caller must have initialized it
+ * @initial:	when true, additionally update c->key_version, check/mark the
+ *		key's replicas entry and repair bucket gens before marking
+ *
+ * Returns 0 on success, or the error from bch2_mark_bkey_replicas().
+ * NOTE(review): the fsck_err label is presumably also a jump target of the
+ * fsck_err*() macros, which would then return whatever they stored in @ret -
+ * confirm against their definitions.
+ */
+static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
+			    u8 *max_stale, bool initial)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	unsigned flags =
+		BCH_BUCKET_MARK_GC|
+		(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
+	int ret = 0;
+
+	if (initial) {
+		BUG_ON(journal_seq_verify(c) &&
+		       k.k->version.lo > journal_cur_seq(&c->journal));
+
+		/* Track the highest key version seen */
+		if (k.k->version.lo > atomic64_read(&c->key_version))
+			atomic64_set(&c->key_version, k.k->version.lo);
+
+		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+		    fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
+				"superblock not marked as containing replicas (type %u)",
+				k.k->type)) {
+			ret = bch2_mark_bkey_replicas(c, k);
+			if (ret)
+				return ret;
+		}
+
+		bkey_for_each_ptr(ptrs, ptr) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+			/* g and g2: the same bucket, looked up with true/false */
+			struct bucket *g = PTR_BUCKET(ca, ptr, true);
+			struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
+
+			/* Bucket has no valid gen: take the pointer's gen */
+			if (mustfix_fsck_err_on(!g->gen_valid, c,
+					"bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
+					ptr->dev, PTR_BUCKET_NR(ca, ptr),
+					bch2_data_types[ptr_data_type(k.k, ptr)],
+					ptr->gen)) {
+				g2->_mark.gen	= g->_mark.gen		= ptr->gen;
+				g2->gen_valid	= g->gen_valid		= true;
+			}
+
+			/*
+			 * Pointer gen is newer than the bucket gen: take the
+			 * pointer's gen, clear g2's mark and flag that gens
+			 * were fixed:
+			 */
+			if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
+					"bucket %u:%zu data type %s ptr gen in the future: %u > %u",
+					ptr->dev, PTR_BUCKET_NR(ca, ptr),
+					bch2_data_types[ptr_data_type(k.k, ptr)],
+					ptr->gen, g->mark.gen)) {
+				g2->_mark.gen	= g->_mark.gen		= ptr->gen;
+				g2->gen_valid	= g->gen_valid		= true;
+				g2->_mark.data_type		= 0;
+				g2->_mark.dirty_sectors		= 0;
+				g2->_mark.cached_sectors	= 0;
+				set_bit(BCH_FS_FIXED_GENS, &c->flags);
+			}
+		}
+	}
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+		/* Pull the bucket's oldest_gen back to this pointer's gen */
+		if (gen_after(g->oldest_gen, ptr->gen))
+			g->oldest_gen = ptr->gen;
+
+		*max_stale = max(*max_stale, ptr_stale(ca, ptr));
+	}
+
+	bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
+fsck_err:
+	return ret;
+}
+
+/*
+ * Mark every key in btree node @b for gc.
+ *
+ * *@max_stale is reset to 0 and then raised by bch2_gc_mark_key() for each
+ * key. Nodes whose type does not need gc are skipped. Returns 0, or the first
+ * error from bch2_gc_mark_key().
+ */
+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
+			      u8 *max_stale, bool initial)
+{
+	struct btree_node_iter node_iter;
+	struct bkey tmp;
+	struct bkey_s_c k;
+
+	*max_stale = 0;
+
+	if (!btree_node_type_needs_gc(btree_node_type(b)))
+		return 0;
+
+	for_each_btree_node_key_unpack(b, k, &node_iter, &tmp) {
+		int ret;
+
+		bch2_bkey_debugcheck(c, b, k);
+
+		ret = bch2_gc_mark_key(c, k, max_stale, initial);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Run gc over one btree: walk its nodes, range check and mark each one,
+ * optionally rewrite nodes with stale pointers, and finally mark the key of
+ * the btree root.
+ *
+ * Returns 0 on success or an error from marking / bch2_trans_exit().
+ */
+static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
+			 bool initial, bool metadata_only)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct btree *b;
+	struct range_checks r;
+	/*
+	 * Depth passed to the node walk: 1 for metadata only gc or for btrees
+	 * that don't need gc, 0 otherwise (and always 0 with expensive debug
+	 * checks when not metadata only):
+	 */
+	unsigned depth = metadata_only			? 1
+		: expensive_debug_checks(c)		? 0
+		: !btree_node_type_needs_gc(btree_id)	? 1
+		: 0;
+	u8 max_stale = 0;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
+
+	btree_node_range_checks_init(&r, depth);
+
+	__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
+			      0, depth, BTREE_ITER_PREFETCH, b) {
+		btree_node_range_checks(c, b, &r);
+
+		bch2_verify_btree_nr_keys(b);
+
+		gc_pos_set(c, gc_pos_btree_node(b));
+
+		ret = btree_gc_mark_node(c, b, &max_stale, initial);
+		if (ret)
+			break;
+
+		if (!initial) {
+			/*
+			 * Rewrite nodes with stale pointers: above 64 always
+			 * (using the reserve), above 16 - or always, if so
+			 * configured - unless rewrites are disabled:
+			 */
+			if (max_stale > 64)
+				bch2_btree_node_rewrite(c, iter,
+						b->data->keys.seq,
+						BTREE_INSERT_USE_RESERVE|
+						BTREE_INSERT_NOWAIT|
+						BTREE_INSERT_GC_LOCK_HELD);
+			else if (!btree_gc_rewrite_disabled(c) &&
+				 (btree_gc_always_rewrite(c) || max_stale > 16))
+				bch2_btree_node_rewrite(c, iter,
+						b->data->keys.seq,
+						BTREE_INSERT_NOWAIT|
+						BTREE_INSERT_GC_LOCK_HELD);
+		}
+
+		bch2_trans_cond_resched(&trans);
+	}
+	/* An error from bch2_trans_exit() takes precedence over @ret */
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret)
+		return ret;
+
+	/* Mark the root's own key, unless the root is a fake node */
+	mutex_lock(&c->btree_root_lock);
+	b = c->btree_roots[btree_id].b;
+	if (!btree_node_fake(b))
+		ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+				       &max_stale, initial);
+	gc_pos_set(c, gc_pos_btree_root(b->btree_id));
+	mutex_unlock(&c->btree_root_lock);
+
+	return ret;
+}
+
+/*
+ * Comparison function ordering btree ids by their gc phase: negative, zero
+ * or positive as @l's phase is before, equal to or after @r's.
+ */
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+	int l_phase = (int) btree_id_to_gc_phase(l);
+	int r_phase = (int) btree_id_to_gc_phase(r);
+
+	return l_phase - r_phase;
+}
+
+/*
+ * Mark a key from the journal for gc: first mark @insert itself, then walk
+ * the keys of btree @id starting at @insert's start position, calling
+ * bch2_mark_overwrite() on each one with mark_lock held for read. The walk
+ * stops the first time bch2_mark_overwrite() returns 0.
+ *
+ * Returns the error from bch2_trans_exit() if there is one, otherwise the
+ * last value stored in @ret.
+ */
+static int mark_journal_key(struct bch_fs *c, enum btree_id id,
+			    struct bkey_i *insert)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	/*
+	 * bch2_gc_mark_key() folds into *max_stale with max(), i.e. it reads
+	 * the old value - so this has to be initialized even though the
+	 * result is not used here:
+	 */
+	u8 max_stale = 0;
+	int ret = 0;
+
+	ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
+	if (ret)
+		return ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
+			   BTREE_ITER_SLOTS, k, ret) {
+		percpu_down_read(&c->mark_lock);
+		ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
+					 BCH_BUCKET_MARK_GC|
+					 BCH_BUCKET_MARK_NOATOMIC);
+		percpu_up_read(&c->mark_lock);
+
+		if (!ret)
+			break;
+	}
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
+
+/*
+ * Run gc over every btree, in gc phase order.
+ *
+ * After each btree, if @journal_keys was supplied, this is not a metadata
+ * only pass and the btree's type needs gc, the journal keys belonging to
+ * that btree are marked as well. Returns 0 or the first error encountered.
+ */
+static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
+			  bool initial, bool metadata_only)
+{
+	enum btree_id ids[BTREE_ID_NR];
+	unsigned i;
+	int ret;
+
+	/* Visit the btrees sorted by the gc phase they map to */
+	for (i = 0; i < BTREE_ID_NR; i++)
+		ids[i] = i;
+	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		enum btree_id id = ids[i];
+		enum btree_node_type type = __btree_node_type(0, id);
+		struct journal_key *j;
+
+		ret = bch2_gc_btree(c, id, initial, metadata_only);
+		if (ret)
+			return ret;
+
+		if (!journal_keys || metadata_only ||
+		    !btree_node_type_needs_gc(type))
+			continue;
+
+		for_each_journal_key(*journal_keys, j) {
+			if (j->btree_id != id)
+				continue;
+
+			ret = mark_journal_key(c, id, j->k);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Mark the sector range [@start, @end) on device @ca as metadata of type
+ * @type, one bucket at a time: each call to bch2_mark_metadata_bucket()
+ * covers the part of the range that falls inside a single bucket. At least
+ * one bucket is always marked.
+ */
+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
+				  u64 start, u64 end,
+				  enum bch_data_type type,
+				  unsigned flags)
+{
+	u64 bucket = sector_to_bucket(ca, start);
+
+	for (;;) {
+		/* Clamp this chunk to the end of the current bucket */
+		unsigned sectors =
+			min_t(u64, bucket_to_sector(ca, bucket + 1), end) - start;
+
+		bch2_mark_metadata_bucket(c, ca, bucket, type, sectors,
+					  gc_phase(GC_PHASE_SB), flags);
+		bucket++;
+		start += sectors;
+
+		if (start >= end)
+			break;
+	}
+}
+
+/*
+ * Mark the buckets of device @ca that hold superblocks (BCH_DATA_SB) and the
+ * journal (BCH_DATA_JOURNAL), passing @flags through to the marking code.
+ *
+ * @c may be NULL (see below); when it is not, sb_lock must be held and
+ * mark_lock is taken for read around the marking.
+ */
+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+			      unsigned flags)
+{
+	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+	unsigned idx;
+
+	/*
+	 * This conditional is kind of gross, but we may be called from the
+	 * device add path, before the new device has actually been added to the
+	 * running filesystem:
+	 */
+	if (c) {
+		lockdep_assert_held(&c->sb_lock);
+		percpu_down_read(&c->mark_lock);
+	}
+
+	for (idx = 0; idx < layout->nr_superblocks; idx++) {
+		u64 sb_start	= le64_to_cpu(layout->sb_offset[idx]);
+		u64 sb_end	= sb_start + (1 << layout->sb_max_size_bits);
+
+		/* The sectors in front of the superblock at BCH_SB_SECTOR */
+		if (sb_start == BCH_SB_SECTOR)
+			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
+					      BCH_DATA_SB, flags);
+
+		mark_metadata_sectors(c, ca, sb_start, sb_end,
+				      BCH_DATA_SB, flags);
+	}
+
+	for (idx = 0; idx < ca->journal.nr; idx++)
+		bch2_mark_metadata_bucket(c, ca, ca->journal.buckets[idx],
+					  BCH_DATA_JOURNAL,
+					  ca->mi.bucket_size,
+					  gc_phase(GC_PHASE_SB), flags);
+
+	if (c)
+		percpu_up_read(&c->mark_lock);
+}
+
+/*
+ * gc phase GC_PHASE_SB: with sb_lock held, call bch2_mark_dev_superblock()
+ * with BCH_BUCKET_MARK_GC for every online member device.
+ */
+static void bch2_mark_superblocks(struct bch_fs *c)
+{
+	struct bch_dev *dev;
+	unsigned dev_idx;
+
+	mutex_lock(&c->sb_lock);
+	gc_pos_set(c, gc_phase(GC_PHASE_SB));
+
+	for_each_online_member(dev, c, dev_idx) {
+		bch2_mark_dev_superblock(c, dev, BCH_BUCKET_MARK_GC);
+	}
+
+	mutex_unlock(&c->sb_lock);
+}
+
+/*
+ * gc phase GC_PHASE_PENDING_DELETE: with btree_interior_update_lock held,
+ * mark the key of every pending btree node free whose index update is done.
+ *
+ * Also see bch2_pending_btree_node_free_insert_done()
+ */
+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
+{
+	struct btree_update *update;
+	struct pending_btree_node_free *pending;
+
+	mutex_lock(&c->btree_interior_update_lock);
+	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
+
+	for_each_pending_btree_node_free(c, update, pending) {
+		if (!pending->index_update_done)
+			continue;
+
+		bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+			      0, 0, NULL, 0,
+			      BCH_BUCKET_MARK_GC);
+	}
+
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * Mark, with BCH_BUCKET_MARK_GC, the buckets currently held by the allocator:
+ * every entry on each device's free_inc and free[] fifos (under
+ * freelist_lock), then the bucket of every valid open_bucket (each under its
+ * own lock). mark_lock is held for read throughout.
+ */
+static void bch2_mark_allocator_buckets(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	struct open_bucket *ob;
+	size_t i, j, iter;
+	unsigned ci;
+
+	percpu_down_read(&c->mark_lock);
+
+	spin_lock(&c->freelist_lock);
+	gc_pos_set(c, gc_pos_alloc(c, NULL));
+
+	for_each_member_device(ca, c, ci) {
+		fifo_for_each_entry(i, &ca->free_inc, iter)
+			bch2_mark_alloc_bucket(c, ca, i, true,
+					       gc_pos_alloc(c, NULL),
+					       BCH_BUCKET_MARK_GC);
+
+
+
+		/* One free fifo per reserve type */
+		for (j = 0; j < RESERVE_NR; j++)
+			fifo_for_each_entry(i, &ca->free[j], iter)
+				bch2_mark_alloc_bucket(c, ca, i, true,
+						       gc_pos_alloc(c, NULL),
+						       BCH_BUCKET_MARK_GC);
+	}
+
+	spin_unlock(&c->freelist_lock);
+
+	/* The gc position is advanced per open bucket as it is marked */
+	for (ob = c->open_buckets;
+	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+	     ob++) {
+		spin_lock(&ob->lock);
+		if (ob->valid) {
+			gc_pos_set(c, gc_pos_alloc(c, ob));
+			ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+			bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
+					       gc_pos_alloc(c, ob),
+					       BCH_BUCKET_MARK_GC);
+		}
+		spin_unlock(&ob->lock);
+	}
+
+	percpu_up_read(&c->mark_lock);
+}
+
+/*
+ * Free the state allocated for a gc pass and reset the pointers to NULL:
+ * the stripes[1] radix tree, each member device's buckets[1] array and
+ * usage[1] counters, and the filesystem wide usage_gc counters.
+ */
+static void bch2_gc_free(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_idx;
+
+	genradix_free(&c->stripes[1]);
+
+	for_each_member_device(ca, c, dev_idx) {
+		struct bucket_array *gc_buckets =
+			rcu_dereference_protected(ca->buckets[1], 1);
+		/* Must match the size computed when the array was allocated */
+		size_t bytes = sizeof(struct bucket_array) +
+			ca->mi.nbuckets * sizeof(struct bucket);
+
+		kvpfree(gc_buckets, bytes);
+		ca->buckets[1] = NULL;
+
+		free_percpu(ca->usage[1]);
+		ca->usage[1] = NULL;
+	}
+
+	free_percpu(c->usage_gc);
+	c->usage_gc = NULL;
+}
+
+/*
+ * Finish a gc pass: copy the values gc computed (stripes[1], buckets[1],
+ * usage_gc) over the live ones (stripes[0], buckets[0], usage_base), and -
+ * when @verify is set - report every mismatch via fsck_err().
+ *
+ * Mismatches are only reported when this is not a metadata only pass and
+ * either this is not the initial gc, or the superblock has the
+ * BCH_COMPAT_FEAT_ALLOC_INFO compat bit set.
+ *
+ * NOTE(review): fsck_err() presumably may set @ret and jump to the fsck_err
+ * label at the end of the function - confirm against its definition.
+ */
+static int bch2_gc_done(struct bch_fs *c,
+			bool initial, bool metadata_only)
+{
+	struct bch_dev *ca;
+	bool verify = !metadata_only &&
+		(!initial ||
+		 (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
+	unsigned i;
+	int ret = 0;
+
+	/*
+	 * The copy_*_field() helpers below all follow the same pattern: if
+	 * the destination differs from the source, optionally complain, then
+	 * overwrite the destination. They rely on the local variables @dst
+	 * and @src (and @i, @b, @dst_iter where used) of the enclosing scope.
+	 */
+#define copy_field(_f, _msg, ...)					\
+	if (dst->_f != src->_f) {					\
+		if (verify)						\
+			fsck_err(c, _msg ": got %llu, should be %llu"	\
+				, ##__VA_ARGS__, dst->_f, src->_f);	\
+		dst->_f = src->_f;					\
+	}
+#define copy_stripe_field(_f, _msg, ...)				\
+	if (dst->_f != src->_f) {					\
+		if (verify)						\
+			fsck_err(c, "stripe %zu has wrong "_msg		\
+				": got %u, should be %u",		\
+				dst_iter.pos, ##__VA_ARGS__,		\
+				dst->_f, src->_f);			\
+		dst->_f = src->_f;					\
+		dst->dirty = true;					\
+	}
+#define copy_bucket_field(_f)						\
+	if (dst->b[b].mark._f != src->b[b].mark._f) {			\
+		if (verify)						\
+			fsck_err(c, "dev %u bucket %zu has wrong " #_f	\
+				": got %u, should be %u", i, b,		\
+				dst->b[b].mark._f, src->b[b].mark._f);	\
+		dst->b[b]._mark._f = src->b[b].mark._f;			\
+	}
+#define copy_dev_field(_f, _msg, ...)					\
+	copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
+#define copy_fs_field(_f, _msg, ...)					\
+	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+	/* Stripes: walk both radix trees in lockstep, rebuilding the heap */
+	if (!metadata_only) {
+		struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
+		struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
+		struct stripe *dst, *src;
+		unsigned i;
+
+		c->ec_stripes_heap.used = 0;
+
+		while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
+		       (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
+			BUG_ON(src_iter.pos != dst_iter.pos);
+
+			copy_stripe_field(alive,	"alive");
+			copy_stripe_field(sectors,	"sectors");
+			copy_stripe_field(algorithm,	"algorithm");
+			copy_stripe_field(nr_blocks,	"nr_blocks");
+			copy_stripe_field(nr_redundant,	"nr_redundant");
+			copy_stripe_field(blocks_nonempty,
+					  "blocks_nonempty");
+
+			for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
+				copy_stripe_field(block_sectors[i],
+						  "block_sectors[%u]", i);
+
+			if (dst->alive)
+				bch2_stripes_heap_insert(c, dst, dst_iter.pos);
+
+			genradix_iter_advance(&dst_iter, &c->stripes[0]);
+			genradix_iter_advance(&src_iter, &c->stripes[1]);
+		}
+	}
+
+	/* Buckets: copy the mark fields and oldest_gen for every bucket */
+	for_each_member_device(ca, c, i) {
+		struct bucket_array *dst = __bucket_array(ca, 0);
+		struct bucket_array *src = __bucket_array(ca, 1);
+		size_t b;
+
+		for (b = 0; b < src->nbuckets; b++) {
+			copy_bucket_field(gen);
+			copy_bucket_field(data_type);
+			copy_bucket_field(owned_by_allocator);
+			copy_bucket_field(stripe);
+			copy_bucket_field(dirty_sectors);
+			copy_bucket_field(cached_sectors);
+
+			dst->b[b].oldest_gen = src->b[b].oldest_gen;
+		}
+	};
+
+	bch2_fs_usage_acc_to_base(c, 0);
+	bch2_fs_usage_acc_to_base(c, 1);
+
+	bch2_dev_usage_from_buckets(c);
+
+	/* Filesystem usage: compare usage_base against the summed usage_gc */
+	{
+		unsigned nr = fs_usage_u64s(c);
+		struct bch_fs_usage *dst = c->usage_base;
+		struct bch_fs_usage *src = (void *)
+			bch2_acc_percpu_u64s((void *) c->usage_gc, nr);
+
+		copy_fs_field(hidden,		"hidden");
+		copy_fs_field(btree,		"btree");
+
+		if (!metadata_only) {
+			copy_fs_field(data,	"data");
+			copy_fs_field(cached,	"cached");
+			copy_fs_field(reserved,	"reserved");
+			copy_fs_field(nr_inodes,"nr_inodes");
+
+			for (i = 0; i < BCH_REPLICAS_MAX; i++)
+				copy_fs_field(persistent_reserved[i],
+					      "persistent_reserved[%i]", i);
+		}
+
+		for (i = 0; i < c->replicas.nr; i++) {
+			struct bch_replicas_entry *e =
+				cpu_replicas_entry(&c->replicas, i);
+			char buf[80];
+
+			/* A metadata only pass skips user and cached data */
+			if (metadata_only &&
+			    (e->data_type == BCH_DATA_USER ||
+			     e->data_type == BCH_DATA_CACHED))
+				continue;
+
+			bch2_replicas_entry_to_text(&PBUF(buf), e);
+
+			copy_fs_field(replicas[i], "%s", buf);
+		}
+	}
+
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_bucket_field
+#undef copy_stripe_field
+#undef copy_field
+fsck_err:
+	return ret;
+}
+
+/*
+ * Allocate and initialize the state for a gc pass: the usage_gc counters,
+ * and for every member device a second bucket array (buckets[1]) and usage
+ * counters (usage[1]). The new bucket arrays are then seeded from the live
+ * ones under mark_lock.
+ *
+ * Returns 0, -ENOMEM, or the error from bch2_ec_mem_alloc(). On failure,
+ * whatever was already allocated is left in place - bch2_gc() releases it
+ * through bch2_gc_free().
+ */
+static int bch2_gc_start(struct bch_fs *c,
+			 bool metadata_only)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	int ret;
+
+	BUG_ON(c->usage_gc);
+
+	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+					 sizeof(u64), GFP_KERNEL);
+	if (!c->usage_gc)
+		return -ENOMEM;
+
+	for_each_member_device(ca, c, i) {
+		BUG_ON(ca->buckets[1]);
+		BUG_ON(ca->usage[1]);
+
+		ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
+				ca->mi.nbuckets * sizeof(struct bucket),
+				GFP_KERNEL|__GFP_ZERO);
+		if (!ca->buckets[1]) {
+			/* leaving the iteration early: drop the device ref */
+			percpu_ref_put(&ca->ref);
+			return -ENOMEM;
+		}
+
+		ca->usage[1] = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage[1]) {
+			percpu_ref_put(&ca->ref);
+			return -ENOMEM;
+		}
+	}
+
+	ret = bch2_ec_mem_alloc(c, true);
+	if (ret)
+		return ret;
+
+	percpu_down_write(&c->mark_lock);
+
+	/*
+	 * indicate to stripe code that we need to allocate for the gc stripes
+	 * radix tree, too
+	 */
+	gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+	for_each_member_device(ca, c, i) {
+		struct bucket_array *dst = __bucket_array(ca, 1);
+		struct bucket_array *src = __bucket_array(ca, 0);
+		size_t b;
+
+		dst->first_bucket	= src->first_bucket;
+		dst->nbuckets		= src->nbuckets;
+
+		for (b = 0; b < src->nbuckets; b++) {
+			struct bucket *d = &dst->b[b];
+			struct bucket *s = &src->b[b];
+
+			/* Start from the live gen; oldest_gen starts there too */
+			d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
+			d->gen_valid = s->gen_valid;
+
+			/*
+			 * A metadata only pass won't recompute user/cached
+			 * buckets, so carry their marks over (minus the
+			 * owned_by_allocator bit):
+			 */
+			if (metadata_only &&
+			    (s->mark.data_type == BCH_DATA_USER ||
+			     s->mark.data_type == BCH_DATA_CACHED)) {
+				d->_mark = s->mark;
+				d->_mark.owned_by_allocator = 0;
+			}
+		}
+	};
+
+	percpu_up_write(&c->mark_lock);
+
+	return 0;
+}
+
+/**
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * Order matters here:
+ *  - Concurrent GC relies on the fact that we have a total ordering for
+ *    everything that GC walks - see gc_will_visit_node(),
+ *    gc_will_visit_root()
+ *
+ *  - also, references move around in the course of index updates and
+ *    various other crap: everything needs to agree on the ordering
+ *    references are allowed to move around in - e.g., we're allowed to
+ *    start with a reference owned by an open_bucket (the allocator) and
+ *    move it to the btree, but not the reverse.
+ *
+ *    This is necessary to ensure that gc doesn't miss references that
+ *    move around - if references move backwards in the ordering GC
+ *    uses, GC could skip past them
+ *
+ * @journal_keys, @initial and @metadata_only are passed through to the
+ * individual gc steps. Runs with gc_lock held for write. Returns 0 on
+ * success, a negative error from one of the steps, or -EINVAL if bucket
+ * gens were still being fixed after the allowed number of restarts.
+ */
+int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
+	    bool initial, bool metadata_only)
+{
+	struct bch_dev *ca;
+	u64 start_time = local_clock();
+	unsigned i, iter = 0;
+	int ret;
+
+	trace_gc_start(c);
+
+	down_write(&c->gc_lock);
+again:
+	ret = bch2_gc_start(c, metadata_only);
+	if (ret)
+		goto out;
+
+	bch2_mark_superblocks(c);
+
+	ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
+	if (ret)
+		goto out;
+
+	bch2_mark_pending_btree_node_frees(c);
+	bch2_mark_allocator_buckets(c);
+
+	c->gc_count++;
+out:
+	/*
+	 * Restart the whole pass if gens were fixed along the way (or, on the
+	 * first iteration only, if test_restart_gc() asks for it):
+	 */
+	if (!ret &&
+	    (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
+	     (!iter && test_restart_gc(c)))) {
+		/*
+		 * XXX: make sure gens we fixed got saved
+		 */
+		if (iter++ <= 2) {
+			bch_info(c, "Fixed gens, restarting mark and sweep:");
+			clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+			__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+			/* Drop this pass's state; bch2_gc_start() reallocates */
+			percpu_down_write(&c->mark_lock);
+			bch2_gc_free(c);
+			percpu_up_write(&c->mark_lock);
+			/* flush fsck errors, reset counters */
+			bch2_flush_fsck_errs(c);
+
+			goto again;
+		}
+
+		bch_info(c, "Unable to fix bucket gens, looping");
+		ret = -EINVAL;
+	}
+
+	/* Both branches leave mark_lock held for write */
+	if (!ret) {
+		bch2_journal_block(&c->journal);
+
+		percpu_down_write(&c->mark_lock);
+		ret = bch2_gc_done(c, initial, metadata_only);
+
+		bch2_journal_unblock(&c->journal);
+	} else {
+		percpu_down_write(&c->mark_lock);
+	}
+
+	/* Indicates that gc is no longer in progress: */
+	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+	bch2_gc_free(c);
+	percpu_up_write(&c->mark_lock);
+
+	up_write(&c->gc_lock);
+
+	trace_gc_end(c);
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+
+	/*
+	 * Wake up allocator in case it was waiting for buckets
+	 * because of not being able to inc gens
+	 */
+	for_each_member_device(ca, c, i)
+		bch2_wake_allocator(ca);
+
+	/*
+	 * At startup, allocations can happen directly instead of via the
+	 * allocator thread - issue wakeup in case they blocked on gc_lock:
+	 */
+	closure_wake_up(&c->freelist_wait);
+	return ret;
+}
+
+/* Btree coalescing */
+
+/*
+ * Recompute b->nr from scratch by accounting every key in the node's first
+ * bset. The node must consist of exactly one bset.
+ */
+static void recalc_packed_keys(struct btree *b)
+{
+	struct bset *first = btree_bset_first(b);
+	struct bkey_packed *key;
+
+	memset(&b->nr, 0, sizeof(b->nr));
+
+	BUG_ON(b->nsets != 1);
+
+	vstruct_for_each(first, key) {
+		btree_keys_account_key_add(&b->nr, 0, key);
+	}
+}
+
+/*
+ * Try to coalesce the sibling nodes in @old_nodes (a NULL terminated window
+ * of at most GC_MERGE_NODES nodes) into fewer nodes.
+ *
+ * Gives up without changing anything if the keys would not fit in one fewer
+ * node at 2/3 occupancy, if a common key format does not fit, or if the
+ * keylist or btree reserve cannot be allocated. Otherwise replacement nodes
+ * are allocated, keys are shifted between them, the parent is updated and
+ * the old nodes are freed. On return @old_nodes[0] is the first new node and
+ * all other slots that were in use are NULL.
+ */
+static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
+				struct btree *old_nodes[GC_MERGE_NODES])
+{
+	struct btree *parent = btree_node_parent(iter, old_nodes[0]);
+	unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
+	/* Target occupancy: two thirds of a btree node, in blocks */
+	unsigned blocks = btree_blocks(c) * 2 / 3;
+	struct btree *new_nodes[GC_MERGE_NODES];
+	struct btree_update *as;
+	struct keylist keylist;
+	struct bkey_format_state format_state;
+	struct bkey_format new_format;
+
+	memset(new_nodes, 0, sizeof(new_nodes));
+	bch2_keylist_init(&keylist, NULL);
+
+	/* Count keys that are not deleted */
+	for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
+		u64s += old_nodes[i]->nr.live_u64s;
+
+	nr_old_nodes = nr_new_nodes = i;
+
+	/* Check if all keys in @old_nodes could fit in one fewer node */
+	if (nr_old_nodes <= 1 ||
+	    __vstruct_blocks(struct btree_node, c->block_bits,
+			     DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
+		return;
+
+	/* Find a format that all keys in @old_nodes can pack into */
+	bch2_bkey_format_init(&format_state);
+
+	for (i = 0; i < nr_old_nodes; i++)
+		__bch2_btree_calc_format(&format_state, old_nodes[i]);
+
+	new_format = bch2_bkey_format_done(&format_state);
+
+	/* Check if repacking would make any nodes too big to fit */
+	for (i = 0; i < nr_old_nodes; i++)
+		if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
+			trace_btree_gc_coalesce_fail(c,
+					BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
+			return;
+		}
+
+	if (bch2_keylist_realloc(&keylist, NULL, 0,
+			(BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+		trace_btree_gc_coalesce_fail(c,
+				BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
+		return;
+	}
+
+	as = bch2_btree_update_start(c, iter->btree_id,
+			btree_update_reserve_required(c, parent) + nr_old_nodes,
+			BTREE_INSERT_NOFAIL|
+			BTREE_INSERT_USE_RESERVE,
+			NULL);
+	if (IS_ERR(as)) {
+		trace_btree_gc_coalesce_fail(c,
+				BTREE_GC_COALESCE_FAIL_RESERVE_GET);
+		bch2_keylist_free(&keylist, NULL);
+		return;
+	}
+
+	trace_btree_gc_coalesce(c, old_nodes[0]);
+
+	for (i = 0; i < nr_old_nodes; i++)
+		bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
+
+	/* Repack everything with @new_format and sort down to one bset */
+	for (i = 0; i < nr_old_nodes; i++)
+		new_nodes[i] =
+			__bch2_btree_node_alloc_replacement(as, old_nodes[i],
+							    new_format);
+
+	/*
+	 * Conceptually we concatenate the nodes together and slice them
+	 * up at different boundaries.
+	 */
+	for (i = nr_new_nodes - 1; i > 0; --i) {
+		struct btree *n1 = new_nodes[i];
+		struct btree *n2 = new_nodes[i - 1];
+
+		struct bset *s1 = btree_bset_first(n1);
+		struct bset *s2 = btree_bset_first(n2);
+		struct bkey_packed *k, *last = NULL;
+
+		/* Calculate how many keys from @n2 we could fit inside @n1 */
+		u64s = 0;
+
+		for (k = s2->start;
+		     k < vstruct_last(s2) &&
+		     vstruct_blocks_plus(n1->data, c->block_bits,
+					 u64s + k->u64s) <= blocks;
+		     k = bkey_next(k)) {
+			last = k;
+			u64s += k->u64s;
+		}
+
+		if (u64s == le16_to_cpu(s2->u64s)) {
+			/* n2 fits entirely in n1 */
+			n1->key.k.p = n1->data->max_key = n2->data->max_key;
+
+			memcpy_u64s(vstruct_last(s1),
+				    s2->start,
+				    le16_to_cpu(s2->u64s));
+			le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
+
+			set_btree_bset_end(n1, n1->set);
+
+			six_unlock_write(&n2->lock);
+			bch2_btree_node_free_never_inserted(c, n2);
+			six_unlock_intent(&n2->lock);
+
+			/* Close the gap n2 left in new_nodes[] */
+			memmove(new_nodes + i - 1,
+				new_nodes + i,
+				sizeof(new_nodes[0]) * (nr_new_nodes - i));
+			new_nodes[--nr_new_nodes] = NULL;
+		} else if (u64s) {
+			/* move part of n2 into n1 */
+			n1->key.k.p = n1->data->max_key =
+				bkey_unpack_pos(n1, last);
+
+			n2->data->min_key =
+				btree_type_successor(iter->btree_id,
+						     n1->data->max_key);
+
+			memcpy_u64s(vstruct_last(s1),
+				    s2->start, u64s);
+			le16_add_cpu(&s1->u64s, u64s);
+
+			/* Shift n2's remaining keys down to the start */
+			memmove(s2->start,
+				vstruct_idx(s2, u64s),
+				(le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
+			s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
+
+			set_btree_bset_end(n1, n1->set);
+			set_btree_bset_end(n2, n2->set);
+		}
+	}
+
+	/* Fix up accounting and aux trees, then write out each new node */
+	for (i = 0; i < nr_new_nodes; i++) {
+		struct btree *n = new_nodes[i];
+
+		recalc_packed_keys(n);
+		btree_node_reset_sib_u64s(n);
+
+		bch2_btree_build_aux_trees(n);
+		six_unlock_write(&n->lock);
+
+		bch2_btree_node_write(c, n, SIX_LOCK_intent);
+	}
+
+	/*
+	 * The keys for the old nodes get deleted. We don't want to insert keys
+	 * that compare equal to the keys for the new nodes we'll also be
+	 * inserting - we can't because keys on a keylist must be strictly
+	 * greater than the previous keys, and we also don't need to since the
+	 * key for the new node will serve the same purpose (overwriting the key
+	 * for the old node).
+	 */
+	for (i = 0; i < nr_old_nodes; i++) {
+		struct bkey_i delete;
+		unsigned j;
+
+		for (j = 0; j < nr_new_nodes; j++)
+			if (!bkey_cmp(old_nodes[i]->key.k.p,
+				      new_nodes[j]->key.k.p))
+				goto next;
+
+		bkey_init(&delete.k);
+		delete.k.p = old_nodes[i]->key.k.p;
+		bch2_keylist_add_in_order(&keylist, &delete);
+next:
+		/* no-op: a label must be followed by a statement */
+		i = i;
+	}
+
+	/*
+	 * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
+	 * does the lookup once and thus expects the keys to be in sorted order
+	 * so we have to make sure the new keys are correctly ordered with
+	 * respect to the deleted keys added in the previous loop
+	 */
+	for (i = 0; i < nr_new_nodes; i++)
+		bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
+
+	/* Insert the newly coalesced nodes */
+	bch2_btree_insert_node(as, parent, iter, &keylist, 0);
+
+	BUG_ON(!bch2_keylist_empty(&keylist));
+
+	BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
+
+	bch2_btree_iter_node_replace(iter, new_nodes[0]);
+
+	for (i = 0; i < nr_new_nodes; i++)
+		bch2_open_buckets_put(c, &new_nodes[i]->ob);
+
+	/* Free the old nodes and update our sliding window */
+	for (i = 0; i < nr_old_nodes; i++) {
+		bch2_btree_node_free_inmem(c, old_nodes[i], iter);
+
+		/*
+		 * the index update might have triggered a split, in which case
+		 * the nodes we coalesced - the new nodes we just created -
+		 * might not be sibling nodes anymore - don't add them to the
+		 * sliding window (except the first):
+		 */
+		if (!i) {
+			old_nodes[i] = new_nodes[i];
+		} else {
+			old_nodes[i] = NULL;
+		}
+	}
+
+	for (i = 0; i < nr_new_nodes; i++)
+		six_unlock_intent(&new_nodes[i]->lock);
+
+	bch2_btree_update_done(as);
+	bch2_keylist_free(&keylist, NULL);
+}
+
+/*
+ * Walk the nodes of btree @btree_id, keeping a sliding window of the last
+ * GC_MERGE_NODES nodes visited and handing it to bch2_coalesce_nodes() at
+ * every step.
+ *
+ * Returns the result of bch2_trans_exit(), or -ESHUTDOWN if running in a
+ * kthread that has been asked to stop.
+ */
+static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct btree *b;
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	unsigned i;
+
+	/* Sliding window of adjacent btree nodes */
+	struct btree *merge[GC_MERGE_NODES];
+	/* Lock sequence numbers saved when merge[i] was last unlocked */
+	u32 lock_seq[GC_MERGE_NODES];
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	/*
+	 * XXX: We don't have a good way of positively matching on sibling nodes
+	 * that have the same parent - this code works by handling the cases
+	 * where they might not have the same parent, and is thus fragile. Ugh.
+	 *
+	 * Perhaps redo this to use multiple linked iterators?
+	 */
+	memset(merge, 0, sizeof(merge));
+
+	__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
+			      BTREE_MAX_DEPTH, 0,
+			      BTREE_ITER_PREFETCH, b) {
+		/* Slide the window by one and put the new node in front */
+		memmove(merge + 1, merge,
+			sizeof(merge) - sizeof(merge[0]));
+		memmove(lock_seq + 1, lock_seq,
+			sizeof(lock_seq) - sizeof(lock_seq[0]));
+
+		merge[0] = b;
+
+		/*
+		 * Relock the older nodes; the window is cut at the first one
+		 * that is missing, fails to relock, or is at another level:
+		 */
+		for (i = 1; i < GC_MERGE_NODES; i++) {
+			if (!merge[i] ||
+			    !six_relock_intent(&merge[i]->lock, lock_seq[i]))
+				break;
+
+			if (merge[i]->level != merge[0]->level) {
+				six_unlock_intent(&merge[i]->lock);
+				break;
+			}
+		}
+		memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
+
+		bch2_coalesce_nodes(c, iter, merge);
+
+		/* Unlock the older nodes again, remembering their lock seq */
+		for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
+			lock_seq[i] = merge[i]->lock.state.seq;
+			six_unlock_intent(&merge[i]->lock);
+		}
+
+		lock_seq[0] = merge[0]->lock.state.seq;
+
+		if (kthread && kthread_should_stop()) {
+			bch2_trans_exit(&trans);
+			return -ESHUTDOWN;
+		}
+
+		bch2_trans_cond_resched(&trans);
+
+		/*
+		 * If the parent node wasn't relocked, it might have been split
+		 * and the nodes in our sliding window might not have the same
+		 * parent anymore - blow away the sliding window:
+		 */
+		if (btree_iter_node(iter, iter->level + 1) &&
+		    !btree_node_intent_locked(iter, iter->level + 1))
+			memset(merge + 1, 0,
+			       (GC_MERGE_NODES - 1) * sizeof(merge[0]));
+	}
+	return bch2_trans_exit(&trans);
+}
+
+/**
+ * bch2_coalesce - coalesce adjacent nodes with low occupancy
+ *
+ * Runs bch2_coalesce_btree() on every btree that has a root, with gc_lock
+ * held for read. Stops at the first btree that returns an error; the error
+ * is logged unless it is -ESHUTDOWN. gc_lock is released on every path.
+ */
+void bch2_coalesce(struct bch_fs *c)
+{
+	enum btree_id id;
+
+	down_read(&c->gc_lock);
+	trace_gc_coalesce_start(c);
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		int ret = c->btree_roots[id].b
+			? bch2_coalesce_btree(c, id)
+			: 0;
+
+		if (ret) {
+			if (ret != -ESHUTDOWN)
+				bch_err(c, "btree coalescing failed: %d", ret);
+			/* bail out, but never with gc_lock still held */
+			goto out_unlock;
+		}
+	}
+
+	trace_gc_coalesce_end(c);
+out_unlock:
+	up_read(&c->gc_lock);
+}
+
+/*
+ * Main loop of the gc kthread: sleep until kicked (c->kick_gc changes) or,
+ * when periodic gc is enabled, until the write io clock has advanced by
+ * capacity / 16 since the last run; then run a full bch2_gc() pass.
+ *
+ * Returns 0 once kthread_should_stop() is true.
+ */
+static int bch2_gc_thread(void *arg)
+{
+	struct bch_fs *c = arg;
+	struct io_clock *clock = &c->io_clock[WRITE];
+	unsigned long last = atomic_long_read(&clock->now);
+	unsigned last_kick = atomic_read(&c->kick_gc);
+	int ret;
+
+	set_freezable();
+
+	while (1) {
+		/* Wait for a reason to run gc */
+		while (1) {
+			/*
+			 * Set the task state before checking the wakeup
+			 * conditions below:
+			 */
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			if (kthread_should_stop()) {
+				__set_current_state(TASK_RUNNING);
+				return 0;
+			}
+
+			if (atomic_read(&c->kick_gc) != last_kick)
+				break;
+
+			if (c->btree_gc_periodic) {
+				unsigned long next = last + c->capacity / 16;
+
+				if (atomic_long_read(&clock->now) >= next)
+					break;
+
+				bch2_io_clock_schedule_timeout(clock, next);
+			} else {
+				schedule();
+			}
+
+			try_to_freeze();
+		}
+		__set_current_state(TASK_RUNNING);
+
+		/* Remember where we were when this run started */
+		last = atomic_long_read(&clock->now);
+		last_kick = atomic_read(&c->kick_gc);
+
+		ret = bch2_gc(c, NULL, false, false);
+		if (ret)
+			bch_err(c, "btree gc failed: %i", ret);
+
+		debug_check_no_locks_held();
+	}
+
+	/* not reached: the loop above only exits via return */
+	return 0;
+}
+
+/*
+ * Stop the gc thread, if there is one: c->gc_thread is cleared first, then
+ * the thread is stopped and the task reference taken by
+ * bch2_gc_thread_start() is dropped.
+ */
+void bch2_gc_thread_stop(struct bch_fs *c)
+{
+	struct task_struct *thread = c->gc_thread;
+
+	c->gc_thread = NULL;
+
+	if (!thread)
+		return;
+
+	kthread_stop(thread);
+	put_task_struct(thread);
+}
+
+/*
+ * Create and start the gc thread ("bch_gc"). It is a BUG to call this while
+ * c->gc_thread is already set.
+ *
+ * A reference on the task is taken before it is published in c->gc_thread
+ * and woken up. Returns 0 on success or the error from kthread_create().
+ */
+int bch2_gc_thread_start(struct bch_fs *c)
+{
+	struct task_struct *thread;
+
+	BUG_ON(c->gc_thread);
+
+	thread = kthread_create(bch2_gc_thread, c, "bch_gc");
+	if (IS_ERR(thread))
+		return PTR_ERR(thread);
+
+	get_task_struct(thread);
+	c->gc_thread = thread;
+	wake_up_process(thread);
+
+	return 0;
+}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
new file mode 100644
index 000000000000..bd5f2752954f
--- /dev/null
+++ b/fs/bcachefs/btree_gc.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_H
+#define _BCACHEFS_BTREE_GC_H
+
+#include "btree_types.h"
+
+void bch2_coalesce(struct bch_fs *);
+
+struct journal_keys;
+int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
+void bch2_gc_thread_stop(struct bch_fs *);
+int bch2_gc_thread_start(struct bch_fs *);
+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
+
+/*
+ * For concurrent mark and sweep (with other index updates), we define a total
+ * ordering of _all_ references GC walks:
+ *
+ * Note that some references will have the same GC position as others - e.g.
+ * everything within the same btree node; in those cases we're relying on
+ * whatever locking exists for where those references live, i.e. the write lock
+ * on a btree node.
+ *
+ * That locking is also required to ensure GC doesn't pass the updater in
+ * between the updater adding/removing the reference and updating the GC marks;
+ * without that, we would at best double count sometimes.
+ *
+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
+ * be held that prevents GC from passing the position the updater is at.
+ *
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
+ * position inside its cmpxchg loop, so crap magically works).
+ */
+
+/* GC position at the very start of @phase (POS_MIN, level 0): */
+static inline struct gc_pos gc_phase(enum gc_phase phase)
+{
+	struct gc_pos start = {
+		.phase	= phase,
+		.pos	= POS_MIN,
+		.level	= 0,
+	};
+
+	return start;
+}
+
+/*
+ * Three-way compare of two gc positions: ordered by phase, then by key
+ * position, then by level. Returns <0, 0 or >0.
+ */
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+	int pos_cmp;
+
+	if (l.phase < r.phase)
+		return -1;
+	if (l.phase > r.phase)
+		return 1;
+
+	pos_cmp = bkey_cmp(l.pos, r.pos);
+	if (pos_cmp)
+		return pos_cmp;
+
+	if (l.level < r.level)
+		return -1;
+	if (l.level > r.level)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Map a btree ID to its GC_PHASE_BTREE_* value; BUG()s on an ID that is not
+ * listed in BCH_BTREE_IDS().
+ */
+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
+{
+	switch (id) {
+#define x(name, nr, str)					\
+	case BTREE_ID_##name:					\
+		return GC_PHASE_BTREE_##name;
+	BCH_BTREE_IDS()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+/* GC position for (@pos, @level) within btree @id: */
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
+					 struct bpos pos, unsigned level)
+{
+	struct gc_pos ret = {
+		.phase	= btree_id_to_gc_phase(id),
+		.pos	= pos,
+		.level	= level,
+	};
+
+	return ret;
+}
+
+/*
+ * GC position of the pointers stored inside btree node @b, built from the
+ * node's btree id, the position of its key and its level. This is _not_ the
+ * position of &b->key itself - that key lives in the parent node.
+ */
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+{
+	struct bpos node_pos = b->key.k.p;
+
+	return gc_pos_btree(b->btree_id, node_pos, b->level);
+}
+
+/*
+ * GC position of the pointer to the root of btree @id: always
+ * (POS_MAX, BTREE_MAX_DEPTH), independent of the root's actual level.
+ *
+ * gc_pos_pointer_to_btree_node() is deliberately not used here: that would
+ * race with btree_split() increasing the tree depth. The new root would have
+ * a higher level, and hence a greater gc position, than the old root - which
+ * would be wrong, because once gc has marked the root it is not coming back.
+ */
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+{
+	struct bpos root_pos = POS_MAX;
+
+	return gc_pos_btree(id, root_pos, BTREE_MAX_DEPTH);
+}
+
+/*
+ * GC position within GC_PHASE_ALLOC for open bucket @ob: the position's inode
+ * field is @ob's index in c->open_buckets, or 0 when @ob is NULL.
+ */
+static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct gc_pos ret = {
+		.phase	= GC_PHASE_ALLOC,
+		.pos	= POS(ob ? ob - c->open_buckets : 0, 0),
+	};
+
+	return ret;
+}
+
+/*
+ * Returns true if @pos is at or before gc's current position (c->gc_pos).
+ * c->gc_pos is sampled under the c->gc_pos_lock seqcount, retrying the
+ * comparison whenever read_seqcount_retry() asks for it.
+ */
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
+{
+	unsigned seq;
+	bool visited;
+
+	for (;;) {
+		seq = read_seqcount_begin(&c->gc_pos_lock);
+		visited = gc_pos_cmp(pos, c->gc_pos) <= 0;
+
+		if (!read_seqcount_retry(&c->gc_pos_lock, seq))
+			break;
+	}
+
+	return visited;
+}
+
+#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
new file mode 100644
index 000000000000..591980d2011f
--- /dev/null
+++ b/fs/bcachefs/btree_io.c
@@ -0,0 +1,1703 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+
+#include <trace/events/bcachefs.h>
+
+/*
+ * Debug-only check (CONFIG_BCACHEFS_DEBUG) that each key in [@start, @end)
+ * sorts before its successor: for extent nodes a key's position may equal the
+ * start of the next key, for other nodes it must be strictly less.
+ */
+static void verify_no_dups(struct btree *b,
+			   struct bkey_packed *start,
+			   struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bkey_packed *k, *next;
+
+	if (start == end)
+		return;
+
+	for (k = start; (next = bkey_next(k)) != end; k = next) {
+		struct bkey l = bkey_unpack_key(b, k);
+		struct bkey r = bkey_unpack_key(b, next);
+		int cmp = bkey_cmp(l.p, bkey_start_pos(&r));
+
+		BUG_ON(btree_node_is_extents(b) ? cmp > 0 : cmp >= 0);
+	}
+#endif
+}
+
+/* Clear the needs_whiteout flag on every key in bset @i: */
+static void clear_needs_whiteout(struct bset *i)
+{
+	struct bkey_packed *k = i->start;
+	struct bkey_packed *end = vstruct_last(i);
+
+	while (k != end) {
+		k->needs_whiteout = false;
+		k = bkey_next(k);
+	}
+}
+
+/* Set the needs_whiteout flag on every key in bset @i: */
+static void set_needs_whiteout(struct bset *i)
+{
+	struct bkey_packed *k = i->start;
+	struct bkey_packed *end = vstruct_last(i);
+
+	while (k != end) {
+		k->needs_whiteout = true;
+		k = bkey_next(k);
+	}
+}
+
+/*
+ * Free a bounce buffer obtained from btree_bounce_alloc(); @order and
+ * @used_mempool must be the values used/returned by that allocation.
+ */
+static void btree_bounce_free(struct bch_fs *c, unsigned order,
+			      bool used_mempool, void *p)
+{
+	if (!used_mempool) {
+		vpfree(p, PAGE_SIZE << order);
+		return;
+	}
+
+	mempool_free(p, &c->btree_bounce_pool);
+}
+
+/*
+ * Allocate a bounce buffer of 2^@order pages (@order may not exceed
+ * btree_page_order(c)).
+ *
+ * First tries a non-blocking page allocation; if that fails, falls back to
+ * c->btree_bounce_pool with GFP_NOIO. *@used_mempool records which path
+ * supplied the buffer and must be passed back to btree_bounce_free().
+ */
+static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
+				bool *used_mempool)
+{
+	void *buf;
+
+	BUG_ON(order > btree_page_order(c));
+
+	buf = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
+
+	*used_mempool = !buf;
+	if (!buf)
+		buf = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
+
+	return buf;
+}
+
+/*
+ * Decide whether bset @t of @b should be compacted under @mode.
+ *
+ * Returns the number of dead u64s in the bset (its on-disk u64s minus the
+ * u64s accounted in b->nr for it) if it should be compacted, else 0:
+ *  - COMPACT_LAZY: compact if should_compact_bset_lazy() says so, or if we
+ *    are already @compacting and the bset is unwritten;
+ *  - any other mode: compact only written bsets.
+ */
+static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
+				    bool compacting,
+				    enum compact_mode mode)
+{
+	unsigned total_u64s = le16_to_cpu(bset(b, t)->u64s);
+	unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set];
+	bool compact;
+
+	if (mode != COMPACT_LAZY)
+		compact = bset_written(b, bset(b, t));
+	else
+		compact = should_compact_bset_lazy(b, t) ||
+			(compacting && !bset_written(b, bset(b, t)));
+
+	return compact ? dead_u64s : 0;
+}
+
+/*
+ * Compact whiteouts out of the bsets of @b selected by should_compact_bset()
+ * for @mode.
+ *
+ * Whiteouts that still have needs_whiteout set are copied (key only, value
+ * size set to 0) into a bounce buffer and then sorted into the node's
+ * unwritten whiteouts area; whiteouts without needs_whiteout, and deleted keys
+ * in extent nodes, are skipped. Unless @mode is COMPACT_WRITTEN_NO_WRITE_LOCK
+ * the remaining keys are also compacted in place within each bset.
+ *
+ * Returns false if no bset had anything to compact, true otherwise.
+ */
+bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
+			     enum compact_mode mode)
+{
+	const struct bkey_format *f = &b->format;
+	struct bset_tree *t;
+	struct bkey_packed *whiteouts = NULL;
+	struct bkey_packed *u_start, *u_pos;
+	struct sort_iter sort_iter;
+	unsigned order, whiteout_u64s = 0, u64s;
+	bool used_mempool, compacting = false;
+
+	/*
+	 * Total up the dead u64s; once one bset has been selected, later bsets
+	 * are evaluated with compacting == true:
+	 */
+	for_each_bset(b, t)
+		whiteout_u64s += should_compact_bset(b, t,
+					whiteout_u64s != 0, mode);
+
+	if (!whiteout_u64s)
+		return false;
+
+	sort_iter_init(&sort_iter, b);
+
+	/* The bounce buffer also has to hold the existing unwritten whiteouts: */
+	whiteout_u64s += b->whiteout_u64s;
+	order = get_order(whiteout_u64s * sizeof(u64));
+
+	whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+	u_start = u_pos = whiteouts;
+
+	memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
+		    b->whiteout_u64s);
+	u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
+
+	sort_iter_add(&sort_iter, u_start, u_pos);
+
+	for_each_bset(b, t) {
+		struct bset *i = bset(b, t);
+		struct bkey_packed *k, *n, *out, *start, *end;
+		struct btree_node_entry *src = NULL, *dst = NULL;
+
+		/*
+		 * An unwritten bset other than the first is moved down to
+		 * directly follow the previous bset, but never below
+		 * write_block(b):
+		 */
+		if (t != b->set && !bset_written(b, i)) {
+			src = container_of(i, struct btree_node_entry, keys);
+			dst = max(write_block(b),
+				  (void *) btree_bkey_last(b, t -1));
+		}
+
+		if (!should_compact_bset(b, t, compacting, mode)) {
+			/* Not compacting this bset: move it whole, keys included */
+			if (src != dst) {
+				memmove(dst, src, sizeof(*src) +
+					le16_to_cpu(src->keys.u64s) *
+					sizeof(u64));
+				i = &dst->keys;
+				set_btree_bset(b, t, i);
+			}
+			continue;
+		}
+
+		compacting = true;
+		u_start = u_pos;
+		/* start/end still point at the keys in their old location: */
+		start = i->start;
+		end = vstruct_last(i);
+
+		/* Only the header is moved here; keys are copied one by one below */
+		if (src != dst) {
+			memmove(dst, src, sizeof(*src));
+			i = &dst->keys;
+			set_btree_bset(b, t, i);
+		}
+
+		out = i->start;
+
+		for (k = start; k != end; k = n) {
+			n = bkey_next(k);
+
+			if (bkey_deleted(k) && btree_node_is_extents(b))
+				continue;
+
+			if (bkey_whiteout(k) && !k->needs_whiteout)
+				continue;
+
+			if (bkey_whiteout(k)) {
+				/* Keep just the key of the whiteout, with no value: */
+				unreserve_whiteout(b, k);
+				memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
+				set_bkeyp_val_u64s(f, u_pos, 0);
+				u_pos = bkey_next(u_pos);
+			} else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+				bkey_copy(out, k);
+				out = bkey_next(out);
+			}
+		}
+
+		sort_iter_add(&sort_iter, u_start, u_pos);
+
+		if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+			i->u64s = cpu_to_le16((u64 *) out - i->_data);
+			set_btree_bset_end(b, t);
+			bch2_bset_set_no_aux_tree(b, t);
+		}
+	}
+
+	/* Upper bound: everything that was collected in the bounce buffer */
+	b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
+
+	BUG_ON((void *) unwritten_whiteouts_start(c, b) <
+	       (void *) btree_bkey_last(b, bset_tree_last(b)));
+
+	u64s = (btree_node_is_extents(b)
+		? bch2_sort_extent_whiteouts
+		: bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b),
+					   &sort_iter);
+
+	BUG_ON(u64s > b->whiteout_u64s);
+	BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
+	BUG_ON(u_pos != whiteouts && !u64s);
+
+	/*
+	 * The sort produced fewer u64s than were collected: src is where the
+	 * sorted output was written; after shrinking b->whiteout_u64s the
+	 * output is moved to the new unwritten_whiteouts_start():
+	 */
+	if (u64s != b->whiteout_u64s) {
+		void *src = unwritten_whiteouts_start(c, b);
+
+		b->whiteout_u64s = u64s;
+		memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
+	}
+
+	verify_no_dups(b,
+		       unwritten_whiteouts_start(c, b),
+		       unwritten_whiteouts_end(c, b));
+
+	btree_bounce_free(c, order, used_mempool, whiteouts);
+
+	if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
+		bch2_btree_build_aux_trees(b);
+
+	bch_btree_keys_u64s_remaining(c, b);
+	bch2_verify_btree_nr_keys(b);
+
+	return true;
+}
+
+/*
+ * Drop every whiteout from the bsets of @b that should_compact_bset() selects
+ * in COMPACT_WRITTEN mode, compacting the remaining keys in place.
+ *
+ * An unwritten bset other than the first has its header moved down to follow
+ * the previous bset (but never below write_block(b)) before its keys are
+ * compacted.
+ *
+ * Returns true if any bset was rewritten; rewritten bsets have their aux tree
+ * reset with bch2_bset_set_no_aux_tree().
+ */
+static bool bch2_drop_whiteouts(struct btree *b)
+{
+	struct bset_tree *t;
+	bool ret = false;
+
+	for_each_bset(b, t) {
+		struct bset *i = bset(b, t);
+		struct bkey_packed *k, *n, *out, *start, *end;
+
+		if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
+			continue;
+
+		/* Key range in the bset's current location: */
+		start	= btree_bkey_first(b, t);
+		end	= btree_bkey_last(b, t);
+
+		if (!bset_written(b, i) &&
+		    t != b->set) {
+			struct bset *dst =
+			       max_t(struct bset *, write_block(b),
+				     (void *) btree_bkey_last(b, t -1));
+
+			memmove(dst, i, sizeof(struct bset));
+			i = dst;
+			set_btree_bset(b, t, i);
+		}
+
+		out = i->start;
+
+		for (k = start; k != end; k = n) {
+			n = bkey_next(k);
+
+			if (!bkey_whiteout(k)) {
+				bkey_copy(out, k);
+				out = bkey_next(out);
+			}
+		}
+
+		i->u64s = cpu_to_le16((u64 *) out - i->_data);
+		/*
+		 * The bset just shrank: update its end, as the other in-place
+		 * rewriters in this file (__bch2_compact_whiteouts(),
+		 * btree_node_sort()) do after changing u64s. Otherwise
+		 * btree_bkey_last() - used above to place the next bset -
+		 * would presumably keep returning the old end.
+		 */
+		set_btree_bset_end(b, t);
+		bch2_bset_set_no_aux_tree(b, t);
+		ret = true;
+	}
+
+	bch2_verify_btree_nr_keys(b);
+
+	return ret;
+}
+
+/*
+ * Sort the bsets of @b in the index range [@start_idx, @end_idx) together into
+ * a single bset stored at @start_idx; bsets at or after @end_idx are shifted
+ * down to follow it and b->nsets is reduced accordingly.
+ *
+ * The sort is done into a bounce buffer. When the whole node is being sorted
+ * the bounce buffer is node sized and is swapped with b->data rather than
+ * copied back.
+ *
+ * For extent nodes @filter_whiteouts is overridden by whether the first bset
+ * of the range has been written. @iter is not referenced in the body.
+ */
+static void btree_node_sort(struct bch_fs *c, struct btree *b,
+			    struct btree_iter *iter,
+			    unsigned start_idx,
+			    unsigned end_idx,
+			    bool filter_whiteouts)
+{
+	struct btree_node *out;
+	struct sort_iter sort_iter;
+	struct bset_tree *t;
+	struct bset *start_bset = bset(b, &b->set[start_idx]);
+	bool used_mempool = false;
+	u64 start_time, seq = 0;
+	/* shift: number of bset slots that go away by merging the range */
+	unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
+	bool sorting_entire_node = start_idx == 0 &&
+		end_idx == b->nsets;
+
+	sort_iter_init(&sort_iter, b);
+
+	/* Feed every bset of the range to the sort and total their size: */
+	for (t = b->set + start_idx;
+	     t < b->set + end_idx;
+	     t++) {
+		u64s += le16_to_cpu(bset(b, t)->u64s);
+		sort_iter_add(&sort_iter,
+			      btree_bkey_first(b, t),
+			      btree_bkey_last(b, t));
+	}
+
+	/* A full-node sort needs a node-sized buffer so it can be swapped in: */
+	order = sorting_entire_node
+		? btree_page_order(c)
+		: get_order(__vstruct_bytes(struct btree_node, u64s));
+
+	out = btree_bounce_alloc(c, order, &used_mempool);
+
+	start_time = local_clock();
+
+	if (btree_node_is_extents(b))
+		filter_whiteouts = bset_written(b, start_bset);
+
+	u64s = (btree_node_is_extents(b)
+		? bch2_sort_extents
+		: bch2_sort_keys)(out->keys.start,
+				  &sort_iter,
+				  filter_whiteouts);
+
+	out->keys.u64s = cpu_to_le16(u64s);
+
+	BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
+
+	/* Time stats are only recorded for full-node sorts: */
+	if (sorting_entire_node)
+		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+				       start_time);
+
+	/* Make sure we preserve bset journal_seq: */
+	for (t = b->set + start_idx; t < b->set + end_idx; t++)
+		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+	start_bset->journal_seq = cpu_to_le64(seq);
+
+	if (sorting_entire_node) {
+		unsigned u64s = le16_to_cpu(out->keys.u64s);
+
+		BUG_ON(order != btree_page_order(c));
+
+		/*
+		 * Our temporary buffer is the same size as the btree node's
+		 * buffer, we can just swap buffers instead of doing a big
+		 * memcpy()
+		 */
+		/* Copy the node header over, then restore the sorted u64s: */
+		*out = *b->data;
+		out->keys.u64s = cpu_to_le16(u64s);
+		swap(out, b->data);
+		set_btree_bset(b, b->set, &b->data->keys);
+	} else {
+		start_bset->u64s = out->keys.u64s;
+		memcpy_u64s(start_bset->start,
+			    out->keys.start,
+			    le16_to_cpu(out->keys.u64s));
+	}
+
+	/* Fold the per-bset key accounting of the range into start_idx: */
+	for (i = start_idx + 1; i < end_idx; i++)
+		b->nr.bset_u64s[start_idx] +=
+			b->nr.bset_u64s[i];
+
+	b->nsets -= shift;
+
+	/* Move the bsets that followed the range down into the freed slots: */
+	for (i = start_idx + 1; i < b->nsets; i++) {
+		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
+		b->set[i]		= b->set[i + shift];
+	}
+
+	for (i = b->nsets; i < MAX_BSETS; i++)
+		b->nr.bset_u64s[i] = 0;
+
+	set_btree_bset_end(b, &b->set[start_idx]);
+	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
+
+	btree_bounce_free(c, order, used_mempool, out);
+
+	bch2_verify_btree_nr_keys(b);
+}
+
+/*
+ * Sort and repack all keys of @src into the first bset of @dst, using @dst's
+ * key format. @dst must have exactly one bset. Extent nodes go through
+ * bch2_sort_repack_merge(), all others through bch2_sort_repack().
+ *
+ * The key counts returned by the sort are added to dst->nr, and the time
+ * taken is accounted under BCH_TIME_btree_node_sort.
+ */
+void bch2_btree_sort_into(struct bch_fs *c,
+			 struct btree *dst,
+			 struct btree *src)
+{
+	u64 start_time = local_clock();
+	struct btree_node_iter iter;
+	struct btree_nr_keys sorted;
+
+	BUG_ON(dst->nsets != 1);
+
+	bch2_bset_set_no_aux_tree(dst, dst->set);
+	bch2_btree_node_iter_init_from_start(&iter, src);
+
+	if (!btree_node_is_extents(src))
+		sorted = bch2_sort_repack(btree_bset_first(dst),
+					  src, &iter,
+					  &dst->format,
+					  true);
+	else
+		sorted = bch2_sort_repack_merge(c, btree_bset_first(dst),
+						src, &iter,
+						&dst->format,
+						true);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+			       start_time);
+
+	set_btree_bset_end(dst, dst->set);
+
+	dst->nr.live_u64s	+= sorted.live_u64s;
+	dst->nr.bset_u64s[0]	+= sorted.bset_u64s[0];
+	dst->nr.packed_keys	+= sorted.packed_keys;
+	dst->nr.unpacked_keys	+= sorted.unpacked_keys;
+
+	bch2_verify_btree_nr_keys(dst);
+}
+
+#define SORT_CRIT	(4096 / sizeof(u64))
+
+/*
+ * Called before another bset is added to the node: if there is more than one
+ * unwritten bset, sort the unwritten ones together; likewise if there is more
+ * than one written bset, sort the written ones together.
+ *
+ * Returns true if anything was sorted.
+ */
+static bool btree_node_compact(struct bch_fs *c, struct btree *b,
+			       struct btree_iter *iter)
+{
+	unsigned nr_written = 0;
+	bool sorted = false;
+
+	/* Index of the first unwritten bset (== b->nsets if there is none): */
+	while (nr_written < b->nsets &&
+	       bset_written(b, bset(b, &b->set[nr_written])))
+		nr_written++;
+
+	if (b->nsets - nr_written > 1) {
+		btree_node_sort(c, b, iter, nr_written,
+				b->nsets, false);
+		sorted = true;
+	}
+
+	if (nr_written > 1) {
+		btree_node_sort(c, b, iter, 0, nr_written, false);
+		sorted = true;
+	}
+
+	return sorted;
+}
+
+/*
+ * Build the aux tree of every bset in @b. The third argument passed to
+ * bch2_bset_build_aux_tree() is true only for the last bset, and only if that
+ * bset has not been written.
+ */
+void bch2_btree_build_aux_trees(struct btree *b)
+{
+	struct bset_tree *t;
+
+	for_each_bset(b, t) {
+		bool last_and_unwritten =
+			!bset_written(b, bset(b, t)) &&
+			t == bset_tree_last(b);
+
+		bch2_bset_build_aux_tree(b, t, last_and_unwritten);
+	}
+}
+
+/*
+ * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
+ * inserted into
+ *
+ * First compacts @b's existing bsets with btree_node_compact(); a new bset is
+ * then added only if want_new_bset() returns a location for one, so this is
+ * safe to call if there already is an unwritten bset. Finally the aux trees
+ * are rebuilt.
+ *
+ * Does not return a value: if compacting sorted any bsets (i.e. invalidated
+ * iterators) and @iter is non-NULL, @iter is reinitialized for @b.
+ */
+void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
+			  struct btree_iter *iter)
+{
+	struct btree_node_entry *bne;
+	bool did_sort;
+
+	/* NOTE(review): presumably asserts the node is write locked - confirm */
+	EBUG_ON(!(b->lock.state.seq & 1));
+	/* If an iterator is given, it must point at @b at @b's level: */
+	EBUG_ON(iter && iter->l[b->level].b != b);
+
+	did_sort = btree_node_compact(c, b, iter);
+
+	bne = want_new_bset(c, b);
+	if (bne)
+		bch2_bset_init_next(c, b, bne);
+
+	bch2_btree_build_aux_trees(b);
+
+	if (iter && did_sort)
+		bch2_btree_iter_reinit_node(iter, b);
+}
+
+/*
+ * Build the nonce for bset @i located at byte @offset within its btree node:
+ * word 0 is @offset, words 1-2 are the two 32 bit halves of i->seq, and word 3
+ * is the first 32 bit word of i->journal_seq xored with BCH_NONCE_BTREE.
+ */
+static struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+	const __le32 *seq = (__le32 *) &i->seq;
+	const __le32 *journal_seq = (__le32 *) &i->journal_seq;
+	struct nonce nonce = {{
+		[0] = cpu_to_le32(offset),
+		[1] = seq[0],
+		[2] = seq[1],
+		[3] = journal_seq[0]^BCH_NONCE_BTREE,
+	}};
+
+	return nonce;
+}
+
+/*
+ * Run bch2_encrypt() over bset @i, which lives at byte @offset in its node.
+ *
+ * For the first bset (@offset == 0) the part of the btree node header from
+ * the flags field up to the start of the bset is processed first, and the
+ * nonce is advanced past it (rounded up to CHACHA_BLOCK_SIZE) before the key
+ * data is processed.
+ */
+static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+	struct nonce nonce = btree_nonce(i, offset);
+
+	if (offset == 0) {
+		struct btree_node *bn = container_of(i, struct btree_node, keys);
+		unsigned hdr_bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+		bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
+			     hdr_bytes);
+
+		nonce = nonce_add(nonce,
+				  round_up(hdr_bytes, CHACHA_BLOCK_SIZE));
+	}
+
+	bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+		     vstruct_end(i) - (void *) i->_data);
+}
+
+/*
+ * Format the common prefix of a btree node validation error into @out: btree
+ * id, node level / root level, node position and node offset, plus the bset's
+ * u64s when @i is non-NULL.
+ *
+ * @offset is not referenced in the body; the node offset printed is
+ * b->written.
+ */
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+			  struct btree *b, struct bset *i,
+			  unsigned offset, int write)
+{
+	const char *when = write ? "before write " : "";
+
+	pr_buf(out, "error validating btree node %s"
+	       "at btree %u level %u/%u\n"
+	       "pos %llu:%llu node offset %u",
+	       when,
+	       b->btree_id, b->level,
+	       c->btree_roots[b->btree_id].level,
+	       b->key.k.p.inode, b->key.k.p.offset,
+	       b->written);
+
+	if (!i)
+		return;
+
+	pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+}
+
+/*
+ * Severity passed to btree_err(); on the read path it selects what happens
+ * after the error has been logged:
+ */
+enum btree_err_type {
+	/*
+	 * Reported via mustfix_fsck_err() while initial gc has not finished;
+	 * afterwards fails with BCH_FSCK_ERRORS_NOT_FIXED:
+	 */
+	BTREE_ERR_FIXABLE,
+	/* Retry the read if the caller has a retry available, else carry on: */
+	BTREE_ERR_WANT_RETRY,
+	/* Always fail with BTREE_RETRY_READ: */
+	BTREE_ERR_MUST_RETRY,
+	/* Always fail with BCH_FSCK_ERRORS_NOT_FIXED: */
+	BTREE_ERR_FATAL,
+};
+
+/* Return code used by btree node validation to request a retry of the read: */
+enum btree_validate_ret {
+	BTREE_RETRY_READ = 64,
+};
+
+/*
+ * btree_err - report a btree node validation error
+ *
+ * Formats the message (btree_err_msg() prefix plus @msg) into a 300 byte
+ * on-stack buffer and then acts according to @type and the direction.
+ *
+ * The macro relies on the calling function providing:
+ *  - an int `write` (READ or WRITE) and an int `ret`,
+ *  - a bool `have_retry` (consulted for BTREE_ERR_WANT_RETRY),
+ *  - an `fsck_err` label, jumped to after `ret` has been set.
+ *
+ * READ: a BTREE_ERR_FIXABLE error seen before BCH_FS_INITIAL_GC_DONE is set
+ * goes through mustfix_fsck_err(); otherwise the error is logged and handled
+ * as described at enum btree_err_type.
+ * WRITE: the error is logged, and if bch2_fs_inconsistent() returns true the
+ * macro fails with BCH_FSCK_ERRORS_NOT_FIXED.
+ *
+ * When it does not jump to fsck_err the macro evaluates to true, so that it
+ * can be used as the condition of an if () that repairs the problem.
+ */
+#define btree_err(type, c, b, i, msg, ...)				\
+({									\
+	__label__ out;							\
+	char _buf[300];							\
+	struct printbuf out = PBUF(_buf);				\
+									\
+	btree_err_msg(&out, c, b, i, b->written, write);		\
+	pr_buf(&out, ": " msg, ##__VA_ARGS__);				\
+									\
+	if (type == BTREE_ERR_FIXABLE &&				\
+	    write == READ &&						\
+	    !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {		\
+		mustfix_fsck_err(c, "%s", _buf);			\
+		goto out;						\
+	}								\
+									\
+	switch (write) {						\
+	case READ:							\
+		bch_err(c, "%s", _buf);					\
+									\
+		switch (type) {						\
+		case BTREE_ERR_FIXABLE:					\
+			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
+			goto fsck_err;					\
+		case BTREE_ERR_WANT_RETRY:				\
+			if (have_retry) {				\
+				ret = BTREE_RETRY_READ;			\
+				goto fsck_err;				\
+			}						\
+			break;						\
+		case BTREE_ERR_MUST_RETRY:				\
+			ret = BTREE_RETRY_READ;				\
+			goto fsck_err;					\
+		case BTREE_ERR_FATAL:					\
+			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
+			goto fsck_err;					\
+		}							\
+		break;							\
+	case WRITE:							\
+		bch_err(c, "corrupt metadata before write: %s", _buf);	\
+									\
+		if (bch2_fs_inconsistent(c)) {				\
+			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
+			goto fsck_err;					\
+		}							\
+		break;							\
+	}								\
+out:									\
+	true;								\
+})
+
+/* btree_err() if @cond is true; evaluates to false otherwise: */
+#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)
+
+/*
+ * Validate bset @i of btree node @b, either after reading it (@write == READ)
+ * or before writing it out.
+ *
+ * @sectors:       size of the bset in sectors, as computed by the caller
+ * @whiteout_u64s: out - offset in u64s from the start of the bset's data of
+ *                 the first key found not to belong to the leading run of
+ *                 whiteouts; set to 0 if the bset does not have
+ *                 BSET_SEPARATE_WHITEOUTS set
+ * @have_retry:    whether the caller can retry the read; consumed by
+ *                 btree_err()
+ *
+ * For the first bset of the node the node header is checked too. Keys that
+ * fail validation are removed from the bset in place; a key with zero u64s, or
+ * one extending past the end of the bset, truncates the bset at that key.
+ *
+ * Returns 0 on success, or the non-zero value set in `ret` by the btree_err()
+ * machinery.
+ */
+static int validate_bset(struct bch_fs *c, struct btree *b,
+			 struct bset *i, unsigned sectors,
+			 unsigned *whiteout_u64s, int write,
+			 bool have_retry)
+{
+	struct bkey_packed *k, *prev = NULL;
+	struct bpos prev_pos = POS_MIN;
+	bool seen_non_whiteout = false;
+	unsigned version;
+	const char *err;
+	int ret = 0;
+
+	/* Header checks, only for the first bset in the node: */
+	if (i == &b->data->keys) {
+		/* These indicate that we read the wrong btree node: */
+		btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
+			     BTREE_ERR_MUST_RETRY, c, b, i,
+			     "incorrect btree id");
+
+		btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
+			     BTREE_ERR_MUST_RETRY, c, b, i,
+			     "incorrect level");
+
+		/* Node has the other endianness: byte swap the header fields */
+		if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
+			u64 *p = (u64 *) &b->data->ptr;
+
+			*p = swab64(*p);
+			bch2_bpos_swab(&b->data->min_key);
+			bch2_bpos_swab(&b->data->max_key);
+		}
+
+		btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
+			     BTREE_ERR_MUST_RETRY, c, b, i,
+			     "incorrect max key");
+
+		/* XXX: ideally we would be validating min_key too */
+#if 0
+		/*
+		 * not correct anymore, due to btree node write error
+		 * handling
+		 *
+		 * need to add b->data->seq to btree keys and verify
+		 * against that
+		 */
+		btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
+						  b->data->ptr),
+			     BTREE_ERR_FATAL, c, b, i,
+			     "incorrect backpointer");
+#endif
+		err = bch2_bkey_format_validate(&b->data->format);
+		btree_err_on(err,
+			     BTREE_ERR_FATAL, c, b, i,
+			     "invalid bkey format: %s", err);
+	}
+
+	version = le16_to_cpu(i->version);
+	btree_err_on((version != BCH_BSET_VERSION_OLD &&
+		      version < bcachefs_metadata_version_min) ||
+		     version >= bcachefs_metadata_version_max,
+		     BTREE_ERR_FATAL, c, b, i,
+		     "unsupported bset version");
+
+	/* A bset running past the end of the node is treated as empty: */
+	if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
+			 BTREE_ERR_FIXABLE, c, b, i,
+			 "bset past end of btree node")) {
+		i->u64s = 0;
+		return 0;
+	}
+
+	btree_err_on(b->written && !i->u64s,
+		     BTREE_ERR_FIXABLE, c, b, i,
+		     "empty bset");
+
+	/* Without separate whiteouts there is no leading whiteout section: */
+	if (!BSET_SEPARATE_WHITEOUTS(i)) {
+		seen_non_whiteout = true;
+		*whiteout_u64s = 0;
+	}
+
+	/* k is only advanced at the bottom; dropping a key re-checks the same k */
+	for (k = i->start;
+	     k != vstruct_last(i);) {
+		struct bkey_s_c u;
+		struct bkey tmp;
+		const char *invalid;
+
+		/* Cannot step over a zero sized key: truncate the bset here */
+		if (btree_err_on(!k->u64s,
+				 BTREE_ERR_FIXABLE, c, b, i,
+				 "KEY_U64s 0: %zu bytes of metadata lost",
+				 vstruct_end(i) - (void *) k)) {
+			i->u64s = cpu_to_le16((u64 *) k - i->_data);
+			break;
+		}
+
+		if (btree_err_on(bkey_next(k) > vstruct_last(i),
+				 BTREE_ERR_FIXABLE, c, b, i,
+				 "key extends past end of bset")) {
+			i->u64s = cpu_to_le16((u64 *) k - i->_data);
+			break;
+		}
+
+		/* Unknown key format: drop this key, move the rest down over it */
+		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
+				 BTREE_ERR_FIXABLE, c, b, i,
+				 "invalid bkey format %u", k->format)) {
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+			memmove_u64s_down(k, bkey_next(k),
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			continue;
+		}
+
+		if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
+			bch2_bkey_swab(&b->format, k);
+
+		/* On read, renumber keys from older versions before validating: */
+		if (!write &&
+		    version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(btree_node_type(b), k, write);
+
+		u = bkey_disassemble(b, k, &tmp);
+
+		/* Values are only validated here on the write path: */
+		invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?:
+			bch2_bkey_in_btree_node(b, u) ?:
+			(write ? bch2_bkey_val_invalid(c, u) : NULL);
+		if (invalid) {
+			char buf[160];
+
+			bch2_bkey_val_to_text(&PBUF(buf), c, u);
+			btree_err(BTREE_ERR_FIXABLE, c, b, i,
+				  "invalid bkey:\n%s\n%s", invalid, buf);
+
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+			memmove_u64s_down(k, bkey_next(k),
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			continue;
+		}
+
+		/* On write, the renumbering is applied after validating: */
+		if (write &&
+		    version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(btree_node_type(b), k, write);
+
+		/*
+		 * with the separate whiteouts thing (used for extents), the
+		 * second set of keys actually can have whiteouts too, so we
+		 * can't solely go off bkey_whiteout()...
+		 */
+
+		/*
+		 * The leading whiteout section ends at the first key that is
+		 * not a whiteout or that sorts before its predecessor; after
+		 * that point an out of order key is an error:
+		 */
+		if (!seen_non_whiteout &&
+		    (!bkey_whiteout(k) ||
+		     (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
+			*whiteout_u64s = k->_data - i->_data;
+			seen_non_whiteout = true;
+		} else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
+			btree_err(BTREE_ERR_FATAL, c, b, i,
+				  "keys out of order: %llu:%llu > %llu:%llu",
+				  prev_pos.inode,
+				  prev_pos.offset,
+				  u.k->p.inode,
+				  bkey_start_offset(u.k));
+			/* XXX: repair this */
+		}
+
+		prev_pos = u.k->p;
+		/* NOTE(review): prev is assigned but not read in this function */
+		prev = k;
+		k = bkey_next(k);
+	}
+
+	/* Any byte swapping needed has been done: record native endianness */
+	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+fsck_err:
+	return ret;
+}
+
+/*
+ * Process a btree node whose contents have just been read into b->data.
+ *
+ * Walks the bsets in the node; for each one the checksum is verified,
+ * bset_encrypt() is run over it and validate_bset() is called. The bsets are
+ * then merged into a single sorted bset (via a bounce buffer that is swapped
+ * with b->data), keys with invalid values are dropped, and the aux tree is
+ * built.
+ *
+ * @have_retry: whether the caller has another replica to retry the read from;
+ *              consumed by btree_err()
+ *
+ * Returns 1 if the read should be retried, 0 otherwise. On an error that is
+ * not a retry, bch2_inconsistent_error() is called and the node is flagged
+ * with a read error; 0 is still returned in that case.
+ */
+int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
+{
+	struct btree_node_entry *bne;
+	struct btree_node_iter_large *iter;
+	struct btree_node *sorted;
+	struct bkey_packed *k;
+	struct bset *i;
+	bool used_mempool, blacklisted;
+	unsigned u64s;
+	/* ret and write are also used by the btree_err() macros below */
+	int ret, retry_read = 0, write = READ;
+
+	iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
+	iter->used = 0;
+
+	if (bch2_meta_read_fault("btree"))
+		btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
+			  "dynamic fault");
+
+	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
+		     BTREE_ERR_MUST_RETRY, c, b, NULL,
+		     "bad magic");
+
+	btree_err_on(!b->data->keys.seq,
+		     BTREE_ERR_MUST_RETRY, c, b, NULL,
+		     "bad btree header");
+
+	/* b->written advances by the size of each bset that is accepted: */
+	while (b->written < c->opts.btree_node_size) {
+		unsigned sectors, whiteout_u64s = 0;
+		struct nonce nonce;
+		struct bch_csum csum;
+		bool first = !b->written;
+
+		if (!b->written) {
+			/* First bset: embedded in the btree node header */
+			i = &b->data->keys;
+
+			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+				     BTREE_ERR_WANT_RETRY, c, b, i,
+				     "unknown checksum type");
+
+			nonce = btree_nonce(i, b->written << 9);
+			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+
+			btree_err_on(bch2_crc_cmp(csum, b->data->csum),
+				     BTREE_ERR_WANT_RETRY, c, b, i,
+				     "invalid checksum");
+
+			bset_encrypt(c, i, b->written << 9);
+
+			sectors = vstruct_sectors(b->data, c->block_bits);
+
+			btree_node_set_format(b, b->data->format);
+		} else {
+			/* Subsequent bsets are wrapped in a btree_node_entry: */
+			bne = write_block(b);
+			i = &bne->keys;
+
+			/* A different seq means there are no more bsets: */
+			if (i->seq != b->data->keys.seq)
+				break;
+
+			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+				     BTREE_ERR_WANT_RETRY, c, b, i,
+				     "unknown checksum type");
+
+			nonce = btree_nonce(i, b->written << 9);
+			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+			btree_err_on(bch2_crc_cmp(csum, bne->csum),
+				     BTREE_ERR_WANT_RETRY, c, b, i,
+				     "invalid checksum");
+
+			bset_encrypt(c, i, b->written << 9);
+
+			sectors = vstruct_sectors(bne, c->block_bits);
+		}
+
+		ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
+				    READ, have_retry);
+		if (ret)
+			goto fsck_err;
+
+		b->written += sectors;
+
+		blacklisted = bch2_journal_seq_is_blacklisted(c,
+					le64_to_cpu(i->journal_seq),
+					true);
+
+		btree_err_on(blacklisted && first,
+			     BTREE_ERR_FIXABLE, c, b, i,
+			     "first btree node bset has blacklisted journal seq");
+		/* Blacklisted bsets after the first are left out of the sort: */
+		if (blacklisted && !first)
+			continue;
+
+		/* Add the whiteout section and the remaining keys separately: */
+		bch2_btree_node_iter_large_push(iter, b,
+					   i->start,
+					   vstruct_idx(i, whiteout_u64s));
+
+		bch2_btree_node_iter_large_push(iter, b,
+					   vstruct_idx(i, whiteout_u64s),
+					   vstruct_last(i));
+	}
+
+	/* No block past the last bset may carry this node's seq: */
+	for (bne = write_block(b);
+	     bset_byte_offset(b, bne) < btree_bytes(c);
+	     bne = (void *) bne + block_bytes(c))
+		btree_err_on(bne->keys.seq == b->data->keys.seq,
+			     BTREE_ERR_WANT_RETRY, c, b, NULL,
+			     "found bset signature after last bset");
+
+	sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
+	sorted->keys.u64s = 0;
+
+	set_btree_bset(b, b->set, &b->data->keys);
+
+	b->nr = btree_node_is_extents(b)
+		? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
+		: bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
+
+	/*
+	 * Copy the node header over the sorted buffer (restoring the sorted
+	 * u64s afterwards), then swap buffers so that b->data is the sorted
+	 * copy and the original read buffer is what gets freed:
+	 */
+	u64s = le16_to_cpu(sorted->keys.u64s);
+	*sorted = *b->data;
+	sorted->keys.u64s = cpu_to_le16(u64s);
+	swap(sorted, b->data);
+	set_btree_bset(b, b->set, &b->data->keys);
+	b->nsets = 1;
+
+	BUG_ON(b->nr.live_u64s != u64s);
+
+	btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+
+	/* Second pass over the merged bset: drop keys with invalid values */
+	i = &b->data->keys;
+	for (k = i->start; k != vstruct_last(i);) {
+		struct bkey tmp;
+		struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
+		const char *invalid = bch2_bkey_val_invalid(c, u);
+
+		if (invalid ||
+		    (inject_invalid_keys(c) &&
+		     !bversion_cmp(u.k->version, MAX_VERSION))) {
+			char buf[160];
+
+			bch2_bkey_val_to_text(&PBUF(buf), c, u);
+			btree_err(BTREE_ERR_FIXABLE, c, b, i,
+				  "invalid bkey %s: %s", buf, invalid);
+
+			btree_keys_account_key_drop(&b->nr, 0, k);
+
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+			memmove_u64s_down(k, bkey_next(k),
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			set_btree_bset_end(b, b->set);
+			continue;
+		}
+
+		k = bkey_next(k);
+	}
+
+	bch2_bset_build_aux_tree(b, b->set, false);
+
+	set_needs_whiteout(btree_bset_first(b));
+
+	btree_node_reset_sib_u64s(b);
+out:
+	mempool_free(iter, &c->fill_iter);
+	return retry_read;
+fsck_err:
+	/* Reached via btree_err() or a validate_bset() failure, with ret set: */
+	if (ret == BTREE_RETRY_READ) {
+		retry_read = 1;
+	} else {
+		bch2_inconsistent_error(c);
+		set_btree_node_read_error(b);
+	}
+	goto out;
+}
+
+/*
+ * Completion work for a btree node read: checks the result of the bio and
+ * runs bch2_btree_node_read_done() on it. If either fails and another device
+ * can be picked, the read is resubmitted synchronously from that device and
+ * the result is processed again; if no device is left the node is flagged
+ * with a read error.
+ *
+ * On exit the bio is put, the node's read_in_flight flag is cleared and
+ * waiters on that bit are woken.
+ */
+static void btree_node_read_work(struct work_struct *work)
+{
+	struct btree_read_bio *rb =
+		container_of(work, struct btree_read_bio, work);
+	struct bch_fs *c	= rb->c;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+	struct btree *b		= rb->bio.bi_private;
+	struct bio *bio		= &rb->bio;
+	struct bch_io_failures failed = { .nr = 0 };
+	bool can_retry;
+
+	/*
+	 * The first pass handles the bio as it arrived, so jump over the
+	 * resubmission at the top of the loop:
+	 */
+	goto start;
+	while (1) {
+		bch_info(c, "retrying read");
+		/* rb->pick was updated by the device pick at the bottom: */
+		ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
+		bio_reset(bio);
+		bio->bi_opf		= REQ_OP_READ|REQ_SYNC|REQ_META;
+		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
+		bio->bi_iter.bi_size	= btree_bytes(c);
+
+		if (rb->have_ioref) {
+			bio_set_dev(bio, ca->disk_sb.bdev);
+			submit_bio_wait(bio);
+		} else {
+			/* Could not get a ref on the device: fail this attempt */
+			bio->bi_status = BLK_STS_REMOVED;
+		}
+start:
+		bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+		if (rb->have_ioref)
+			percpu_ref_put(&ca->io_ref);
+		rb->have_ioref = false;
+
+		/*
+		 * Record the device just used in @failed - this is done
+		 * whether or not the read succeeded - then pick a device for
+		 * a possible retry:
+		 */
+		bch2_mark_io_failure(&failed, &rb->pick);
+
+		can_retry = bch2_bkey_pick_read_device(c,
+				bkey_i_to_s_c(&b->key),
+				&failed, &rb->pick) > 0;
+
+		/* Done if the IO succeeded and no retry was requested: */
+		if (!bio->bi_status &&
+		    !bch2_btree_node_read_done(c, b, can_retry))
+			break;
+
+		if (!can_retry) {
+			set_btree_node_read_error(b);
+			break;
+		}
+	}
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+			       rb->start_time);
+	bio_put(&rb->bio);
+	clear_btree_node_read_in_flight(b);
+	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+/*
+ * bio completion handler for asynchronous btree node reads: accounts read
+ * latency against the device if a device ref is held, then queues
+ * btree_node_read_work() on system_unbound_wq.
+ */
+static void btree_node_read_endio(struct bio *bio)
+{
+	struct btree_read_bio *rb =
+		container_of(bio, struct btree_read_bio, bio);
+
+	if (rb->have_ioref)
+		bch2_latency_acct(bch_dev_bkey_exists(rb->c, rb->pick.ptr.dev),
+				  rb->start_time, READ);
+
+	queue_work(system_unbound_wq, &rb->work);
+}
+
+/*
+ * Start reading btree node @b from one of the devices its key points to.
+ *
+ * If no device can be picked, a fatal error is raised, the node is flagged
+ * with a read error and nothing is submitted.
+ *
+ * With @sync set the read is waited for and btree_node_read_work() is run
+ * directly; otherwise the bio is submitted and completes through
+ * btree_node_read_endio(). If no ref on the device could be taken the bio is
+ * failed with BLK_STS_REMOVED and handed to btree_node_read_work() (directly
+ * or via the workqueue).
+ */
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+			  bool sync)
+{
+	struct extent_ptr_decoded pick;
+	struct btree_read_bio *rb;
+	struct bch_dev *ca;
+	struct bio *bio;
+	int ret;
+
+	trace_btree_read(c, b);
+
+	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+					 NULL, &pick);
+	if (bch2_fs_fatal_err_on(ret <= 0, c,
+			"btree node read error: no device to read from")) {
+		set_btree_node_read_error(b);
+		return;
+	}
+
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	bio = bio_alloc_bioset(GFP_NOIO,
+			       buf_pages(b->data, btree_bytes(c)),
+			       &c->btree_bio);
+
+	rb = container_of(bio, struct btree_read_bio, bio);
+	rb->c			= c;
+	rb->start_time		= local_clock();
+	rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
+	rb->pick		= pick;
+	INIT_WORK(&rb->work, btree_node_read_work);
+
+	bio->bi_opf		= REQ_OP_READ|REQ_SYNC|REQ_META;
+	bio->bi_iter.bi_sector	= pick.ptr.offset;
+	bio->bi_end_io		= btree_node_read_endio;
+	bio->bi_private		= b;
+	bch2_bio_map(bio, b->data, btree_bytes(c));
+
+	set_btree_node_read_in_flight(b);
+
+	if (!rb->have_ioref) {
+		/* No device ref: nothing is submitted, the bio is just failed */
+		bio->bi_status = BLK_STS_REMOVED;
+
+		if (sync)
+			btree_node_read_work(&rb->work);
+		else
+			queue_work(system_unbound_wq, &rb->work);
+		return;
+	}
+
+	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+		     bio_sectors(bio));
+	bio_set_dev(bio, ca->disk_sb.bdev);
+
+	if (!sync) {
+		submit_bio(bio);
+		return;
+	}
+
+	submit_bio_wait(bio);
+
+	/* Set bi_private again before btree_node_read_work() reads it: */
+	bio->bi_private	= b;
+	btree_node_read_work(&rb->work);
+}
+
+/*
+ * Read the root node of btree @id, described by key @k at @level, and install
+ * it as that btree's root.
+ *
+ * A node is allocated under the btree cache cannibalize lock, inserted into
+ * the btree cache hash and read synchronously. If the read fails the node is
+ * removed from the hash again and moved to the freeable list.
+ *
+ * Returns 0 on success, -EIO if the node could not be read.
+ */
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+			const struct bkey_i *k, unsigned level)
+{
+	struct closure cl;
+	struct btree *b;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	/* Keep trying until the cannibalize lock has been taken: */
+	for (;;) {
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		closure_sync(&cl);
+		if (!ret)
+			break;
+	}
+
+	b = bch2_btree_node_mem_alloc(c);
+	bch2_btree_cache_cannibalize_unlock(c);
+
+	BUG_ON(IS_ERR(b));
+
+	bkey_copy(&b->key, k);
+	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+
+	bch2_btree_node_read(c, b, true);
+
+	if (!btree_node_read_error(b)) {
+		bch2_btree_set_root_for_read(c, b);
+	} else {
+		bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+		mutex_lock(&c->btree_cache.lock);
+		list_move(&b->list, &c->btree_cache.freeable);
+		mutex_unlock(&c->btree_cache.lock);
+
+		ret = -EIO;
+	}
+
+	six_unlock_write(&b->lock);
+	six_unlock_intent(&b->lock);
+
+	return ret;
+}
+
+/*
+ * Finish write @w of node @b.
+ *
+ * If the low bit of b->will_make_reachable is set, it is atomically cleared
+ * and a ref is dropped on the closure of the struct btree_update that the
+ * remaining bits point to. Then the write's journal pin is dropped and anyone
+ * waiting on @w is woken up.
+ */
+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+			      struct btree_write *w)
+{
+	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+
+	/* cmpxchg loop: clear bit 0, leaving the pointer bits intact */
+	do {
+		old = new = v;
+		if (!(old & 1))
+			break;
+
+		new &= ~1UL;
+	} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+
+	/* Only the thread that actually cleared the bit drops the ref: */
+	if (old & 1)
+		closure_put(&((struct btree_update *) new)->cl);
+
+	bch2_journal_pin_drop(&c->journal, &w->journal);
+	closure_wake_up(&w->wait);
+}
+
+/*
+ * Complete the write returned by btree_prev_write(@b), then clear
+ * write_in_flight on @b and wake up anyone waiting on it.
+ */
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+	bch2_btree_complete_write(c, b, btree_prev_write(b));
+
+	btree_node_io_unlock(b);
+}
+
+/*
+ * Handle a btree node write for which at least one replica failed: rewrite
+ * the node's key with the pointers to the failed devices dropped.
+ *
+ * If no pointers would remain, or if traversing to the node or updating its
+ * key fails, the node is marked noevict and a fatal filesystem error is
+ * raised. In every case the transaction is exited, the bio is released and
+ * the write is completed.
+ */
+static void bch2_btree_node_write_error(struct bch_fs *c,
+					struct btree_write_bio *wbio)
+{
+	struct btree *b		= wbio->wbio.bio.bi_private;
+	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+	struct bkey_i_btree_ptr *new_key;
+	struct bkey_s_btree_ptr bp;
+	struct bch_extent_ptr *ptr;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p,
+					BTREE_MAX_DEPTH, b->level, 0);
+retry:
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		goto err;
+
+	/* Is @b still the node the iterator finds at this position? */
+	if (iter->l[b->level].b != b) {
+		/* node has been freed: */
+		BUG_ON(!btree_node_dying(b));
+		goto out;
+	}
+
+	BUG_ON(!btree_node_hashed(b));
+
+	/* Work on a copy of the key, dropping pointers to failed devices: */
+	bkey_copy(&tmp.k, &b->key);
+
+	new_key = bkey_i_to_btree_ptr(&tmp.k);
+	bp = btree_ptr_i_to_s(new_key);
+
+	bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
+		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+
+	/* Every replica failed - nothing left to point at: */
+	if (!bch2_bkey_nr_ptrs(bp.s_c))
+		goto err;
+
+	ret = bch2_btree_node_update_key(c, iter, b, new_key);
+	if (ret == -EINTR)
+		goto retry;
+	if (ret)
+		goto err;
+out:
+	bch2_trans_exit(&trans);
+	bio_put(&wbio->wbio.bio);
+	btree_node_write_done(c, b);
+	return;
+err:
+	set_btree_node_noevict(b);
+	bch2_fs_fatal_error(c, "fatal error writing btree node");
+	goto out;
+}
+
+/*
+ * Work item: drain c->btree_write_error_list, handling each failed btree
+ * node write in turn. The list is only touched under
+ * c->btree_write_error_lock.
+ */
+void bch2_btree_write_error_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs,
+					btree_write_error_work);
+	struct btree_write_bio *failed;
+	struct bio *bio;
+
+	for (;;) {
+		spin_lock_irq(&c->btree_write_error_lock);
+		bio = bio_list_pop(&c->btree_write_error_list);
+		spin_unlock_irq(&c->btree_write_error_lock);
+
+		if (!bio)
+			return;
+
+		failed = container_of(bio, struct btree_write_bio, wbio.bio);
+		bch2_btree_node_write_error(c, failed);
+	}
+}
+
+/*
+ * Work item run once all replicas of a btree node write have completed:
+ * frees the bounce buffer, then either finishes the write or, if any replica
+ * failed, queues the bio on c->btree_write_error_list for
+ * bch2_btree_write_error_work().
+ */
+static void btree_node_write_work(struct work_struct *work)
+{
+	struct btree_write_bio *wbio =
+		container_of(work, struct btree_write_bio, work);
+	struct bch_write_bio *w	= &wbio->wbio;
+	struct bch_fs *c	= w->c;
+	struct btree *b		= w->bio.bi_private;
+	unsigned long flags;
+
+	btree_bounce_free(c, w->order, w->used_mempool, wbio->data);
+
+	if (!w->failed.nr) {
+		/* Every replica was written successfully: */
+		bio_put(&w->bio);
+		btree_node_write_done(c, b);
+		return;
+	}
+
+	/* Error handling takes over ownership of the bio and the node: */
+	spin_lock_irqsave(&c->btree_write_error_lock, flags);
+	bio_list_add(&c->btree_write_error_list, &w->bio);
+	spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
+	queue_work(c->wq, &c->btree_write_error_work);
+}
+
+/*
+ * Completion handler for one replica of a btree node write.
+ *
+ * Accounts write latency (when we hold an io ref on the device), records the
+ * device in the original bio's failed list on error, and drops the io ref.
+ * A split bio then completes its parent; the original bio instead hands off
+ * to btree_node_write_work() on system_unbound_wq.
+ */
+static void btree_node_write_endio(struct bio *bio)
+{
+	struct bch_write_bio *wbio	= to_wbio(bio);
+	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
+	struct bch_write_bio *orig	= parent ?: wbio;
+	struct bch_fs *c		= wbio->c;
+	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
+	unsigned long flags;
+
+	if (wbio->have_ioref)
+		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+
+	/* Failures are always recorded against the original (unsplit) bio: */
+	if (bio->bi_status == BLK_STS_REMOVED ||
+	    bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+	    bch2_meta_write_fault("btree")) {
+		spin_lock_irqsave(&c->btree_write_error_lock, flags);
+		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
+		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+	}
+
+	if (wbio->have_ioref)
+		percpu_ref_put(&ca->io_ref);
+
+	if (parent) {
+		bio_put(bio);
+		bio_endio(&parent->bio);
+	} else {
+		struct btree_write_bio *wb =
+			container_of(orig, struct btree_write_bio, wbio);
+
+		INIT_WORK(&wb->work, btree_node_write_work);
+		queue_work(system_unbound_wq, &wb->work);
+	}
+}
+
+/*
+ * Sanity check what we're about to write: first the node's own key, then the
+ * bset @i (covering @sectors sectors) via validate_bset().
+ *
+ * Returns 0 if everything checks out, -1 if the node's key is invalid, or the
+ * nonzero result of validate_bset() - in which case the filesystem is also
+ * flagged inconsistent.
+ */
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+				   struct bset *i, unsigned sectors)
+{
+	unsigned whiteout_u64s = 0;
+	int err;
+
+	if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
+		return -1;
+
+	err = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
+	if (!err)
+		return 0;
+
+	bch2_inconsistent_error(c);
+	return err;
+}
+
+/*
+ * Write out the not yet written bsets of btree node @b.
+ *
+ * @lock_type_held is the six lock type the caller holds on @b; with an intent
+ * lock we additionally try (without blocking) to take the write lock for
+ * compacting whiteouts.
+ *
+ * Does nothing if BCH_FS_HOLD_BTREE_WRITES is set, if the node isn't dirty, or
+ * if btree_node_may_write() says the node may not be written yet. Otherwise
+ * the unwritten keys are sorted into a bounce buffer, validated, encrypted and
+ * checksummed, and submitted to every replica; completion is handled by
+ * btree_node_write_endio().
+ *
+ * If there turns out to be nothing to write, or the write has to be skipped
+ * (validation failure, journal error, nochanges option), the bounce buffer is
+ * freed and the write is completed immediately.
+ */
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+			    enum six_lock_type lock_type_held)
+{
+	struct btree_write_bio *wbio;
+	struct bset_tree *t;
+	struct bset *i;
+	struct btree_node *bn = NULL;
+	struct btree_node_entry *bne = NULL;
+	BKEY_PADDED(key) k;
+	struct bch_extent_ptr *ptr;
+	struct sort_iter sort_iter;
+	struct nonce nonce;
+	unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
+	u64 seq = 0;
+	bool used_mempool;
+	unsigned long old, new;
+	bool validate_before_checksum = false;
+	void *data;
+
+	if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+		return;
+
+	/*
+	 * We may only have a read lock on the btree node - the dirty bit is our
+	 * "lock" against racing with other threads that may be trying to start
+	 * a write, we do a write iff we clear the dirty bit. Since setting the
+	 * dirty bit requires a write lock, we can't race with other threads
+	 * redirtying it:
+	 */
+	do {
+		old = new = READ_ONCE(b->flags);
+
+		if (!(old & (1 << BTREE_NODE_dirty)))
+			return;
+
+		if (!btree_node_may_write(b))
+			return;
+
+		if (old & (1 << BTREE_NODE_write_in_flight)) {
+			/*
+			 * NOTE(review): continue in a do-while jumps to the
+			 * loop condition, so the cmpxchg below runs with
+			 * new == old - confirm it can never succeed here and
+			 * leave the loop without the flags updated:
+			 */
+			btree_node_wait_on_io(b);
+			continue;
+		}
+
+		new &= ~(1 << BTREE_NODE_dirty);
+		new &= ~(1 << BTREE_NODE_need_write);
+		new |=  (1 << BTREE_NODE_write_in_flight);
+		new |=  (1 << BTREE_NODE_just_written);
+		new ^=  (1 << BTREE_NODE_write_idx);
+	} while (cmpxchg_acquire(&b->flags, old, new) != old);
+
+	BUG_ON(btree_node_fake(b));
+	BUG_ON((b->will_make_reachable != 0) != !b->written);
+
+	BUG_ON(b->written >= c->opts.btree_node_size);
+	BUG_ON(b->written & (c->opts.block_size - 1));
+	BUG_ON(bset_written(b, btree_bset_last(b)));
+	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
+	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+
+	/*
+	 * We can't block on six_lock_write() here; another thread might be
+	 * trying to get a journal reservation with read locks held, and getting
+	 * a journal reservation might be blocked on flushing the journal and
+	 * doing btree writes:
+	 */
+	if (lock_type_held == SIX_LOCK_intent &&
+	    six_trylock_write(&b->lock)) {
+		__bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
+		six_unlock_write(&b->lock);
+	} else {
+		__bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
+	}
+
+	BUG_ON(b->uncompacted_whiteout_u64s);
+
+	sort_iter_init(&sort_iter, b);
+
+	/*
+	 * Upper bound on the size of the bounce buffer: header (a full
+	 * btree_node for the first write, a btree_node_entry for later ones),
+	 * plus whiteouts, plus every unwritten bset:
+	 */
+	bytes = !b->written
+		? sizeof(struct btree_node)
+		: sizeof(struct btree_node_entry);
+
+	bytes += b->whiteout_u64s * sizeof(u64);
+
+	for_each_bset(b, t) {
+		i = bset(b, t);
+
+		if (bset_written(b, i))
+			continue;
+
+		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+		sort_iter_add(&sort_iter,
+			      btree_bkey_first(b, t),
+			      btree_bkey_last(b, t));
+		seq = max(seq, le64_to_cpu(i->journal_seq));
+	}
+
+	order = get_order(bytes);
+	data = btree_bounce_alloc(c, order, &used_mempool);
+
+	/* Exactly one of bn/bne is set, depending on whether this is the first write: */
+	if (!b->written) {
+		bn = data;
+		*bn = *b->data;
+		i = &bn->keys;
+	} else {
+		bne = data;
+		bne->keys = b->data->keys;
+		i = &bne->keys;
+	}
+
+	i->journal_seq	= cpu_to_le64(seq);
+	i->u64s		= 0;
+
+	/*
+	 * Non-extent btrees sort the unwritten whiteouts in with the keys;
+	 * extent btrees copy them to the start of the bset instead:
+	 */
+	if (!btree_node_is_extents(b)) {
+		sort_iter_add(&sort_iter,
+			      unwritten_whiteouts_start(c, b),
+			      unwritten_whiteouts_end(c, b));
+		SET_BSET_SEPARATE_WHITEOUTS(i, false);
+	} else {
+		memcpy_u64s(i->start,
+			    unwritten_whiteouts_start(c, b),
+			    b->whiteout_u64s);
+		i->u64s = cpu_to_le16(b->whiteout_u64s);
+		SET_BSET_SEPARATE_WHITEOUTS(i, true);
+	}
+
+	b->whiteout_u64s = 0;
+
+	u64s = btree_node_is_extents(b)
+		? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
+		: bch2_sort_keys(i->start, &sort_iter, false);
+	le16_add_cpu(&i->u64s, u64s);
+
+	clear_needs_whiteout(i);
+
+	/* do we have data to write? */
+	if (b->written && !i->u64s)
+		goto nowrite;
+
+	bytes_to_write = vstruct_end(i) - data;
+	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+	/* Zero the padding up to the block boundary: */
+	memset(data + bytes_to_write, 0,
+	       (sectors_to_write << 9) - bytes_to_write);
+
+	BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
+	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+	BUG_ON(i->seq != b->data->keys.seq);
+
+	i->version = c->sb.version < bcachefs_metadata_version_new_versioning
+		? cpu_to_le16(BCH_BSET_VERSION_OLD)
+		: cpu_to_le16(c->sb.version);
+	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
+
+	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
+		validate_before_checksum = true;
+
+	/* validate_bset will be modifying: */
+	if (le16_to_cpu(i->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		validate_before_checksum = true;
+
+	/* if we're going to be encrypting, check metadata validity first: */
+	if (validate_before_checksum &&
+	    validate_bset_for_write(c, b, i, sectors_to_write))
+		goto err;
+
+	bset_encrypt(c, i, b->written << 9);
+
+	nonce = btree_nonce(i, b->written << 9);
+
+	if (bn)
+		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+	else
+		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+	/* if we're not encrypting, check metadata after checksumming: */
+	if (!validate_before_checksum &&
+	    validate_bset_for_write(c, b, i, sectors_to_write))
+		goto err;
+
+	/*
+	 * We handle btree write errors by immediately halting the journal -
+	 * after we've done that, we can't issue any subsequent btree writes
+	 * because they might have pointers to new nodes that failed to write.
+	 *
+	 * Furthermore, there's no point in doing any more btree writes because
+	 * with the journal stopped, we're never going to update the journal to
+	 * reflect that those writes were done and the data flushed from the
+	 * journal:
+	 *
+	 * Make sure to update b->written so bch2_btree_init_next() doesn't
+	 * break:
+	 */
+	if (bch2_journal_error(&c->journal) ||
+	    c->opts.nochanges)
+		goto err;
+
+	trace_btree_write(b, bytes_to_write, sectors_to_write);
+
+	wbio = container_of(bio_alloc_bioset(GFP_NOIO,
+				buf_pages(data, sectors_to_write << 9),
+				&c->btree_bio),
+			    struct btree_write_bio, wbio.bio);
+	wbio_init(&wbio->wbio.bio);
+	wbio->data			= data;
+	wbio->wbio.order		= order;
+	wbio->wbio.used_mempool		= used_mempool;
+	wbio->wbio.bio.bi_opf		= REQ_OP_WRITE|REQ_META;
+	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
+	wbio->wbio.bio.bi_private	= b;
+
+	if (b->level || !b->written)
+		wbio->wbio.bio.bi_opf |= REQ_FUA;
+
+	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
+
+	/*
+	 * If we're appending to a leaf node, we don't technically need FUA -
+	 * this write just needs to be persisted before the next journal write,
+	 * which will be marked FLUSH|FUA.
+	 *
+	 * Similarly if we're writing a new btree root - the pointer is going to
+	 * be in the next journal entry.
+	 *
+	 * But if we're writing a new btree node (that isn't a root) or
+	 * appending to a non leaf btree node, we need either FUA or a flush
+	 * when we write the parent with the new pointer. FUA is cheaper than a
+	 * flush, and writes appending to leaf nodes aren't blocking anything so
+	 * just make all btree node writes FUA to keep things sane.
+	 *
+	 * NOTE(review): the code above only sets REQ_FUA for interior nodes
+	 * and for the first write to a node, not for all btree node writes -
+	 * confirm which of the two is intended.
+	 */
+
+	/*
+	 * Write to a copy of the node's key, with each pointer advanced past
+	 * the sectors of the node that have already been written:
+	 */
+	bkey_copy(&k.key, &b->key);
+
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
+		ptr->offset += b->written;
+
+	b->written += sectors_to_write;
+
+	bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
+	return;
+err:
+	set_btree_node_noevict(b);
+	b->written += sectors_to_write;
+nowrite:
+	btree_bounce_free(c, order, used_mempool, data);
+	btree_node_write_done(c, b);
+}
+
+/*
+ * Work that must be done with write lock held:
+ *
+ * Does nothing (and returns false) unless @b is flagged just_written. Otherwise
+ * clears that flag, sorts the node down to a single bset (or just drops
+ * whiteouts if there is only one bset), marks every bset as needing whiteouts,
+ * starts a new bset if want_new_bset() provides one, and rebuilds the
+ * auxiliary search trees.
+ *
+ * Returns true if the node was re-sorted, or if bch2_drop_whiteouts()
+ * returned true - i.e. the value of invalidated_iter.
+ */
+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
+{
+	bool invalidated_iter = false;
+	struct btree_node_entry *bne;
+	struct bset_tree *t;
+
+	if (!btree_node_just_written(b))
+		return false;
+
+	BUG_ON(b->whiteout_u64s);
+	BUG_ON(b->uncompacted_whiteout_u64s);
+
+	clear_btree_node_just_written(b);
+
+	/*
+	 * Note: immediately after write, bset_written() doesn't work - the
+	 * amount of data we had to write after compaction might have been
+	 * smaller than the offset of the last bset.
+	 *
+	 * However, we know that all bsets have been written here, as long as
+	 * we're still holding the write lock:
+	 */
+
+	/*
+	 * XXX: decide if we really want to unconditionally sort down to a
+	 * single bset:
+	 */
+	if (b->nsets > 1) {
+		btree_node_sort(c, b, NULL, 0, b->nsets, true);
+		invalidated_iter = true;
+	} else {
+		invalidated_iter = bch2_drop_whiteouts(b);
+	}
+
+	for_each_bset(b, t)
+		set_needs_whiteout(bset(b, t));
+
+	bch2_btree_verify(c, b);
+
+	/*
+	 * If later we don't unconditionally sort down to a single bset, we have
+	 * to ensure this is still true:
+	 */
+	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
+
+	bne = want_new_bset(c, b);
+	if (bne)
+		bch2_bset_init_next(c, b, bne);
+
+	bch2_btree_build_aux_trees(b);
+
+	return invalidated_iter;
+}
+
+/*
+ * Write out btree node @b; the caller holds either a read or an intent lock
+ * on it (@lock_type_held - a write lock is not allowed).
+ *
+ * With an intent lock held - or if a read lock could be upgraded to intent -
+ * the write is issued as an intent-locked write, and post write cleanup is
+ * done right away if the write lock can be taken without blocking. A read lock
+ * that was upgraded is downgraded again before returning.
+ *
+ * If the upgrade fails, the write is issued with just the read lock.
+ */
+void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+			  enum six_lock_type lock_type_held)
+{
+	BUG_ON(lock_type_held == SIX_LOCK_write);
+
+	if (lock_type_held == SIX_LOCK_intent ||
+	    six_lock_tryupgrade(&b->lock)) {
+		__bch2_btree_node_write(c, b, SIX_LOCK_intent);
+
+		/* don't cycle lock unnecessarily: */
+		if (btree_node_just_written(b) &&
+		    six_trylock_write(&b->lock)) {
+			bch2_btree_post_write_cleanup(c, b);
+			six_unlock_write(&b->lock);
+		}
+
+		/* Give the caller back the lock type it came in with: */
+		if (lock_type_held == SIX_LOCK_read)
+			six_lock_downgrade(&b->lock);
+	} else {
+		__bch2_btree_node_write(c, b, SIX_LOCK_read);
+	}
+}
+
+/*
+ * Wait until no cached btree node has bit @flag set in b->flags.
+ *
+ * The cache is walked under rcu_read_lock(); whenever a node with the flag set
+ * is found we drop the RCU read lock, sleep until the bit clears, and restart
+ * the walk from the beginning.
+ */
+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+{
+	struct bucket_table *tbl;
+	struct rhash_head *pos;
+	struct btree *b;
+	unsigned i;
+restart:
+	rcu_read_lock();
+	for_each_cached_btree(b, c, tbl, i, pos)
+		if (test_bit(flag, &b->flags)) {
+			/* Can't sleep inside the RCU read side critical section: */
+			rcu_read_unlock();
+			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+			goto restart;
+
+		}
+	rcu_read_unlock();
+}
+
+/* Wait until no cached btree node has BTREE_NODE_read_in_flight set */
+void bch2_btree_flush_all_reads(struct bch_fs *c)
+{
+	__bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+}
+
+/* Wait until no cached btree node has BTREE_NODE_write_in_flight set */
+void bch2_btree_flush_all_writes(struct bch_fs *c)
+{
+	__bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+}
+
+/*
+ * Assert that every cached btree node has been flushed: BUG if any node is
+ * still dirty or still has a write in flight.
+ */
+void bch2_btree_verify_flushed(struct bch_fs *c)
+{
+	const unsigned long unflushed =
+		(1 << BTREE_NODE_dirty)|
+		(1 << BTREE_NODE_write_in_flight);
+	struct bucket_table *tbl;
+	struct rhash_head *pos;
+	struct btree *b;
+	unsigned i;
+
+	rcu_read_lock();
+	for_each_cached_btree(b, c, tbl, i, pos) {
+		BUG_ON(READ_ONCE(b->flags) & unflushed);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Print one line for every dirty node in the btree node cache into @buf (at
+ * most PAGE_SIZE bytes); returns the number of bytes written.
+ *
+ * Fields of each line:
+ *   d - dirty flag
+ *   n - need_write flag
+ *   l - level of the node
+ *   w - b->written
+ *   b - whether the write_blocked list is non empty
+ *   r - whether will_make_reachable is set, ':' its low bit
+ *   c - whether writes[write_idx] has waiters
+ *   p - whether writes[!write_idx] has waiters
+ */
+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	struct bucket_table *tbl;
+	struct rhash_head *pos;
+	struct btree *b;
+	unsigned i;
+
+	rcu_read_lock();
+	for_each_cached_btree(b, c, tbl, i, pos) {
+		/* Snapshot the flags once so every field is consistent: */
+		unsigned long flags = READ_ONCE(b->flags);
+		unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
+
+		if (!(flags & (1 << BTREE_NODE_dirty)))
+			continue;
+
+		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+		       b,
+		       (flags & (1 << BTREE_NODE_dirty)) != 0,
+		       (flags & (1 << BTREE_NODE_need_write)) != 0,
+		       b->level,
+		       b->written,
+		       !list_empty_careful(&b->write_blocked),
+		       b->will_make_reachable != 0,
+		       b->will_make_reachable & 1,
+		       b->writes[ idx].wait.list.first != NULL,
+		       b->writes[!idx].wait.list.first != NULL);
+	}
+	rcu_read_unlock();
+
+	return out.pos - buf;
+}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
new file mode 100644
index 000000000000..955a80cafae3
--- /dev/null
+++ b/fs/bcachefs/btree_io.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_IO_H
+#define _BCACHEFS_BTREE_IO_H
+
+#include "bset.h"
+#include "btree_locking.h"
+#include "extents.h"
+#include "io_types.h"
+
+struct bch_fs;
+struct btree_write;
+struct btree;
+struct btree_iter;
+
+/*
+ * Per-read state for a btree node read. The bio is embedded; it is allocated
+ * from c->btree_bio and the containing struct is recovered with
+ * container_of().
+ */
+struct btree_read_bio {
+	struct bch_fs		*c;
+	/* local_clock() at submission, for latency accounting */
+	u64			start_time;
+	/* result of bch2_dev_get_ioref() on the device being read from */
+	unsigned		have_ioref:1;
+	/* the pointer/device chosen by bch2_bkey_pick_read_device() */
+	struct extent_ptr_decoded	pick;
+	/* runs btree_node_read_work() */
+	struct work_struct	work;
+	struct bio		bio;
+};
+
+/* Per-write state for a btree node write; embeds the bch_write_bio (and bio) */
+struct btree_write_bio {
+	/* bounce buffer being written, freed by btree_node_write_work() */
+	void			*data;
+	/* runs btree_node_write_work() once the write has completed */
+	struct work_struct	work;
+	struct bch_write_bio	wbio;
+};
+
+/*
+ * Release the "io lock" on @b: clear BTREE_NODE_write_in_flight and wake up
+ * anyone sleeping on that bit. The flag must currently be set.
+ */
+static inline void btree_node_io_unlock(struct btree *b)
+{
+	EBUG_ON(!btree_node_write_in_flight(b));
+	clear_btree_node_write_in_flight(b);
+	/*
+	 * wake_up_bit() checks for waiters without a barrier of its own, so
+	 * the flag clear must be ordered before that check - otherwise a
+	 * waiter that is just going to sleep can miss the wakeup:
+	 */
+	smp_mb__after_atomic();
+	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+/*
+ * Take the "io lock" on @b: sleep (uninterruptibly, accounted as IO wait)
+ * until BTREE_NODE_write_in_flight can be set by us.
+ */
+static inline void btree_node_io_lock(struct btree *b)
+{
+	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+			    TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Sleep (uninterruptibly) until BTREE_NODE_write_in_flight is clear on @b,
+ * without setting it ourselves.
+ */
+static inline void btree_node_wait_on_io(struct btree *b)
+{
+	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * A node may be written only if nothing is on its write_blocked list, and it
+ * is not the case that it both has been written before and still has
+ * will_make_reachable set.
+ */
+static inline bool btree_node_may_write(struct btree *b)
+{
+	if (!list_empty_careful(&b->write_blocked))
+		return false;
+
+	return !(b->written && b->will_make_reachable);
+}
+
+/* Modes for __bch2_compact_whiteouts(): */
+enum compact_mode {
+	/* used by bch2_maybe_compact_whiteouts() */
+	COMPACT_LAZY,
+	/* used when writing a node with the write lock held */
+	COMPACT_WRITTEN,
+	/* used when writing a node without the write lock */
+	COMPACT_WRITTEN_NO_WRITE_LOCK,
+};
+
+bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
+
+/*
+ * Lazy compaction heuristic for bset @t of node @b: worthwhile once more than
+ * 64 u64s are dead, and they make up more than a third of the bset.
+ */
+static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
+{
+	unsigned live	= b->nr.bset_u64s[t - b->set];
+	unsigned total	= bset_u64s(t);
+	unsigned dead	= total - live;
+
+	if (dead <= 64)
+		return false;
+
+	return dead * 3 > total;
+}
+
+/*
+ * Run a COMPACT_LAZY whiteout compaction on @b if any of its bsets passes
+ * should_compact_bset_lazy(). Returns what __bch2_compact_whiteouts()
+ * returned, or false if no compaction was attempted.
+ */
+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
+{
+	struct bset_tree *t;
+
+	for_each_bset(b, t)
+		if (should_compact_bset_lazy(b, t))
+			return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
+
+	return false;
+}
+
+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
+
+void bch2_btree_build_aux_trees(struct btree *);
+void bch2_btree_init_next(struct bch_fs *, struct btree *,
+			 struct btree_iter *);
+
+int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+int bch2_btree_root_read(struct bch_fs *, enum btree_id,
+			 const struct bkey_i *, unsigned);
+
+void bch2_btree_complete_write(struct bch_fs *, struct btree *,
+			      struct btree_write *);
+void bch2_btree_write_error_work(struct work_struct *);
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *,
+			    enum six_lock_type);
+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+
+void bch2_btree_node_write(struct bch_fs *, struct btree *,
+			  enum six_lock_type);
+
+/*
+ * Write @b if it has been written before, is flagged need_write, and may
+ * currently be written.
+ *
+ * Called with @b read locked. If a write is already in flight the read lock
+ * is dropped while waiting for it and retaken afterwards, then the conditions
+ * are rechecked.
+ */
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+{
+	while (b->written &&
+	       btree_node_need_write(b) &&
+	       btree_node_may_write(b)) {
+		if (!btree_node_write_in_flight(b)) {
+			bch2_btree_node_write(c, b, SIX_LOCK_read);
+			break;
+		}
+
+		/* Don't sleep on the in flight write with the node locked: */
+		six_unlock_read(&b->lock);
+		btree_node_wait_on_io(b);
+		btree_node_lock_type(c, b, SIX_LOCK_read);
+	}
+}
+
+/*
+ * If @_b is dirty and @cond is true, atomically set its need_write flag; then
+ * call btree_node_write_if_need() (which is done regardless of @cond).
+ *
+ * Note that @cond is evaluated inside the cmpxchg retry loop, so it may be
+ * evaluated more than once.
+ */
+#define bch2_btree_node_write_cond(_c, _b, cond)			\
+do {									\
+	unsigned long old, new, v = READ_ONCE((_b)->flags);		\
+									\
+	do {								\
+		old = new = v;						\
+									\
+		if (!(old & (1 << BTREE_NODE_dirty)) || !(cond))	\
+			break;						\
+									\
+		new |= (1 << BTREE_NODE_need_write);			\
+	} while ((v = cmpxchg(&(_b)->flags, old, new)) != old);		\
+									\
+	btree_node_write_if_need(_c, _b);				\
+} while (0)
+
+void bch2_btree_flush_all_reads(struct bch_fs *);
+void bch2_btree_flush_all_writes(struct bch_fs *);
+void bch2_btree_verify_flushed(struct bch_fs *);
+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
+
+#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
new file mode 100644
index 000000000000..5fab505dbea0
--- /dev/null
+++ b/fs/bcachefs/btree_iter.c
@@ -0,0 +1,2158 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <linux/prefetch.h>
+#include <trace/events/bcachefs.h>
+
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
+						    struct btree_iter_level *,
+						    struct bkey *);
+
+/*
+ * Sentinel values stored in iter->l[].b in place of a real node pointer.
+ * is_btree_node() treats any value below 128 as "not a node"; presumably the
+ * distinct values record why the level has no node - TODO confirm.
+ */
+#define BTREE_ITER_NO_NODE_GET_LOCKS	((struct btree *) 1)
+#define BTREE_ITER_NO_NODE_DROP		((struct btree *) 2)
+#define BTREE_ITER_NO_NODE_LOCK_ROOT	((struct btree *) 3)
+#define BTREE_ITER_NO_NODE_UP		((struct btree *) 4)
+#define BTREE_ITER_NO_NODE_DOWN		((struct btree *) 5)
+#define BTREE_ITER_NO_NODE_INIT		((struct btree *) 6)
+#define BTREE_ITER_NO_NODE_ERROR	((struct btree *) 7)
+
+/*
+ * Does @iter have a real btree node at level @l - i.e. is @l a valid level,
+ * and is the pointer there something other than one of the small
+ * BTREE_ITER_NO_NODE_* sentinel values?
+ */
+static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+{
+	if (l >= BTREE_MAX_DEPTH)
+		return false;
+
+	return (unsigned long) iter->l[l].b >= 128;
+}
+
+/*
+ * Returns < 0 if @k is before iter pos, > 0 if @k is after
+ *
+ * Never returns 0: when @k compares equal to the iterator position, ties are
+ * broken as follows:
+ *  - deleted keys are always considered to be before the iterator position
+ *  - for extents iterators, so are all other keys - except, in interior nodes
+ *    (@interior_node), a key at POS_MAX
+ *  - everything else is considered to be after it
+ */
+static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
+				       const struct btree *b,
+				       const struct bkey_packed *k,
+				       bool interior_node)
+{
+	int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
+
+	if (cmp)
+		return cmp;
+	if (bkey_deleted(k))
+		return -1;
+
+	/*
+	 * Normally, for extents we want the first key strictly greater than
+	 * the iterator position - with the exception that for interior nodes,
+	 * we don't want to advance past the last key if the iterator position
+	 * is POS_MAX:
+	 */
+	if (iter->flags & BTREE_ITER_IS_EXTENTS &&
+	    (!interior_node ||
+	     bkey_cmp_left_packed_byval(b, k, POS_MAX)))
+		return -1;
+	return 1;
+}
+
+/*
+ * Compare @k against the iterator position, treating @b as an interior node
+ * iff its level is nonzero; see __btree_iter_pos_cmp().
+ */
+static inline int btree_iter_pos_cmp(struct btree_iter *iter,
+				     const struct btree *b,
+				     const struct bkey_packed *k)
+{
+	return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
+}
+
+/* Btree node locking: */
+
+/* Out of line wrapper around bch2_btree_node_unlock_write_inlined() */
+void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+{
+	bch2_btree_node_unlock_write_inlined(b, iter);
+}
+
+/*
+ * Take a write lock on @b, which @iter must already have intent locked.
+ *
+ * Iterators in the same transaction may hold read locks on @b; those would
+ * block the write lock, so their count is temporarily subtracted from the
+ * lock's state while we take the write lock and added back afterwards.
+ */
+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+	struct btree_iter *linked;
+	unsigned readers = 0;
+
+	EBUG_ON(!btree_node_intent_locked(iter, b->level));
+
+	/* Count the read locks this transaction holds on @b: */
+	trans_for_each_iter(iter->trans, linked)
+		if (linked->l[b->level].b == b &&
+		    btree_node_read_locked(linked, b->level))
+			readers++;
+
+	/*
+	 * Must drop our read locks before calling six_lock_write() -
+	 * six_unlock() won't do wakeups until the reader count
+	 * goes to 0, and it's safe because we have the node intent
+	 * locked:
+	 */
+	atomic64_sub(__SIX_VAL(read_lock, readers),
+		     &b->lock.state.counter);
+	btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
+	atomic64_add(__SIX_VAL(read_lock, readers),
+		     &b->lock.state.counter);
+}
+
+/*
+ * Try to retake the lock @iter wants on the node at @level, without blocking.
+ *
+ * Succeeds if the node can be relocked at the sequence number the iterator
+ * remembered, or if the sequence number still matches and
+ * btree_node_lock_increment() succeeds. Returns true if the node is now
+ * locked (and marked as such in @iter), false otherwise.
+ */
+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+{
+	struct btree *b = btree_iter_node(iter, level);
+	int want = __btree_lock_want(iter, level);
+
+	if (!is_btree_node(iter, level) || race_fault())
+		return false;
+
+	if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) &&
+	    !(btree_node_lock_seq_matches(iter, b, level) &&
+	      btree_node_lock_increment(iter, b, level, want)))
+		return false;
+
+	mark_btree_node_locked(iter, level, want);
+	return true;
+}
+
+/*
+ * Try to get an intent lock on @iter's node at @level, without blocking.
+ *
+ * Returns true if the node is intent locked on return (including when it
+ * already was), false if there is no node at that level or the lock couldn't
+ * be obtained.
+ */
+static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+{
+	struct btree *b = iter->l[level].b;
+
+	EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
+
+	if (!is_btree_node(iter, level))
+		return false;
+
+	if (btree_node_intent_locked(iter, level))
+		return true;
+
+	if (race_fault())
+		return false;
+
+	/*
+	 * If we hold a lock on the node try upgrading it, otherwise try to
+	 * relock at the sequence number we remembered:
+	 */
+	if (btree_node_locked(iter, level)
+	    ? six_lock_tryupgrade(&b->lock)
+	    : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
+		goto success;
+
+	/*
+	 * presumably btree_node_lock_increment() takes another reference on an
+	 * intent lock already held by a linked iterator - TODO confirm. The
+	 * lock we held ourselves (if any) is then dropped:
+	 */
+	if (btree_node_lock_seq_matches(iter, b, level) &&
+	    btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
+		btree_node_unlock(iter, level);
+		goto success;
+	}
+
+	return false;
+success:
+	mark_btree_node_intent_locked(iter, level);
+	return true;
+}
+
+/*
+ * Relock (or, if @upgrade, upgrade to intent) every node of @iter from
+ * iter->level up to - but not including - iter->locks_want, stopping early at
+ * the first level without a node. At least one level is always attempted.
+ *
+ * Every level is tried even after a failure. If any level failed, the
+ * iterator is marked BTREE_ITER_NEED_TRAVERSE and all levels from the highest
+ * failed one down to 0 are unlocked and set to BTREE_ITER_NO_NODE_GET_LOCKS.
+ * If @trace is set, failures are also reported via tracepoints.
+ *
+ * Returns true if the iterator no longer needs relocking or traversing.
+ */
+static inline bool btree_iter_get_locks(struct btree_iter *iter,
+					bool upgrade, bool trace)
+{
+	unsigned l = iter->level;
+	int fail_idx = -1;
+
+	do {
+		if (!btree_iter_node(iter, l))
+			break;
+
+		if (!(upgrade
+		      ? bch2_btree_node_upgrade(iter, l)
+		      : bch2_btree_node_relock(iter, l))) {
+			if (trace)
+				(upgrade
+				 ? trace_node_upgrade_fail
+				 : trace_node_relock_fail)(l, iter->l[l].lock_seq,
+						is_btree_node(iter, l)
+						? 0
+						: (unsigned long) iter->l[l].b,
+						is_btree_node(iter, l)
+						? iter->l[l].b->lock.state.seq
+						: 0);
+
+			/* Remember the highest level we failed at: */
+			fail_idx = l;
+			btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+		}
+
+		l++;
+	} while (l < iter->locks_want);
+
+	/*
+	 * When we fail to get a lock, we have to ensure that any child nodes
+	 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+	 * the node that we failed to relock:
+	 */
+	while (fail_idx >= 0) {
+		btree_node_unlock(iter, fail_idx);
+		iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+		--fail_idx;
+	}
+
+	/* Nothing failed, so the relock that was needed has now been done: */
+	if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
+		iter->uptodate = BTREE_ITER_NEED_PEEK;
+
+	bch2_btree_trans_verify_locks(iter->trans);
+
+	return iter->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+/*
+ * Slowpath:
+ *
+ * Take a lock of type @type on @b (at @level, for position @pos), blocking if
+ * necessary - but only after checking every iterator in the transaction that
+ * holds locks, to make sure blocking can't deadlock.
+ *
+ * If blocking isn't safe, returns false without taking the lock (tracing a
+ * would-deadlock transaction restart). Unless the transaction is in nounlock
+ * mode, locks_want of the offending iterators is raised and an upgrade is
+ * attempted on them first. Returns true once the lock has been taken.
+ */
+bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
+			   unsigned level,
+			   struct btree_iter *iter,
+			   enum six_lock_type type)
+{
+	struct btree_iter *linked;
+	bool ret = true;
+
+	/* Check if it's safe to block: */
+	trans_for_each_iter(iter->trans, linked) {
+		if (!linked->nodes_locked)
+			continue;
+
+		/* Must lock btree nodes in key order: */
+		if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
+			ret = false;
+
+		/*
+		 * Can't block taking an intent lock if we have _any_ nodes read
+		 * locked:
+		 *
+		 * - Our read lock blocks another thread with an intent lock on
+		 *   the same node from getting a write lock, and thus from
+		 *   dropping its intent lock
+		 *
+		 * - And the other thread may have multiple nodes intent locked:
+		 *   both the node we want to intent lock, and the node we
+		 *   already have read locked - deadlock:
+		 */
+		if (type == SIX_LOCK_intent &&
+		    linked->nodes_locked != linked->nodes_intent_locked) {
+			if (!(iter->trans->nounlock)) {
+				linked->locks_want = max_t(unsigned,
+						linked->locks_want,
+						__fls(linked->nodes_locked) + 1);
+				btree_iter_get_locks(linked, true, false);
+			}
+			ret = false;
+		}
+
+		/*
+		 * Interior nodes must be locked before their descendants: if
+		 * another iterator has possible descendants locked of the node
+		 * we're about to lock, it must have the ancestors locked too:
+		 */
+		if (linked->btree_id == iter->btree_id &&
+		    level > __fls(linked->nodes_locked)) {
+			if (!(iter->trans->nounlock)) {
+				linked->locks_want =
+					max(level + 1, max_t(unsigned,
+					    linked->locks_want,
+					    iter->locks_want));
+				btree_iter_get_locks(linked, true, false);
+			}
+			ret = false;
+		}
+	}
+
+	if (unlikely(!ret)) {
+		trace_trans_restart_would_deadlock(iter->trans->ip);
+		return false;
+	}
+
+	__btree_node_lock_type(iter->trans->c, b, type);
+	return true;
+}
+
+/* Btree iterator locking: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/*
+ * Debug check: for every level of @iter that has a node, the lock type held
+ * must be the lock type wanted. Unlocked levels are skipped if the iterator
+ * is flagged as needing a relock (or more).
+ */
+void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+{
+	unsigned l;
+
+	for (l = 0; btree_iter_node(iter, l); l++) {
+		if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
+		    !btree_node_locked(iter, l))
+			continue;
+
+		BUG_ON(btree_lock_want(iter, l) !=
+		       btree_node_locked_type(iter, l));
+	}
+}
+
+/* Debug check: run bch2_btree_iter_verify_locks() on every iterator in @trans */
+void bch2_btree_trans_verify_locks(struct btree_trans *trans)
+{
+	struct btree_iter *iter;
+
+	trans_for_each_iter(trans, iter)
+		bch2_btree_iter_verify_locks(iter);
+}
+#endif
+
+/*
+ * Retake @iter's locks if it is flagged as needing a relock (or more);
+ * returns true if the iterator didn't need relocking or was relocked
+ * successfully.
+ */
+__flatten
+static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
+{
+	if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+		return true;
+
+	return btree_iter_get_locks(iter, false, trace);
+}
+
+/*
+ * Raise @iter's locks_want to @new_locks_want (which must be greater than the
+ * current value) and try to take the additional intent locks.
+ *
+ * Returns true on success. On failure returns false, after also raising
+ * locks_want on - and attempting to upgrade - the other iterators of this
+ * transaction that are on the same btree and want fewer locks.
+ */
+bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+			       unsigned new_locks_want)
+{
+	struct btree_iter *linked;
+
+	EBUG_ON(iter->locks_want >= new_locks_want);
+
+	iter->locks_want = new_locks_want;
+
+	if (btree_iter_get_locks(iter, true, true))
+		return true;
+
+	/*
+	 * Ancestor nodes must be locked before child nodes, so set locks_want
+	 * on iterators that might lock ancestors before us to avoid getting
+	 * -EINTR later:
+	 */
+	trans_for_each_iter(iter->trans, linked)
+		if (linked != iter &&
+		    linked->btree_id == iter->btree_id &&
+		    linked->locks_want < new_locks_want) {
+			linked->locks_want = new_locks_want;
+			btree_iter_get_locks(linked, true, false);
+		}
+
+	return false;
+}
+
+/*
+ * Like __bch2_btree_iter_upgrade(), but never drops locks and leaves the
+ * other iterators of the transaction alone.
+ *
+ * Upgrades level by level starting at iter->level; at the first level that
+ * can't be upgraded, locks_want is set to that level and false is returned.
+ * Returns true if every level (up to the first without a node) was upgraded.
+ */
+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
+					unsigned new_locks_want)
+{
+	unsigned l = iter->level;
+
+	EBUG_ON(iter->locks_want >= new_locks_want);
+
+	iter->locks_want = new_locks_want;
+
+	do {
+		if (!btree_iter_node(iter, l))
+			break;
+
+		if (!bch2_btree_node_upgrade(iter, l)) {
+			/* Only want the levels we managed to upgrade: */
+			iter->locks_want = l;
+			return false;
+		}
+
+		l++;
+	} while (l < iter->locks_want);
+
+	return true;
+}
+
+/*
+ * Lower locks_want on every iterator in @iter's transaction and release the
+ * locks that are no longer wanted.
+ *
+ * The new locks_want is @downgrade_to or, if that is 0, 1 for iterators with
+ * BTREE_ITER_INTENT set and 0 for the rest. Locks on levels above an
+ * iterator's own level are dropped entirely; an intent lock at or below the
+ * iterator's level is downgraded.
+ */
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+				 unsigned downgrade_to)
+{
+	struct btree_iter *linked;
+	unsigned l;
+
+	/*
+	 * We downgrade linked iterators as well because btree_iter_upgrade
+	 * might have had to modify locks_want on linked iterators due to lock
+	 * ordering:
+	 */
+	trans_for_each_iter(iter->trans, linked) {
+		unsigned new_locks_want = downgrade_to ?:
+			(linked->flags & BTREE_ITER_INTENT ? 1 : 0);
+
+		if (linked->locks_want <= new_locks_want)
+			continue;
+
+		linked->locks_want = new_locks_want;
+
+		/* Walk down from the highest locked level: */
+		while (linked->nodes_locked &&
+		       (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
+			if (l > linked->level) {
+				btree_node_unlock(linked, l);
+			} else {
+				if (btree_node_intent_locked(linked, l)) {
+					six_lock_downgrade(&linked->l[l].b->lock);
+					linked->nodes_intent_locked ^= 1 << l;
+				}
+				break;
+			}
+		}
+	}
+
+	bch2_btree_trans_verify_locks(iter->trans);
+}
+
+/* Btree transaction locking: */
+
+/*
+ * Try to relock every iterator in @trans that is flagged
+ * BTREE_ITER_NEED_RELOCK. All of them are attempted even after a failure;
+ * returns true only if every attempt succeeded.
+ */
+bool bch2_trans_relock(struct btree_trans *trans)
+{
+	struct btree_iter *iter;
+	bool ret = true;
+
+	trans_for_each_iter(trans, iter)
+		if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
+			ret &= bch2_btree_iter_relock(iter, true);
+
+	return ret;
+}
+
+/* Call __bch2_btree_iter_unlock() on every iterator in @trans */
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+	struct btree_iter *iter;
+
+	trans_for_each_iter(trans, iter)
+		__bch2_btree_iter_unlock(iter);
+}
+
+/* Btree iterator: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+/*
+ * Debug check that @iter's node iterator for @b is consistent with the
+ * iterator position: the previous key must not compare after the position,
+ * and the current key must not compare before it. Panics on violation.
+ *
+ * Only runs when debug_check_iterators() is enabled, and only for iterators
+ * that don't need relocking or traversing.
+ */
+static void __bch2_btree_iter_verify(struct btree_iter *iter,
+				     struct btree *b)
+{
+	struct btree_iter_level *l = &iter->l[b->level];
+	/* Work on a copy, so that stepping backwards doesn't move l->iter: */
+	struct btree_node_iter tmp = l->iter;
+	struct bkey_packed *k;
+
+	if (!debug_check_iterators(iter->trans->c))
+		return;
+
+	if (iter->uptodate > BTREE_ITER_NEED_PEEK)
+		return;
+
+	bch2_btree_node_iter_verify(&l->iter, b);
+
+	/*
+	 * For interior nodes, the iterator will have skipped past
+	 * deleted keys:
+	 *
+	 * For extents, the iterator may have skipped past deleted keys (but not
+	 * whiteouts)
+	 */
+	k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
+		? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
+		: bch2_btree_node_iter_prev_all(&tmp, b);
+	if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
+		char buf[100];
+		struct bkey uk = bkey_unpack_key(b, k);
+
+		bch2_bkey_to_text(&PBUF(buf), &uk);
+		panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
+		      buf, iter->pos.inode, iter->pos.offset);
+	}
+
+	k = bch2_btree_node_iter_peek_all(&l->iter, b);
+	if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
+		char buf[100];
+		struct bkey uk = bkey_unpack_key(b, k);
+
+		bch2_bkey_to_text(&PBUF(buf), &uk);
+		panic("iter should be after current key:\n"
+		      "iter pos %llu:%llu\n"
+		      "cur key  %s\n",
+		      iter->pos.inode, iter->pos.offset, buf);
+	}
+
+	/*
+	 * An up to date keys iterator that isn't on a whiteout can't be at the
+	 * end of the node:
+	 */
+	BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
+	       btree_iter_type(iter) == BTREE_ITER_KEYS &&
+	       !bkey_whiteout(&iter->k) &&
+	       bch2_btree_node_iter_end(&l->iter));
+}
+
+/*
+ * Run __bch2_btree_iter_verify() on each iterator that
+ * trans_for_each_iter_with_node() yields for @b in @iter's transaction.
+ * Does nothing unless debug_check_iterators() is enabled.
+ */
+void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree_iter *it;
+
+	if (debug_check_iterators(trans->c)) {
+		trans_for_each_iter_with_node(trans, b, it)
+			__bch2_btree_iter_verify(it, b);
+	}
+}
+
+#else
+
+/* Without CONFIG_BCACHEFS_DEBUG, iterator verification is a no-op. */
+static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
+					    struct btree *b)
+{
+}
+
+#endif
+
+/*
+ * Point @iter's entry for bset @t at key @k.
+ *
+ * If @iter already has a set whose end offset equals t->end_offset, that set's
+ * position is overwritten and the iterator is re-sorted; otherwise a new set
+ * running from @k to the end of @t is pushed.
+ */
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+					struct btree *b,
+					struct bset_tree *t,
+					struct bkey_packed *k)
+{
+	struct btree_node_iter_set *s;
+
+	btree_node_iter_for_each(iter, s) {
+		if (s->end != t->end_offset)
+			continue;
+
+		s->k = __btree_node_key_to_offset(b, k);
+		bch2_btree_node_iter_sort(iter, b);
+		return;
+	}
+
+	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
+
+/*
+ * If @where is the key that @iter's level-0 node iterator currently points at
+ * in @b, re-unpack it into iter->k and mark the iterator
+ * BTREE_ITER_NEED_PEEK. Otherwise leave @iter alone.
+ */
+static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+						    struct btree *b,
+						    struct bkey_packed *where)
+{
+	struct bkey_packed *cur =
+		bch2_btree_node_iter_peek_all(&iter->l[0].iter, b);
+
+	if (cur != where)
+		return;
+
+	bkey_disassemble(b, where, &iter->k);
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+}
+
+/*
+ * Apply __bch2_btree_iter_fix_key_modified() for the key at @where to each
+ * iterator that trans_for_each_iter_with_node() yields for @b in @iter's
+ * transaction, verifying each one afterwards.
+ */
+void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+				      struct btree *b,
+				      struct bkey_packed *where)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree_iter *it;
+
+	trans_for_each_iter_with_node(trans, b, it) {
+		__bch2_btree_iter_fix_key_modified(it, b, where);
+		__bch2_btree_iter_verify(it, b);
+	}
+}
+
+/*
+ * Fix up @node_iter after the keys of bset @t in node @b changed at @where:
+ * @clobber_u64s u64s of old key data were replaced by @new_u64s u64s.
+ *
+ * @iter supplies the search position and flags used for the fixup. When
+ * @node_iter is @iter's own level-0 node iterator and the key it points at
+ * changed, iter->k is refreshed and the iterator marked BTREE_ITER_NEED_PEEK.
+ */
+static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
+				      struct btree *b,
+				      struct btree_node_iter *node_iter,
+				      struct bset_tree *t,
+				      struct bkey_packed *where,
+				      unsigned clobber_u64s,
+				      unsigned new_u64s)
+{
+	const struct bkey_packed *end = btree_bkey_last(b, t);
+	struct btree_node_iter_set *set;
+	unsigned offset = __btree_node_key_to_offset(b, where);
+	/* Net size change of the bset in u64s; negative if it shrank */
+	int shift = new_u64s - clobber_u64s;
+	/* End offset of the bset before the change, as the iterator recorded it */
+	unsigned old_end = t->end_offset - shift;
+	unsigned orig_iter_pos = node_iter->data[0].k;
+	/* Did the iterator's first set point into the range that was clobbered? */
+	bool iter_current_key_modified =
+		orig_iter_pos >= offset &&
+		orig_iter_pos <= offset + clobber_u64s;
+
+	btree_node_iter_for_each(node_iter, set)
+		if (set->end == old_end)
+			goto found;
+
+	/* didn't find the bset in the iterator - might have to readd it: */
+	if (new_u64s &&
+	    btree_iter_pos_cmp(iter, b, where) > 0) {
+		bch2_btree_node_iter_push(node_iter, b, where, end);
+		goto fixup_done;
+	} else {
+		/* Iterator is after key that changed */
+		return;
+	}
+found:
+	/* Record the bset's new end offset */
+	set->end = t->end_offset;
+
+	/* Iterator hasn't gotten to the key that changed yet: */
+	if (set->k < offset)
+		return;
+
+	if (new_u64s &&
+	    btree_iter_pos_cmp(iter, b, where) > 0) {
+		/* A key was written at @where: point the set at it */
+		set->k = offset;
+	} else if (set->k < offset + clobber_u64s) {
+		/* The set pointed inside the clobbered range: skip past the new data */
+		set->k = offset + new_u64s;
+		if (set->k == set->end)
+			bch2_btree_node_iter_set_drop(node_iter, set);
+	} else {
+		/* Iterator is after key that changed */
+		set->k = (int) set->k + shift;
+		return;
+	}
+
+	bch2_btree_node_iter_sort(node_iter, b);
+fixup_done:
+	if (node_iter->data[0].k != orig_iter_pos)
+		iter_current_key_modified = true;
+
+	/*
+	 * When a new key is added, and the node iterator now points to that
+	 * key, the iterator might have skipped past deleted keys that should
+	 * come after the key the iterator now points to. We have to rewind to
+	 * before those deleted keys - otherwise
+	 * bch2_btree_node_iter_prev_all() breaks:
+	 */
+	if (!bch2_btree_node_iter_end(node_iter) &&
+	    iter_current_key_modified &&
+	    (b->level ||
+	     (iter->flags & BTREE_ITER_IS_EXTENTS))) {
+		/* NOTE(review): this local shadows the @t parameter */
+		struct bset_tree *t;
+		struct bkey_packed *k, *k2, *p;
+
+		k = bch2_btree_node_iter_peek_all(node_iter, b);
+
+		for_each_bset(b, t) {
+			bool set_pos = false;
+
+			/* Skip the bset that the iterator's first set belongs to */
+			if (node_iter->data[0].end == t->end_offset)
+				continue;
+
+			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+			/* Step back over keys in this bset that compare after @k */
+			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+			       bkey_iter_cmp(b, k, p) < 0) {
+				k2 = p;
+				set_pos = true;
+			}
+
+			if (set_pos)
+				btree_node_iter_set_set_pos(node_iter,
+							    b, t, k2);
+		}
+	}
+
+	/* Keep iter->k in sync when this is the iterator's own leaf iterator */
+	if (!b->level &&
+	    node_iter == &iter->l[0].iter &&
+	    iter_current_key_modified) {
+		struct bkey_packed *k =
+			bch2_btree_node_iter_peek_all(node_iter, b);
+
+		if (likely(k)) {
+			bkey_disassemble(b, k, &iter->k);
+		} else {
+			/* XXX: for extents, calculate size of hole? */
+			iter->k.type = KEY_TYPE_deleted;
+		}
+
+		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+	}
+}
+
+/*
+ * Keys in @b changed at @where (@clobber_u64s u64s replaced by @new_u64s):
+ * fix up @node_iter and the node iterators of the iterators that
+ * trans_for_each_iter_with_node() yields for @b in @iter's transaction.
+ *
+ * @node_iter is fixed separately only when it is not @iter's own node
+ * iterator for @b's level.
+ */
+void bch2_btree_node_iter_fix(struct btree_iter *iter,
+			      struct btree *b,
+			      struct btree_node_iter *node_iter,
+			      struct bkey_packed *where,
+			      unsigned clobber_u64s,
+			      unsigned new_u64s)
+{
+	struct bset_tree *bset = bch2_bkey_to_bset(b, where);
+	struct btree_node_iter *own = &iter->l[b->level].iter;
+	struct btree_iter *it;
+
+	if (node_iter != own) {
+		__bch2_btree_node_iter_fix(iter, b, node_iter, bset,
+					   where, clobber_u64s, new_u64s);
+		bch2_btree_node_iter_verify(node_iter, b);
+	}
+
+	trans_for_each_iter_with_node(iter->trans, b, it) {
+		struct btree_node_iter *it_node_iter = &it->l[b->level].iter;
+
+		__bch2_btree_node_iter_fix(it, b, it_node_iter, bset,
+					   where, clobber_u64s, new_u64s);
+		__bch2_btree_iter_verify(it, b);
+	}
+}
+
+/*
+ * Unpack packed key @k from node l->b into @u and return it.
+ *
+ * A NULL @k yields bkey_s_c_null, with u->type set to KEY_TYPE_deleted.
+ * bch2_bkey_debugcheck() is run on the result when debug_check_bkeys() is
+ * enabled.
+ */
+static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
+						  struct btree_iter_level *l,
+						  struct bkey *u,
+						  struct bkey_packed *k)
+{
+	if (likely(k)) {
+		struct bch_fs *c = iter->trans->c;
+		struct bkey_s_c unpacked = bkey_disassemble(l->b, k, u);
+
+		if (debug_check_bkeys(c))
+			bch2_bkey_debugcheck(c, l->b, unpacked);
+
+		return unpacked;
+	}
+
+	/*
+	 * signal to bch2_btree_iter_peek_slot() that we're currently at
+	 * a hole
+	 */
+	u->type = KEY_TYPE_deleted;
+	return bkey_s_c_null;
+}
+
+/*
+ * Unpack the key under @l's node iterator into @u.
+ * peek_all() doesn't skip deleted keys.
+ */
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
+						    struct btree_iter_level *l,
+						    struct bkey *u)
+{
+	struct bkey_packed *packed =
+		bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+	return __btree_iter_unpack(iter, l, u, packed);
+}
+
+/*
+ * Unpack the key returned by bch2_btree_node_iter_peek() for level @l into
+ * iter->k.
+ */
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
+						struct btree_iter_level *l)
+{
+	struct bkey_packed *packed =
+		bch2_btree_node_iter_peek(&l->iter, l->b);
+
+	return __btree_iter_unpack(iter, l, &iter->k, packed);
+}
+
+/*
+ * Unpack the key returned by bch2_btree_node_iter_prev() for level @l into
+ * iter->k.
+ */
+static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter,
+						struct btree_iter_level *l)
+{
+	struct bkey_packed *packed =
+		bch2_btree_node_iter_prev(&l->iter, l->b);
+
+	return __btree_iter_unpack(iter, l, &iter->k, packed);
+}
+
+/*
+ * Advance @l's node iterator past every key that btree_iter_pos_cmp() orders
+ * before the iterator position.
+ *
+ * @max_advance: if positive, give up and return false once this many keys
+ *		 have been skipped and another would still be needed;
+ *		 otherwise there is no limit.
+ *
+ * Returns true when the node iterator has reached the position (or the end).
+ */
+static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
+					     struct btree_iter_level *l,
+					     int max_advance)
+{
+	int skipped = 0;
+
+	for (;;) {
+		struct bkey_packed *cur =
+			bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+		if (!cur || btree_iter_pos_cmp(iter, l->b, cur) >= 0)
+			return true;
+
+		if (max_advance > 0 && skipped >= max_advance)
+			return false;
+
+		bch2_btree_node_iter_advance(&l->iter, l->b);
+		skipped++;
+	}
+}
+
+/*
+ * Verify that iterator for parent node points to child node:
+ *
+ * Only active with CONFIG_BCACHEFS_DEBUG. Does nothing if @iter has no node
+ * at the parent level of @b or that node cannot be relocked. Panics if the
+ * parent's node iterator is at the end, at a deleted key, or at a key that
+ * does not compare equal to @b's key position. A parent lock taken here is
+ * dropped again before returning.
+ */
+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+{
+	struct btree_iter_level *l;
+	unsigned plevel;
+	bool parent_locked;
+	struct bkey_packed *k;
+
+	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+		return;
+
+	plevel = b->level + 1;
+	if (!btree_iter_node(iter, plevel))
+		return;
+
+	/* Remember whether we held the parent lock, to restore that state */
+	parent_locked = btree_node_locked(iter, plevel);
+
+	if (!bch2_btree_node_relock(iter, plevel))
+		return;
+
+	l = &iter->l[plevel];
+	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+	if (!k ||
+	    bkey_deleted(k) ||
+	    bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
+		char buf[100] = "(no key)";
+
+		/*
+		 * k may be NULL here, and when it isn't it is a key of the
+		 * parent node l->b, so that is the node to unpack it with:
+		 */
+		if (k) {
+			struct bkey uk = bkey_unpack_key(l->b, k);
+
+			bch2_bkey_to_text(&PBUF(buf), &uk);
+		}
+
+		panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
+		      buf, b->key.k.p.inode, b->key.k.p.offset);
+	}
+
+	if (!parent_locked)
+		btree_node_unlock(iter, plevel);
+}
+
+/* True if iter->pos sorts before @b's min_key. */
+static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+					      struct btree *b)
+{
+	int cmp = bkey_cmp(iter->pos, b->data->min_key);
+
+	return cmp < 0;
+}
+
+/*
+ * True if iter->pos lies after the range covered by @b.
+ *
+ * For extents iterators a node whose key position equals iter->pos also
+ * counts as being before the position, unless that position is POS_MAX.
+ */
+static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+					     struct btree *b)
+{
+	int cmp = bkey_cmp(b->key.k.p, iter->pos);
+
+	if (cmp)
+		return cmp < 0;
+
+	/* Node ends exactly at iter->pos */
+	return (iter->flags & BTREE_ITER_IS_EXTENTS) &&
+		bkey_cmp(b->key.k.p, POS_MAX) != 0;
+}
+
+/*
+ * True if @b belongs to @iter's btree and iter->pos is neither before nor
+ * after it.
+ */
+static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+					  struct btree *b)
+{
+	if (iter->btree_id != b->btree_id)
+		return false;
+
+	if (btree_iter_pos_before_node(iter, b))
+		return false;
+
+	return !btree_iter_pos_after_node(iter, b);
+}
+
+/*
+ * (Re)initialize @iter's node iterator at @level from iter->pos and mark the
+ * iterator BTREE_ITER_NEED_PEEK.
+ */
+static inline void __btree_iter_init(struct btree_iter *iter,
+				     unsigned level)
+{
+	struct btree_iter_level *lvl = iter->l + level;
+	struct btree_node_iter *node_iter = &lvl->iter;
+
+	bch2_btree_node_iter_init(node_iter, lvl->b, &iter->pos);
+
+	/* Extents iterators additionally advance to the position, no limit */
+	if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+		btree_iter_advance_to_pos(iter, lvl, -1);
+	}
+
+	/* Skip to first non whiteout: */
+	if (level != 0) {
+		bch2_btree_node_iter_peek(node_iter, lvl->b);
+	}
+
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+}
+
+/*
+ * Install @b as @iter's node at b->level: record the node pointer and its
+ * current lock sequence number, then initialize the node iterator for that
+ * level.
+ */
+static inline void btree_iter_node_set(struct btree_iter *iter,
+				       struct btree *b)
+{
+	struct btree_iter_level *lvl;
+
+	btree_iter_verify_new_node(iter, b);
+
+	EBUG_ON(!btree_iter_pos_in_node(iter, b));
+	EBUG_ON(b->lock.state.seq & 1);
+
+	lvl = &iter->l[b->level];
+	lvl->b = b;
+	lvl->lock_seq = b->lock.state.seq;
+
+	__btree_iter_init(iter, b->level);
+}
+
+/*
+ * A btree node is being replaced - update every iterator in @iter's
+ * transaction whose position falls inside the new node @b to point to it,
+ * taking whatever lock the iterator wants at that level.
+ */
+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+{
+	struct btree_iter *it;
+
+	trans_for_each_iter(iter->trans, it) {
+		enum btree_node_locked_type want;
+
+		if (!btree_iter_pos_in_node(it, b))
+			continue;
+
+		/*
+		 * bch2_btree_iter_node_drop() has already been called -
+		 * the old node we're replacing has already been
+		 * unlocked and the pointer invalidated
+		 */
+		BUG_ON(btree_node_locked(it, b->level));
+
+		want = btree_lock_want(it, b->level);
+		if (want != BTREE_NODE_UNLOCKED) {
+			six_lock_increment(&b->lock, want);
+			mark_btree_node_locked(it, b->level, want);
+		}
+
+		btree_iter_node_set(it, b);
+	}
+}
+
+/*
+ * Detach @b from every iterator in @iter's transaction that points at it:
+ * unlock that level and replace the node pointer with
+ * BTREE_ITER_NO_NODE_DROP.
+ */
+void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
+{
+	struct btree_iter *it;
+	unsigned level = b->level;
+
+	trans_for_each_iter(iter->trans, it) {
+		struct btree_iter_level *lvl = &it->l[level];
+
+		if (lvl->b != b)
+			continue;
+
+		__btree_node_unlock(it, level);
+		lvl->b = BTREE_ITER_NO_NODE_DROP;
+	}
+}
+
+/*
+ * A btree node has been modified in such a way as to invalidate iterators -
+ * reinitialize the node iterator at @b's level for each iterator that
+ * trans_for_each_iter_with_node() yields for @b.
+ */
+void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+{
+	struct btree_iter *it;
+	unsigned level = b->level;
+
+	trans_for_each_iter_with_node(iter->trans, b, it) {
+		__btree_iter_init(it, level);
+	}
+}
+
+/*
+ * Lock the root node of @iter's btree and make it the iterator's current
+ * node.
+ *
+ * Returns 0 on success, 1 if the root is at a lower level than @depth_want
+ * (iter->level is then set to @depth_want and the node pointers from there
+ * upwards are cleared), or -EINTR if taking the lock failed.
+ */
+static inline int btree_iter_lock_root(struct btree_iter *iter,
+				       unsigned depth_want)
+{
+	struct bch_fs *c = iter->trans->c;
+	struct btree *b;
+	enum six_lock_type lock_type;
+	unsigned i;
+
+	EBUG_ON(iter->nodes_locked);
+
+	while (1) {
+		/* Unlocked snapshot of the root; rechecked below once locked */
+		b = READ_ONCE(c->btree_roots[iter->btree_id].b);
+		iter->level = READ_ONCE(b->level);
+
+		if (unlikely(iter->level < depth_want)) {
+			/*
+			 * the root is at a lower depth than the depth we want:
+			 * got to the end of the btree, or we're walking nodes
+			 * greater than some depth and there are no nodes >=
+			 * that depth
+			 */
+			iter->level = depth_want;
+			for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
+				iter->l[i].b = NULL;
+			return 1;
+		}
+
+		lock_type = __btree_lock_want(iter, iter->level);
+		if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
+					      iter, lock_type)))
+			return -EINTR;
+
+		/*
+		 * Only proceed if @b is still the root and still at the level
+		 * we sampled; otherwise unlock and retry:
+		 */
+		if (likely(b == c->btree_roots[iter->btree_id].b &&
+			   b->level == iter->level &&
+			   !race_fault())) {
+			/* Levels below the root get a placeholder, above it NULL */
+			for (i = 0; i < iter->level; i++)
+				iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+			iter->l[iter->level].b = b;
+			for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
+				iter->l[i].b = NULL;
+
+			mark_btree_node_locked(iter, iter->level, lock_type);
+			btree_iter_node_set(iter, b);
+			return 0;
+		}
+
+		six_unlock_type(&b->lock, lock_type);
+	}
+}
+
+/*
+ * Prefetch the child nodes that follow the one @iter currently points at in
+ * the node at iter->level.
+ *
+ * At most nr children are prefetched: before BCH_FS_STARTED is set that is 16
+ * (1 when iter->level > 1), afterwards 2 (0 when iter->level > 1). Stops
+ * early at the end of the node or if the node cannot be relocked. The node's
+ * lock state on entry is restored unless relocking failed.
+ */
+noinline
+static void btree_iter_prefetch(struct btree_iter *iter)
+{
+	struct bch_fs *c = iter->trans->c;
+	struct btree_iter_level *l = &iter->l[iter->level];
+	/* Walk a copy so the iterator's own position is left untouched */
+	struct btree_node_iter node_iter = l->iter;
+	struct bkey_packed *k;
+	BKEY_PADDED(k) tmp;
+	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+		? (iter->level > 1 ? 0 :  2)
+		: (iter->level > 1 ? 1 : 16);
+	bool was_locked = btree_node_locked(iter, iter->level);
+
+	/* Count down nr so the prefetch budget actually bounds the loop */
+	for (; nr; nr--) {
+		if (!bch2_btree_node_relock(iter, iter->level))
+			return;
+
+		bch2_btree_node_iter_advance(&node_iter, l->b);
+		k = bch2_btree_node_iter_peek(&node_iter, l->b);
+		if (!k)
+			break;
+
+		bch2_bkey_unpack(l->b, &tmp.k, k);
+		bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
+	}
+
+	if (!was_locked)
+		btree_node_unlock(iter, iter->level);
+}
+
+/*
+ * Descend one level: fetch and lock the child node referenced by the key
+ * under the node iterator at iter->level, install it in the iterator and
+ * decrement iter->level.
+ *
+ * Returns 0 on success, or the error encoded in the pointer returned by
+ * bch2_btree_node_get().
+ */
+static __always_inline int btree_iter_down(struct btree_iter *iter)
+{
+	struct bch_fs *c = iter->trans->c;
+	struct btree_iter_level *l = &iter->l[iter->level];
+	struct btree *b;
+	unsigned level = iter->level - 1;
+	enum six_lock_type lock_type = __btree_lock_want(iter, level);
+	BKEY_PADDED(k) tmp;
+
+	EBUG_ON(!btree_node_locked(iter, iter->level));
+
+	/* NOTE(review): assumes the node iterator is not at the end (peek != NULL) */
+	bch2_bkey_unpack(l->b, &tmp.k,
+			 bch2_btree_node_iter_peek(&l->iter, l->b));
+
+	b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
+	if (unlikely(IS_ERR(b)))
+		return PTR_ERR(b);
+
+	mark_btree_node_locked(iter, level, lock_type);
+	btree_iter_node_set(iter, b);
+
+	/* iter->level still names the parent here, which is what prefetch walks */
+	if (iter->flags & BTREE_ITER_PREFETCH)
+		btree_iter_prefetch(iter);
+
+	iter->level = level;
+
+	return 0;
+}
+
+/* Move @iter up one level, unlocking the level it is leaving. */
+static void btree_iter_up(struct btree_iter *iter)
+{
+	unsigned leaving = iter->level++;
+
+	btree_node_unlock(iter, leaving);
+}
+
+static int btree_iter_traverse_one(struct btree_iter *);
+
+/*
+ * Slow path for a failed traverse: drop every lock held by @trans and
+ * re-traverse all of its iterators in btree_iter_cmp() order.
+ *
+ * @orig_iter: iterator whose traverse failed, or NULL; on -EIO it is flagged
+ *	       BTREE_ITER_ERROR and its current node set to
+ *	       BTREE_ITER_NO_NODE_ERROR.
+ * @ret:       error that got us here (0, -EINTR, -ENOMEM or -EIO).
+ *
+ * Returns -EIO for an I/O error; otherwise -EINTR if more than one bit is set
+ * in trans->iters_live, else 0.
+ */
+static int __btree_iter_traverse_all(struct btree_trans *trans,
+				   struct btree_iter *orig_iter, int ret)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter;
+	u8 sorted[BTREE_ITER_MAX];
+	unsigned i, nr_sorted = 0;
+
+	/* Collect the index of every iterator, then sort the indices */
+	trans_for_each_iter(trans, iter)
+		sorted[nr_sorted++] = iter - trans->iters;
+
+#define btree_iter_cmp_by_idx(_l, _r)				\
+		btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
+
+	bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
+#undef btree_iter_cmp_by_idx
+
+retry_all:
+	bch2_trans_unlock(trans);
+
+	if (unlikely(ret == -ENOMEM)) {
+		struct closure cl;
+
+		closure_init_stack(&cl);
+
+		/* Keep trying until the cannibalize lock is obtained (ret == 0) */
+		do {
+			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+			closure_sync(&cl);
+		} while (ret);
+	}
+
+	if (unlikely(ret == -EIO)) {
+		trans->error = true;
+		if (orig_iter) {
+			orig_iter->flags |= BTREE_ITER_ERROR;
+			orig_iter->l[orig_iter->level].b =
+				BTREE_ITER_NO_NODE_ERROR;
+		}
+		goto out;
+	}
+
+	BUG_ON(ret && ret != -EINTR);
+
+	/* Now, redo traversals in correct order: */
+	for (i = 0; i < nr_sorted; i++) {
+		iter = &trans->iters[sorted[i]];
+
+		do {
+			ret = btree_iter_traverse_one(iter);
+		} while (ret == -EINTR);
+
+		/* Any other error: unlock everything and start over */
+		if (ret)
+			goto retry_all;
+	}
+
+	ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
+out:
+	bch2_btree_cache_cannibalize_unlock(c);
+	return ret;
+}
+
+/*
+ * Re-traverse every iterator in @trans, with no originating iterator and no
+ * prior error. Returns what __btree_iter_traverse_all() returns.
+ */
+int bch2_btree_iter_traverse_all(struct btree_trans *trans)
+{
+	struct btree_iter *no_orig_iter = NULL;
+
+	return __btree_iter_traverse_all(trans, no_orig_iter, 0);
+}
+
+/*
+ * True if @iter has a real btree node at level @l that can be relocked and
+ * that passes the requested position check.
+ *
+ * @check_pos: <= 0 rejects the node if iter->pos is before it, >= 0 rejects
+ *	       it if iter->pos is after it; 0 therefore checks both sides.
+ */
+static inline bool btree_iter_good_node(struct btree_iter *iter,
+					unsigned l, int check_pos)
+{
+	struct btree *node;
+
+	if (!is_btree_node(iter, l))
+		return false;
+
+	if (!bch2_btree_node_relock(iter, l))
+		return false;
+
+	node = iter->l[l].b;
+
+	return !(check_pos <= 0 && btree_iter_pos_before_node(iter, node)) &&
+	       !(check_pos >= 0 && btree_iter_pos_after_node(iter, node));
+}
+
+/*
+ * Starting at iter->level, walk up while the iterator has a node at the level
+ * that btree_iter_good_node() rejects; each rejected level is unlocked and
+ * its node pointer set to BTREE_ITER_NO_NODE_UP.
+ *
+ * Returns the level at which the walk stopped. iter->level is not changed.
+ */
+static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
+						     int check_pos)
+{
+	unsigned l;
+
+	for (l = iter->level;
+	     btree_iter_node(iter, l) &&
+	     !btree_iter_good_node(iter, l, check_pos);
+	     l++) {
+		btree_node_unlock(iter, l);
+		iter->l[l].b = BTREE_ITER_NO_NODE_UP;
+	}
+
+	return l;
+}
+
+/*
+ * This is the main state machine for walking down the btree - walks down to a
+ * specified depth
+ *
+ * Returns 0 on success, -EIO on error (error reading in a btree node).
+ *
+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is
+ * stashed in the iterator and returned from bch2_trans_exit().
+ *
+ * The target depth is the value of iter->level on entry. Any nonzero result
+ * of btree_iter_down()/btree_iter_lock_root() other than 1 is passed back to
+ * the caller as-is.
+ */
+static int btree_iter_traverse_one(struct btree_iter *iter)
+{
+	unsigned depth_want = iter->level;
+
+	if (unlikely(iter->level >= BTREE_MAX_DEPTH))
+		return 0;
+
+	/* Fast path: relocking was enough, nothing to traverse */
+	if (bch2_btree_iter_relock(iter, false))
+		return 0;
+
+	/*
+	 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
+	 * here unnecessary
+	 */
+	iter->level = btree_iter_up_until_good_node(iter, 0);
+
+	/*
+	 * If we've got a btree node locked (i.e. we aren't about to relock the
+	 * root) - advance its node iterator if necessary:
+	 *
+	 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
+	 */
+	if (btree_iter_node(iter, iter->level)) {
+		BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b));
+
+		btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
+	}
+
+	/*
+	 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
+	 * would indicate to other code that we got to the end of the btree,
+	 * here it indicates that relocking the root failed - it's critical that
+	 * btree_iter_lock_root() comes next and that it can't fail
+	 */
+	while (iter->level > depth_want) {
+		int ret = btree_iter_node(iter, iter->level)
+			? btree_iter_down(iter)
+			: btree_iter_lock_root(iter, depth_want);
+		if (unlikely(ret)) {
+			/* 1: the tree has no node at the depth we want */
+			if (ret == 1)
+				return 0;
+
+			/* Real failure: leave a marker at the target level */
+			iter->level = depth_want;
+			iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN;
+			return ret;
+		}
+	}
+
+	iter->uptodate = BTREE_ITER_NEED_PEEK;
+
+	bch2_btree_trans_verify_locks(iter->trans);
+	__bch2_btree_iter_verify(iter, iter->l[iter->level].b);
+	return 0;
+}
+
+/*
+ * Traverse @iter to its position: first give the scheduler a chance via
+ * bch2_trans_cond_resched(), then traverse the single iterator. If either
+ * step returns nonzero, fall back to re-traversing every iterator in the
+ * transaction and return that result instead.
+ */
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+	int ret = bch2_trans_cond_resched(iter->trans);
+
+	if (!ret)
+		ret = btree_iter_traverse_one(iter);
+
+	if (likely(!ret))
+		return 0;
+
+	return __btree_iter_traverse_all(iter->trans, iter, ret);
+}
+
+/*
+ * Sanity checks run at the top of the iterator entry points: @iter must be of
+ * the expected @type and internally consistent, and the transaction's locks
+ * are verified.
+ */
+static inline void bch2_btree_iter_checks(struct btree_iter *iter,
+					  enum btree_iter_type type)
+{
+	EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+	/*
+	 * BTREE_ITER_IS_EXTENTS must be set exactly when the btree is an
+	 * extents btree and we are not iterating over nodes:
+	 */
+	EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
+		(btree_node_type_is_extents(iter->btree_id) &&
+		 type != BTREE_ITER_NODES));
+	EBUG_ON(btree_iter_type(iter) != type);
+
+	bch2_btree_trans_verify_locks(iter->trans);
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+/*
+ * Return the btree node at @iter's current level, traversing first if the
+ * iterator is not up to date.
+ *
+ * On success iter->pos is moved to the node's key position and the iterator
+ * is marked BTREE_ITER_UPTODATE. Returns NULL if the traverse fails or there
+ * is no node at that level.
+ */
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+{
+	struct btree *node;
+
+	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
+
+	if (iter->uptodate == BTREE_ITER_UPTODATE)
+		return iter->l[iter->level].b;
+
+	if (bch2_btree_iter_traverse(iter))
+		return NULL;
+
+	node = btree_iter_node(iter, iter->level);
+	if (node) {
+		BUG_ON(bkey_cmp(node->key.k.p, iter->pos) < 0);
+
+		iter->pos = node->key.k.p;
+		iter->uptodate = BTREE_ITER_UPTODATE;
+	}
+
+	return node;
+}
+
+/*
+ * Advance a nodes iterator to the next btree node and return it.
+ *
+ * @depth: level to traverse back down to when the parent still has children
+ *	   after the current position.
+ *
+ * Returns NULL when the iterator was already at the end, when the end is
+ * reached, or when a traverse fails. On success iter->pos is set to the
+ * returned node's key position and the iterator is marked
+ * BTREE_ITER_UPTODATE.
+ */
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
+{
+	struct btree *b;
+	int ret;
+
+	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
+
+	/* already got to end? */
+	if (!btree_iter_node(iter, iter->level))
+		return NULL;
+
+	/* NOTE(review): the return value of the resched call is ignored here */
+	bch2_trans_cond_resched(iter->trans);
+
+	/* Step up to the parent level */
+	btree_iter_up(iter);
+
+	if (!bch2_btree_node_relock(iter, iter->level))
+		btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		return NULL;
+
+	/* got to end? */
+	b = btree_iter_node(iter, iter->level);
+	if (!b)
+		return NULL;
+
+	if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+		/*
+		 * Haven't gotten to the end of the parent node: go back down to
+		 * the next child node
+		 */
+
+		/*
+		 * We don't really want to be unlocking here except we can't
+		 * directly tell btree_iter_traverse() "traverse to this level"
+		 * except by setting iter->level, so we have to unlock so we
+		 * don't screw up our lock invariants:
+		 */
+		if (btree_node_read_locked(iter, iter->level))
+			btree_node_unlock(iter, iter->level);
+
+		/* ick: */
+		iter->pos	= iter->btree_id == BTREE_ID_INODES
+			? btree_type_successor(iter->btree_id, iter->pos)
+			: bkey_successor(iter->pos);
+		iter->level	= depth;
+
+		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			return NULL;
+
+		b = iter->l[iter->level].b;
+	}
+
+	iter->pos = b->key.k.p;
+	iter->uptodate = BTREE_ITER_UPTODATE;
+
+	return b;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+/*
+ * Move @iter forward to @new_pos within the leaf it already has locked.
+ *
+ * The debug assertions require a level-0 iterator, a locked leaf, and a
+ * @new_pos that is not before iter->pos and not past the leaf's key position.
+ * The iterator is marked BTREE_ITER_NEED_PEEK, or BTREE_ITER_NEED_TRAVERSE if
+ * the node iterator ran off the end and the position is now after the leaf.
+ */
+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
+{
+	struct btree_iter_level *leaf = &iter->l[0];
+
+	EBUG_ON(iter->level != 0);
+	EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
+	EBUG_ON(!btree_node_locked(iter, 0));
+	EBUG_ON(bkey_cmp(new_pos, leaf->b->key.k.p) > 0);
+
+	iter->pos = new_pos;
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+	btree_iter_advance_to_pos(iter, leaf, -1);
+
+	if (!bch2_btree_node_iter_end(&leaf->iter))
+		return;
+
+	if (btree_iter_pos_after_node(iter, leaf->b))
+		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+}
+
+/*
+ * iter->pos has changed: walk up to the first level whose node is still good
+ * for the new position and fix up that level's node iterator.
+ *
+ * @cmp: direction of the move; negative when the position moved backwards.
+ *
+ * Returns the level that was reached.
+ */
+static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp)
+{
+	unsigned l = btree_iter_up_until_good_node(iter, cmp);
+	bool reinit;
+
+	if (!btree_iter_node(iter, l))
+		return l;
+
+	/*
+	 * We might have to skip over many keys, or just a few: try
+	 * advancing the node iterator, and if we have to skip over too
+	 * many keys just reinit it (or if we're rewinding, since that
+	 * is expensive).
+	 */
+	reinit = cmp < 0;
+	if (!reinit)
+		reinit = !btree_iter_advance_to_pos(iter, &iter->l[l], 8);
+
+	if (reinit)
+		__btree_iter_init(iter, l);
+
+	/* Don't leave it locked if we're not supposed to: */
+	if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
+		btree_node_unlock(iter, l);
+
+	return l;
+}
+
+/*
+ * Move @iter to @new_pos. A no-op if the position is unchanged; otherwise the
+ * iterator is marked BTREE_ITER_NEED_PEEK if a usable node was found at the
+ * iterator's own level, and BTREE_ITER_NEED_TRAVERSE if not.
+ */
+void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+	int cmp = bkey_cmp(new_pos, iter->pos);
+	unsigned reached;
+
+	if (cmp == 0)
+		return;
+
+	iter->pos = new_pos;
+
+	reached = btree_iter_pos_changed(iter, cmp);
+
+	btree_iter_set_dirty(iter, reached == iter->level
+			     ? BTREE_ITER_NEED_PEEK
+			     : BTREE_ITER_NEED_TRAVERSE);
+}
+
+/*
+ * Move iter->pos just past the end of the current leaf and mark the iterator
+ * BTREE_ITER_NEED_TRAVERSE.
+ *
+ * Returns false, leaving iter->k as an initialized key at POS_MAX, if the
+ * leaf already ends at POS_MAX; true otherwise.
+ */
+static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
+{
+	struct btree *leaf = iter->l[0].b;
+
+	iter->uptodate	= BTREE_ITER_NEED_TRAVERSE;
+	iter->pos	= leaf->key.k.p;
+
+	if (bkey_cmp(iter->pos, POS_MAX)) {
+		iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+		btree_iter_pos_changed(iter, 1);
+		return true;
+	}
+
+	/* End of the btree: there is no next leaf */
+	bkey_init(&iter->k);
+	iter->k.p = POS_MAX;
+	return false;
+}
+
+/*
+ * Move iter->pos just before the start of the current leaf and mark the
+ * iterator BTREE_ITER_NEED_TRAVERSE.
+ *
+ * Returns false, leaving iter->k as an initialized key at POS_MIN, if the
+ * leaf already starts at POS_MIN; true otherwise.
+ */
+static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
+{
+	struct btree *leaf = iter->l[0].b;
+
+	iter->uptodate	= BTREE_ITER_NEED_TRAVERSE;
+	iter->pos	= leaf->data->min_key;
+
+	if (bkey_cmp(iter->pos, POS_MIN)) {
+		iter->pos = btree_type_predecessor(iter->btree_id, iter->pos);
+		btree_iter_pos_changed(iter, -1);
+		return true;
+	}
+
+	/* Start of the btree: there is no previous leaf */
+	bkey_init(&iter->k);
+	iter->k.p = POS_MIN;
+	return false;
+}
+
+/*
+ * Return the key cached in iter->k for an iterator that is already up to
+ * date.
+ *
+ * If the cached key is deleted the result has no value pointer; otherwise the
+ * value is looked up under the leaf's node iterator, and with the debug
+ * checks enabled the cached key is compared against the key in the node.
+ */
+static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
+{
+	struct btree_iter_level *leaf = &iter->l[0];
+	struct bkey_s_c ret = { .k = &iter->k };
+	struct bkey_packed *packed;
+	struct bch_fs *c;
+
+	if (bkey_deleted(&iter->k))
+		return ret;
+
+	packed = __bch2_btree_node_iter_peek_all(&leaf->iter, leaf->b);
+	ret.v = bkeyp_val(&leaf->b->format, packed);
+
+	c = iter->trans->c;
+
+	if (debug_check_iterators(c)) {
+		struct bkey unpacked = bkey_unpack_key(leaf->b, packed);
+
+		/*
+		 * this flag is internal to the btree code,
+		 * we don't care if it doesn't match - if it's now set
+		 * it just means the key has been written out to disk:
+		 */
+		unpacked.needs_whiteout = iter->k.needs_whiteout;
+		BUG_ON(memcmp(&unpacked, &iter->k, sizeof(unpacked)));
+	}
+
+	if (debug_check_bkeys(c))
+		bch2_bkey_debugcheck(c, leaf->b, ret);
+
+	return ret;
+}
+
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ *
+ * Returns an error key if a traverse fails, and bkey_s_c_null once the end of
+ * the btree has been reached.
+ */
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+	struct btree_iter_level *leaf = &iter->l[0];
+
+	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+	if (iter->uptodate == BTREE_ITER_UPTODATE)
+		return btree_iter_peek_uptodate(iter);
+
+	for (;;) {
+		struct bkey_s_c k;
+		struct bpos start;
+		int ret = bch2_btree_iter_traverse(iter);
+
+		if (unlikely(ret))
+			return bkey_s_c_err(ret);
+
+		k = __btree_iter_peek(iter, leaf);
+		if (unlikely(!k.k)) {
+			/* Nothing left in this leaf: try the next one */
+			if (!btree_iter_set_pos_to_next_leaf(iter))
+				return bkey_s_c_null;
+			continue;
+		}
+
+		/*
+		 * iter->pos should always be equal to the key we just
+		 * returned - except extents can straddle iter->pos:
+		 */
+		start = bkey_start_pos(k.k);
+		if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
+		    bkey_cmp(start, iter->pos) > 0)
+			iter->pos = start;
+
+		iter->uptodate = BTREE_ITER_UPTODATE;
+		return k;
+	}
+}
+
+/**
+ * bch2_btree_iter_next: returns first key greater than iterator's current
+ * position
+ *
+ * Returns bkey_s_c_null when there is no further key.
+ */
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+{
+	struct btree_iter_level *l = &iter->l[0];
+	struct bkey_packed *p;
+	struct bkey_s_c k;
+
+	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+	/* Slow path: reposition just past the last key returned and peek */
+	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+		if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+			return bkey_s_c_null;
+
+		/*
+		 * XXX: when we just need to relock we should be able to avoid
+		 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
+		 * for that to work
+		 */
+		iter->uptodate	= BTREE_ITER_NEED_TRAVERSE;
+
+		bch2_btree_iter_set_pos(iter,
+			btree_type_successor(iter->btree_id, iter->k.p));
+
+		return bch2_btree_iter_peek(iter);
+	}
+
+	if (unlikely(bkey_deleted(&iter->k))) {
+		/*
+		 * we're currently pointed at a hole, because previously we were
+		 * iterating over slots:
+		 */
+		return bch2_btree_iter_peek(iter);
+	}
+
+	/* Fast path: step the node iterator forward, skipping whiteouts */
+	do {
+		bch2_btree_node_iter_advance(&l->iter, l->b);
+		p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+	} while (likely(p) && bkey_whiteout(p));
+
+	/* Ran off the end of this leaf: continue in the next one, if any */
+	if (unlikely(!p))
+		return btree_iter_set_pos_to_next_leaf(iter)
+			? bch2_btree_iter_peek(iter)
+			: bkey_s_c_null;
+
+	k = __btree_iter_unpack(iter, l, &iter->k, p);
+
+	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
+	iter->pos = bkey_start_pos(k.k);
+	return k;
+}
+
+/**
+ * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * iterator's current position
+ *
+ * Returns an error key if a traverse fails, and bkey_s_c_null once the start
+ * of the btree has been reached.
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+{
+	struct btree_iter_level *leaf = &iter->l[0];
+
+	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+	if (iter->uptodate == BTREE_ITER_UPTODATE)
+		return btree_iter_peek_uptodate(iter);
+
+	for (;;) {
+		struct bkey_s_c k;
+		int ret = bch2_btree_iter_traverse(iter);
+
+		if (unlikely(ret))
+			return bkey_s_c_err(ret);
+
+		/* Take the key at the position, else fall back to the one before */
+		k = __btree_iter_peek(iter, leaf);
+		if (!k.k ||
+		    bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+			k = __btree_iter_prev(iter, leaf);
+
+		if (unlikely(!k.k)) {
+			/* Nothing before us in this leaf: try the previous one */
+			if (!btree_iter_set_pos_to_prev_leaf(iter))
+				return bkey_s_c_null;
+			continue;
+		}
+
+		EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
+		iter->pos	= bkey_start_pos(k.k);
+		iter->uptodate	= BTREE_ITER_UPTODATE;
+		return k;
+	}
+}
+
+/**
+ * bch2_btree_iter_prev: returns first key less than iterator's current
+ * position
+ *
+ * Returns bkey_s_c_null when there is no earlier key.
+ */
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+{
+	struct btree_iter_level *l = &iter->l[0];
+	struct bkey_s_c k;
+
+	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+	/* Slow path: step the position back by one and search backwards */
+	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+		/*
+		 * XXX: when we just need to relock we should be able to avoid
+		 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
+		 * for that to work
+		 */
+		iter->pos	= btree_type_predecessor(iter->btree_id,
+							 iter->pos);
+		iter->uptodate	= BTREE_ITER_NEED_TRAVERSE;
+
+		return bch2_btree_iter_peek_prev(iter);
+	}
+
+	k = __btree_iter_prev(iter, l);
+	if (unlikely(!k.k)) {
+		/*
+		 * Ran off the front of this leaf. The position is now just
+		 * before the leaf, so the search has to continue backwards
+		 * from there; a forward peek would return a key at or after
+		 * that position instead of the previous key.
+		 */
+		return btree_iter_set_pos_to_prev_leaf(iter)
+			? bch2_btree_iter_peek_prev(iter)
+			: bkey_s_c_null;
+	}
+
+	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0);
+	iter->pos	= bkey_start_pos(k.k);
+	return k;
+}
+
+/*
+ * peek_slot() for extents iterators: return the first non-whiteout extent
+ * that ends after iter->pos if it also starts at or before iter->pos.
+ * Otherwise synthesize in iter->k a key with no value that starts at
+ * iter->pos and describes the hole there.
+ *
+ * Returns an error key if a re-traverse fails, and bkey_s_c_null if there is
+ * a hole at KEY_INODE_MAX:KEY_OFFSET_MAX.
+ */
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
+{
+	struct btree_iter_level *l = &iter->l[0];
+	struct btree_node_iter node_iter;
+	struct bkey_s_c k;
+	struct bkey n;
+	int ret;
+
+recheck:
+	/* Skip every key that ends at or before iter->pos */
+	while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
+	       bkey_cmp(k.k->p, iter->pos) <= 0)
+		bch2_btree_node_iter_advance(&l->iter, l->b);
+
+	/*
+	 * iterator is now at the correct position for inserting at iter->pos,
+	 * but we need to keep iterating until we find the first non whiteout so
+	 * we know how big a hole we have, if any:
+	 */
+
+	/* Look ahead on a copy so l->iter keeps the insert position */
+	node_iter = l->iter;
+	if (k.k && bkey_whiteout(k.k))
+		k = __btree_iter_unpack(iter, l, &iter->k,
+			bch2_btree_node_iter_peek(&node_iter, l->b));
+
+	/*
+	 * If we got to the end of the node, check if we need to traverse to the
+	 * next node:
+	 */
+	if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
+		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+		ret = bch2_btree_iter_traverse(iter);
+		if (unlikely(ret))
+			return bkey_s_c_err(ret);
+
+		goto recheck;
+	}
+
+	if (k.k &&
+	    !bkey_whiteout(k.k) &&
+	    bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+		/*
+		 * if we skipped forward to find the first non whiteout and
+		 * there _wasn't_ actually a hole, we want the iterator to be
+		 * pointed at the key we found:
+		 */
+		l->iter = node_iter;
+
+		EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
+		EBUG_ON(bkey_deleted(k.k));
+		iter->uptodate = BTREE_ITER_UPTODATE;
+
+		__bch2_btree_iter_verify(iter, l->b);
+		return k;
+	}
+
+	/* hole */
+
+	/* holes can't span inode numbers: */
+	if (iter->pos.offset == KEY_OFFSET_MAX) {
+		if (iter->pos.inode == KEY_INODE_MAX)
+			return bkey_s_c_null;
+
+		iter->pos = bkey_successor(iter->pos);
+		goto recheck;
+	}
+
+	/* No following key in this node: bound the hole by the node's own key */
+	if (!k.k)
+		k.k = &l->b->key.k;
+
+	/*
+	 * Size the hole to reach the start of the next key if it is in the
+	 * same inode, else KEY_OFFSET_MAX, capped at KEY_SIZE_MAX:
+	 */
+	bkey_init(&n);
+	n.p = iter->pos;
+	bch2_key_resize(&n,
+			min_t(u64, KEY_SIZE_MAX,
+			      (k.k->p.inode == n.p.inode
+			       ? bkey_start_offset(k.k)
+			       : KEY_OFFSET_MAX) -
+			      n.p.offset));
+
+	EBUG_ON(!n.size);
+
+	iter->k	= n;
+	iter->uptodate = BTREE_ITER_UPTODATE;
+
+	__bch2_btree_iter_verify(iter, l->b);
+	return (struct bkey_s_c) { &iter->k, NULL };
+}
+
+/*
+ * Return the key at exactly iter->pos, or, if there is no live key there, a
+ * freshly initialized key at iter->pos with no value (a hole).
+ *
+ * Extents iterators are handed to __bch2_btree_iter_peek_slot_extents().
+ * Returns an error key if a re-traverse fails.
+ */
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+	struct btree_iter_level *l = &iter->l[0];
+	struct bkey_s_c k;
+	int ret;
+
+	if (iter->flags & BTREE_ITER_IS_EXTENTS)
+		return __bch2_btree_iter_peek_slot_extents(iter);
+
+recheck:
+	/* Skip deleted keys that sit exactly at iter->pos */
+	while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
+	       bkey_deleted(k.k) &&
+	       bkey_cmp(k.k->p, iter->pos) == 0)
+		bch2_btree_node_iter_advance(&l->iter, l->b);
+
+	/*
+	 * If we got to the end of the node, check if we need to traverse to the
+	 * next node:
+	 */
+	if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
+		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+		ret = bch2_btree_iter_traverse(iter);
+		if (unlikely(ret))
+			return bkey_s_c_err(ret);
+
+		goto recheck;
+	}
+
+	/* Anything other than a live key exactly at iter->pos is a hole */
+	if (!k.k ||
+	    bkey_deleted(k.k) ||
+	    bkey_cmp(iter->pos, k.k->p)) {
+		/* hole */
+		bkey_init(&iter->k);
+		iter->k.p = iter->pos;
+		k = (struct bkey_s_c) { &iter->k, NULL };
+	}
+
+	iter->uptodate = BTREE_ITER_UPTODATE;
+	__bch2_btree_iter_verify(iter, l->b);
+	return k;
+}
+
+/*
+ * Public slot peek: returns the key (or an empty placeholder key) at the
+ * iterator's current position.  If the iterator is already up to date the
+ * cached result is returned without touching the btree; otherwise the
+ * iterator is traversed first and any traverse error comes back as an error
+ * key.
+ */
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+	int err;
+
+	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+	/* fast path: nothing changed since the last peek */
+	if (iter->uptodate == BTREE_ITER_UPTODATE)
+		return btree_iter_peek_uptodate(iter);
+
+	err = bch2_btree_iter_traverse(iter);
+
+	return unlikely(err)
+		? bkey_s_c_err(err)
+		: __bch2_btree_iter_peek_slot(iter);
+}
+
+/*
+ * Advance the iterator to the slot following the key last returned
+ * (iter->k.p, stepped by btree_type_successor()) and peek it.
+ *
+ * If the iterator is no longer up to date it is marked as needing a full
+ * traverse and the regular peek_slot path is used; otherwise the leaf node
+ * iterator is advanced in place.
+ */
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+{
+	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+	iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
+
+	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+		/*
+		 * XXX: when we just need to relock we should be able to avoid
+		 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
+		 * for that to work
+		 */
+		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+
+		return bch2_btree_iter_peek_slot(iter);
+	}
+
+	/*
+	 * Only step the node iterator when the previous result was not a
+	 * deleted key (presumably a synthesized hole, which the node iterator
+	 * was never pointing at - TODO confirm):
+	 */
+	if (!bkey_deleted(&iter->k))
+		bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);
+
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+	return __bch2_btree_iter_peek_slot(iter);
+}
+
+/*
+ * Initialise @iter as a fresh, unlocked, leaf-level (level 0) iterator on
+ * @btree_id positioned at @pos, belonging to @trans.
+ *
+ * BTREE_ITER_IS_EXTENTS is added to @flags automatically for extent btrees
+ * unless a node iterator (BTREE_ITER_NODES) was requested.  The iterator
+ * starts out needing a traverse, and wants one intent lock level if
+ * BTREE_ITER_INTENT is set.
+ */
+static inline void bch2_btree_iter_init(struct btree_trans *trans,
+			struct btree_iter *iter, enum btree_id btree_id,
+			struct bpos pos, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	unsigned i;
+
+	if (btree_node_type_is_extents(btree_id) &&
+	    !(flags & BTREE_ITER_NODES))
+		flags |= BTREE_ITER_IS_EXTENTS;
+
+	iter->trans			= trans;
+	iter->pos			= pos;
+	bkey_init(&iter->k);
+	iter->k.p			= pos;
+	iter->flags			= flags;
+	iter->uptodate			= BTREE_ITER_NEED_TRAVERSE;
+	iter->btree_id			= btree_id;
+	iter->level			= 0;
+	iter->locks_want		= flags & BTREE_ITER_INTENT ? 1 : 0;
+	iter->nodes_locked		= 0;
+	iter->nodes_intent_locked	= 0;
+	/* no nodes yet; the iterator's own level gets the "not initialised" marker */
+	for (i = 0; i < ARRAY_SIZE(iter->l); i++)
+		iter->l[i].b		= NULL;
+	iter->l[iter->level].b		= BTREE_ITER_NO_NODE_INIT;
+
+	/* warm the cache for the root we are about to descend from */
+	prefetch(c->btree_roots[btree_id].b);
+}
+
+/* new transactional stuff: */
+
+/*
+ * Release iterator slot @idx of @trans: drop the iterator's locks and clear
+ * its bit from the linked, live and touched bitmaps.
+ */
+static inline void __bch2_trans_iter_free(struct btree_trans *trans,
+					  unsigned idx)
+{
+	u64 keep = ~(1ULL << idx);
+
+	__bch2_btree_iter_unlock(&trans->iters[idx]);
+
+	trans->iters_linked	&= keep;
+	trans->iters_live	&= keep;
+	trans->iters_touched	&= keep;
+}
+
+/*
+ * Drop the caller's reference to @iter: the iterator stops being "live".
+ * It is freed outright only if it was never marked touched and is not flagged
+ * BTREE_ITER_KEEP_UNTIL_COMMIT; otherwise the slot stays linked.
+ *
+ * Returns the iterator's error state (-EIO if BTREE_ITER_ERROR is set, else
+ * 0), sampled before the iterator is released.
+ */
+int bch2_trans_iter_put(struct btree_trans *trans,
+			struct btree_iter *iter)
+{
+	int ret = btree_iter_err(iter);
+
+	if (!(trans->iters_touched & (1ULL << iter->idx)) &&
+	    !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
+		__bch2_trans_iter_free(trans, iter->idx);
+
+	trans->iters_live	&= ~(1ULL << iter->idx);
+	return ret;
+}
+
+/*
+ * Like bch2_trans_iter_put(), but first clears the iterator's "touched" bit
+ * so that the put is allowed to free the slot (unless the iterator is flagged
+ * BTREE_ITER_KEEP_UNTIL_COMMIT).  Returns what bch2_trans_iter_put() returns.
+ */
+int bch2_trans_iter_free(struct btree_trans *trans,
+			 struct btree_iter *iter)
+{
+	u64 bit = 1ULL << iter->idx;
+
+	trans->iters_touched &= ~bit;
+
+	return bch2_trans_iter_put(trans, iter);
+}
+
+/*
+ * Grow the transaction's iterator and update arrays so they can hold at least
+ * @new_size iterators (rounded up to a power of two, at most BTREE_ITER_MAX).
+ *
+ * The iters, updates and updates_sorted arrays share a single allocation and
+ * are laid out back to back.  If kmalloc() fails we fall back to the
+ * filesystem's btree_iters_pool mempool, whose elements are sized for
+ * BTREE_ITER_MAX iterators (see bch2_fs_btree_iter_init()).
+ *
+ * The transaction is unlocked (bch2_trans_unlock()) before allocating.
+ *
+ * Returns 0 if nothing needed to change or the arrays were grown while no
+ * iterator was live; -EINTR if the arrays were reallocated while iterators
+ * were live.
+ */
+static int bch2_trans_realloc_iters(struct btree_trans *trans,
+				    unsigned new_size)
+{
+	void *new_iters, *new_updates, *new_sorted;
+	size_t iters_bytes;
+	size_t updates_bytes;
+	size_t sorted_bytes;
+
+	new_size = roundup_pow_of_two(new_size);
+
+	BUG_ON(new_size > BTREE_ITER_MAX);
+
+	if (new_size <= trans->size)
+		return 0;
+
+	/* growing again after falling back to the mempool is not supported */
+	BUG_ON(trans->used_mempool);
+
+	bch2_trans_unlock(trans);
+
+	iters_bytes	= sizeof(struct btree_iter) * new_size;
+	updates_bytes	= sizeof(struct btree_insert_entry) * (new_size + 4);
+	sorted_bytes	= sizeof(u8) * (new_size + 4);
+
+	new_iters = kmalloc(iters_bytes +
+			    updates_bytes +
+			    sorted_bytes, GFP_NOFS);
+	if (new_iters)
+		goto success;
+
+	new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+	new_size = BTREE_ITER_MAX;
+
+	/*
+	 * trans->size becomes BTREE_ITER_MAX below, so the sub-arrays have to
+	 * be laid out for that size as well: with the offsets computed for the
+	 * originally requested size, iterators past that size would overlap
+	 * the updates array.
+	 */
+	iters_bytes	= sizeof(struct btree_iter) * new_size;
+	updates_bytes	= sizeof(struct btree_insert_entry) * (new_size + 4);
+
+	trans->used_mempool = true;
+success:
+	new_updates	= new_iters + iters_bytes;
+	new_sorted	= new_updates + updates_bytes;
+
+	/* carry over the iterators and pending updates that are in use */
+	memcpy(new_iters, trans->iters,
+	       sizeof(struct btree_iter) * trans->nr_iters);
+	memcpy(new_updates, trans->updates,
+	       sizeof(struct btree_insert_entry) * trans->nr_updates);
+
+	/* debug builds poison the old storage to catch stale pointers */
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+		memset(trans->iters, POISON_FREE,
+		       sizeof(struct btree_iter) * trans->nr_iters +
+		       sizeof(struct btree_insert_entry) * trans->nr_iters);
+
+	/* the initial arrays are embedded in the transaction, not kmalloc'd */
+	if (trans->iters != trans->iters_onstack)
+		kfree(trans->iters);
+
+	trans->iters		= new_iters;
+	trans->updates		= new_updates;
+	trans->updates_sorted	= new_sorted;
+	trans->size		= new_size;
+
+	/* callers holding iterator pointers into the old array must restart */
+	if (trans->iters_live) {
+		trace_trans_restart_iters_realloced(trans->ip, trans->size);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate an iterator slot in @trans and mark it linked.
+ *
+ * A previously used, currently unlinked slot is reused if there is one;
+ * otherwise a new slot is appended, growing the arrays via
+ * bch2_trans_realloc_iters() when they are full.  Panics (after dumping all
+ * linked iterators) if BTREE_ITER_MAX slots are already in use.
+ *
+ * Returns the iterator, or an ERR_PTR() if growing the arrays failed or
+ * requires a transaction restart (-EINTR).
+ */
+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
+{
+	u64 free_slots = ~trans->iters_linked;
+	unsigned idx;
+
+	/*
+	 * __ffs64() is undefined for an argument of 0, which is what we'd
+	 * pass when every bit of iters_linked is set - only look for a
+	 * reusable slot when at least one bit is clear:
+	 */
+	if (free_slots) {
+		idx = __ffs64(free_slots);
+		if (idx < trans->nr_iters)
+			goto got_slot;
+	}
+
+	if (trans->nr_iters == trans->size) {
+		int ret;
+
+		if (trans->nr_iters >= BTREE_ITER_MAX) {
+			struct btree_iter *iter;
+
+			/* out of slots: dump what is holding them, then die */
+			trans_for_each_iter(trans, iter) {
+				pr_err("iter: btree %s pos %llu:%llu%s%s%s",
+				       bch2_btree_ids[iter->btree_id],
+				       iter->pos.inode,
+				       iter->pos.offset,
+				       (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
+				       (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+				       iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "");
+			}
+
+			panic("trans iter oveflow\n");
+		}
+
+		ret = bch2_trans_realloc_iters(trans, trans->size * 2);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	idx = trans->nr_iters++;
+	BUG_ON(trans->nr_iters > trans->size);
+
+	/* a slot's idx is assigned once, when the slot is first handed out */
+	trans->iters[idx].idx = idx;
+got_slot:
+	BUG_ON(trans->iters_linked & (1ULL << idx));
+	trans->iters_linked |= 1ULL << idx;
+	return &trans->iters[idx];
+}
+
+/*
+ * Make @dst a copy of @src, preserving @dst's own slot index.
+ *
+ * For every level the copy records as locked, the node's six lock is
+ * incremented (six_lock_increment()) with the lock type the iterator wants
+ * at that level, so the copied lock state is backed by a lock count of its
+ * own.
+ */
+static inline void btree_iter_copy(struct btree_iter *dst,
+				   struct btree_iter *src)
+{
+	unsigned i, idx = dst->idx;
+
+	*dst = *src;
+	dst->idx = idx;
+
+	for (i = 0; i < BTREE_MAX_DEPTH; i++)
+		if (btree_node_locked(dst, i))
+			six_lock_increment(&dst->l[i].b->lock,
+					   __btree_lock_want(dst, i));
+}
+
+/*
+ * Component-wise difference between two positions, computed as
+ * (larger - smaller) after ordering the arguments with bkey_cmp().
+ * Used below as a rough "distance" for picking the closest iterator.
+ *
+ * NOTE(review): the offset fields are subtracted independently of the inode
+ * fields, so the offset difference can wrap when the inodes differ - looks
+ * acceptable for a heuristic, confirm.
+ */
+static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
+{
+	if (bkey_cmp(l, r) > 0)
+		swap(l, r);
+
+	return POS(r.inode - l.inode, r.offset - l.offset);
+}
+
+/*
+ * Get an iterator of the requested type on @btree_id, preferring to recycle
+ * one the transaction already has:
+ *
+ *  - among linked iterators of the same type and btree, pick the one whose
+ *    position is closest to @pos;
+ *  - if there is none, allocate and initialise a new iterator;
+ *  - if the best candidate is live or flagged BTREE_ITER_KEEP_UNTIL_COMMIT,
+ *    allocate a new slot and copy the candidate into it;
+ *  - otherwise reuse the candidate itself.
+ *
+ * The SLOTS/INTENT/PREFETCH flags are then taken from @flags, locks are
+ * upgraded or downgraded to match, and the iterator is marked live and
+ * touched.  The iterator's position is NOT changed here for recycled
+ * iterators.
+ *
+ * Returns the iterator, or an ERR_PTR() from btree_trans_iter_alloc().
+ */
+static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
+						 unsigned btree_id, struct bpos pos,
+						 unsigned flags)
+{
+	struct btree_iter *iter, *best = NULL;
+
+	BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
+
+	trans_for_each_iter(trans, iter) {
+		if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
+			continue;
+
+		if (iter->btree_id != btree_id)
+			continue;
+
+		/* keep the current best only if it is strictly closer to @pos */
+		if (best &&
+		    bkey_cmp(bpos_diff(best->pos, pos),
+			     bpos_diff(iter->pos, pos)) < 0)
+			continue;
+
+		best = iter;
+	}
+
+	if (!best) {
+		iter = btree_trans_iter_alloc(trans);
+		if (IS_ERR(iter))
+			return iter;
+
+		bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
+	} else if ((trans->iters_live & (1ULL << best->idx)) ||
+		   (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
+		/* candidate is still in use: work on a copy of it instead */
+		iter = btree_trans_iter_alloc(trans);
+		if (IS_ERR(iter))
+			return iter;
+
+		btree_iter_copy(iter, best);
+	} else {
+		iter = best;
+	}
+
+	/* per-use flags come from the caller, not from the recycled iterator */
+	iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+	iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+	iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+
+	if (iter->flags & BTREE_ITER_INTENT)
+		bch2_btree_iter_upgrade(iter, 1);
+	else
+		bch2_btree_iter_downgrade(iter);
+
+	BUG_ON(iter->btree_id != btree_id);
+	BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
+	BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+	BUG_ON(trans->iters_live & (1ULL << iter->idx));
+
+	trans->iters_live	|= 1ULL << iter->idx;
+	trans->iters_touched	|= 1ULL << iter->idx;
+
+	return iter;
+}
+
+/*
+ * Obtain a key iterator on @btree_id from @trans and position it at @pos.
+ * Returns the iterator, or the ERR_PTR() produced while allocating it.
+ */
+struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans,
+				       enum btree_id btree_id,
+				       struct bpos pos, unsigned flags)
+{
+	struct btree_iter *iter;
+
+	iter = __btree_trans_get_iter(trans, btree_id, pos, flags);
+	if (IS_ERR(iter))
+		return iter;
+
+	/* a recycled iterator may be parked at some other position */
+	bch2_btree_iter_set_pos(iter, pos);
+	return iter;
+}
+
+/*
+ * Obtain a node iterator (BTREE_ITER_NODES) on @btree_id at @pos that
+ * operates at btree level @depth and wants @locks_want intent-locked levels.
+ *
+ * Unlike bch2_trans_get_iter() this cannot fail gracefully: an allocation
+ * error, or getting back an iterator that is not positioned at @pos, is a
+ * BUG().  The iterator's node pointers are reset so it will be traversed
+ * from scratch.
+ */
+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
+					    enum btree_id btree_id,
+					    struct bpos pos,
+					    unsigned locks_want,
+					    unsigned depth,
+					    unsigned flags)
+{
+	struct btree_iter *iter =
+		__btree_trans_get_iter(trans, btree_id, pos,
+				       flags|BTREE_ITER_NODES);
+	unsigned i;
+
+	BUG_ON(IS_ERR(iter));
+	BUG_ON(bkey_cmp(iter->pos, pos));
+
+	iter->locks_want = locks_want;
+	iter->level	= depth;
+
+	for (i = 0; i < ARRAY_SIZE(iter->l); i++)
+		iter->l[i].b		= NULL;
+	iter->l[iter->level].b		= BTREE_ITER_NO_NODE_INIT;
+
+	return iter;
+}
+
+/*
+ * Allocate a new iterator in @trans and make it a copy of @src (see
+ * btree_iter_copy()).  The copy is marked live but not touched, and never
+ * inherits BTREE_ITER_KEEP_UNTIL_COMMIT.
+ *
+ * Returns the new iterator, or an ERR_PTR() if a slot could not be allocated.
+ */
+struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
+					struct btree_iter *src)
+{
+	struct btree_iter *iter;
+
+	iter = btree_trans_iter_alloc(trans);
+	if (IS_ERR(iter))
+		return iter;
+
+	btree_iter_copy(iter, src);
+
+	trans->iters_live |= 1ULL << iter->idx;
+	/*
+	 * Don't mark it as touched, we don't need to preserve this iter since
+	 * it's cheap to copy it again:
+	 */
+	trans->iters_touched &= ~(1ULL << iter->idx);
+	iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+	return iter;
+}
+
+/*
+ * Make sure the transaction's bump-allocator buffer is at least @size bytes,
+ * growing it (to a power of two) with krealloc() if necessary.
+ *
+ * Returns 0 if the buffer was already big enough or was allocated for the
+ * first time, -ENOMEM if the allocation failed, and -EINTR if an existing
+ * buffer had to be reallocated.
+ */
+static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
+{
+	size_t prev_bytes, grown_bytes;
+	void *buf;
+
+	if (size <= trans->mem_bytes)
+		return 0;
+
+	prev_bytes	= trans->mem_bytes;
+	grown_bytes	= roundup_pow_of_two(size);
+
+	buf = krealloc(trans->mem, grown_bytes, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	trans->mem		= buf;
+	trans->mem_bytes	= grown_bytes;
+
+	/* first allocation: nothing can be pointing into an old buffer */
+	if (!prev_bytes)
+		return 0;
+
+	trace_trans_restart_mem_realloced(trans->ip, grown_bytes);
+	return -EINTR;
+}
+
+/*
+ * Bump-allocate @size bytes from the transaction's memory buffer.
+ *
+ * Returns a pointer into trans->mem, or an ERR_PTR() carrying the error from
+ * bch2_trans_preload_mem() when the buffer could not be (or had to be)
+ * grown.
+ */
+void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+	void *chunk;
+	int err = bch2_trans_preload_mem(trans, trans->mem_top + size);
+
+	if (err)
+		return ERR_PTR(err);
+
+	chunk = trans->mem + trans->mem_top;
+	trans->mem_top += size;
+
+	return chunk;
+}
+
+/*
+ * Free every iterator slot that is linked but neither touched nor live,
+ * unlocking each one via __bch2_trans_iter_free().
+ */
+inline void bch2_trans_unlink_iters(struct btree_trans *trans)
+{
+	u64 iters = trans->iters_linked &
+		~trans->iters_touched &
+		~trans->iters_live;
+
+	/* walk the set bits from lowest to highest */
+	while (iters) {
+		unsigned idx = __ffs64(iters);
+
+		iters &= ~(1ULL << idx);
+		__bch2_trans_iter_free(trans, idx);
+	}
+}
+
+/*
+ * Reset @trans so it can be (re)run:
+ *
+ *  - clear BTREE_ITER_KEEP_UNTIL_COMMIT on every linked iterator and free
+ *    iterators that are neither touched nor live;
+ *  - with TRANS_RESET_ITERS, mark all iterators as no longer live;
+ *  - limit the touched set to iterators that are still live;
+ *  - discard pending updates;
+ *  - with TRANS_RESET_MEM, rewind the bump allocator;
+ *  - finally re-traverse all iterators (the result of
+ *    bch2_btree_iter_traverse_all() is not checked here).
+ */
+void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
+{
+	struct btree_iter *iter;
+
+	trans_for_each_iter(trans, iter)
+		iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+	bch2_trans_unlink_iters(trans);
+
+	if (flags & TRANS_RESET_ITERS)
+		trans->iters_live = 0;
+
+	trans->iters_touched &= trans->iters_live;
+
+	trans->nr_updates		= 0;
+
+	if (flags & TRANS_RESET_MEM)
+		trans->mem_top		= 0;
+
+	bch2_btree_iter_traverse_all(trans);
+}
+
+/*
+ * Initialise a btree transaction on filesystem @c.
+ *
+ * Everything in *trans up to (not including) iters_onstack is zeroed, and the
+ * iterator/update arrays initially point at the storage embedded in the
+ * transaction itself.  @expected_nr_iters and @expected_mem_bytes are sizing
+ * hints used to preallocate up front; failures of those preallocations are
+ * ignored here.
+ */
+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
+		     unsigned expected_nr_iters,
+		     size_t expected_mem_bytes)
+{
+	memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
+
+	trans->c		= c;
+	/* caller's address, passed to the restart tracepoints */
+	trans->ip		= _RET_IP_;
+	trans->size		= ARRAY_SIZE(trans->iters_onstack);
+	trans->iters		= trans->iters_onstack;
+	trans->updates		= trans->updates_onstack;
+	trans->updates_sorted	= trans->updates_sorted_onstack;
+	trans->fs_usage_deltas	= NULL;
+
+	if (expected_nr_iters > trans->size)
+		bch2_trans_realloc_iters(trans, expected_nr_iters);
+
+	if (expected_mem_bytes)
+		bch2_trans_preload_mem(trans, expected_mem_bytes);
+}
+
+/*
+ * Tear down a transaction: unlock it and release everything it allocated
+ * (fs_usage_deltas, the bump-allocator buffer, and the iterator arrays, which
+ * go back to the mempool or kfree() depending on where they came from; the
+ * embedded on-stack arrays are not freed).
+ *
+ * Returns -EIO if trans->error is set, 0 otherwise.
+ */
+int bch2_trans_exit(struct btree_trans *trans)
+{
+	bch2_trans_unlock(trans);
+
+	kfree(trans->fs_usage_deltas);
+	kfree(trans->mem);
+	if (trans->used_mempool)
+		mempool_free(trans->iters, &trans->c->btree_iters_pool);
+	else if (trans->iters != trans->iters_onstack)
+		kfree(trans->iters);
+	/* bogus non-NULL values - presumably so that use after exit faults */
+	trans->mem	= (void *) 0x1;
+	trans->iters	= (void *) 0x1;
+
+	return trans->error ? -EIO : 0;
+}
+
+/* Filesystem teardown: destroy the fallback mempool for iterator arrays. */
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
+{
+	mempool_exit(&c->btree_iters_pool);
+}
+
+/*
+ * Filesystem setup: create the fallback mempool for transaction iterator
+ * arrays.  It reserves a single element, big enough for BTREE_ITER_MAX
+ * iterators plus the matching updates and updates_sorted arrays.
+ *
+ * Returns the result of mempool_init_kmalloc_pool().
+ */
+int bch2_fs_btree_iter_init(struct bch_fs *c)
+{
+	unsigned nr = BTREE_ITER_MAX;
+	size_t iters_bytes	= sizeof(struct btree_iter) * nr;
+	size_t updates_bytes	= sizeof(struct btree_insert_entry) * (nr + 4);
+	size_t sorted_bytes	= sizeof(u8) * (nr + 4);
+
+	return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+			iters_bytes + updates_bytes + sorted_bytes);
+}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
new file mode 100644
index 000000000000..4c5032222319
--- /dev/null
+++ b/fs/bcachefs/btree_iter.h
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_ITER_H
+#define _BCACHEFS_BTREE_ITER_H
+
+#include "bset.h"
+#include "btree_types.h"
+
+/*
+ * Raise the iterator's uptodate state to at least @u.  The state is only ever
+ * increased here (max of old and new), never lowered.
+ */
+static inline void btree_iter_set_dirty(struct btree_iter *iter,
+					enum btree_iter_uptodate u)
+{
+	iter->uptodate = max_t(unsigned, iter->uptodate, u);
+}
+
+/*
+ * Node the iterator holds at @level, or NULL when @level is beyond the
+ * maximum btree depth.
+ */
+static inline struct btree *btree_iter_node(struct btree_iter *iter,
+					    unsigned level)
+{
+	if (level >= BTREE_MAX_DEPTH)
+		return NULL;
+
+	return iter->l[level].b;
+}
+
+/*
+ * True if the lock sequence number @iter saved for @level still corresponds
+ * to @b's current lock sequence number, ignoring the lowest bit.
+ */
+static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter,
+					const struct btree *b, unsigned level)
+{
+	/*
+	 * We don't compare the low bits of the lock sequence numbers because
+	 * @iter might have taken a write lock on @b, and we don't want to skip
+	 * the linked iterator if the sequence numbers were equal before taking
+	 * that write lock. The lock sequence number is incremented by taking
+	 * and releasing write locks and is even when unlocked:
+	 */
+	return iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1;
+}
+
+/*
+ * Node the iterator holds one level above @b, or NULL if that level is past
+ * the maximum btree depth (see btree_iter_node()).
+ */
+static inline struct btree *btree_node_parent(struct btree_iter *iter,
+					      struct btree *b)
+{
+	return btree_iter_node(iter, b->level + 1);
+}
+
+/* True if more than one iterator slot is currently linked in @trans. */
+static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans)
+{
+	return hweight64(trans->iters_linked) > 1;
+}
+
+/* -EIO if the iterator has BTREE_ITER_ERROR set, 0 otherwise. */
+static inline int btree_iter_err(const struct btree_iter *iter)
+{
+	if (iter->flags & BTREE_ITER_ERROR)
+		return -EIO;
+
+	return 0;
+}
+
+/* Iterate over iters within a transaction: */
+
+/*
+ * Visit every allocated iterator slot (indices 0 .. nr_iters - 1) of @_trans,
+ * whether or not the slot is currently linked.
+ */
+#define trans_for_each_iter_all(_trans, _iter)				\
+	for (_iter = (_trans)->iters;					\
+	     _iter < (_trans)->iters + (_trans)->nr_iters;		\
+	     _iter++)
+
+/*
+ * Return the first linked iterator of @trans at index >= @idx, or NULL if
+ * there is none.
+ */
+static inline struct btree_iter *
+__trans_next_iter(struct btree_trans *trans, unsigned idx)
+{
+	/* debug check: each slot's stored index must match its array position */
+	EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx);
+
+	for (; idx < trans->nr_iters; idx++)
+		if (trans->iters_linked & (1ULL << idx))
+			return &trans->iters[idx];
+
+	return NULL;
+}
+
+/*
+ * Visit every linked iterator of @_trans in index order.  The next iterator
+ * is looked up from the current one's idx on each step.
+ */
+#define trans_for_each_iter(_trans, _iter)				\
+	for (_iter = __trans_next_iter((_trans), 0);			\
+	     (_iter);							\
+	     _iter = __trans_next_iter((_trans), (_iter)->idx + 1))
+
+/*
+ * True if @iter points at node @b (at @b's level) and the lock sequence
+ * number it saved for that level still matches @b's.
+ */
+static inline bool __iter_has_node(const struct btree_iter *iter,
+				   const struct btree *b)
+{
+	return iter->l[b->level].b == b &&
+		btree_node_lock_seq_matches(iter, b, b->level);
+}
+
+/*
+ * Return the first linked iterator of @trans at index >= @idx that has node
+ * @b (see __iter_has_node()), or NULL if there is none.
+ */
+static inline struct btree_iter *
+__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b,
+			    unsigned idx)
+{
+	struct btree_iter *iter = __trans_next_iter(trans, idx);
+
+	while (iter && !__iter_has_node(iter, b))
+		iter = __trans_next_iter(trans, iter->idx + 1);
+
+	return iter;
+}
+
+/* Visit, in index order, every linked iterator of @_trans that has node @_b. */
+#define trans_for_each_iter_with_node(_trans, _b, _iter)		\
+	for (_iter = __trans_next_iter_with_node((_trans), (_b), 0);	\
+	     (_iter);							\
+	     _iter = __trans_next_iter_with_node((_trans), (_b),	\
+						 (_iter)->idx + 1))
+
+/*
+ * Consistency checks: real functions when CONFIG_BCACHEFS_DEBUG is enabled,
+ * empty inline stubs otherwise.
+ */
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
+void bch2_btree_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_iter_verify(struct btree_iter *iter,
+					  struct btree *b) {}
+static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
+#endif
+
+void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
+					   struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
+			      struct btree_node_iter *, struct bkey_packed *,
+			      unsigned, unsigned);
+
+bool bch2_trans_relock(struct btree_trans *);
+void bch2_trans_unlock(struct btree_trans *);
+
+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
+
+/*
+ * Ensure @iter wants intent locks on at least @new_locks_want levels (capped
+ * at BTREE_MAX_DEPTH).
+ *
+ * If the iterator already wants that many, no locking is done and the result
+ * is whether its uptodate state is at most BTREE_ITER_NEED_PEEK.  Otherwise
+ * the upgrade is attempted - using the "nounlock" variant when the
+ * transaction has nounlock set - and that call's result is returned.
+ */
+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+					   unsigned new_locks_want)
+{
+	new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+	return iter->locks_want < new_locks_want
+		? (!iter->trans->nounlock
+		   ? __bch2_btree_iter_upgrade(iter, new_locks_want)
+		   : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
+		: iter->uptodate <= BTREE_ITER_NEED_PEEK;
+}
+
+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
+
+/*
+ * Downgrade @iter if it wants more intent-locked levels than its flags call
+ * for: one level for a BTREE_ITER_INTENT iterator, none otherwise.
+ */
+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+{
+	/*
+	 * The conditional must be parenthesised: '>' binds tighter than '?:',
+	 * so without the parentheses locks_want would be compared against the
+	 * raw flag bit instead of against 1 or 0.
+	 */
+	if (iter->locks_want > ((iter->flags & BTREE_ITER_INTENT) ? 1 : 0))
+		__bch2_btree_iter_downgrade(iter, 0);
+}
+
+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
+void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
+
+void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
+
+/*
+ * Traverse the iterator only if its uptodate state says it needs at least a
+ * relock; otherwise there is nothing to do and 0 is returned.
+ */
+static inline int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+	if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+		return 0;
+
+	return __bch2_btree_iter_traverse(iter);
+}
+
+int bch2_btree_iter_traverse_all(struct btree_trans *);
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
+
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+
+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
+void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
+
+/*
+ * Position to continue from after a key ending at @pos, which depends on the
+ * btree:
+ *  - inodes btree: the next inode number, offset 0;
+ *  - other non-extent btrees: bkey_successor(pos);
+ *  - extent btrees: @pos unchanged.
+ */
+static inline struct bpos btree_type_successor(enum btree_id id,
+					       struct bpos pos)
+{
+	if (id == BTREE_ID_INODES) {
+		pos.inode++;
+		pos.offset = 0;
+	} else if (!btree_node_type_is_extents(id)) {
+		pos = bkey_successor(pos);
+	}
+
+	return pos;
+}
+
+/*
+ * Counterpart of btree_type_successor() for iterating backwards: the previous
+ * inode number (offset 0) for the inodes btree, bkey_predecessor(pos) for
+ * every other btree (extent btrees included).
+ */
+static inline struct bpos btree_type_predecessor(enum btree_id id,
+					       struct bpos pos)
+{
+	if (id == BTREE_ID_INODES) {
+		--pos.inode;
+		pos.offset = 0;
+	} else {
+		pos = bkey_predecessor(pos);
+	}
+
+	return pos;
+}
+
+/*
+ * Three-way comparison of (btree @id, @pos) against iterator @r: ordered by
+ * btree id first, then by position via bkey_cmp().
+ */
+static inline int __btree_iter_cmp(enum btree_id id,
+				   struct bpos pos,
+				   const struct btree_iter *r)
+{
+	if (id != r->btree_id)
+		return id < r->btree_id ? -1 : 1;
+	return bkey_cmp(pos, r->pos);
+}
+
+/* Three-way comparison of two iterators by (btree id, position). */
+static inline int btree_iter_cmp(const struct btree_iter *l,
+				 const struct btree_iter *r)
+{
+	return __btree_iter_cmp(l->btree_id, l->pos, r);
+}
+
+/*
+ * Give up the CPU if a reschedule is pending (or a race fault is injected).
+ * The transaction is unlocked before scheduling and relocked afterwards;
+ * returns -EINTR when the relock fails, 0 otherwise.
+ * Note: does not revalidate iterator
+ */
+static inline int bch2_trans_cond_resched(struct btree_trans *trans)
+{
+	if (!need_resched() && !race_fault())
+		return 0;
+
+	bch2_trans_unlock(trans);
+	schedule();
+
+	if (!bch2_trans_relock(trans))
+		return -EINTR;
+
+	return 0;
+}
+
+/*
+ * Iterate over the nodes of btree @_btree_id at level @_depth, starting from
+ * @_start.  @_iter receives the node iterator obtained from @_trans and @_b
+ * the current node; the loop ends when peek/next returns NULL.
+ *
+ * The iterator is assigned through the @_iter parameter, so the caller's
+ * variable may have any name.
+ */
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start,	\
+			      _locks_want, _depth, _flags, _b)		\
+	for (_iter = bch2_trans_get_node_iter((_trans), (_btree_id),	\
+				_start, _locks_want, _depth, _flags),	\
+	     _b = bch2_btree_iter_peek_node(_iter);			\
+	     (_b);							\
+	     (_b) = bch2_btree_iter_next_node(_iter, _depth))
+
+/*
+ * Iterate over the leaf-level (depth 0) nodes of a btree, with locks_want
+ * set to 0.
+ */
+#define for_each_btree_node(_trans, _iter, _btree_id, _start,		\
+			    _flags, _b)					\
+	__for_each_btree_node(_trans, _iter, _btree_id, _start,		\
+			      0, 0, _flags, _b)
+
+/*
+ * Peek with the flavour selected by @flags: slot semantics when
+ * BTREE_ITER_SLOTS is set, plain key iteration otherwise.
+ */
+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+						     unsigned flags)
+{
+	if (flags & BTREE_ITER_SLOTS)
+		return bch2_btree_iter_peek_slot(iter);
+
+	return bch2_btree_iter_peek(iter);
+}
+
+/*
+ * Advance with the flavour selected by @flags: slot semantics when
+ * BTREE_ITER_SLOTS is set, plain key iteration otherwise.
+ */
+static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
+						     unsigned flags)
+{
+	if (flags & BTREE_ITER_SLOTS)
+		return bch2_btree_iter_next_slot(iter);
+
+	return bch2_btree_iter_next(iter);
+}
+
+/*
+ * Extract the error code from a key returned by the iterator functions:
+ * the negative errno if k.k is an error pointer, 0 otherwise (including when
+ * k.k is NULL).
+ */
+static inline int bkey_err(struct bkey_s_c k)
+{
+	return PTR_ERR_OR_ZERO(k.k);
+}
+
+/*
+ * Get an iterator on @_btree_id at @_start from @_trans and loop over keys,
+ * using slot or plain iteration according to @_flags.
+ *
+ * @_ret receives the error from obtaining the iterator or, failing that,
+ * from each peek/next; the loop stops when @_ret is non-zero or the returned
+ * key pointer is NULL.
+ */
+#define for_each_btree_key(_trans, _iter, _btree_id,			\
+			   _start, _flags, _k, _ret)			\
+	for ((_ret) = PTR_ERR_OR_ZERO((_iter) =				\
+			bch2_trans_get_iter((_trans), (_btree_id),	\
+					    (_start), (_flags))) ?:	\
+		      PTR_ERR_OR_ZERO(((_k) =				\
+			__bch2_btree_iter_peek(_iter, _flags)).k);	\
+	     !_ret && (_k).k;						\
+	     (_ret) = PTR_ERR_OR_ZERO(((_k) =				\
+			__bch2_btree_iter_next(_iter, _flags)).k))
+
+/*
+ * Continue looping over keys with an iterator the caller already has,
+ * starting with a peek at its current position.  @_ret is set from
+ * bkey_err() on every step; the loop stops on error or a NULL key pointer.
+ */
+#define for_each_btree_key_continue(_iter, _flags, _k, _ret)		\
+	for ((_k) = __bch2_btree_iter_peek(_iter, _flags);		\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     (_k) = __bch2_btree_iter_next(_iter, _flags))
+
+/* new multiple iterator interface: */
+
+int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
+int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
+
+void bch2_trans_unlink_iters(struct btree_trans *);
+
+struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id,
+				       struct bpos, unsigned);
+struct btree_iter *bch2_trans_copy_iter(struct btree_trans *,
+					struct btree_iter *);
+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
+				enum btree_id, struct bpos,
+				unsigned, unsigned, unsigned);
+
+#define TRANS_RESET_ITERS		(1 << 0)
+#define TRANS_RESET_MEM			(1 << 1)
+
+void bch2_trans_reset(struct btree_trans *, unsigned);
+
+/*
+ * Start (or restart) a transaction from scratch: reset both the iterators'
+ * live state and the transaction's bump-allocated memory.
+ */
+static inline void bch2_trans_begin(struct btree_trans *trans)
+{
+	/* plain call: a void function must not 'return' an expression */
+	bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM);
+}
+
+/*
+ * Reset the transaction's bump-allocated memory and pending updates while
+ * leaving the iterators' live state alone.
+ */
+static inline void bch2_trans_begin_updates(struct btree_trans *trans)
+{
+	/* plain call: a void function must not 'return' an expression */
+	bch2_trans_reset(trans, TRANS_RESET_MEM);
+}
+
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
+int bch2_trans_exit(struct btree_trans *);
+
+void bch2_fs_btree_iter_exit(struct bch_fs *);
+int bch2_fs_btree_iter_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
new file mode 100644
index 000000000000..fe8b58384a9e
--- /dev/null
+++ b/fs/bcachefs/btree_locking.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_LOCKING_H
+#define _BCACHEFS_BTREE_LOCKING_H
+
+/*
+ * Only for internal btree use:
+ *
+ * The btree iterator tracks what locks it wants to take, and what locks it
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
+ * updating the iterator state
+ */
+
+#include <linux/six.h>
+
+#include "btree_iter.h"
+
+/*
+ * Lock state of one btree level as tracked by an iterator.
+ * matches six lock types (with -1 added for "not locked")
+ */
+enum btree_node_locked_type {
+	BTREE_NODE_UNLOCKED		= -1,
+	BTREE_NODE_READ_LOCKED		= SIX_LOCK_read,
+	BTREE_NODE_INTENT_LOCKED	= SIX_LOCK_intent,
+};
+
+/*
+ * Lock state the iterator records for @level, as an
+ * enum btree_node_locked_type value: starting from BTREE_NODE_UNLOCKED (-1),
+ * add one if the level's locked bit is set and one more if its intent bit is
+ * set.
+ */
+static inline int btree_node_locked_type(struct btree_iter *iter,
+					 unsigned level)
+{
+	/*
+	 * We're relying on the fact that if nodes_intent_locked is set
+	 * nodes_locked must be set as well, so that we can compute without
+	 * branches:
+	 */
+	return BTREE_NODE_UNLOCKED +
+		((iter->nodes_locked >> level) & 1) +
+		((iter->nodes_intent_locked >> level) & 1);
+}
+
+/* True if the iterator records an intent lock at @level. */
+static inline bool btree_node_intent_locked(struct btree_iter *iter,
+					    unsigned level)
+{
+	return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+}
+
+/* True if the iterator records a read (and not intent) lock at @level. */
+static inline bool btree_node_read_locked(struct btree_iter *iter,
+					  unsigned level)
+{
+	return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+}
+
+/* True if the iterator records any lock (read or intent) at @level. */
+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+{
+	return iter->nodes_locked & (1 << level);
+}
+
+/*
+ * Bookkeeping only: clear the locked and intent-locked bits for @level.
+ * Does not touch the six lock itself.
+ */
+static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+					    unsigned level)
+{
+	iter->nodes_locked &= ~(1 << level);
+	iter->nodes_intent_locked &= ~(1 << level);
+}
+
+/*
+ * Bookkeeping only: record that @level is locked with @type.  The locked bit
+ * is always set; the intent bit is set by OR-ing in @type shifted to @level,
+ * which works because SIX_LOCK_read is 0 and SIX_LOCK_intent is 1.
+ */
+static inline void mark_btree_node_locked(struct btree_iter *iter,
+					  unsigned level,
+					  enum six_lock_type type)
+{
+	/* relying on this to avoid a branch */
+	BUILD_BUG_ON(SIX_LOCK_read   != 0);
+	BUILD_BUG_ON(SIX_LOCK_intent != 1);
+
+	iter->nodes_locked |= 1 << level;
+	iter->nodes_intent_locked |= type << level;
+}
+
+/* Bookkeeping only: record that @level is intent locked. */
+static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+						 unsigned level)
+{
+	mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+}
+
+/*
+ * Six lock type the iterator wants at @level: intent for levels below
+ * locks_want, read for everything else.
+ */
+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
+{
+	if (level < iter->locks_want)
+		return SIX_LOCK_intent;
+
+	return SIX_LOCK_read;
+}
+
+/*
+ * Lock state the iterator should hold at @level:
+ *  - nothing below the iterator's own level;
+ *  - intent for levels (at or above its own) that are below locks_want;
+ *  - read for its own level when that is not covered by locks_want;
+ *  - nothing for any other level.
+ */
+static inline enum btree_node_locked_type
+btree_lock_want(struct btree_iter *iter, int level)
+{
+	if (level < iter->level)
+		return BTREE_NODE_UNLOCKED;
+	if (level < iter->locks_want)
+		return BTREE_NODE_INTENT_LOCKED;
+	if (level == iter->level)
+		return BTREE_NODE_READ_LOCKED;
+	return BTREE_NODE_UNLOCKED;
+}
+
+/*
+ * Release whatever lock the iterator records at @level (if any) on that
+ * level's node, and clear the iterator's lock bits for the level.
+ */
+static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
+{
+	int lock_type = btree_node_locked_type(iter, level);
+
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+	if (lock_type != BTREE_NODE_UNLOCKED)
+		six_unlock_type(&iter->l[level].b->lock, lock_type);
+	mark_btree_node_unlocked(iter, level);
+}
+
+/*
+ * Like __btree_node_unlock(), with a debug assertion that level 0 is never
+ * unlocked while the transaction has nounlock set.
+ */
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+{
+	EBUG_ON(!level && iter->trans->nounlock);
+
+	__btree_node_unlock(iter, level);
+}
+
+/*
+ * Drop every lock the iterator holds, lowest level first, and mark the
+ * iterator as needing (at least) a relock before it can be used again.
+ */
+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+{
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+	while (iter->nodes_locked)
+		btree_node_unlock(iter, __ffs(iter->nodes_locked));
+}
+
+/*
+ * Map a six lock type to the time-stats slot used for recording contended
+ * lock time.  Any other value is a BUG().
+ */
+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+{
+	switch (type) {
+	case SIX_LOCK_read:
+		return BCH_TIME_btree_lock_contended_read;
+	case SIX_LOCK_intent:
+		return BCH_TIME_btree_lock_contended_intent;
+	case SIX_LOCK_write:
+		return BCH_TIME_btree_lock_contended_write;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * wrapper around six locks that just traces lock contended time
+ *
+ * Takes the lock with six_lock_type() (no should-sleep callback) and records
+ * the time elapsed since entry in the filesystem's per-lock-type time stats.
+ */
+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
+					  enum six_lock_type type)
+{
+	u64 start_time = local_clock();
+
+	six_lock_type(&b->lock, type, NULL, NULL);
+	bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+}
+
+/*
+ * Lock @b with @type: try the uncontended trylock first, and only fall back
+ * to the slow path (which records contended time) when that fails.
+ */
+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
+					enum six_lock_type type)
+{
+	if (six_trylock_type(&b->lock, type))
+		return;
+
+	__btree_node_lock_type(c, b, type);
+}
+
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ *
+ * Looks for another iterator in the same transaction that points at @b at
+ * @level and holds it with a lock type of at least @want; if found, the
+ * node's lock count for @want is incremented and true is returned.  Returns
+ * false if no such iterator exists.
+ */
+static inline bool btree_node_lock_increment(struct btree_iter *iter,
+					     struct btree *b, unsigned level,
+					     enum btree_node_locked_type want)
+{
+	struct btree_iter *linked;
+
+	trans_for_each_iter(iter->trans, linked)
+		if (linked != iter &&
+		    linked->l[level].b == b &&
+		    btree_node_locked_type(linked, level) >= want) {
+			six_lock_increment(&b->lock, want);
+			return true;
+		}
+
+	return false;
+}
+
+bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
+			    struct btree_iter *, enum six_lock_type);
+
+/*
+ * Lock @b at @level with @type on behalf of @iter, trying in order:
+ *  1. an uncontended trylock;
+ *  2. piggybacking on a lock another iterator in the transaction already
+ *     holds on the same node;
+ *  3. the slow path, __bch2_btree_node_lock().
+ *
+ * Returns true once one of these succeeds, false if the slow path fails.
+ */
+static inline bool btree_node_lock(struct btree *b, struct bpos pos,
+				   unsigned level,
+				   struct btree_iter *iter,
+				   enum six_lock_type type)
+{
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+	return likely(six_trylock_type(&b->lock, type)) ||
+		btree_node_lock_increment(iter, b, level, type) ||
+		__bch2_btree_node_lock(b, pos, level, iter, type);
+}
+
+bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+
+/*
+ * Make sure the iterator holds its lock at @level: true straight away if the
+ * level is already marked locked, otherwise the result of the slow-path
+ * relock.
+ *
+ * Debug builds assert that a level which is already locked is locked with
+ * the type the iterator wants there.
+ */
+static inline bool bch2_btree_node_relock(struct btree_iter *iter,
+					  unsigned level)
+{
+	EBUG_ON(btree_node_locked(iter, level) &&
+		btree_node_locked_type(iter, level) !=
+		__btree_lock_want(iter, level));
+
+	return likely(btree_node_locked(iter, level)) ||
+		__bch2_btree_node_relock(iter, level);
+}
+
+/*
+ * Release the write lock on @b.
+ *
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed: every iterator in the transaction that has @b gets its saved
+ * sequence number for that level advanced by 2 before the lock is dropped.
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
+{
+	struct btree_iter *linked;
+
+	EBUG_ON(iter->l[b->level].b != b);
+	/* while write locked, the node's seq is one ahead of what we saved */
+	EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
+
+	trans_for_each_iter_with_node(iter->trans, b, linked)
+		linked->l[b->level].lock_seq += 2;
+
+	six_unlock_write(&b->lock);
+}
+
+void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
+
+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+
+/*
+ * Take the write lock on @b, which @iter must point at with an up-to-date
+ * saved sequence number (asserted in debug builds).  Tries an uncontended
+ * trylock first and falls back to __bch2_btree_node_lock_write().
+ */
+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+	EBUG_ON(iter->l[b->level].b != b);
+	EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq);
+
+	if (unlikely(!six_trylock_write(&b->lock)))
+		__bch2_btree_node_lock_write(b, iter);
+}
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
+
+
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
new file mode 100644
index 000000000000..efa68bb578ab
--- /dev/null
+++ b/fs/bcachefs/btree_types.h
@@ -0,0 +1,523 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H
+
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+#include <linux/six.h>
+
+#include "bkey_methods.h"
+#include "buckets_types.h"
+#include "journal_types.h"
+
+struct open_bucket;
+struct btree_update;
+struct btree_trans;
+
+#define MAX_BSETS		3U
+
+struct btree_nr_keys {
+
+	/*
+	 * Amount of live metadata (i.e. size of node after a compaction) in
+	 * units of u64s
+	 */
+	u16			live_u64s;
+	u16			bset_u64s[MAX_BSETS];
+
+	/* live keys only: */
+	u16			packed_keys;
+	u16			unpacked_keys;
+};
+
+struct bset_tree {
+	/*
+	 * We construct a binary tree in an array as if the array
+	 * started at 1, so that things line up on the same cachelines
+	 * better: see comments in bset.c at cacheline_to_bkey() for
+	 * details
+	 */
+
+	/* size of the binary tree and prev array */
+	u16			size;
+
+	/* function of size - precalculated for to_inorder() */
+	u16			extra;
+
+	u16			data_offset;
+	u16			aux_data_offset;
+	u16			end_offset;
+
+	struct bpos		max_key;
+};
+
+struct btree_write {
+	struct journal_entry_pin	journal;
+	struct closure_waitlist		wait;
+};
+
+struct btree_alloc {
+	struct open_buckets	ob;
+	BKEY_PADDED(k);
+};
+
+struct btree {
+	/* Hottest entries first */
+	struct rhash_head	hash;
+
+	/* Key/pointer for this btree node */
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+	struct six_lock		lock;
+
+	unsigned long		flags;
+	u16			written;
+	u8			level;
+	u8			btree_id;
+	u8			nsets;
+	u8			nr_key_bits;
+
+	struct bkey_format	format;
+
+	struct btree_node	*data;
+	void			*aux_data;
+
+	/*
+	 * Sets of sorted keys - the real btree node - plus a binary search tree
+	 *
+	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+	 * to the memory we have allocated for this btree node. Additionally,
+	 * set[0]->data points to the entire btree node as it exists on disk.
+	 */
+	struct bset_tree	set[MAX_BSETS];
+
+	struct btree_nr_keys	nr;
+	u16			sib_u64s[2];
+	u16			whiteout_u64s;
+	u16			uncompacted_whiteout_u64s;
+	u8			page_order;
+	u8			unpack_fn_len;
+
+	/*
+	 * XXX: add a delete sequence number, so when bch2_btree_node_relock()
+	 * fails because the lock sequence number has changed - i.e. the
+	 * contents were modified - we can still relock the node if it's still
+	 * the one we want, without redoing the traversal
+	 */
+
+	/*
+	 * For asynchronous splits/interior node updates:
+	 * When we do a split, we allocate new child nodes and update the parent
+	 * node to point to them: we update the parent in memory immediately,
+	 * but then we must wait until the children have been written out before
+	 * the update to the parent can be written - this is a list of the
+	 * btree_updates that are blocking this node from being
+	 * written:
+	 */
+	struct list_head	write_blocked;
+
+	/*
+	 * Also for asynchronous splits/interior node updates:
+	 * If a btree node isn't reachable yet, we don't want to kick off
+	 * another write - because that write also won't yet be reachable and
+	 * marking it as completed before it's reachable would be incorrect:
+	 */
+	unsigned long		will_make_reachable;
+
+	struct open_buckets	ob;
+
+	/* lru list */
+	struct list_head	list;
+
+	struct btree_write	writes[2];
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	bool			*expensive_debug_checks;
+#endif
+};
+
+struct btree_cache {
+	struct rhashtable	table;
+	bool			table_init_done;
+	/*
+	 * We never free a struct btree, except on shutdown - we just put it on
+	 * the btree_cache_freed list and reuse it later. This simplifies the
+	 * code, and it doesn't cost us much memory as the memory usage is
+	 * dominated by buffers that hold the actual btree node data and those
+	 * can be freed - and the number of struct btrees allocated is
+	 * effectively bounded.
+	 *
+	 * btree_cache_freeable effectively is a small cache - we use it because
+	 * high order page allocations can be rather expensive, and it's quite
+	 * common to delete and allocate btree nodes in quick succession. It
+	 * should never grow past ~2-3 nodes in practice.
+	 */
+	struct mutex		lock;
+	struct list_head	live;
+	struct list_head	freeable;
+	struct list_head	freed;
+
+	/* Number of elements in live + freeable lists */
+	unsigned		used;
+	unsigned		reserve;
+	struct shrinker		shrink;
+
+	/*
+	 * If we need to allocate memory for a new btree node and that
+	 * allocation fails, we can cannibalize another node in the btree cache
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
+	 */
+	struct task_struct	*alloc_lock;
+	struct closure_waitlist	alloc_wait;
+};
+
+struct btree_node_iter {
+	struct btree_node_iter_set {
+		u16	k, end;
+	} data[MAX_BSETS];
+};
+
+enum btree_iter_type {
+	BTREE_ITER_KEYS,
+	BTREE_ITER_NODES,
+};
+
+#define BTREE_ITER_TYPE			((1 << 2) - 1)
+
+#define BTREE_ITER_SLOTS		(1 << 2)
+#define BTREE_ITER_INTENT		(1 << 3)
+#define BTREE_ITER_PREFETCH		(1 << 4)
+#define BTREE_ITER_KEEP_UNTIL_COMMIT	(1 << 5)
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+#define BTREE_ITER_IS_EXTENTS		(1 << 6)
+#define BTREE_ITER_ERROR		(1 << 7)
+
+enum btree_iter_uptodate {
+	BTREE_ITER_UPTODATE		= 0,
+	BTREE_ITER_NEED_PEEK		= 1,
+	BTREE_ITER_NEED_RELOCK		= 2,
+	BTREE_ITER_NEED_TRAVERSE	= 3,
+};
+
+/*
+ * @pos			- iterator's current position
+ * @level		- current btree depth
+ * @locks_want		- btree level below which we start taking intent locks
+ * @nodes_locked	- bitmask indicating which nodes in @l are locked
+ * @nodes_intent_locked	- bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+	u8			idx;
+
+	struct btree_trans	*trans;
+	struct bpos		pos;
+
+	u8			flags;
+	enum btree_iter_uptodate uptodate:4;
+	enum btree_id		btree_id:4;
+	unsigned		level:4,
+				locks_want:4,
+				nodes_locked:4,
+				nodes_intent_locked:4;
+
+	struct btree_iter_level {
+		struct btree	*b;
+		struct btree_node_iter iter;
+		u32		lock_seq;
+	}			l[BTREE_MAX_DEPTH];
+
+	/*
+	 * Current unpacked key - so that bch2_btree_iter_next()/
+	 * bch2_btree_iter_next_slot() can correctly advance pos.
+	 */
+	struct bkey		k;
+};
+
+static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
+{
+	return iter->flags & BTREE_ITER_TYPE;
+}
+
+struct btree_insert_entry {
+	struct bkey_i		*k;
+	struct btree_iter	*iter;
+};
+
+#define BTREE_ITER_MAX		64
+
+struct btree_trans {
+	struct bch_fs		*c;
+	unsigned long		ip;
+
+	u64			iters_linked;
+	u64			iters_live;
+	u64			iters_touched;
+
+	u8			nr_iters;
+	u8			nr_updates;
+	u8			size;
+	unsigned		used_mempool:1;
+	unsigned		error:1;
+	unsigned		nounlock:1;
+
+	unsigned		mem_top;
+	unsigned		mem_bytes;
+	void			*mem;
+
+	struct btree_iter	*iters;
+	struct btree_insert_entry *updates;
+	u8			*updates_sorted;
+
+	/* update path: */
+	struct journal_res	journal_res;
+	struct journal_preres	journal_preres;
+	u64			*journal_seq;
+	struct disk_reservation *disk_res;
+	unsigned		flags;
+	unsigned		journal_u64s;
+	struct replicas_delta_list *fs_usage_deltas;
+
+	struct btree_iter	iters_onstack[2];
+	struct btree_insert_entry updates_onstack[6];
+	u8			updates_sorted_onstack[6];
+};
+
+#define BTREE_FLAG(flag)						\
+static inline bool btree_node_ ## flag(struct btree *b)			\
+{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
+									\
+static inline void set_btree_node_ ## flag(struct btree *b)		\
+{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\
+									\
+static inline void clear_btree_node_ ## flag(struct btree *b)		\
+{	clear_bit(BTREE_NODE_ ## flag, &b->flags); }
+
+enum btree_flags {
+	BTREE_NODE_read_in_flight,
+	BTREE_NODE_read_error,
+	BTREE_NODE_dirty,
+	BTREE_NODE_need_write,
+	BTREE_NODE_noevict,
+	BTREE_NODE_write_idx,
+	BTREE_NODE_accessed,
+	BTREE_NODE_write_in_flight,
+	BTREE_NODE_just_written,
+	BTREE_NODE_dying,
+	BTREE_NODE_fake,
+};
+
+BTREE_FLAG(read_in_flight);
+BTREE_FLAG(read_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(need_write);
+BTREE_FLAG(noevict);
+BTREE_FLAG(write_idx);
+BTREE_FLAG(accessed);
+BTREE_FLAG(write_in_flight);
+BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
+BTREE_FLAG(fake);
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+	return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+	return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset_tree *bset_tree_last(struct btree *b)
+{
+	EBUG_ON(!b->nsets);
+	return b->set + b->nsets - 1;
+}
+
+static inline void *
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+{
+	return (void *) ((u64 *) b->data + 1 + offset);
+}
+
+static inline u16
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
+{
+	u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+
+	EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
+	return ret;
+}
+
+static inline struct bset *bset(const struct btree *b,
+				const struct bset_tree *t)
+{
+	return __btree_node_offset_to_ptr(b, t->data_offset);
+}
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+	t->end_offset =
+		__btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+				  const struct bset *i)
+{
+	t->data_offset = __btree_node_ptr_to_offset(b, i);
+	set_btree_bset_end(b, t);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+	return bset(b, b->set);
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+	return bset(b, bset_tree_last(b));
+}
+
+static inline u16
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
+{
+	return __btree_node_ptr_to_offset(b, k);
+}
+
+static inline struct bkey_packed *
+__btree_node_offset_to_key(const struct btree *b, u16 k)
+{
+	return __btree_node_offset_to_ptr(b, k);
+}
+
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
+{
+	return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
+}
+
+#define btree_bkey_first(_b, _t)					\
+({									\
+	EBUG_ON(bset(_b, _t)->start !=					\
+		__btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
+									\
+	bset(_b, _t)->start;						\
+})
+
+#define btree_bkey_last(_b, _t)						\
+({									\
+	EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=	\
+		vstruct_last(bset(_b, _t)));				\
+									\
+	__btree_node_offset_to_key(_b, (_t)->end_offset);		\
+})
+
+static inline unsigned bset_u64s(struct bset_tree *t)
+{
+	return t->end_offset - t->data_offset -
+		sizeof(struct bset) / sizeof(u64);
+}
+
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+	return i - (void *) b->data;
+}
+
+enum btree_node_type {
+#define x(kwd, val, name) BKEY_TYPE_##kwd = val,
+	BCH_BTREE_IDS()
+#undef x
+	BKEY_TYPE_BTREE,
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
+{
+	return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id;
+}
+
+/* Type of keys @b contains: */
+static inline enum btree_node_type btree_node_type(struct btree *b)
+{
+	return __btree_node_type(b->level, b->btree_id);
+}
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+	switch (type) {
+	case BKEY_TYPE_EXTENTS:
+	case BKEY_TYPE_REFLINK:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool btree_node_is_extents(struct btree *b)
+{
+	return btree_node_type_is_extents(btree_node_type(b));
+}
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS			\
+	((1U << BKEY_TYPE_EXTENTS)|			\
+	 (1U << BKEY_TYPE_ALLOC)|			\
+	 (1U << BKEY_TYPE_INODES)|			\
+	 (1U << BKEY_TYPE_REFLINK)|			\
+	 (1U << BKEY_TYPE_EC)|				\
+	 (1U << BKEY_TYPE_BTREE))
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS		\
+	((1U << BKEY_TYPE_EXTENTS)|			\
+	 (1U << BKEY_TYPE_INODES)|			\
+	 (1U << BKEY_TYPE_REFLINK))
+
+static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+{
+	return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
+}
+
+struct btree_root {
+	struct btree		*b;
+
+	struct btree_update	*as;
+
+	/* On disk root - see async splits: */
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+	u8			level;
+	u8			alive;
+	s8			error;
+};
+
+/*
+ * Optional hook that will be called just prior to a btree node update, when
+ * we're holding the write lock and we know what key is about to be overwritten:
+ */
+
+enum btree_insert_ret {
+	BTREE_INSERT_OK,
+	/* leaf node needs to be split */
+	BTREE_INSERT_BTREE_NODE_FULL,
+	BTREE_INSERT_ENOSPC,
+	BTREE_INSERT_NEED_MARK_REPLICAS,
+	BTREE_INSERT_NEED_JOURNAL_RES,
+};
+
+enum btree_gc_coalesce_fail_reason {
+	BTREE_GC_COALESCE_FAIL_RESERVE_GET,
+	BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
+	BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
+};
+
+enum btree_node_sibling {
+	btree_prev_sib,
+	btree_next_sib,
+};
+
+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
+							struct btree *,
+							struct btree_node_iter *);
+
+#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
new file mode 100644
index 000000000000..ad8cbf3fb778
--- /dev/null
+++ b/fs/bcachefs/btree_update.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_H
+#define _BCACHEFS_BTREE_UPDATE_H
+
+#include "btree_iter.h"
+#include "journal.h"
+
+struct bch_fs;
+struct btree;
+
+void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
+				     struct btree_iter *);
+bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
+				struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *,
+			    struct bkey_i *);
+
+enum {
+	__BTREE_INSERT_ATOMIC,
+	__BTREE_INSERT_NOUNLOCK,
+	__BTREE_INSERT_NOFAIL,
+	__BTREE_INSERT_NOCHECK_RW,
+	__BTREE_INSERT_LAZY_RW,
+	__BTREE_INSERT_USE_RESERVE,
+	__BTREE_INSERT_USE_ALLOC_RESERVE,
+	__BTREE_INSERT_JOURNAL_REPLAY,
+	__BTREE_INSERT_JOURNAL_RESERVED,
+	__BTREE_INSERT_NOMARK_OVERWRITES,
+	__BTREE_INSERT_NOMARK,
+	__BTREE_INSERT_NO_CLEAR_REPLICAS,
+	__BTREE_INSERT_BUCKET_INVALIDATE,
+	__BTREE_INSERT_NOWAIT,
+	__BTREE_INSERT_GC_LOCK_HELD,
+	__BCH_HASH_SET_MUST_CREATE,
+	__BCH_HASH_SET_MUST_REPLACE,
+};
+
+/*
+ * Don't drop/retake locks before doing btree update, instead return -EINTR if
+ * we had to drop locks for any reason
+ */
+#define BTREE_INSERT_ATOMIC		(1 << __BTREE_INSERT_ATOMIC)
+
+/*
+ * Don't drop locks _after_ successfully updating btree:
+ */
+#define BTREE_INSERT_NOUNLOCK		(1 << __BTREE_INSERT_NOUNLOCK)
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL		(1 << __BTREE_INSERT_NOFAIL)
+
+#define BTREE_INSERT_NOCHECK_RW		(1 << __BTREE_INSERT_NOCHECK_RW)
+#define BTREE_INSERT_LAZY_RW		(1 << __BTREE_INSERT_LAZY_RW)
+
+/* for copygc, or when merging btree nodes */
+#define BTREE_INSERT_USE_RESERVE	(1 << __BTREE_INSERT_USE_RESERVE)
+#define BTREE_INSERT_USE_ALLOC_RESERVE	(1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
+
+/* Insert is for journal replay - don't get journal reservations: */
+#define BTREE_INSERT_JOURNAL_REPLAY	(1 << __BTREE_INSERT_JOURNAL_REPLAY)
+
+#define BTREE_INSERT_JOURNAL_RESERVED	(1 << __BTREE_INSERT_JOURNAL_RESERVED)
+
+/* Don't mark overwrites, just new key: */
+#define BTREE_INSERT_NOMARK_OVERWRITES	(1 << __BTREE_INSERT_NOMARK_OVERWRITES)
+
+/* Don't call mark new key at all: */
+#define BTREE_INSERT_NOMARK		(1 << __BTREE_INSERT_NOMARK)
+
+#define BTREE_INSERT_NO_CLEAR_REPLICAS	(1 << __BTREE_INSERT_NO_CLEAR_REPLICAS)
+
+#define BTREE_INSERT_BUCKET_INVALIDATE	(1 << __BTREE_INSERT_BUCKET_INVALIDATE)
+
+/* Don't block on allocation failure (for new btree nodes): */
+#define BTREE_INSERT_NOWAIT		(1 << __BTREE_INSERT_NOWAIT)
+#define BTREE_INSERT_GC_LOCK_HELD	(1 << __BTREE_INSERT_GC_LOCK_HELD)
+
+#define BCH_HASH_SET_MUST_CREATE	(1 << __BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE	(1 << __BCH_HASH_SET_MUST_REPLACE)
+
+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
+
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
+		     struct disk_reservation *, u64 *, int flags);
+
+int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *,
+			       struct bpos, u64 *);
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
+			    struct bpos, struct bpos, u64 *);
+
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
+			    __le64, unsigned);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+			       struct btree *, struct bkey_i_btree_ptr *);
+
+int __bch2_trans_commit(struct btree_trans *);
+
+/**
+ * bch2_trans_commit - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+static inline int bch2_trans_commit(struct btree_trans *trans,
+				    struct disk_reservation *disk_res,
+				    u64 *journal_seq,
+				    unsigned flags)
+{
+	trans->disk_res		= disk_res;
+	trans->journal_seq	= journal_seq;
+	trans->flags		= flags;
+
+	return __bch2_trans_commit(trans);
+}
+
+static inline void bch2_trans_update(struct btree_trans *trans,
+				     struct btree_iter *iter,
+				     struct bkey_i *k)
+{
+	EBUG_ON(trans->nr_updates >= trans->nr_iters + 4);
+
+	iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+	trans->updates[trans->nr_updates++] = (struct btree_insert_entry) {
+		.iter = iter, .k = k
+	};
+}
+
+#define bch2_trans_do(_c, _journal_seq, _flags, _do)			\
+({									\
+	struct btree_trans trans;					\
+	int _ret;							\
+									\
+	bch2_trans_init(&trans, (_c), 0, 0);				\
+									\
+	do {								\
+		bch2_trans_begin(&trans);				\
+									\
+		_ret = (_do) ?:	bch2_trans_commit(&trans, NULL,		\
+					(_journal_seq), (_flags));	\
+	} while (_ret == -EINTR);					\
+									\
+	bch2_trans_exit(&trans);					\
+	_ret;								\
+})
+
+#define trans_for_each_update(_trans, _i)				\
+	for ((_i) = (_trans)->updates;					\
+	     (_i) < (_trans)->updates + (_trans)->nr_updates;		\
+	     (_i)++)
+
+#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
new file mode 100644
index 000000000000..40d801e1094f
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.c
@@ -0,0 +1,2234 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "replicas.h"
+#include "super-io.h"
+
+#include <linux/random.h>
+#include <trace/events/bcachefs.h>
+
+static void btree_node_will_make_reachable(struct btree_update *,
+					   struct btree *);
+static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
+
+/* Debug code: */
+
+static void btree_node_interior_verify(struct btree *b)
+{
+	struct btree_node_iter iter;
+	struct bkey_packed *k;
+
+	BUG_ON(!b->level);
+
+	bch2_btree_node_iter_init(&iter, b, &b->key.k.p);
+#if 1
+	BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
+	       bkey_cmp_left_packed(b, k, &b->key.k.p));
+
+	BUG_ON((bch2_btree_node_iter_advance(&iter, b),
+		!bch2_btree_node_iter_end(&iter)));
+#else
+	const char *msg;
+
+	msg = "not found";
+	k = bch2_btree_node_iter_peek(&iter, b);
+	if (!k)
+		goto err;
+
+	msg = "isn't what it should be";
+	if (bkey_cmp_left_packed(b, k, &b->key.k.p))
+		goto err;
+
+	bch2_btree_node_iter_advance(&iter, b);
+
+	msg = "isn't last key";
+	if (!bch2_btree_node_iter_end(&iter))
+		goto err;
+	return;
+err:
+	bch2_dump_btree_node(b);
+	printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
+	       b->key.k.p.offset, msg);
+	BUG();
+#endif
+}
+
+/* Calculate ideal packed bkey format for new btree nodes: */
+
+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+{
+	struct bkey_packed *k;
+	struct bset_tree *t;
+	struct bkey uk;
+
+	bch2_bkey_format_add_pos(s, b->data->min_key);
+
+	for_each_bset(b, t)
+		for (k = btree_bkey_first(b, t);
+		     k != btree_bkey_last(b, t);
+		     k = bkey_next(k))
+			if (!bkey_whiteout(k)) {
+				uk = bkey_unpack_key(b, k);
+				bch2_bkey_format_add_key(s, &uk);
+			}
+}
+
+static struct bkey_format bch2_btree_calc_format(struct btree *b)
+{
+	struct bkey_format_state s;
+
+	bch2_bkey_format_init(&s);
+	__bch2_btree_calc_format(&s, b);
+
+	return bch2_bkey_format_done(&s);
+}
+
+static size_t btree_node_u64s_with_format(struct btree *b,
+					  struct bkey_format *new_f)
+{
+	struct bkey_format *old_f = &b->format;
+
+	/* stupid integer promotion rules */
+	ssize_t delta =
+	    (((int) new_f->key_u64s - old_f->key_u64s) *
+	     (int) b->nr.packed_keys) +
+	    (((int) new_f->key_u64s - BKEY_U64s) *
+	     (int) b->nr.unpacked_keys);
+
+	BUG_ON(delta + b->nr.live_u64s < 0);
+
+	return b->nr.live_u64s + delta;
+}
+
+/**
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
+ *
+ * This assumes all keys can pack with the new format -- it just checks if
+ * the re-packed keys would fit inside the node itself.
+ */
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
+				 struct bkey_format *new_f)
+{
+	size_t u64s = btree_node_u64s_with_format(b, new_f);
+
+	return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+}
+
+/* Btree node freeing/allocation: */
+
+static bool btree_key_matches(struct bch_fs *c,
+			      struct bkey_s_c l,
+			      struct bkey_s_c r)
+{
+	struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
+	struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
+	const struct bch_extent_ptr *ptr1, *ptr2;
+
+	bkey_for_each_ptr(ptrs1, ptr1)
+		bkey_for_each_ptr(ptrs2, ptr2)
+			if (ptr1->dev == ptr2->dev &&
+			    ptr1->gen == ptr2->gen &&
+			    ptr1->offset == ptr2->offset)
+				return true;
+
+	return false;
+}
+
+/*
+ * We're doing the index update that makes @b unreachable, update stuff to
+ * reflect that:
+ *
+ * Must be called _before_ btree_update_updated_root() or
+ * btree_update_updated_node():
+ */
+static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
+				       struct bkey_s_c k,
+				       struct bch_fs_usage *stats)
+{
+	struct bch_fs *c = as->c;
+	struct pending_btree_node_free *d;
+
+	for (d = as->pending; d < as->pending + as->nr_pending; d++)
+		if (!bkey_cmp(k.k->p, d->key.k.p) &&
+		    btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
+			goto found;
+	BUG();
+found:
+	BUG_ON(d->index_update_done);
+	d->index_update_done = true;
+
+	/*
+	 * We're dropping @k from the btree, but it's still live until the
+	 * index update is persistent so we need to keep a reference around for
+	 * mark and sweep to find - that's primarily what the
+	 * btree_node_pending_free list is for.
+	 *
+	 * So here (when we set index_update_done = true), we're moving an
+	 * existing reference to a different part of the larger "gc keyspace" -
+	 * and the new position comes after the old position, since GC marks
+	 * the pending free list after it walks the btree.
+	 *
+	 * If we move the reference while mark and sweep is _between_ the old
+	 * and the new position, mark and sweep will see the reference twice
+	 * and it'll get double accounted - so check for that here and subtract
+	 * to cancel out one of mark and sweep's markings if necessary:
+	 */
+
+	if (gc_pos_cmp(c->gc_pos, b
+		       ? gc_pos_btree_node(b)
+		       : gc_pos_btree_root(as->btree_id)) >= 0 &&
+	    gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
+		bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
+			      0, 0, NULL, 0,
+			      BCH_BUCKET_MARK_OVERWRITE|
+			      BCH_BUCKET_MARK_GC);
+}
+
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
+{
+	trace_btree_node_free(c, b);
+
+	BUG_ON(btree_node_dirty(b));
+	BUG_ON(btree_node_need_write(b));
+	BUG_ON(b == btree_node_root(c, b));
+	BUG_ON(b->ob.nr);
+	BUG_ON(!list_empty(&b->write_blocked));
+	BUG_ON(b->will_make_reachable);
+
+	clear_btree_node_noevict(b);
+
+	bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+	mutex_lock(&c->btree_cache.lock);
+	list_move(&b->list, &c->btree_cache.freeable);
+	mutex_unlock(&c->btree_cache.lock);
+}
+
+void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
+{
+	struct open_buckets ob = b->ob;
+
+	btree_update_drop_new_node(c, b);
+
+	b->ob.nr = 0;
+
+	clear_btree_node_dirty(b);
+
+	btree_node_lock_type(c, b, SIX_LOCK_write);
+	__btree_node_free(c, b);
+	six_unlock_write(&b->lock);
+
+	bch2_open_buckets_put(c, &ob);
+}
+
+void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
+				struct btree_iter *iter)
+{
+	struct btree_iter *linked;
+
+	trans_for_each_iter(iter->trans, linked)
+		BUG_ON(linked->l[b->level].b == b);
+
+	/*
+	 * Is this a node that isn't reachable on disk yet?
+	 *
+	 * Nodes that aren't reachable yet have writes blocked until they're
+	 * reachable - now that we've cancelled any pending writes and moved
+	 * things waiting on that write to wait on this update, we can drop this
+	 * node from the list of nodes that the other update is making
+	 * reachable, prior to freeing it:
+	 */
+	btree_update_drop_new_node(c, b);
+
+	six_lock_write(&b->lock, NULL, NULL);
+	__btree_node_free(c, b);
+	six_unlock_write(&b->lock);
+	six_unlock_intent(&b->lock);
+}
+
+static void bch2_btree_node_free_ondisk(struct bch_fs *c,
+					struct pending_btree_node_free *pending)
+{
+	BUG_ON(!pending->index_update_done);
+
+	bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+		      0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE);
+
+	if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
+		bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+			      0, 0, NULL, 0,
+			      BCH_BUCKET_MARK_OVERWRITE|
+			      BCH_BUCKET_MARK_GC);
+}
+
+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+					     struct disk_reservation *res,
+					     struct closure *cl,
+					     unsigned flags)
+{
+	struct write_point *wp;
+	struct btree *b;
+	BKEY_PADDED(k) tmp;
+	struct open_buckets ob = { .nr = 0 };
+	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
+	unsigned nr_reserve;
+	enum alloc_reserve alloc_reserve;
+
+	if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+		nr_reserve	= 0;
+		alloc_reserve	= RESERVE_ALLOC;
+	} else if (flags & BTREE_INSERT_USE_RESERVE) {
+		nr_reserve	= BTREE_NODE_RESERVE / 2;
+		alloc_reserve	= RESERVE_BTREE;
+	} else {
+		nr_reserve	= BTREE_NODE_RESERVE;
+		alloc_reserve	= RESERVE_NONE;
+	}
+
+	mutex_lock(&c->btree_reserve_cache_lock);
+	if (c->btree_reserve_cache_nr > nr_reserve) {
+		struct btree_alloc *a =
+			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+		ob = a->ob;
+		bkey_copy(&tmp.k, &a->k);
+		mutex_unlock(&c->btree_reserve_cache_lock);
+		goto mem_alloc;
+	}
+	mutex_unlock(&c->btree_reserve_cache_lock);
+
+retry:
+	wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
+				      writepoint_ptr(&c->btree_write_point),
+				      &devs_have,
+				      res->nr_replicas,
+				      c->opts.metadata_replicas_required,
+				      alloc_reserve, 0, cl);
+	if (IS_ERR(wp))
+		return ERR_CAST(wp);
+
+	if (wp->sectors_free < c->opts.btree_node_size) {
+		struct open_bucket *ob;
+		unsigned i;
+
+		open_bucket_for_each(c, &wp->ptrs, ob, i)
+			if (ob->sectors_free < c->opts.btree_node_size)
+				ob->sectors_free = 0;
+
+		bch2_alloc_sectors_done(c, wp);
+		goto retry;
+	}
+
+	bkey_btree_ptr_init(&tmp.k);
+	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
+
+	bch2_open_bucket_get(c, wp, &ob);
+	bch2_alloc_sectors_done(c, wp);
+mem_alloc:
+	b = bch2_btree_node_mem_alloc(c);
+
+	/* we hold cannibalize_lock: */
+	BUG_ON(IS_ERR(b));
+	BUG_ON(b->ob.nr);
+
+	bkey_copy(&b->key, &tmp.k);
+	b->ob = ob;
+
+	return b;
+}
+
+static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
+{
+	struct bch_fs *c = as->c;
+	struct btree *b;
+
+	BUG_ON(level >= BTREE_MAX_DEPTH);
+	BUG_ON(!as->reserve->nr);
+
+	b = as->reserve->b[--as->reserve->nr];
+
+	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
+
+	set_btree_node_accessed(b);
+	set_btree_node_dirty(b);
+	set_btree_node_need_write(b);
+
+	bch2_bset_init_first(b, &b->data->keys);
+	memset(&b->nr, 0, sizeof(b->nr));
+	b->data->magic = cpu_to_le64(bset_magic(c));
+	b->data->flags = 0;
+	SET_BTREE_NODE_ID(b->data, as->btree_id);
+	SET_BTREE_NODE_LEVEL(b->data, level);
+	b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
+
+	bch2_btree_build_aux_trees(b);
+
+	btree_node_will_make_reachable(as, b);
+
+	trace_btree_node_alloc(c, b);
+	return b;
+}
+
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
+						  struct btree *b,
+						  struct bkey_format format)
+{
+	struct btree *n;
+
+	n = bch2_btree_node_alloc(as, b->level);
+
+	n->data->min_key	= b->data->min_key;
+	n->data->max_key	= b->data->max_key;
+	n->data->format		= format;
+	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+
+	btree_node_set_format(n, format);
+
+	bch2_btree_sort_into(as->c, n, b);
+
+	btree_node_reset_sib_u64s(n);
+
+	n->key.k.p = b->key.k.p;
+	return n;
+}
+
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+						       struct btree *b)
+{
+	struct bkey_format new_f = bch2_btree_calc_format(b);
+
+	/*
+	 * The keys might expand with the new format - if they wouldn't fit in
+	 * the btree node anymore, use the old format for now:
+	 */
+	if (!bch2_btree_node_format_fits(as->c, b, &new_f))
+		new_f = b->format;
+
+	return __bch2_btree_node_alloc_replacement(as, b, new_f);
+}
+
+static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
+{
+	struct btree *b = bch2_btree_node_alloc(as, level);
+
+	b->data->min_key = POS_MIN;
+	b->data->max_key = POS_MAX;
+	b->data->format = bch2_btree_calc_format(b);
+	b->key.k.p = POS_MAX;
+
+	btree_node_set_format(b, b->data->format);
+	bch2_btree_build_aux_trees(b);
+
+	six_unlock_write(&b->lock);
+
+	return b;
+}
+
+static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
+{
+	bch2_disk_reservation_put(c, &reserve->disk_res);
+
+	mutex_lock(&c->btree_reserve_cache_lock);
+
+	while (reserve->nr) {
+		struct btree *b = reserve->b[--reserve->nr];
+
+		six_unlock_write(&b->lock);
+
+		if (c->btree_reserve_cache_nr <
+		    ARRAY_SIZE(c->btree_reserve_cache)) {
+			struct btree_alloc *a =
+				&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+			a->ob = b->ob;
+			b->ob.nr = 0;
+			bkey_copy(&a->k, &b->key);
+		} else {
+			bch2_open_buckets_put(c, &b->ob);
+		}
+
+		btree_node_lock_type(c, b, SIX_LOCK_write);
+		__btree_node_free(c, b);
+		six_unlock_write(&b->lock);
+
+		six_unlock_intent(&b->lock);
+	}
+
+	mutex_unlock(&c->btree_reserve_cache_lock);
+
+	mempool_free(reserve, &c->btree_reserve_pool);
+}
+
+/*
+ * Reserve disk space for, and preallocate, @nr_nodes new btree nodes; on
+ * failure returns an ERR_PTR() after releasing everything acquired so far:
+ */
+static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
+						    unsigned nr_nodes,
+						    unsigned flags,
+						    struct closure *cl)
+{
+	struct btree_reserve *reserve;
+	struct btree *b;
+	struct disk_reservation disk_res = { 0, 0 };
+	unsigned sectors = nr_nodes * c->opts.btree_node_size;
+	int ret, disk_res_flags = 0;
+
+	if (flags & BTREE_INSERT_NOFAIL)
+		disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
+
+	/* Not needed for correctness, just avoids doing wasted work: */
+	ret = bch2_journal_error(&c->journal);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (bch2_disk_reservation_get(c, &disk_res, sectors,
+				      c->opts.metadata_replicas,
+				      disk_res_flags))
+		return ERR_PTR(-ENOSPC);
+
+	BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+
+	/*
+	 * Protects reaping from the btree node cache and using the btree node
+	 * open bucket reserve:
+	 */
+	ret = bch2_btree_cache_cannibalize_lock(c, cl);
+	if (ret) {
+		bch2_disk_reservation_put(c, &disk_res);
+		return ERR_PTR(ret);
+	}
+
+	reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
+	reserve->disk_res = disk_res;
+	reserve->nr = 0;
+
+	while (reserve->nr < nr_nodes) {
+		b = __bch2_btree_node_alloc(c, &disk_res,
+					    flags & BTREE_INSERT_NOWAIT
+					    ? NULL : cl, flags);
+		if (IS_ERR(b)) {
+			ret = PTR_ERR(b);
+			goto err_free;
+		}
+
+		/* Add @b first, so that err_free also releases it: */
+		reserve->b[reserve->nr++] = b;
+		ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
+		if (ret)
+			goto err_free;
+	}
+
+	bch2_btree_cache_cannibalize_unlock(c);
+	return reserve;
+err_free:
+	bch2_btree_reserve_put(c, reserve);
+	bch2_btree_cache_cannibalize_unlock(c);
+	trace_btree_reserve_get_fail(c, nr_nodes, cl);
+	return ERR_PTR(ret);
+}
+
+/* Asynchronous interior node update machinery */
+
+/*
+ * Final teardown of a btree_update: flush its journal pin, release any reserve
+ * it still holds, unlink it from the filesystem's list of interior updates and
+ * return it to the mempool.
+ *
+ * By this point all new nodes and pending frees must already have been
+ * processed (see the BUG_ON()s).
+ */
+static void bch2_btree_update_free(struct btree_update *as)
+{
+	struct bch_fs *c = as->c;
+
+	bch2_journal_pin_flush(&c->journal, &as->journal);
+
+	BUG_ON(as->nr_new_nodes);
+	BUG_ON(as->nr_pending);
+
+	/* bch2_btree_update_done() clears as->reserve after putting it: */
+	if (as->reserve)
+		bch2_btree_reserve_put(c, as->reserve);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_del(&as->list);
+
+	closure_debug_destroy(&as->cl);
+	mempool_free(as, &c->btree_interior_update_pool);
+
+	/* Wake anyone waiting on interior updates to finish: */
+	closure_wake_up(&c->btree_interior_update_wait);
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * Closure callback, last stage of an interior update: drops the update's
+ * journal pin, releases the new nodes so they may be written, frees the
+ * replaced nodes on disk, wakes waiters on @as and frees the update.
+ */
+static void btree_update_nodes_reachable(struct closure *cl)
+{
+	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	struct bch_fs *c = as->c;
+
+	bch2_journal_pin_drop(&c->journal, &as->journal);
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	while (as->nr_new_nodes) {
+		struct btree *b = as->new_nodes[--as->nr_new_nodes];
+
+		/* The low tag bit is expected to have been cleared by now: */
+		BUG_ON(b->will_make_reachable != (unsigned long) as);
+		b->will_make_reachable = 0;
+		/*
+		 * The mutex is dropped while taking the node lock and issuing
+		 * the write, and retaken before looking at the array again:
+		 */
+		mutex_unlock(&c->btree_interior_update_lock);
+
+		/*
+		 * b->will_make_reachable prevented it from being written, so
+		 * write it now if it needs to be written:
+		 */
+		btree_node_lock_type(c, b, SIX_LOCK_read);
+		bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
+		six_unlock_read(&b->lock);
+		mutex_lock(&c->btree_interior_update_lock);
+	}
+
+	/* Free the old nodes recorded by btree_interior_update_add_node_reference(): */
+	while (as->nr_pending)
+		bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	closure_wake_up(&as->wait);
+
+	bch2_btree_update_free(as);
+}
+
+/*
+ * Closure callback: make sure the journal entry for as->journal_seq gets
+ * opened and flushed, then move on to btree_update_nodes_reachable().
+ *
+ * If opening the sequence number reports -EAGAIN this callback requeues
+ * itself; on any other error the flush is skipped and we proceed directly to
+ * the next stage.
+ */
+static void btree_update_wait_on_journal(struct closure *cl)
+{
+	struct btree_update *update =
+		container_of(cl, struct btree_update, cl);
+	struct journal *j = &update->c->journal;
+	int err = bch2_journal_open_seq_async(j, update->journal_seq, cl);
+
+	if (err == -EAGAIN) {
+		/* Try again later: */
+		continue_at(cl, btree_update_wait_on_journal, system_wq);
+		return;
+	}
+
+	if (err >= 0)
+		bch2_journal_flush_seq_async(j, update->journal_seq, cl);
+
+	continue_at(cl, btree_update_nodes_reachable, system_wq);
+}
+
+/*
+ * Closure callback queued by bch2_btree_update_done(): what happens next
+ * depends on as->mode, i.e. on how the new nodes are being made reachable
+ * (via an updated interior node, via another btree_update, or via a new root).
+ */
+static void btree_update_nodes_written(struct closure *cl)
+{
+	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	struct bch_fs *c = as->c;
+	struct btree *b;
+
+	/*
+	 * We did an update to a parent node where the pointers we added pointed
+	 * to child nodes that weren't written yet: now, the child nodes have
+	 * been written so we can write out the update to the interior node.
+	 */
+retry:
+	mutex_lock(&c->btree_interior_update_lock);
+	as->nodes_written = true;
+
+	switch (as->mode) {
+	case BTREE_INTERIOR_NO_UPDATE:
+		BUG();
+	case BTREE_INTERIOR_UPDATING_NODE:
+		/* The usual case: */
+		b = READ_ONCE(as->b);
+
+		/*
+		 * Only a trylock is attempted while holding the mutex; on
+		 * failure, drop the mutex, wait for the node lock, release it
+		 * and start over (as->mode and as->b are re-read on retry):
+		 */
+		if (!six_trylock_read(&b->lock)) {
+			mutex_unlock(&c->btree_interior_update_lock);
+			btree_node_lock_type(c, b, SIX_LOCK_read);
+			six_unlock_read(&b->lock);
+			goto retry;
+		}
+
+		BUG_ON(!btree_node_dirty(b));
+		closure_wait(&btree_current_write(b)->wait, cl);
+
+		list_del(&as->write_blocked_list);
+
+		/*
+		 * for flush_held_btree_writes() waiting on updates to flush or
+		 * nodes to be writeable:
+		 */
+		closure_wake_up(&c->btree_interior_update_wait);
+		mutex_unlock(&c->btree_interior_update_lock);
+
+		/*
+		 * b->write_blocked prevented it from being written, so
+		 * write it now if it needs to be written:
+		 */
+		bch2_btree_node_write_cond(c, b, true);
+		six_unlock_read(&b->lock);
+		break;
+
+	case BTREE_INTERIOR_UPDATING_AS:
+		/*
+		 * The btree node we originally updated has been freed and is
+		 * being rewritten - so we don't need to write anything here,
+		 * we just need to signal to that btree_update that it's ok to
+		 * make the new replacement node visible:
+		 */
+		closure_put(&as->parent_as->cl);
+
+		/*
+		 * and then we have to wait on that btree_update to finish:
+		 */
+		closure_wait(&as->parent_as->wait, cl);
+		mutex_unlock(&c->btree_interior_update_lock);
+		break;
+
+	case BTREE_INTERIOR_UPDATING_ROOT:
+		/* b is the new btree root: */
+		b = READ_ONCE(as->b);
+
+		/* Same trylock/retry dance as in the UPDATING_NODE case: */
+		if (!six_trylock_read(&b->lock)) {
+			mutex_unlock(&c->btree_interior_update_lock);
+			btree_node_lock_type(c, b, SIX_LOCK_read);
+			six_unlock_read(&b->lock);
+			goto retry;
+		}
+
+		BUG_ON(c->btree_roots[b->btree_id].as != as);
+		c->btree_roots[b->btree_id].as = NULL;
+
+		bch2_btree_set_root_ondisk(c, b, WRITE);
+
+		/*
+		 * We don't have to wait on anything here (before
+		 * btree_update_nodes_reachable frees the old nodes
+		 * ondisk) - we've ensured that the very next journal write will
+		 * have the pointer to the new root, and before the allocator
+		 * can reuse the old nodes it'll have to do a journal commit:
+		 */
+		six_unlock_read(&b->lock);
+		mutex_unlock(&c->btree_interior_update_lock);
+
+		/*
+		 * Bit of funny circularity going on here we have to break:
+		 *
+		 * We have to drop our journal pin before writing the journal
+		 * entry that points to the new btree root: else, we could
+		 * deadlock if the journal currently happens to be full.
+		 *
+		 * This means we're dropping the journal pin _before_ the new
+		 * nodes are technically reachable - but this is safe, because
+		 * after the bch2_btree_set_root_ondisk() call above they will
+		 * be reachable as of the very next journal write:
+		 */
+		bch2_journal_pin_drop(&c->journal, &as->journal);
+
+		as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
+
+		/* This path continues via the journal, not the continue_at() below: */
+		btree_update_wait_on_journal(cl);
+		return;
+	}
+
+	continue_at(cl, btree_update_nodes_reachable, system_wq);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ *
+ * Switches @as from BTREE_INTERIOR_NO_UPDATE to BTREE_INTERIOR_UPDATING_NODE
+ * and links it onto @b's write_blocked list; @b must already be dirty.
+ */
+static void btree_update_updated_node(struct btree_update *as, struct btree *b)
+{
+	struct bch_fs *c = as->c;
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+	BUG_ON(!btree_node_dirty(b));
+
+	as->mode = BTREE_INTERIOR_UPDATING_NODE;
+	as->b = b;
+	list_add(&as->write_blocked_list, &b->write_blocked);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	/*
+	 * In general, when you're staging things in a journal that will later
+	 * be written elsewhere, and you also want to guarantee ordering: that
+	 * is, if you have updates a, b, c, after a crash you should never see c
+	 * and not a or b - there's a problem:
+	 *
+	 * If the final destination of the update(s) (i.e. btree node) can be
+	 * written/flushed _before_ the relevant journal entry - oops, that
+	 * breaks ordering, since the various leaf nodes can be written in any
+	 * order.
+	 *
+	 * Normally we use bset->journal_seq to deal with this - if during
+	 * recovery we find a btree node write that's newer than the newest
+	 * journal entry, we just ignore it - we don't need it, anything we're
+	 * supposed to have (that we reported as completed via fsync()) will
+	 * still be in the journal, and as far as the state of the journal is
+	 * concerned that btree node write never happened.
+	 *
+	 * That breaks when we're rewriting/splitting/merging nodes, since we're
+	 * mixing btree node writes that haven't happened yet with previously
+	 * written data that has been reported as completed to the journal.
+	 *
+	 * Thus, before making the new nodes reachable, we have to wait for the
+	 * newest journal sequence number we have data for to be written (if it
+	 * hasn't been yet).
+	 */
+	bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
+}
+
+/*
+ * Journal pin flush callback for a btree_update's pin: kicks off an
+ * asynchronous flush of the journal up to the update's journal_seq.
+ * @seq is not used.
+ */
+static void interior_update_flush(struct journal *j,
+			struct journal_entry_pin *pin, u64 seq)
+{
+	struct btree_update *update;
+
+	/* @pin is embedded in the btree_update as its ->journal member: */
+	update = container_of(pin, struct btree_update, journal);
+
+	bch2_journal_flush_seq_async(j, update->journal_seq, NULL);
+}
+
+/*
+ * Make @child wait on @as instead of on a btree node: @child is switched to
+ * BTREE_INTERIOR_UPDATING_AS with @as as its parent, @as takes over @child's
+ * journal pin, and @as inherits @child's journal_seq if it is newer.
+ *
+ * Takes a ref on @as->cl; btree_update_nodes_written() drops it
+ * (closure_put(&as->parent_as->cl)) when @child's nodes have been written.
+ *
+ * Both callers in this file hold c->btree_interior_update_lock.
+ */
+static void btree_update_reparent(struct btree_update *as,
+				  struct btree_update *child)
+{
+	struct bch_fs *c = as->c;
+
+	child->b = NULL;
+	child->mode = BTREE_INTERIOR_UPDATING_AS;
+	child->parent_as = as;
+	closure_get(&as->cl);
+
+	/*
+	 * When we write a new btree root, we have to drop our journal pin
+	 * _before_ the new nodes are technically reachable; see
+	 * btree_update_nodes_written().
+	 *
+	 * This goes for journal pins that are recursively blocked on us - so,
+	 * just transfer the journal pin to the new interior update so
+	 * btree_update_nodes_written() can drop it.
+	 */
+	bch2_journal_pin_add_if_older(&c->journal, &child->journal,
+				      &as->journal, interior_update_flush);
+	bch2_journal_pin_drop(&c->journal, &child->journal);
+
+	as->journal_seq = max(as->journal_seq, child->journal_seq);
+}
+
+/*
+ * Record that @as is making its new nodes reachable by installing a new btree
+ * root: switches @as to BTREE_INTERIOR_UPDATING_ROOT, points as->b at the
+ * current in-memory root of as->btree_id and registers @as as that root's
+ * pending update.
+ */
+static void btree_update_updated_root(struct btree_update *as)
+{
+	struct bch_fs *c = as->c;
+	struct btree_root *r = &c->btree_roots[as->btree_id];
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+
+	/*
+	 * Old root might not be persistent yet - if so, redirect its
+	 * btree_update operation to point to us:
+	 */
+	if (r->as)
+		btree_update_reparent(as, r->as);
+
+	as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+	as->b = r->b;
+	r->as = as;
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	/*
+	 * When we're rewriting nodes and updating interior nodes, there's an
+	 * issue with updates that haven't been written in the journal getting
+	 * mixed together with older data - see btree_update_updated_node()
+	 * for the explanation.
+	 *
+	 * However, this doesn't affect us when we're writing a new btree root -
+	 * because to make that new root reachable we have to write out a new
+	 * journal entry, which must necessarily be newer than as->journal_seq.
+	 */
+}
+
+/*
+ * Register @b as a new node that @as is going to make reachable.
+ *
+ * @b is appended to as->new_nodes[] and b->will_make_reachable is set to the
+ * address of @as with the low bit set. A ref is taken on as->cl;
+ * btree_update_drop_new_node() drops a ref only if it still finds the low bit
+ * set.
+ */
+static void btree_node_will_make_reachable(struct btree_update *as,
+					   struct btree *b)
+{
+	struct bch_fs *fs = as->c;
+
+	mutex_lock(&fs->btree_interior_update_lock);
+
+	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
+	BUG_ON(b->will_make_reachable);
+
+	as->new_nodes[as->nr_new_nodes] = b;
+	as->nr_new_nodes++;
+
+	/* Pointer to the owning update, tagged in bit 0: */
+	b->will_make_reachable = (unsigned long) as | 1UL;
+
+	closure_get(&as->cl);
+
+	mutex_unlock(&fs->btree_interior_update_lock);
+}
+
+/*
+ * Detach @b from the btree_update (if any) that was going to make it
+ * reachable: clears b->will_make_reachable, removes @b from that update's
+ * new_nodes[] array, and drops a ref on the update's closure if the low tag
+ * bit was still set.
+ */
+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
+{
+	struct btree_update *owner;
+	unsigned long tagged;
+	unsigned idx;
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	/* Bit 0 is a flag, the remaining bits are the btree_update pointer: */
+	tagged = xchg(&b->will_make_reachable, 0);
+	owner = (struct btree_update *) (tagged & ~1UL);
+
+	if (owner) {
+		for (idx = 0; idx < owner->nr_new_nodes; idx++)
+			if (owner->new_nodes[idx] == b)
+				break;
+
+		/* @b must be on its owner's list: */
+		if (idx == owner->nr_new_nodes)
+			BUG();
+
+		array_remove_item(owner->new_nodes, owner->nr_new_nodes, idx);
+	}
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	if (owner && (tagged & 1))
+		closure_put(&owner->cl);
+}
+
+/*
+ * Record @b in as->pending[], the list of nodes this update is freeing:
+ * snapshot the node's key, btree id, level and sequence number so the node can
+ * be freed on disk later (see btree_update_nodes_reachable()).
+ */
+static void btree_interior_update_add_node_reference(struct btree_update *as,
+						     struct btree *b)
+{
+	struct bch_fs *c = as->c;
+	struct pending_btree_node_free *slot;
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	/* Claim the next free entry: */
+	BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+	slot = &as->pending[as->nr_pending];
+	as->nr_pending++;
+
+	bkey_copy(&slot->key, &b->key);
+	slot->btree_id		= b->btree_id;
+	slot->level		= b->level;
+	slot->seq		= b->data->keys.seq;
+	slot->index_update_done	= false;
+
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding btree_updates - redirect @b's
+ * btree_updates to point to this btree_update:
+ *
+ * Also marks @b as dying, records it in @as's pending-free list, clears its
+ * dirty/need_write flags, and moves its write waiters and journal pins over
+ * to @as. Nothing beyond marking it dying is done for a fake node.
+ */
+void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+					       struct btree *b)
+{
+	struct bch_fs *c = as->c;
+	struct closure *cl, *cl_n;
+	struct btree_update *p, *n;
+	struct btree_write *w;
+	struct bset_tree *t;
+
+	set_btree_node_dying(b);
+
+	if (btree_node_fake(b))
+		return;
+
+	btree_interior_update_add_node_reference(as, b);
+
+	/*
+	 * Does this node have data that hasn't been written in the journal?
+	 *
+	 * If so, we have to wait for the corresponding journal entry to be
+	 * written before making the new nodes reachable - we can't just carry
+	 * over the bset->journal_seq tracking, since we'll be mixing those keys
+	 * in with keys that aren't in the journal anymore:
+	 */
+	for_each_bset(b, t)
+		as->journal_seq = max(as->journal_seq,
+				      le64_to_cpu(bset(b, t)->journal_seq));
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	/*
+	 * Does this node have any btree_update operations preventing
+	 * it from being written?
+	 *
+	 * If so, redirect them to point to this btree_update: we can
+	 * write out our new nodes, but we won't make them visible until those
+	 * operations complete
+	 */
+	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
+		list_del(&p->write_blocked_list);
+		btree_update_reparent(as, p);
+
+		/*
+		 * for flush_held_btree_writes() waiting on updates to flush or
+		 * nodes to be writeable:
+		 */
+		closure_wake_up(&c->btree_interior_update_wait);
+	}
+
+	/* @b itself will not be written again: */
+	clear_btree_node_dirty(b);
+	clear_btree_node_need_write(b);
+	w = btree_current_write(b);
+
+	/*
+	 * Does this node have any btree_update operations waiting on this node
+	 * to be written?
+	 *
+	 * If so, wake them up when this btree_update operation is reachable:
+	 */
+	llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
+		llist_add(&cl->list, &as->wait.list);
+
+	/*
+	 * Does this node have unwritten data that has a pin on the journal?
+	 *
+	 * If so, transfer that pin to the btree_update operation -
+	 * note that if we're freeing multiple nodes, we only need to keep the
+	 * oldest pin of any of the nodes we're freeing. We'll release the pin
+	 * when the new nodes are persistent and reachable on disk:
+	 */
+	bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+				      &as->journal, interior_update_flush);
+	bch2_journal_pin_drop(&c->journal, &w->journal);
+
+	/* Same pin transfer for the node's previous write: */
+	w = btree_prev_write(b);
+	bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+				      &as->journal, interior_update_flush);
+	bch2_journal_pin_drop(&c->journal, &w->journal);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * Called once the in-memory part of an interior update is finished: the update
+ * must have picked a mode by now. Releases whatever is left of the node
+ * reserve and queues btree_update_nodes_written() on the update's closure.
+ */
+void bch2_btree_update_done(struct btree_update *as)
+{
+	struct bch_fs *c = as->c;
+
+	BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+
+	/* Clearing the pointer keeps bch2_btree_update_free() from putting it again: */
+	bch2_btree_reserve_put(c, as->reserve);
+	as->reserve = NULL;
+
+	continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
+}
+
+/*
+ * Begin an interior btree update on btree @id: reserve @nr_nodes nodes, then
+ * allocate and initialize a btree_update and add it to the filesystem's list
+ * of interior updates.
+ *
+ * @flags and @cl are passed through to bch2_btree_reserve_get().
+ *
+ * Returns the new btree_update, or the ERR_PTR() from the reserve on failure.
+ */
+struct btree_update *
+bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
+			unsigned nr_nodes, unsigned flags,
+			struct closure *cl)
+{
+	struct btree_update *update;
+	struct btree_reserve *nodes =
+		bch2_btree_reserve_get(c, nr_nodes, flags, cl);
+
+	if (IS_ERR(nodes))
+		return ERR_CAST(nodes);
+
+	update = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
+	memset(update, 0, sizeof(*update));
+
+	update->c		= c;
+	update->btree_id	= id;
+	update->mode		= BTREE_INTERIOR_NO_UPDATE;
+	update->reserve		= nodes;
+
+	closure_init(&update->cl, NULL);
+	INIT_LIST_HEAD(&update->write_blocked_list);
+	bch2_keylist_init(&update->parent_keys, update->inline_keys);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_add_tail(&update->list, &c->btree_interior_update_list);
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	return update;
+}
+
+/* Btree root updates: */
+
+/*
+ * Install @b as the in-memory root of its btree.
+ *
+ * @b is first taken off whatever btree cache list it is on. If a root already
+ * exists it must be marked dying and must not be at a higher level than @b.
+ */
+static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
+{
+	/* Root nodes cannot be reaped */
+	mutex_lock(&c->btree_cache.lock);
+	list_del_init(&b->list);
+	mutex_unlock(&c->btree_cache.lock);
+
+	mutex_lock(&c->btree_root_lock);
+	BUG_ON(btree_node_root(c, b) &&
+	       (b->level < btree_node_root(c, b)->level ||
+		!btree_node_dying(btree_node_root(c, b))));
+
+	btree_node_root(c, b) = b;
+	mutex_unlock(&c->btree_root_lock);
+
+	bch2_recalc_btree_reserve(c);
+}
+
+/*
+ * Install @b as the in-memory root and update usage accounting for the switch:
+ * @b's key is marked as inserted and, unless there was no old root or it was
+ * a fake node, the old root's key is passed to bch2_btree_node_free_index().
+ * The accumulated usage delta is applied against the update's disk
+ * reservation.
+ */
+static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
+{
+	struct bch_fs *c = as->c;
+	/* Must be sampled before the root pointer is replaced below: */
+	struct btree *old = btree_node_root(c, b);
+	struct bch_fs_usage *fs_usage;
+
+	__bch2_btree_set_root_inmem(c, b);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	percpu_down_read(&c->mark_lock);
+	fs_usage = bch2_fs_usage_scratch_get(c);
+
+	bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
+		      0, 0, fs_usage, 0,
+		      BCH_BUCKET_MARK_INSERT);
+	/* If gc has already visited the btree root position, mark for gc as well: */
+	if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
+		bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
+				     0, 0, NULL, 0,
+				     BCH_BUCKET_MARK_INSERT|
+				     BCH_BUCKET_MARK_GC);
+
+	if (old && !btree_node_fake(old))
+		bch2_btree_node_free_index(as, NULL,
+					   bkey_i_to_s_c(&old->key),
+					   fs_usage);
+	bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
+
+	bch2_fs_usage_scratch_put(c, fs_usage);
+	percpu_up_read(&c->mark_lock);
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * Copy the current in-memory root @b (its key and level) into the
+ * btree_roots[] entry for its btree and mark that entry alive. @b must
+ * already be the in-memory root. With @rw == WRITE the roots are additionally
+ * flagged dirty.
+ */
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
+{
+	struct btree_root *root = &c->btree_roots[b->btree_id];
+
+	mutex_lock(&c->btree_root_lock);
+
+	BUG_ON(b != root->b);
+
+	root->alive = true;
+	root->level = b->level;
+	bkey_copy(&root->key, &b->key);
+
+	if (rw == WRITE)
+		c->btree_roots_dirty = true;
+
+	mutex_unlock(&c->btree_root_lock);
+}
+
+/**
+ * bch2_btree_set_root - update the root in memory and on disk
+ *
+ * @as:		interior update that is installing the new root
+ * @b:		the new root node
+ * @iter:	btree iterator used to write-lock the old root
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks. However, you must hold an intent lock on the
+ * old root.
+ *
+ * Note: This allocates a journal entry but doesn't add any keys to
+ * it.  All the btree roots are part of every journal write, so there
+ * is nothing new to be done.  This just guarantees that there is a
+ * journal write.
+ */
+static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
+				struct btree_iter *iter)
+{
+	struct bch_fs *c = as->c;
+	struct btree *old;
+
+	trace_btree_set_root(c, b);
+	/* The new root must already have been written, unless btree writes are being held: */
+	BUG_ON(!b->written &&
+	       !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
+
+	old = btree_node_root(c, b);
+
+	/*
+	 * Ensure no one is using the old root while we switch to the
+	 * new root:
+	 */
+	bch2_btree_node_lock_write(old, iter);
+
+	bch2_btree_set_root_inmem(as, b);
+
+	btree_update_updated_root(as);
+
+	/*
+	 * Unlock old root after new root is visible:
+	 *
+	 * The new root isn't persistent, but that's ok: we still have
+	 * an intent lock on the new root, and any updates that would
+	 * depend on the new root would have to update the new root.
+	 */
+	bch2_btree_node_unlock_write(old, iter);
+}
+
+/* Interior node updates: */
+
+/*
+ * Insert one btree pointer key @insert into interior node @b, keeping usage
+ * accounting in step: the new key is marked as inserted, and if it overwrites
+ * an existing key at the same position, that old key is passed to
+ * bch2_btree_node_free_index().
+ *
+ * @node_iter is advanced to the insert position and used for the insert.
+ * @b is left dirty and flagged as needing a write. The key must fit in @b.
+ */
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
+					struct btree_iter *iter,
+					struct bkey_i *insert,
+					struct btree_node_iter *node_iter)
+{
+	struct bch_fs *c = as->c;
+	struct bch_fs_usage *fs_usage;
+	struct bkey_packed *k;
+	struct bkey tmp;
+
+	BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
+
+	mutex_lock(&c->btree_interior_update_lock);
+	percpu_down_read(&c->mark_lock);
+	fs_usage = bch2_fs_usage_scratch_get(c);
+
+	bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
+			     0, 0, fs_usage, 0,
+			     BCH_BUCKET_MARK_INSERT);
+
+	/* If gc has already visited this node's position, mark for gc as well: */
+	if (gc_visited(c, gc_pos_btree_node(b)))
+		bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
+				     0, 0, NULL, 0,
+				     BCH_BUCKET_MARK_INSERT|
+				     BCH_BUCKET_MARK_GC);
+
+	/* Skip past every key that sorts before the insert position: */
+	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
+	       bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
+		bch2_btree_node_iter_advance(node_iter, b);
+
+	/*
+	 * If we're overwriting, look up pending delete and mark so that gc
+	 * marks it on the pending delete list:
+	 */
+	if (k && !bkey_cmp_packed(b, k, &insert->k))
+		bch2_btree_node_free_index(as, b,
+					   bkey_disassemble(b, k, &tmp),
+					   fs_usage);
+
+	bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
+
+	bch2_fs_usage_scratch_put(c, fs_usage);
+	percpu_up_read(&c->mark_lock);
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	bch2_btree_bset_insert_key(iter, b, node_iter, insert);
+	set_btree_node_dirty(b);
+	set_btree_node_need_write(b);
+}
+
+/*
+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher
+ * node)
+ *
+ * A new node n2 is allocated at n1's level with n1's format. The pivot is
+ * chosen by walking n1's first bset until roughly 3/5 of its u64s have been
+ * passed (always leaving at least the last key for n2); everything from the
+ * pivot onwards is copied into n2 and n1 is truncated. Key ranges and key
+ * counts of both nodes are updated to match.
+ *
+ * Returns n2. @iter is not used here.
+ */
+static struct btree *__btree_split_node(struct btree_update *as,
+					struct btree *n1,
+					struct btree_iter *iter)
+{
+	size_t nr_packed = 0, nr_unpacked = 0;
+	struct btree *n2;
+	struct bset *set1, *set2;
+	struct bkey_packed *k, *prev = NULL;
+
+	n2 = bch2_btree_node_alloc(as, n1->level);
+
+	/* n2 takes over the upper end of n1's range: */
+	n2->data->max_key	= n1->data->max_key;
+	n2->data->format	= n1->format;
+	SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
+	n2->key.k.p = n1->key.k.p;
+
+	btree_node_set_format(n2, n2->data->format);
+
+	set1 = btree_bset_first(n1);
+	set2 = btree_bset_first(n2);
+
+	/*
+	 * Has to be a linear search because we don't have an auxiliary
+	 * search tree yet
+	 */
+	k = set1->start;
+	while (1) {
+		/* Never move past the last key - n2 must get at least one: */
+		if (bkey_next(k) == vstruct_last(set1))
+			break;
+		/* Stop once k starts at or beyond 3/5 of the bset's u64s: */
+		if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
+			break;
+
+		/* Count the keys that stay in n1: */
+		if (bkey_packed(k))
+			nr_packed++;
+		else
+			nr_unpacked++;
+
+		prev = k;
+		k = bkey_next(k);
+	}
+
+	/* n1 must keep at least one key: */
+	BUG_ON(!prev);
+
+	/* prev is the last key staying in n1; k is the first key going to n2: */
+	n1->key.k.p = bkey_unpack_pos(n1, prev);
+	n1->data->max_key = n1->key.k.p;
+	n2->data->min_key =
+		btree_type_successor(n1->btree_id, n1->key.k.p);
+
+	set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
+	set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
+
+	set_btree_bset_end(n1, n1->set);
+	set_btree_bset_end(n2, n2->set);
+
+	n2->nr.live_u64s	= le16_to_cpu(set2->u64s);
+	n2->nr.bset_u64s[0]	= le16_to_cpu(set2->u64s);
+	n2->nr.packed_keys	= n1->nr.packed_keys - nr_packed;
+	n2->nr.unpacked_keys	= n1->nr.unpacked_keys - nr_unpacked;
+
+	n1->nr.live_u64s	= le16_to_cpu(set1->u64s);
+	n1->nr.bset_u64s[0]	= le16_to_cpu(set1->u64s);
+	n1->nr.packed_keys	= nr_packed;
+	n1->nr.unpacked_keys	= nr_unpacked;
+
+	BUG_ON(!set1->u64s);
+	BUG_ON(!set2->u64s);
+
+	/*
+	 * set1->u64s has already been shrunk, so vstruct_end(set1) now points
+	 * at the first key that belongs to n2:
+	 */
+	memcpy_u64s(set2->start,
+		    vstruct_end(set1),
+		    le16_to_cpu(set2->u64s));
+
+	btree_node_reset_sib_u64s(n1);
+	btree_node_reset_sib_u64s(n2);
+
+	bch2_verify_btree_nr_keys(n1);
+	bch2_verify_btree_nr_keys(n2);
+
+	if (n1->level) {
+		btree_node_interior_verify(n1);
+		btree_node_interior_verify(n2);
+	}
+
+	return n2;
+}
+
+/*
+ * For updates to interior nodes, we've got to do the insert before we split
+ * because the stuff we're inserting has to be inserted atomically. Post split,
+ * the keys might have to go in different nodes and the split would no longer be
+ * atomic.
+ *
+ * Worse, if the insert is from btree node coalescing, if we do the insert after
+ * we do the split (and pick the pivot) - the pivot we pick might be between
+ * nodes that were coalesced, and thus in the middle of a child node post
+ * coalescing:
+ *
+ * Inserts every key on @keys into interior node @b (emptying @keys), then
+ * strips deleted keys out of @b's first bset.
+ */
+static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
+				    struct btree_iter *iter,
+				    struct keylist *keys)
+{
+	struct btree_node_iter node_iter;
+	struct bkey_i *k = bch2_keylist_front(keys);
+	struct bkey_packed *p;
+	struct bset *i;
+
+	BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
+
+	bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
+
+	while (!bch2_keylist_empty(keys)) {
+		k = bch2_keylist_front(keys);
+
+		/* Everything still on the list must fit, and lie within @b's range: */
+		BUG_ON(bch_keylist_u64s(keys) >
+		       bch_btree_keys_u64s_remaining(as->c, b));
+		BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
+		BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
+
+		bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
+		bch2_keylist_pop_front(keys);
+	}
+
+	/*
+	 * We can't tolerate whiteouts here - with whiteouts there can be
+	 * duplicate keys, and it would be rather bad if we picked a duplicate
+	 * for the pivot:
+	 */
+	i = btree_bset_first(b);
+	p = i->start;
+	while (p != vstruct_last(i))
+		if (bkey_deleted(p)) {
+			/*
+			 * Shrink the bset first, then slide the remaining keys
+			 * down over p; p is not advanced since it now holds
+			 * the following key:
+			 */
+			le16_add_cpu(&i->u64s, -p->u64s);
+			set_btree_bset_end(b, b->set);
+			memmove_u64s_down(p, bkey_next(p),
+					  (u64 *) vstruct_last(i) -
+					  (u64 *) p);
+		} else
+			p = bkey_next(p);
+
+	BUG_ON(b->nsets != 1 ||
+	       b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
+
+	btree_node_interior_verify(b);
+}
+
+/*
+ * Replace node @b with one or two freshly allocated nodes.
+ *
+ * @b is rewritten into a replacement node n1, with @keys (if any) inserted
+ * into it first. If n1 is then larger than BTREE_SPLIT_THRESHOLD it is split
+ * into n1 and n2; if @b was the root, a new root n3 one level up is created to
+ * point at both. Otherwise this is just a compaction of @b into n1.
+ *
+ * The new nodes are made visible either by inserting their keys into @b's
+ * parent or by setting a new btree root, @iter is switched over to the new
+ * nodes, and @b is freed in memory.
+ */
+static void btree_split(struct btree_update *as, struct btree *b,
+			struct btree_iter *iter, struct keylist *keys,
+			unsigned flags)
+{
+	struct bch_fs *c = as->c;
+	struct btree *parent = btree_node_parent(iter, b);
+	struct btree *n1, *n2 = NULL, *n3 = NULL;
+	u64 start_time = local_clock();
+
+	/* A node without a parent must be the root: */
+	BUG_ON(!parent && (b != btree_node_root(c, b)));
+	BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+
+	bch2_btree_interior_update_will_free_node(as, b);
+
+	n1 = bch2_btree_node_alloc_replacement(as, b);
+
+	if (keys)
+		btree_split_insert_keys(as, n1, iter, keys);
+
+	if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
+		trace_btree_split(c, b);
+
+		n2 = __btree_split_node(as, n1, iter);
+
+		bch2_btree_build_aux_trees(n2);
+		bch2_btree_build_aux_trees(n1);
+		six_unlock_write(&n2->lock);
+		six_unlock_write(&n1->lock);
+
+		bch2_btree_node_write(c, n2, SIX_LOCK_intent);
+
+		/*
+		 * Note that on recursive parent_keys == keys, so we
+		 * can't start adding new keys to parent_keys before emptying it
+		 * out (which we did with btree_split_insert_keys() above)
+		 */
+		bch2_keylist_add(&as->parent_keys, &n1->key);
+		bch2_keylist_add(&as->parent_keys, &n2->key);
+
+		if (!parent) {
+			/* Depth increases, make a new root */
+			n3 = __btree_root_alloc(as, b->level + 1);
+
+			n3->sib_u64s[0] = U16_MAX;
+			n3->sib_u64s[1] = U16_MAX;
+
+			btree_split_insert_keys(as, n3, iter, &as->parent_keys);
+
+			bch2_btree_node_write(c, n3, SIX_LOCK_intent);
+		}
+	} else {
+		trace_btree_compact(c, b);
+
+		bch2_btree_build_aux_trees(n1);
+		six_unlock_write(&n1->lock);
+
+		bch2_keylist_add(&as->parent_keys, &n1->key);
+	}
+
+	bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+
+	/* New nodes all written, now make them visible: */
+
+	if (parent) {
+		/* Split a non root node */
+		bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+	} else if (n3) {
+		bch2_btree_set_root(as, n3, iter);
+	} else {
+		/* Root filled up but didn't need to be split */
+		bch2_btree_set_root(as, n1, iter);
+	}
+
+	bch2_open_buckets_put(c, &n1->ob);
+	if (n2)
+		bch2_open_buckets_put(c, &n2->ob);
+	if (n3)
+		bch2_open_buckets_put(c, &n3->ob);
+
+	/* Successful split, update the iterator to point to the new nodes: */
+
+	six_lock_increment(&b->lock, SIX_LOCK_intent);
+	bch2_btree_iter_node_drop(iter, b);
+	if (n3)
+		bch2_btree_iter_node_replace(iter, n3);
+	if (n2)
+		bch2_btree_iter_node_replace(iter, n2);
+	bch2_btree_iter_node_replace(iter, n1);
+
+	/*
+	 * The old node must be freed (in memory) _before_ unlocking the new
+	 * nodes - else another thread could re-acquire a read lock on the old
+	 * node after another thread has locked and updated the new node, thus
+	 * seeing stale data:
+	 */
+	bch2_btree_node_free_inmem(c, b, iter);
+
+	if (n3)
+		six_unlock_intent(&n3->lock);
+	if (n2)
+		six_unlock_intent(&n2->lock);
+	six_unlock_intent(&n1->lock);
+
+	bch2_btree_trans_verify_locks(iter->trans);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
+			       start_time);
+}
+
+/*
+ * Insert every key on @keys into interior node @b (emptying @keys), then
+ * register @as as blocking writes of @b via btree_update_updated_node().
+ *
+ * Works on a copy of @iter's node iterator for @b's level so that @iter's own
+ * position is left alone.
+ */
+static void
+bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
+				struct btree_iter *iter, struct keylist *keys)
+{
+	struct btree_iter *linked;
+	struct btree_node_iter node_iter;
+	struct bkey_i *insert = bch2_keylist_front(keys);
+	struct bkey_packed *k;
+
+	/* Don't screw up @iter's position: */
+	node_iter = iter->l[b->level].iter;
+
+	/*
+	 * btree_split(), btree_gc_coalesce() will insert keys before
+	 * the iterator's current position - they know the keys go in
+	 * the node the iterator points to:
+	 */
+	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+	       (bkey_cmp_packed(b, k, &insert->k) >= 0))
+		;
+
+	while (!bch2_keylist_empty(keys)) {
+		insert = bch2_keylist_front(keys);
+
+		bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
+		bch2_keylist_pop_front(keys);
+	}
+
+	btree_update_updated_node(as, b);
+
+	/* Peek on every iterator in the transaction that has @b at this level: */
+	trans_for_each_iter_with_node(iter->trans, b, linked)
+		bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
+
+	bch2_btree_iter_verify(iter, b);
+}
+
+/**
+ * bch2_btree_insert_node - insert bkeys into a given interior btree node
+ *
+ * @as:			interior update this insert belongs to; must not
+ *			already be attached to a node (as->b must be NULL)
+ * @b:			interior node to insert into (b->level must be nonzero)
+ * @iter:		btree iterator
+ * @keys:		sorted list of keys to insert
+ * @flags:		BTREE_INSERT_* flags, passed on to btree_split() or
+ *			to the foreground merge
+ *
+ * If all of @keys fit in @b they are inserted in place. Otherwise, or if
+ * @as->must_rewrite is set, @b is handed to btree_split() together with @keys,
+ * which does the insert as part of rewriting/splitting the node.
+ *
+ * NOTE(review): @as is dereferenced (as->c) before the BUG_ON(!as) check below,
+ * so that check cannot catch a NULL @as.
+ */
+void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
+			    struct btree_iter *iter, struct keylist *keys,
+			    unsigned flags)
+{
+	struct bch_fs *c = as->c;
+	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+	int old_live_u64s = b->nr.live_u64s;
+	int live_u64s_added, u64s_added;
+
+	BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+	BUG_ON(!b->level);
+	BUG_ON(!as || as->b);
+	bch2_verify_keylist_sorted(keys);
+
+	if (as->must_rewrite)
+		goto split;
+
+	bch2_btree_node_lock_for_insert(c, b, iter);
+
+	if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
+		bch2_btree_node_unlock_write(b, iter);
+		goto split;
+	}
+
+	bch2_btree_insert_keys_interior(as, b, iter, keys);
+
+	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+	/* If the node shrank, shrink the sibling-merge size estimates too: */
+	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+	/* The bset grew by more than the live keys did: try compacting whiteouts */
+	if (u64s_added > live_u64s_added &&
+	    bch2_maybe_compact_whiteouts(c, b))
+		bch2_btree_iter_reinit_node(iter, b);
+
+	bch2_btree_node_unlock_write(b, iter);
+
+	btree_node_interior_verify(b);
+
+	/*
+	 * when called from the btree_split path the new nodes aren't added to
+	 * the btree iterator yet, so the merge path's unlock/wait/relock dance
+	 * won't work:
+	 */
+	bch2_foreground_maybe_merge(c, iter, b->level,
+				    flags|BTREE_INSERT_NOUNLOCK);
+	return;
+split:
+	btree_split(as, b, iter, keys, flags);
+}
+
+/*
+ * Split (or compact) the leaf node that @iter currently points to.
+ *
+ * @flags: BTREE_INSERT_* flags. BTREE_INSERT_GC_LOCK_HELD means the caller
+ *	   already holds c->gc_lock; BTREE_INSERT_NOUNLOCK means the
+ *	   transaction's locks must not be dropped here.
+ *
+ * Returns 0 on success, -EINTR when locks could not be taken or kept, or the
+ * error from bch2_btree_update_start() (with -EAGAIN converted to -EINTR
+ * after unlocking the transaction).
+ */
+int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
+			  unsigned flags)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree *b = iter->l[0].b;
+	struct btree_update *as;
+	struct closure cl;
+	int ret = 0;
+	struct btree_iter *linked;
+
+	/*
+	 * We already have a disk reservation and open buckets pinned; this
+	 * allocation must not block:
+	 */
+	trans_for_each_iter(trans, linked)
+		if (linked->btree_id == BTREE_ID_EXTENTS)
+			flags |= BTREE_INSERT_USE_RESERVE;
+
+	closure_init_stack(&cl);
+
+	/* Hack, because gc and splitting nodes doesn't mix yet: */
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+	    !down_read_trylock(&c->gc_lock)) {
+		if (flags & BTREE_INSERT_NOUNLOCK)
+			return -EINTR;
+
+		/* Drop the transaction's locks before blocking on gc_lock: */
+		bch2_trans_unlock(trans);
+		down_read(&c->gc_lock);
+
+		/*
+		 * NOTE(review): on relock failure ret is set to -EINTR but
+		 * execution still falls through to the upgrade/split below;
+		 * if those succeed the split happens and -EINTR is returned.
+		 */
+		if (!bch2_trans_relock(trans))
+			ret = -EINTR;
+	}
+
+	/*
+	 * XXX: figure out how far we might need to split,
+	 * instead of locking/reserving all the way to the root:
+	 */
+	if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
+		trace_trans_restart_iter_upgrade(trans->ip);
+		ret = -EINTR;
+		goto out;
+	}
+
+	/* Only hand over a closure to wait on when unlocking is allowed: */
+	as = bch2_btree_update_start(c, iter->btree_id,
+		btree_update_reserve_required(c, b), flags,
+		!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
+	if (IS_ERR(as)) {
+		ret = PTR_ERR(as);
+		if (ret == -EAGAIN) {
+			BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
+			bch2_trans_unlock(trans);
+			ret = -EINTR;
+		}
+		goto out;
+	}
+
+	btree_split(as, b, iter, NULL, flags);
+	bch2_btree_update_done(as);
+
+	/*
+	 * We haven't successfully inserted yet, so don't downgrade all the way
+	 * back to read locks;
+	 */
+	__bch2_btree_iter_downgrade(iter, 1);
+out:
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+		up_read(&c->gc_lock);
+	closure_sync(&cl);
+	return ret;
+}
+
+/*
+ * Try to merge the node at @level of @iter with one of its siblings.
+ *
+ * @sib selects the previous or the next sibling. Nothing is done if the node
+ * has no parent, or if the cached size estimate b->sib_u64s[sib] is above
+ * BTREE_FOREGROUND_MERGE_THRESHOLD(). Otherwise the sibling is looked up, the
+ * estimate is recomputed using a key format calculated over both nodes, and if
+ * it is still at or below the threshold both nodes are sorted into a newly
+ * allocated node which replaces them in the parent.
+ *
+ * No error is returned to the caller: on -EINTR/-EAGAIN the transaction is
+ * unlocked, @iter is re-traversed and the merge is retried, unless
+ * BTREE_INSERT_NOUNLOCK is set, in which case we simply give up.
+ */
+void __bch2_foreground_maybe_merge(struct bch_fs *c,
+				   struct btree_iter *iter,
+				   unsigned level,
+				   unsigned flags,
+				   enum btree_node_sibling sib)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree_update *as;
+	struct bkey_format_state new_s;
+	struct bkey_format new_f;
+	struct bkey_i delete;
+	struct btree *b, *m, *n, *prev, *next, *parent;
+	struct closure cl;
+	size_t sib_u64s;
+	int ret = 0;
+
+	closure_init_stack(&cl);
+retry:
+	BUG_ON(!btree_node_locked(iter, level));
+
+	b = iter->l[level].b;
+
+	parent = btree_node_parent(iter, b);
+	if (!parent)
+		goto out;
+
+	if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+		goto out;
+
+	/* XXX: can't be holding read locks */
+	m = bch2_btree_node_get_sibling(c, iter, b, sib);
+	if (IS_ERR(m)) {
+		ret = PTR_ERR(m);
+		goto err;
+	}
+
+	/* NULL means no sibling: */
+	if (!m) {
+		/* U16_MAX is above any threshold, so we won't look again: */
+		b->sib_u64s[sib] = U16_MAX;
+		goto out;
+	}
+
+	/* Order the two nodes by position, whichever sibling was asked for: */
+	if (sib == btree_prev_sib) {
+		prev = m;
+		next = b;
+	} else {
+		prev = b;
+		next = m;
+	}
+
+	/* Key format covering the keys of both nodes: */
+	bch2_bkey_format_init(&new_s);
+	__bch2_btree_calc_format(&new_s, b);
+	__bch2_btree_calc_format(&new_s, m);
+	new_f = bch2_bkey_format_done(&new_s);
+
+	sib_u64s = btree_node_u64s_with_format(b, &new_f) +
+		btree_node_u64s_with_format(m, &new_f);
+
+	/* Only half of whatever exceeds the hysteresis value is counted: */
+	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
+		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+		sib_u64s /= 2;
+		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+	}
+
+	/* Cache the new estimate for the check at the top of this function: */
+	sib_u64s = min(sib_u64s, btree_max_u64s(c));
+	b->sib_u64s[sib] = sib_u64s;
+
+	if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
+		six_unlock_intent(&m->lock);
+		goto out;
+	}
+
+	/* We're changing btree topology, doesn't mix with gc: */
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+	    !down_read_trylock(&c->gc_lock))
+		goto err_cycle_gc_lock;
+
+	if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
+		ret = -EINTR;
+		goto err_unlock;
+	}
+
+	/* Reserve for updating the parent, plus one for the merged node: */
+	as = bch2_btree_update_start(c, iter->btree_id,
+			 btree_update_reserve_required(c, parent) + 1,
+			 BTREE_INSERT_NOFAIL|
+			 BTREE_INSERT_USE_RESERVE,
+			 !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
+	if (IS_ERR(as)) {
+		ret = PTR_ERR(as);
+		goto err_unlock;
+	}
+
+	trace_btree_merge(c, b);
+
+	bch2_btree_interior_update_will_free_node(as, b);
+	bch2_btree_interior_update_will_free_node(as, m);
+
+	n = bch2_btree_node_alloc(as, b->level);
+
+	/* The new node spans the key range of both old nodes: */
+	n->data->min_key	= prev->data->min_key;
+	n->data->max_key	= next->data->max_key;
+	n->data->format		= new_f;
+	n->key.k.p		= next->key.k.p;
+
+	btree_node_set_format(n, new_f);
+
+	bch2_btree_sort_into(c, n, prev);
+	bch2_btree_sort_into(c, n, next);
+
+	bch2_btree_build_aux_trees(n);
+	six_unlock_write(&n->lock);
+
+	/*
+	 * Parent update: a freshly initialized key at @prev's position (a
+	 * deletion, going by the variable name) followed by the key for @n:
+	 */
+	bkey_init(&delete.k);
+	delete.k.p = prev->key.k.p;
+	bch2_keylist_add(&as->parent_keys, &delete);
+	bch2_keylist_add(&as->parent_keys, &n->key);
+
+	bch2_btree_node_write(c, n, SIX_LOCK_intent);
+
+	bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+
+	bch2_open_buckets_put(c, &n->ob);
+
+	/* Swap the old nodes for @n in the iterator, then free them: */
+	six_lock_increment(&b->lock, SIX_LOCK_intent);
+	bch2_btree_iter_node_drop(iter, b);
+	bch2_btree_iter_node_drop(iter, m);
+
+	bch2_btree_iter_node_replace(iter, n);
+
+	bch2_btree_iter_verify(iter, n);
+
+	bch2_btree_node_free_inmem(c, b, iter);
+	bch2_btree_node_free_inmem(c, m, iter);
+
+	six_unlock_intent(&n->lock);
+
+	bch2_btree_update_done(as);
+
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+		up_read(&c->gc_lock);
+out:
+	bch2_btree_trans_verify_locks(trans);
+
+	/*
+	 * Don't downgrade locks here: we're called after successful insert,
+	 * and the caller will downgrade locks after a successful insert
+	 * anyways (in case e.g. a split was required first)
+	 *
+	 * And we're also called when inserting into interior nodes in the
+	 * split path, and downgrading to read locks in there is potentially
+	 * confusing:
+	 */
+	closure_sync(&cl);
+	return;
+
+err_cycle_gc_lock:
+	six_unlock_intent(&m->lock);
+
+	if (flags & BTREE_INSERT_NOUNLOCK)
+		goto out;
+
+	bch2_trans_unlock(trans);
+
+	/* Block until gc_lock can be taken, then release it and retry: */
+	down_read(&c->gc_lock);
+	up_read(&c->gc_lock);
+	ret = -EINTR;
+	goto err;
+
+err_unlock:
+	six_unlock_intent(&m->lock);
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+		up_read(&c->gc_lock);
+err:
+	BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
+
+	/* Restartable error and we may unlock: wait, re-traverse, retry */
+	if ((ret == -EAGAIN || ret == -EINTR) &&
+	    !(flags & BTREE_INSERT_NOUNLOCK)) {
+		bch2_trans_unlock(trans);
+		closure_sync(&cl);
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			goto out;
+
+		goto retry;
+	}
+
+	goto out;
+}
+
+/*
+ * Replace @b with a newly allocated copy (bch2_btree_node_alloc_replacement())
+ * and make the copy reachable: through an update to @b's parent if it has
+ * one, otherwise by setting it as the btree root.
+ *
+ * Returns 0 on success, or the error from bch2_btree_update_start().
+ */
+static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+				struct btree *b, unsigned flags,
+				struct closure *cl)
+{
+	struct btree *n, *parent = btree_node_parent(iter, b);
+	struct btree_update *as;
+
+	/* Reserve for updating the parent (if any), plus one for the copy: */
+	as = bch2_btree_update_start(c, iter->btree_id,
+		(parent
+		 ? btree_update_reserve_required(c, parent)
+		 : 0) + 1,
+		flags, cl);
+	if (IS_ERR(as)) {
+		trace_btree_gc_rewrite_node_fail(c, b);
+		return PTR_ERR(as);
+	}
+
+	bch2_btree_interior_update_will_free_node(as, b);
+
+	n = bch2_btree_node_alloc_replacement(as, b);
+
+	bch2_btree_build_aux_trees(n);
+	six_unlock_write(&n->lock);
+
+	trace_btree_gc_rewrite_node(c, b);
+
+	bch2_btree_node_write(c, n, SIX_LOCK_intent);
+
+	if (parent) {
+		bch2_keylist_add(&as->parent_keys, &n->key);
+		bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+	} else {
+		bch2_btree_set_root(as, n, iter);
+	}
+
+	bch2_open_buckets_put(c, &n->ob);
+
+	/* Swap @b for @n in the iterator, then free @b in memory: */
+	six_lock_increment(&b->lock, SIX_LOCK_intent);
+	bch2_btree_iter_node_drop(iter, b);
+	bch2_btree_iter_node_replace(iter, n);
+	bch2_btree_node_free_inmem(c, b, iter);
+	six_unlock_intent(&n->lock);
+
+	bch2_btree_update_done(as);
+	return 0;
+}
+
+/**
+ * bch2_btree_node_rewrite - Rewrite/move a btree node
+ * @c:		filesystem
+ * @iter:	iterator pointing at the node to rewrite
+ * @seq:	expected sequence number (b->data->keys.seq) of that node
+ * @flags:	BTREE_INSERT_* flags; BTREE_INSERT_NOFAIL is always added
+ *
+ * The node is looked up again with bch2_btree_iter_peek_node() on every
+ * attempt; if no node is found, or its sequence number is not @seq, nothing
+ * is rewritten. -EAGAIN and -EINTR from __btree_node_rewrite() are retried
+ * here after unlocking the transaction and waiting on the closure.
+ *
+ * Returns 0 on success or when there was nothing to rewrite, otherwise the
+ * error from bch2_btree_iter_traverse() or __btree_node_rewrite().
+ */
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+			    __le64 seq, unsigned flags)
+{
+	struct btree_trans *trans = iter->trans;
+	struct closure cl;
+	struct btree *b;
+	int ret;
+
+	flags |= BTREE_INSERT_NOFAIL;
+
+	closure_init_stack(&cl);
+
+	/* NOTE(review): the result of the upgrade is not checked here */
+	bch2_btree_iter_upgrade(iter, U8_MAX);
+
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
+		if (!down_read_trylock(&c->gc_lock)) {
+			/* Drop btree locks before blocking on gc_lock: */
+			bch2_trans_unlock(trans);
+			down_read(&c->gc_lock);
+		}
+	}
+
+	while (1) {
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			break;
+
+		/* Stop (with ret == 0) if the node is gone or was replaced: */
+		b = bch2_btree_iter_peek_node(iter);
+		if (!b || b->data->keys.seq != seq)
+			break;
+
+		ret = __btree_node_rewrite(c, iter, b, flags, &cl);
+		if (ret != -EAGAIN &&
+		    ret != -EINTR)
+			break;
+
+		bch2_trans_unlock(trans);
+		closure_sync(&cl);
+	}
+
+	bch2_btree_iter_downgrade(iter);
+
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+		up_read(&c->gc_lock);
+
+	closure_sync(&cl);
+	return ret;
+}
+
+/*
+ * Do the in-memory work of replacing @b's key with @new_key, as part of the
+ * btree_update @as.
+ *
+ * If @b has a parent, @new_key is inserted into the parent through
+ * bch2_btree_insert_node(); otherwise @b must be the btree root and the root
+ * key is updated (and marked) here. In both cases @b->key ends up as a copy
+ * of @new_key, and @b is re-inserted into the btree node hash table when its
+ * hash changed.
+ *
+ * @new_hash is a spare node supplied by the caller when PTR_HASH() of the key
+ * changes (NULL otherwise); it is put in the hash table under the new key
+ * while the parent update runs and taken out again afterwards.
+ *
+ * Finishes by calling bch2_btree_update_done() on @as.
+ */
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+					 struct btree_update *as,
+					 struct btree_iter *iter,
+					 struct btree *b, struct btree *new_hash,
+					 struct bkey_i_btree_ptr *new_key)
+{
+	struct btree *parent;
+	int ret;
+
+	/*
+	 * Two corner cases that need to be thought about here:
+	 *
+	 * @b may not be reachable yet - there might be another interior update
+	 * operation waiting on @b to be written, and we're gonna deliver the
+	 * write completion to that interior update operation _before_
+	 * persisting the new_key update
+	 *
+	 * That ends up working without us having to do anything special here:
+	 * the reason is, we do kick off (and do the in memory updates) for the
+	 * update for @new_key before we return, creating a new interior_update
+	 * operation here.
+	 *
+	 * The new interior update operation here will in effect override the
+	 * previous one. The previous one was going to terminate - make @b
+	 * reachable - in one of two ways:
+	 * - updating the btree root pointer
+	 *   In that case,
+	 *   no, this doesn't work. argh.
+	 */
+
+	if (b->will_make_reachable)
+		as->must_rewrite = true;
+
+	btree_interior_update_add_node_reference(as, b);
+
+	/*
+	 * XXX: the rest of the update path treats this like we're actually
+	 * inserting a new node and deleting the existing node, so the
+	 * reservation needs to include enough space for @b
+	 *
+	 * that is quite sketchy though, and it is surprising the code seems
+	 * to work like that; definitely need to go back and rework it into
+	 * something saner.
+	 *
+	 * (I think @b is just getting double counted until the btree update
+	 * finishes and "deletes" @b on disk)
+	 */
+	ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
+			c->opts.btree_node_size *
+			bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
+			BCH_DISK_RESERVATION_NOFAIL);
+	BUG_ON(ret);
+
+	parent = btree_node_parent(iter, b);
+	if (parent) {
+		if (new_hash) {
+			/* Make the new key findable while the update runs: */
+			bkey_copy(&new_hash->key, &new_key->k_i);
+			ret = bch2_btree_node_hash_insert(&c->btree_cache,
+					new_hash, b->level, b->btree_id);
+			BUG_ON(ret);
+		}
+
+		bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+		bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
+
+		if (new_hash) {
+			/* Swap the placeholder for @b under its new key: */
+			mutex_lock(&c->btree_cache.lock);
+			bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+
+			bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+			bkey_copy(&b->key, &new_key->k_i);
+			ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+			BUG_ON(ret);
+			mutex_unlock(&c->btree_cache.lock);
+		} else {
+			bkey_copy(&b->key, &new_key->k_i);
+		}
+	} else {
+		struct bch_fs_usage *fs_usage;
+
+		BUG_ON(btree_node_root(c, b) != b);
+
+		bch2_btree_node_lock_write(b, iter);
+
+		mutex_lock(&c->btree_interior_update_lock);
+		percpu_down_read(&c->mark_lock);
+		fs_usage = bch2_fs_usage_scratch_get(c);
+
+		bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
+			      0, 0, fs_usage, 0,
+			      BCH_BUCKET_MARK_INSERT);
+		/*
+		 * The mark flags are combined with bitwise OR: a logical OR
+		 * would collapse them to 0/1 and lose BCH_BUCKET_MARK_GC.
+		 */
+		if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
+			bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
+					     0, 0, NULL, 0,
+					     BCH_BUCKET_MARK_INSERT|
+					     BCH_BUCKET_MARK_GC);
+
+		bch2_btree_node_free_index(as, NULL,
+					   bkey_i_to_s_c(&b->key),
+					   fs_usage);
+		bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
+
+		bch2_fs_usage_scratch_put(c, fs_usage);
+		percpu_up_read(&c->mark_lock);
+		mutex_unlock(&c->btree_interior_update_lock);
+
+		/* Only rehash @b when the hash of its key actually changes: */
+		if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+			mutex_lock(&c->btree_cache.lock);
+			bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+			bkey_copy(&b->key, &new_key->k_i);
+			ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+			BUG_ON(ret);
+			mutex_unlock(&c->btree_cache.lock);
+		} else {
+			bkey_copy(&b->key, &new_key->k_i);
+		}
+
+		btree_update_updated_root(as);
+		bch2_btree_node_unlock_write(b, iter);
+	}
+
+	bch2_btree_update_done(as);
+}
+
+/*
+ * Replace the key of btree node @b with @new_key.
+ *
+ * Upgrades @iter's locks, takes c->gc_lock for read, allocates a spare node
+ * for the hash table when PTR_HASH() of the key changes, starts a
+ * btree_update and hands off to __bch2_btree_node_update_key().
+ *
+ * Returns 0 on success. Returns -EINTR when locks could not be upgraded or
+ * retaken, or when the btree_update could not be started without waiting; in
+ * the latter case we wait here first, and the caller is expected to retry.
+ * Other errors from bch2_btree_update_start() and bch2_mark_bkey_replicas()
+ * are passed through.
+ */
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+			       struct btree *b,
+			       struct bkey_i_btree_ptr *new_key)
+{
+	struct btree *parent = btree_node_parent(iter, b);
+	struct btree_update *as = NULL;
+	struct btree *new_hash = NULL;
+	struct closure cl;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	if (!bch2_btree_iter_upgrade(iter, U8_MAX))
+		return -EINTR;
+
+	if (!down_read_trylock(&c->gc_lock)) {
+		/* Drop btree locks before blocking on gc_lock: */
+		bch2_trans_unlock(iter->trans);
+		down_read(&c->gc_lock);
+
+		if (!bch2_trans_relock(iter->trans)) {
+			ret = -EINTR;
+			goto err;
+		}
+	}
+
+	/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+	if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+		/* bch2_btree_reserve_get will unlock */
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		if (ret) {
+			/*
+			 * NOTE(review): after waiting, the cannibalize lock is
+			 * not requested again before the allocation below -
+			 * confirm that this is intended.
+			 */
+			bch2_trans_unlock(iter->trans);
+			up_read(&c->gc_lock);
+			closure_sync(&cl);
+			down_read(&c->gc_lock);
+
+			if (!bch2_trans_relock(iter->trans)) {
+				ret = -EINTR;
+				goto err;
+			}
+		}
+
+		new_hash = bch2_btree_node_mem_alloc(c);
+	}
+
+	as = bch2_btree_update_start(c, iter->btree_id,
+		parent ? btree_update_reserve_required(c, parent) : 0,
+		BTREE_INSERT_NOFAIL|
+		BTREE_INSERT_USE_RESERVE|
+		BTREE_INSERT_USE_ALLOC_RESERVE,
+		&cl);
+
+	if (IS_ERR(as)) {
+		ret = PTR_ERR(as);
+		if (ret == -EAGAIN)
+			ret = -EINTR;
+
+		if (ret == -EINTR) {
+			/*
+			 * Wait for the closure without holding btree locks or
+			 * gc_lock; gc_lock is retaken because the exit path
+			 * below releases it.
+			 */
+			bch2_trans_unlock(iter->trans);
+			up_read(&c->gc_lock);
+			closure_sync(&cl);
+			down_read(&c->gc_lock);
+		}
+
+		/*
+		 * @as is an ERR_PTR here, so we must never carry on with the
+		 * update: always bail out and let the caller retry.
+		 */
+		goto err;
+	}
+
+	ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i));
+	if (ret)
+		goto err_free_update;
+
+	__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+
+	bch2_btree_iter_downgrade(iter);
+err:
+	if (new_hash) {
+		/* Hand the spare node back to the freeable list: */
+		mutex_lock(&c->btree_cache.lock);
+		list_move(&new_hash->list, &c->btree_cache.freeable);
+		mutex_unlock(&c->btree_cache.lock);
+
+		six_unlock_write(&new_hash->lock);
+		six_unlock_intent(&new_hash->lock);
+	}
+	up_read(&c->gc_lock);
+	closure_sync(&cl);
+	return ret;
+err_free_update:
+	bch2_btree_update_free(as);
+	goto err;
+}
+
+/* Init code: */
+
+/*
+ * Set @b as the root of its btree via __bch2_btree_set_root_inmem().
+ *
+ * Meant only for filesystem bringup: when the btree roots are first read in,
+ * or when roots are allocated for a newly initialized filesystem. It is a bug
+ * to call this when btree_node_root() already returns a root for @b's btree.
+ */
+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
+{
+	struct btree *existing_root = btree_node_root(c, b);
+
+	BUG_ON(existing_root);
+	__bch2_btree_set_root_inmem(c, b);
+}
+
+/*
+ * Allocate an empty in-memory root node for btree @id and set it as that
+ * btree's root.
+ *
+ * The node is flagged as fake (set_btree_node_fake()), is at level 0 and
+ * covers POS_MIN..POS_MAX. The write and intent locks on the node are
+ * released before returning.
+ */
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+{
+	struct closure cl;
+	struct btree *b;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	/* Keep trying for the cannibalize lock, waiting between attempts: */
+	do {
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		closure_sync(&cl);
+	} while (ret);
+
+	b = bch2_btree_node_mem_alloc(c);
+	bch2_btree_cache_cannibalize_unlock(c);
+
+	set_btree_node_fake(b);
+	b->level	= 0;
+	b->btree_id	= id;
+
+	/* U64_MAX - id gives each btree id a different PTR_HASH() value: */
+	bkey_btree_ptr_init(&b->key);
+	b->key.k.p = POS_MAX;
+	PTR_HASH(&b->key) = U64_MAX - id;
+
+	bch2_bset_init_first(b, &b->data->keys);
+	bch2_btree_build_aux_trees(b);
+
+	b->data->flags = 0;
+	b->data->min_key = POS_MIN;
+	b->data->max_key = POS_MAX;
+	b->data->format = bch2_btree_calc_format(b);
+	btree_node_set_format(b, b->data->format);
+
+	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
+	BUG_ON(ret);
+
+	__bch2_btree_set_root_inmem(c, b);
+
+	six_unlock_write(&b->lock);
+	six_unlock_intent(&b->lock);
+}
+
+/*
+ * Print one line per entry on c->btree_interior_update_list into @buf (at
+ * most PAGE_SIZE bytes): the update's address, mode, nodes_written flag,
+ * remaining closure count and journal pin sequence number.
+ *
+ * Returns the number of bytes written to @buf.
+ */
+ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
+{
+	struct btree_update *update;
+	struct printbuf pbuf = _PBUF(buf, PAGE_SIZE);
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	list_for_each_entry(update, &c->btree_interior_update_list, list) {
+		unsigned remaining = atomic_read(&update->cl.remaining) &
+			CLOSURE_REMAINING_MASK;
+
+		pr_buf(&pbuf, "%p m %u w %u r %u j %llu\n",
+		       update,
+		       update->mode,
+		       update->nodes_written,
+		       remaining,
+		       update->journal.seq);
+	}
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	return pbuf.pos - buf;
+}
+
+/*
+ * Count the entries on c->btree_interior_update_list, holding
+ * c->btree_interior_update_lock while walking the list.
+ */
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+{
+	struct list_head *head = &c->btree_interior_update_list;
+	struct list_head *pos;
+	size_t nr = 0;
+
+	mutex_lock(&c->btree_interior_update_lock);
+	for (pos = head->next; pos != head; pos = pos->next)
+		nr++;
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	return nr;
+}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
new file mode 100644
index 000000000000..c5a0ab5d7bb8
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.h
@@ -0,0 +1,341 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+
+#include "btree_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+
+/*
+ * Resources reserved for a btree_update (see btree_update.reserve).
+ */
+struct btree_reserve {
+	/* disk reservation; btree_update_interior.c adds to it for key updates */
+	struct disk_reservation	disk_res;
+	/* presumably the number of valid entries in @b - TODO confirm */
+	unsigned		nr;
+	/* btree nodes held by the reserve, at most BTREE_RESERVE_MAX */
+	struct btree		*b[BTREE_RESERVE_MAX];
+};
+
+/*
+ * Key format calculation. The merge path in btree_update_interior.c calls
+ * __bch2_btree_calc_format() once per node on a bkey_format_state and then
+ * finishes the state with bch2_bkey_format_done().
+ */
+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
+				struct bkey_format *);
+
+/* Btree node freeing/allocation: */
+
+/*
+ * Tracks a btree node that has been (or is about to be) freed in memory, but
+ * has _not_ yet been freed on disk (because the write that makes the new
+ * node(s) visible and frees the old hasn't completed yet)
+ */
+struct pending_btree_node_free {
+	/* presumably set once the index update for this node is done - confirm */
+	bool			index_update_done;
+
+	/* identity of the freed node: sequence number, btree and level */
+	__le64			seq;
+	enum btree_id		btree_id;
+	unsigned		level;
+	/* padded key, with room for BKEY_BTREE_PTR_VAL_U64s_MAX u64s of value */
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ *
+ */
+struct btree_update {
+	struct closure			cl;
+	struct bch_fs			*c;
+
+	/* entry on c->btree_interior_update_list */
+	struct list_head		list;
+
+	/* What kind of update are we doing? */
+	enum {
+		BTREE_INTERIOR_NO_UPDATE,
+		BTREE_INTERIOR_UPDATING_NODE,
+		BTREE_INTERIOR_UPDATING_ROOT,
+		BTREE_INTERIOR_UPDATING_AS,
+	} mode;
+
+	/*
+	 * @must_rewrite is set by __bch2_btree_node_update_key() when the node
+	 * being updated has will_make_reachable set.
+	 */
+	unsigned			must_rewrite:1;
+	unsigned			nodes_written:1;
+
+	enum btree_id			btree_id;
+
+	struct btree_reserve		*reserve;
+
+	/*
+	 * BTREE_INTERIOR_UPDATING_NODE:
+	 * The update that made the new nodes visible was a regular update to an
+	 * existing interior node - @b. We can't write out the update to @b
+	 * until the new nodes we created are finished writing, so we block @b
+	 * from writing by putting this btree_interior update on the
+	 * @b->write_blocked list with @write_blocked_list:
+	 */
+	struct btree			*b;
+	struct list_head		write_blocked_list;
+
+	/*
+	 * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so
+	 * we're now blocking another btree_update
+	 * @parent_as - btree_update that's waiting on our nodes to finish
+	 * writing, before it can make new nodes visible on disk
+	 * @wait - list of child btree_updates that are waiting on this
+	 * btree_update to make all the new nodes visible before they can free
+	 * their old btree nodes
+	 */
+	struct btree_update		*parent_as;
+	struct closure_waitlist		wait;
+
+	/*
+	 * We may be freeing nodes that were dirty, and thus had journal entries
+	 * pinned: we need to transfer the oldest of those pins to the
+	 * btree_update operation, and release it when the new node(s)
+	 * are all persistent and reachable:
+	 */
+	struct journal_entry_pin	journal;
+
+	u64				journal_seq;
+
+	/*
+	 * Nodes being freed:
+	 * Protected by c->btree_node_pending_free_lock
+	 */
+	struct pending_btree_node_free	pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
+	unsigned			nr_pending;
+
+	/* New nodes, that will be made reachable by this update: */
+	struct btree			*new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
+	unsigned			nr_new_nodes;
+
+	/* Only here to reduce stack usage on recursive splits: */
+	struct keylist			parent_keys;
+	/*
+	 * Enough room for btree_split's keys without realloc - btree node
+	 * pointers never have crc/compression info, so we only need to account
+	 * for the pointers for three keys
+	 */
+	u64				inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+/*
+ * Iterate over every pending node free of every btree_update on
+ * (c)->btree_interior_update_list: @as is the btree_update cursor, @p the
+ * cursor into its pending[] array (the first nr_pending entries).
+ *
+ * Macro parameters are parenthesized wherever they are used in an expression,
+ * so that arguments such as &fs or ptr + 1 expand correctly.
+ */
+#define for_each_pending_btree_node_free(c, as, p)			\
+	list_for_each_entry(as, &(c)->btree_interior_update_list, list)	\
+		for ((p) = (as)->pending;				\
+		     (p) < (as)->pending + (as)->nr_pending;		\
+		     (p)++)
+
+/* Freeing btree nodes in memory: */
+void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
+				struct btree_iter *);
+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
+
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+						  struct btree *,
+						  struct bkey_format);
+
+/*
+ * Starting and finishing a btree_update. bch2_btree_update_start() returns
+ * an ERR_PTR() on failure (callers in btree_update_interior.c check IS_ERR()).
+ */
+void bch2_btree_update_done(struct btree_update *);
+struct btree_update *
+bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
+			unsigned, struct closure *);
+
+void bch2_btree_interior_update_will_free_node(struct btree_update *,
+					       struct btree *);
+
+void bch2_btree_insert_node(struct btree_update *, struct btree *,
+			    struct btree_iter *, struct keylist *,
+			    unsigned);
+/* Returns 0 or a negative error code; -EINTR means locks were lost: */
+int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
+
+/* Slow path of bch2_foreground_maybe_merge_sibling() below: */
+void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+				   unsigned, unsigned, enum btree_node_sibling);
+
+/*
+ * Cheap checks before attempting a merge of the node at @level with sibling
+ * @sib: give up if @iter needs to be traversed again or the node cannot be
+ * relocked, and only call __bch2_foreground_maybe_merge() when the node's
+ * cached sib_u64s[@sib] is at or below c->btree_foreground_merge_threshold.
+ */
+static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+					struct btree_iter *iter,
+					unsigned level, unsigned flags,
+					enum btree_node_sibling sib)
+{
+	if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE ||
+	    !bch2_btree_node_relock(iter, level))
+		return;
+
+	if (iter->l[level].b->sib_u64s[sib] <=
+	    c->btree_foreground_merge_threshold)
+		__bch2_foreground_maybe_merge(c, iter, level, flags, sib);
+}
+
+/*
+ * Attempt a foreground merge of the node at @level of @iter, first with its
+ * previous sibling and then with its next sibling.
+ */
+static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
+					       struct btree_iter *iter,
+					       unsigned level,
+					       unsigned flags)
+{
+	const enum btree_node_sibling order[] = {
+		btree_prev_sib,
+		btree_next_sib,
+	};
+	unsigned i;
+
+	for (i = 0; i < sizeof(order) / sizeof(order[0]); i++)
+		bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+						    order[i]);
+}
+
+/* Btree roots at filesystem bringup (see btree_update_interior.c): */
+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+/*
+ * Worst case number of nodes a split starting at @b may have to allocate:
+ * two per level from @b up to and including the current root, plus one for a
+ * new root - or, when the tree is already BTREE_MAX_DEPTH deep, minus one.
+ */
+static inline unsigned btree_update_reserve_required(struct bch_fs *c,
+						     struct btree *b)
+{
+	unsigned depth = btree_node_root(c, b)->level + 1;
+	unsigned nodes = (depth - b->level) * 2;
+
+	return depth < BTREE_MAX_DEPTH
+		? nodes + 1
+		: nodes - 1;
+}
+
+/* Reset both cached sibling size estimates to the node's own live u64s. */
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+	unsigned sib;
+
+	for (sib = 0; sib < 2; sib++)
+		b->sib_u64s[sib] = b->nr.live_u64s;
+}
+
+/* Address btree_bytes(c) bytes past the start of @b's data buffer. */
+static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+{
+	void *start = b->data;
+
+	return start + btree_bytes(c);
+}
+
+/* Address b->whiteout_u64s u64s before the end of @b's data buffer. */
+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
+							    struct btree *b)
+{
+	u64 *end = btree_data_end(c, b);
+
+	return (struct bkey_packed *) (end - b->whiteout_u64s);
+}
+
+/* End of the unwritten whiteouts area: the end of @b's data buffer. */
+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
+							  struct btree *b)
+{
+	struct bkey_packed *end = btree_data_end(c, b);
+
+	return end;
+}
+
+/* Address b->written << 9 bytes into @b's data buffer. */
+static inline void *write_block(struct btree *b)
+{
+	void *base = b->data;
+
+	return base + (b->written << 9);
+}
+
+/* True if @p lies below write_block(@b). */
+static inline bool __btree_addr_written(struct btree *b, void *p)
+{
+	void *first_unwritten = write_block(b);
+
+	return p < first_unwritten;
+}
+
+/* True if bset @i starts below write_block(@b). */
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+	return (void *) i < write_block(b);
+}
+
+/* True if key @k starts below write_block(@b). */
+static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
+{
+	return (void *) k < write_block(b);
+}
+
+/*
+ * Free space in @b, in u64s, if the node's keys ended at @end: the node size
+ * (c->opts.btree_node_size << 6) minus the offset of @end and minus both
+ * whiteout counters. The result may be negative.
+ */
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
+						 struct btree *b,
+						 void *end)
+{
+	ssize_t total = c->opts.btree_node_size << 6;
+	ssize_t used = bset_byte_offset(b, end) / sizeof(u64);
+
+	used += b->whiteout_u64s;
+	used += b->uncompacted_whiteout_u64s;
+
+	return total - used;
+}
+
+/*
+ * Number of u64s that can still be inserted into @b's last bset: zero if that
+ * bset has already been written, otherwise the space left after its last key.
+ * It is a bug for the space left to be negative.
+ */
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+						   struct btree *b)
+{
+	void *keys_end = btree_bkey_last(b, bset_tree_last(b));
+	ssize_t space = __bch_btree_u64s_remaining(c, b, keys_end);
+
+	BUG_ON(space < 0);
+
+	return bset_written(b, btree_bset_last(b)) ? 0 : space;
+}
+
+/*
+ * Size in bytes (4 << 10) used by want_new_bset() to decide when the last
+ * bset has grown big enough to start a new one.
+ */
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+	/*
+	 * Larger amounts of keys could be buffered for btrees with larger
+	 * keys; that is pending benchmarking:
+	 */
+	const unsigned buffer_bytes = 4 << 10;
+
+	return buffer_bytes;
+}
+
+/*
+ * Decide whether a new bset should be started in @b, and where.
+ *
+ * The candidate location is the later of write_block(@b) and the end of the
+ * last bset. It is returned if either
+ *  - the last bset has been written and more than block_bytes(c) >> 3 u64s
+ *    would remain, or
+ *  - the last bset is unwritten, bigger than btree_write_set_buffer() bytes,
+ *    and more than btree_write_set_buffer() >> 3 u64s would remain.
+ * Otherwise NULL is returned.
+ */
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
+						     struct btree *b)
+{
+	struct bset_tree *last = bset_tree_last(b);
+	struct btree_node_entry *bne = max(write_block(b),
+			(void *) btree_bkey_last(b, last));
+	ssize_t space = __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+	ssize_t needed;
+
+	if (unlikely(bset_written(b, bset(b, last)))) {
+		needed = block_bytes(c) >> 3;
+	} else {
+		if (likely(bset_u64s(last) * sizeof(u64) <=
+			   btree_write_set_buffer(b)))
+			return NULL;
+
+		needed = btree_write_set_buffer(b) >> 3;
+	}
+
+	return space > needed ? bne : NULL;
+}
+
+/*
+ * If @k is in the written part of @b, subtract its key size from
+ * b->uncompacted_whiteout_u64s.
+ */
+static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k)
+{
+	unsigned u64s;
+
+	if (!bkey_written(b, k))
+		return;
+
+	u64s = bkeyp_key_u64s(&b->format, k);
+
+	EBUG_ON(b->uncompacted_whiteout_u64s < u64s);
+	b->uncompacted_whiteout_u64s -= u64s;
+}
+
+/*
+ * If @k is in the written part of @b, add its key size to
+ * b->uncompacted_whiteout_u64s; such a key must have needs_whiteout set.
+ */
+static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k)
+{
+	if (!bkey_written(b, k))
+		return;
+
+	BUG_ON(!k->needs_whiteout);
+	b->uncompacted_whiteout_u64s += bkeyp_key_u64s(&b->format, k);
+}
+
+/*
+ * Returns true if @u64s more u64s of keys fit in @b; never true for a node
+ * flagged as fake.
+ *
+ * The write lock must be held on @b, otherwise the dirty bset we were going
+ * to insert into could be written out from under us.
+ */
+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
+					       struct btree *b, unsigned u64s)
+{
+	return likely(!btree_node_fake(b)) &&
+		u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+/* Reporting on the updates on c->btree_interior_update_list: */
+ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
+
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
new file mode 100644
index 000000000000..5f5574ecc176
--- /dev/null
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -0,0 +1,952 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "replicas.h"
+
+#include <linux/prefetch.h>
+#include <linux/sort.h>
+#include <trace/events/bcachefs.h>
+
+/*
+ * True if the update at position @idx in sorted order targets the same leaf
+ * node as the one before it; always false for the first update.
+ */
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+				     unsigned idx)
+{
+	struct btree *cur, *prev;
+
+	if (!idx)
+		return false;
+
+	cur  = trans->updates[trans->updates_sorted[idx]].iter->l[0].b;
+	prev = trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b;
+
+	return cur == prev;
+}
+
+/*
+ * Walk @_trans's updates in the order given by updates_sorted[]: @_iter is the
+ * position in sorted order, @_i is set to the corresponding update entry.
+ * Arguments are parenthesized so that any expression may be passed.
+ */
+#define trans_for_each_update_sorted(_trans, _i, _iter)			\
+	for ((_iter) = 0;						\
+	     (_iter) < (_trans)->nr_updates &&				\
+	     ((_i) = (_trans)->updates +				\
+			(_trans)->updates_sorted[_iter], 1);		\
+	     (_iter)++)
+
+/*
+ * Take the write lock on @b and get it ready for inserting: run post-write
+ * cleanup if the node was just written (reinitializing @iter's view of the
+ * node if that changed it), then start a new bset if want_new_bset() asks for
+ * one.
+ */
+inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
+					    struct btree_iter *iter)
+{
+	bch2_btree_node_lock_write(b, iter);
+
+	if (unlikely(btree_node_just_written(b)) &&
+	    bch2_btree_post_write_cleanup(c, b))
+		bch2_btree_iter_reinit_node(iter, b);
+
+	/*
+	 * If the last bset has been written, or if it's gotten too big - start
+	 * a new bset to insert into:
+	 */
+	if (want_new_bset(c, b))
+		bch2_btree_init_next(c, b, iter);
+}
+
+/*
+ * Fill trans->updates_sorted[] with indices into trans->updates[], ordered by
+ * btree_iter_cmp() on each update's iterator (insertion sort).
+ */
+static inline void btree_trans_sort_updates(struct btree_trans *trans)
+{
+	struct btree_insert_entry *i;
+	unsigned nr_sorted = 0;
+
+	trans_for_each_update(trans, i) {
+		unsigned pos = 0;
+
+		/* skip past every sorted entry that @i compares greater than: */
+		while (pos < nr_sorted &&
+		       btree_iter_cmp(i->iter,
+			trans->updates[trans->updates_sorted[pos]].iter) > 0)
+			pos++;
+
+		/* open a gap at @pos for the new index: */
+		memmove(&trans->updates_sorted[pos + 1],
+			&trans->updates_sorted[pos],
+			(nr_sorted - pos) * sizeof(trans->updates_sorted[0]));
+
+		trans->updates_sorted[pos] = i - trans->updates;
+		nr_sorted++;
+	}
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/*
+ * Handle overwrites and do insert, for non extents.
+ *
+ * Returns false only when @insert is a whiteout and no key at its position was
+ * found (nothing to do); returns true in every other case, after @b has been
+ * modified.
+ */
+bool bch2_btree_bset_insert_key(struct btree_iter *iter,
+				struct btree *b,
+				struct btree_node_iter *node_iter,
+				struct bkey_i *insert)
+{
+	const struct bkey_format *f = &b->format;
+	struct bkey_packed *k;
+	unsigned clobber_u64s;
+
+	EBUG_ON(btree_node_just_written(b));
+	EBUG_ON(bset_written(b, btree_bset_last(b)));
+	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+	EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
+		bkey_cmp(insert->k.p, b->data->max_key) > 0);
+
+	k = bch2_btree_node_iter_peek_all(node_iter, b);
+	if (k && !bkey_cmp_packed(b, k, &insert->k)) {
+		BUG_ON(bkey_whiteout(k));
+
+		/*
+		 * Existing key hasn't been written and the new value is the
+		 * same size - just overwrite type and value in place:
+		 */
+		if (!bkey_written(b, k) &&
+		    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
+		    !bkey_whiteout(&insert->k)) {
+			k->type = insert->k.type;
+			memcpy_u64s(bkeyp_val(f, k), &insert->v,
+				    bkey_val_u64s(&insert->k));
+			return true;
+		}
+
+		/* the new key inherits needs_whiteout from the one it replaces */
+		insert->k.needs_whiteout = k->needs_whiteout;
+
+		btree_account_key_drop(b, k);
+
+		/* existing key is in the last bset: */
+		if (k >= btree_bset_last(b)->start) {
+			clobber_u64s = k->u64s;
+
+			/*
+			 * If we're deleting, and the key we're deleting doesn't
+			 * need a whiteout (it wasn't overwriting a key that had
+			 * been written to disk) - just delete it:
+			 */
+			if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
+				bch2_bset_delete(b, k, clobber_u64s);
+				bch2_btree_node_iter_fix(iter, b, node_iter,
+							 k, clobber_u64s, 0);
+				return true;
+			}
+
+			goto overwrite;
+		}
+
+		/* existing key is in an earlier bset: mark it deleted where it is */
+		k->type = KEY_TYPE_deleted;
+		bch2_btree_node_iter_fix(iter, b, node_iter, k,
+					 k->u64s, k->u64s);
+
+		if (bkey_whiteout(&insert->k)) {
+			reserve_whiteout(b, k);
+			return true;
+		} else {
+			k->needs_whiteout = false;
+		}
+	} else {
+		/*
+		 * Deleting, but the key to delete wasn't found - nothing to do:
+		 */
+		if (bkey_whiteout(&insert->k))
+			return false;
+
+		insert->k.needs_whiteout = false;
+	}
+
+	/* plain insert at the iterator's position in the last bset: */
+	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
+	clobber_u64s = 0;
+overwrite:
+	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+	bch2_btree_node_iter_fix(iter, b, node_iter, k,
+				 clobber_u64s, k->u64s);
+	return true;
+}
+
+/*
+ * Common body of the journal pin flush callbacks: @pin is embedded in
+ * writes[@i] of a btree node. Under a read lock on that node, call
+ * bch2_btree_node_write_cond() with the condition that writes[@i] is still the
+ * node's current write and its pin is still at sequence @seq.
+ */
+static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+			       unsigned i, u64 seq)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct btree_write *w = container_of(pin, struct btree_write, journal);
+	struct btree *b = container_of(w, struct btree, writes[i]);
+
+	btree_node_lock_type(c, b, SIX_LOCK_read);
+	bch2_btree_node_write_cond(c, b,
+		(btree_current_write(b) == w && w->journal.seq == seq));
+	six_unlock_read(&b->lock);
+}
+
+/* Journal pin flush callback for a pin embedded in writes[0] of a btree node */
+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+	__btree_node_flush(j, pin, 0, seq);
+}
+
+/* Journal pin flush callback for a pin embedded in writes[1] of a btree node */
+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+	__btree_node_flush(j, pin, 1, seq);
+}
+
+/*
+ * Add @insert to the journal under the transaction's journal reservation, and
+ * report the reservation's sequence number via trans->journal_seq if the
+ * caller supplied one.
+ */
+static inline void __btree_journal_key(struct btree_trans *trans,
+				       enum btree_id btree_id,
+				       struct bkey_i *insert)
+{
+	struct journal *j = &trans->c->journal;
+	u64 seq = trans->journal_res.seq;
+	bool needs_whiteout = insert->k.needs_whiteout;
+
+	/*
+	 * ick: needs_whiteout is cleared only for the copy that goes into the
+	 * journal, then restored on the caller's key
+	 */
+	insert->k.needs_whiteout = false;
+	bch2_journal_add_keys(j, &trans->journal_res,
+			      btree_id, insert);
+	insert->k.needs_whiteout = needs_whiteout;
+
+	bch2_journal_set_has_inode(j, &trans->journal_res,
+				   insert->k.p.inode);
+
+	if (trans->journal_seq)
+		*trans->journal_seq = seq;
+}
+
+/*
+ * Journal @insert (unless this is journal replay), make sure the leaf's
+ * current write holds a journal pin, and mark the leaf dirty.
+ *
+ * @iter must point at a leaf (level 0).
+ */
+void bch2_btree_journal_key(struct btree_trans *trans,
+			   struct btree_iter *iter,
+			   struct bkey_i *insert)
+{
+	struct bch_fs *c = trans->c;
+	struct journal *j = &c->journal;
+	struct btree *b = iter->l[0].b;
+	struct btree_write *w = btree_current_write(b);
+
+	EBUG_ON(iter->level || b->level);
+	/* we hold a journal reservation iff we're not replaying the journal */
+	EBUG_ON(trans->journal_res.ref !=
+		!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
+
+	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+		__btree_journal_key(trans, iter->btree_id, insert);
+		btree_bset_last(b)->journal_seq =
+			cpu_to_le64(trans->journal_res.seq);
+	}
+
+	if (unlikely(!journal_pin_active(&w->journal))) {
+		/* during replay, pin the entry currently being replayed instead */
+		u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+			? trans->journal_res.seq
+			: j->replay_journal_seq;
+
+		/* the flush callback must match which of writes[] is current */
+		bch2_journal_pin_add(j, seq, &w->journal,
+				     btree_node_write_idx(b) == 0
+				     ? btree_node_flush0
+				     : btree_node_flush1);
+	}
+
+	if (unlikely(!btree_node_dirty(b)))
+		set_btree_node_dirty(b);
+}
+
+/*
+ * Non-extent insert path: insert @insert's key into its leaf, and journal it
+ * if the insert actually changed the node.
+ */
+static void bch2_insert_fixup_key(struct btree_trans *trans,
+				  struct btree_insert_entry *insert)
+{
+	struct btree_iter *iter = insert->iter;
+	struct btree_iter_level *l = &iter->l[0];
+
+	EBUG_ON(iter->level);
+	EBUG_ON(insert->k->k.u64s >
+		bch_btree_keys_u64s_remaining(trans->c, l->b));
+
+	if (unlikely(!bch2_btree_bset_insert_key(iter, l->b, &l->iter,
+						 insert->k)))
+		return;
+
+	bch2_btree_journal_key(trans, iter, insert->k);
+}
+
+/**
+ * btree_insert_key_leaf - insert one key into a leaf node
+ * @trans:	transaction the update belongs to
+ * @insert:	update entry; its iterator points at the leaf to insert into
+ *
+ * Dispatches to the extent or non-extent insert path, then adjusts the node's
+ * cached sibling sizes and compacts whiteouts if the insert warrants it.
+ */
+static void btree_insert_key_leaf(struct btree_trans *trans,
+				  struct btree_insert_entry *insert)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter = insert->iter;
+	struct btree *b = iter->l[0].b;
+	struct bset_tree *t = bset_tree_last(b);
+	int old_u64s = bset_u64s(t);
+	int old_live_u64s = b->nr.live_u64s;
+	int live_u64s_added, u64s_added;
+
+	if (!btree_node_is_extents(b))
+		bch2_insert_fixup_key(trans, insert);
+	else
+		bch2_insert_fixup_extent(trans, insert);
+
+	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+	u64s_added = (int) bset_u64s(t) - old_u64s;
+
+	/* node shrank: shrink the cached sibling sizes too, clamped at 0 */
+	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+	/* the bset grew by more than the live key count did: */
+	if (u64s_added > live_u64s_added &&
+	    bch2_maybe_compact_whiteouts(c, b))
+		bch2_btree_iter_reinit_node(iter, b);
+
+	trace_btree_insert_key(c, b, insert->k);
+}
+
+/* Normal update interface: */
+
+/*
+ * Sanity checks on a single update before committing: the iterator must be at
+ * level 0 and positioned at the start of the key; extent updates must not run
+ * past the end of the leaf and require BTREE_INSERT_ATOMIC; and, when
+ * debug_check_bkeys() is enabled, the key must be valid for its btree.
+ */
+static inline void btree_insert_entry_checks(struct btree_trans *trans,
+					     struct btree_insert_entry *i)
+{
+	struct bch_fs *c = trans->c;
+
+	BUG_ON(i->iter->level);
+	BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+	EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
+		bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
+	EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
+		!(trans->flags & BTREE_INSERT_ATOMIC));
+
+	BUG_ON(debug_check_bkeys(c) &&
+	       !bkey_deleted(&i->k->k) &&
+	       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id));
+}
+
+/*
+ * Slow path for getting a journal pre-reservation: drop the transaction's
+ * locks, get the pre-reservation with no flags, then retake the locks.
+ *
+ * Returns 0 on success, the error from bch2_journal_preres_get(), or -EINTR if
+ * the locks could not be retaken.
+ */
+static noinline int
+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s)
+{
+	struct bch_fs *c = trans->c;
+	int ret;
+
+	bch2_trans_unlock(trans);
+
+	ret = bch2_journal_preres_get(&c->journal,
+			&trans->journal_preres, u64s, 0);
+	if (ret)
+		return ret;
+
+	if (!bch2_trans_relock(trans)) {
+		trace_trans_restart_journal_preres_get(trans->ip);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+/*
+ * Get a journal reservation of trans->journal_u64s for the transaction.
+ *
+ * -EAGAIN from the journal is translated to BTREE_INSERT_NEED_JOURNAL_RES; any
+ * other result is returned unchanged.
+ */
+static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+					     unsigned flags)
+{
+	int ret;
+
+	if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
+		flags |= JOURNAL_RES_GET_RESERVED;
+
+	ret = bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
+				   trans->journal_u64s, flags);
+	if (ret == -EAGAIN)
+		return BTREE_INSERT_NEED_JOURNAL_RES;
+
+	return ret;
+}
+
+/*
+ * Check whether @insert can go into its leaf node right now.
+ *
+ * @u64s is the running total of u64s wanted in this leaf; it is passed by
+ * pointer to bch2_extent_can_insert() for extent btrees (which presumably may
+ * adjust it - confirm against that function) and then compared against the
+ * space remaining in the node.
+ *
+ * Returns BTREE_INSERT_OK, BTREE_INSERT_BTREE_NODE_FULL if the node is fake or
+ * too full, or whatever non-OK result the extent check produced.
+ */
+static enum btree_insert_ret
+btree_key_can_insert(struct btree_trans *trans,
+		     struct btree_insert_entry *insert,
+		     unsigned *u64s)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b = insert->iter->l[0].b;
+	/* must be automatic: a static here would be shared by concurrent commits */
+	enum btree_insert_ret ret;
+
+	if (unlikely(btree_node_fake(b)))
+		return BTREE_INSERT_BTREE_NODE_FULL;
+
+	ret = !btree_node_is_extents(b)
+		? BTREE_INSERT_OK
+		: bch2_extent_can_insert(trans, insert, u64s);
+	if (ret)
+		return ret;
+
+	if (*u64s > bch_btree_keys_u64s_remaining(c, b))
+		return BTREE_INSERT_BTREE_NODE_FULL;
+
+	return BTREE_INSERT_OK;
+}
+
+/* Perform a single update; currently just a wrapper around the leaf insert */
+static inline void do_btree_insert_one(struct btree_trans *trans,
+				       struct btree_insert_entry *insert)
+{
+	btree_insert_key_leaf(trans, insert);
+}
+
+/* True if the btree @i updates is in BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS */
+static inline bool update_has_trans_triggers(struct btree_insert_entry *i)
+{
+	unsigned btree_bit = 1U << i->iter->btree_id;
+
+	return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & btree_bit;
+}
+
+/*
+ * True if the btree @i updates is in BTREE_NODE_TYPE_HAS_TRIGGERS but not in
+ * BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS
+ */
+static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i)
+{
+	unsigned btree_bit = 1U << i->iter->btree_id;
+
+	return (BTREE_NODE_TYPE_HAS_TRIGGERS &
+		~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
+		btree_bit;
+}
+
+/* Out-of-line wrapper around __bch2_btree_iter_unlock() */
+static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
+{
+	__bch2_btree_iter_unlock(iter);
+}
+
+/*
+ * For every update whose leaf node gc has already visited, redo the marking
+ * with BCH_BUCKET_MARK_GC set. Does nothing if BTREE_INSERT_NOMARK is set.
+ */
+static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	unsigned mark_flags = 0;
+
+	if (unlikely(trans->flags & BTREE_INSERT_NOMARK))
+		return;
+
+	if (trans->flags & BTREE_INSERT_BUCKET_INVALIDATE)
+		mark_flags = BCH_BUCKET_MARK_BUCKET_INVALIDATE;
+
+	trans_for_each_update(trans, i) {
+		if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+			bch2_mark_update(trans, i, NULL,
+					 mark_flags|BCH_BUCKET_MARK_GC);
+	}
+}
+
+/*
+ * The part of a commit that runs with the leaf nodes write locked: check that
+ * every update fits, get the journal reservation, run the non-transactional
+ * triggers and apply the updates.
+ *
+ * On a failed fit check, *@stopped_at is set to the offending update. Returns
+ * 0 on success, otherwise a BTREE_INSERT_* code or a negative error.
+ */
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans,
+			       struct btree_insert_entry **stopped_at)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_fs_usage *fs_usage = NULL;
+	struct btree_insert_entry *i;
+	unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
+		? BCH_BUCKET_MARK_BUCKET_INVALIDATE
+		: 0;
+	unsigned iter, u64s = 0;
+	bool marking = false;
+	int ret;
+
+	if (race_fault()) {
+		trace_trans_restart_fault_inject(trans->ip);
+		return -EINTR;
+	}
+
+	/*
+	 * Check if the insert will fit in the leaf node with the write lock
+	 * held, otherwise another thread could write the node changing the
+	 * amount of space available:
+	 */
+
+	prefetch(&trans->c->journal.flags);
+
+	/*
+	 * Note: this loop is also what initializes @ret; the caller only gets
+	 * here with at least one update queued.
+	 */
+	trans_for_each_update_sorted(trans, i, iter) {
+		/* Multiple inserts might go to same leaf: */
+		if (!same_leaf_as_prev(trans, iter))
+			u64s = 0;
+
+		u64s += i->k->k.u64s;
+		ret = btree_key_can_insert(trans, i, &u64s);
+		if (ret) {
+			*stopped_at = i;
+			return ret;
+		}
+
+		if (btree_node_type_needs_gc(i->iter->btree_id))
+			marking = true;
+	}
+
+	if (marking) {
+		percpu_down_read(&c->mark_lock);
+		fs_usage = bch2_fs_usage_scratch_get(c);
+	}
+
+	/*
+	 * Don't get journal reservation until after we know insert will
+	 * succeed:
+	 */
+	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+		ret = bch2_trans_journal_res_get(trans,
+				JOURNAL_RES_GET_NONBLOCK);
+		if (ret)
+			goto err;
+	}
+
+	/*
+	 * Not allowed to fail after we've gotten our journal reservation - we
+	 * have to use it:
+	 */
+
+	/* debug knobs: stamp keys with the journal seq, or an invalid version */
+	if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+		if (journal_seq_verify(c))
+			trans_for_each_update(trans, i)
+				i->k->k.version.lo = trans->journal_res.seq;
+		else if (inject_invalid_keys(c))
+			trans_for_each_update(trans, i)
+				i->k->k.version = MAX_VERSION;
+	}
+
+	/* Must be called under mark_lock: */
+	if (marking && trans->fs_usage_deltas &&
+	    bch2_replicas_delta_list_apply(c, fs_usage,
+					   trans->fs_usage_deltas)) {
+		ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+		goto err;
+	}
+
+	trans_for_each_update(trans, i)
+		if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+		    update_has_nontrans_triggers(i))
+			bch2_mark_update(trans, i, fs_usage, mark_flags);
+
+	if (marking)
+		bch2_trans_fs_usage_apply(trans, fs_usage);
+
+	if (unlikely(c->gc_pos.phase))
+		bch2_trans_mark_gc(trans);
+
+	trans_for_each_update(trans, i)
+		do_btree_insert_one(trans, i);
+	/* success falls through to here too, with ret == 0 */
+err:
+	if (marking) {
+		bch2_fs_usage_scratch_put(c, fs_usage);
+		percpu_up_read(&c->mark_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ *
+ * One attempt at a commit. On failure *@stopped_at may have been set by
+ * bch2_trans_commit_write_locked() to the update that could not be inserted.
+ * Returns 0 on success, otherwise a BTREE_INSERT_* code or a negative error
+ * (-EINTR meaning the transaction must be restarted).
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans,
+				       struct btree_insert_entry **stopped_at)
+{
+	struct btree_insert_entry *i;
+	struct btree_iter *iter;
+	unsigned idx, u64s, journal_preres_u64s = 0;
+	int ret;
+
+	/*
+	 * note: running triggers will append more updates to the list of
+	 * updates as we're walking it:
+	 */
+	trans_for_each_update(trans, i) {
+		/* we know trans->nounlock won't be set here: */
+		if (unlikely(!(i->iter->locks_want < 1
+			       ? __bch2_btree_iter_upgrade(i->iter, 1)
+			       : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) {
+			trace_trans_restart_upgrade(trans->ip);
+			return -EINTR;
+		}
+
+		if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+		    update_has_trans_triggers(i)) {
+			ret = bch2_trans_mark_update(trans, i->iter, i->k);
+			if (unlikely(ret)) {
+				if (ret == -EINTR)
+					trace_trans_restart_mark(trans->ip);
+				return ret;
+			}
+		}
+
+		u64s = jset_u64s(i->k->k.u64s);
+		/*
+		 * pre-reservation accounting is compiled out here, so
+		 * journal_preres_u64s stays 0:
+		 */
+		if (0)
+			journal_preres_u64s += u64s;
+		trans->journal_u64s += u64s;
+	}
+
+	/* try without blocking first, then fall back to the unlocking path */
+	ret = bch2_journal_preres_get(&trans->c->journal,
+			&trans->journal_preres, journal_preres_u64s,
+			JOURNAL_RES_GET_NONBLOCK);
+	if (unlikely(ret == -EAGAIN))
+		ret = bch2_trans_journal_preres_get_cold(trans,
+						journal_preres_u64s);
+	if (unlikely(ret))
+		return ret;
+
+	/*
+	 * Can't be holding any read locks when we go to take write locks:
+	 *
+	 * note - this must be done after bch2_trans_journal_preres_get_cold()
+	 * or anything else that might call bch2_trans_relock(), since that
+	 * would just retake the read locks:
+	 */
+	trans_for_each_iter_all(trans, iter) {
+		if (iter->nodes_locked != iter->nodes_intent_locked) {
+			EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+			EBUG_ON(trans->iters_live & (1ULL << iter->idx));
+			bch2_btree_iter_unlock_noinline(iter);
+		}
+	}
+
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+		trans_for_each_update(trans, i)
+			btree_insert_entry_checks(trans, i);
+	bch2_btree_trans_verify_locks(trans);
+
+	/*
+	 * No more updates can be added - sort updates so we can take write
+	 * locks in the correct order:
+	 */
+	btree_trans_sort_updates(trans);
+
+	/* lock each distinct leaf once, even if several updates share it */
+	trans_for_each_update_sorted(trans, i, idx)
+		if (!same_leaf_as_prev(trans, idx))
+			bch2_btree_node_lock_for_insert(trans->c,
+						i->iter->l[0].b, i->iter);
+
+	ret = bch2_trans_commit_write_locked(trans, stopped_at);
+
+	trans_for_each_update_sorted(trans, i, idx)
+		if (!same_leaf_as_prev(trans, idx))
+			bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+							     i->iter);
+
+	/*
+	 * Drop journal reservation after dropping write locks, since dropping
+	 * the journal reservation may kick off a journal write:
+	 */
+	bch2_journal_res_put(&trans->c->journal, &trans->journal_res);
+
+	if (unlikely(ret))
+		return ret;
+
+	/* nounlock is only set for the duration of the merge attempts below */
+	if (trans->flags & BTREE_INSERT_NOUNLOCK)
+		trans->nounlock = true;
+
+	trans_for_each_update_sorted(trans, i, idx)
+		if (!same_leaf_as_prev(trans, idx))
+			bch2_foreground_maybe_merge(trans->c, i->iter,
+						    0, trans->flags);
+
+	trans->nounlock = false;
+
+	trans_for_each_update(trans, i)
+		bch2_btree_iter_downgrade(i->iter);
+
+	return 0;
+}
+
+/*
+ * Handle a failed commit attempt.
+ *
+ * @i is the update the attempt stopped at (used for the node-full case), @ret
+ * the BTREE_INSERT_* code or negative error it returned. Tries to fix whatever
+ * blocked the commit (split the leaf, mark replicas, wait for a journal
+ * reservation).
+ *
+ * Returns 0 if the caller may retry the commit, otherwise a negative error.
+ */
+static noinline
+int bch2_trans_commit_error(struct btree_trans *trans,
+			    struct btree_insert_entry *i,
+			    int ret)
+{
+	struct bch_fs *c = trans->c;
+	unsigned flags = trans->flags;
+
+	/*
+	 * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
+	 * update; if we haven't done anything yet it doesn't apply
+	 */
+	flags &= ~BTREE_INSERT_NOUNLOCK;
+
+	switch (ret) {
+	case BTREE_INSERT_BTREE_NODE_FULL:
+		ret = bch2_btree_split_leaf(c, i->iter, flags);
+
+		/*
+		 * if the split succeeded without dropping locks the insert will
+		 * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
+		 * caller peeked() and is overwriting won't have changed)
+		 */
+#if 0
+		/*
+		 * XXX:
+		 * split -> btree node merging (of parent node) might still drop
+		 * locks when we're not passing it BTREE_INSERT_NOUNLOCK
+		 *
+		 * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that
+		 * will inhibit merging - but we don't have a reliable way yet
+		 * (do we?) of checking if we dropped locks in this path
+		 */
+		if (!ret)
+			goto retry;
+#endif
+
+		/*
+		 * don't care if we got ENOSPC because we told split it
+		 * couldn't block:
+		 *
+		 * NOTE(review): BTREE_INSERT_NOUNLOCK was cleared from the
+		 * local @flags above, so the last test below can never be true
+		 * as written - confirm whether trans->flags was intended.
+		 */
+		if (!ret ||
+		    ret == -EINTR ||
+		    (flags & BTREE_INSERT_NOUNLOCK)) {
+			trace_trans_restart_btree_node_split(trans->ip);
+			ret = -EINTR;
+		}
+		break;
+	case BTREE_INSERT_ENOSPC:
+		ret = -ENOSPC;
+		break;
+	case BTREE_INSERT_NEED_MARK_REPLICAS:
+		bch2_trans_unlock(trans);
+
+		/* @i is reused as the loop cursor from here on */
+		trans_for_each_update(trans, i) {
+			ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k));
+			if (ret)
+				return ret;
+		}
+
+		if (bch2_trans_relock(trans))
+			return 0;
+
+		trace_trans_restart_mark_replicas(trans->ip);
+		ret = -EINTR;
+		break;
+	case BTREE_INSERT_NEED_JOURNAL_RES:
+		bch2_trans_unlock(trans);
+
+		ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
+		if (ret)
+			return ret;
+
+		if (bch2_trans_relock(trans))
+			return 0;
+
+		trace_trans_restart_journal_res_get(trans->ip);
+		ret = -EINTR;
+		break;
+	default:
+		/* anything else must already be a negative error code */
+		BUG_ON(ret >= 0);
+		break;
+	}
+
+	if (ret == -EINTR) {
+		int ret2 = bch2_btree_iter_traverse_all(trans);
+
+		if (ret2) {
+			trace_trans_restart_traverse(trans->ip);
+			return ret2;
+		}
+
+		/*
+		 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
+		 * dropped locks:
+		 */
+		if (!(flags & BTREE_INSERT_ATOMIC))
+			return 0;
+
+		trace_trans_restart_atomic(trans->ip);
+	}
+
+	return ret;
+}
+
+/*
+ * Slow path for when the commit couldn't get a ref on c->writes: unless
+ * BTREE_INSERT_LAZY_RW was passed, fail with -EROFS; otherwise drop locks, call
+ * bch2_fs_read_write_early() and take the c->writes ref on success.
+ */
+static noinline int
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+	int ret;
+
+	if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+		return -EROFS;
+
+	bch2_trans_unlock(trans);
+
+	ret = bch2_fs_read_write_early(c);
+	if (ret)
+		return ret;
+
+	percpu_ref_get(&c->writes);
+	return 0;
+}
+
+/*
+ * Commit the updates queued in @trans.
+ *
+ * Runs do_bch2_trans_commit(); on failure the error is handed to
+ * bch2_trans_commit_error(), and if that returns 0 the commit is retried with
+ * nr_updates and mem_top rewound to their values on entry. With no updates
+ * queued this only does the end-of-commit cleanup.
+ *
+ * Returns 0 on success, or an error.
+ */
+int __bch2_trans_commit(struct btree_trans *trans)
+{
+	struct btree_insert_entry *i = NULL;
+	struct btree_iter *iter;
+	unsigned orig_nr_updates	= trans->nr_updates;
+	unsigned orig_mem_top		= trans->mem_top;
+	int ret = 0;
+
+	if (!trans->nr_updates)
+		goto out_noupdates;
+
+	/* for the sake of sanity: */
+	EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
+
+	if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+		lockdep_assert_held(&trans->c->gc_lock);
+
+	memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
+
+	/*
+	 * NOTE(review): the early return below bypasses the out_noupdates
+	 * cleanup (iterator flags, nr_updates, mem_top) - confirm callers don't
+	 * rely on it after this failure.
+	 */
+	if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
+	    unlikely(!percpu_ref_tryget(&trans->c->writes))) {
+		ret = bch2_trans_commit_get_rw_cold(trans);
+		if (ret)
+			return ret;
+	}
+retry:
+	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+	trans->journal_u64s	= 0;
+
+	ret = do_bch2_trans_commit(trans, &i);
+
+	/* reset the accumulated usage deltas after every attempt */
+	if (trans->fs_usage_deltas) {
+		trans->fs_usage_deltas->used = 0;
+		memset(&trans->fs_usage_deltas->memset_start, 0,
+		       (void *) &trans->fs_usage_deltas->memset_end -
+		       (void *) &trans->fs_usage_deltas->memset_start);
+	}
+
+	/* make sure we didn't drop or screw up locks: */
+	bch2_btree_trans_verify_locks(trans);
+
+	if (ret)
+		goto err;
+out:
+	bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+
+	if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
+		percpu_ref_put(&trans->c->writes);
+out_noupdates:
+	EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
+
+	trans_for_each_iter_all(trans, iter)
+		iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+	if (!ret) {
+		bch2_trans_unlink_iters(trans);
+		trans->iters_touched = 0;
+	}
+	trans->nr_updates	= 0;
+	trans->mem_top		= 0;
+
+	return ret;
+err:
+	ret = bch2_trans_commit_error(trans, i, ret);
+
+	/* can't loop if it was passed in and we changed it: */
+	if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
+		ret = -EINTR;
+	if (ret)
+		goto out;
+
+	/* free updates and memory used by triggers, they'll be reexecuted: */
+	trans->nr_updates	= orig_nr_updates;
+	trans->mem_top		= orig_mem_top;
+	goto retry;
+}
+
+/**
+ * bch2_btree_insert - insert a single key into a btree
+ * @c:			pointer to struct bch_fs
+ * @id:			btree to insert into
+ * @k:			key to insert
+ * @disk_res:		passed through to bch2_trans_commit()
+ * @journal_seq:	passed through to bch2_trans_commit()
+ * @flags:		passed through to bch2_trans_commit()
+ *
+ * Sets up a one-off transaction positioned at the start of @k and commits the
+ * update, retrying for as long as the commit returns -EINTR.
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
+		     struct bkey_i *k,
+		     struct disk_reservation *disk_res,
+		     u64 *journal_seq, int flags)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k),
+				   BTREE_ITER_INTENT);
+
+	bch2_trans_update(&trans, iter, k);
+
+	ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags);
+	if (ret == -EINTR)
+		goto retry;
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+/*
+ * Delete everything from @iter's current position up to (not including) @end,
+ * committing one deletion per key found. A commit that returns -EINTR restarts
+ * the loop from the iterator's current position.
+ *
+ * Returns 0 on success, or the first error other than -EINTR.
+ */
+int bch2_btree_delete_at_range(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bpos end,
+			       u64 *journal_seq)
+{
+	struct bkey_s_c k;
+	int ret = 0;
+retry:
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k)) &&
+	       bkey_cmp(iter->pos, end) < 0) {
+		struct bkey_i delete;
+
+		bkey_init(&delete.k);
+
+		/*
+		 * For extents, iter.pos won't necessarily be the same as
+		 * bkey_start_pos(k.k) (for non extents they always will be the
+		 * same). It's important that we delete starting from iter.pos
+		 * because the range we want to delete could start in the middle
+		 * of k.
+		 *
+		 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+		 * bkey_start_pos(k.k)).
+		 */
+		delete.k.p = iter->pos;
+
+		if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+			/*
+			 * Largest size that's a multiple of the block size;
+			 * ~0U rather than ~0 so we never left shift a negative
+			 * value:
+			 */
+			unsigned max_sectors =
+				KEY_SIZE_MAX & (~0U << trans->c->block_bits);
+
+			/* create the biggest key we can */
+			bch2_key_resize(&delete.k, max_sectors);
+			bch2_cut_back(end, &delete.k);
+
+			ret = bch2_extent_trim_atomic(&delete, iter);
+			if (ret)
+				break;
+		}
+
+		bch2_trans_update(trans, iter, &delete);
+		ret = bch2_trans_commit(trans, NULL, journal_seq,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOFAIL);
+		if (ret)
+			break;
+
+		bch2_trans_cond_resched(trans);
+	}
+
+	if (ret == -EINTR) {
+		ret = 0;
+		goto retry;
+	}
+
+	return ret;
+}
+
+/*
+ * Delete the key at @iter's current position: queues a freshly initialized key
+ * at iter->pos and commits it with BTREE_INSERT_NOFAIL and
+ * BTREE_INSERT_USE_RESERVE in addition to @flags.
+ *
+ * Returns the result of bch2_trans_commit().
+ */
+int bch2_btree_delete_at(struct btree_trans *trans,
+			 struct btree_iter *iter, unsigned flags)
+{
+	struct bkey_i k;
+
+	bkey_init(&k.k);
+	k.k.p = iter->pos;
+
+	bch2_trans_update(trans, iter, &k);
+	return bch2_trans_commit(trans, NULL, NULL,
+				 BTREE_INSERT_NOFAIL|
+				 BTREE_INSERT_USE_RESERVE|flags);
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ *
+ * Sets up its own transaction and iterator on btree @id and hands off to
+ * bch2_btree_delete_at_range(). Returns 0 on success or an error; never
+ * returns -EINTR.
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+			    struct bpos start, struct bpos end,
+			    u64 *journal_seq)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	int ret = 0;
+
+	/*
+	 * XXX: whether we need mem/more iters depends on whether this btree id
+	 * has triggers
+	 */
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
+
+	iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);
+
+	ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
+	/* an error from bch2_trans_exit() takes precedence over @ret */
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	BUG_ON(ret == -EINTR);
+	return ret;
+}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
new file mode 100644
index 000000000000..c418398266a3
--- /dev/null
+++ b/fs/bcachefs/buckets.c
@@ -0,0 +1,2095 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ *
+ * Bucket states:
+ * - free bucket: mark == 0
+ *   The bucket contains no data and will not be read
+ *
+ * - allocator bucket: owned_by_allocator == 1
+ *   The bucket is on a free list, or it is an open bucket
+ *
+ * - cached bucket: owned_by_allocator == 0 &&
+ *                  dirty_sectors == 0 &&
+ *                  cached_sectors > 0
+ *   The bucket contains data but may be safely discarded as there are
+ *   enough replicas of the data on other cache devices, or it has been
+ *   written back to the backing device
+ *
+ * - dirty bucket: owned_by_allocator == 0 &&
+ *                 dirty_sectors > 0
+ *   The bucket contains data that we must not discard (either only copy,
+ *   or one of the 'main copies' for data requiring multiple replicas)
+ *
+ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
+ *   This is a btree node, journal or gen/prio bucket
+ *
+ * Lifecycle:
+ *
+ * bucket invalidated => bucket on freelist => open bucket =>
+ *     [dirty bucket =>] cached bucket => bucket invalidated => ...
+ *
+ * Note that cache promotion can skip the dirty bucket step, as data
+ * is copied from a deeper tier to a shallower tier, onto a cached
+ * bucket.
+ * Note also that a cached bucket can spontaneously become dirty --
+ * see below.
+ *
+ * Only a traversal of the key space can determine whether a bucket is
+ * truly dirty or cached.
+ *
+ * Transitions:
+ *
+ * - free => allocator: bucket was invalidated
+ * - cached => allocator: bucket was invalidated
+ *
+ * - allocator => dirty: open bucket was filled up
+ * - allocator => cached: open bucket was filled up
+ * - allocator => metadata: metadata was allocated
+ *
+ * - dirty => cached: dirty sectors were copied to a deeper tier
+ * - dirty => free: dirty sectors were overwritten or moved (copy gc)
+ * - cached => free: cached sectors were overwritten
+ *
+ * - metadata => free: metadata was freed
+ *
+ * Oddities:
+ * - cached => dirty: a device was removed so formerly replicated data
+ *                    is no longer sufficiently replicated
+ * - free => cached: cannot happen
+ * - free => dirty: cannot happen
+ * - free => metadata: cannot happen
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "ec.h"
+#include "error.h"
+#include "movinggc.h"
+#include "replicas.h"
+
+#include <linux/preempt.h>
+#include <trace/events/bcachefs.h>
+
+/*
+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent
+ * wraparound:
+ *
+ * Does nothing unless the journal sequence number has advanced by at least
+ * 1 << (BUCKET_JOURNAL_SEQ_BITS - 2) since the last cleanup. Otherwise every
+ * bucket of every member device is visited, with that device's bucket_lock
+ * held for read, and journal_seq_valid is cleared on marks where it is set
+ * and bucket_needs_journal_commit() reports false.
+ */
+void bch2_bucket_seq_cleanup(struct bch_fs *c)
+{
+	u64 journal_seq = atomic64_read(&c->journal.seq);
+	/*
+	 * NOTE(review): narrowed to u16 here - presumably to match the width
+	 * of the journal_seq stored in a bucket mark; confirm.
+	 */
+	u16 last_seq_ondisk = c->journal.last_seq_ondisk;
+	struct bch_dev *ca;
+	struct bucket_array *buckets;
+	struct bucket *g;
+	struct bucket_mark m;
+	unsigned i;
+
+	/* rate limit: only run once the journal has moved far enough */
+	if (journal_seq - c->last_bucket_seq_cleanup <
+	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
+		return;
+
+	c->last_bucket_seq_cleanup = journal_seq;
+
+	for_each_member_device(ca, c, i) {
+		down_read(&ca->bucket_lock);
+		buckets = bucket_array(ca);
+
+		for_each_bucket(g, buckets) {
+			bucket_cmpxchg(g, m, ({
+				/*
+				 * Nothing to clear, or the sequence number is
+				 * still needed: skip the modification below.
+				 */
+				if (!m.journal_seq_valid ||
+				    bucket_needs_journal_commit(m, last_seq_ondisk))
+					break;
+
+				m.journal_seq_valid = 0;
+			}));
+		}
+		up_read(&ca->bucket_lock);
+	}
+}
+
+/*
+ * Fold both per-cpu usage sets into c->usage_base, then derive the summary
+ * counters in usage_base (reserved, btree, data, cached) from its
+ * persistent_reserved[] and replicas[] arrays.
+ *
+ * The whole operation runs with c->mark_lock held for write.
+ */
+void bch2_fs_usage_initialize(struct bch_fs *c)
+{
+	struct bch_fs_usage *base;
+	unsigned idx;
+
+	percpu_down_write(&c->mark_lock);
+	base = c->usage_base;
+
+	for (idx = 0; idx < 2; idx++)
+		bch2_fs_usage_acc_to_base(c, idx);
+
+	idx = 0;
+	while (idx < BCH_REPLICAS_MAX) {
+		base->reserved += base->persistent_reserved[idx];
+		idx++;
+	}
+
+	for (idx = 0; idx < c->replicas.nr; idx++) {
+		struct bch_replicas_entry *entry =
+			cpu_replicas_entry(&c->replicas, idx);
+
+		/* entries of any other data type contribute to no summary */
+		if (entry->data_type == BCH_DATA_BTREE)
+			base->btree	+= base->replicas[idx];
+		else if (entry->data_type == BCH_DATA_USER)
+			base->data	+= base->replicas[idx];
+		else if (entry->data_type == BCH_DATA_CACHED)
+			base->cached	+= base->replicas[idx];
+	}
+
+	percpu_up_write(&c->mark_lock);
+}
+
+/*
+ * Release a buffer obtained from bch2_fs_usage_scratch_get(): a heap
+ * allocation is freed, whereas the shared c->usage_scratch buffer is given
+ * back by dropping c->usage_scratch_lock.
+ */
+void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage)
+{
+	if (fs_usage != c->usage_scratch) {
+		kfree(fs_usage);
+		return;
+	}
+
+	mutex_unlock(&c->usage_scratch_lock);
+}
+
+/*
+ * Obtain a zeroed buffer of fs_usage_u64s(c) u64s.
+ *
+ * Attempts, in order: a GFP_NOWAIT allocation; the shared c->usage_scratch
+ * buffer if its lock can be taken without blocking; a GFP_NOFS allocation;
+ * and finally the shared buffer after blocking on its lock.
+ *
+ * When the returned pointer is c->usage_scratch, c->usage_scratch_lock is
+ * held on return; bch2_fs_usage_scratch_put() releases either kind of buffer.
+ * Never returns NULL.
+ */
+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c)
+{
+	struct bch_fs_usage *ret;
+	unsigned bytes = fs_usage_u64s(c) * sizeof(u64);
+
+	ret = kzalloc(bytes, GFP_NOWAIT);
+	if (ret)
+		return ret;
+
+	/* cheap allocation failed: use the shared buffer if it's free */
+	if (mutex_trylock(&c->usage_scratch_lock))
+		goto out_pool;
+
+	ret = kzalloc(bytes, GFP_NOFS);
+	if (ret)
+		return ret;
+
+	/* last resort: wait for the shared buffer */
+	mutex_lock(&c->usage_scratch_lock);
+out_pool:
+	ret = c->usage_scratch;
+	memset(ret, 0, bytes);
+	return ret;
+}
+
+/*
+ * Return, by value, the sum over all cpus of the per-cpu counters in
+ * ca->usage[0]. The structure is treated as a flat array of u64s.
+ */
+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_dev_usage sum;
+	u64 *dst = (u64 *) &sum;
+	u64 __percpu *src = (u64 __percpu *) ca->usage[0];
+
+	memset(&sum, 0, sizeof(sum));
+	acc_u64s_percpu(dst, src, sizeof(sum) / sizeof(u64));
+
+	return sum;
+}
+
+/*
+ * Pick this cpu's instance of a filesystem usage counter set: the gc set when
+ * @gc is true, otherwise one of the two regular sets, selected by the low bit
+ * of @journal_seq.
+ */
+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
+						unsigned journal_seq,
+						bool gc)
+{
+	if (gc)
+		return this_cpu_ptr(c->usage_gc);
+
+	return this_cpu_ptr(c->usage[journal_seq & 1]);
+}
+
+/*
+ * Read the current value of a single usage counter.
+ *
+ * @v must point at a u64 inside c->usage_base (checked with BUG_ON), and
+ * c->mark_lock must be held. The result is the base value plus the per-cpu
+ * sums of the same field in both c->usage[0] and c->usage[1]; the read is
+ * repeated until it completes without c->usage_lock changing underneath it.
+ */
+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
+{
+	/* index of the field within the usage struct, in units of u64 */
+	ssize_t offset = v - (u64 *) c->usage_base;
+	unsigned seq;
+	u64 ret;
+
+	BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
+	percpu_rwsem_assert_held(&c->mark_lock);
+
+	do {
+		seq = read_seqcount_begin(&c->usage_lock);
+		ret = *v +
+			percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
+			percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
+	} while (read_seqcount_retry(&c->usage_lock, seq));
+
+	return ret;
+}
+
+/*
+ * Allocate and return a snapshot of the filesystem usage counters: a copy of
+ * c->usage_base with the per-cpu sums of c->usage[0] and c->usage[1] added
+ * in.
+ *
+ * Returns NULL if the allocation fails; in that case c->mark_lock is not
+ * held. On success the function returns with c->mark_lock still held for
+ * read - there is no percpu_up_read() on that path.
+ * NOTE(review): presumably the caller is expected to drop mark_lock and
+ * kfree() the result; confirm against callers.
+ */
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
+{
+	struct bch_fs_usage *ret;
+	unsigned seq, v, u64s = fs_usage_u64s(c);
+retry:
+	ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+	if (unlikely(!ret))
+		return NULL;
+
+	percpu_down_read(&c->mark_lock);
+
+	/*
+	 * The size was sampled before taking mark_lock; if it has changed,
+	 * drop everything and start over with the new size.
+	 */
+	v = fs_usage_u64s(c);
+	if (unlikely(u64s != v)) {
+		u64s = v;
+		percpu_up_read(&c->mark_lock);
+		kfree(ret);
+		goto retry;
+	}
+
+	do {
+		seq = read_seqcount_begin(&c->usage_lock);
+		memcpy(ret, c->usage_base, u64s * sizeof(u64));
+		acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
+		acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
+	} while (read_seqcount_retry(&c->usage_lock, seq));
+
+	return ret;
+}
+
+/*
+ * Add the per-cpu counters of c->usage[@idx] into c->usage_base and reset
+ * those per-cpu counters to zero. @idx must be 0 or 1.
+ *
+ * Both steps happen inside a c->usage_lock write section, so readers using
+ * the matching seqcount retry loop do not see a half-transferred state.
+ */
+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
+{
+	unsigned u64s = fs_usage_u64s(c);
+
+	BUG_ON(idx >= 2);
+
+	write_seqcount_begin(&c->usage_lock);
+
+	acc_u64s_percpu((u64 *) c->usage_base,
+			(u64 __percpu *) c->usage[idx], u64s);
+	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+
+	write_seqcount_end(&c->usage_lock);
+}
+
+/*
+ * Format @fs_usage as human readable text into @out.
+ *
+ * Prints c->capacity, then the hidden, data, cached, reserved, nr_inodes and
+ * online_reserved counters, then persistent_reserved[] (one section per
+ * replica count, numbered from 1), and finally one line per entry of
+ * c->replicas paired with the matching fs_usage->replicas[] counter.
+ */
+void bch2_fs_usage_to_text(struct printbuf *out,
+			   struct bch_fs *c,
+			   struct bch_fs_usage *fs_usage)
+{
+	unsigned i;
+
+	pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
+
+	pr_buf(out, "hidden:\t\t\t\t%llu\n",
+	       fs_usage->hidden);
+	pr_buf(out, "data:\t\t\t\t%llu\n",
+	       fs_usage->data);
+	pr_buf(out, "cached:\t\t\t\t%llu\n",
+	       fs_usage->cached);
+	pr_buf(out, "reserved:\t\t\t%llu\n",
+	       fs_usage->reserved);
+	pr_buf(out, "nr_inodes:\t\t\t%llu\n",
+	       fs_usage->nr_inodes);
+	pr_buf(out, "online reserved:\t\t%llu\n",
+	       fs_usage->online_reserved);
+
+	/* index i holds the reservation for i + 1 replicas */
+	for (i = 0;
+	     i < ARRAY_SIZE(fs_usage->persistent_reserved);
+	     i++) {
+		pr_buf(out, "%u replicas:\n", i + 1);
+		pr_buf(out, "\treserved:\t\t%llu\n",
+		       fs_usage->persistent_reserved[i]);
+	}
+
+	/* fs_usage->replicas[] is indexed in step with c->replicas */
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		pr_buf(out, "\t");
+		bch2_replicas_entry_to_text(out, e);
+		pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]);
+	}
+}
+
+#define RESERVE_FACTOR	6
+
+/*
+ * Inflate @r by 1/2^RESERVE_FACTOR of itself, where that extra share is
+ * computed from @r rounded up to a multiple of 2^RESERVE_FACTOR.
+ */
+static u64 reserve_factor(u64 r)
+{
+	u64 rounded = round_up(r, (1 << RESERVE_FACTOR));
+
+	return r + (rounded >> RESERVE_FACTOR);
+}
+
+/*
+ * Scale @r by 2^RESERVE_FACTOR / (2^RESERVE_FACTOR + 1), rounding down.
+ */
+static u64 avail_factor(u64 r)
+{
+	u64 scaled = r << RESERVE_FACTOR;
+
+	return scaled / ((1 << RESERVE_FACTOR) + 1);
+}
+
+/*
+ * Sectors counted as used in @fs_usage: hidden + btree + data, plus the
+ * reserve_factor()-inflated sum of reserved and online_reserved. The result
+ * is capped at c->capacity.
+ */
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
+{
+	u64 reserved = reserve_factor(fs_usage->reserved +
+				      fs_usage->online_reserved);
+	u64 used = fs_usage->hidden +
+		   fs_usage->btree +
+		   fs_usage->data +
+		   reserved;
+
+	return min(used, c->capacity);
+}
+
+/*
+ * Build a condensed usage summary from individual counter reads.
+ *
+ * capacity is c->capacity minus hidden; used is data + btree plus the
+ * reserve_factor()-inflated (reserved + online_reserved), capped at capacity;
+ * free is the remainder; nr_inodes is read as is.
+ *
+ * Each counter is fetched with bch2_fs_usage_read_one(), which asserts that
+ * c->mark_lock is held.
+ */
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
+{
+	struct bch_fs_usage_short ret;
+	u64 data, reserved;
+
+	ret.capacity = c->capacity -
+		bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+
+	data		= bch2_fs_usage_read_one(c, &c->usage_base->data) +
+		bch2_fs_usage_read_one(c, &c->usage_base->btree);
+	reserved	= bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+		bch2_fs_usage_read_one(c, &c->usage_base->online_reserved);
+
+	/* capping at capacity keeps ret.free from going negative */
+	ret.used	= min(ret.capacity, data + reserve_factor(reserved));
+	ret.free	= ret.capacity - ret.used;
+
+	ret.nr_inodes	= bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+
+	return ret;
+}
+
+/*
+ * Locked wrapper: takes c->mark_lock for read around
+ * __bch2_fs_usage_read_short() and returns its summary by value.
+ */
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *c)
+{
+	struct bch_fs_usage_short snapshot;
+
+	percpu_down_read(&c->mark_lock);
+	snapshot = __bch2_fs_usage_read_short(c);
+	percpu_up_read(&c->mark_lock);
+
+	return snapshot;
+}
+
+/* Returns 1 when is_available_bucket() rejects @m, 0 otherwise. */
+static inline int is_unavailable_bucket(struct bucket_mark m)
+{
+	return is_available_bucket(m) ? 0 : 1;
+}
+
+/*
+ * Number of sectors in the bucket not covered by bucket_sectors_used(),
+ * clamped at zero. Only buckets that are not owned by the allocator, hold
+ * BCH_DATA_USER data and have a nonzero used count qualify; every other
+ * bucket yields 0.
+ */
+static inline int is_fragmented_bucket(struct bucket_mark m,
+				       struct bch_dev *ca)
+{
+	int unused_sectors;
+
+	if (m.owned_by_allocator ||
+	    m.data_type != BCH_DATA_USER ||
+	    !bucket_sectors_used(m))
+		return 0;
+
+	unused_sectors = (int) ca->mi.bucket_size - bucket_sectors_used(m);
+
+	return max_t(int, 0, unused_sectors);
+}
+
+/*
+ * Data type used for accounting a bucket: BCH_DATA_CACHED when the mark has
+ * cached sectors and no dirty sectors, otherwise the mark's own data_type.
+ */
+static inline enum bch_data_type bucket_type(struct bucket_mark m)
+{
+	if (m.cached_sectors && !m.dirty_sectors)
+		return BCH_DATA_CACHED;
+
+	return m.data_type;
+}
+
+/* True when @old passed is_available_bucket() but @new does not. */
+static bool bucket_became_unavailable(struct bucket_mark old,
+				      struct bucket_mark new)
+{
+	if (!is_available_bucket(old))
+		return false;
+
+	return !is_available_bucket(new);
+}
+
+/*
+ * Fold the deltas collected in @fs_usage into this cpu's usage counters for
+ * @journal_seq, charging newly used space against @disk_res.
+ *
+ * "added" is fs_usage->data + fs_usage->reserved. Any part of it that exceeds
+ * the reservation (all of it when @disk_res is NULL) triggers a one-time
+ * warning, is subtracted from c->sectors_available directly, and makes the
+ * function return -1. The covered part is deducted from both the reservation
+ * and fs_usage->online_reserved.
+ *
+ * Requires c->mark_lock to be held. Returns 0, or -1 if usage grew without a
+ * sufficient reservation.
+ */
+int bch2_fs_usage_apply(struct bch_fs *c,
+			struct bch_fs_usage *fs_usage,
+			struct disk_reservation *disk_res,
+			unsigned journal_seq)
+{
+	s64 added = fs_usage->data + fs_usage->reserved;
+	s64 should_not_have_added;
+	int ret = 0;
+
+	percpu_rwsem_assert_held(&c->mark_lock);
+
+	/*
+	 * Not allowed to reduce sectors_available except by getting a
+	 * reservation:
+	 */
+	should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
+	if (WARN_ONCE(should_not_have_added > 0,
+		      "disk usage increased by %lli without a reservation",
+		      should_not_have_added)) {
+		atomic64_sub(should_not_have_added, &c->sectors_available);
+		added -= should_not_have_added;
+		ret = -1;
+	}
+
+	/*
+	 * With a NULL disk_res, any positive "added" was reduced to zero
+	 * above, so disk_res is only dereferenced when it is non-NULL.
+	 */
+	if (added > 0) {
+		disk_res->sectors		-= added;
+		fs_usage->online_reserved	-= added;
+	}
+
+	/* the per-cpu pointer is only stable while preemption is off */
+	preempt_disable();
+	acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false),
+		 (u64 *) fs_usage, fs_usage_u64s(c));
+	preempt_enable();
+
+	return ret;
+}
+
+/*
+ * Adjust the per-type bucket count in @dev_usage by @nr. For superblock and
+ * journal buckets, additionally adjust fs_usage->hidden by @size.
+ */
+static inline void account_bucket(struct bch_fs_usage *fs_usage,
+				  struct bch_dev_usage *dev_usage,
+				  enum bch_data_type type,
+				  int nr, s64 size)
+{
+	bool hidden = type == BCH_DATA_SB || type == BCH_DATA_JOURNAL;
+
+	dev_usage->buckets[type] += nr;
+
+	if (hidden)
+		fs_usage->hidden += size;
+}
+
+/*
+ * Update the per-device usage counters (and fs_usage->hidden, through
+ * account_bucket()) for one bucket whose mark changed from @old to @new.
+ *
+ * @gc selects which of the device's per-cpu usage sets is updated
+ * (ca->usage[gc]). Requires c->mark_lock. If the bucket went from unavailable
+ * to available, the device's allocator is woken afterwards.
+ */
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+				  struct bch_fs_usage *fs_usage,
+				  struct bucket_mark old, struct bucket_mark new,
+				  bool gc)
+{
+	struct bch_dev_usage *dev_usage;
+
+	percpu_rwsem_assert_held(&c->mark_lock);
+
+	preempt_disable();
+	dev_usage = this_cpu_ptr(ca->usage[gc]);
+
+	/* a bucket_type() of zero is not counted in buckets[] at all */
+	if (bucket_type(old))
+		account_bucket(fs_usage, dev_usage, bucket_type(old),
+			       -1, -ca->mi.bucket_size);
+
+	if (bucket_type(new))
+		account_bucket(fs_usage, dev_usage, bucket_type(new),
+			       1, ca->mi.bucket_size);
+
+	dev_usage->buckets_alloc +=
+		(int) new.owned_by_allocator - (int) old.owned_by_allocator;
+	dev_usage->buckets_ec +=
+		(int) new.stripe - (int) old.stripe;
+	dev_usage->buckets_unavailable +=
+		is_unavailable_bucket(new) - is_unavailable_bucket(old);
+
+	/*
+	 * Dirty sectors are tracked under the mark's raw data_type; cached
+	 * sectors always go to the BCH_DATA_CACHED slot.
+	 */
+	dev_usage->sectors[old.data_type] -= old.dirty_sectors;
+	dev_usage->sectors[new.data_type] += new.dirty_sectors;
+	dev_usage->sectors[BCH_DATA_CACHED] +=
+		(int) new.cached_sectors - (int) old.cached_sectors;
+	dev_usage->sectors_fragmented +=
+		is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
+	preempt_enable();
+
+	if (!is_available_bucket(old) && is_available_bucket(new))
+		bch2_wake_allocator(ca);
+}
+
+/*
+ * Recompute device usage from the bucket marks themselves.
+ *
+ * Resets c->usage_base->hidden and, for every member device, zeroes all
+ * per-cpu instances of ca->usage[0]; then replays every bucket through
+ * bch2_dev_usage_update() as a transition from an all-zero mark to the
+ * bucket's current mark.
+ *
+ * NOTE(review): bch2_dev_usage_update() asserts c->mark_lock is held, so the
+ * caller presumably holds it; confirm.
+ */
+void bch2_dev_usage_from_buckets(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	/* the "previous" state used for every bucket: an empty mark */
+	struct bucket_mark old = { .v.counter = 0 };
+	struct bucket_array *buckets;
+	struct bucket *g;
+	unsigned i;
+	int cpu;
+
+	c->usage_base->hidden = 0;
+
+	for_each_member_device(ca, c, i) {
+		for_each_possible_cpu(cpu)
+			memset(per_cpu_ptr(ca->usage[0], cpu), 0,
+			       sizeof(*ca->usage[0]));
+
+		buckets = bucket_array(ca);
+
+		for_each_bucket(g, buckets)
+			bch2_dev_usage_update(c, ca, c->usage_base,
+					      old, g->mark, false);
+	}
+}
+
+/*
+ * Add @sectors to the counters in @fs_usage that belong to replicas entry @r:
+ * its slot in replicas[] and, for btree, user and cached data, the matching
+ * summary counter.
+ *
+ * Returns -1 if bch2_replicas_entry_idx() does not find @r, 0 otherwise. A
+ * NULL @fs_usage makes this a pure existence check.
+ */
+static inline int update_replicas(struct bch_fs *c,
+				  struct bch_fs_usage *fs_usage,
+				  struct bch_replicas_entry *r,
+				  s64 sectors)
+{
+	int slot = bch2_replicas_entry_idx(c, r);
+
+	if (slot < 0)
+		return -1;
+
+	if (fs_usage) {
+		if (r->data_type == BCH_DATA_BTREE)
+			fs_usage->btree		+= sectors;
+		else if (r->data_type == BCH_DATA_USER)
+			fs_usage->data		+= sectors;
+		else if (r->data_type == BCH_DATA_CACHED)
+			fs_usage->cached	+= sectors;
+
+		fs_usage->replicas[slot]	+= sectors;
+	}
+
+	return 0;
+}
+
+/*
+ * Apply @sectors to the replicas entry that bch2_replicas_entry_cached()
+ * builds for device @dev. The result of update_replicas() is not checked.
+ */
+static inline void update_cached_sectors(struct bch_fs *c,
+					 struct bch_fs_usage *fs_usage,
+					 unsigned dev, s64 sectors)
+{
+	struct bch_replicas_padded cached;
+
+	bch2_replicas_entry_cached(&cached.e, dev);
+	update_replicas(c, fs_usage, &cached.e, sectors);
+}
+
+/*
+ * Make sure trans->fs_usage_deltas exists and has room for @more additional
+ * bytes of deltas, (re)allocating it when necessary.
+ *
+ * The first allocation is 128 bytes of payload; later ones grow to
+ * (old size + @more) * 2. Allocation failure is fatal (BUG_ON).
+ *
+ * Returns the (possibly moved) list.
+ */
+static struct replicas_delta_list *
+replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
+{
+	struct replicas_delta_list *d = trans->fs_usage_deltas;
+	/*
+	 * NOTE(review): the initial size ignores @more - presumably a single
+	 * delta is always well under 128 bytes; confirm.
+	 */
+	unsigned new_size = d ? (d->size + more) * 2 : 128;
+
+	if (!d || d->used + more > d->size) {
+		d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO);
+		BUG_ON(!d);
+
+		d->size = new_size;
+		trans->fs_usage_deltas = d;
+	}
+	return d;
+}
+
+/*
+ * Append a (replicas entry, sector delta) record to the transaction's list of
+ * pending usage deltas. A zero @sectors is ignored.
+ *
+ * Each record occupies replicas_entry_bytes(r) + 8 bytes; the list is grown
+ * through replicas_deltas_realloc() before the record is written.
+ */
+static inline void update_replicas_list(struct btree_trans *trans,
+					struct bch_replicas_entry *r,
+					s64 sectors)
+{
+	struct replicas_delta_list *d;
+	struct replicas_delta *n;
+	unsigned b;
+
+	if (!sectors)
+		return;
+
+	/* same stride that replicas_delta_next() uses when walking the list */
+	b = replicas_entry_bytes(r) + 8;
+	d = replicas_deltas_realloc(trans, b);
+
+	/* new record goes at the current end of the used area */
+	n = (void *) d->d + d->used;
+	n->delta = sectors;
+	memcpy(&n->r, r, replicas_entry_bytes(r));
+	d->used += b;
+}
+
+/*
+ * Queue a sector delta on the transaction's delta list for the replicas entry
+ * that bch2_replicas_entry_cached() builds for device @dev.
+ */
+static inline void update_cached_sectors_list(struct btree_trans *trans,
+					      unsigned dev, s64 sectors)
+{
+	struct bch_replicas_padded cached;
+
+	bch2_replicas_entry_cached(&cached.e, dev);
+	update_replicas_list(trans, &cached.e, sectors);
+}
+
+/*
+ * Step to the record following @d in a delta list: records are variable
+ * sized, replicas_entry_bytes() of the embedded entry plus 8 bytes.
+ */
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
+	void *base = d;
+
+	return base + replicas_entry_bytes(&d->r) + 8;
+}
+
+/*
+ * Apply every delta recorded in @r to @fs_usage.
+ *
+ * Replicas deltas are applied one by one through update_replicas(). If one of
+ * them fails (its replicas entry is not found), the deltas already applied
+ * are reverted and -1 is returned. On success the list's nr_inodes and
+ * persistent_reserved[] deltas are added as well and 0 is returned.
+ *
+ * With a NULL @fs_usage nothing is modified; the call only checks that every
+ * replicas entry in the list can be found.
+ */
+int bch2_replicas_delta_list_apply(struct bch_fs *c,
+				   struct bch_fs_usage *fs_usage,
+				   struct replicas_delta_list *r)
+{
+	struct replicas_delta *d = r->d;
+	/* one past the last used record */
+	struct replicas_delta *top = (void *) r->d + r->used;
+	unsigned i;
+
+	for (d = r->d; d != top; d = replicas_delta_next(d))
+		if (update_replicas(c, fs_usage, &d->r, d->delta)) {
+			/* remember where we stopped so unwind reverts only what ran */
+			top = d;
+			goto unwind;
+		}
+
+	if (!fs_usage)
+		return 0;
+
+	fs_usage->nr_inodes += r->nr_inodes;
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+		fs_usage->reserved += r->persistent_reserved[i];
+		fs_usage->persistent_reserved[i] += r->persistent_reserved[i];
+	}
+
+	return 0;
+unwind:
+	for (d = r->d; d != top; d = replicas_delta_next(d))
+		update_replicas(c, fs_usage, &d->r, -d->delta);
+	return -1;
+}
+
+/*
+ * do_mark_fn - run a marking function against the regular and/or gc state
+ *
+ * Calls fn(c, <extra args>, gc) for gc = 0 and gc = 1 as follows:
+ *  - the pass whose gc value matches whether BCH_BUCKET_MARK_GC is set in
+ *    @flags always runs;
+ *  - the gc = 1 pass additionally runs when gc_visited(c, pos) is true.
+ *
+ * Iteration stops after the first pass that returns nonzero; the macro
+ * evaluates to the last value returned by fn (0 if fn never ran).
+ * Requires c->mark_lock to be held.
+ *
+ * Arguments are parenthesized on use so that expression arguments (e.g. a
+ * flags value built with | or ?:) expand with the intended precedence. Note
+ * that @c and @flags are still evaluated more than once.
+ */
+#define do_mark_fn(fn, c, pos, flags, ...)				\
+({									\
+	int gc, ret = 0;						\
+									\
+	percpu_rwsem_assert_held(&(c)->mark_lock);			\
+									\
+	for (gc = 0; gc < 2 && !ret; gc++)				\
+		if (!gc == !((flags) & BCH_BUCKET_MARK_GC) ||		\
+		    (gc && gc_visited((c), (pos))))			\
+			ret = fn((c), __VA_ARGS__, gc);			\
+	ret;								\
+})
+
+/*
+ * Invalidate bucket @b of device @ca in either the regular (@gc false) or gc
+ * (@gc true) bucket state.
+ *
+ * The mark is atomically rewritten to: owned by the allocator, no data type,
+ * zero cached and dirty sectors, generation incremented. It is a BUG if the
+ * bucket was not available beforehand. Device usage is updated and any cached
+ * sectors the bucket held are subtracted from the cached-replicas accounting.
+ *
+ * The previous mark is stored in *@ret only for the non-gc pass. Always
+ * returns 0.
+ */
+static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+				    size_t b, struct bucket_mark *ret,
+				    bool gc)
+{
+	struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
+	struct bucket *g = __bucket(ca, b, gc);
+	struct bucket_mark old, new;
+
+	old = bucket_cmpxchg(g, new, ({
+		BUG_ON(!is_available_bucket(new));
+
+		new.owned_by_allocator	= true;
+		new.data_type		= 0;
+		new.cached_sectors	= 0;
+		new.dirty_sectors	= 0;
+		new.gen++;
+	}));
+
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
+	if (old.cached_sectors)
+		update_cached_sectors(c, fs_usage, ca->dev_idx,
+				      -((s64) old.cached_sectors));
+
+	if (!gc)
+		*ret = old;
+	return 0;
+}
+
+/*
+ * Invalidate bucket @b of device @ca and report its previous mark in *@old.
+ *
+ * Runs __bch2_invalidate_bucket() through do_mark_fn() with flags 0 and gc
+ * position GC_PHASE_START, so the regular state is always updated and the gc
+ * state is updated too when gc_visited() says so. Emits an invalidate trace
+ * event when the bucket was not allocator-owned and held cached sectors.
+ */
+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+			    size_t b, struct bucket_mark *old)
+{
+	do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
+		   ca, b, old);
+
+	if (!old->owned_by_allocator && old->cached_sectors)
+		trace_invalidate(ca, bucket_to_sector(ca, b),
+				 old->cached_sectors);
+}
+
+/*
+ * Set or clear the owned_by_allocator bit of bucket @b in the regular or gc
+ * bucket state (selected by @gc) and update device usage accordingly.
+ *
+ * For the non-gc state it is a BUG to clear the bit on a bucket that did not
+ * have it set. Always returns 0.
+ */
+static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+				    size_t b, bool owned_by_allocator,
+				    bool gc)
+{
+	struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
+	struct bucket *g = __bucket(ca, b, gc);
+	struct bucket_mark old, new;
+
+	old = bucket_cmpxchg(g, new, ({
+		new.owned_by_allocator	= owned_by_allocator;
+	}));
+
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
+	/* checked after the update has already been applied */
+	BUG_ON(!gc &&
+	       !owned_by_allocator && !old.owned_by_allocator);
+
+	return 0;
+}
+
+/*
+ * Mark bucket @b of device @ca as owned / not owned by the allocator.
+ *
+ * Dispatches __bch2_mark_alloc_bucket() through do_mark_fn(), which decides
+ * from @flags and @pos whether the regular state, the gc state, or both are
+ * updated. Preemption is disabled around the call because the helper works on
+ * this cpu's usage counters.
+ */
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+			    size_t b, bool owned_by_allocator,
+			    struct gc_pos pos, unsigned flags)
+{
+	preempt_disable();
+
+	do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
+		   ca, b, owned_by_allocator);
+
+	preempt_enable();
+}
+
+/*
+ * Apply an alloc key @k to the in-memory bucket it describes.
+ *
+ * The key's position selects the device (inode) and bucket (offset). The
+ * bucket mark's gen, data type and sector counts are overwritten from the
+ * unpacked key, and the journal sequence is recorded when @journal_seq is
+ * nonzero; the bucket's io times, oldest_gen and gen_valid are updated too.
+ *
+ * Nothing is done for gc marking unless BCH_BUCKET_MARK_BUCKET_INVALIDATE is
+ * also set, nor for bucket numbers beyond the device's nbuckets. Device usage
+ * is not updated when BCH_BUCKET_MARK_ALLOC_READ is set.
+ *
+ * Always returns 0.
+ */
+static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
+			   struct bch_fs_usage *fs_usage,
+			   u64 journal_seq, unsigned flags)
+{
+	bool gc = flags & BCH_BUCKET_MARK_GC;
+	struct bkey_alloc_unpacked u;
+	struct bch_dev *ca;
+	struct bucket *g;
+	struct bucket_mark old, m;
+
+	/*
+	 * alloc btree is read in by bch2_alloc_read, not gc:
+	 */
+	if ((flags & BCH_BUCKET_MARK_GC) &&
+	    !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE))
+		return 0;
+
+	ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+	/* silently ignore keys for buckets past the end of the device */
+	if (k.k->p.offset >= ca->mi.nbuckets)
+		return 0;
+
+	g = __bucket(ca, k.k->p.offset, gc);
+	u = bch2_alloc_unpack(k);
+
+	old = bucket_cmpxchg(g, m, ({
+		m.gen			= u.gen;
+		m.data_type		= u.data_type;
+		m.dirty_sectors		= u.dirty_sectors;
+		m.cached_sectors	= u.cached_sectors;
+
+		if (journal_seq) {
+			m.journal_seq_valid	= 1;
+			m.journal_seq		= journal_seq;
+		}
+	}));
+
+	if (!(flags & BCH_BUCKET_MARK_ALLOC_READ))
+		bch2_dev_usage_update(c, ca, fs_usage, old, m, gc);
+
+	g->io_time[READ]	= u.read_time;
+	g->io_time[WRITE]	= u.write_time;
+	g->oldest_gen		= u.oldest_gen;
+	g->gen_valid		= 1;
+
+	/*
+	 * need to know if we're getting called from the invalidate path or
+	 * not:
+	 */
+
+	/* on invalidate, the cached data the bucket used to hold is gone */
+	if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) &&
+	    old.cached_sectors) {
+		update_cached_sectors(c, fs_usage, ca->dev_idx,
+				      -old.cached_sectors);
+		trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
+				 old.cached_sectors);
+	}
+
+	return 0;
+}
+
+/*
+ * checked_add - add @b to the lvalue @a, saturating at U16_MAX
+ *
+ * The sum is truncated to unsigned; if that value exceeds U16_MAX, @a is set
+ * to U16_MAX instead. Evaluates to true when clamping happened. Because the
+ * comparison is done on the unsigned result, a sum that goes negative is
+ * also reported as overflow. @a is evaluated twice.
+ */
+#define checked_add(a, b)					\
+({								\
+	unsigned _res = (unsigned) (a) + (b);			\
+	bool overflow = _res > U16_MAX;				\
+	if (overflow)						\
+		_res = U16_MAX;					\
+	(a) = _res;						\
+	overflow;						\
+})
+
+/*
+ * Account @sectors of superblock or journal data in bucket @b of device @ca,
+ * in the regular or gc bucket state (selected by @gc).
+ *
+ * The bucket's data type is set to @type and @sectors is added to its dirty
+ * sector count, saturating at U16_MAX. A filesystem inconsistency is flagged
+ * if the bucket already held a different data type or if the count
+ * overflowed. Device usage is only updated when @c is non-NULL.
+ *
+ * NOTE(review): bch2_fs_inconsistent_on() is passed @c even on the path where
+ * @c may be NULL - presumably those conditions cannot trigger there; confirm.
+ *
+ * Always returns 0.
+ */
+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+				       size_t b, enum bch_data_type type,
+				       unsigned sectors, bool gc)
+{
+	struct bucket *g = __bucket(ca, b, gc);
+	struct bucket_mark old, new;
+	bool overflow;
+
+	BUG_ON(type != BCH_DATA_SB &&
+	       type != BCH_DATA_JOURNAL);
+
+	old = bucket_cmpxchg(g, new, ({
+		new.data_type	= type;
+		overflow = checked_add(new.dirty_sectors, sectors);
+	}));
+
+	bch2_fs_inconsistent_on(old.data_type &&
+				old.data_type != type, c,
+		"different types of data in same bucket: %s, %s",
+		bch2_data_types[old.data_type],
+		bch2_data_types[type]);
+
+	bch2_fs_inconsistent_on(overflow, c,
+		"bucket sector count overflow: %u + %u > U16_MAX",
+		old.dirty_sectors, sectors);
+
+	if (c)
+		bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
+				      old, new, gc);
+
+	return 0;
+}
+
+/*
+ * Account @sectors of superblock or journal data in bucket @b of @ca; any
+ * other @type is a BUG.
+ *
+ * With a filesystem (@c non-NULL) the work is dispatched through
+ * do_mark_fn(), which picks the regular and/or gc state from @flags and @pos.
+ * Without one, only the regular bucket state is updated. Preemption is
+ * disabled around the update.
+ */
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+			       size_t b, enum bch_data_type type,
+			       unsigned sectors, struct gc_pos pos,
+			       unsigned flags)
+{
+	BUG_ON(!(type == BCH_DATA_SB ||
+		 type == BCH_DATA_JOURNAL));
+
+	preempt_disable();
+
+	if (unlikely(!c))
+		__bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
+	else
+		do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
+			   ca, b, type, sectors);
+
+	preempt_enable();
+}
+
+/*
+ * Scale @sectors by the ratio @n / @d, rounding up.
+ *
+ * NOTE(review): sectors * n is computed in unsigned int arithmetic before
+ * being widened to s64 - presumably callers keep the operands small enough
+ * that the product cannot wrap; confirm.
+ */
+static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors)
+{
+	return DIV_ROUND_UP(sectors * n, d);
+}
+
+/*
+ * Compute the change in scaled (on-disk) sectors for an extent of @old_size
+ * live sectors, using the scaling ratio @n / @d.
+ *
+ * - BCH_BUCKET_MARK_OVERWRITE_SPLIT: the extent is split; the result is the
+ *   scaled size of the two remaining pieces (@offset sectors in front, and
+ *   @old_size - @offset + @delta behind) minus the scaled old size.
+ * - BCH_BUCKET_MARK_OVERWRITE: the extent shrinks by -@delta; the result is
+ *   the scaled new size minus the scaled old size.
+ * - otherwise: simply @delta scaled.
+ *
+ * Each piece is rounded up separately by disk_sectors_scaled(). In the two
+ * overwrite cases @delta is expected to be negative, and
+ * @offset + -@delta must not exceed @old_size.
+ */
+static s64 __ptr_disk_sectors_delta(unsigned old_size,
+				    unsigned offset, s64 delta,
+				    unsigned flags,
+				    unsigned n, unsigned d)
+{
+	BUG_ON(!n || !d);
+
+	if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) {
+		BUG_ON(offset + -delta > old_size);
+
+		return -disk_sectors_scaled(n, d, old_size) +
+			disk_sectors_scaled(n, d, offset) +
+			disk_sectors_scaled(n, d, old_size - offset + delta);
+	} else if (flags & BCH_BUCKET_MARK_OVERWRITE) {
+		BUG_ON(offset + -delta > old_size);
+
+		return -disk_sectors_scaled(n, d, old_size) +
+			disk_sectors_scaled(n, d, old_size + delta);
+	} else {
+		return  disk_sectors_scaled(n, d, delta);
+	}
+}
+
+/*
+ * Scaled sector delta for a decoded extent pointer: the pointer's crc fields
+ * provide the live size and the compressed / uncompressed scaling ratio for
+ * __ptr_disk_sectors_delta().
+ */
+static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
+				  unsigned offset, s64 delta,
+				  unsigned flags)
+{
+	unsigned live_size	= p.crc.live_size;
+	unsigned numerator	= p.crc.compressed_size;
+	unsigned denominator	= p.crc.uncompressed_size;
+
+	return __ptr_disk_sectors_delta(live_size, offset, delta, flags,
+					numerator, denominator);
+}
+
+/*
+ * Set or clear the stripe bit on every bucket referenced by stripe @v.
+ *
+ * The bit is set unless BCH_BUCKET_MARK_OVERWRITE is in @flags, in which case
+ * it is cleared. The regular or gc bucket state is chosen by
+ * BCH_BUCKET_MARK_GC. A nonzero @journal_seq is recorded in each mark. Device
+ * usage is updated per bucket, and a pointer whose gen does not match the
+ * bucket's previous gen is reported as an fsck error.
+ */
+static void bucket_set_stripe(struct bch_fs *c,
+			      const struct bch_stripe *v,
+			      struct bch_fs_usage *fs_usage,
+			      u64 journal_seq,
+			      unsigned flags)
+{
+	bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE);
+	bool gc = flags & BCH_BUCKET_MARK_GC;
+	unsigned i;
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		const struct bch_extent_ptr *ptr = v->ptrs + i;
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+		struct bucket_mark new, old;
+
+		old = bucket_cmpxchg(g, new, ({
+			new.stripe			= enabled;
+			if (journal_seq) {
+				new.journal_seq_valid	= 1;
+				new.journal_seq		= journal_seq;
+			}
+		}));
+
+		bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
+		/*
+		 * XXX write repair code for these, flag stripe as possibly bad
+		 */
+		if (old.gen != ptr->gen)
+			bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+				      "stripe with stale pointer");
+#if 0
+		/*
+		 * We'd like to check for these, but these checks don't work
+		 * yet:
+		 */
+		if (old.stripe && enabled)
+			bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+				      "multiple stripes using same bucket");
+
+		if (!old.stripe && !enabled)
+			bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+				      "deleting stripe but bucket not marked as stripe bucket");
+#endif
+	}
+}
+
+/*
+ * Apply a sector delta to the bucket that extent pointer @p points into.
+ *
+ * @sectors is added to the bucket's cached or dirty sector count, depending
+ * on whether the pointer is cached, saturating at U16_MAX. When both counts
+ * end up zero the bucket's data type is cleared (and a nonzero @journal_seq
+ * recorded in the mark); otherwise the data type is set to @data_type.
+ *
+ * The mark is updated with a cmpxchg loop, or with a plain store when
+ * BCH_BUCKET_MARK_NOATOMIC is set. BCH_BUCKET_MARK_GC selects the gc bucket
+ * state.
+ *
+ * Returns true, without modifying anything, when the pointer's gen does not
+ * match the bucket's gen (stale pointer, or pointer gen in the future);
+ * returns false after a successful update.
+ */
+static bool bch2_mark_pointer(struct bch_fs *c,
+			      struct extent_ptr_decoded p,
+			      s64 sectors, enum bch_data_type data_type,
+			      struct bch_fs_usage *fs_usage,
+			      u64 journal_seq, unsigned flags)
+{
+	bool gc = flags & BCH_BUCKET_MARK_GC;
+	struct bucket_mark old, new;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+	struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
+	bool overflow;
+	u64 v;
+
+	v = atomic64_read(&g->_mark.v);
+	do {
+		new.v.counter = old.v.counter = v;
+
+		/*
+		 * Check this after reading bucket mark to guard against
+		 * the allocator invalidating a bucket after we've already
+		 * checked the gen
+		 */
+		if (gen_after(p.ptr.gen, new.gen)) {
+			bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+				      "pointer gen in the future");
+			return true;
+		}
+
+		if (new.gen != p.ptr.gen) {
+			/* XXX write repair code for this */
+			if (!p.ptr.cached &&
+			    test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+				bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+					      "stale dirty pointer");
+			return true;
+		}
+
+		if (!p.ptr.cached)
+			overflow = checked_add(new.dirty_sectors, sectors);
+		else
+			overflow = checked_add(new.cached_sectors, sectors);
+
+		/* bucket became empty: forget what kind of data it held */
+		if (!new.dirty_sectors &&
+		    !new.cached_sectors) {
+			new.data_type	= 0;
+
+			if (journal_seq) {
+				new.journal_seq_valid = 1;
+				new.journal_seq = journal_seq;
+			}
+		} else {
+			new.data_type = data_type;
+		}
+
+		/* non-atomic mode: store directly and skip the cmpxchg */
+		if (flags & BCH_BUCKET_MARK_NOATOMIC) {
+			g->_mark = new;
+			break;
+		}
+	} while ((v = atomic64_cmpxchg(&g->_mark.v,
+			      old.v.counter,
+			      new.v.counter)) != old.v.counter);
+
+	if (old.data_type && old.data_type != data_type)
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s",
+			p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+			new.gen,
+			bch2_data_types[old.data_type],
+			bch2_data_types[data_type]);
+
+	bch2_fs_inconsistent_on(overflow, c,
+		"bucket sector count overflow: %u + %lli > U16_MAX",
+		!p.ptr.cached
+		? old.dirty_sectors
+		: old.cached_sectors, sectors);
+
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
+	/* outside of gc, marking a pointer must never make a bucket unavailable */
+	BUG_ON(!gc && bucket_became_unavailable(old, new));
+
+	return false;
+}
+
+/*
+ * Apply a sector delta to one block of the in-memory stripe that erasure
+ * coded pointer @p refers to.
+ *
+ * The stripe is looked up by p.idx in the regular or gc stripe table
+ * (BCH_BUCKET_MARK_GC). Under c->ec_stripes_heap_lock, @sectors is added to
+ * the stripe's block_sectors[p.block], blocks_nonempty is adjusted if the
+ * block changed between empty and non-empty (which also updates the stripes
+ * heap outside of gc), and the stripe is flagged dirty.
+ *
+ * On success, *@nr_data, *@nr_parity and *@r are filled in from the stripe
+ * and 0 is returned. Returns -EIO if the stripe does not exist or is not
+ * alive.
+ */
+static int bch2_mark_stripe_ptr(struct bch_fs *c,
+				struct bch_extent_stripe_ptr p,
+				enum bch_data_type data_type,
+				struct bch_fs_usage *fs_usage,
+				s64 sectors, unsigned flags,
+				struct bch_replicas_padded *r,
+				unsigned *nr_data,
+				unsigned *nr_parity)
+{
+	bool gc = flags & BCH_BUCKET_MARK_GC;
+	struct stripe *m;
+	unsigned old, new;
+	int blocks_nonempty_delta;
+
+	m = genradix_ptr(&c->stripes[gc], p.idx);
+
+	spin_lock(&c->ec_stripes_heap_lock);
+
+	if (!m || !m->alive) {
+		spin_unlock(&c->ec_stripes_heap_lock);
+		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
+				    (u64) p.idx);
+		return -EIO;
+	}
+
+	BUG_ON(m->r.e.data_type != data_type);
+
+	*nr_data	= m->nr_blocks - m->nr_redundant;
+	*nr_parity	= m->nr_redundant;
+	*r = m->r;
+
+	old = m->block_sectors[p.block];
+	m->block_sectors[p.block] += sectors;
+	new = m->block_sectors[p.block];
+
+	/* +1 if the block went empty -> non-empty, -1 for the reverse, else 0 */
+	blocks_nonempty_delta = (int) !!new - (int) !!old;
+	if (blocks_nonempty_delta) {
+		m->blocks_nonempty += blocks_nonempty_delta;
+
+		if (!gc)
+			bch2_stripes_heap_update(c, m, p.idx);
+	}
+
+	m->dirty = true;
+
+	spin_unlock(&c->ec_stripes_heap_lock);
+
+	return 0;
+}
+
+/*
+ * Apply a sector delta for key @k (an extent or btree pointer) to every
+ * pointer it contains and to the replicas accounting in @fs_usage.
+ *
+ * @sectors must be nonzero. For btree data it is applied to each pointer as
+ * is; for other data it is first converted to on-disk sectors per pointer by
+ * ptr_disk_sectors_delta(). Each pointer's bucket is updated through
+ * bch2_mark_pointer(), then:
+ *  - cached pointers update the cached-replicas accounting, unless the
+ *    pointer turned out to be stale;
+ *  - plain dirty pointers are collected into one replicas entry that is
+ *    accounted once after the loop;
+ *  - erasure coded pointers update their stripe and are accounted, data plus
+ *    a proportional share of parity, against the stripe's replicas entry.
+ *
+ * @journal_seq is a u64, like in the caller and in bch2_mark_pointer(), so
+ * that the sequence number is not truncated on its way through here.
+ *
+ * Returns 0, or the error from bch2_mark_stripe_ptr(); in the error case the
+ * pointers processed so far remain marked.
+ */
+static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
+			    unsigned offset, s64 sectors,
+			    enum bch_data_type data_type,
+			    struct bch_fs_usage *fs_usage,
+			    u64 journal_seq, unsigned flags)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_replicas_padded r;
+	s64 dirty_sectors = 0;
+	int ret;
+
+	/* replicas entry accumulating the non-EC dirty pointers */
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
+	BUG_ON(!sectors);
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		s64 disk_sectors = data_type == BCH_DATA_BTREE
+			? sectors
+			: ptr_disk_sectors_delta(p, offset, sectors, flags);
+		bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
+					       fs_usage, journal_seq, flags);
+
+		if (p.ptr.cached) {
+			if (!stale)
+				update_cached_sectors(c, fs_usage, p.ptr.dev,
+						      disk_sectors);
+		} else if (!p.has_ec) {
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		} else {
+			struct bch_replicas_padded ec_r;
+			unsigned nr_data, nr_parity;
+			s64 parity_sectors;
+
+			ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
+					fs_usage, disk_sectors, flags,
+					&ec_r, &nr_data, &nr_parity);
+			if (ret)
+				return ret;
+
+			/* same delta, scaled further by nr_parity / nr_data */
+			parity_sectors =
+				__ptr_disk_sectors_delta(p.crc.live_size,
+					offset, sectors, flags,
+					p.crc.compressed_size * nr_parity,
+					p.crc.uncompressed_size * nr_data);
+
+			update_replicas(c, fs_usage, &ec_r.e,
+					disk_sectors + parity_sectors);
+
+			/*
+			 * There may be other dirty pointers in this extent, but
+			 * if so they're not required for mounting if we have an
+			 * erasure coded pointer in this extent:
+			 */
+			r.e.nr_required = 0;
+		}
+	}
+
+	if (r.e.nr_devs)
+		update_replicas(c, fs_usage, &r.e, dirty_sectors);
+
+	return 0;
+}
+
+/*
+ * Create/refresh or tear down the in-memory state for stripe key @k.
+ *
+ * The stripe index is the key's offset; BCH_BUCKET_MARK_GC selects the gc
+ * stripe table. Under c->ec_stripes_heap_lock:
+ *  - without BCH_BUCKET_MARK_OVERWRITE, the in-memory stripe is filled in
+ *    from the key (outside of gc this includes per-block sector counts),
+ *    the stripes heap is updated (non-gc) and the stripe is marked alive;
+ *  - with BCH_BUCKET_MARK_OVERWRITE, the stripe is removed from the heap
+ *    (non-gc) and its in-memory state is zeroed.
+ * Afterwards bucket_set_stripe() sets or clears the stripe bit on the
+ * stripe's buckets; it is passed a journal sequence of 0 rather than
+ * @journal_seq.
+ *
+ * Returns 0, or -1 if there is no in-memory slot for the stripe, or if an
+ * overwrite targets a stripe that is not alive.
+ */
+static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
+			    struct bch_fs_usage *fs_usage,
+			    u64 journal_seq, unsigned flags)
+{
+	bool gc = flags & BCH_BUCKET_MARK_GC;
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	size_t idx = s.k->p.offset;
+	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
+	unsigned i;
+
+	spin_lock(&c->ec_stripes_heap_lock);
+
+	if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) {
+		spin_unlock(&c->ec_stripes_heap_lock);
+		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
+				    idx);
+		return -1;
+	}
+
+	if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) {
+		m->sectors	= le16_to_cpu(s.v->sectors);
+		m->algorithm	= s.v->algorithm;
+		m->nr_blocks	= s.v->nr_blocks;
+		m->nr_redundant	= s.v->nr_redundant;
+
+		bch2_bkey_to_replicas(&m->r.e, k);
+
+		/*
+		 * XXX: account for stripes somehow here
+		 */
+#if 0
+		update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
+#endif
+
+		/* gc recalculates these fields: */
+		/*
+		 * NOTE(review): blocks_nonempty is accumulated without being
+		 * reset first - presumably the slot is always zeroed before a
+		 * stripe is (re)marked here; confirm.
+		 */
+		if (!(flags & BCH_BUCKET_MARK_GC)) {
+			for (i = 0; i < s.v->nr_blocks; i++) {
+				m->block_sectors[i] =
+					stripe_blockcount_get(s.v, i);
+				m->blocks_nonempty += !!m->block_sectors[i];
+			}
+		}
+
+		if (!gc)
+			bch2_stripes_heap_update(c, m, idx);
+		m->alive	= true;
+	} else {
+		if (!gc)
+			bch2_stripes_heap_del(c, m, idx);
+		memset(m, 0, sizeof(*m));
+	}
+
+	spin_unlock(&c->ec_stripes_heap_lock);
+
+	bucket_set_stripe(c, s.v, fs_usage, 0, flags);
+	return 0;
+}
+
+/*
+ * Update in-memory accounting for key @k, dispatching on the key type.
+ *
+ * If @fs_usage is NULL, or BCH_BUCKET_MARK_GC is set in @flags, this cpu's
+ * usage counters (selected by @journal_seq and the gc flag) are used in place
+ * of the caller's @fs_usage. The whole operation runs with preemption
+ * disabled.
+ *
+ *  - alloc keys        -> bch2_mark_alloc()
+ *  - btree pointers    -> bch2_mark_extent() as BCH_DATA_BTREE; @sectors is
+ *                         replaced by +/- c->opts.btree_node_size depending
+ *                         on BCH_BUCKET_MARK_OVERWRITE
+ *  - extents/reflink_v -> bch2_mark_extent() as BCH_DATA_USER
+ *  - stripes           -> bch2_mark_stripe()
+ *  - inodes            -> nr_inodes is incremented, or decremented on
+ *                         overwrite
+ *  - reservations      -> @sectors * nr_replicas is added to reserved and to
+ *                         the matching persistent_reserved[] slot
+ * Other key types are ignored.
+ *
+ * The "_locked" suffix suggests the caller holds c->mark_lock, as the
+ * bch2_mark_key() wrapper does. Returns 0 or the error from the handler.
+ */
+int bch2_mark_key_locked(struct bch_fs *c,
+		   struct bkey_s_c k,
+		   unsigned offset, s64 sectors,
+		   struct bch_fs_usage *fs_usage,
+		   u64 journal_seq, unsigned flags)
+{
+	int ret = 0;
+
+	preempt_disable();
+
+	if (!fs_usage || (flags & BCH_BUCKET_MARK_GC))
+		fs_usage = fs_usage_ptr(c, journal_seq,
+					flags & BCH_BUCKET_MARK_GC);
+
+	switch (k.k->type) {
+	case KEY_TYPE_alloc:
+		ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_btree_ptr:
+		/* btree nodes are always accounted at their full size */
+		sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE)
+			?  c->opts.btree_node_size
+			: -c->opts.btree_node_size;
+
+		ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE,
+				fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER,
+				fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_stripe:
+		ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_inode:
+		if (!(flags & BCH_BUCKET_MARK_OVERWRITE))
+			fs_usage->nr_inodes++;
+		else
+			fs_usage->nr_inodes--;
+		break;
+	case KEY_TYPE_reservation: {
+		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+
+		sectors *= replicas;
+		/* clamp so that replicas - 1 is a valid array index below */
+		replicas = clamp_t(unsigned, replicas, 1,
+				   ARRAY_SIZE(fs_usage->persistent_reserved));
+
+		fs_usage->reserved				+= sectors;
+		fs_usage->persistent_reserved[replicas - 1]	+= sectors;
+		break;
+	}
+	}
+
+	preempt_enable();
+
+	return ret;
+}
+
+/*
+ * Same as bch2_mark_key_locked(), but takes c->mark_lock for read around the
+ * call. Returns bch2_mark_key_locked()'s result unchanged.
+ */
+int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
+		  unsigned offset, s64 sectors,
+		  struct bch_fs_usage *fs_usage,
+		  u64 journal_seq, unsigned flags)
+{
+	int marked;
+
+	percpu_down_read(&c->mark_lock);
+	marked = bch2_mark_key_locked(c, k, offset, sectors,
+				      fs_usage, journal_seq, flags);
+	percpu_up_read(&c->mark_lock);
+
+	return marked;
+}
+
+/*
+ * Account for existing key @old being overwritten, fully or partly, by @new.
+ *
+ * For extent btrees the overlapping range is turned into an (@offset,
+ * negative @sectors) pair relative to @old; an overwrite of the middle of
+ * @old additionally sets BCH_BUCKET_MARK_OVERWRITE_SPLIT. For other btrees
+ * only a key at exactly @new's position counts.
+ *
+ * Returns 0 if @old is not affected by @new (nothing marked), the non-zero
+ * result of bch2_mark_key_locked() if it returned one, and 1 otherwise.
+ */
+inline int bch2_mark_overwrite(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c old,
+			       struct bkey_i *new,
+			       struct bch_fs_usage *fs_usage,
+			       unsigned flags)
+{
+	struct bch_fs		*c = trans->c;
+	struct btree		*b = iter->l[0].b;
+	const bool		extents = btree_node_is_extents(b);
+	unsigned		offset = 0;
+	s64			sectors = 0;
+	int			ret;
+
+	if (extents) {
+		/* @new ends at or before the start of @old: no overlap */
+		if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
+			return 0;
+	} else if (bkey_cmp(new->k.p, old.k->p)) {
+		return 0;
+	}
+
+	flags |= BCH_BUCKET_MARK_OVERWRITE;
+
+	if (extents) {
+		switch (bch2_extent_overlap(&new->k, old.k)) {
+		case BCH_EXTENT_OVERLAP_ALL:
+			/* offset stays 0 */
+			sectors = -((s64) old.k->size);
+			break;
+		case BCH_EXTENT_OVERLAP_BACK:
+			offset = bkey_start_offset(&new->k) -
+				bkey_start_offset(old.k);
+			sectors = bkey_start_offset(&new->k) -
+				old.k->p.offset;
+			break;
+		case BCH_EXTENT_OVERLAP_FRONT:
+			/* offset stays 0 */
+			sectors = bkey_start_offset(old.k) -
+				new->k.p.offset;
+			break;
+		case BCH_EXTENT_OVERLAP_MIDDLE:
+			offset = bkey_start_offset(&new->k) -
+				bkey_start_offset(old.k);
+			sectors = -((s64) new->k.size);
+			flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
+			break;
+		}
+
+		/* an overwrite must always remove something: */
+		BUG_ON(sectors >= 0);
+	}
+
+	ret = bch2_mark_key_locked(c, old, offset, sectors, fs_usage,
+				   trans->journal_res.seq, flags);
+
+	return ret ? ret : 1;
+}
+
+/*
+ * Mark the key being inserted by @insert, then walk the keys already in the
+ * leaf node from the iterator's position and mark each one the insert
+ * overwrites via bch2_mark_overwrite().
+ *
+ * Does nothing for btrees where btree_node_type_needs_gc() is false. The
+ * overwrite pass is skipped when the transaction has
+ * BTREE_INSERT_NOMARK_OVERWRITES set, and for non-deletion updates to the
+ * alloc and EC btrees.
+ *
+ * Returns 0 when nothing was walked, otherwise the last value returned by
+ * bch2_mark_overwrite().
+ */
+int bch2_mark_update(struct btree_trans *trans,
+		     struct btree_insert_entry *insert,
+		     struct bch_fs_usage *fs_usage,
+		     unsigned flags)
+{
+	struct bch_fs		*c = trans->c;
+	struct btree_iter	*iter = insert->iter;
+	struct btree		*b = iter->l[0].b;
+	struct btree_node_iter	node_iter = iter->l[0].iter;
+	struct bkey_packed	*_k;
+	int ret = 0;
+
+	if (!btree_node_type_needs_gc(iter->btree_id))
+		return 0;
+
+	bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k),
+		0, insert->k->k.size,
+		fs_usage, trans->journal_res.seq,
+		BCH_BUCKET_MARK_INSERT|flags);
+
+	if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+		return 0;
+
+	/*
+	 * For non extents, we only mark the new key, not the key being
+	 * overwritten - unless we're actually deleting:
+	 */
+	switch (iter->btree_id) {
+	case BTREE_ID_ALLOC:
+	case BTREE_ID_EC:
+		if (!bkey_deleted(&insert->k->k))
+			return 0;
+		break;
+	default:
+		break;
+	}
+
+	for (;;) {
+		struct bkey		unpacked;
+		struct bkey_s_c		old;
+
+		_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+						      KEY_TYPE_discard);
+		if (!_k)
+			break;
+
+		old = bkey_disassemble(b, _k, &unpacked);
+
+		ret = bch2_mark_overwrite(trans, iter, old, insert->k,
+					  fs_usage, flags);
+		if (ret <= 0)
+			break;
+
+		bch2_btree_node_iter_advance(&node_iter, b);
+	}
+
+	return ret;
+}
+
+/*
+ * Fold @fs_usage into the filesystem's counters with bch2_fs_usage_apply().
+ *
+ * If that call returns non-zero, report it once (guarded by a static flag):
+ * log the reserved sector count sampled before the apply, then every key
+ * this transaction is inserting followed by the existing keys in the leaf
+ * that it overlaps.
+ */
+void bch2_trans_fs_usage_apply(struct btree_trans *trans,
+			       struct bch_fs_usage *fs_usage)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	static int warned_disk_usage = 0;
+	/* sampled up front, before bch2_fs_usage_apply() sees the reservation */
+	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+	char buf[200];
+
+	if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res,
+				 trans->journal_res.seq))
+		return;
+
+	/* only ever complain once: */
+	if (warned_disk_usage || xchg(&warned_disk_usage, 1))
+		return;
+
+	bch_err(c, "disk usage increased more than %llu sectors reserved",
+		disk_res_sectors);
+
+	trans_for_each_update(trans, i) {
+		struct btree_iter	*iter = i->iter;
+		struct btree		*b = iter->l[0].b;
+		struct btree_node_iter	node_iter = iter->l[0].iter;
+		struct bkey_packed	*_k;
+
+		pr_err("while inserting");
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+		pr_err("%s", buf);
+		pr_err("overlapping with");
+
+		for (;;) {
+			struct bkey		unpacked;
+			struct bkey_s_c		k;
+			bool			past_insert;
+
+			_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+							KEY_TYPE_discard);
+			if (!_k)
+				break;
+
+			k = bkey_disassemble(b, _k, &unpacked);
+
+			past_insert = btree_node_is_extents(b)
+				? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
+				: bkey_cmp(i->k->k.p, k.k->p) != 0;
+			if (past_insert)
+				break;
+
+			bch2_bkey_val_to_text(&PBUF(buf), c, k);
+			pr_err("%s", buf);
+
+			bch2_btree_node_iter_advance(&node_iter, b);
+		}
+	}
+}
+
+/* trans_mark: */
+
+/*
+ * Look up the key at @pos in @btree_id on behalf of a transactional mark.
+ *
+ * Pending updates in @trans are consulted first: for extent btrees an update
+ * whose range contains @pos matches, for other btrees one whose iterator sits
+ * exactly at @pos. On a match *@iter and *@k refer to that update and 1 is
+ * returned. Otherwise a new intent, slots iterator is obtained and peeked.
+ *
+ * Returns 1 for a pending update, 0 for a key read from the btree, negative
+ * on error (in which case no iterator is left held by this function).
+ */
+static int trans_get_key(struct btree_trans *trans,
+			 enum btree_id btree_id, struct bpos pos,
+			 struct btree_iter **iter,
+			 struct bkey_s_c *k)
+{
+	struct btree_insert_entry *i;
+	int ret;
+
+	trans_for_each_update(trans, i) {
+		bool covers = false;
+
+		if (i->iter->btree_id == btree_id) {
+			if (btree_node_type_is_extents(btree_id))
+				covers = bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
+					 bkey_cmp(pos, i->k->k.p) < 0;
+			else
+				covers = !bkey_cmp(pos, i->iter->pos);
+		}
+
+		if (covers) {
+			*iter	= i->iter;
+			*k	= bkey_i_to_s_c(i->k);
+			return 1;
+		}
+	}
+
+	*iter = bch2_trans_get_iter(trans, btree_id, pos,
+				    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	if (IS_ERR(*iter))
+		return PTR_ERR(*iter);
+
+	*k = bch2_btree_iter_peek_slot(*iter);
+	ret = bkey_err(*k);
+	if (!ret)
+		return 0;
+
+	bch2_trans_iter_put(trans, *iter);
+	return ret;
+}
+
+/*
+ * Allocate a @u64s sized key from the transaction's memory, initialise it at
+ * @iter's position and make it the pending update for @iter: an existing
+ * update entry for @iter is repointed at the new key, otherwise a new update
+ * is queued with bch2_trans_update().
+ *
+ * Returns the new key, or the ERR_PTR handed back by bch2_trans_kmalloc().
+ */
+static void *trans_update_key(struct btree_trans *trans,
+			      struct btree_iter *iter,
+			      unsigned u64s)
+{
+	struct btree_insert_entry *i;
+	struct bkey_i *fresh = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+
+	if (IS_ERR(fresh))
+		return fresh;
+
+	bkey_init(&fresh->k);
+	fresh->k.p = iter->pos;
+
+	trans_for_each_update(trans, i) {
+		if (i->iter == iter) {
+			/* already updating through this iterator: swap the key */
+			i->k = fresh;
+			return fresh;
+		}
+	}
+
+	bch2_trans_update(trans, iter, fresh);
+	return fresh;
+}
+
+/*
+ * Transactionally adjust the sector counts of the bucket that pointer @p
+ * points into by @sectors, by queueing an update to its alloc key.
+ *
+ * The current bucket state comes from the alloc btree (or a pending update
+ * to it), except before BCH_FS_ALLOC_WRITTEN is set, when the in-memory
+ * bucket mark is used for keys read from the btree.
+ *
+ * Returns 0 on success, 1 if the pointer is stale (bucket gen is newer than
+ * the pointer's; nothing is updated), negative on error.
+ */
+static int bch2_trans_mark_pointer(struct btree_trans *trans,
+			struct extent_ptr_decoded p,
+			s64 sectors, enum bch_data_type data_type)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_alloc_unpacked u;
+	struct bkey_i_alloc *a;
+	unsigned prev_sectors;
+	bool overflow;
+	int ret;
+
+	ret = trans_get_key(trans, BTREE_ID_ALLOC,
+			    POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)),
+			    &iter, &k);
+	if (ret < 0)
+		return ret;
+
+	if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) {
+		/*
+		 * During journal replay, and if gc repairs alloc info at
+		 * runtime, the alloc info in the btree might not be up to date
+		 * yet - so, trust the in memory mark:
+		 */
+		struct bucket *g;
+		struct bucket_mark m;
+
+		percpu_down_read(&c->mark_lock);
+		g	= bucket(ca, iter->pos.offset);
+		m	= READ_ONCE(g->mark);
+		u	= alloc_mem_to_key(g, m);
+		percpu_up_read(&c->mark_lock);
+	} else if (k.k->type == KEY_TYPE_alloc) {
+		u = bch2_alloc_unpack(k);
+	} else {
+		/*
+		 * Neither a pending alloc update nor an alloc key in the
+		 * btree:
+		 */
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			      "pointer to nonexistent bucket %llu:%llu",
+			      iter->pos.inode, iter->pos.offset);
+		ret = -1;
+		goto out;
+	}
+
+	if (gen_after(u.gen, p.ptr.gen)) {
+		/* stale pointer: report it, leave the bucket alone */
+		ret = 1;
+		goto out;
+	}
+
+	if (u.data_type && u.data_type != data_type) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s",
+			iter->pos.inode, iter->pos.offset,
+			u.gen,
+			bch2_data_types[u.data_type],
+			bch2_data_types[data_type]);
+		ret = -1;
+		goto out;
+	}
+
+	if (p.ptr.cached) {
+		prev_sectors = u.cached_sectors;
+		overflow = checked_add(u.cached_sectors, sectors);
+	} else {
+		prev_sectors = u.dirty_sectors;
+		overflow = checked_add(u.dirty_sectors, sectors);
+	}
+
+	/* an empty bucket no longer has a data type: */
+	if (u.dirty_sectors || u.cached_sectors)
+		u.data_type = data_type;
+	else
+		u.data_type = 0;
+
+	bch2_fs_inconsistent_on(overflow, c,
+		"bucket sector count overflow: %u + %lli > U16_MAX",
+		prev_sectors, sectors);
+	BUG_ON(overflow);
+
+	a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX);
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		goto out;
+
+	bkey_alloc_init(&a->k_i);
+	a->k.p = iter->pos;
+	bch2_alloc_pack(a, u);
+out:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+/*
+ * Transactionally add @sectors to the block count of block @p.block in the
+ * stripe @p.idx, by queueing an updated copy of the stripe key.
+ *
+ * On success *@nr_data and *@nr_parity receive the stripe's data and
+ * redundant block counts and @r is filled in with the stripe's replicas
+ * entry. @data_type is not used here.
+ *
+ * Returns 0 on success, -EIO if the stripe does not exist, or another
+ * negative error.
+ */
+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
+			struct bch_extent_stripe_ptr p,
+			s64 sectors, enum bch_data_type data_type,
+			struct bch_replicas_padded *r,
+			unsigned *nr_data,
+			unsigned *nr_parity)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter;
+	struct bkey_i *copy;
+	struct bkey_s_c k;
+	struct bkey_s_stripe stripe;
+	int ret;
+
+	ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
+	if (ret < 0)
+		return ret;
+
+	if (k.k->type != KEY_TYPE_stripe) {
+		bch2_fs_inconsistent(c,
+			"pointer to nonexistent stripe %llu",
+			(u64) p.idx);
+		ret = -EIO;
+		goto out;
+	}
+
+	copy = trans_update_key(trans, iter, k.k->u64s);
+	ret = PTR_ERR_OR_ZERO(copy);
+	if (ret)
+		goto out;
+
+	/* work on a private copy of the stripe key: */
+	bkey_reassemble(copy, k);
+	stripe = bkey_i_to_s_stripe(copy);
+
+	stripe_blockcount_set(stripe.v, p.block,
+		stripe_blockcount_get(stripe.v, p.block) +
+		sectors);
+
+	*nr_data	= stripe.v->nr_blocks - stripe.v->nr_redundant;
+	*nr_parity	= stripe.v->nr_redundant;
+	bch2_bkey_to_replicas(&r->e, stripe.s_c);
+out:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+/*
+ * Transactional marking for a key carrying data pointers: every pointer's
+ * bucket is updated through bch2_trans_mark_pointer(), and the per-replicas
+ * deltas are appended to the transaction's delta list.
+ *
+ * Cached pointers are accounted per device (skipped when stale), erasure
+ * coded pointers against their stripe's replicas entry including parity, and
+ * the remaining dirty pointers together under one replicas entry.
+ *
+ * Returns 0 on success or the first negative error encountered.
+ */
+static int bch2_trans_mark_extent(struct btree_trans *trans,
+			struct bkey_s_c k, unsigned offset,
+			s64 sectors, unsigned flags,
+			enum bch_data_type data_type)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_replicas_padded r;
+	s64 dirty_sectors = 0;
+	int ret;
+
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
+	BUG_ON(!sectors);
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		s64 disk_sectors;
+		bool stale;
+
+		if (data_type == BCH_DATA_BTREE)
+			disk_sectors = sectors;
+		else
+			disk_sectors = ptr_disk_sectors_delta(p, offset,
+							      sectors, flags);
+
+		ret = bch2_trans_mark_pointer(trans, p, disk_sectors,
+					      data_type);
+		if (ret < 0)
+			return ret;
+
+		/* a positive return means the pointer was stale */
+		stale = ret > 0;
+
+		if (p.ptr.cached) {
+			if (!stale)
+				update_cached_sectors_list(trans, p.ptr.dev,
+							   disk_sectors);
+		} else if (p.has_ec) {
+			struct bch_replicas_padded ec_r;
+			unsigned nr_data, nr_parity;
+			s64 parity_sectors;
+
+			ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
+					disk_sectors, data_type,
+					&ec_r, &nr_data, &nr_parity);
+			if (ret)
+				return ret;
+
+			parity_sectors =
+				__ptr_disk_sectors_delta(p.crc.live_size,
+					offset, sectors, flags,
+					p.crc.compressed_size * nr_parity,
+					p.crc.uncompressed_size * nr_data);
+
+			update_replicas_list(trans, &ec_r.e,
+					     disk_sectors + parity_sectors);
+
+			r.e.nr_required = 0;
+		} else {
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		}
+	}
+
+	if (r.e.nr_devs)
+		update_replicas_list(trans, &r.e, dirty_sectors);
+
+	return 0;
+}
+
+/*
+ * Adjust the refcount of the indirect extent covering reflink index @idx on
+ * behalf of reflink pointer @p: +1 on insert, -1 when
+ * BCH_BUCKET_MARK_OVERWRITE is set. An indirect extent whose refcount drops
+ * to zero is turned into a deleted key.
+ *
+ * On overwrite, an indirect extent that is not wholly inside
+ * [@idx, @idx + @sectors) is left untouched.
+ *
+ * Returns the distance from @idx to the end of the indirect extent found, so
+ * the caller can advance, or a negative error (-EIO if no indirect extent
+ * exists at @idx).
+ */
+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+			struct bkey_s_c_reflink_p p,
+			u64 idx, unsigned sectors,
+			unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter;
+	struct bkey_i *copy;
+	struct bkey_s_c k;
+	struct bkey_i_reflink_v *r_v;
+	s64 ret;
+
+	ret = trans_get_key(trans, BTREE_ID_REFLINK,
+			    POS(0, idx), &iter, &k);
+	if (ret < 0)
+		return ret;
+
+	if (k.k->type != KEY_TYPE_reflink_v) {
+		bch2_fs_inconsistent(c,
+			"%llu:%llu len %u points to nonexistent indirect extent %llu",
+			p.k->p.inode, p.k->p.offset, p.k->size, idx);
+		ret = -EIO;
+		goto err;
+	}
+
+	if (flags & BCH_BUCKET_MARK_OVERWRITE) {
+		/* only drop the reference if the whole extent is covered: */
+		if (bkey_start_offset(k.k) < idx ||
+		    k.k->p.offset > idx + sectors)
+			goto out;
+	}
+
+	bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+	BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
+	copy = trans_update_key(trans, iter, k.k->u64s);
+	ret = PTR_ERR_OR_ZERO(copy);
+	if (ret)
+		goto err;
+
+	bkey_reassemble(copy, k);
+	r_v = bkey_i_to_reflink_v(copy);
+
+	le64_add_cpu(&r_v->v.refcount,
+		     (flags & BCH_BUCKET_MARK_OVERWRITE) ? -1 : 1);
+
+	if (!r_v->v.refcount) {
+		/* last reference gone: delete the indirect extent */
+		r_v->k.type = KEY_TYPE_deleted;
+		set_bkey_val_u64s(&r_v->k, 0);
+	}
+out:
+	ret = k.k->p.offset - idx;
+err:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+/*
+ * Transactional marking for a reflink pointer: walk the |@sectors| sectors
+ * of the indirect extent range starting @offset into @p, adjusting the
+ * refcount of each indirect extent via __bch2_trans_mark_reflink_p().
+ *
+ * Returns 0 on success or the first negative error.
+ */
+static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+			struct bkey_s_c_reflink_p p, unsigned offset,
+			s64 sectors, unsigned flags)
+{
+	u64 idx = le64_to_cpu(p.v->idx) + offset;
+
+	sectors = abs(sectors);
+	BUG_ON(offset + sectors > p.k->size);
+
+	while (sectors) {
+		s64 advanced = __bch2_trans_mark_reflink_p(trans, p, idx,
+							   sectors, flags);
+
+		if (advanced < 0)
+			return advanced;
+
+		idx += advanced;
+		sectors = max_t(s64, 0LL, sectors - advanced);
+	}
+
+	return 0;
+}
+
+/*
+ * Transactional counterpart of key marking: dispatch on the type of @k and
+ * queue the resulting btree updates and usage deltas in @trans.
+ *
+ * @sectors is the signed size delta (recomputed from the btree node size for
+ * btree pointers); BCH_BUCKET_MARK_OVERWRITE in @flags means @k is being
+ * removed. Key types not listed need no marking.
+ *
+ * Returns 0 on success or a negative error from the per-type helper.
+ */
+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+			unsigned offset, s64 sectors, unsigned flags)
+{
+	const bool overwrite = flags & BCH_BUCKET_MARK_OVERWRITE;
+	struct replicas_delta_list *d;
+	struct bch_fs *c = trans->c;
+
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+		/* btree nodes are always accounted at their full size: */
+		if (overwrite)
+			sectors = -c->opts.btree_node_size;
+		else
+			sectors = c->opts.btree_node_size;
+
+		return bch2_trans_mark_extent(trans, k, offset, sectors,
+					      flags, BCH_DATA_BTREE);
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		return bch2_trans_mark_extent(trans, k, offset, sectors,
+					      flags, BCH_DATA_USER);
+	case KEY_TYPE_inode:
+		d = replicas_deltas_realloc(trans, 0);
+
+		if (overwrite)
+			d->nr_inodes--;
+		else
+			d->nr_inodes++;
+		return 0;
+	case KEY_TYPE_reservation: {
+		unsigned nr = bkey_s_c_to_reservation(k).v->nr_replicas;
+
+		d = replicas_deltas_realloc(trans, 0);
+
+		/* scale by the unclamped count, index by the clamped one: */
+		sectors *= nr;
+		nr = clamp_t(unsigned, nr, 1,
+			     ARRAY_SIZE(d->persistent_reserved));
+
+		d->persistent_reserved[nr - 1] += sectors;
+		return 0;
+	}
+	case KEY_TYPE_reflink_p:
+		return bch2_trans_mark_reflink_p(trans,
+					bkey_s_c_to_reflink_p(k),
+					offset, sectors, flags);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Transactional marking for one update: mark @insert itself, then every
+ * existing key in the leaf at @iter that @insert overwrites.
+ *
+ * For extent btrees each overlap is converted to an (offset, negative
+ * sectors) pair relative to the old key, with
+ * BCH_BUCKET_MARK_OVERWRITE_SPLIT added when @insert lands in the middle of
+ * it. Does nothing for btrees where btree_node_type_needs_gc() is false, and
+ * skips the overwrite pass under BTREE_INSERT_NOMARK_OVERWRITES.
+ *
+ * Returns 0 on success or the first error from bch2_trans_mark_key().
+ */
+int bch2_trans_mark_update(struct btree_trans *trans,
+			   struct btree_iter *iter,
+			   struct bkey_i *insert)
+{
+	struct btree		*b = iter->l[0].b;
+	struct btree_node_iter	node_iter = iter->l[0].iter;
+	struct bkey_packed	*_k;
+	bool			extents;
+	int ret;
+
+	if (!btree_node_type_needs_gc(iter->btree_id))
+		return 0;
+
+	ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
+			0, insert->k.size, BCH_BUCKET_MARK_INSERT);
+	if (ret)
+		return ret;
+
+	if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+		return 0;
+
+	extents = btree_node_is_extents(b);
+
+	for (;;) {
+		struct bkey		unpacked;
+		struct bkey_s_c		old;
+		unsigned		offset = 0;
+		s64			sectors = 0;
+		unsigned		flags = BCH_BUCKET_MARK_OVERWRITE;
+
+		_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+						      KEY_TYPE_discard);
+		if (!_k)
+			break;
+
+		old = bkey_disassemble(b, _k, &unpacked);
+
+		if (extents) {
+			/* @insert ends at or before @old starts: done */
+			if (bkey_cmp(insert->k.p, bkey_start_pos(old.k)) <= 0)
+				break;
+
+			switch (bch2_extent_overlap(&insert->k, old.k)) {
+			case BCH_EXTENT_OVERLAP_ALL:
+				/* offset stays 0 */
+				sectors = -((s64) old.k->size);
+				break;
+			case BCH_EXTENT_OVERLAP_BACK:
+				offset = bkey_start_offset(&insert->k) -
+					bkey_start_offset(old.k);
+				sectors = bkey_start_offset(&insert->k) -
+					old.k->p.offset;
+				break;
+			case BCH_EXTENT_OVERLAP_FRONT:
+				/* offset stays 0 */
+				sectors = bkey_start_offset(old.k) -
+					insert->k.p.offset;
+				break;
+			case BCH_EXTENT_OVERLAP_MIDDLE:
+				offset = bkey_start_offset(&insert->k) -
+					bkey_start_offset(old.k);
+				sectors = -((s64) insert->k.size);
+				flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
+				break;
+			}
+
+			/* an overwrite must always remove something: */
+			BUG_ON(sectors >= 0);
+		} else if (bkey_cmp(insert->k.p, old.k->p)) {
+			break;
+		}
+
+		ret = bch2_trans_mark_key(trans, old, offset, sectors, flags);
+		if (ret)
+			return ret;
+
+		bch2_btree_node_iter_advance(&node_iter, b);
+	}
+
+	return 0;
+}
+
+/* Disk reservations: */
+
+/*
+ * Zero every CPU's cached sectors_available, then return avail_factor()
+ * applied to the free space reported by __bch2_fs_usage_read_short().
+ */
+static u64 bch2_recalc_sectors_available(struct bch_fs *c)
+{
+	u64 free_sectors;
+
+	percpu_u64_set(&c->pcpu->sectors_available, 0);
+
+	free_sectors = __bch2_fs_usage_read_short(c).free;
+
+	return avail_factor(free_sectors);
+}
+
+/*
+ * Release reservation @res: subtract its sectors from this CPU's
+ * online_reserved counter under mark_lock, then zero res->sectors.
+ */
+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
+{
+	percpu_down_read(&c->mark_lock);
+	this_cpu_sub(c->usage[0]->online_reserved, res->sectors);
+	percpu_up_read(&c->mark_lock);
+
+	/* the reservation is now empty and may be reused or dropped */
+	res->sectors = 0;
+}
+
+#define SECTORS_CACHE	1024
+
+/*
+ * Add @sectors to disk reservation @res.
+ *
+ * Fast path: take the sectors from this CPU's cached allowance, refilling it
+ * from the global c->sectors_available counter (plus SECTORS_CACHE extra)
+ * with a cmpxchg loop when it runs short.
+ *
+ * Slow path: if the global counter cannot cover the request, take mark_lock
+ * for write, recompute the available space from scratch and retry against
+ * that. With BCH_DISK_RESERVATION_NOFAIL in @flags the reservation is
+ * granted even if space is insufficient.
+ *
+ * Returns 0 on success, -ENOSPC if there is not enough space.
+ */
+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+			      unsigned sectors, int flags)
+{
+	struct bch_fs_pcpu *pcpu;
+	u64 old, v, get;
+	s64 sectors_available;
+	int ret;
+
+	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	pcpu = this_cpu_ptr(c->pcpu);
+
+	if (sectors > pcpu->sectors_available) {
+		/* per-cpu cache too small: refill from the global counter */
+		old = atomic64_read(&c->sectors_available);
+
+		for (;;) {
+			get = min((u64) sectors + SECTORS_CACHE, old);
+
+			if (get < sectors) {
+				preempt_enable();
+				percpu_up_read(&c->mark_lock);
+				goto recalculate;
+			}
+
+			v = atomic64_cmpxchg(&c->sectors_available,
+					     old, old - get);
+			if (v == old)
+				break;
+
+			/* raced with another updater, retry with its value */
+			old = v;
+		}
+
+		pcpu->sectors_available		+= get;
+	}
+
+	pcpu->sectors_available		-= sectors;
+	this_cpu_add(c->usage[0]->online_reserved, sectors);
+	res->sectors			+= sectors;
+
+	preempt_enable();
+	percpu_up_read(&c->mark_lock);
+	return 0;
+
+recalculate:
+	percpu_down_write(&c->mark_lock);
+
+	sectors_available = bch2_recalc_sectors_available(c);
+
+	if (sectors > sectors_available &&
+	    !(flags & BCH_DISK_RESERVATION_NOFAIL)) {
+		atomic64_set(&c->sectors_available, sectors_available);
+		ret = -ENOSPC;
+	} else {
+		atomic64_set(&c->sectors_available,
+			     max_t(s64, 0, sectors_available - sectors));
+		this_cpu_add(c->usage[0]->online_reserved, sectors);
+		res->sectors			+= sectors;
+		ret = 0;
+	}
+
+	percpu_up_write(&c->mark_lock);
+
+	return ret;
+}
+
+/* Startup/shutdown: */
+
+/*
+ * RCU callback: free the bucket_array that embeds @rcu. The allocation size
+ * is derived from the array's own nbuckets field.
+ */
+static void buckets_free_rcu(struct rcu_head *rcu)
+{
+	struct bucket_array *arr =
+		container_of(rcu, struct bucket_array, rcu);
+	size_t bytes = sizeof(struct bucket_array) +
+		arr->nbuckets * sizeof(struct bucket);
+
+	kvpfree(arr, bytes);
+}
+
+/*
+ * Allocate a bucket array, buckets_nouse bitmap, free lists and heaps sized
+ * for @nbuckets buckets and install them on @ca in place of the current
+ * ones.
+ *
+ * If the device already has a bucket array ("resize"), the swap happens
+ * under gc_lock, bucket_lock and mark_lock, and the bucket state and nouse
+ * bits for the range common to old and new are copied across. Entries on the
+ * old free lists are moved to the new ones. copygc is stopped first and
+ * restarted afterwards if it had been running.
+ *
+ * The previous bucket array is freed after an RCU grace period; everything
+ * else that was replaced is freed before returning.
+ *
+ * Returns 0 on success, -ENOMEM if any allocation fails (in which case @ca
+ * is left untouched).
+ */
+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+	struct bucket_array *buckets = NULL, *old_buckets = NULL;
+	unsigned long *buckets_nouse = NULL;
+	alloc_fifo	free[RESERVE_NR];
+	alloc_fifo	free_inc;
+	alloc_heap	alloc_heap;
+	copygc_heap	copygc_heap;
+
+	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
+			     ca->mi.bucket_size / c->opts.btree_node_size);
+	/* XXX: these should be tunable */
+	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
+	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 7);
+	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
+				      btree_reserve * 2);
+	bool resize = ca->buckets[0] != NULL,
+	     start_copygc = ca->copygc_thread != NULL;
+	int ret = -ENOMEM;
+	unsigned i;
+
+	memset(&free,		0, sizeof(free));
+	memset(&free_inc,	0, sizeof(free_inc));
+	memset(&alloc_heap,	0, sizeof(alloc_heap));
+	memset(&copygc_heap,	0, sizeof(copygc_heap));
+
+	if (!(buckets		= kvpmalloc(sizeof(struct bucket_array) +
+					    nbuckets * sizeof(struct bucket),
+					    GFP_KERNEL|__GFP_ZERO)) ||
+	    !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
+					    sizeof(unsigned long),
+					    GFP_KERNEL|__GFP_ZERO)) ||
+	    !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
+	    !init_fifo(&free[RESERVE_MOVINGGC],
+		       copygc_reserve, GFP_KERNEL) ||
+	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
+	    !init_fifo(&free_inc,	free_inc_nr, GFP_KERNEL) ||
+	    !init_heap(&alloc_heap,	ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
+	    !init_heap(&copygc_heap,	copygc_reserve, GFP_KERNEL)) {
+		/*
+		 * buckets_free_rcu() sizes the free from ->nbuckets, which is
+		 * still zero on the freshly allocated array:
+		 */
+		if (buckets)
+			buckets->nbuckets = nbuckets;
+		goto err;
+	}
+
+	buckets->first_bucket	= ca->mi.first_bucket;
+	buckets->nbuckets	= nbuckets;
+
+	bch2_copygc_stop(ca);
+
+	if (resize) {
+		down_write(&c->gc_lock);
+		down_write(&ca->bucket_lock);
+		percpu_down_write(&c->mark_lock);
+	}
+
+	old_buckets = bucket_array(ca);
+
+	if (resize) {
+		/* carry over state for the buckets both arrays have: */
+		size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+
+		memcpy(buckets->b,
+		       old_buckets->b,
+		       n * sizeof(struct bucket));
+		memcpy(buckets_nouse,
+		       ca->buckets_nouse,
+		       BITS_TO_LONGS(n) * sizeof(unsigned long));
+	}
+
+	rcu_assign_pointer(ca->buckets[0], buckets);
+	/* from here on @buckets is the array to be freed: */
+	buckets = old_buckets;
+
+	swap(ca->buckets_nouse, buckets_nouse);
+
+	if (resize)
+		percpu_up_write(&c->mark_lock);
+
+	spin_lock(&c->freelist_lock);
+	for (i = 0; i < RESERVE_NR; i++) {
+		fifo_move(&free[i], &ca->free[i]);
+		swap(ca->free[i], free[i]);
+	}
+	fifo_move(&free_inc, &ca->free_inc);
+	swap(ca->free_inc, free_inc);
+	spin_unlock(&c->freelist_lock);
+
+	/* with gc lock held, alloc_heap can't be in use: */
+	swap(ca->alloc_heap, alloc_heap);
+
+	/* and we shut down copygc: */
+	swap(ca->copygc_heap, copygc_heap);
+
+	/* buckets_nouse now holds the old bitmap, sized by the old count: */
+	nbuckets = ca->mi.nbuckets;
+
+	if (resize) {
+		up_write(&ca->bucket_lock);
+		up_write(&c->gc_lock);
+	}
+
+	if (start_copygc &&
+	    bch2_copygc_start(c, ca))
+		bch_err(ca, "error restarting copygc thread");
+
+	ret = 0;
+err:
+	free_heap(&copygc_heap);
+	free_heap(&alloc_heap);
+	free_fifo(&free_inc);
+	for (i = 0; i < RESERVE_NR; i++)
+		free_fifo(&free[i]);
+	kvpfree(buckets_nouse,
+		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+	/*
+	 * On success this is the replaced array (NULL on first allocation);
+	 * on failure it is the new array that was never installed, and
+	 * old_buckets is still NULL - so it must not be dereferenced here:
+	 */
+	if (buckets)
+		call_rcu(&buckets->rcu, buckets_free_rcu);
+
+	return ret;
+}
+
+/*
+ * Free everything bch2_dev_buckets_alloc()/bch2_dev_buckets_resize() set up
+ * on @ca: heaps, free lists, the buckets_nouse bitmap, the bucket array
+ * (freed directly, with no RCU grace period) and the per-cpu usage counters.
+ */
+void bch2_dev_buckets_free(struct bch_dev *ca)
+{
+	size_t nouse_bytes =
+		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long);
+	size_t array_bytes = sizeof(struct bucket_array) +
+		ca->mi.nbuckets * sizeof(struct bucket);
+	unsigned r;
+
+	free_heap(&ca->copygc_heap);
+	free_heap(&ca->alloc_heap);
+	free_fifo(&ca->free_inc);
+	for (r = 0; r < RESERVE_NR; r++)
+		free_fifo(&ca->free[r]);
+
+	kvpfree(ca->buckets_nouse, nouse_bytes);
+	kvpfree(rcu_dereference_protected(ca->buckets[0], 1), array_bytes);
+
+	free_percpu(ca->usage[0]);
+}
+
+/*
+ * Allocate @ca's per-cpu usage counters, then its bucket array and
+ * associated structures sized for ca->mi.nbuckets.
+ *
+ * Returns 0 on success, -ENOMEM if the per-cpu allocation fails, or the
+ * result of bch2_dev_buckets_resize().
+ */
+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+	if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+		return -ENOMEM;
+
+	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
+}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
new file mode 100644
index 000000000000..ad6f731b1cea
--- /dev/null
+++ b/fs/bcachefs/buckets.h
@@ -0,0 +1,337 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#ifndef _BUCKETS_H
+#define _BUCKETS_H
+
+#include "buckets_types.h"
+#include "super.h"
+
+#define for_each_bucket(_b, _buckets)				\
+	for (_b = (_buckets)->b + (_buckets)->first_bucket;	\
+	     _b < (_buckets)->b + (_buckets)->nbuckets; _b++)	/* _b walks b[first_bucket] .. b[nbuckets - 1] */
+
+/*
+ * Atomically update the mark of bucket @g.
+ *
+ * @new is an lvalue of type struct bucket_mark; on each attempt it is loaded
+ * with the bucket's current mark and @expr is run to modify it. The result
+ * is installed with a 64 bit cmpxchg, retrying (and re-running @expr) if the
+ * mark changed underneath. Evaluates to the mark that was replaced.
+ *
+ * @g is evaluated exactly once.
+ */
+#define bucket_cmpxchg(g, new, expr)				\
+({								\
+	struct bucket *_g = (g);				\
+	u64 _v = atomic64_read(&_g->_mark.v);			\
+	struct bucket_mark _old;				\
+								\
+	do {							\
+		(new).v.counter = _old.v.counter = _v;		\
+		expr;						\
+	} while ((_v = atomic64_cmpxchg(&_g->_mark.v,		\
+			       _old.v.counter,			\
+			       (new).v.counter)) != _old.v.counter);\
+	_old;							\
+})
+
+static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
+						  bool gc)
+{
+	return rcu_dereference_check(ca->buckets[gc],	/* @gc picks which of ca->buckets[] to return */
+				     !ca->fs ||	/* accepted without a lock when no fs is attached */
+				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+				     lockdep_is_held(&ca->fs->gc_lock) ||
+				     lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+{
+	return __bucket_array(ca, false);	/* gc == false: returns ca->buckets[0] */
+}
+
+/*
+ * Return bucket number @b of @ca, from the array selected by @gc.
+ * BUGs if @b is outside [first_bucket, nbuckets).
+ */
+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
+{
+	struct bucket_array *arr = __bucket_array(ca, gc);
+
+	BUG_ON(b < arr->first_bucket || b >= arr->nbuckets);
+
+	return &arr->b[b];
+}
+
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+{
+	return __bucket(ca, b, false);	/* gc == false: looks @b up in ca->buckets[0] */
+}
+
+/*
+ * Stamp bucket @b of @ca with the current hand of the @rw bucket clock.
+ */
+static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
+					 size_t b, int rw)
+{
+	struct bucket *g = bucket(ca, b);
+
+	g->io_time[rw] = c->bucket_clock[rw].hand;
+}
+
+/*
+ * Distance, in 16 bit wrapping arithmetic, between the current hand of the
+ * @rw bucket clock and the time stamped on @g.
+ */
+static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
+{
+	const u16 stamped = g->io_time[rw];
+
+	return c->bucket_clock[rw].hand - stamped;
+}
+
+/*
+ * bucket_gc_gen() returns the difference between the bucket's current gen and
+ * the oldest gen of any pointer into that bucket in the btree, computed in
+ * wrapping 8 bit arithmetic.
+ */
+
+static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+{
+	const struct bucket *g = bucket(ca, b);
+	const u8 cur_gen = g->mark.gen;
+
+	return cur_gen - g->oldest_gen;
+}
+
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+				   const struct bch_extent_ptr *ptr)
+{
+	return sector_to_bucket(ca, ptr->offset);	/* bucket number for the pointer's sector offset, per sector_to_bucket() */
+}
+
+/*
+ * Return the bucket on @ca that @ptr points into, from the bucket array
+ * selected by @gc.
+ */
+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
+					const struct bch_extent_ptr *ptr,
+					bool gc)
+{
+	size_t nr = PTR_BUCKET_NR(ca, ptr);
+
+	return __bucket(ca, nr, gc);
+}
+
+/*
+ * Data type a pointer accounts as: BCH_DATA_BTREE for any pointer in a btree
+ * pointer key, otherwise BCH_DATA_CACHED or BCH_DATA_USER according to the
+ * pointer's cached bit.
+ */
+static inline enum bch_data_type ptr_data_type(const struct bkey *k,
+					       const struct bch_extent_ptr *ptr)
+{
+	if (k->type == KEY_TYPE_btree_ptr)
+		return BCH_DATA_BTREE;
+
+	if (ptr->cached)
+		return BCH_DATA_CACHED;
+
+	return BCH_DATA_USER;
+}
+
+/*
+ * Snapshot the mark of the bucket @ptr points into (non-gc bucket array).
+ * The lookup and read happen inside an RCU read-side critical section.
+ */
+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
+						 const struct bch_extent_ptr *ptr)
+{
+	struct bucket_mark snapshot;
+
+	rcu_read_lock();
+	snapshot = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark);
+	rcu_read_unlock();
+
+	return snapshot;
+}
+
+/*
+ * Compare two 8 bit generation numbers with wraparound: the difference
+ * a - b is reinterpreted as a signed 8 bit value.
+ */
+static inline int gen_cmp(u8 a, u8 b)
+{
+	s8 delta = (s8) (a - b);
+
+	return delta;
+}
+
+/*
+ * How many generations @a is ahead of @b, or 0 if it is not ahead.
+ */
+static inline int gen_after(u8 a, u8 b)
+{
+	int ahead = gen_cmp(a, b);
+
+	if (ahead <= 0)
+		return 0;
+
+	return ahead;
+}
+
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated.
+ *
+ * Returns how far the bucket's current gen is ahead of the pointer's gen, 0
+ * if the pointer is not stale.
+ */
+static inline u8 ptr_stale(struct bch_dev *ca,
+			   const struct bch_extent_ptr *ptr)
+{
+	struct bucket_mark m = ptr_bucket_mark(ca, ptr);
+
+	return gen_after(m.gen, ptr->gen);
+}
+
+/*
+ * Sectors occupied on disk by @live_size sectors of the extent @p decodes.
+ *
+ * Uncompressed (or empty) extents take @live_size as is; for compressed
+ * extents it is scaled by compressed_size / uncompressed_size, rounded up,
+ * and never less than one sector.
+ */
+static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p,
+				     unsigned live_size)
+{
+	if (!live_size || !p.crc.compression_type)
+		return live_size;
+
+	return max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size,
+				    p.crc.uncompressed_size));
+}
+
+static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p)
+{
+	return __ptr_disk_sectors(p, p.crc.live_size);	/* on-disk size of the extent's whole live range */
+}
+
+/* bucket gc marks */
+
+/* Total sectors in use in a bucket: dirty plus cached. */
+static inline unsigned bucket_sectors_used(struct bucket_mark mark)
+{
+	unsigned used = mark.dirty_sectors;
+
+	used += mark.cached_sectors;
+
+	return used;
+}
+
+/*
+ * True if the bucket is not owned by the allocator, has no data type set and
+ * holds no dirty or cached sectors.
+ */
+static inline bool bucket_unused(struct bucket_mark mark)
+{
+	return !(mark.owned_by_allocator ||
+		 mark.data_type ||
+		 bucket_sectors_used(mark));
+}
+
+/*
+ * True if the bucket is not owned by the allocator, holds no dirty sectors
+ * and is not part of a stripe. Cached sectors do not count against it.
+ */
+static inline bool is_available_bucket(struct bucket_mark mark)
+{
+	return !(mark.owned_by_allocator ||
+		 mark.dirty_sectors ||
+		 mark.stripe);
+}
+
+/*
+ * True if the bucket's last modification is in a journal entry newer than
+ * @last_seq_ondisk, i.e. it has not reached disk yet.
+ *
+ * Only the low 16 bits of the sequence numbers are kept, so the comparison
+ * must be done on the wrapped difference, the same way gen_cmp() compares
+ * gens. Subtracting the two values after casting each to s16 individually
+ * gives the wrong sign when they straddle the 0x7fff/0x8000 boundary.
+ */
+static inline bool bucket_needs_journal_commit(struct bucket_mark m,
+					       u16 last_seq_ondisk)
+{
+	return m.journal_seq_valid &&
+		(s16) (m.journal_seq - last_seq_ondisk) > 0;
+}
+
+/* Device usage: */
+
+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
+
+void bch2_dev_usage_from_buckets(struct bch_fs *);
+
+/*
+ * Buckets on @ca not counted as unavailable in @stats, out of the
+ * nbuckets - first_bucket the device has. Warns once and returns 0 if
+ * @stats claims more unavailable buckets than exist.
+ */
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+					  struct bch_dev_usage stats)
+{
+	u64 usable = ca->mi.nbuckets - ca->mi.first_bucket;
+
+	if (WARN_ONCE(stats.buckets_unavailable > usable,
+		      "buckets_unavailable overflow (%llu > %llu)\n",
+		      stats.buckets_unavailable, usable))
+		return 0;
+
+	return usable - stats.buckets_unavailable;
+}
+
+/*
+ * Number of reclaimable buckets - only for use by the allocator thread.
+ * Computed from a fresh bch2_dev_usage_read() of @ca.
+ */
+static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+
+	return __dev_buckets_available(ca, stats);
+}
+
+/*
+ * Available buckets per @stats, plus those currently sitting on the
+ * RESERVE_NONE free list and on free_inc.
+ */
+static inline u64 __dev_buckets_free(struct bch_dev *ca,
+				     struct bch_dev_usage stats)
+{
+	u64 nr = __dev_buckets_available(ca, stats);
+
+	nr += fifo_used(&ca->free[RESERVE_NONE]);
+	nr += fifo_used(&ca->free_inc);
+
+	return nr;
+}
+
+/* __dev_buckets_free() on a fresh bch2_dev_usage_read() of @ca. */
+static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+
+	return __dev_buckets_free(ca, stats);
+}
+
+/* Filesystem usage: */
+
+/*
+ * Size of a struct bch_fs_usage in u64s: the fixed part plus one u64 for
+ * each of the c->replicas.nr entries of its trailing replicas[] array.
+ */
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
+{
+	unsigned fixed_u64s = sizeof(struct bch_fs_usage) / sizeof(u64);
+
+	return fixed_u64s + READ_ONCE(c->replicas.nr);
+}
+
+void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
+
+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
+
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
+
+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
+
+void bch2_fs_usage_to_text(struct printbuf *,
+			   struct bch_fs *, struct bch_fs_usage *);
+
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
+
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *);
+
+/* key/bucket marking: */
+
+void bch2_bucket_seq_cleanup(struct bch_fs *);
+void bch2_fs_usage_initialize(struct bch_fs *);
+
+void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
+			    size_t, struct bucket_mark *);
+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
+			    size_t, bool, struct gc_pos, unsigned);
+void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+			       size_t, enum bch_data_type, unsigned,
+			       struct gc_pos, unsigned);
+
+#define BCH_BUCKET_MARK_INSERT			(1 << 0)	/* passed when marking the key being inserted */
+#define BCH_BUCKET_MARK_OVERWRITE		(1 << 1)	/* key is being overwritten; set by bch2_mark_overwrite() */
+#define BCH_BUCKET_MARK_OVERWRITE_SPLIT		(1 << 2)	/* added when the insert lands in the middle of the old extent */
+#define BCH_BUCKET_MARK_BUCKET_INVALIDATE	(1 << 3)
+#define BCH_BUCKET_MARK_GC			(1 << 4)	/* bch2_mark_key_locked() then takes counters from fs_usage_ptr() */
+#define BCH_BUCKET_MARK_ALLOC_READ		(1 << 5)
+#define BCH_BUCKET_MARK_NOATOMIC		(1 << 6)
+
+int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
+			 struct bch_fs_usage *, u64, unsigned);
+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
+		  struct bch_fs_usage *, u64, unsigned);
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+			struct disk_reservation *, unsigned);
+
+int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
+			struct bkey_s_c, struct bkey_i *,
+			struct bch_fs_usage *, unsigned);
+int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
+		     struct bch_fs_usage *, unsigned);
+
+int bch2_replicas_delta_list_apply(struct bch_fs *,
+				   struct bch_fs_usage *,
+				   struct replicas_delta_list *);
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+			unsigned, s64, unsigned);
+int bch2_trans_mark_update(struct btree_trans *,
+			   struct btree_iter *iter,
+			   struct bkey_i *insert);
+void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
+
+/* disk reservations: */
+
+void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
+
+/*
+ * Release @res if it holds any sectors; an empty reservation is a no-op.
+ */
+static inline void bch2_disk_reservation_put(struct bch_fs *c,
+					     struct disk_reservation *res)
+{
+	if (!res->sectors)
+		return;
+
+	__bch2_disk_reservation_put(c, res);
+}
+
+#define BCH_DISK_RESERVATION_NOFAIL		(1 << 0)
+
+int bch2_disk_reservation_add(struct bch_fs *,
+			     struct disk_reservation *,
+			     unsigned, int);
+
+/*
+ * Build an empty disk reservation (zero sectors) for @nr_replicas replicas.
+ * @c is currently unused.
+ */
+static inline struct disk_reservation
+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
+{
+	struct disk_reservation res = {
+		.sectors	= 0,
+#if 0
+		/* not used yet: */
+		.gen		= c->capacity_gen,
+#endif
+		.nr_replicas	= nr_replicas,
+	};
+
+	return res;
+}
+
+/*
+ * Initialise @res for @nr_replicas replicas and reserve
+ * @sectors * @nr_replicas sectors in it.
+ *
+ * Returns the result of bch2_disk_reservation_add().
+ */
+static inline int bch2_disk_reservation_get(struct bch_fs *c,
+					    struct disk_reservation *res,
+					    unsigned sectors,
+					    unsigned nr_replicas,
+					    int flags)
+{
+	unsigned total = sectors * nr_replicas;
+
+	*res = bch2_disk_reservation_init(c, nr_replicas);
+
+	return bch2_disk_reservation_add(c, res, total, flags);
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
+void bch2_dev_buckets_free(struct bch_dev *);
+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
+
+#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
new file mode 100644
index 000000000000..f3ff4a18b1fd
--- /dev/null
+++ b/fs/bcachefs/buckets_types.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_TYPES_H
+#define _BUCKETS_TYPES_H
+
+#include "bcachefs_format.h"
+#include "util.h"
+
+#define BUCKET_JOURNAL_SEQ_BITS		16
+
+/*
+ * Packed per-bucket state.  The anonymous struct overlays the atomic64_t @v
+ * (its fields add up to 64 bits), giving field access to the same storage.
+ */
+struct bucket_mark {
+	union {
+	atomic64_t	v;
+
+	struct {
+	u8		gen;
+	u8		data_type:3,
+			owned_by_allocator:1,
+			journal_seq_valid:1,
+			stripe:1;
+	u16		dirty_sectors;
+	u16		cached_sectors;
+
+	/*
+	 * low bits of journal sequence number when this bucket was most
+	 * recently modified: if journal_seq_valid is set, this bucket can't be
+	 * reused until the journal sequence number written to disk is >= the
+	 * bucket's journal sequence number:
+	 */
+	u16		journal_seq;
+	};
+	};
+};
+
+/*
+ * In-memory bucket: a bucket_mark plus io_time[], oldest_gen and gen_valid.
+ */
+struct bucket {
+	/*
+	 * _mark and mark alias the same storage; mark is the const-qualified
+	 * view of it.
+	 */
+	union {
+		struct bucket_mark	_mark;
+		const struct bucket_mark mark;
+	};
+
+	u16				io_time[2];
+	u8				oldest_gen;
+	unsigned			gen_valid:1;
+};
+
+/*
+ * Variable-length array of buckets, with an embedded rcu_head.
+ * NOTE(review): nbuckets presumably gives the length of b[] - confirm
+ * against the code that allocates this.
+ */
+struct bucket_array {
+	struct rcu_head		rcu;
+	u16			first_bucket;
+	size_t			nbuckets;
+	struct bucket		b[];
+};
+
+/*
+ * Per-device usage counters; buckets[] and sectors[] have one slot per data
+ * type (BCH_DATA_NR entries each).
+ */
+struct bch_dev_usage {
+	u64			buckets[BCH_DATA_NR];
+	u64			buckets_alloc;
+	u64			buckets_ec;
+	u64			buckets_unavailable;
+
+	/* _compressed_ sectors: */
+	u64			sectors[BCH_DATA_NR];
+	u64			sectors_fragmented;
+};
+
+/*
+ * Filesystem-wide usage counters.  The struct ends in a flexible array
+ * (replicas[]), so its size is not fixed by this definition.
+ */
+struct bch_fs_usage {
+	/* all fields are in units of 512 byte sectors: */
+
+	u64			online_reserved;
+
+	/* fields after online_reserved are cleared/recalculated by gc: */
+	/* zero-length marker: names the offset where those fields begin */
+	u64			gc_start[0];
+
+	u64			hidden;
+	u64			btree;
+	u64			data;
+	u64			cached;
+	u64			reserved;
+	u64			nr_inodes;
+
+	/* XXX: add stats for compression ratio */
+#if 0
+	u64			uncompressed;
+	u64			compressed;
+#endif
+
+	/* broken out: */
+
+	u64			persistent_reserved[BCH_REPLICAS_MAX];
+	u64			replicas[];
+};
+
+/*
+ * Four-counter usage summary.
+ * NOTE(review): presumably a condensed form of struct bch_fs_usage - confirm
+ * against the code that fills it in.
+ */
+struct bch_fs_usage_short {
+	u64			capacity;
+	u64			used;
+	u64			free;
+	u64			nr_inodes;
+};
+
+/* A signed delta paired with a replicas entry; the struct is packed. */
+struct replicas_delta {
+	s64			delta;
+	struct bch_replicas_entry r;
+} __packed;
+
+/*
+ * Header (size/used, a few counters) followed by trailing replicas_delta
+ * entries in d[].
+ */
+struct replicas_delta_list {
+	unsigned		size;
+	unsigned		used;
+
+	/*
+	 * memset_start/memset_end are empty marker members.
+	 * NOTE(review): presumably they delimit the range that gets zeroed
+	 * with memset() - confirm against the users of this struct.
+	 */
+	struct			{} memset_start;
+	u64			nr_inodes;
+	u64			persistent_reserved[BCH_REPLICAS_MAX];
+	struct			{} memset_end;
+	struct replicas_delta	d[0];
+};
+
+/*
+ * A reservation for space on disk:
+ *
+ * @sectors:	 sectors currently held; zero means nothing to release
+ * @gen:	 not used yet (its initialiser is compiled out)
+ * @nr_replicas: replica count the reservation was set up for
+ */
+struct disk_reservation {
+	u64			sectors;
+	u32			gen;
+	unsigned		nr_replicas;
+};
+
+/* Element type of the copygc_heap typedef that follows this struct. */
+struct copygc_heap_entry {
+	u8			gen;
+	u32			sectors;
+	u64			offset;
+};
+
+typedef HEAP(struct copygc_heap_entry) copygc_heap;
+
+#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
new file mode 100644
index 000000000000..059eca01ccc4
--- /dev/null
+++ b/fs/bcachefs/chardev.c
@@ -0,0 +1,671 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_CHARDEV
+
+#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "move.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/kthread.h>
+#include <linux/major.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+/*
+ * Resolve @dev to a member device.  With BCH_BY_INDEX set in @flags, @dev is
+ * an index into c->devs; otherwise it is a userspace pointer to a path string
+ * that is copied in and handed to bch2_dev_lookup().
+ *
+ * Returns with ref on ca->ref.  A bad index, an empty slot or a failed path
+ * copy is reported as an ERR_PTR().
+ */
+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
+					  unsigned flags)
+{
+	struct bch_dev *ca;
+	char *path;
+
+	if (!(flags & BCH_BY_INDEX)) {
+		path = strndup_user((const char __user *)
+				    (unsigned long) dev, PATH_MAX);
+		if (IS_ERR(path))
+			return ERR_CAST(path);
+
+		ca = bch2_dev_lookup(c, path);
+		kfree(path);
+		return ca;
+	}
+
+	if (dev >= c->sb.nr_devices)
+		return ERR_PTR(-EINVAL);
+
+	rcu_read_lock();
+	ca = rcu_dereference(c->devs[dev]);
+	if (ca)
+		percpu_ref_get(&ca->ref);
+	rcu_read_unlock();
+
+	return ca ?: ERR_PTR(-EINVAL);
+}
+
+#if 0
+/*
+ * Open a filesystem from an array of userspace device path pointers.
+ * (This function sits inside an #if 0 region and is currently not built.)
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+	struct bch_ioctl_assemble arg;
+	struct bch_fs *c;
+	u64 *user_devs = NULL;
+	char **devs = NULL;
+	unsigned i;
+	int ret = -EFAULT;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+	if (!user_devs)
+		return -ENOMEM;
+
+	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+	if (!devs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	if (copy_from_user(user_devs, user_arg->devs,
+			   sizeof(u64) * arg.nr_devs))
+		goto err;
+
+	for (i = 0; i < arg.nr_devs; i++) {
+		devs[i] = strndup_user((const char __user *)(unsigned long)
+				       user_devs[i],
+				       PATH_MAX);
+		/* strndup_user() reports failure as an ERR_PTR, never NULL */
+		if (IS_ERR(devs[i])) {
+			ret = PTR_ERR(devs[i]);
+			/* keep the cleanup loop from kfree()ing an ERR_PTR */
+			devs[i] = NULL;
+			goto err;
+		}
+	}
+
+	c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
+	ret = PTR_ERR_OR_ZERO(c);
+	if (!ret)
+		closure_put(&c->cl);
+err:
+	if (devs)
+		for (i = 0; i < arg.nr_devs; i++)
+			kfree(devs[i]);
+	kfree(devs);
+	kfree(user_devs);
+	return ret;
+}
+
+/*
+ * Hand a single userspace-supplied device path to
+ * bch2_fs_open_incremental().
+ * (This function sits inside an #if 0 region and is currently not built.)
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+	struct bch_ioctl_incremental arg;
+	const char *err;
+	char *path;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+	/* strndup_user() reports failure as an ERR_PTR, never NULL */
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	err = bch2_fs_open_incremental(path);
+	kfree(path);
+
+	if (err) {
+		pr_err("Could not register bcachefs devices: %s", err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif
+
+/*
+ * ioctls issued on the control node when no filesystem was found for its
+ * minor.  Both handlers are compiled out, so every command currently
+ * returns -ENOTTY.
+ */
+static long bch2_global_ioctl(unsigned cmd, void __user *arg)
+{
+	switch (cmd) {
+#if 0
+	case BCH_IOCTL_ASSEMBLE:
+		return bch2_ioctl_assemble(arg);
+	case BCH_IOCTL_INCREMENTAL:
+		return bch2_ioctl_incremental(arg);
+#endif
+	default:
+		return -ENOTTY;
+	}
+}
+
+/*
+ * Copy the filesystem's user-visible UUID (c->sb.user_uuid) to userspace.
+ *
+ * Returns 0 on success or -EFAULT if the destination is not writable.
+ */
+static long bch2_ioctl_query_uuid(struct bch_fs *c,
+			struct bch_ioctl_query_uuid __user *user_arg)
+{
+	/*
+	 * copy_to_user() returns the number of bytes left uncopied, which is
+	 * not a valid ioctl return value: translate it to -EFAULT.
+	 */
+	if (copy_to_user(&user_arg->uuid,
+			 &c->sb.user_uuid,
+			 sizeof(c->sb.user_uuid)))
+		return -EFAULT;
+
+	return 0;
+}
+
+#if 0
+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
+{
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	return bch2_fs_start(c);
+}
+
+static long bch2_ioctl_stop(struct bch_fs *c)
+{
+	bch2_fs_stop(c);
+	return 0;
+}
+#endif
+
+/*
+ * Add the device whose path userspace passed in arg.dev to the filesystem.
+ * No flags are accepted.
+ *
+ * Returns the result of bch2_dev_add(), or a negative errno if the
+ * arguments are invalid or the path cannot be copied in.
+ */
+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	char *path;
+	int ret;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+	/* strndup_user() reports failure as an ERR_PTR, never NULL */
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	ret = bch2_dev_add(c, path);
+	kfree(path);
+
+	return ret;
+}
+
+/*
+ * Remove a member device, identified by index or path (see
+ * bch2_device_lookup()).  Only the BCH_FORCE_IF_* flags and BCH_BY_INDEX are
+ * accepted.
+ */
+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	struct bch_dev *ca;
+
+	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			   BCH_FORCE_IF_METADATA_LOST|
+			   BCH_FORCE_IF_DEGRADED|
+			   BCH_BY_INDEX)) ||
+	    arg.pad)
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	/*
+	 * NOTE(review): unlike the other lookup callers, the ref taken by
+	 * bch2_device_lookup() is not dropped here - presumably
+	 * bch2_dev_remove() consumes it; confirm.
+	 */
+	return bch2_dev_remove(c, ca, arg.flags);
+}
+
+/*
+ * Bring the device whose path userspace passed in arg.dev online.
+ * No flags are accepted.
+ *
+ * Returns the result of bch2_dev_online(), or a negative errno if the
+ * arguments are invalid or the path cannot be copied in.
+ */
+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	char *path;
+	int ret;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+	/* strndup_user() reports failure as an ERR_PTR, never NULL */
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	ret = bch2_dev_online(c, path);
+	kfree(path);
+	return ret;
+}
+
+/*
+ * Take a member device offline, identified by index or path.  Only the
+ * BCH_FORCE_IF_* flags and BCH_BY_INDEX are accepted.  Returns the result of
+ * bch2_dev_offline().
+ */
+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			   BCH_FORCE_IF_METADATA_LOST|
+			   BCH_FORCE_IF_DEGRADED|
+			   BCH_BY_INDEX)) ||
+	    arg.pad)
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	ret = bch2_dev_offline(c, ca, arg.flags);
+	/* drop the ref taken by bch2_device_lookup() */
+	percpu_ref_put(&ca->ref);
+	return ret;
+}
+
+/*
+ * Change a member device's state to arg.new_state.  The device is identified
+ * by index or path; only the BCH_FORCE_IF_* flags and BCH_BY_INDEX are
+ * accepted and all three pad fields must be zero.  Returns the result of
+ * bch2_dev_set_state().
+ */
+static long bch2_ioctl_disk_set_state(struct bch_fs *c,
+			struct bch_ioctl_disk_set_state arg)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			   BCH_FORCE_IF_METADATA_LOST|
+			   BCH_FORCE_IF_DEGRADED|
+			   BCH_BY_INDEX)) ||
+	    arg.pad[0] || arg.pad[1] || arg.pad[2])
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+
+	/* drop the ref taken by bch2_device_lookup() */
+	percpu_ref_put(&ca->ref);
+	return ret;
+}
+
+/*
+ * State of one data job started by bch2_ioctl_data().  It is stored as the
+ * private_data of the anonymous file returned to userspace and freed when
+ * that file is released.
+ */
+struct bch_data_ctx {
+	struct bch_fs			*c;
+	struct bch_ioctl_data		arg;
+	struct bch_move_stats		stats;
+
+	/* return value of bch2_data_job(), written by the job thread */
+	int				ret;
+
+	struct task_struct		*thread;
+};
+
+/*
+ * kthread body for a data job: runs bch2_data_job() and records its result
+ * in ctx->ret.
+ */
+static int bch2_data_thread(void *arg)
+{
+	struct bch_data_ctx *ctx = arg;
+
+	ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+
+	/*
+	 * NOTE(review): U8_MAX presumably marks the job as finished for
+	 * readers of the progress event - confirm against userspace.
+	 */
+	ctx->stats.data_type = U8_MAX;
+	return 0;
+}
+
+/*
+ * ->release for the data job file: stop the job thread, drop the task
+ * reference taken in bch2_ioctl_data() and free the context.
+ */
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+	struct bch_data_ctx *ctx = file->private_data;
+
+	kthread_stop(ctx->thread);
+	put_task_struct(ctx->thread);
+	kfree(ctx);
+	return 0;
+}
+
+/*
+ * ->read for the data job file: each call returns one
+ * BCH_DATA_EVENT_PROGRESS event built from the job's current stats.
+ *
+ * Returns sizeof(event) on success, -EINVAL if @len is too small to hold an
+ * event, or -EFAULT if @buf is not writable.
+ */
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+				  size_t len, loff_t *ppos)
+{
+	struct bch_data_ctx *ctx = file->private_data;
+	struct bch_fs *c = ctx->c;
+	struct bch_ioctl_data_event e = {
+		.type			= BCH_DATA_EVENT_PROGRESS,
+		.p.data_type		= ctx->stats.data_type,
+		.p.btree_id		= ctx->stats.btree_id,
+		.p.pos			= ctx->stats.pos,
+		.p.sectors_done		= atomic64_read(&ctx->stats.sectors_seen),
+		.p.sectors_total	= bch2_fs_usage_read_short(c).used,
+	};
+
+	if (len < sizeof(e))
+		return -EINVAL;
+
+	/*
+	 * copy_to_user() returns the number of bytes left uncopied; passing
+	 * that on would look like a short successful read to the caller.
+	 */
+	if (copy_to_user(buf, &e, sizeof(e)))
+		return -EFAULT;
+
+	return sizeof(e);
+}
+
+static const struct file_operations bcachefs_data_ops = {
+	.release	= bch2_data_job_release,
+	.read		= bch2_data_job_read,
+	.llseek		= no_llseek,
+};
+
+/*
+ * Start a data job in a new kthread and return a file descriptor for it.
+ * Reading the descriptor yields progress events (bch2_data_job_read());
+ * closing it stops the thread and frees the job context
+ * (bch2_data_job_release()).
+ *
+ * Returns the new fd on success or a negative errno.
+ */
+static long bch2_ioctl_data(struct bch_fs *c,
+			    struct bch_ioctl_data arg)
+{
+	struct bch_data_ctx *ctx = NULL;
+	struct file *file = NULL;
+	unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+	int ret, fd = -1;
+
+	if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+		return -EINVAL;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->c = c;
+	ctx->arg = arg;
+
+	ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+	if (IS_ERR(ctx->thread)) {
+		ret = PTR_ERR(ctx->thread);
+		goto err;
+	}
+
+	ret = get_unused_fd_flags(flags);
+	if (ret < 0)
+		goto err;
+	fd = ret;
+
+	file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto err;
+	}
+
+	/*
+	 * Take the task reference that bch2_data_job_release() drops, and
+	 * start the job, before publishing the fd: once fd_install() has run,
+	 * userspace can close the fd at any moment, which stops the thread
+	 * and frees ctx.
+	 */
+	get_task_struct(ctx->thread);
+	wake_up_process(ctx->thread);
+
+	fd_install(fd, file);
+
+	return fd;
+err:
+	if (fd >= 0)
+		put_unused_fd(fd);
+	if (!IS_ERR_OR_NULL(ctx->thread))
+		kthread_stop(ctx->thread);
+	kfree(ctx);
+	return ret;
+}
+
+/*
+ * Report filesystem-wide and per-device usage to userspace.
+ *
+ * Userspace supplies arg.nr_devices slots in user_arg->devs[].  All of them
+ * are first written with .alive = 0, then the slot of each member device is
+ * filled in.
+ *
+ * Returns 0 on success, -EINVAL if the filesystem has not been started,
+ * -EFAULT if userspace memory cannot be accessed, -ENOMEM if the usage
+ * snapshot cannot be allocated, or -ERANGE if a member device's index does
+ * not fit in the slots provided.
+ */
+static long bch2_ioctl_usage(struct bch_fs *c,
+			     struct bch_ioctl_usage __user *user_arg)
+{
+	struct bch_ioctl_usage arg;
+	struct bch_dev *ca;
+	unsigned i, j;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	/* mark every slot userspace provided as not alive */
+	for (i = 0; i < arg.nr_devices; i++) {
+		struct bch_ioctl_dev_usage dst = { .alive = 0 };
+
+		/* copy_to_user() returns bytes left uncopied, not an errno */
+		if (copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)))
+			return -EFAULT;
+	}
+
+	{
+		struct bch_fs_usage *src;
+		struct bch_ioctl_fs_usage dst = {
+			.capacity		= c->capacity,
+		};
+
+		src = bch2_fs_usage_read(c);
+		if (!src)
+			return -ENOMEM;
+
+		dst.used		= bch2_fs_sectors_used(c, src);
+		dst.online_reserved	= src->online_reserved;
+
+		/*
+		 * NOTE(review): the read lock released here is presumably
+		 * taken inside bch2_fs_usage_read() - confirm.
+		 */
+		percpu_up_read(&c->mark_lock);
+
+		for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+			dst.persistent_reserved[i] =
+				src->persistent_reserved[i];
+#if 0
+			for (j = 0; j < BCH_DATA_NR; j++)
+				dst.sectors[j][i] = src.replicas[i].data[j];
+#endif
+		}
+
+		kfree(src);
+
+		if (copy_to_user(&user_arg->fs, &dst, sizeof(dst)))
+			return -EFAULT;
+	}
+
+	for_each_member_device(ca, c, i) {
+		struct bch_dev_usage src = bch2_dev_usage_read(c, ca);
+		struct bch_ioctl_dev_usage dst = {
+			.alive		= 1,
+			.state		= ca->mi.state,
+			.bucket_size	= ca->mi.bucket_size,
+			.nr_buckets	= ca->mi.nbuckets - ca->mi.first_bucket,
+		};
+
+		if (ca->dev_idx >= arg.nr_devices) {
+			percpu_ref_put(&ca->ref);
+			return -ERANGE;
+		}
+
+		if (percpu_ref_tryget(&ca->io_ref)) {
+			dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev);
+			percpu_ref_put(&ca->io_ref);
+		}
+
+		for (j = 0; j < BCH_DATA_NR; j++) {
+			dst.buckets[j] = src.buckets[j];
+			dst.sectors[j] = src.sectors[j];
+		}
+
+		if (copy_to_user(&user_arg->devs[i], &dst, sizeof(dst))) {
+			/*
+			 * Leaving the loop early: drop the device ref, as
+			 * the -ERANGE exit above does.
+			 */
+			percpu_ref_put(&ca->ref);
+			return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Copy a superblock to the userspace buffer at arg.sb.  With BCH_READ_DEV
+ * set, the superblock of the device identified by arg.dev is used,
+ * otherwise the filesystem's (c->disk_sb).  The copy is made under
+ * c->sb_lock.
+ *
+ * Returns 0 on success, -EINVAL for bad flags/padding, -ERANGE if the buffer
+ * (arg.size) is smaller than the superblock, -EFAULT if the buffer is not
+ * writable, or the error from the device lookup.
+ */
+static long bch2_ioctl_read_super(struct bch_fs *c,
+				  struct bch_ioctl_read_super arg)
+{
+	struct bch_dev *ca = NULL;
+	struct bch_sb *sb;
+	int ret = 0;
+
+	if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
+	    arg.pad)
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	if (arg.flags & BCH_READ_DEV) {
+		ca = bch2_device_lookup(c, arg.dev, arg.flags);
+
+		if (IS_ERR(ca)) {
+			ret = PTR_ERR(ca);
+			goto err;
+		}
+
+		sb = ca->disk_sb.sb;
+	} else {
+		sb = c->disk_sb.sb;
+	}
+
+	if (vstruct_bytes(sb) > arg.size) {
+		ret = -ERANGE;
+		goto err;
+	}
+
+	/* copy_to_user() returns bytes left uncopied, not an errno */
+	if (copy_to_user((void __user *)(unsigned long)arg.sb,
+			 sb, vstruct_bytes(sb)))
+		ret = -EFAULT;
+err:
+	/* ca holds an ERR_PTR when the lookup failed: nothing to put then */
+	if (!IS_ERR_OR_NULL(ca))
+		percpu_ref_put(&ca->ref);
+	mutex_unlock(&c->sb_lock);
+	return ret;
+}
+
+/*
+ * Find the online member whose block device number matches arg.dev (in
+ * huge_encode_dev() form).  Returns the loop index at which it was found,
+ * or -ENOENT if there is no match.
+ */
+static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
+				    struct bch_ioctl_disk_get_idx arg)
+{
+	dev_t dev = huge_decode_dev(arg.dev);
+	struct bch_dev *ca;
+	unsigned i;
+
+	for_each_online_member(ca, c, i)
+		if (ca->disk_sb.bdev->bd_dev == dev) {
+			/* leaving the iteration early: drop the io_ref first */
+			percpu_ref_put(&ca->io_ref);
+			return i;
+		}
+
+	return -ENOENT;
+}
+
+/*
+ * Resize a member device to arg.nbuckets buckets.  The device is identified
+ * by index or path; BCH_BY_INDEX is the only flag accepted.  Returns the
+ * result of bch2_dev_resize().
+ */
+static long bch2_ioctl_disk_resize(struct bch_fs *c,
+				   struct bch_ioctl_disk_resize arg)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	if (arg.pad || (arg.flags & ~BCH_BY_INDEX))
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	ret = bch2_dev_resize(c, ca, arg.nbuckets);
+
+	/* drop the ref taken by bch2_device_lookup() */
+	percpu_ref_put(&ca->ref);
+
+	return ret;
+}
+
+/*
+ * Copy an ioctl argument of type @_argtype in from userspace and dispatch
+ * to bch2_ioctl_<_name>(c, arg-by-value).  Expects `c` and `arg` to be in
+ * scope at the point of use, and always returns from the enclosing
+ * function.
+ */
+#define BCH_IOCTL(_name, _argtype)					\
+do {									\
+	_argtype i;							\
+									\
+	if (copy_from_user(&i, arg, sizeof(i)))				\
+		return -EFAULT;						\
+	return bch2_ioctl_##_name(c, i);				\
+} while (0)
+
+/*
+ * Dispatch an ioctl issued against filesystem @c.  Commands are checked in
+ * three tiers:
+ *  1. no capability needed (query uuid, usage);
+ *  2. CAP_SYS_ADMIN needed, filesystem need not be started (read super,
+ *     disk get idx);
+ *  3. CAP_SYS_ADMIN needed and filesystem started (everything else).
+ *
+ * Returns the handler's result, -EPERM without CAP_SYS_ADMIN, -EINVAL if the
+ * filesystem is not started, or -ENOTTY for an unknown command.
+ */
+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
+{
+	/* ioctls that don't require admin cap: */
+	switch (cmd) {
+	case BCH_IOCTL_QUERY_UUID:
+		return bch2_ioctl_query_uuid(c, arg);
+	case BCH_IOCTL_USAGE:
+		return bch2_ioctl_usage(c, arg);
+	}
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* BCH_IOCTL() always returns, so the cases below never fall through */
+	switch (cmd) {
+#if 0
+	case BCH_IOCTL_START:
+		BCH_IOCTL(start, struct bch_ioctl_start);
+	case BCH_IOCTL_STOP:
+		return bch2_ioctl_stop(c);
+#endif
+	case BCH_IOCTL_READ_SUPER:
+		BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+	case BCH_IOCTL_DISK_GET_IDX:
+		BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+	}
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+
+	/* ioctls that do require admin cap: */
+	switch (cmd) {
+	case BCH_IOCTL_DISK_ADD:
+		BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_REMOVE:
+		BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_ONLINE:
+		BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_OFFLINE:
+		BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_SET_STATE:
+		BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+	case BCH_IOCTL_DATA:
+		BCH_IOCTL(data, struct bch_ioctl_data);
+	case BCH_IOCTL_DISK_RESIZE:
+		BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+static DEFINE_IDR(bch_chardev_minor);
+
+/*
+ * unlocked_ioctl entry point for every bcachefs control node.  Minors below
+ * U8_MAX are looked up in bch_chardev_minor; if a filesystem is found the
+ * ioctl is routed to it, otherwise it is treated as a global ioctl.
+ */
+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+	void __user *arg = (void __user *) v;
+	unsigned minor = iminor(file_inode(filp));
+	struct bch_fs *c = NULL;
+
+	if (minor < U8_MAX)
+		c = idr_find(&bch_chardev_minor, minor);
+
+	if (!c)
+		return bch2_global_ioctl(cmd, arg);
+
+	return bch2_fs_ioctl(c, cmd, arg);
+}
+
+static const struct file_operations bch_chardev_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl = bch2_chardev_ioctl,
+	.open		= nonseekable_open,
+};
+
+static int bch_chardev_major;
+static struct class *bch_chardev_class;
+static struct device *bch_chardev;
+
+/*
+ * Tear down the per-filesystem control node.  Each step is guarded, so this
+ * also copes with a bch2_fs_chardev_init() that failed part-way (c->chardev
+ * holding an ERR_PTR, or c->minor negative).
+ */
+void bch2_fs_chardev_exit(struct bch_fs *c)
+{
+	if (!IS_ERR_OR_NULL(c->chardev))
+		device_unregister(c->chardev);
+	if (c->minor >= 0)
+		idr_remove(&bch_chardev_minor, c->minor);
+}
+
+/*
+ * Allocate a minor number for @c and create its "bcachefs%u-ctl" device
+ * node.
+ *
+ * Returns 0 on success or a negative errno.  On failure c->minor and
+ * c->chardev are left as they are, for bch2_fs_chardev_exit() to clean up.
+ */
+int bch2_fs_chardev_init(struct bch_fs *c)
+{
+	/*
+	 * Minor U8_MAX belongs to the global "bcachefs-ctl" node, and
+	 * bch2_chardev_ioctl() only looks up minors below U8_MAX, so never
+	 * hand out anything at or above it (the end bound is exclusive).
+	 */
+	c->minor = idr_alloc(&bch_chardev_minor, c, 0, U8_MAX, GFP_KERNEL);
+	if (c->minor < 0)
+		return c->minor;
+
+	c->chardev = device_create(bch_chardev_class, NULL,
+				   MKDEV(bch_chardev_major, c->minor), c,
+				   "bcachefs%u-ctl", c->minor);
+	if (IS_ERR(c->chardev))
+		return PTR_ERR(c->chardev);
+
+	return 0;
+}
+
+/*
+ * Undo bch2_chardev_init(): destroy the global control device and its
+ * class, then release the character device major.  Each step is guarded, so
+ * this is safe after a partially failed init.
+ */
+void bch2_chardev_exit(void)
+{
+	if (!IS_ERR_OR_NULL(bch_chardev_class)) {
+		device_destroy(bch_chardev_class,
+			       MKDEV(bch_chardev_major, U8_MAX));
+		class_destroy(bch_chardev_class);
+	}
+	/* use the same name the major was registered under */
+	if (bch_chardev_major > 0)
+		unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
+}
+
+/*
+ * Register a dynamically allocated character device major, create the
+ * "bcachefs" class and the global "bcachefs-ctl" node at minor U8_MAX.
+ *
+ * Returns 0 on success or a negative errno.  Nothing is unwound here on
+ * failure.
+ * NOTE(review): presumably the caller runs bch2_chardev_exit() when this
+ * fails - confirm.
+ */
+int __init bch2_chardev_init(void)
+{
+	bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
+	if (bch_chardev_major < 0)
+		return bch_chardev_major;
+
+	bch_chardev_class = class_create(THIS_MODULE, "bcachefs");
+	if (IS_ERR(bch_chardev_class))
+		return PTR_ERR(bch_chardev_class);
+
+	bch_chardev = device_create(bch_chardev_class, NULL,
+				    MKDEV(bch_chardev_major, U8_MAX),
+				    NULL, "bcachefs-ctl");
+	if (IS_ERR(bch_chardev))
+		return PTR_ERR(bch_chardev);
+
+	return 0;
+}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
new file mode 100644
index 000000000000..3a4890d39ff9
--- /dev/null
+++ b/fs/bcachefs/chardev.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHARDEV_H
+#define _BCACHEFS_CHARDEV_H
+
+#ifndef NO_BCACHEFS_FS
+
+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
+
+void bch2_fs_chardev_exit(struct bch_fs *);
+int bch2_fs_chardev_init(struct bch_fs *);
+
+void bch2_chardev_exit(void);
+int __init bch2_chardev_init(void);
+
+#else
+
+/* Stub used when NO_BCACHEFS_FS is defined: every ioctl fails with -ENOSYS */
+static inline long bch2_fs_ioctl(struct bch_fs *c,
+				unsigned cmd, void __user * arg)
+{
+	return -ENOSYS;
+}
+
+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
+
+static inline void bch2_chardev_exit(void) {}
+static inline int __init bch2_chardev_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
new file mode 100644
index 000000000000..2e1dfdc68e15
--- /dev/null
+++ b/fs/bcachefs/checksum.c
@@ -0,0 +1,617 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/crc32c.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+
+/*
+ * Starting value of the running checksum for @type: all ones (32 or 64
+ * bits) for the _NONZERO variants, zero for the others.  BUG()s on any
+ * other type.
+ */
+static u64 bch2_checksum_init(unsigned type)
+{
+	switch (type) {
+	case BCH_CSUM_NONE:
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64:
+		return 0;
+	case BCH_CSUM_CRC32C_NONZERO:
+		return U32_MAX;
+	case BCH_CSUM_CRC64_NONZERO:
+		return U64_MAX;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Finish a running checksum: the _NONZERO variants are XORed with all ones
+ * (32 or 64 bits), the plain CRC types are returned unchanged and
+ * BCH_CSUM_NONE is always 0.  BUG()s on any other type.
+ */
+static u64 bch2_checksum_final(unsigned type, u64 crc)
+{
+	switch (type) {
+	case BCH_CSUM_NONE:
+		return 0;
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64:
+		return crc;
+	case BCH_CSUM_CRC32C_NONZERO:
+		return crc ^ U32_MAX;
+	case BCH_CSUM_CRC64_NONZERO:
+		return crc ^ U64_MAX;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Feed @len bytes at @data into the running checksum @crc and return the
+ * new value: crc32c() for the CRC32C types, crc64_be() for the CRC64 types,
+ * 0 for BCH_CSUM_NONE.  BUG()s on any other type.
+ */
+static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+{
+	switch (type) {
+	case BCH_CSUM_NONE:
+		return 0;
+	case BCH_CSUM_CRC32C_NONZERO:
+	case BCH_CSUM_CRC32C:
+		return crc32c(crc, data, len);
+	case BCH_CSUM_CRC64_NONZERO:
+	case BCH_CSUM_CRC64:
+		return crc64_be(crc, data, len);
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Encrypt @len bytes described by scatterlist @sg in place (source and
+ * destination are the same list), using @nonce.d as the IV.  Any error from
+ * the cipher triggers BUG_ON().
+ */
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+				 struct nonce nonce,
+				 struct scatterlist *sg, size_t len)
+{
+	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+	int ret;
+
+	skcipher_request_set_sync_tfm(req, tfm);
+	skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+
+	ret = crypto_skcipher_encrypt(req);
+	BUG_ON(ret);
+}
+
+/*
+ * Encrypt the @len byte buffer @buf in place by wrapping it in a
+ * single-entry scatterlist and calling do_encrypt_sg().
+ */
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
+			      struct nonce nonce,
+			      void *buf, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_one(&sg, buf, len);
+	do_encrypt_sg(tfm, nonce, &sg, len);
+}
+
+/*
+ * Encrypt @len bytes at @buf in place with chacha20, using @key and @nonce.
+ * A temporary cipher handle is allocated for the call and freed again.
+ *
+ * Returns 0 on success or a negative errno if the cipher cannot be allocated
+ * or keyed.
+ */
+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+			    void *buf, size_t len)
+{
+	struct crypto_sync_skcipher *chacha20 =
+		crypto_alloc_sync_skcipher("chacha20", 0, 0);
+	int ret;
+
+	/* allocation failure is reported as an ERR_PTR, never NULL */
+	if (IS_ERR(chacha20)) {
+		pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
+		return PTR_ERR(chacha20);
+	}
+
+	ret = crypto_skcipher_setkey(&chacha20->base,
+				     (void *) key, sizeof(*key));
+	if (ret) {
+		pr_err("crypto_skcipher_setkey() error: %i", ret);
+		goto err;
+	}
+
+	do_encrypt(chacha20, nonce, buf, len);
+err:
+	crypto_free_sync_skcipher(chacha20);
+	return ret;
+}
+
+/*
+ * Derive a Poly1305 key for @nonce and start a hash with it: a zeroed
+ * buffer is encrypted with the filesystem's chacha20 cipher under @nonce
+ * with BCH_NONCE_POLY XORed into its last word, and the result is fed to
+ * @desc as the first data.
+ */
+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+			 struct nonce nonce)
+{
+	u8 key[POLY1305_KEY_SIZE];
+
+	nonce.d[3] ^= BCH_NONCE_POLY;
+
+	memset(key, 0, sizeof(key));
+	do_encrypt(c->chacha20, nonce, key, sizeof(key));
+
+	desc->tfm = c->poly1305;
+	crypto_shash_init(desc);
+	crypto_shash_update(desc, key, sizeof(key));
+
+	/* don't leave the derived key behind on the stack */
+	memzero_explicit(key, sizeof(key));
+}
+
+/*
+ * Checksum @len bytes at @data with checksum type @type.
+ *
+ * CRC types (and BCH_CSUM_NONE) go through the init/update/final helpers
+ * and the result is stored little-endian in .lo.  The chacha20/poly1305
+ * types compute a Poly1305 digest keyed via gen_poly_key() and keep the
+ * first bch_crc_bytes[type] bytes of it.  BUG()s on any other type.
+ */
+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
+			      struct nonce nonce, const void *data, size_t len)
+{
+	switch (type) {
+	case BCH_CSUM_NONE:
+	case BCH_CSUM_CRC32C_NONZERO:
+	case BCH_CSUM_CRC64_NONZERO:
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64: {
+		u64 crc = bch2_checksum_init(type);
+
+		crc = bch2_checksum_update(type, crc, data, len);
+		crc = bch2_checksum_final(type, crc);
+
+		return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+	}
+
+	case BCH_CSUM_CHACHA20_POLY1305_80:
+	case BCH_CSUM_CHACHA20_POLY1305_128: {
+		SHASH_DESC_ON_STACK(desc, c->poly1305);
+		u8 digest[POLY1305_DIGEST_SIZE];
+		struct bch_csum ret = { 0 };
+
+		gen_poly_key(c, desc, nonce);
+
+		crypto_shash_update(desc, data, len);
+		crypto_shash_final(desc, digest);
+
+		/* truncate the digest to the width of this checksum type */
+		memcpy(&ret, digest, bch_crc_bytes[type]);
+		return ret;
+	}
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Encrypt @len bytes at @data in place with the filesystem's chacha20
+ * cipher.  Does nothing unless @type is an encryption checksum type.
+ */
+void bch2_encrypt(struct bch_fs *c, unsigned type,
+		  struct nonce nonce, void *data, size_t len)
+{
+	if (bch2_csum_type_is_encryption(type))
+		do_encrypt(c->chacha20, nonce, data, len);
+}
+
+/*
+ * Checksum the part of @bio described by @iter.  @iter is both the start
+ * position and the iteration cursor, so it is advanced as data is consumed.
+ *
+ * With CONFIG_HIGHMEM each segment is mapped with kmap_atomic() for the
+ * duration of the update; otherwise pages are addressed directly through
+ * page_address().  BUG()s on an unknown checksum type.
+ */
+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
+					   struct nonce nonce, struct bio *bio,
+					   struct bvec_iter *iter)
+{
+	struct bio_vec bv;
+
+	switch (type) {
+	case BCH_CSUM_NONE:
+		return (struct bch_csum) { 0 };
+	case BCH_CSUM_CRC32C_NONZERO:
+	case BCH_CSUM_CRC64_NONZERO:
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64: {
+		u64 crc = bch2_checksum_init(type);
+
+#ifdef CONFIG_HIGHMEM
+		__bio_for_each_segment(bv, bio, *iter, *iter) {
+			void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+			crc = bch2_checksum_update(type,
+				crc, p, bv.bv_len);
+			kunmap_atomic(p);
+		}
+#else
+		__bio_for_each_bvec(bv, bio, *iter, *iter)
+			crc = bch2_checksum_update(type, crc,
+				page_address(bv.bv_page) + bv.bv_offset,
+				bv.bv_len);
+#endif
+		crc = bch2_checksum_final(type, crc);
+		return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+	}
+
+	case BCH_CSUM_CHACHA20_POLY1305_80:
+	case BCH_CSUM_CHACHA20_POLY1305_128: {
+		SHASH_DESC_ON_STACK(desc, c->poly1305);
+		u8 digest[POLY1305_DIGEST_SIZE];
+		struct bch_csum ret = { 0 };
+
+		gen_poly_key(c, desc, nonce);
+
+#ifdef CONFIG_HIGHMEM
+		__bio_for_each_segment(bv, bio, *iter, *iter) {
+			void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+			crypto_shash_update(desc, p, bv.bv_len);
+			kunmap_atomic(p);
+		}
+#else
+		__bio_for_each_bvec(bv, bio, *iter, *iter)
+			crypto_shash_update(desc,
+				page_address(bv.bv_page) + bv.bv_offset,
+				bv.bv_len);
+#endif
+		crypto_shash_final(desc, digest);
+
+		/* truncate the digest to the width of this checksum type */
+		memcpy(&ret, digest, bch_crc_bytes[type]);
+		return ret;
+	}
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Checksum @bio from its current position.  Works on a copy of
+ * bio->bi_iter, so the bio's own iterator is left untouched.
+ */
+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
+				  struct nonce nonce, struct bio *bio)
+{
+	struct bvec_iter iter = bio->bi_iter;
+
+	return __bch2_checksum_bio(c, type, nonce, bio, &iter);
+}
+
+/*
+ * Encrypt the data of @bio in place with the filesystem's chacha20 cipher.
+ * Does nothing unless @type is an encryption checksum type.
+ *
+ * Segments are gathered into an on-stack scatterlist of 16 entries; each
+ * time it fills up the batch is encrypted and the nonce is advanced by the
+ * number of bytes processed so far in that batch.
+ */
+void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+		      struct nonce nonce, struct bio *bio)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	struct scatterlist sgl[16], *sg = sgl;
+	size_t bytes = 0;
+
+	if (!bch2_csum_type_is_encryption(type))
+		return;
+
+	sg_init_table(sgl, ARRAY_SIZE(sgl));
+
+	bio_for_each_segment(bv, bio, iter) {
+		/* table full: encrypt what has been gathered and start over */
+		if (sg == sgl + ARRAY_SIZE(sgl)) {
+			sg_mark_end(sg - 1);
+			do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+			nonce = nonce_add(nonce, bytes);
+			bytes = 0;
+
+			sg_init_table(sgl, ARRAY_SIZE(sgl));
+			sg = sgl;
+		}
+
+		sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
+		bytes += bv.bv_len;
+	}
+
+	/* nothing gathered (empty bio): sg - 1 would point before sgl[] */
+	if (sg == sgl)
+		return;
+
+	sg_mark_end(sg - 1);
+	do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+}
+
+/*
+ * Combine checksum @a with checksum @b, where @b covers the @b_len bytes
+ * that follow the data covered by @a: @a is first advanced over @b_len zero
+ * bytes (fed at most a page at a time from the zero page) and then XORed
+ * with @b.  @type must be a mergeable checksum type.
+ */
+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
+				    struct bch_csum b, size_t b_len)
+{
+	const void *zeroes = page_address(ZERO_PAGE(0));
+
+	BUG_ON(!bch2_checksum_mergeable(type));
+
+	for (; b_len; ) {
+		unsigned chunk = min_t(unsigned, b_len, PAGE_SIZE);
+
+		a.lo = bch2_checksum_update(type, a.lo, zeroes, chunk);
+		b_len -= chunk;
+	}
+
+	a.lo ^= b.lo;
+	a.hi ^= b.hi;
+	return a;
+}
+
+/*
+ * Verify the existing checksum of @bio (@crc_old) and compute new checksums
+ * of type @new_csum_type for up to two leading pieces of it.
+ *
+ * The bio is treated as three consecutive pieces: @len_a sectors, @len_b
+ * sectors and whatever remains.  New crc descriptors for the first two are
+ * written to @crc_a and @crc_b; the third has no output.
+ *
+ * When the old and new checksum types match and are mergeable, the three
+ * piece checksums are merged and compared with the old one; otherwise the
+ * whole bio is checksummed again with the old type for the comparison.
+ *
+ * Returns 0 on success or -EIO if the data does not match @crc_old (in which
+ * case nothing is written to @crc_a/@crc_b).
+ */
+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
+			struct bversion version,
+			struct bch_extent_crc_unpacked crc_old,
+			struct bch_extent_crc_unpacked *crc_a,
+			struct bch_extent_crc_unpacked *crc_b,
+			unsigned len_a, unsigned len_b,
+			unsigned new_csum_type)
+{
+	struct bvec_iter iter = bio->bi_iter;
+	struct nonce nonce = extent_nonce(version, crc_old);
+	struct bch_csum merged = { 0 };
+	struct crc_split {
+		struct bch_extent_crc_unpacked	*crc;
+		unsigned			len;
+		unsigned			csum_type;
+		struct bch_csum			csum;
+	} splits[3] = {
+		{ crc_a, len_a, new_csum_type },
+		{ crc_b, len_b, new_csum_type },
+		{ NULL,	 bio_sectors(bio) - len_a - len_b, new_csum_type },
+	}, *i;
+	bool mergeable = crc_old.csum_type == new_csum_type &&
+		bch2_checksum_mergeable(new_csum_type);
+	unsigned crc_nonce = crc_old.nonce;
+
+	BUG_ON(len_a + len_b > bio_sectors(bio));
+	BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
+	BUG_ON(crc_old.compression_type);
+	BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
+	       bch2_csum_type_is_encryption(new_csum_type));
+
+	/*
+	 * Walk the pieces in order (lengths are in 512 byte sectors, hence
+	 * << 9).  A piece is checksummed if it has an output or if all of
+	 * them are needed for merging; otherwise the iterator just skips it.
+	 */
+	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+		iter.bi_size = i->len << 9;
+		if (mergeable || i->crc)
+			i->csum = __bch2_checksum_bio(c, i->csum_type,
+						      nonce, bio, &iter);
+		else
+			bio_advance_iter(bio, &iter, i->len << 9);
+		nonce = nonce_add(nonce, i->len << 9);
+	}
+
+	if (mergeable)
+		for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
+			merged = bch2_checksum_merge(new_csum_type, merged,
+						     i->csum, i->len << 9);
+	else
+		merged = bch2_checksum_bio(c, crc_old.csum_type,
+				extent_nonce(version, crc_old), bio);
+
+	if (bch2_crc_cmp(merged, crc_old.csum))
+		return -EIO;
+
+	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+		if (i->crc)
+			*i->crc = (struct bch_extent_crc_unpacked) {
+				.csum_type		= i->csum_type,
+				.compressed_size	= i->len,
+				.uncompressed_size	= i->len,
+				.offset			= 0,
+				.live_size		= i->len,
+				.nonce			= crc_nonce,
+				.csum			= i->csum,
+			};
+
+		/* for encrypted data each piece's nonce starts after the last */
+		if (bch2_csum_type_is_encryption(new_csum_type))
+			crc_nonce += i->len;
+	}
+
+	return 0;
+}
+
+#ifdef __KERNEL__
+/*
+ * Kernel variant: look up the "logon" key named "bcachefs:<user uuid>" and
+ * copy its payload into @key.
+ *
+ * Returns 0 on success, the error from request_key(), or -EINVAL if the
+ * payload is not exactly sizeof(*key) bytes.
+ */
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+	char key_description[60];
+	struct key *keyring_key;
+	const struct user_key_payload *ukp;
+	int ret;
+
+	snprintf(key_description, sizeof(key_description),
+		 "bcachefs:%pUb", &sb->user_uuid);
+
+	keyring_key = request_key(&key_type_logon, key_description, NULL);
+	if (IS_ERR(keyring_key))
+		return PTR_ERR(keyring_key);
+
+	/* the payload is read with the key's semaphore held */
+	down_read(&keyring_key->sem);
+	ukp = dereference_key_locked(keyring_key);
+	if (ukp->datalen == sizeof(*key)) {
+		memcpy(key, ukp->data, ukp->datalen);
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+	up_read(&keyring_key->sem);
+	key_put(keyring_key);
+
+	return ret;
+}
+#else
+#include <keyutils.h>
+#include <uuid/uuid.h>
+
+/*
+ * Userspace variant (built when __KERNEL__ is not defined): look up the
+ * "user" key named "bcachefs:<user uuid>" and read it into @key.
+ *
+ * Returns 0 on success, -errno if the key cannot be found, or -1 if the
+ * amount read is not exactly sizeof(*key).
+ */
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+	key_serial_t key_id;
+	char key_description[60];
+	char uuid[40];
+
+	uuid_unparse_lower(sb->user_uuid.b, uuid);
+	sprintf(key_description, "bcachefs:%s", uuid);
+
+	key_id = request_key("user", key_description, NULL,
+			     KEY_SPEC_USER_KEYRING);
+	if (key_id < 0)
+		return -errno;
+
+	if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
+		return -1;
+
+	return 0;
+}
+#endif
+
+/*
+ * Obtain the filesystem key from the superblock crypt field.
+ *
+ * If the stored key is not encrypted it is returned as is.  Otherwise the
+ * user's key is requested via bch2_request_key() and used to decrypt a
+ * local copy of the stored key; if the copy still looks encrypted
+ * afterwards the user key was wrong.
+ *
+ * Returns 0 with *@key filled in, or a negative errno (in which case *@key
+ * is not written).  The local key copies are wiped before returning.
+ */
+int bch2_decrypt_sb_key(struct bch_fs *c,
+			struct bch_sb_field_crypt *crypt,
+			struct bch_key *key)
+{
+	struct bch_encrypted_key sb_key = crypt->key;
+	struct bch_key user_key;
+	int ret = 0;
+
+	/* is key encrypted? */
+	if (!bch2_key_is_encrypted(&sb_key))
+		goto out;
+
+	ret = bch2_request_key(c->disk_sb.sb, &user_key);
+	if (ret) {
+		bch_err(c, "error requesting encryption key: %i", ret);
+		goto err;
+	}
+
+	/* decrypt real key: */
+	ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+			     &sb_key, sizeof(sb_key));
+	if (ret)
+		goto err;
+
+	if (bch2_key_is_encrypted(&sb_key)) {
+		bch_err(c, "incorrect encryption key");
+		ret = -EINVAL;
+		goto err;
+	}
+out:
+	*key = sb_key.key;
+err:
+	memzero_explicit(&sb_key, sizeof(sb_key));
+	memzero_explicit(&user_key, sizeof(user_key));
+	return ret;
+}
+
+/*
+ * Allocate the filesystem's chacha20 and poly1305 transforms if they are
+ * not set yet.  A failed allocation leaves the ERR_PTR in the field, so a
+ * later call reports the same error again instead of retrying.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int bch2_alloc_ciphers(struct bch_fs *c)
+{
+	if (!c->chacha20)
+		c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+	if (IS_ERR(c->chacha20)) {
+		bch_err(c, "error requesting chacha20 module: %li",
+			PTR_ERR(c->chacha20));
+		return PTR_ERR(c->chacha20);
+	}
+
+	if (!c->poly1305)
+		c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+	if (IS_ERR(c->poly1305)) {
+		bch_err(c, "error requesting poly1305 module: %li",
+			PTR_ERR(c->poly1305));
+		return PTR_ERR(c->poly1305);
+	}
+
+	return 0;
+}
+
+/*
+ * Clear the superblock's encryption type and store the key returned by
+ * bch2_decrypt_sb_key() in the crypt field, then write the superblock.
+ * Runs under c->sb_lock.
+ *
+ * Returns -EINVAL if the superblock has no crypt field, 0 without changing
+ * anything if the stored key is encrypted, the error from
+ * bch2_decrypt_sb_key(), or 0 on success.
+ */
+int bch2_disable_encryption(struct bch_fs *c)
+{
+	struct bch_sb_field_crypt *crypt;
+	struct bch_key key;
+	int ret = -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+	if (!crypt)
+		goto out;
+
+	/*
+	 * is key encrypted?
+	 *
+	 * NOTE(review): this bails out when the stored key IS encrypted and
+	 * carries on when it is not - confirm that this is the intended
+	 * polarity.
+	 */
+	ret = 0;
+	if (bch2_key_is_encrypted(&crypt->key))
+		goto out;
+
+	ret = bch2_decrypt_sb_key(c, crypt, &key);
+	if (ret)
+		goto out;
+
+	crypt->key.magic	= BCH_KEY_MAGIC;
+	crypt->key.key		= key;
+
+	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+
+	/* don't leave key material behind on the stack */
+	memzero_explicit(&key, sizeof(key));
+
+	return ret;
+}
+
+/*
+ * Turn encryption on for a filesystem that has no crypt field yet: generate
+ * a random key, load it into the filesystem's chacha20 cipher, store it in
+ * a new superblock crypt field and write the superblock.  Runs under
+ * c->sb_lock.
+ *
+ * With @keyed set, the local copy of the key (magic included) is encrypted
+ * with the user's key from bch2_request_key() before it is used below.
+ *
+ * Returns 0 on success, -EINVAL if a crypt field already exists, -ENOMEM if
+ * the superblock cannot be resized, or the error of the failing step.
+ */
+int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+{
+	struct bch_encrypted_key key;
+	struct bch_key user_key;
+	struct bch_sb_field_crypt *crypt;
+	int ret = -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	/* Do we already have an encryption key? */
+	if (bch2_sb_get_crypt(c->disk_sb.sb))
+		goto err;
+
+	ret = bch2_alloc_ciphers(c);
+	if (ret)
+		goto err;
+
+	key.magic = BCH_KEY_MAGIC;
+	get_random_bytes(&key.key, sizeof(key.key));
+
+	if (keyed) {
+		ret = bch2_request_key(c->disk_sb.sb, &user_key);
+		if (ret) {
+			bch_err(c, "error requesting encryption key: %i", ret);
+			goto err;
+		}
+
+		ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+					      &key, sizeof(key));
+		if (ret)
+			goto err;
+	}
+
+	/*
+	 * NOTE(review): when @keyed is set, key.key has already been
+	 * encrypted in place by the block above, so the cipher is keyed with
+	 * the encrypted bytes here - confirm that this is intended.
+	 */
+	ret = crypto_skcipher_setkey(&c->chacha20->base,
+			(void *) &key.key, sizeof(key.key));
+	if (ret)
+		goto err;
+
+	crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
+	if (!crypt) {
+		ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
+		goto err;
+	}
+
+	crypt->key = key;
+
+	/* write superblock */
+	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
+	bch2_write_super(c);
+err:
+	mutex_unlock(&c->sb_lock);
+	/* wipe both key copies from the stack on every exit path */
+	memzero_explicit(&user_key, sizeof(user_key));
+	memzero_explicit(&key, sizeof(key));
+	return ret;
+}
+
+/*
+ * Free the filesystem's crypto transforms.  Fields that are NULL or still
+ * hold an ERR_PTR from a failed allocation are skipped.
+ */
+void bch2_fs_encryption_exit(struct bch_fs *c)
+{
+	if (!IS_ERR_OR_NULL(c->poly1305))
+		crypto_free_shash(c->poly1305);
+	if (!IS_ERR_OR_NULL(c->chacha20))
+		crypto_free_sync_skcipher(c->chacha20);
+	if (!IS_ERR_OR_NULL(c->sha256))
+		crypto_free_shash(c->sha256);
+}
+
+/*
+ * Set up crypto state for a filesystem: always allocate the sha256
+ * transform; if the superblock has a crypt field, also allocate the
+ * ciphers, obtain the filesystem key and load it into the chacha20 cipher.
+ *
+ * Returns 0 on success (including when there is no crypt field) or a
+ * negative errno.
+ */
+int bch2_fs_encryption_init(struct bch_fs *c)
+{
+	struct bch_sb_field_crypt *crypt;
+	struct bch_key key;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
+
+	c->sha256 = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(c->sha256)) {
+		bch_err(c, "error requesting sha256 module");
+		ret = PTR_ERR(c->sha256);
+		goto out;
+	}
+
+	crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+	if (!crypt)
+		goto out;
+
+	ret = bch2_alloc_ciphers(c);
+	if (ret)
+		goto out;
+
+	ret = bch2_decrypt_sb_key(c, crypt, &key);
+	if (ret)
+		goto out;
+
+	ret = crypto_skcipher_setkey(&c->chacha20->base,
+			(void *) &key.key, sizeof(key.key));
+	if (ret)
+		goto out;
+out:
+	/* wipe the key copy from the stack on every exit path */
+	memzero_explicit(&key, sizeof(key));
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
+}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
new file mode 100644
index 000000000000..b84e81bac8ff
--- /dev/null
+++ b/fs/bcachefs/checksum.h
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHECKSUM_H
+#define _BCACHEFS_CHECKSUM_H
+
+#include "bcachefs.h"
+#include "extents_types.h"
+#include "super-io.h"
+
+#include <linux/crc64.h>
+#include <crypto/chacha.h>
+
+/*
+ * Reports whether @type is one of the checksum types treated as mergeable:
+ * none, crc32c or crc64.  Every other type yields false.
+ */
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+	return type == BCH_CSUM_NONE ||
+	       type == BCH_CSUM_CRC32C ||
+	       type == BCH_CSUM_CRC64;
+}
+
+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
+				    struct bch_csum, size_t);
+
+/*
+ * Type tags for nonces: a little endian 32 bit word with the tag in its top
+ * bits.  extent_nonce() XORs BCH_NONCE_EXTENT into word 3 of the nonce.
+ *
+ * The constants are unsigned so that the shift into bit 31 is well defined.
+ */
+#define BCH_NONCE_EXTENT	cpu_to_le32(1U << 28)
+#define BCH_NONCE_BTREE		cpu_to_le32(2U << 28)
+#define BCH_NONCE_JOURNAL	cpu_to_le32(3U << 28)
+#define BCH_NONCE_PRIO		cpu_to_le32(4U << 28)
+#define BCH_NONCE_POLY		cpu_to_le32(1U << 31)
+
+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
+			     const void *, size_t);
+
+/*
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
+ *
+ * Checksums everything in @_i from just past its csum field up to
+ * vstruct_end(@_i), so the stored checksum itself is not covered.
+ * Evaluates to the struct bch_csum returned by bch2_checksum().
+ *
+ * Note that @_i is evaluated more than once.
+ */
+#define csum_vstruct(_c, _type, _nonce, _i)				\
+({									\
+	const void *start = ((const void *) (_i)) + sizeof((_i)->csum);	\
+	const void *end = vstruct_end(_i);				\
+									\
+	bch2_checksum(_c, _type, _nonce, start, end - start);		\
+})
+
+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+int bch2_request_key(struct bch_sb *, struct bch_key *);
+
+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+		 void *data, size_t);
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
+				  struct nonce, struct bio *);
+
+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
+			struct bch_extent_crc_unpacked,
+			struct bch_extent_crc_unpacked *,
+			struct bch_extent_crc_unpacked *,
+			unsigned, unsigned, unsigned);
+
+void bch2_encrypt_bio(struct bch_fs *, unsigned,
+		    struct nonce, struct bio *);
+
+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
+			struct bch_key *);
+
+int bch2_disable_encryption(struct bch_fs *);
+int bch2_enable_encryption(struct bch_fs *, bool);
+
+void bch2_fs_encryption_exit(struct bch_fs *);
+int bch2_fs_encryption_init(struct bch_fs *);
+
+/*
+ * Translate a checksum option into a bch_csum_type.
+ *
+ * For the CRC options, @data selects the plain variant; when @data is false
+ * the matching _NONZERO variant is returned instead.  Any other option value
+ * hits BUG().
+ */
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+						       bool data)
+{
+	switch (type) {
+	case BCH_CSUM_OPT_NONE:
+		return BCH_CSUM_NONE;
+	case BCH_CSUM_OPT_CRC32C:
+		if (data)
+			return BCH_CSUM_CRC32C;
+		return BCH_CSUM_CRC32C_NONZERO;
+	case BCH_CSUM_OPT_CRC64:
+		if (data)
+			return BCH_CSUM_CRC64;
+		return BCH_CSUM_CRC64_NONZERO;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Pick the checksum type for data.
+ *
+ * Without encryption this is bch2_csum_opt_to_type(@opt, true).  With
+ * encryption enabled @opt is ignored and a chacha20/poly1305 type is used:
+ * the 128 variant when the wide_macs option is set, the 80 variant otherwise.
+ */
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+							 unsigned opt)
+{
+	if (!c->sb.encryption_type)
+		return bch2_csum_opt_to_type(opt, true);
+
+	if (c->opts.wide_macs)
+		return BCH_CSUM_CHACHA20_POLY1305_128;
+
+	return BCH_CSUM_CHACHA20_POLY1305_80;
+}
+
+/*
+ * Pick the checksum type for metadata: always the 128 variant of
+ * chacha20/poly1305 when encryption is enabled, otherwise whatever the
+ * metadata_checksum option maps to (using the non-data variants).
+ */
+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
+{
+	return c->sb.encryption_type
+		? BCH_CSUM_CHACHA20_POLY1305_128
+		: bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
+}
+
+/*
+ * Lookup table, indexed by BCH_COMPRESSION_OPT_*, giving the BCH_COMPRESSION_*
+ * value of the same name.  Entries are generated from BCH_COMPRESSION_TYPES().
+ *
+ * NOTE(review): this is a static array defined in a header, so presumably
+ * every file including it gets its own copy - confirm that is intended.
+ */
+static const unsigned bch2_compression_opt_to_type[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
+	BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+/*
+ * A checksum type is usable when it is below BCH_CSUM_NR and, if it is one of
+ * the encryption types, the filesystem has a chacha20 cipher allocated.
+ */
+static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
+					   unsigned type)
+{
+	if (type >= BCH_CSUM_NR)
+		return false;
+
+	return !bch2_csum_type_is_encryption(type) || c->chacha20;
+}
+
+/*
+ * Compare two checksums; returns true if not equal.
+ *
+ * Both halves are XORed and ORed together before the single comparison
+ * against zero, rather than comparing lo and hi with separate branches.
+ */
+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+	/*
+	 * XXX: need some way of preventing the compiler from optimizing this
+	 * into a form that isn't constant time..
+	 */
+	return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
+}
+
+/*
+ * For skipping ahead and encrypting/decrypting at an offset: returns @nonce
+ * with word 0 advanced by the number of chacha blocks in @offset bytes.
+ * @offset must be a multiple of CHACHA_BLOCK_SIZE.
+ */
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+	unsigned nr_blocks = offset / CHACHA_BLOCK_SIZE;
+
+	EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
+
+	le32_add_cpu(&nonce.d[0], nr_blocks);
+	return nonce;
+}
+
+/* Returns a nonce whose every byte is zero. */
+static inline struct nonce null_nonce(void)
+{
+	struct nonce zeroed;
+
+	memset(&zeroed, 0, sizeof(zeroed));
+
+	return zeroed;
+}
+
+/*
+ * Build the nonce for an extent from its version and crc fields.
+ *
+ * Word 0 holds the uncompressed size shifted left by 22 (zero when the extent
+ * is not compressed).  Words 1 and 2 hold the low 32 bits of version.lo and
+ * of version.lo >> 32.  Word 3 holds version.hi ORed with the compression
+ * type shifted left by 24, XORed with the BCH_NONCE_EXTENT tag.
+ *
+ * The result is then advanced by crc.nonce << 9 bytes via nonce_add().
+ */
+static inline struct nonce extent_nonce(struct bversion version,
+					struct bch_extent_crc_unpacked crc)
+{
+	unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
+	struct nonce nonce = (struct nonce) {{
+		[0] = cpu_to_le32(size << 22),
+		[1] = cpu_to_le32(version.lo),
+		[2] = cpu_to_le32(version.lo >> 32),
+		[3] = cpu_to_le32(version.hi|
+				  (crc.compression_type << 24))^BCH_NONCE_EXTENT,
+	}};
+
+	return nonce_add(nonce, crc.nonce << 9);
+}
+
+/*
+ * A key counts as encrypted when its magic field does not read back as
+ * BCH_KEY_MAGIC.
+ */
+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
+{
+	bool magic_matches = le64_to_cpu(key->magic) == BCH_KEY_MAGIC;
+
+	return !magic_matches;
+}
+
+/*
+ * Nonce derived from a superblock: words 0 and 1 are zero, words 2 and 3 are
+ * the two 32 bit halves of __bch2_sb_magic(@sb), in memory order.
+ */
+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
+{
+	__le64 magic = __bch2_sb_magic(sb);
+	const __le32 *half = (__le32 *) &magic;
+	struct nonce nonce = {{
+		[0] = 0,
+		[1] = 0,
+		[2] = half[0],
+		[3] = half[1],
+	}};
+
+	return nonce;
+}
+
+/*
+ * Nonce derived from a filesystem: words 0 and 1 are zero, words 2 and 3 are
+ * the two 32 bit halves of bch2_sb_magic(@c), in memory order.
+ */
+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
+{
+	__le64 magic = bch2_sb_magic(c);
+	const __le32 *half = (__le32 *) &magic;
+	struct nonce nonce = {{
+		[0] = 0,
+		[1] = 0,
+		[2] = half[0],
+		[3] = half[1],
+	}};
+
+	return nonce;
+}
+
+#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
new file mode 100644
index 000000000000..8ac6990c6971
--- /dev/null
+++ b/fs/bcachefs/clock.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "clock.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+
+/*
+ * Heap comparison callback for io timers: the difference of the two expiry
+ * times, computed on unsigned longs and returned as a long.  @h is unused.
+ */
+static inline long io_timer_cmp(io_timer_heap *h,
+				struct io_timer *l,
+				struct io_timer *r)
+{
+	unsigned long delta = l->expire - r->expire;
+
+	return delta;
+}
+
+/*
+ * Queue @timer on @clock's heap, unless that exact timer is already queued.
+ * The heap is scanned linearly under timer_lock; a full heap is a BUG.
+ */
+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+	bool already_queued = false;
+	size_t idx;
+
+	spin_lock(&clock->timer_lock);
+
+	for (idx = 0; idx < clock->timers.used; idx++) {
+		if (clock->timers.data[idx] == timer) {
+			already_queued = true;
+			break;
+		}
+	}
+
+	if (!already_queued)
+		BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));
+
+	spin_unlock(&clock->timer_lock);
+}
+
+/*
+ * Remove @timer from @clock's heap if it is queued there; does nothing
+ * otherwise.  The heap is scanned linearly under timer_lock.
+ */
+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+	size_t idx;
+
+	spin_lock(&clock->timer_lock);
+
+	for (idx = 0; idx < clock->timers.used; idx++) {
+		if (clock->timers.data[idx] != timer)
+			continue;
+
+		heap_del(&clock->timers, idx, io_timer_cmp, NULL);
+		break;
+	}
+
+	spin_unlock(&clock->timer_lock);
+}
+
+/*
+ * Waiter used by the sleeping helpers in this file; declared on the caller's
+ * stack.  It can be completed by the IO clock (io_timer) or by a regular
+ * kernel timer (cpu_timer).
+ */
+struct io_clock_wait {
+	struct io_timer		io_timer;	/* queued on the io_clock's heap */
+	struct timer_list	cpu_timer;	/* only set up by bch2_kthread_io_clock_wait() */
+	struct task_struct	*task;		/* task woken by the callbacks */
+	int			expired;	/* set to 1 by either callback */
+};
+
+/* IO clock callback: mark the enclosing waiter expired and wake its task. */
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+	struct io_clock_wait *waiter =
+		container_of(timer, struct io_clock_wait, io_timer);
+
+	waiter->expired = 1;
+	wake_up_process(waiter->task);
+}
+
+/* Kernel timer callback: mark the enclosing waiter expired and wake its task. */
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+	struct io_clock_wait *waiter =
+		container_of(timer, struct io_clock_wait, cpu_timer);
+
+	waiter->expired = 1;
+	wake_up_process(waiter->task);
+}
+
+/*
+ * Queue an on-stack io timer expiring at @until on @clock, call schedule()
+ * once, then remove the timer again.
+ *
+ * NOTE(review): the task state is not set here before schedule(), and
+ * wait.expired is not checked afterwards - presumably the caller sets the
+ * task state and re-checks its own condition; confirm against callers.
+ * wait.cpu_timer is left uninitialized and is not used on this path.
+ */
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+	struct io_clock_wait wait;
+
+	/* XXX: calculate sleep time rigorously */
+	wait.io_timer.expire	= until;
+	wait.io_timer.fn	= io_clock_wait_fn;
+	wait.task		= current;
+	wait.expired		= 0;
+	bch2_io_timer_add(clock, &wait.io_timer);
+
+	schedule();
+
+	/* the timer lives on this stack frame: it must be off the heap before returning */
+	bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+/*
+ * Sleep until @clock reaches @io_until, @cpu_timeout jiffies have elapsed, or
+ * (when called from a kthread) kthread_should_stop() becomes true.
+ *
+ * @cpu_timeout == MAX_SCHEDULE_TIMEOUT means the kernel timer is never armed.
+ * The sleep is TASK_INTERRUPTIBLE and calls try_to_freeze() after every
+ * wakeup.
+ */
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+				unsigned long io_until,
+				unsigned long cpu_timeout)
+{
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	struct io_clock_wait wait;
+
+	wait.io_timer.expire	= io_until;
+	wait.io_timer.fn	= io_clock_wait_fn;
+	wait.task		= current;
+	wait.expired		= 0;
+	bch2_io_timer_add(clock, &wait.io_timer);
+
+	timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+	if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+		mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
+
+	while (1) {
+		/* set the state before testing the conditions so a wakeup in between is not lost */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread && kthread_should_stop())
+			break;
+
+		if (wait.expired)
+			break;
+
+		schedule();
+		try_to_freeze();
+	}
+
+	__set_current_state(TASK_RUNNING);
+	/* both timers reference this stack frame: tear them down before returning */
+	del_singleshot_timer_sync(&wait.cpu_timer);
+	destroy_timer_on_stack(&wait.cpu_timer);
+	bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+/*
+ * Pop and return the timer at the top of @clock's heap if it has expired at
+ * time @now; returns NULL when the heap is empty or its top timer is still
+ * pending.
+ */
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+					  unsigned long now)
+{
+	struct io_timer *expired = NULL;
+
+	spin_lock(&clock->timer_lock);
+
+	if (clock->timers.used) {
+		struct io_timer *top = clock->timers.data[0];
+
+		if (time_after_eq(now, top->expire))
+			heap_pop(&clock->timers, expired, io_timer_cmp, NULL);
+	}
+
+	spin_unlock(&clock->timer_lock);
+
+	return expired;
+}
+
+/*
+ * Account @sectors of IO against the io clock for direction @rw and run the
+ * callbacks of any timers that have expired as a result.
+ *
+ * Sectors are first accumulated in a percpu counter; the shared clock is only
+ * advanced, and timers only examined, once that counter reaches
+ * IO_CLOCK_PCPU_SECTORS.
+ *
+ * NOTE(review): the percpu counter is declared as u16 in struct io_clock -
+ * confirm that callers never pass a @sectors value large enough to wrap it.
+ */
+void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
+{
+	struct io_clock *clock = &c->io_clock[rw];
+	struct io_timer *timer;
+	unsigned long now;
+
+	/* Buffer up to IO_CLOCK_PCPU_SECTORS worth of IO in the percpu counter */
+	preempt_disable();
+
+	if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
+		   IO_CLOCK_PCPU_SECTORS)) {
+		preempt_enable();
+		return;
+	}
+
+	/* take everything this cpu has buffered and fold it into the shared clock */
+	sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
+	preempt_enable();
+	now = atomic_long_add_return(sectors, &clock->now);
+
+	/* callbacks run without timer_lock held; get_expired_timer() drops it */
+	while ((timer = get_expired_timer(clock, now)))
+		timer->fn(timer);
+}
+
+/* Free the timer heap and the percpu buffer allocated by bch2_io_clock_init(). */
+void bch2_io_clock_exit(struct io_clock *clock)
+{
+	free_heap(&clock->timers);
+	free_percpu(clock->pcpu_buf);
+}
+
+/*
+ * Initialise @clock: time starts at zero, with room for NR_IO_TIMERS timers.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.  Nothing is
+ * freed here on failure; a percpu buffer that was already allocated is left
+ * in clock->pcpu_buf.
+ */
+int bch2_io_clock_init(struct io_clock *clock)
+{
+	atomic_long_set(&clock->now, 0);
+	spin_lock_init(&clock->timer_lock);
+
+	clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+
+	if (!clock->pcpu_buf ||
+	    !init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
new file mode 100644
index 000000000000..5cb043c579d8
--- /dev/null
+++ b/fs/bcachefs/clock.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_H
+#define _BCACHEFS_CLOCK_H
+
+void bch2_io_timer_add(struct io_clock *, struct io_timer *);
+void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+				unsigned long);
+void bch2_increment_clock(struct bch_fs *, unsigned, int);
+
+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+
+/*
+ * Evaluates to @timeout if @condition already holds per
+ * ___wait_cond_timeout(), otherwise to the result of __wait_event_timeout().
+ *
+ * NOTE(review): the expansion refers to a wait queue named `wq` that is not a
+ * parameter of this macro, so it only compiles where such a variable is in
+ * scope, and the @clock argument is never used.  Nothing in this header or in
+ * clock.c invokes it - confirm whether it is dead code before relying on it.
+ */
+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_event_timeout(wq, condition, timeout);	\
+	__ret;								\
+})
+
+void bch2_io_clock_exit(struct io_clock *);
+int bch2_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
new file mode 100644
index 000000000000..2b5e499e12b4
--- /dev/null
+++ b/fs/bcachefs/clock_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_TYPES_H
+#define _BCACHEFS_CLOCK_TYPES_H
+
+#include "util.h"
+
+#define NR_IO_TIMERS		(BCH_SB_MEMBERS_MAX * 3)
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+/* A timer on an io_clock: @fn is called with the timer once it has expired. */
+struct io_timer {
+	io_timer_fn		fn;		/* callback run on expiry */
+	unsigned long		expire;		/* clock value at which the timer expires */
+};
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS	128
+
+typedef HEAP(struct io_timer *)	io_timer_heap;
+
+/* A clock that advances with sectors of IO rather than with wall time. */
+struct io_clock {
+	atomic_long_t		now;		/* current clock value */
+	u16 __percpu		*pcpu_buf;	/* percpu batching of increments */
+
+	spinlock_t		timer_lock;	/* protects @timers */
+	io_timer_heap		timers;		/* pending timers */
+};
+
+#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
new file mode 100644
index 000000000000..3787390da47f
--- /dev/null
+++ b/fs/bcachefs/compress.c
@@ -0,0 +1,623 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "compress.h"
+#include "extents.h"
+#include "io.h"
+#include "super-io.h"
+
+#include <linux/lz4.h>
+#include <linux/zlib.h>
+#include <linux/zstd.h>
+
+/*
+ * Bounce buffer:
+ *
+ * A linear view of some data plus a record of how it was obtained, so that
+ * bio_unmap_or_unbounce() can release it the right way.
+ */
+struct bbuf {
+	void		*b;		/* start of the linear data */
+	enum {
+		BB_NONE,		/* points straight into a bio page; nothing to free */
+		BB_VMAP,		/* bio pages mapped with vmap() */
+		BB_KMALLOC,		/* kmalloc()ed copy */
+		BB_VMALLOC,		/* vmalloc()ed copy */
+		BB_MEMPOOL,		/* pages from c->compression_bounce[rw] */
+	}		type;
+	int		rw;		/* READ or WRITE; selects the mempool */
+};
+
+/*
+ * Allocate a linear bounce buffer of @size bytes.
+ *
+ * Tries, in order: kmalloc, a GFP_NOWAIT allocation from the
+ * compression_bounce mempool for @rw, vmalloc, and finally a GFP_NOIO
+ * allocation from the same mempool.  The returned bbuf records which of them
+ * succeeded.
+ *
+ * @size must not exceed encoded_extent_max << 9, and the function hits BUG()
+ * rather than returning if every attempt fails.
+ */
+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
+{
+	void *b;
+
+	BUG_ON(size > c->sb.encoded_extent_max << 9);
+
+	b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
+	if (b)
+		return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
+
+	/* the mempool hands out pages; convert to a kernel virtual address */
+	b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
+	b = b ? page_address(b) : NULL;
+	if (b)
+		return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+	b = vmalloc(size);
+	if (b)
+		return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
+
+	b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
+	b = b ? page_address(b) : NULL;
+	if (b)
+		return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+	BUG();
+}
+
+/*
+ * Get a linear view of the part of @bio described by @start.
+ *
+ * Preference order:
+ *  1. Without CONFIG_HIGHMEM, if a single bvec covers the whole range, point
+ *     directly at it (BB_NONE).
+ *  2. If the segments form a run of whole pages - only the first may start at
+ *     a non-zero offset and only the last may stop short of PAGE_SIZE - map
+ *     them contiguously with vmap() (BB_VMAP).
+ *  3. Otherwise, or if the page array or the mapping cannot be allocated,
+ *     fall back to __bounce_alloc(); for READ the bio's data is copied into
+ *     the bounce buffer.
+ *
+ * The range must not exceed encoded_extent_max sectors.
+ */
+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
+				       struct bvec_iter start, int rw)
+{
+	struct bbuf ret;
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	unsigned nr_pages = 0;
+	struct page *stack_pages[16];
+	struct page **pages = NULL;
+	bool first = true;
+	unsigned prev_end = PAGE_SIZE;
+	void *data;
+
+	BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
+
+#ifndef CONFIG_HIGHMEM
+	__bio_for_each_bvec(bv, bio, iter, start) {
+		if (bv.bv_len == start.bi_size)
+			return (struct bbuf) {
+				.b = page_address(bv.bv_page) + bv.bv_offset,
+				.type = BB_NONE, .rw = rw
+			};
+	}
+#endif
+	/*
+	 * NOTE(review): `first` is never cleared in this loop, so the
+	 * bv.bv_offset test never fires; gaps are only caught through
+	 * prev_end on the following iteration - confirm this is intended.
+	 */
+	__bio_for_each_segment(bv, bio, iter, start) {
+		if ((!first && bv.bv_offset) ||
+		    prev_end != PAGE_SIZE)
+			goto bounce;
+
+		prev_end = bv.bv_offset + bv.bv_len;
+		nr_pages++;
+	}
+
+	BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
+
+	/* small mappings use the on-stack array, larger ones a temporary allocation */
+	pages = nr_pages > ARRAY_SIZE(stack_pages)
+		? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
+		: stack_pages;
+	if (!pages)
+		goto bounce;
+
+	nr_pages = 0;
+	__bio_for_each_segment(bv, bio, iter, start)
+		pages[nr_pages++] = bv.bv_page;
+
+	data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (pages != stack_pages)
+		kfree(pages);
+
+	if (data)
+		return (struct bbuf) {
+			.b = data + bio_iter_offset(bio, start),
+			.type = BB_VMAP, .rw = rw
+		};
+bounce:
+	ret = __bounce_alloc(c, start.bi_size, rw);
+
+	if (rw == READ)
+		memcpy_from_bio(ret.b, bio, start);
+
+	return ret;
+}
+
+/* __bio_map_or_bounce() over the range described by @bio's own bi_iter. */
+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
+{
+	return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
+}
+
+/*
+ * Release a bbuf according to how it was obtained.  BB_NONE buffers point
+ * into a bio page and need no cleanup.
+ */
+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
+{
+	unsigned long vmap_base;
+
+	switch (buf.type) {
+	case BB_KMALLOC:
+		kfree(buf.b);
+		break;
+	case BB_VMALLOC:
+		vfree(buf.b);
+		break;
+	case BB_MEMPOOL:
+		mempool_free(virt_to_page(buf.b),
+			     &c->compression_bounce[buf.rw]);
+		break;
+	case BB_VMAP:
+		/* buf.b may point part way into the first page of the mapping */
+		vmap_base = (unsigned long) buf.b & PAGE_MASK;
+		vunmap((void *) vmap_base);
+		break;
+	case BB_NONE:
+		break;
+	}
+}
+
+/*
+ * Point @strm at @workspace.  The assignment is only compiled when __KERNEL__
+ * is defined; otherwise this function does nothing.
+ */
+static inline void zlib_set_workspace(z_stream *strm, void *workspace)
+{
+#ifdef __KERNEL__
+	strm->workspace = workspace;
+#endif
+}
+
+/*
+ * Decompress the data in @src into the linear buffer @dst_data, which must
+ * have room for crc.uncompressed_size << 9 bytes.
+ *
+ * The algorithm is selected by crc.compression_type; an unknown type is a
+ * BUG.  ZSTD data starts with a 4 byte little endian length giving the size
+ * of the compressed payload that follows.
+ *
+ * Returns 0 if exactly the expected number of bytes was produced, -EIO
+ * otherwise.
+ */
+static int __bio_uncompress(struct bch_fs *c, struct bio *src,
+			    void *dst_data, struct bch_extent_crc_unpacked crc)
+{
+	struct bbuf src_data = { NULL };
+	size_t src_len = src->bi_iter.bi_size;
+	size_t dst_len = crc.uncompressed_size << 9;
+	void *workspace;
+	int ret;
+
+	src_data = bio_map_or_bounce(c, src, READ);
+
+	switch (crc.compression_type) {
+	case BCH_COMPRESSION_LZ4_OLD:
+	case BCH_COMPRESSION_LZ4:
+		ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+						  src_len, dst_len, dst_len);
+		if (ret != dst_len)
+			goto err;
+		break;
+	case BCH_COMPRESSION_GZIP: {
+		z_stream strm = {
+			.next_in	= src_data.b,
+			.avail_in	= src_len,
+			.next_out	= dst_data,
+			.avail_out	= dst_len,
+		};
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+
+		zlib_set_workspace(&strm, workspace);
+		zlib_inflateInit2(&strm, -MAX_WBITS);
+		ret = zlib_inflate(&strm, Z_FINISH);
+
+		mempool_free(workspace, &c->decompress_workspace);
+
+		if (ret != Z_STREAM_END)
+			goto err;
+		break;
+	}
+	case BCH_COMPRESSION_ZSTD: {
+		ZSTD_DCtx *ctx;
+		size_t real_src_len;
+		size_t len;
+
+		/*
+		 * The length prefix comes from disk and cannot be trusted: it
+		 * must describe a payload that fits in what was actually
+		 * read, or the decompressor would run off the source buffer.
+		 */
+		if (src_len < 4)
+			goto err;
+
+		real_src_len = le32_to_cpup(src_data.b);
+		if (real_src_len > src_len - 4)
+			goto err;
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+		ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+
+		len = ZSTD_decompressDCtx(ctx,
+				dst_data,	dst_len,
+				src_data.b + 4, real_src_len);
+
+		mempool_free(workspace, &c->decompress_workspace);
+
+		if (len != dst_len)
+			goto err;
+		break;
+	}
+	default:
+		BUG();
+	}
+	ret = 0;
+out:
+	bio_unmap_or_unbounce(c, src_data);
+	return ret;
+err:
+	ret = -EIO;
+	goto out;
+}
+
+/*
+ * Replace the compressed contents of @bio with the live part of the
+ * decompressed data, and rewrite *@crc to describe an uncompressed,
+ * unchecksummed extent of live_size sectors.
+ *
+ * Returns 0 on success, or -EIO if the extent is larger than
+ * encoded_extent_max or fails to decompress; on failure @bio and *@crc are
+ * left unmodified.
+ */
+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
+				struct bch_extent_crc_unpacked *crc)
+{
+	struct bbuf data = { NULL };
+	size_t dst_len = crc->uncompressed_size << 9;
+
+	/* bio must own its pages: */
+	BUG_ON(!bio->bi_vcnt);
+	BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
+
+	if (crc->uncompressed_size	> c->sb.encoded_extent_max ||
+	    crc->compressed_size	> c->sb.encoded_extent_max) {
+		bch_err(c, "error rewriting existing data: extent too big");
+		return -EIO;
+	}
+
+	/* decompress into a scratch buffer first; the bio is both source and destination */
+	data = __bounce_alloc(c, dst_len, WRITE);
+
+	if (__bio_uncompress(c, bio, data.b, *crc)) {
+		bch_err(c, "error rewriting existing data: decompression error");
+		bio_unmap_or_unbounce(c, data);
+		return -EIO;
+	}
+
+	/*
+	 * XXX: don't have a good way to assert that the bio was allocated with
+	 * enough space, we depend on bch2_move_extent doing the right thing
+	 */
+	bio->bi_iter.bi_size = crc->live_size << 9;
+
+	/* only the live range, starting crc->offset sectors in, is copied back */
+	memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
+
+	crc->csum_type		= 0;
+	crc->compression_type	= 0;
+	crc->compressed_size	= crc->live_size;
+	crc->uncompressed_size	= crc->live_size;
+	crc->offset		= 0;
+	crc->csum		= (struct bch_csum) { 0, 0 };
+
+	bio_unmap_or_unbounce(c, data);
+	return 0;
+}
+
+/*
+ * Decompress @src and place the result in @dst at @dst_iter.
+ *
+ * When @dst_iter covers exactly the uncompressed size, the destination pages
+ * are used as the output buffer if they can be addressed linearly; otherwise
+ * the data is decompressed into a bounce buffer and the part starting
+ * crc.offset sectors in is copied into @dst.
+ *
+ * Returns 0 on success, -EIO if the extent is larger than encoded_extent_max
+ * or fails to decompress.
+ */
+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
+		       struct bio *dst, struct bvec_iter dst_iter,
+		       struct bch_extent_crc_unpacked crc)
+{
+	struct bbuf dst_data = { NULL };
+	size_t dst_len = crc.uncompressed_size << 9;
+	int ret;
+
+	if (crc.uncompressed_size	> c->sb.encoded_extent_max ||
+	    crc.compressed_size		> c->sb.encoded_extent_max)
+		return -EIO;
+
+	dst_data = dst_len == dst_iter.bi_size
+		? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
+		: __bounce_alloc(c, dst_len, WRITE);
+
+	ret = __bio_uncompress(c, src, dst_data.b, crc);
+	if (ret)
+		goto err;
+
+	/*
+	 * BB_NONE and BB_VMAP buffers alias @dst's own pages, so the output
+	 * is already in place; only real bounce buffers need copying back.
+	 */
+	if (dst_data.type != BB_NONE &&
+	    dst_data.type != BB_VMAP)
+		memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
+err:
+	bio_unmap_or_unbounce(c, dst_data);
+	return ret;
+}
+
+/*
+ * Make one attempt at compressing @src_len bytes from @src into at most
+ * @dst_len bytes at @dst, using @workspace as scratch memory.
+ *
+ * Returns:
+ *   > 0  number of bytes written to @dst
+ *   0    the attempt failed (GZIP, ZSTD) or produced no output
+ *   < 0  LZ4 only: not all of the input was consumed; the negated value is
+ *        the number of source bytes that were
+ *
+ * ZSTD output is prefixed with a 4 byte little endian length, which is
+ * included in the returned size.  An unknown @compression_type is a BUG.
+ */
+static int attempt_compress(struct bch_fs *c,
+			    void *workspace,
+			    void *dst, size_t dst_len,
+			    void *src, size_t src_len,
+			    unsigned compression_type)
+{
+	switch (compression_type) {
+	case BCH_COMPRESSION_LZ4: {
+		int len = src_len;
+		int ret = LZ4_compress_destSize(
+				src,		dst,
+				&len,		dst_len,
+				workspace);
+
+		/* LZ4 updated len to the amount of input it consumed */
+		if (len < src_len)
+			return -len;
+
+		return ret;
+	}
+	case BCH_COMPRESSION_GZIP: {
+		z_stream strm = {
+			.next_in	= src,
+			.avail_in	= src_len,
+			.next_out	= dst,
+			.avail_out	= dst_len,
+		};
+
+		zlib_set_workspace(&strm, workspace);
+		zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+				  Z_DEFAULT_STRATEGY);
+
+		if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+			return 0;
+
+		if (zlib_deflateEnd(&strm) != Z_OK)
+			return 0;
+
+		return strm.total_out;
+	}
+	case BCH_COMPRESSION_ZSTD: {
+		ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
+			ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+
+		/* the first 4 bytes of dst are reserved for the length prefix */
+		size_t len = ZSTD_compressCCtx(ctx,
+				dst + 4,	dst_len - 4,
+				src,		src_len,
+				c->zstd_params);
+		if (ZSTD_isError(len))
+			return 0;
+
+		*((__le32 *) dst) = cpu_to_le32(len);
+		return len + 4;
+	}
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Compress as much of @src as will fit in @dst.
+ *
+ * On success *@src_len is the number of input bytes consumed and *@dst_len
+ * the number of output bytes produced; both are multiples of the block size,
+ * the output being zero padded up to a block boundary.
+ *
+ * Returns @compression_type if the data was compressed, or 0 if it was not:
+ * the input is a single block or less, compression failed, or the result
+ * would not be smaller than the input.
+ */
+static unsigned __bio_compress(struct bch_fs *c,
+			       struct bio *dst, size_t *dst_len,
+			       struct bio *src, size_t *src_len,
+			       unsigned compression_type)
+{
+	struct bbuf src_data = { NULL }, dst_data = { NULL };
+	void *workspace;
+	unsigned pad;
+	int ret = 0;
+
+	BUG_ON(compression_type >= BCH_COMPRESSION_NR);
+	BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
+	/* If it's only one block, don't bother trying to compress: */
+	if (bio_sectors(src) <= c->opts.block_size)
+		return 0;
+
+	dst_data = bio_map_or_bounce(c, dst, WRITE);
+	src_data = bio_map_or_bounce(c, src, READ);
+
+	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
+
+	*src_len = src->bi_iter.bi_size;
+	*dst_len = dst->bi_iter.bi_size;
+
+	/*
+	 * XXX: this algorithm sucks when the compression code doesn't tell us
+	 * how much would fit, like LZ4 does:
+	 */
+	while (1) {
+		if (*src_len <= block_bytes(c)) {
+			ret = -1;
+			break;
+		}
+
+		ret = attempt_compress(c, workspace,
+				       dst_data.b,	*dst_len,
+				       src_data.b,	*src_len,
+				       compression_type);
+		if (ret > 0) {
+			*dst_len = ret;
+			ret = 0;
+			break;
+		}
+
+		/* Didn't fit: should we retry with a smaller amount?  */
+		if (*src_len <= *dst_len) {
+			ret = -1;
+			break;
+		}
+
+		/*
+		 * If ret is negative, it's a hint as to how much data would fit
+		 */
+		BUG_ON(-ret >= *src_len);
+
+		/* without a hint, shrink the input halfway towards the output size */
+		if (ret < 0)
+			*src_len = -ret;
+		else
+			*src_len -= (*src_len - *dst_len) / 2;
+		*src_len = round_down(*src_len, block_bytes(c));
+	}
+
+	mempool_free(workspace, &c->compress_workspace[compression_type]);
+
+	if (ret)
+		goto err;
+
+	/* Didn't get smaller: */
+	if (round_up(*dst_len, block_bytes(c)) >= *src_len)
+		goto err;
+
+	pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+	memset(dst_data.b + *dst_len, 0, pad);
+	*dst_len += pad;
+
+	/*
+	 * BB_NONE and BB_VMAP buffers alias @dst's own pages, so the output
+	 * is already in place; only real bounce buffers need copying back.
+	 */
+	if (dst_data.type != BB_NONE &&
+	    dst_data.type != BB_VMAP)
+		memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
+
+	BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+	BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+	BUG_ON(*dst_len & (block_bytes(c) - 1));
+	BUG_ON(*src_len & (block_bytes(c) - 1));
+out:
+	bio_unmap_or_unbounce(c, src_data);
+	bio_unmap_or_unbounce(c, dst_data);
+	return compression_type;
+err:
+	compression_type = 0;
+	goto out;
+}
+
+/*
+ * Compress data from @src into @dst with @compression_type.
+ *
+ * The sizes of both bios are clamped for the duration of the call and
+ * restored before returning.  BCH_COMPRESSION_LZ4_OLD is treated as
+ * BCH_COMPRESSION_LZ4.
+ *
+ * Returns the compression type actually used, or 0 if the data was left
+ * uncompressed; *@dst_len and *@src_len are set by __bio_compress().
+ */
+unsigned bch2_bio_compress(struct bch_fs *c,
+			   struct bio *dst, size_t *dst_len,
+			   struct bio *src, size_t *src_len,
+			   unsigned compression_type)
+{
+	unsigned saved_dst = dst->bi_iter.bi_size;
+	unsigned saved_src = src->bi_iter.bi_size;
+	unsigned type = compression_type == BCH_COMPRESSION_LZ4_OLD
+		? BCH_COMPRESSION_LZ4
+		: compression_type;
+	unsigned used;
+
+	/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
+	src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
+				     c->sb.encoded_extent_max << 9);
+	/* Don't generate a bigger output than input: */
+	dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+	used = __bio_compress(c, dst, dst_len, src, src_len, type);
+
+	dst->bi_iter.bi_size = saved_dst;
+	src->bi_iter.bi_size = saved_src;
+	return used;
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
+
+/*
+ * BCH_FEATURE_NONE exists only while the table below is being generated, so
+ * that the NONE entry of BCH_COMPRESSION_TYPES() expands to a valid name.
+ */
+#define BCH_FEATURE_NONE	0
+
+/*
+ * Lookup table, indexed by BCH_COMPRESSION_OPT_*, giving the BCH_FEATURE_*
+ * bit number of the same name.
+ */
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+	BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+#undef BCH_FEATURE_NONE
+
+/*
+ * Make sure every feature bit in @f is set in the superblock.
+ *
+ * If any is missing, the compression state for the new feature set is
+ * initialised first, then the bits are set and the superblock written, all
+ * under sb_lock.
+ *
+ * Returns 0 on success or the error from __bch2_fs_compress_init(), in which
+ * case the superblock is not modified.
+ */
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
+{
+	int ret = 0;
+
+	/* Unlocked fast path: nothing to do if the bits are already set */
+	if ((c->sb.features & f) == f)
+		return 0;
+
+	mutex_lock(&c->sb_lock);
+
+	/* Recheck under the lock before changing anything */
+	if ((c->sb.features & f) != f) {
+		ret = __bch2_fs_compress_init(c, c->sb.features|f);
+		if (!ret) {
+			c->disk_sb.sb->features[0] |= cpu_to_le64(f);
+			bch2_write_super(c);
+		}
+	}
+
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/*
+ * Record in the superblock that data of @compression_type may exist.
+ * A @compression_type of 0 needs no feature bit and returns 0 immediately;
+ * a value outside the option-to-feature table is a BUG.
+ */
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
+				       unsigned compression_type)
+{
+	BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+	if (!compression_type)
+		return 0;
+
+	return __bch2_check_set_has_compressed_data(c,
+			1ULL << bch2_compression_opt_to_feature[compression_type]);
+}
+
+/* Tear down every mempool set up by the compression init code. */
+void bch2_fs_compress_exit(struct bch_fs *c)
+{
+	unsigned type;
+
+	mempool_exit(&c->decompress_workspace);
+
+	for (type = 0; type < ARRAY_SIZE(c->compress_workspace); type++)
+		mempool_exit(&c->compress_workspace[type]);
+
+	mempool_exit(&c->compression_bounce[WRITE]);
+	mempool_exit(&c->compression_bounce[READ]);
+}
+
+/*
+ * Set up the mempools needed to handle the compression types whose feature
+ * bits are set in @features.
+ *
+ * Nothing is allocated unless at least one compression feature is set.
+ * Otherwise both bounce pools, a compression workspace for each enabled type
+ * and one shared decompression workspace (sized for the largest requirement
+ * of any type) are created.  Pools that are already initialised are left
+ * alone, so this may be called again with a larger feature set.
+ *
+ * Returns 0 on success or the error from the failing mempool initialisation.
+ */
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+{
+	size_t max_extent = c->sb.encoded_extent_max << 9;
+	size_t order = get_order(max_extent);
+	size_t decompress_workspace_size = 0;
+	ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+	struct {
+		unsigned	feature;
+		unsigned	type;
+		size_t		compress_workspace;
+		size_t		decompress_workspace;
+	} compression_types[] = {
+		{ BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
+		{ BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
+			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+			zlib_inflate_workspacesize(), },
+		{ BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
+			ZSTD_CCtxWorkspaceBound(params.cParams),
+			ZSTD_DCtxWorkspaceBound() },
+	}, *i;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
+
+	c->zstd_params = params;
+
+	/* @features is 64 bits wide, so the bit tests use a 64 bit constant */
+	for (i = compression_types;
+	     i < compression_types + ARRAY_SIZE(compression_types);
+	     i++)
+		if (features & (1ULL << i->feature))
+			goto have_compressed;
+
+	goto out;
+have_compressed:
+
+	if (!mempool_initialized(&c->compression_bounce[READ])) {
+		ret = mempool_init_page_pool(&c->compression_bounce[READ],
+					     1, order);
+		if (ret)
+			goto out;
+	}
+
+	if (!mempool_initialized(&c->compression_bounce[WRITE])) {
+		ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
+					     1, order);
+		if (ret)
+			goto out;
+	}
+
+	for (i = compression_types;
+	     i < compression_types + ARRAY_SIZE(compression_types);
+	     i++) {
+		/* sized over every type, enabled or not */
+		decompress_workspace_size =
+			max(decompress_workspace_size, i->decompress_workspace);
+
+		if (!(features & (1ULL << i->feature)))
+			continue;
+
+		if (mempool_initialized(&c->compress_workspace[i->type]))
+			continue;
+
+		ret = mempool_init_kvpmalloc_pool(
+				&c->compress_workspace[i->type],
+				1, i->compress_workspace);
+		if (ret)
+			goto out;
+	}
+
+	if (!mempool_initialized(&c->decompress_workspace)) {
+		ret = mempool_init_kmalloc_pool(
+				&c->decompress_workspace,
+				1, decompress_workspace_size);
+		if (ret)
+			goto out;
+	}
+out:
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
+}
+
+/*
+ * Initialise compression state for the features already recorded in the
+ * superblock plus those implied by the compression and
+ * background_compression options.
+ */
+int bch2_fs_compress_init(struct bch_fs *c)
+{
+	u64 wanted = c->sb.features;
+
+	if (c->opts.compression)
+		wanted |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];
+	if (c->opts.background_compression)
+		wanted |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];
+
+	return __bch2_fs_compress_init(c, wanted);
+}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
new file mode 100644
index 000000000000..4bab1f61b3b5
--- /dev/null
+++ b/fs/bcachefs/compress.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COMPRESS_H
+#define _BCACHEFS_COMPRESS_H
+
+#include "extents_types.h"
+
+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
+				struct bch_extent_crc_unpacked *);
+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
+		       struct bvec_iter, struct bch_extent_crc_unpacked);
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+			   struct bio *, size_t *, unsigned);
+
+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
+void bch2_fs_compress_exit(struct bch_fs *);
+int bch2_fs_compress_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
new file mode 100644
index 000000000000..69b123bad83b
--- /dev/null
+++ b/fs/bcachefs/debug.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Assorted bcachefs debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io.h"
+#include "super.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bch_debug;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+	struct btree *v = c->verify_data;
+	struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
+	struct bset *sorted, *inmemory;
+	struct extent_ptr_decoded pick;
+	struct bch_dev *ca;
+	struct bio *bio;
+
+	if (c->opts.nochanges)
+		return;
+
+	btree_node_io_lock(b);
+	mutex_lock(&c->verify_lock);
+
+	n_ondisk = c->verify_ondisk;
+	n_sorted = c->verify_data->data;
+	n_inmemory = b->data;
+
+	bkey_copy(&v->key, &b->key);
+	v->written	= 0;
+	v->level	= b->level;
+	v->btree_id	= b->btree_id;
+	bch2_btree_keys_init(v, &c->expensive_debug_checks);
+
+	/* Both locks are held from here on: failures must leave via out */
+	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+				       NULL, &pick) <= 0)
+		goto out;
+
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+	if (!bch2_dev_get_ioref(ca, READ))
+		goto out;
+
+	bio = bio_alloc_bioset(GFP_NOIO, buf_pages(n_sorted, btree_bytes(c)),
+			&c->btree_bio);
+	bio_set_dev(bio, ca->disk_sb.bdev);
+	bio->bi_opf		= REQ_OP_READ|REQ_META;
+	bio->bi_iter.bi_sector	= pick.ptr.offset;
+	bch2_bio_map(bio, n_sorted, btree_bytes(c));
+
+	submit_bio_wait(bio);
+
+	bio_put(bio);
+	percpu_ref_put(&ca->io_ref);
+
+	/* Keep the raw read in n_ondisk; it is dumped below on a mismatch */
+	memcpy(n_ondisk, n_sorted, btree_bytes(c));
+
+	if (bch2_btree_node_read_done(c, v, false))
+		goto out;
+
+	n_sorted = c->verify_data->data;
+	sorted = &n_sorted->keys;
+	inmemory = &n_inmemory->keys;
+
+	if (inmemory->u64s != sorted->u64s ||
+	    memcmp(inmemory->start, sorted->start,
+		   vstruct_end(inmemory) - (void *) inmemory->start)) {
+		unsigned offset = 0, sectors;
+		struct bset *i;
+		unsigned j;
+
+		console_lock();
+
+		printk(KERN_ERR "*** in memory:\n");
+		bch2_dump_bset(b, inmemory, 0);
+
+		printk(KERN_ERR "*** read back in:\n");
+		bch2_dump_bset(v, sorted, 0);
+
+		while (offset < b->written) {
+			if (!offset) {
+				i = &n_ondisk->keys;
+				sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
+					c->block_bits;
+			} else {
+				struct btree_node_entry *bne =
+					(void *) n_ondisk + (offset << 9);
+				i = &bne->keys;
+
+				sectors = vstruct_blocks(bne, c->block_bits) <<
+					c->block_bits;
+			}
+
+			printk(KERN_ERR "*** on disk block %u:\n", offset);
+			bch2_dump_bset(b, i, offset);
+
+			offset += sectors;
+		}
+
+		printk(KERN_ERR "*** block %u/%u not written\n",
+		       offset >> c->block_bits, btree_blocks(c));
+
+		for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
+			if (inmemory->_data[j] != sorted->_data[j])
+				break;
+
+		printk(KERN_ERR "b->written %u\n", b->written);
+
+		console_unlock();
+		panic("verify failed at %u\n", j);
+	}
+out:
+	mutex_unlock(&c->verify_lock);
+	btree_node_io_unlock(b);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: bch_fs refcounting */
+
+struct dump_iter {
+	struct bpos		from;
+	struct bch_fs	*c;
+	enum btree_id		id;
+
+	char			buf[PAGE_SIZE];
+	size_t			bytes;	/* what's currently in buf */
+
+	char __user		*ubuf;	/* destination user buffer */
+	size_t			size;	/* size of requested read */
+	ssize_t			ret;	/* bytes read so far */
+};
+
+static int flush_buf(struct dump_iter *i)
+{
+	if (i->bytes) {
+		size_t bytes = min(i->bytes, i->size);
+
+		/* copy_to_user() returns the bytes NOT copied, not an errno */
+		if (copy_to_user(i->ubuf, i->buf, bytes))
+			return -EFAULT;
+
+		i->ret	 += bytes;
+		i->ubuf	 += bytes;
+		i->size	 -= bytes;
+		i->bytes -= bytes;
+		memmove(i->buf, i->buf + bytes, i->bytes);
+	}
+
+	return 0;
+}
+
+static int bch2_dump_open(struct inode *inode, struct file *file)
+{
+	struct btree_debug *dbg = inode->i_private;
+	struct dump_iter *iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+
+	if (!iter)
+		return -ENOMEM;
+
+	/* dbg is element dbg->id of its filesystem's btree_debug[] array */
+	iter->c		= container_of(dbg, struct bch_fs, btree_debug[dbg->id]);
+	iter->id	= dbg->id;
+	iter->from	= POS_MIN;
+
+	file->private_data = iter;
+	return 0;
+}
+
+static int bch2_dump_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t bch2_read_btree(struct file *file, char __user *buf,
+			       size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int err;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	err = flush_buf(i);
+	if (err)
+		return err;
+
+	if (!i->size)
+		return i->ret;
+
+	bch2_trans_init(&trans, i->c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+	k = bch2_btree_iter_peek(iter);
+
+	while (k.k && !(err = bkey_err(k))) {
+		bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
+		i->bytes = strlen(i->buf);
+		BUG_ON(i->bytes >= PAGE_SIZE);
+		i->buf[i->bytes] = '\n';
+		i->bytes++;
+
+		k = bch2_btree_iter_next(iter);
+		i->from = iter->pos;
+
+		err = flush_buf(i);
+		if (err)
+			break;
+
+		if (!i->size)
+			break;
+	}
+	bch2_trans_exit(&trans);
+
+	return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_read_btree,
+};
+
+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
+				       size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct btree *b;
+	int err;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	err = flush_buf(i);
+	if (err)
+		return err;
+
+	if (!i->size || !bkey_cmp(POS_MAX, i->from))
+		return i->ret;
+
+	bch2_trans_init(&trans, i->c, 0, 0);
+
+	for_each_btree_node(&trans, iter, i->id, i->from, 0, b) {
+		bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
+		i->bytes = strlen(i->buf);
+		err = flush_buf(i);
+		if (err)
+			break;
+
+		/*
+		 * can't easily correctly restart a btree node traversal across
+		 * all nodes, meh
+		 */
+		i->from = bkey_cmp(POS_MAX, b->key.k.p)
+			? bkey_successor(b->key.k.p)
+			: b->key.k.p;
+
+		if (!i->size)
+			break;
+	}
+	bch2_trans_exit(&trans);
+
+	return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations btree_format_debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_read_btree_formats,
+};
+
+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
+				       size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct btree *prev_node = NULL;
+	int err;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	err = flush_buf(i);
+	if (err)
+		return err;
+
+	if (!i->size)
+		return i->ret;
+
+	bch2_trans_init(&trans, i->c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(err = bkey_err(k))) {
+		struct btree_iter_level *l = &iter->l[0];
+		struct bkey_packed *_k =
+			bch2_btree_node_iter_peek(&l->iter, l->b);
+
+		if (l->b != prev_node) {
+			bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b);
+			i->bytes = strlen(i->buf);
+			err = flush_buf(i);
+			if (err)
+				break;
+		}
+		prev_node = l->b;
+
+		bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k);
+		i->bytes = strlen(i->buf);
+		err = flush_buf(i);
+		if (err)
+			break;
+
+		bch2_btree_iter_next(iter);
+		i->from = iter->pos;
+
+		err = flush_buf(i);
+		if (err)
+			break;
+
+		if (!i->size)
+			break;
+	}
+	bch2_trans_exit(&trans);
+
+	return err < 0 ? err : i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_read_bfloat_failed,
+};
+
+void bch2_fs_debug_exit(struct bch_fs *c)
+{
+	if (!IS_ERR_OR_NULL(c->debug))
+		debugfs_remove_recursive(c->debug);
+}
+
+void bch2_fs_debug_init(struct bch_fs *c)
+{
+	struct btree_debug *bd;
+	char name[100];
+
+	if (IS_ERR_OR_NULL(bch_debug))
+		return;
+
+	snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+	c->debug = debugfs_create_dir(name, bch_debug);
+	if (IS_ERR_OR_NULL(c->debug))
+		return;
+
+	for (bd = c->btree_debug;
+	     bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
+	     bd++) {
+		bd->id = bd - c->btree_debug;
+		bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
+						0400, c->debug, bd,
+						&btree_debug_ops);
+
+		snprintf(name, sizeof(name), "%s-formats",
+			 bch2_btree_ids[bd->id]);
+
+		bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
+						       &btree_format_debug_ops);
+
+		snprintf(name, sizeof(name), "%s-bfloat-failed",
+			 bch2_btree_ids[bd->id]);
+
+		bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
+						 &bfloat_failed_debug_ops);
+	}
+}
+
+#endif
+
+void bch2_debug_exit(void)
+{
+	if (!IS_ERR_OR_NULL(bch_debug))
+		debugfs_remove_recursive(bch_debug);
+}
+
+int __init bch2_debug_init(void)
+{
+	/* Not checked here: users of bch_debug test IS_ERR_OR_NULL() first */
+	bch_debug = debugfs_create_dir("bcachefs", NULL);
+
+	return 0;
+}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
new file mode 100644
index 000000000000..56c2d1ab5f63
--- /dev/null
+++ b/fs/bcachefs/debug.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DEBUG_H
+#define _BCACHEFS_DEBUG_H
+
+#include "bcachefs.h"
+
+struct bio;
+struct btree;
+struct bch_fs;
+
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#define BCH_DEBUG_PARAM(name, description)				\
+	static inline bool name(struct bch_fs *c)			\
+	{ return bch2_##name || c->name;	}
+BCH_DEBUG_PARAMS_ALWAYS()
+#undef BCH_DEBUG_PARAM
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+#define BCH_DEBUG_PARAM(name, description)				\
+	static inline bool name(struct bch_fs *c)			\
+	{ return bch2_##name || c->name;	}
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+void __bch2_btree_verify(struct bch_fs *, struct btree *);
+
+#define bypass_torture_test(d)		((d)->bypass_torture_test)
+
+#else /* DEBUG */
+
+#define BCH_DEBUG_PARAM(name, description)				\
+	static inline bool name(struct bch_fs *c) { return false; }
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+
+static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
+
+#define bypass_torture_test(d)		0
+
+#endif
+
+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+	if (verify_btree_ondisk(c))
+		__bch2_btree_verify(c, b);
+}
+
+#ifdef CONFIG_DEBUG_FS
+void bch2_fs_debug_exit(struct bch_fs *);
+void bch2_fs_debug_init(struct bch_fs *);
+#else
+static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
+static inline void bch2_fs_debug_init(struct bch_fs *c) {}
+#endif
+
+void bch2_debug_exit(void);
+int bch2_debug_init(void);
+
+#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
new file mode 100644
index 000000000000..38017699c04a
--- /dev/null
+++ b/fs/bcachefs/dirent.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "dirent.h"
+#include "fs.h"
+#include "keylist.h"
+#include "str_hash.h"
+
+#include <linux/dcache.h>
+
+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+	/* Bytes of the value that lie at or after d_name */
+	unsigned room = bkey_val_bytes(d.k) - offsetof(struct bch_dirent, d_name);
+
+	return strnlen(d.v->d_name, room);
+}
+
+static u64 bch2_dirent_hash(const struct bch_hash_info *info,
+			    const struct qstr *name)
+{
+	struct bch_str_hash_ctx ctx;
+
+	bch2_str_hash_init(&ctx, info);
+	bch2_str_hash_update(&ctx, info, name->name, name->len);
+
+	/* [0,2) reserved for dots */
+	return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
+}
+
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
+{
+	return bch2_dirent_hash(info, key);
+}
+
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+	struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+
+	return bch2_dirent_hash(info, &name);
+}
+
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+	int len = bch2_dirent_name_bytes(l);
+	const struct qstr *r = _r;
+
+	return len - r->len ?: memcmp(l.v->d_name, r->name, len);
+}
+
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+	struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
+	int l_len = bch2_dirent_name_bytes(l);
+	int r_len = bch2_dirent_name_bytes(r);
+
+	return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
+}
+
+const struct bch_hash_desc bch2_dirent_hash_desc = {
+	.btree_id	= BTREE_ID_DIRENTS,
+	.key_type	= KEY_TYPE_dirent,
+	.hash_key	= dirent_hash_key,
+	.hash_bkey	= dirent_hash_bkey,
+	.cmp_key	= dirent_cmp_key,
+	.cmp_bkey	= dirent_cmp_bkey,
+};
+
+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+	unsigned len;
+
+	if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
+		return "value too small";
+
+	len = bch2_dirent_name_bytes(d);
+	if (!len)
+		return "empty name";
+
+	/*
+	 * older versions of bcachefs were buggy and creating dirent
+	 * keys that were bigger than necessary:
+	 */
+	if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
+		return "value too big";
+
+	if (len > BCH_NAME_MAX)
+		return "dirent name too big";
+
+	return NULL;
+}
+
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
+			 struct bkey_s_c k)
+{
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+	bch_scnmemcpy(out, d.v->d_name,
+		      bch2_dirent_name_bytes(d));
+	pr_buf(out, " -> %llu", d.v->d_inum);
+}
+
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+				u8 type, const struct qstr *name, u64 dst)
+{
+	struct bkey_i_dirent *dirent;
+	unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
+
+	if (name->len > BCH_NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	BUG_ON(u64s > U8_MAX);
+
+	dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+	if (IS_ERR(dirent))
+		return dirent;
+
+	bkey_dirent_init(&dirent->k_i);
+	dirent->k.u64s = u64s;
+	dirent->v.d_inum = cpu_to_le64(dst);
+	dirent->v.d_type = type;
+
+	memcpy(dirent->v.d_name, name->name, name->len);
+	memset(dirent->v.d_name + name->len, 0,
+	       bkey_val_bytes(&dirent->k) -
+	       offsetof(struct bch_dirent, d_name) -
+	       name->len);
+
+	EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+
+	return dirent;
+}
+
+int bch2_dirent_create(struct btree_trans *trans,
+		       u64 dir_inum, const struct bch_hash_info *hash_info,
+		       u8 type, const struct qstr *name, u64 dst_inum,
+		       int flags)
+{
+	struct bkey_i_dirent *new_dirent =
+		dirent_create_key(trans, type, name, dst_inum);
+
+	/* Key construction can fail, e.g. -ENAMETOOLONG for oversized names */
+	if (IS_ERR(new_dirent))
+		return PTR_ERR(new_dirent);
+
+	/* Pass the freshly built key on to bch2_hash_set() */
+	return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+			     dir_inum, &new_dirent->k_i, flags);
+}
+
+static void dirent_copy_target(struct bkey_i_dirent *dst,
+			       struct bkey_s_c_dirent src)
+{
+	dst->v.d_inum = src.v->d_inum;
+	dst->v.d_type = src.v->d_type;
+}
+
+int bch2_dirent_rename(struct btree_trans *trans,
+		       u64 src_dir, struct bch_hash_info *src_hash,
+		       u64 dst_dir, struct bch_hash_info *dst_hash,
+		       const struct qstr *src_name, u64 *src_inum,
+		       const struct qstr *dst_name, u64 *dst_inum,
+		       enum bch_rename_mode mode)
+{
+	struct btree_iter *src_iter, *dst_iter;
+	struct bkey_s_c old_src, old_dst;
+	struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
+	struct bpos dst_pos =
+		POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
+	int ret;
+
+	*src_inum = *dst_inum = 0;
+
+	/*
+	 * Lookup dst:
+	 *
+	 * Note that in BCH_RENAME mode, we're _not_ checking if
+	 * the target already exists - we're relying on the VFS
+	 * to do that check for us for correctness:
+	 */
+	dst_iter = mode == BCH_RENAME
+		? bch2_hash_hole(trans, bch2_dirent_hash_desc,
+				 dst_hash, dst_dir, dst_name)
+		: bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+				   dst_hash, dst_dir, dst_name,
+				   BTREE_ITER_INTENT);
+	if (IS_ERR(dst_iter))
+		return PTR_ERR(dst_iter);
+	old_dst = bch2_btree_iter_peek_slot(dst_iter);
+
+	if (mode != BCH_RENAME)
+		*dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
+
+	/* Lookup src: */
+	src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+				    src_hash, src_dir, src_name,
+				    BTREE_ITER_INTENT);
+	if (IS_ERR(src_iter))
+		return PTR_ERR(src_iter);
+	old_src = bch2_btree_iter_peek_slot(src_iter);
+	*src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
+
+	/* Create new dst key: */
+	new_dst = dirent_create_key(trans, 0, dst_name, 0);
+	if (IS_ERR(new_dst))
+		return PTR_ERR(new_dst);
+
+	dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+	new_dst->k.p = dst_iter->pos;
+
+	/* Create new src key: */
+	if (mode == BCH_RENAME_EXCHANGE) {
+		new_src = dirent_create_key(trans, 0, src_name, 0);
+		if (IS_ERR(new_src))
+			return PTR_ERR(new_src);
+
+		dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+		new_src->k.p = src_iter->pos;
+	} else {
+		new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+		if (IS_ERR(new_src))
+			return PTR_ERR(new_src);
+		bkey_init(&new_src->k);
+		new_src->k.p = src_iter->pos;
+
+		if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
+		    bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
+			/*
+			 * We have a hash collision for the new dst key,
+			 * and new_src - the key we're deleting - is between
+			 * new_dst's hashed slot and the slot we're going to be
+			 * inserting it into - oops.  This will break the hash
+			 * table if we don't deal with it:
+			 */
+			if (mode == BCH_RENAME) {
+				/*
+				 * If we're not overwriting, we can just insert
+				 * new_dst at the src position:
+				 */
+				new_dst->k.p = src_iter->pos;
+				bch2_trans_update(trans, src_iter,
+						  &new_dst->k_i);
+				return 0;
+			} else {
+				/* If we're overwriting, we can't insert new_dst
+				 * at a different slot because it has to
+				 * overwrite old_dst - just make sure to use a
+				 * whiteout when deleting src:
+				 */
+				new_src->k.type = KEY_TYPE_whiteout;
+			}
+		} else {
+			/* Check if we need a whiteout to delete src: */
+			ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
+						       src_hash, src_iter);
+			if (ret < 0)
+				return ret;
+
+			if (ret)
+				new_src->k.type = KEY_TYPE_whiteout;
+		}
+	}
+
+	bch2_trans_update(trans, src_iter, &new_src->k_i);
+	bch2_trans_update(trans, dst_iter, &new_dst->k_i);
+	return 0;
+}
+
+int bch2_dirent_delete_at(struct btree_trans *trans,
+			  const struct bch_hash_info *hash_info,
+			  struct btree_iter *iter)
+{
+	return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+				   hash_info, iter);
+}
+
+int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
+		       const struct bch_hash_info *hash_info,
+		       const struct qstr *name,
+		       u64 *journal_seq)
+{
+	return bch2_trans_do(c, journal_seq,
+			     BTREE_INSERT_ATOMIC|
+			     BTREE_INSERT_NOFAIL,
+		bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info,
+				 dir_inum, name));
+}
+
+struct btree_iter *
+__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
+			   const struct bch_hash_info *hash_info,
+			   const struct qstr *name, unsigned flags)
+{
+	return bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+				hash_info, dir_inum, name, flags);
+}
+
+u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
+		       const struct bch_hash_info *hash_info,
+		       const struct qstr *name)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 inum = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = __bch2_dirent_lookup_trans(&trans, dir_inum,
+					  hash_info, name, 0);
+	if (IS_ERR(iter)) {
+		BUG_ON(PTR_ERR(iter) == -EINTR);
+		goto out;
+	}
+
+	k = bch2_btree_iter_peek_slot(iter);
+	inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+out:
+	bch2_trans_exit(&trans);
+	return inum;
+}
+
+int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	for_each_btree_key(trans, iter, BTREE_ID_DIRENTS,
+			   POS(dir_inum, 0), 0, k, ret) {
+		if (k.k->p.inode > dir_inum)
+			break;
+
+		if (k.k->type == KEY_TYPE_dirent) {
+			ret = -ENOTEMPTY;
+			break;
+		}
+	}
+	bch2_trans_iter_put(trans, iter);
+
+	return ret;
+}
+
+int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_dirent dirent;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+			   POS(inum, ctx->pos), 0, k, ret) {
+		if (k.k->p.inode > inum)
+			break;
+
+		if (k.k->type != KEY_TYPE_dirent)
+			continue;
+
+		dirent = bkey_s_c_to_dirent(k);
+
+		/*
+		 * XXX: dir_emit() can fault and block, while we're holding
+		 * locks
+		 */
+		ctx->pos = dirent.k->p.offset;
+		if (!dir_emit(ctx, dirent.v->d_name,
+			      bch2_dirent_name_bytes(dirent),
+			      le64_to_cpu(dirent.v->d_inum),
+			      dirent.v->d_type))
+			break;
+		ctx->pos = dirent.k->p.offset + 1;
+	}
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	return ret;
+}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
new file mode 100644
index 000000000000..e6184dc796d3
--- /dev/null
+++ b/fs/bcachefs/dirent.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_H
+#define _BCACHEFS_DIRENT_H
+
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
+
+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_dirent (struct bkey_ops) {	\
+	.key_invalid	= bch2_dirent_invalid,		\
+	.val_to_text	= bch2_dirent_to_text,		\
+}
+
+struct qstr;
+struct file;
+struct dir_context;
+struct bch_fs;
+struct bch_hash_info;
+struct bch_inode_info;
+
+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
+
+static inline unsigned dirent_val_u64s(unsigned len)
+{
+	return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
+			    sizeof(u64));
+}
+
+int bch2_dirent_create(struct btree_trans *, u64,
+		       const struct bch_hash_info *, u8,
+		       const struct qstr *, u64, int);
+
+int bch2_dirent_delete_at(struct btree_trans *,
+			  const struct bch_hash_info *,
+			  struct btree_iter *);
+int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
+		       const struct qstr *, u64 *);
+
+enum bch_rename_mode {
+	BCH_RENAME,
+	BCH_RENAME_OVERWRITE,
+	BCH_RENAME_EXCHANGE,
+};
+
+int bch2_dirent_rename(struct btree_trans *,
+		       u64, struct bch_hash_info *,
+		       u64, struct bch_hash_info *,
+		       const struct qstr *, u64 *,
+		       const struct qstr *, u64 *,
+		       enum bch_rename_mode);
+
+struct btree_iter *
+__bch2_dirent_lookup_trans(struct btree_trans *, u64,
+			   const struct bch_hash_info *,
+			   const struct qstr *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
+		       const struct qstr *);
+
+int bch2_empty_dir_trans(struct btree_trans *, u64);
+int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
+
+#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
new file mode 100644
index 000000000000..4a4ec8f46108
--- /dev/null
+++ b/fs/bcachefs/disk_groups.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+
+static int group_cmp(const void *_l, const void *_r)
+{
+	const struct bch_disk_group *l = _l;
+	const struct bch_disk_group *r = _r;
+
+	return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+		(BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
+		((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+		 (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
+		strncmp(l->label, r->label, sizeof(l->label));
+}
+
+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
+						struct bch_sb_field *f)
+{
+	struct bch_sb_field_disk_groups *groups;
+	struct bch_disk_group *g, *sorted = NULL;
+	struct bch_sb_field_members *mi;
+	struct bch_member *m;
+	unsigned i, nr_groups;
+	const char *err = NULL;
+
+	mi		= bch2_sb_get_members(sb);
+	groups		= bch2_sb_get_disk_groups(sb);
+	nr_groups	= disk_groups_nr(groups);
+
+	/* BCH_MEMBER_GROUP() is 1 based; 0 means the member has no group */
+	for (m = mi->members;
+	     m < mi->members + sb->nr_devices;
+	     m++) {
+		unsigned idx;
+
+		if (!BCH_MEMBER_GROUP(m))
+			continue;
+
+		idx = BCH_MEMBER_GROUP(m) - 1;
+
+		if (idx >= nr_groups ||
+		    BCH_GROUP_DELETED(&groups->entries[idx]))
+			return "disk has invalid group";
+	}
+
+	if (!nr_groups)
+		return NULL;
+
+	/* Every group that is not deleted needs a non-empty label */
+	for (g = groups->entries;
+	     g < groups->entries + nr_groups;
+	     g++) {
+		if (BCH_GROUP_DELETED(g))
+			continue;
+
+		if (!strnlen(g->label, sizeof(g->label))) {
+			err = "group with empty label";
+			goto err;
+		}
+	}
+
+	sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+	if (!sorted)
+		return "cannot allocate memory";
+
+	/* Sort a copy so entries that compare equal end up adjacent */
+	memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+	sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+	for (i = 0; i + 1 < nr_groups; i++)
+		if (!BCH_GROUP_DELETED(sorted + i) &&
+		    !group_cmp(sorted + i, sorted + i + 1)) {
+			err = "duplicate groups";
+			goto err;
+		}
+
+err:
+	kfree(sorted);
+	return err;
+}
+
+static void bch2_sb_disk_groups_to_text(struct printbuf *out,
+					struct bch_sb *sb,
+					struct bch_sb_field *f)
+{
+	struct bch_sb_field_disk_groups *groups =
+		field_to_type(f, disk_groups);
+	struct bch_disk_group *g;
+	unsigned nr_groups = disk_groups_nr(groups);
+
+	for (g = groups->entries; g < groups->entries + nr_groups; g++) {
+		if (g != groups->entries)
+			pr_buf(out, " ");
+
+		/* A label that fills its field has no NUL: bound the print */
+		if (BCH_GROUP_DELETED(g))
+			pr_buf(out, "[deleted]");
+		else
+			pr_buf(out, "[parent %llu name %.*s]",
+			       BCH_GROUP_PARENT(g),
+			       (int) sizeof(g->label), g->label);
+	}
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
+	.validate	= bch2_sb_disk_groups_validate,
+	.to_text	= bch2_sb_disk_groups_to_text
+};
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+	struct bch_sb_field_members *mi;
+	struct bch_sb_field_disk_groups *groups;
+	struct bch_disk_groups_cpu *cpu_g, *old_g;
+	unsigned i, g, nr_groups;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	mi		= bch2_sb_get_members(c->disk_sb.sb);
+	groups		= bch2_sb_get_disk_groups(c->disk_sb.sb);
+	nr_groups	= disk_groups_nr(groups);
+
+	if (!groups)
+		return 0;
+
+	cpu_g = kzalloc(sizeof(*cpu_g) +
+			sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+	if (!cpu_g)
+		return -ENOMEM;
+
+	cpu_g->nr = nr_groups;
+
+	for (i = 0; i < nr_groups; i++) {
+		struct bch_disk_group *src	= &groups->entries[i];
+		struct bch_disk_group_cpu *dst	= &cpu_g->entries[i];
+
+		dst->deleted	= BCH_GROUP_DELETED(src);
+		dst->parent	= BCH_GROUP_PARENT(src);
+	}
+
+	for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+		struct bch_member *m = mi->members + i;
+		struct bch_disk_group_cpu *dst;
+
+		if (!bch2_member_exists(m))
+			continue;
+
+		/* Set the device's bit in its group and every ancestor group */
+		g = BCH_MEMBER_GROUP(m);
+		while (g) {
+			dst = &cpu_g->entries[g - 1];
+			__set_bit(i, dst->devs.d);
+			g = dst->parent;
+		}
+	}
+
+	old_g = rcu_dereference_protected(c->disk_groups,
+				lockdep_is_held(&c->sb_lock));
+	rcu_assign_pointer(c->disk_groups, cpu_g);
+	if (old_g)
+		kfree_rcu(old_g, rcu);
+
+	return 0;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+	struct target t = target_decode(target);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		return NULL;
+	case TARGET_DEV: {
+		struct bch_dev *ca = t.dev < c->sb.nr_devices
+			? rcu_dereference(c->devs[t.dev])
+			: NULL;
+		return ca ? &ca->self : NULL;
+	}
+	case TARGET_GROUP: {
+		struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+		if (!g || t.group >= g->nr || g->entries[t.group].deleted)
+			return NULL;	/* no groups loaded, or a stale target */
+		return &g->entries[t.group].devs;
+	}
+	default:
+		BUG();
+	}
+}
+
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+	struct target t = target_decode(target);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		return false;
+	case TARGET_DEV:
+		return dev == t.dev;
+	case TARGET_GROUP: {
+		struct bch_disk_groups_cpu *g;
+		bool ret = false;
+
+		rcu_read_lock();
+		g = rcu_dereference(c->disk_groups);
+		/*
+		 * bch2_sb_disk_groups_to_cpu() leaves c->disk_groups untouched
+		 * when the superblock has no disk_groups, so g may be NULL:
+		 */
+		if (g && t.group < g->nr && !g->entries[t.group].deleted)
+			ret = test_bit(dev, g->entries[t.group].devs.d);
+		rcu_read_unlock();
+
+		return ret;
+	}
+	default:
+		BUG();
+	}
+}
+
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+				  unsigned parent,
+				  const char *name, unsigned namelen)
+{
+	unsigned idx, nr_groups = disk_groups_nr(groups);
+
+	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+		return -EINVAL;
+
+	/* Linear scan for a live group under @parent labelled @name */
+	for (idx = 0; idx < nr_groups; idx++) {
+		struct bch_disk_group *grp = &groups->entries[idx];
+
+		if (BCH_GROUP_DELETED(grp) || BCH_GROUP_PARENT(grp) != parent)
+			continue;
+
+		/* Compare by length: a full-width label has no NUL */
+		if (strnlen(grp->label, sizeof(grp->label)) == namelen &&
+		    !memcmp(name, grp->label, namelen))
+			return idx;
+	}
+
+	return -1;
+}
+
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+				 const char *name, unsigned namelen)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_get_disk_groups(sb->sb);
+	unsigned i, nr_groups = disk_groups_nr(groups);
+	struct bch_disk_group *g;
+
+	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+		return -EINVAL;
+
+	for (i = 0;
+	     i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+	     i++)
+		;
+
+	if (i == nr_groups) {
+		unsigned u64s =
+			(sizeof(struct bch_sb_field_disk_groups) +
+			 sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+			sizeof(u64);
+
+		groups = bch2_sb_resize_disk_groups(sb, u64s);
+		if (!groups)
+			return -ENOSPC;
+
+		nr_groups = disk_groups_nr(groups);
+	}
+
+	BUG_ON(i >= nr_groups);
+
+	g = &groups->entries[i];
+
+	memcpy(g->label, name, namelen);
+	if (namelen < sizeof(g->label))
+		g->label[namelen] = '\0';
+	SET_BCH_GROUP_DELETED(g, 0);
+	SET_BCH_GROUP_PARENT(g, parent);
+	SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+	return i;
+}
+
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_get_disk_groups(sb->sb);
+	int v = -1;
+
+	do {
+		const char *next = strchrnul(name, '.');
+		unsigned len = next - name;
+
+		if (*next == '.')
+			next++;
+
+		v = __bch2_disk_group_find(groups, v + 1, name, len);
+		name = next;
+	} while (*name && v >= 0);
+
+	return v;
+}
+
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+	struct bch_sb_field_disk_groups *groups;
+	unsigned parent = 0;
+	int v = -1;
+
+	do {
+		const char *next = strchrnul(name, '.');
+		unsigned len = next - name;
+
+		if (*next == '.')
+			next++;
+
+		groups = bch2_sb_get_disk_groups(sb->sb);
+
+		v = __bch2_disk_group_find(groups, parent, name, len);
+		if (v < 0)
+			v = __bch2_disk_group_add(sb, parent, name, len);
+		if (v < 0)
+			return v;
+
+		parent = v + 1;
+		name = next;
+	} while (*name && v >= 0);
+
+	return v;
+}
+
+void bch2_disk_path_to_text(struct printbuf *out,
+			    struct bch_sb_handle *sb,
+			    unsigned v)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_get_disk_groups(sb->sb);
+	struct bch_disk_group *g;
+	unsigned nr = 0;
+	u16 path[32];
+
+	while (1) {
+		if (nr == ARRAY_SIZE(path))
+			goto inval;
+
+		if (v >= disk_groups_nr(groups))
+			goto inval;
+
+		g = groups->entries + v;
+
+		if (BCH_GROUP_DELETED(g))
+			goto inval;
+
+		path[nr++] = v;
+
+		if (!BCH_GROUP_PARENT(g))
+			break;
+
+		v = BCH_GROUP_PARENT(g) - 1;
+	}
+
+	while (nr) {
+		v = path[--nr];
+		g = groups->entries + v;
+
+		bch_scnmemcpy(out, g->label,
+			      strnlen(g->label, sizeof(g->label)));
+
+		if (nr)
+			pr_buf(out, ".");
+	}
+	return;
+inval:
+	pr_buf(out, "invalid group %u", v);
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+	struct bch_member *mi;
+	int v = -1;
+
+	mutex_lock(&c->sb_lock);
+
+	if (!strlen(name) || !strcmp(name, "none"))
+		goto write_sb;
+
+	v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+	if (v < 0) {
+		mutex_unlock(&c->sb_lock);
+		return v;
+	}
+
+write_sb:
+	mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+	SET_BCH_MEMBER_GROUP(mi, v + 1);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+{
+	struct bch_dev *ca;
+	int g;
+
+	if (!strlen(buf) || !strcmp(buf, "none")) {
+		*v = 0;
+		return 0;
+	}
+
+	/* Is it a device? */
+	ca = bch2_dev_lookup(c, buf);
+	if (!IS_ERR(ca)) {
+		*v = dev_to_target(ca->dev_idx);
+		percpu_ref_put(&ca->ref);
+		return 0;
+	}
+
+	mutex_lock(&c->sb_lock);
+	g = bch2_disk_path_find(&c->disk_sb, buf);
+	mutex_unlock(&c->sb_lock);
+
+	if (g >= 0) {
+		*v = group_to_target(g);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v)
+{
+	struct target t = target_decode(v);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		pr_buf(out, "none");
+		break;
+	case TARGET_DEV: {
+		struct bch_dev *ca;
+
+		rcu_read_lock();
+		ca = t.dev < c->sb.nr_devices
+			? rcu_dereference(c->devs[t.dev])
+			: NULL;
+
+		if (ca && percpu_ref_tryget(&ca->io_ref)) {
+			char b[BDEVNAME_SIZE];
+
+			pr_buf(out, "/dev/%s",
+			     bdevname(ca->disk_sb.bdev, b));
+			percpu_ref_put(&ca->io_ref);
+		} else if (ca) {
+			pr_buf(out, "offline device %u", t.dev);
+		} else {
+			pr_buf(out, "invalid device %u", t.dev);
+		}
+
+		rcu_read_unlock();
+		break;
+	}
+	case TARGET_GROUP:
+		mutex_lock(&c->sb_lock);
+		bch2_disk_path_to_text(out, &c->disk_sb, t.group);
+		mutex_unlock(&c->sb_lock);
+		break;
+	default:
+		BUG();
+	}
+}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
new file mode 100644
index 000000000000..c8e0c37a5e1a
--- /dev/null
+++ b/fs/bcachefs/disk_groups.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_H
+#define _BCACHEFS_DISK_GROUPS_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
+
+/*
+ * Number of struct bch_disk_group entries stored in the disk groups superblock
+ * field, computed from the size of the field; 0 if the field is absent.
+ */
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+	if (!groups)
+		return 0;
+
+	return (vstruct_end(&groups->field) -
+		(void *) &groups->entries[0]) / sizeof(struct bch_disk_group);
+}
+
+/*
+ * Decoded form of a numeric target value (see target_decode()): no target, a
+ * single device index, or a disk group index.
+ */
+struct target {
+	enum {
+		TARGET_NULL,
+		TARGET_DEV,
+		TARGET_GROUP,
+	}			type;
+	/* dev and group share storage; which one is meaningful depends on type */
+	union {
+		unsigned	dev;
+		unsigned	group;
+	};
+};
+
+#define TARGET_DEV_START	1
+#define TARGET_GROUP_START	(256 + TARGET_DEV_START)
+
+/* Encode device index @dev as a target value (offset by TARGET_DEV_START). */
+static inline u16 dev_to_target(unsigned dev)
+{
+	return TARGET_DEV_START + dev;
+}
+
+/* Encode disk group index @group as a target value (offset by TARGET_GROUP_START). */
+static inline u16 group_to_target(unsigned group)
+{
+	return TARGET_GROUP_START + group;
+}
+
+/*
+ * Decode a numeric target value into a struct target.
+ *
+ * Values at or above TARGET_GROUP_START are disk groups, values in
+ * [TARGET_DEV_START, TARGET_GROUP_START) are devices, and anything below
+ * TARGET_DEV_START (i.e. 0) means no target.
+ */
+static inline struct target target_decode(unsigned target)
+{
+	if (target >= TARGET_GROUP_START)
+		return (struct target) {
+			.type	= TARGET_GROUP,
+			.group	= target - TARGET_GROUP_START
+		};
+
+	/*
+	 * Initialize the member that matches the type: .dev and .group alias
+	 * each other through the anonymous union, but readers of a TARGET_DEV
+	 * target use .dev.
+	 */
+	if (target >= TARGET_DEV_START)
+		return (struct target) {
+			.type	= TARGET_DEV,
+			.dev	= target - TARGET_DEV_START
+		};
+
+	return (struct target) { .type = TARGET_NULL };
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+/*
+ * Return, by value, the mask of devices in c->rw_devs[data_type], restricted
+ * to the devices in @target when bch2_target_to_mask() returns a mask for it.
+ * If no mask is returned the rw_devs mask is returned unfiltered.
+ */
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+						  enum bch_data_type data_type,
+						  u16 target)
+{
+	struct bch_devs_mask devs = c->rw_devs[data_type];
+	const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
+
+	if (t)
+		bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+	return devs;
+}
+
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
+
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
+			    unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64);
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+
+const char *bch2_sb_validate_disk_groups(struct bch_sb *,
+					 struct bch_sb_field *);
+
+#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
new file mode 100644
index 000000000000..ad92d3b452c0
--- /dev/null
+++ b/fs/bcachefs/ec.c
@@ -0,0 +1,1401 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* erasure coding */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "super-io.h"
+#include "util.h"
+
+#include <linux/sort.h>
+
+#ifdef __KERNEL__
+
+#include <linux/raid/pq.h>
+#include <linux/raid/xor.h>
+
+/*
+ * Rebuild block @failed_idx of @data as the XOR of the other @disks - 1
+ * blocks, each @size bytes long.
+ *
+ * The failed block is temporarily swapped into slot 0 so it can be used as the
+ * destination; the pointer array is restored before returning.
+ */
+static void raid5_recov(unsigned disks, unsigned failed_idx,
+			size_t size, void **data)
+{
+	unsigned i = 2, nr;
+
+	BUG_ON(failed_idx >= disks);
+
+	swap(data[0], data[failed_idx]);
+	/* seed the destination with the first source block: */
+	memcpy(data[0], data[1], size);
+
+	/* xor in the remaining blocks, at most MAX_XOR_BLOCKS per call: */
+	while (i < disks) {
+		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
+		xor_blocks(nr, size, data[0], data + i);
+		i += nr;
+	}
+
+	swap(data[0], data[failed_idx]);
+}
+
+/*
+ * Generate @np parity blocks for @nd data blocks of @size bytes each; @v holds
+ * the data block pointers followed by the parity block pointers.
+ *
+ * With one or more parity blocks the first is computed as the XOR of the data
+ * blocks; with two, raid6_call.gen_syndrome() is run over the whole set. More
+ * than two parity blocks is not supported.
+ */
+static void raid_gen(int nd, int np, size_t size, void **v)
+{
+	/* Reject unsupported configurations before touching any buffers: */
+	BUG_ON(np > 2);
+
+	if (np >= 1)
+		raid5_recov(nd + np, nd, size, v);
+	if (np >= 2)
+		raid6_call.gen_syndrome(nd + np, size, v);
+}
+
+/*
+ * Reconstruct up to two failed blocks of a stripe.
+ *
+ * @nr:   number of failed blocks (0, 1 or 2; anything else is a BUG())
+ * @ir:   indices of the failed blocks; the two-failure case tests ir[1]
+ *        before ir[0], so ir[0] < ir[1] is presumably expected
+ * @nd:   number of data blocks
+ * @np:   number of parity blocks
+ * @size: size of each block in bytes
+ * @v:    block pointers, data blocks first, then parity
+ */
+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+	switch (nr) {
+	case 0:
+		break;
+	case 1:
+		/* a data block or P is rebuilt by xor; otherwise regenerate: */
+		if (ir[0] < nd + 1)
+			raid5_recov(nd + 1, ir[0], size, v);
+		else
+			raid6_call.gen_syndrome(nd + np, size, v);
+		break;
+	case 2:
+		if (ir[1] < nd) {
+			/* data+data failure. */
+			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
+		} else if (ir[0] < nd) {
+			/* data + p/q failure */
+
+			if (ir[1] == nd) /* data + p failure */
+				raid6_datap_recov(nd + np, size, ir[0], v);
+			else { /* data + q failure */
+				raid5_recov(nd + 1, ir[0], size, v);
+				raid6_call.gen_syndrome(nd + np, size, v);
+			}
+		} else {
+			/* both failures are parity blocks: regenerate them */
+			raid_gen(nd, np, size, v);
+		}
+		break;
+	default:
+		BUG();
+	}
+}
+
+#else
+
+#include <raid/raid.h>
+
+#endif
+
+/*
+ * Per-bio state for stripe block IO: the bio is embedded so the completion
+ * handler can get back to this struct with container_of().
+ */
+struct ec_bio {
+	/* device the bio was submitted to */
+	struct bch_dev		*ca;
+	/* stripe buffer this IO belongs to */
+	struct ec_stripe_buf	*buf;
+	/* index of the block within the stripe */
+	size_t			idx;
+	struct bio		bio;
+};
+
+/* Stripes btree keys: */
+
+/*
+ * Validate a stripe key.
+ *
+ * Returns NULL if the key is valid, otherwise a string describing the problem:
+ * the key's inode field must be 0, and the value must be large enough to hold
+ * both the fixed size header and the variable length portion described by it.
+ * Pointer validation is delegated to bch2_bkey_ptrs_invalid().
+ */
+const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+
+	if (k.k->p.inode)
+		return "invalid stripe key";
+
+	/*
+	 * The header size check must come first: stripe_val_u64s() reads header
+	 * fields, so it may only be evaluated once we know they are present.
+	 */
+	if (bkey_val_bytes(k.k) < sizeof(*s) ||
+	    bkey_val_u64s(k.k) < stripe_val_u64s(s))
+		return "incorrect value size";
+
+	return bch2_bkey_ptrs_invalid(c, k);
+}
+
+/*
+ * Print the value of stripe key @k to @out: the header fields (algorithm,
+ * sectors, data:redundant block counts, checksum type and granularity),
+ * followed by dev:offset:blockcount for every block in the stripe.
+ */
+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+			 struct bkey_s_c k)
+{
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned i;
+
+	pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+	       s->algorithm,
+	       le16_to_cpu(s->sectors),
+	       s->nr_blocks - s->nr_redundant,
+	       s->nr_redundant,
+	       s->csum_type,
+	       1U << s->csum_granularity_bits);
+
+	for (i = 0; i < s->nr_blocks; i++)
+		pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
+		       (u64) s->ptrs[i].offset,
+		       stripe_blockcount_get(s, i));
+}
+
+/*
+ * Find the data block of stripe @v that @ptr points into.
+ *
+ * A pointer matches a block when device and generation are equal and the
+ * pointer's offset lies within the block's sectors. Only data blocks are
+ * considered, not redundancy blocks.
+ *
+ * Returns the block index, or -1 if no data block matches.
+ */
+static int ptr_matches_stripe(struct bch_fs *c,
+			      struct bch_stripe *v,
+			      const struct bch_extent_ptr *ptr)
+{
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned block;
+
+	for (block = 0; block < nr_data; block++) {
+		const struct bch_extent_ptr *candidate = &v->ptrs[block];
+
+		if (ptr->dev != candidate->dev ||
+		    ptr->gen != candidate->gen)
+			continue;
+
+		if (ptr->offset <  candidate->offset ||
+		    ptr->offset >= candidate->offset + le16_to_cpu(v->sectors))
+			continue;
+
+		return block;
+	}
+
+	return -1;
+}
+
+/*
+ * Check whether any pointer of extent @k points into a data block of stripe
+ * @v. Keys that are not extents never match.
+ *
+ * Returns the index of the first matching stripe block, or -1.
+ */
+static int extent_matches_stripe(struct bch_fs *c,
+				 struct bch_stripe *v,
+				 struct bkey_s_c k)
+{
+	struct bkey_s_c_extent e;
+	const struct bch_extent_ptr *ptr;
+
+	if (k.k->type != KEY_TYPE_extent)
+		return -1;
+
+	e = bkey_s_c_to_extent(k);
+
+	extent_for_each_ptr(e, ptr) {
+		int block = ptr_matches_stripe(c, v, ptr);
+
+		if (block >= 0)
+			return block;
+	}
+
+	return -1;
+}
+
+/*
+ * Returns true if extent @k already carries a stripe pointer entry referring
+ * to stripe @idx. Keys that are not extents never do.
+ */
+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
+{
+	struct bkey_s_c_extent e;
+	const union bch_extent_entry *entry;
+
+	if (k.k->type != KEY_TYPE_extent)
+		return false;
+
+	e = bkey_s_c_to_extent(k);
+
+	extent_for_each_entry(e, entry) {
+		bool is_stripe_ptr = extent_entry_type(entry) ==
+			BCH_EXTENT_ENTRY_stripe_ptr;
+
+		if (is_stripe_ptr && entry->stripe_ptr.idx == idx)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Initialize stripe key @s for a new stripe made up of the open buckets in
+ * @blocks (data) and @parity (redundancy), each @stripe_size sectors long.
+ *
+ * Pointers are copied from the open buckets, data blocks first. The checksum
+ * granularity starts at ilog2(encoded_extent_max) and is increased until the
+ * value fits within BKEY_VAL_U64s_MAX.
+ */
+static void ec_stripe_key_init(struct bch_fs *c,
+			       struct bkey_i_stripe *s,
+			       struct open_buckets *blocks,
+			       struct open_buckets *parity,
+			       unsigned stripe_size)
+{
+	struct open_bucket *ob;
+	unsigned i, u64s;
+
+	bkey_stripe_init(&s->k_i);
+	s->v.sectors			= cpu_to_le16(stripe_size);
+	s->v.algorithm			= 0;
+	s->v.nr_blocks			= parity->nr + blocks->nr;
+	s->v.nr_redundant		= parity->nr;
+	s->v.csum_granularity_bits	= ilog2(c->sb.encoded_extent_max);
+	s->v.csum_type			= BCH_CSUM_CRC32C;
+	s->v.pad			= 0;
+
+	open_bucket_for_each(c, blocks, ob, i)
+		s->v.ptrs[i]			= ob->ptr;
+
+	open_bucket_for_each(c, parity, ob, i)
+		s->v.ptrs[blocks->nr + i]	= ob->ptr;
+
+	/* coarser granularity means fewer checksums, hence a smaller value: */
+	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+		BUG_ON(1 << s->v.csum_granularity_bits >=
+		       le16_to_cpu(s->v.sectors) ||
+		       s->v.csum_granularity_bits == U8_MAX);
+		s->v.csum_granularity_bits++;
+	}
+
+	set_bkey_val_u64s(&s->k, u64s);
+}
+
+/* Checksumming: */
+
+/*
+ * Compute and store the checksums for every block of the stripe in @buf.
+ *
+ * Each block is checksummed in chunks of the stripe's checksum granularity
+ * (the last chunk may be shorter). @buf must cover the whole stripe. Does
+ * nothing if the checksum type has zero-length checksums.
+ */
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned csum_granularity = 1 << v->csum_granularity_bits;
+	unsigned csums_per_device = stripe_csums_per_device(v);
+	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
+	unsigned i, j;
+
+	if (!csum_bytes)
+		return;
+
+	BUG_ON(buf->offset);
+	BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		for (j = 0; j < csums_per_device; j++) {
+			/* offset and len are in sectors; << 9 converts to bytes */
+			unsigned offset = j << v->csum_granularity_bits;
+			unsigned len = min(csum_granularity, buf->size - offset);
+
+			struct bch_csum csum =
+				bch2_checksum(NULL, v->csum_type,
+					      null_nonce(),
+					      buf->data[i] + (offset << 9),
+					      len << 9);
+
+			memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
+		}
+	}
+}
+
+/*
+ * Verify the buffered range of every block still marked valid in @buf against
+ * the checksums stored in the stripe key.
+ *
+ * A block whose checksum does not match is logged and has its bit cleared in
+ * buf->valid; the remainder of that block is not checked. Does nothing if the
+ * checksum type has zero-length checksums.
+ */
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned csum_granularity = 1 << v->csum_granularity_bits;
+	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
+	unsigned i;
+
+	if (!csum_bytes)
+		return;
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		/* offsets are in sectors, relative to the start of the block */
+		unsigned offset = buf->offset;
+		unsigned end = buf->offset + buf->size;
+
+		if (!test_bit(i, buf->valid))
+			continue;
+
+		while (offset < end) {
+			unsigned j = offset >> v->csum_granularity_bits;
+			unsigned len = min(csum_granularity, end - offset);
+			struct bch_csum csum;
+
+			/*
+			 * the buffered range must be aligned to the checksum
+			 * granularity, except at the very end of the stripe:
+			 */
+			BUG_ON(offset & (csum_granularity - 1));
+			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+			       ((offset + len) & (csum_granularity - 1)));
+
+			csum = bch2_checksum(NULL, v->csum_type,
+					     null_nonce(),
+					     buf->data[i] + ((offset - buf->offset) << 9),
+					     len << 9);
+
+			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
+				__bcache_io_error(c,
+					"checksum error while doing reconstruct read (%u:%u)",
+					i, j);
+				clear_bit(i, buf->valid);
+				break;
+			}
+
+			offset += len;
+		}
+	}
+}
+
+/* Erasure coding: */
+
+/*
+ * Compute the redundancy blocks of the stripe in @buf from its data blocks,
+ * over the full stripe length.
+ */
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *stripe = &buf->key.v;
+
+	raid_gen(stripe->nr_blocks - stripe->nr_redundant,
+		 stripe->nr_redundant,
+		 le16_to_cpu(stripe->sectors) << 9,
+		 buf->data);
+}
+
+/* Number of the first @nr blocks of @buf that are not marked valid. */
+static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
+{
+	return nr - bitmap_weight(buf->valid, nr);
+}
+
+/* Number of blocks in the whole stripe (data and redundancy) not marked valid. */
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+	return __ec_nr_failed(buf, buf->key.v.nr_blocks);
+}
+
+/*
+ * Reconstruct the data blocks of @buf that are not marked valid, using the
+ * remaining blocks.
+ *
+ * Returns 0 on success, or -EIO if more blocks have failed than the stripe has
+ * redundancy for.
+ */
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	/* raid_rec() takes the failed indices as an array of int: */
+	int failed[EC_STRIPE_MAX];
+	unsigned i, nr_failed = 0;
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned bytes = buf->size << 9;
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		__bcache_io_error(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		return -EIO;
+	}
+
+	/* only data blocks need to be rebuilt: */
+	for (i = 0; i < nr_data; i++)
+		if (!test_bit(i, buf->valid))
+			failed[nr_failed++] = i;
+
+	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
+	return 0;
+}
+
+/* IO: */
+
+/*
+ * Completion handler for bios submitted by ec_block_io(): on IO error the
+ * block is marked invalid in the stripe buffer, then the bio, the device
+ * io_ref and the closure ref taken at submission are all dropped.
+ */
+static void ec_block_endio(struct bio *bio)
+{
+	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+	struct bch_dev *ca = ec_bio->ca;
+	struct closure *cl = bio->bi_private;
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
+		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+
+	bio_put(&ec_bio->bio);
+	percpu_ref_put(&ca->io_ref);
+	closure_put(cl);
+}
+
+/*
+ * Submit IO of type @rw for block @idx of the stripe in @buf, covering the
+ * buffered range of the block.
+ *
+ * The range is split into as many bios as needed; each holds a ref on @cl and
+ * on the device's io_ref until ec_block_endio() runs. If no io ref can be
+ * obtained for the device the block is marked invalid and nothing is
+ * submitted.
+ */
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+			unsigned rw, unsigned idx, struct closure *cl)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned offset = 0, bytes = buf->size << 9;
+	struct bch_extent_ptr *ptr = &v->ptrs[idx];
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+	if (!bch2_dev_get_ioref(ca, rw)) {
+		clear_bit(idx, buf->valid);
+		return;
+	}
+
+	while (offset < bytes) {
+		unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
+					   DIV_ROUND_UP(bytes, PAGE_SIZE));
+		unsigned b = min_t(size_t, bytes - offset,
+				   nr_iovecs << PAGE_SHIFT);
+		struct ec_bio *ec_bio;
+
+		ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
+						       &c->ec_bioset),
+				      struct ec_bio, bio);
+
+		ec_bio->ca			= ca;
+		ec_bio->buf			= buf;
+		ec_bio->idx			= idx;
+
+		bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
+		bio_set_op_attrs(&ec_bio->bio, rw, 0);
+
+		/* buf->offset is in sectors, offset is in bytes: */
+		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
+		ec_bio->bio.bi_end_io		= ec_block_endio;
+		ec_bio->bio.bi_private		= cl;
+
+		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
+
+		/* both refs are dropped in ec_block_endio(): */
+		closure_get(cl);
+		percpu_ref_get(&ca->io_ref);
+
+		submit_bio(&ec_bio->bio);
+
+		offset += b;
+	}
+
+	/* drop the ref presumably taken by bch2_dev_get_ioref() above: */
+	percpu_ref_put(&ca->io_ref);
+}
+
+/* recovery read path: */
+/*
+ * Satisfy read @rbio by reconstructing the data from the erasure coded stripe
+ * that rbio->pick refers to.
+ *
+ * Looks up the stripe key, reads the needed range (rounded out to checksum
+ * granularity) of every block in the stripe, verifies checksums, reconstructs
+ * missing data blocks and copies the requested range into the bio.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or another nonzero
+ * value if the stripe could not be found or too many blocks were unreadable.
+ */
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct ec_stripe_buf *buf;
+	struct closure cl;
+	struct bkey_s_c k;
+	struct bch_stripe *v;
+	unsigned stripe_idx;
+	unsigned offset, end;
+	unsigned i, nr_data, csum_granularity;
+	int ret = 0, idx;
+
+	closure_init_stack(&cl);
+
+	BUG_ON(!rbio->pick.has_ec);
+
+	stripe_idx = rbio->pick.ec.idx;
+
+	/* zeroed, so buf->data[] starts out all NULL: */
+	buf = kzalloc(sizeof(*buf), GFP_NOIO);
+	if (!buf)
+		return -ENOMEM;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
+				   POS(0, stripe_idx),
+				   BTREE_ITER_SLOTS);
+	k = bch2_btree_iter_peek_slot(iter);
+	if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
+		__bcache_io_error(c,
+			"error doing reconstruct read: stripe not found");
+		kfree(buf);
+		return bch2_trans_exit(&trans) ?: -EIO;
+	}
+
+	/* copy the key out so the transaction can be released before doing IO: */
+	bkey_reassemble(&buf->key.k_i, k);
+	bch2_trans_exit(&trans);
+
+	v = &buf->key.v;
+
+	nr_data = v->nr_blocks - v->nr_redundant;
+
+	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
+	BUG_ON(idx < 0);
+
+	csum_granularity = 1U << v->csum_granularity_bits;
+
+	/* range of the read within the stripe block, in sectors: */
+	offset	= rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
+	end	= offset + bio_sectors(&rbio->bio);
+
+	BUG_ON(end > le16_to_cpu(v->sectors));
+
+	/* round out to checksum granularity so checksums can be verified: */
+	buf->offset	= round_down(offset, csum_granularity);
+	buf->size	= min_t(unsigned, le16_to_cpu(v->sectors),
+				round_up(end, csum_granularity)) - buf->offset;
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
+		if (!buf->data[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	/* every block starts out valid; failures clear individual bits: */
+	memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		struct bch_extent_ptr *ptr = v->ptrs + i;
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+		if (ptr_stale(ca, ptr)) {
+			__bcache_io_error(c,
+					  "error doing reconstruct read: stale pointer");
+			clear_bit(i, buf->valid);
+			continue;
+		}
+
+		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
+	}
+
+	/* wait for all the reads submitted above: */
+	closure_sync(&cl);
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		__bcache_io_error(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		ret = -EIO;
+		goto err;
+	}
+
+	ec_validate_checksums(c, buf);
+
+	ret = ec_do_recov(c, buf);
+	if (ret)
+		goto err;
+
+	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+		      buf->data[idx] + ((offset - buf->offset) << 9));
+err:
+	/* kfree(NULL) is a no-op, so unallocated slots are fine: */
+	for (i = 0; i < v->nr_blocks; i++)
+		kfree(buf->data[i]);
+	kfree(buf);
+	return ret;
+}
+
+/* stripe bucket accounting: */
+
+/*
+ * Make sure the in-memory stripe structures can hold stripe @idx, allocating
+ * with @gfp: grows the stripes heap if @idx is beyond its current size, and
+ * allocates the genradix slot for @idx in c->stripes[0] (and in c->stripes[1]
+ * when gc_pos.phase is not GC_PHASE_NOT_RUNNING).
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+	ec_stripes_heap n, *h = &c->ec_stripes_heap;
+
+	if (idx >= h->size) {
+		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+			return -ENOMEM;
+
+		spin_lock(&c->ec_stripes_heap_lock);
+		/* recheck under the lock before swapping in the larger heap: */
+		if (n.size > h->size) {
+			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
+			n.used = h->used;
+			swap(*h, n);
+		}
+		spin_unlock(&c->ec_stripes_heap_lock);
+
+		/* frees whichever heap is no longer in use: */
+		free_heap(&n);
+	}
+
+	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
+		return -ENOMEM;
+
+	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Allocate in-memory state for the stripe at the position of @iter.
+ *
+ * A non-blocking allocation is tried first. If that fails, the transaction is
+ * unlocked with bch2_trans_unlock() and the allocation is retried with
+ * GFP_KERNEL.
+ *
+ * Returns 0 if the non-blocking attempt succeeded, -EINTR if the blocking
+ * attempt succeeded (the transaction was unlocked), or -ENOMEM if both failed.
+ */
+static int ec_stripe_mem_alloc(struct bch_fs *c,
+			       struct btree_iter *iter)
+{
+	size_t idx = iter->pos.offset;
+	int ret = 0;
+
+	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
+		return ret;
+
+	bch2_trans_unlock(iter->trans);
+	ret = -EINTR;
+
+	if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+		return ret;
+
+	return -ENOMEM;
+}
+
+/*
+ * If the entry at the top of the stripes heap has no nonempty blocks, return
+ * its stripe index; otherwise (or if the heap is empty) return -1.
+ */
+static ssize_t stripe_idx_to_delete(struct bch_fs *c)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+
+	return h->used && h->data[0].blocks_nonempty == 0
+		? h->data[0].idx : -1;
+}
+
+/*
+ * Heap comparison callback: three-way compare of two entries by
+ * blocks_nonempty (returns -1, 0 or 1).
+ */
+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
+				      struct ec_stripe_heap_entry l,
+				      struct ec_stripe_heap_entry r)
+{
+	return ((l.blocks_nonempty > r.blocks_nonempty) -
+		(l.blocks_nonempty < r.blocks_nonempty));
+}
+
+/*
+ * Heap callback: record in the in-memory stripe that its heap entry now lives
+ * at heap position @i.
+ */
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
+						   size_t i)
+{
+	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
+
+	genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
+}
+
+/*
+ * Assert that stripe @idx is alive and that its heap_idx points at a heap
+ * entry that refers back to @idx.
+ */
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	struct stripe *m = genradix_ptr(&c->stripes[0], idx);
+
+	BUG_ON(!m->alive);
+	BUG_ON(m->heap_idx >= h->used);
+	BUG_ON(h->data[m->heap_idx].idx != idx);
+}
+
+/*
+ * Bring the stripes heap up to date with stripe @m (index @idx) after its
+ * blocks_nonempty count changed: an alive stripe has its heap entry updated
+ * and re-sifted, one that is not alive is inserted.
+ *
+ * Afterwards, if the top of the heap is an empty stripe and c->writes is not
+ * dying, the stripe deletion work is scheduled.
+ */
+void bch2_stripes_heap_update(struct bch_fs *c,
+			      struct stripe *m, size_t idx)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	size_t i;
+
+	if (m->alive) {
+		heap_verify_backpointer(c, idx);
+
+		h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
+
+		/* the count may have moved either way, so sift both ways: */
+		i = m->heap_idx;
+		heap_sift_up(h,	  i, ec_stripes_heap_cmp,
+			     ec_stripes_heap_set_backpointer);
+		heap_sift_down(h, i, ec_stripes_heap_cmp,
+			       ec_stripes_heap_set_backpointer);
+
+		heap_verify_backpointer(c, idx);
+	} else {
+		bch2_stripes_heap_insert(c, m, idx);
+	}
+
+	if (stripe_idx_to_delete(c) >= 0 &&
+	    !percpu_ref_is_dying(&c->writes))
+		schedule_work(&c->ec_stripe_delete_work);
+}
+
+/* Remove stripe @m (index @idx) from the stripes heap and mark it not alive. */
+void bch2_stripes_heap_del(struct bch_fs *c,
+			   struct stripe *m, size_t idx)
+{
+	heap_verify_backpointer(c, idx);
+
+	m->alive = false;
+	heap_del(&c->ec_stripes_heap, m->heap_idx,
+		 ec_stripes_heap_cmp,
+		 ec_stripes_heap_set_backpointer);
+}
+
+/*
+ * Add stripe @m (index @idx) to the stripes heap and mark it alive. The heap
+ * must have room for it.
+ */
+void bch2_stripes_heap_insert(struct bch_fs *c,
+			      struct stripe *m, size_t idx)
+{
+	BUG_ON(heap_full(&c->ec_stripes_heap));
+
+	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+			.idx = idx,
+			.blocks_nonempty = m->blocks_nonempty,
+		}),
+		 ec_stripes_heap_cmp,
+		 ec_stripes_heap_set_backpointer);
+	m->alive = true;
+
+	heap_verify_backpointer(c, idx);
+}
+
+/* stripe deletion */
+
+/*
+ * Delete the key for stripe @idx from the EC btree. Returns the result of
+ * bch2_btree_delete_range().
+ */
+static int ec_stripe_delete(struct bch_fs *c, size_t idx)
+{
+	return bch2_btree_delete_range(c, BTREE_ID_EC,
+				       POS(0, idx),
+				       POS(0, idx + 1),
+				       NULL);
+}
+
+/*
+ * Work item: keep deleting the stripe at the top of the stripes heap for as
+ * long as that stripe is empty, stopping at the first deletion error.
+ *
+ * Runs with gc_lock held for read and ec_stripe_create_lock held.
+ */
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+	struct bch_fs *c =
+		container_of(work, struct bch_fs, ec_stripe_delete_work);
+	ssize_t victim;
+
+	down_read(&c->gc_lock);
+	mutex_lock(&c->ec_stripe_create_lock);
+
+	for (;;) {
+		/* the heap may only be inspected under its spinlock: */
+		spin_lock(&c->ec_stripes_heap_lock);
+		victim = stripe_idx_to_delete(c);
+		spin_unlock(&c->ec_stripes_heap_lock);
+
+		if (victim < 0 || ec_stripe_delete(c, victim))
+			break;
+	}
+
+	mutex_unlock(&c->ec_stripe_create_lock);
+	up_read(&c->gc_lock);
+}
+
+/* stripe creation: */
+
+/*
+ * Insert @stripe into the EC btree at the first free slot.
+ *
+ * The search starts at c->ec_stripe_hint and wraps around to the start once if
+ * no free slot is found at or below offset U32_MAX. In-memory state for the
+ * chosen index is allocated before the key is committed.
+ *
+ * Returns 0 on success, -ENOSPC if there is no free slot, or another error
+ * from memory allocation or the btree update. -EINTR restarts the whole
+ * operation.
+ */
+static int ec_stripe_bkey_insert(struct bch_fs *c,
+				 struct bkey_i_stripe *stripe)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bpos start_pos = POS(0, c->ec_stripe_hint);
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos,
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
+			/* ran off the end: wrap around, but only once */
+			if (start_pos.offset) {
+				start_pos = POS_MIN;
+				bch2_btree_iter_set_pos(iter, start_pos);
+				continue;
+			}
+
+			ret = -ENOSPC;
+			break;
+		}
+
+		/* a deleted key means the slot is free: */
+		if (bkey_deleted(k.k))
+			goto found_slot;
+	}
+
+	goto err;
+found_slot:
+	start_pos = iter->pos;
+
+	ret = ec_stripe_mem_alloc(c, iter);
+	if (ret)
+		goto err;
+
+	stripe->k.p = iter->pos;
+
+	bch2_trans_update(&trans, iter, &stripe->k_i);
+
+	ret = bch2_trans_commit(&trans, NULL, NULL,
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_NOFAIL);
+err:
+	if (ret == -EINTR)
+		goto retry;
+
+	/* on success the next search starts just past the slot we used: */
+	c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+/*
+ * Insert a stripe pointer entry into extent @e immediately before @ptr,
+ * referring to block @block of stripe @s.
+ *
+ * Everything from @ptr to the end of the extent is shifted up to make room and
+ * the key's u64s is grown accordingly; the caller must have provided enough
+ * space in the key.
+ */
+static void extent_stripe_ptr_add(struct bkey_s_extent e,
+				  struct ec_stripe_buf *s,
+				  struct bch_extent_ptr *ptr,
+				  unsigned block)
+{
+	struct bch_extent_stripe_ptr *dst = (void *) ptr;
+	union bch_extent_entry *end = extent_entry_last(e);
+
+	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
+	e.k->u64s += sizeof(*dst) / sizeof(u64);
+
+	*dst = (struct bch_extent_stripe_ptr) {
+		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+		.block		= block,
+		.idx		= s->key.k.p.offset,
+	};
+}
+
+/*
+ * Walk the extents covered by @pos and add a stripe pointer for stripe @s to
+ * every extent that points into the stripe and does not already have one.
+ *
+ * In each updated extent, pointers to devices other than the stripe block's
+ * device are marked cached. -EINTR from the commit is not treated as an error:
+ * the loop simply looks at the current key again.
+ *
+ * Returns 0 on success or an error from the btree iteration or commit.
+ */
+static int ec_stripe_update_ptrs(struct bch_fs *c,
+				 struct ec_stripe_buf *s,
+				 struct bkey *pos)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_extent e;
+	struct bch_extent_ptr *ptr;
+	BKEY_PADDED(k) tmp;
+	int ret = 0, dev, idx;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   bkey_start_pos(pos),
+				   BTREE_ITER_INTENT);
+
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k)) &&
+	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+		/* already points at this stripe, nothing to do: */
+		if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
+			bch2_btree_iter_next(iter);
+			continue;
+		}
+
+		idx = extent_matches_stripe(c, &s->key.v, k);
+		if (idx < 0) {
+			bch2_btree_iter_next(iter);
+			continue;
+		}
+
+		bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+
+		dev = s->key.v.ptrs[idx].dev;
+
+		/* work on a private, padded copy of the key: */
+		bkey_reassemble(&tmp.k, k);
+		e = bkey_i_to_s_extent(&tmp.k);
+
+		extent_for_each_ptr(e, ptr)
+			if (ptr->dev != dev)
+				ptr->cached = true;
+
+		ptr = (void *) bch2_extent_has_device(e.c, dev);
+		BUG_ON(!ptr);
+
+		extent_stripe_ptr_add(e, s, ptr, idx);
+
+		bch2_trans_update(&trans, iter, &tmp.k);
+
+		ret = bch2_trans_commit(&trans, NULL, NULL,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_USE_RESERVE);
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			break;
+	}
+
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+/*
+ * data buckets of new stripe all written: create the stripe
+ */
+/*
+ * Finish creating stripe @s and free it.
+ *
+ * Unless an error was recorded in s->err or a ref on c->writes cannot be
+ * taken, this generates the redundancy blocks and checksums, writes the
+ * redundancy blocks, inserts the stripe key and adds stripe pointers to the
+ * extents recorded in s->keys.
+ *
+ * On every path the open buckets are released, the keylist is freed, @s is
+ * removed from its head's list and @s and its buffers are freed. Must not be
+ * called while @s is still the head's current stripe.
+ */
+static void ec_stripe_create(struct ec_stripe_new *s)
+{
+	struct bch_fs *c = s->c;
+	struct open_bucket *ob;
+	struct bkey_i *k;
+	struct bch_stripe *v = &s->stripe.key.v;
+	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+	struct closure cl;
+	int ret;
+
+	BUG_ON(s->h->s == s);
+
+	closure_init_stack(&cl);
+
+	if (s->err) {
+		bch_err(c, "error creating stripe: error writing data buckets");
+		goto err;
+	}
+
+	if (!percpu_ref_tryget(&c->writes))
+		goto err;
+
+	BUG_ON(bitmap_weight(s->blocks_allocated,
+			     s->blocks.nr) != s->blocks.nr);
+
+	ec_generate_ec(&s->stripe);
+
+	ec_generate_checksums(&s->stripe);
+
+	/* write p/q: */
+	for (i = nr_data; i < v->nr_blocks; i++)
+		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
+
+	closure_sync(&cl);
+
+	/* a failed write clears the block's valid bit: */
+	for (i = nr_data; i < v->nr_blocks; i++)
+		if (!test_bit(i, s->stripe.valid)) {
+			bch_err(c, "error creating stripe: error writing redundancy buckets");
+			goto err_put_writes;
+		}
+
+	mutex_lock(&c->ec_stripe_create_lock);
+
+	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
+	if (ret) {
+		bch_err(c, "error creating stripe: error creating stripe key");
+		goto err_unlock;
+	}
+
+	for_each_keylist_key(&s->keys, k) {
+		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
+		if (ret)
+			break;
+	}
+
+err_unlock:
+	mutex_unlock(&c->ec_stripe_create_lock);
+err_put_writes:
+	percpu_ref_put(&c->writes);
+err:
+	/* common teardown, reached on success as well as on error: */
+	open_bucket_for_each(c, &s->blocks, ob, i) {
+		ob->ec = NULL;
+		__bch2_open_bucket_put(c, ob);
+	}
+
+	bch2_open_buckets_put(c, &s->parity);
+
+	bch2_keylist_free(&s->keys, s->inline_keys);
+
+	mutex_lock(&s->h->lock);
+	list_del(&s->list);
+	mutex_unlock(&s->h->lock);
+
+	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
+		kvpfree(s->stripe.data[i], s->stripe.size << 9);
+	kfree(s);
+}
+
+/*
+ * Detach the current stripe from head @h: it is added to the head's list of
+ * stripes and h->s is cleared. Returns the detached stripe.
+ */
+static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = h->s;
+
+	list_add(&s->list, &h->stripes);
+	h->s = NULL;
+
+	return s;
+}
+
+/*
+ * Drop a pin on @s; when the last pin is dropped the stripe is created (and
+ * @s freed) by ec_stripe_create().
+ */
+static void ec_stripe_new_put(struct ec_stripe_new *s)
+{
+	BUG_ON(atomic_read(&s->pin) <= 0);
+	if (atomic_dec_and_test(&s->pin))
+		ec_stripe_create(s);
+}
+
+/* have a full bucket - hand it off to be erasure coded: */
+/*
+ * Called for open bucket @ob belonging to a new stripe: drops the stripe pin
+ * for it. If the bucket still has free sectors the stripe is flagged with an
+ * error first.
+ */
+void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct ec_stripe_new *s = ob->ec;
+
+	if (ob->sectors_free)
+		s->err = -1;
+
+	ec_stripe_new_put(s);
+}
+
+/*
+ * Flag the new stripe that open bucket @ob belongs to with -EIO. No pin is
+ * dropped here.
+ */
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct ec_stripe_new *s = ob->ec;
+
+	s->err = -EIO;
+}
+
+/*
+ * Return a pointer into the new stripe's buffer corresponding to the current
+ * write position of write point @wp's erasure coded open bucket, or NULL if
+ * ec_open_bucket() finds no such bucket.
+ */
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+	struct bch_dev *ca;
+	unsigned offset;
+
+	if (!ob)
+		return NULL;
+
+	ca	= bch_dev_bkey_exists(c, ob->ptr.dev);
+	/* sectors already used in the bucket: */
+	offset	= ca->mi.bucket_size - ob->sectors_free;
+
+	return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
+}
+
+/*
+ * Record in the new stripe's keylist that @sectors sectors ending at @pos were
+ * written through write point @wp's erasure coded open bucket. Does nothing
+ * if ec_open_bucket() finds no such bucket.
+ *
+ * Failure to grow the keylist is a BUG().
+ */
+void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
+			     struct bpos pos, unsigned sectors)
+{
+	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+	struct ec_stripe_new *ec;
+
+	if (!ob)
+		return;
+
+	ec = ob->ec;
+	mutex_lock(&ec->lock);
+
+	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
+				 ARRAY_SIZE(ec->inline_keys),
+				 BKEY_U64s)) {
+		BUG();
+	}
+
+	/* only the key's position and size are recorded, no value: */
+	bkey_init(&ec->keys.top->k);
+	ec->keys.top->k.p	= pos;
+	bch2_key_resize(&ec->keys.top->k, sectors);
+	bch2_keylist_push(&ec->keys);
+
+	mutex_unlock(&ec->lock);
+}
+
+/* sort() comparison callback for arrays of unsigned, using cmp_int(). */
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+	unsigned l = *((const unsigned *) _l);
+	unsigned r = *((const unsigned *) _r);
+
+	return cmp_int(l, r);
+}
+
+/* pick most common bucket size: */
+/*
+ * Return the bucket size that occurs most often among the devices in @devs,
+ * or 0 if @devs contains no devices. On a tie the smallest size wins.
+ */
+static unsigned pick_blocksize(struct bch_fs *c,
+			       struct bch_devs_mask *devs)
+{
+	struct bch_dev *ca;
+	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+	struct {
+		unsigned nr, size;
+	} cur = { 0, 0 }, best = { 0, 0 };
+
+	for_each_member_device_rcu(ca, c, i, devs)
+		sizes[nr++] = ca->mi.bucket_size;
+
+	/* sort so that equal sizes form runs, then find the longest run: */
+	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
+
+	for (i = 0; i < nr; i++) {
+		if (sizes[i] != cur.size) {
+			if (cur.nr > best.nr)
+				best = cur;
+
+			cur.nr = 0;
+			cur.size = sizes[i];
+		}
+
+		cur.nr++;
+	}
+
+	/* account for the final run: */
+	if (cur.nr > best.nr)
+		best = cur;
+
+	return best.size;
+}
+
+/*
+ * Allocate a new in-memory stripe for head @h from the open buckets currently
+ * held in h->blocks and h->parity, and install it as h->s.
+ *
+ * On success ownership of the open buckets moves from @h to the new stripe and
+ * h->blocks/h->parity are cleared. On failure @h is left untouched, so the
+ * open buckets it holds are not lost.
+ *
+ * Must be called with h->lock held. Returns 0 on success, -ENOMEM on
+ * allocation failure.
+ */
+int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s;
+	unsigned i;
+
+	BUG_ON(h->parity.nr != h->redundancy);
+	BUG_ON(!h->blocks.nr);
+	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
+	lockdep_assert_held(&h->lock);
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	mutex_init(&s->lock);
+	atomic_set(&s->pin, 1);
+	s->c		= c;
+	s->h		= h;
+	s->blocks	= h->blocks;
+	s->parity	= h->parity;
+
+	bch2_keylist_init(&s->keys, s->inline_keys);
+
+	s->stripe.offset	= 0;
+	s->stripe.size		= h->blocksize;
+	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
+
+	ec_stripe_key_init(c, &s->stripe.key,
+			   &s->blocks, &s->parity,
+			   h->blocksize);
+
+	/* s was zeroed, so slots not reached here stay NULL for the error path: */
+	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
+		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
+		if (!s->stripe.data[i])
+			goto err;
+	}
+
+	/*
+	 * Nothing can fail past this point: only now hand the open buckets over
+	 * to the new stripe, so that the error path above never drops the only
+	 * reference to them.
+	 */
+	memset(&h->blocks, 0, sizeof(h->blocks));
+	memset(&h->parity, 0, sizeof(h->parity));
+
+	h->s = s;
+
+	return 0;
+err:
+	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
+		kvpfree(s->stripe.data[i], s->stripe.size << 9);
+	kfree(s);
+	return -ENOMEM;
+}
+
+/*
+ * Allocate a stripe head for the given (@target, @algo, @redundancy) tuple and
+ * add it to c->ec_new_stripe_list.
+ *
+ * The head's device mask is the set of rw user-data devices in @target with
+ * nonzero durability; its blocksize is the most common bucket size among them.
+ *
+ * Returns the new head with h->lock held, or NULL on allocation failure.
+ */
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
+			 unsigned algo, unsigned redundancy)
+{
+	struct ec_stripe_head *h;
+	struct bch_dev *ca;
+	unsigned i;
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return NULL;
+
+	mutex_init(&h->lock);
+	/* returned locked, like a head found by bch2_ec_stripe_head_get(): */
+	mutex_lock(&h->lock);
+	INIT_LIST_HEAD(&h->stripes);
+
+	h->target	= target;
+	h->algo		= algo;
+	h->redundancy	= redundancy;
+
+	rcu_read_lock();
+	h->devs = target_rw_devs(c, BCH_DATA_USER, target);
+
+	for_each_member_device_rcu(ca, c, i, &h->devs)
+		if (!ca->mi.durability)
+			__clear_bit(i, h->devs.d);
+
+	h->blocksize = pick_blocksize(c, &h->devs);
+
+	/* count the devices whose bucket size matches the chosen blocksize: */
+	for_each_member_device_rcu(ca, c, i, &h->devs)
+		if (ca->mi.bucket_size == h->blocksize)
+			h->nr_active_devs++;
+
+	rcu_read_unlock();
+	list_add(&h->list, &c->ec_new_stripe_list);
+	return h;
+}
+
+/*
+ * Release stripe head @h (unlocks h->lock).
+ *
+ * If the head's current stripe has all of its data blocks allocated, the
+ * stripe is detached from the head and the head's pin on it is dropped after
+ * the lock has been released.
+ */
+void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = NULL;
+
+	if (h->s &&
+	    bitmap_weight(h->s->blocks_allocated,
+			  h->s->blocks.nr) == h->s->blocks.nr)
+		s = ec_stripe_set_pending(h);
+
+	mutex_unlock(&h->lock);
+
+	if (s)
+		ec_stripe_new_put(s);
+}
+
+/*
+ * Look up the stripe head matching (@target, @algo, @redundancy), allocating
+ * one if none exists yet.
+ *
+ * Returns the head with h->lock held, or NULL if @redundancy is 0 or a new
+ * head could not be allocated.
+ */
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
+					       unsigned target,
+					       unsigned algo,
+					       unsigned redundancy)
+{
+	struct ec_stripe_head *h;
+
+	if (!redundancy)
+		return NULL;
+
+	mutex_lock(&c->ec_new_stripe_lock);
+	list_for_each_entry(h, &c->ec_new_stripe_list, list)
+		if (h->target		== target &&
+		    h->algo		== algo &&
+		    h->redundancy	== redundancy) {
+			mutex_lock(&h->lock);
+			goto found;
+		}
+
+	h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
+found:
+	mutex_unlock(&c->ec_new_stripe_lock);
+	return h;
+}
+
+/*
+ * Stop using device @ca for new stripes.
+ *
+ * For every stripe head, bch2_open_buckets_stop_dev() is called on the head's
+ * block and parity open buckets. If the head's current stripe has an open
+ * bucket on @ca, the stripe is flagged with an error, detached from the head
+ * and the head's pin on it is dropped.
+ */
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct ec_stripe_head *h;
+	struct open_bucket *ob;
+	unsigned i;
+
+	mutex_lock(&c->ec_new_stripe_lock);
+	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
+		struct ec_stripe_new *s = NULL;
+
+		mutex_lock(&h->lock);
+		bch2_open_buckets_stop_dev(c, ca, &h->blocks);
+		bch2_open_buckets_stop_dev(c, ca, &h->parity);
+
+		if (!h->s)
+			goto unlock;
+
+		open_bucket_for_each(c, &h->s->blocks, ob, i)
+			if (ob->ptr.dev == ca->dev_idx)
+				goto found;
+		open_bucket_for_each(c, &h->s->parity, ob, i)
+			if (ob->ptr.dev == ca->dev_idx)
+				goto found;
+		goto unlock;
+found:
+		h->s->err = -1;
+		s = ec_stripe_set_pending(h);
+unlock:
+		mutex_unlock(&h->lock);
+
+		/* dropped outside h->lock: the final put may create the stripe */
+		if (s)
+			ec_stripe_new_put(s);
+	}
+	mutex_unlock(&c->ec_new_stripe_lock);
+}
+
+/*
+ * Write the in-memory block sector counts of stripe @m (index @idx) back to
+ * its key in the EC btree.
+ *
+ * The existing key is copied into @new_key, its per-block counts are replaced
+ * with m->block_sectors[] and m->dirty is cleared, then the key is committed.
+ *
+ * Returns 0 on success, -EIO if there is no stripe key at @idx, or an error
+ * from the btree lookup or commit.
+ */
+static int __bch2_stripe_write_key(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct stripe *m,
+				   size_t idx,
+				   struct bkey_i_stripe *new_key,
+				   unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	unsigned i;
+	int ret;
+
+	bch2_btree_iter_set_pos(iter, POS(0, idx));
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (k.k->type != KEY_TYPE_stripe)
+		return -EIO;
+
+	bkey_reassemble(&new_key->k_i, k);
+
+	/* the counts are copied and the dirty flag cleared under the heap lock: */
+	spin_lock(&c->ec_stripes_heap_lock);
+
+	for (i = 0; i < new_key->v.nr_blocks; i++)
+		stripe_blockcount_set(&new_key->v, i,
+				      m->block_sectors[i]);
+	m->dirty = false;
+
+	spin_unlock(&c->ec_stripes_heap_lock);
+
+	bch2_trans_update(trans, iter, &new_key->k_i);
+
+	return bch2_trans_commit(trans, NULL, NULL,
+				 BTREE_INSERT_NOFAIL|flags);
+}
+
+/*
+ * Write back every in-memory stripe that is marked dirty to the EC btree.
+ *
+ * @flags is passed through to the btree commit. *@wrote is set to true if at
+ * least one stripe key was written; it is never cleared here.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch key cannot be allocated, or the
+ * first error encountered while writing a stripe key.
+ */
+int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct genradix_iter giter;
+	struct bkey_i_stripe *new_key;
+	struct stripe *m;
+	int ret = 0;
+
+	/* scratch buffer big enough for a key of up to 255 u64s: */
+	new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
+	if (!new_key)
+		return -ENOMEM;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	genradix_for_each(&c->stripes[0], giter, m) {
+		if (!m->dirty)
+			continue;
+
+		ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos,
+					      new_key, flags);
+		if (ret)
+			break;
+
+		*wrote = true;
+	}
+
+	bch2_trans_exit(&trans);
+
+	kfree(new_key);
+
+	return ret;
+}
+
+/*
+ * Mark every stripe key, from both the EC btree and @journal_keys, with
+ * bch2_mark_key().
+ *
+ * The two sources are walked in parallel in key order; where both have a key
+ * at the same position the btree key is skipped and the journal key is used.
+ *
+ * Returns 0 on success, or an error from bch2_fs_ec_start() or from exiting
+ * the transaction.
+ */
+int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
+{
+	struct btree_trans trans;
+	struct btree_iter *btree_iter;
+	struct journal_iter journal_iter;
+	struct bkey_s_c btree_k, journal_k;
+	int ret;
+
+	ret = bch2_fs_ec_start(c);
+	if (ret)
+		return ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	btree_iter	= bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0);
+	journal_iter	= bch2_journal_iter_init(journal_keys, BTREE_ID_EC);
+
+	btree_k		= bch2_btree_iter_peek(btree_iter);
+	journal_k	= bch2_journal_iter_peek(&journal_iter);
+
+	while (1) {
+		/* true if the next key to mark comes from the btree: */
+		bool btree;
+
+		if (btree_k.k && journal_k.k) {
+			int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+
+			/* same position: drop the btree key, use the journal's */
+			if (!cmp)
+				btree_k = bch2_btree_iter_next(btree_iter);
+			btree = cmp < 0;
+		} else if (btree_k.k) {
+			btree = true;
+		} else if (journal_k.k) {
+			btree = false;
+		} else {
+			break;
+		}
+
+		bch2_mark_key(c, btree ? btree_k : journal_k,
+			      0, 0, NULL, 0,
+			      BCH_BUCKET_MARK_ALLOC_READ|
+			      BCH_BUCKET_MARK_NOATOMIC);
+
+		if (btree)
+			btree_k = bch2_btree_iter_next(btree_iter);
+		else
+			journal_k = bch2_journal_iter_next(&journal_iter);
+	}
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret) {
+		bch_err(c, "error reading stripes: %i", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Preallocate in-memory stripe state for every index up to the last key in
+ * the EC btree.
+ *
+ * @gc selects which copy of the stripes radix tree is populated
+ * (c->stripes[gc]); the stripes heap is only initialized when @gc is false.
+ *
+ * Returns 0 on success (including when the btree has no stripes), -ENOMEM on
+ * allocation failure, or an error from exiting the transaction.
+ */
+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	size_t i, idx = 0;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	/* step back from the end of the btree to find the highest stripe index: */
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);
+
+	k = bch2_btree_iter_prev(iter);
+	if (!IS_ERR_OR_NULL(k.k))
+		idx = k.k->p.offset + 1;
+	ret = bch2_trans_exit(&trans);
+	if (ret)
+		return ret;
+
+	if (!idx)
+		return 0;
+
+	if (!gc &&
+	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
+		       GFP_KERNEL))
+		return -ENOMEM;
+#if 0
+	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
+#else
+	for (i = 0; i < idx; i++)
+		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
+			return -ENOMEM;
+#endif
+	return 0;
+}
+
+/* Allocate the non-gc in-memory stripe state; see bch2_ec_mem_alloc(). */
+int bch2_fs_ec_start(struct bch_fs *c)
+{
+	return bch2_ec_mem_alloc(c, false);
+}
+
+/*
+ * Tear down erasure coding state: frees every stripe head (none may still
+ * have a current or listed stripe), the stripes heap, c->stripes[0] and the
+ * ec bioset.
+ */
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+	struct ec_stripe_head *h;
+
+	while (1) {
+		/* pop one head at a time under the list lock: */
+		mutex_lock(&c->ec_new_stripe_lock);
+		h = list_first_entry_or_null(&c->ec_new_stripe_list,
+					     struct ec_stripe_head, list);
+		if (h)
+			list_del(&h->list);
+		mutex_unlock(&c->ec_new_stripe_lock);
+		if (!h)
+			break;
+
+		BUG_ON(h->s);
+		BUG_ON(!list_empty(&h->stripes));
+		kfree(h);
+	}
+
+	free_heap(&c->ec_stripes_heap);
+	genradix_free(&c->stripes[0]);
+	bioset_exit(&c->ec_bioset);
+}
+
+/*
+ * Set up erasure coding state: the stripe deletion work item and the bioset
+ * used for stripe block IO (each bio is embedded in a struct ec_bio). Returns
+ * the result of bioset_init().
+ */
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+
+	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+			   BIOSET_NEED_BVECS);
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
new file mode 100644
index 000000000000..8d9fbfd19f66
--- /dev/null
+++ b/fs/bcachefs/ec.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_H
+#define _BCACHEFS_EC_H
+
+#include "ec_types.h"
+#include "keylist_types.h"
+
+const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
+			 struct bkey_s_c);
+
+/* struct bkey_ops initialiser for stripe keys: validation and text hooks. */
+#define bch2_bkey_ops_stripe (struct bkey_ops) {	\
+	.key_invalid	= bch2_stripe_invalid,		\
+	.val_to_text	= bch2_stripe_to_text,		\
+}
+
+/*
+ * Number of checksums kept per device: the stripe's sector count divided by
+ * the checksum granularity (1 << csum_granularity_bits), rounded up.
+ */
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+	unsigned sectors	= le16_to_cpu(s->sectors);
+	unsigned granularity	= 1 << s->csum_granularity_bits;
+
+	return DIV_ROUND_UP(sectors, granularity);
+}
+
+/*
+ * Byte offset, from the start of the bch_stripe value, of checksum @csum_idx
+ * for device @dev. The checksums follow the fixed header and the nr_blocks
+ * extent pointers.
+ */
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+					  unsigned dev, unsigned csum_idx)
+{
+	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+	unsigned offset = sizeof(struct bch_stripe);
+
+	offset += sizeof(struct bch_extent_ptr) * s->nr_blocks;
+	offset += (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+
+	return offset;
+}
+
+/*
+ * Byte offset of the 16 bit block count for block @idx; the counts start
+ * where the checksums of all nr_blocks devices end.
+ */
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+						unsigned idx)
+{
+	unsigned counts_start = stripe_csum_offset(s, s->nr_blocks, 0);
+
+	return counts_start + sizeof(u16) * idx;
+}
+
+/* Read the little endian block count for block @idx, in CPU byte order. */
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+					     unsigned idx)
+{
+	const void *base = s;
+	const __le16 *count = base + stripe_blockcount_offset(s, idx);
+
+	return le16_to_cpup(count);
+}
+
+/* Store @v as the little endian 16 bit block count for block @idx. */
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+					 unsigned idx, unsigned v)
+{
+	void *base = s;
+	__le16 *count = base + stripe_blockcount_offset(s, idx);
+
+	*count = cpu_to_le16(v);
+}
+
+/*
+ * Size of the stripe value in u64s: the offset just past the last block
+ * count, rounded up to a whole number of u64s.
+ */
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+	unsigned bytes = stripe_blockcount_offset(s, s->nr_blocks);
+
+	return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+/* Pointer to checksum @csum_idx of device @dev inside the stripe value. */
+static inline void *stripe_csum(struct bch_stripe *s,
+				unsigned dev, unsigned csum_idx)
+{
+	unsigned offset = stripe_csum_offset(s, dev, csum_idx);
+
+	return (void *) s + offset;
+}
+
+struct bch_read_bio;
+
+/*
+ * In-memory buffer for (part of) a stripe, together with a copy of the
+ * stripe key.
+ */
+struct ec_stripe_buf {
+	/* might not be buffering the entire stripe: */
+	unsigned		offset;
+	unsigned		size;
+	/* one bit per block; presumably set when data[] is valid - TODO confirm */
+	unsigned long		valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
+
+	/* one buffer pointer per block */
+	void			*data[EC_STRIPE_MAX];
+
+	/*
+	 * pad[] reserves 255 u64s of storage behind the key, presumably so
+	 * that the variable length stripe value fits - TODO confirm
+	 */
+	union {
+		struct bkey_i_stripe	key;
+		u64			pad[255];
+	};
+};
+
+struct ec_stripe_head;
+
+/*
+ * State for a stripe that is being created; ec_stripe_head.s points at one
+ * of these.
+ */
+struct ec_stripe_new {
+	struct bch_fs		*c;
+	struct ec_stripe_head	*h;
+	struct mutex		lock;
+	struct list_head	list;
+
+	/* counts in flight writes, stripe is created when pin == 0 */
+	atomic_t		pin;
+
+	int			err;
+
+	/* one bit per block, up to EC_STRIPE_MAX */
+	unsigned long		blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
+
+	struct open_buckets	blocks;
+	struct open_buckets	parity;
+
+	/* inline_keys is presumably the initial storage for keys - TODO confirm */
+	struct keylist		keys;
+	u64			inline_keys[BKEY_U64s * 8];
+
+	struct ec_stripe_buf	stripe;
+};
+
+/*
+ * Stripe head, linked on c->ec_new_stripe_list through @list.
+ * bch2_fs_ec_exit() frees the heads and requires @s to be NULL and
+ * @stripes to be empty by then.
+ */
+struct ec_stripe_head {
+	struct list_head	list;
+	struct mutex		lock;
+
+	struct list_head	stripes;
+
+	/* NOTE(review): presumably the three lookup arguments of
+	 * bch2_ec_stripe_head_get() - confirm in ec.c */
+	unsigned		target;
+	unsigned		algo;
+	unsigned		redundancy;
+
+	struct bch_devs_mask	devs;
+	unsigned		nr_active_devs;
+
+	unsigned		blocksize;
+
+	struct dev_stripe_state	block_stripe;
+	struct dev_stripe_state	parity_stripe;
+
+	struct open_buckets	blocks;
+	struct open_buckets	parity;
+
+	/* stripe currently being built, if any */
+	struct ec_stripe_new	*s;
+};
+
+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
+void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
+			     struct bpos, unsigned);
+
+void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
+
+void bch2_ec_stripe_head_put(struct ec_stripe_head *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
+					       unsigned, unsigned);
+
+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
+
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+
+void bch2_ec_flush_new_stripes(struct bch_fs *);
+
+struct journal_keys;
+int bch2_stripes_read(struct bch_fs *, struct journal_keys *);
+int bch2_stripes_write(struct bch_fs *, unsigned, bool *);
+
+int bch2_ec_mem_alloc(struct bch_fs *, bool);
+
+int bch2_fs_ec_start(struct bch_fs *);
+
+void bch2_fs_ec_exit(struct bch_fs *);
+int bch2_fs_ec_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
new file mode 100644
index 000000000000..5c3f77c8aac7
--- /dev/null
+++ b/fs/bcachefs/ec_types.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_TYPES_H
+#define _BCACHEFS_EC_TYPES_H
+
+#include <linux/llist.h>
+
+/* Size of the per-stripe block arrays and bitmaps in the EC structures: */
+#define EC_STRIPE_MAX	16
+
+/*
+ * A bch_replicas_entry followed by EC_STRIPE_MAX bytes of padding -
+ * presumably room for the entry's trailing per-device data; confirm against
+ * the definition of struct bch_replicas_entry.
+ */
+struct bch_replicas_padded {
+	struct bch_replicas_entry	e;
+	u8				pad[EC_STRIPE_MAX];
+};
+
+/* In-memory stripe state; the c->stripes[] radix trees hold these. */
+struct stripe {
+	/* NOTE(review): presumably this stripe's slot in the stripes heap */
+	size_t			heap_idx;
+
+	u16			sectors;
+	u8			algorithm;
+
+	u8			nr_blocks;
+	u8			nr_redundant;
+
+	unsigned		alive:1;
+	unsigned		dirty:1;
+	u8			blocks_nonempty;
+	/* one entry per block, up to EC_STRIPE_MAX */
+	u16			block_sectors[EC_STRIPE_MAX];
+
+	struct bch_replicas_padded r;
+};
+
+/* Element type of the stripes heap: a stripe index and a blocks_nonempty count. */
+struct ec_stripe_heap_entry {
+	size_t			idx;
+	unsigned		blocks_nonempty;
+};
+
+/* Heap of the entries above, declared with the HEAP() helper macro. */
+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+
+#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
new file mode 100644
index 000000000000..304ff92500be
--- /dev/null
+++ b/fs/bcachefs/error.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "error.h"
+#include "io.h"
+#include "super.h"
+
+/* How many times one fsck error format is printed before being suppressed: */
+#define FSCK_ERR_RATELIMIT_NR	10
+
+/*
+ * Flag the filesystem as having seen an error and apply the configured
+ * error policy (c->opts.errors): continue, go emergency read only, or panic.
+ *
+ * Returns false when the policy is to continue, true otherwise.
+ */
+bool bch2_inconsistent_error(struct bch_fs *c)
+{
+	bool stop;
+
+	set_bit(BCH_FS_ERROR, &c->flags);
+
+	switch (c->opts.errors) {
+	case BCH_ON_ERROR_CONTINUE:
+		stop = false;
+		break;
+	case BCH_ON_ERROR_RO:
+		if (bch2_fs_emergency_read_only(c))
+			bch_err(c, "emergency read only");
+		stop = true;
+		break;
+	case BCH_ON_ERROR_PANIC:
+		panic(bch2_fmt(c, "panic after error"));
+		stop = true;
+		break;
+	default:
+		BUG();
+	}
+
+	return stop;
+}
+
+/*
+ * Put the filesystem into emergency read only mode, logging a message when
+ * bch2_fs_emergency_read_only() returns true.
+ */
+void bch2_fatal_error(struct bch_fs *c)
+{
+	bool went_ro = bch2_fs_emergency_read_only(c);
+
+	if (went_ro)
+		bch_err(c, "emergency read only");
+}
+
+/*
+ * Work item for a device's io_error_work: under c->state_lock, set the
+ * device read only if that state change is allowed, otherwise put the whole
+ * filesystem into emergency read only mode.
+ */
+void bch2_io_error_work(struct work_struct *work)
+{
+	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
+	struct bch_fs *c = ca->fs;
+	bool dev_ro_allowed, report;
+
+	mutex_lock(&c->state_lock);
+
+	dev_ro_allowed = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
+						BCH_FORCE_IF_DEGRADED);
+	if (dev_ro_allowed)
+		report = __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
+					      BCH_FORCE_IF_DEGRADED);
+	else
+		report = bch2_fs_emergency_read_only(c);
+
+	if (report)
+		bch_err(ca,
+			"too many IO errors, setting %s RO",
+			dev_ro_allowed ? "device" : "filesystem");
+
+	mutex_unlock(&c->state_lock);
+}
+
+/*
+ * Device IO error hook. Currently does nothing: the call that would queue
+ * ca->io_error_work is commented out.
+ */
+void bch2_io_error(struct bch_dev *ca)
+{
+	//queue_work(system_long_wq, &ca->io_error_work);
+}
+
+/*
+ * In kernel builds ask_yn() is a constant false; otherwise it presumably
+ * comes from tools-util.h.
+ */
+#ifdef __KERNEL__
+#define ask_yn()	false
+#else
+#include "tools-util.h"
+#endif
+
+/*
+ * Report an fsck error and decide what the caller should do about it.
+ *
+ * @flags is a mask of FSCK_CAN_FIX, FSCK_CAN_IGNORE and FSCK_NEED_FSCK;
+ * @fmt and the following arguments form a printf-style message.
+ *
+ * Once BCH_FS_FSCK_DONE is set the message is printed directly and the
+ * error is handed to bch2_inconsistent_error(). Before that, errors are
+ * counted per format string (compared by pointer) and, in most paths, only
+ * the first FSCK_ERR_RATELIMIT_NR instances are printed;
+ * bch2_flush_fsck_errs() reports the totals later.
+ *
+ * Returns FSCK_ERR_FIX, FSCK_ERR_IGNORE or FSCK_ERR_EXIT.
+ */
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
+				const char *fmt, ...)
+{
+	struct fsck_err_state *s;
+	va_list args;
+	bool fix = false, print = true, suppressing = false;
+	char _buf[sizeof(s->buf)], *buf = _buf;
+
+	/* After fsck has finished this is treated as a runtime inconsistency: */
+	if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+		va_start(args, fmt);
+		vprintk(fmt, args);
+		va_end(args);
+
+		return bch2_inconsistent_error(c)
+			? FSCK_ERR_EXIT
+			: FSCK_ERR_FIX;
+	}
+
+	mutex_lock(&c->fsck_error_lock);
+
+	/* Look for existing ratelimit state for this format string: */
+	list_for_each_entry(s, &c->fsck_errors, list)
+		if (s->fmt == fmt)
+			goto found;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		/* Without state we cannot ratelimit: print into the stack buffer */
+		if (!c->fsck_alloc_err)
+			bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+		c->fsck_alloc_err = true;
+		buf = _buf;
+		goto print;
+	}
+
+	INIT_LIST_HEAD(&s->list);
+	s->fmt = fmt;
+found:
+	/* Move to the front of the list and update the ratelimit decision: */
+	list_move(&s->list, &c->fsck_errors);
+	s->nr++;
+	suppressing	= s->nr == FSCK_ERR_RATELIMIT_NR;
+	print		= s->nr <= FSCK_ERR_RATELIMIT_NR;
+	buf		= s->buf;
+print:
+	/* _buf and s->buf have the same size, so sizeof(_buf) bounds either: */
+	va_start(args, fmt);
+	vscnprintf(buf, sizeof(_buf), fmt, args);
+	va_end(args);
+
+	if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+		bch_err(c, "%s, exiting", buf);
+	} else if (flags & FSCK_CAN_FIX) {
+		if (c->opts.fix_errors == FSCK_OPT_ASK) {
+			printk(KERN_ERR "%s: fix?", buf);
+			fix = ask_yn();
+		} else if (c->opts.fix_errors == FSCK_OPT_YES ||
+			   (c->opts.nochanges &&
+			    !(flags & FSCK_CAN_IGNORE))) {
+			if (print)
+				bch_err(c, "%s, fixing", buf);
+			fix = true;
+		} else {
+			if (print)
+				bch_err(c, "%s, not fixing", buf);
+			fix = false;
+		}
+	} else if (flags & FSCK_NEED_FSCK) {
+		if (print)
+			bch_err(c, "%s (run fsck to correct)", buf);
+	} else {
+		if (print)
+			bch_err(c, "%s (repair unimplemented)", buf);
+	}
+
+	/* Announce once, at the moment the limit is reached: */
+	if (suppressing)
+		bch_err(c, "Ratelimiting new instances of previous error");
+
+	mutex_unlock(&c->fsck_error_lock);
+
+	if (fix) {
+		set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+		return FSCK_ERR_FIX;
+	} else {
+		/* Not fixing: exit unless the error may be ignored */
+		set_bit(BCH_FS_ERROR, &c->flags);
+		return c->opts.fix_errors == FSCK_OPT_EXIT ||
+			!(flags & FSCK_CAN_IGNORE)
+			? FSCK_ERR_EXIT
+			: FSCK_ERR_IGNORE;
+	}
+}
+
+/*
+ * Free all recorded fsck error state. For every error seen more than
+ * FSCK_ERR_RATELIMIT_NR times, first log the total count and the saved
+ * message.
+ */
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+	struct fsck_err_state *state, *next;
+
+	mutex_lock(&c->fsck_error_lock);
+
+	list_for_each_entry_safe(state, next, &c->fsck_errors, list) {
+		bool was_ratelimited = state->nr > FSCK_ERR_RATELIMIT_NR;
+
+		if (was_ratelimited)
+			bch_err(c, "Saw %llu errors like:\n    %s",
+				state->nr, state->buf);
+
+		list_del(&state->list);
+		kfree(state);
+	}
+
+	mutex_unlock(&c->fsck_error_lock);
+}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
new file mode 100644
index 000000000000..2591e12305b7
--- /dev/null
+++ b/fs/bcachefs/error.h
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERROR_H
+#define _BCACHEFS_ERROR_H
+
+#include <linux/list.h>
+#include <linux/printk.h>
+
+struct bch_dev;
+struct bch_fs;
+struct work_struct;
+
+/*
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
+ * superblock as such
+ */
+
+/* Error messages: */
+
+/*
+ * Very fatal logic/inconsistency errors: these indicate that we've majorly
+ * screwed up at runtime, i.e. it's not likely that it was just caused by the
+ * data on disk being inconsistent. These BUG():
+ *
+ * XXX: audit and convert to inconsistent() checks
+ */
+
+/* Log a message with bch_err() and then BUG(). */
+#define bch2_fs_bug(c, ...)						\
+do {									\
+	bch_err(c, __VA_ARGS__);					\
+	BUG();								\
+} while (0)
+
+/* bch2_fs_bug() if @cond is true; a statement, yields no value. */
+#define bch2_fs_bug_on(cond, c, ...)					\
+do {									\
+	if (cond)							\
+		bch2_fs_bug(c, __VA_ARGS__);				\
+} while (0)
+
+/*
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
+ * initial recovery, they don't indicate a bug in the running code - we walk all
+ * the metadata before modifying anything. If they occur at runtime, they
+ * indicate either a bug in the running code or (less likely) data is being
+ * silently corrupted under us.
+ *
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
+ * BCH_ON_ERROR_CONTINUE mode
+ */
+
+bool bch2_inconsistent_error(struct bch_fs *);
+
+/*
+ * Log a message, then apply the error policy; evaluates to the return value
+ * of bch2_inconsistent_error().
+ */
+#define bch2_fs_inconsistent(c, ...)					\
+({									\
+	bch_err(c, __VA_ARGS__);					\
+	bch2_inconsistent_error(c);					\
+})
+
+/*
+ * bch2_fs_inconsistent() if @cond is true. Evaluates to !!(cond), not to
+ * the result of the error policy.
+ */
+#define bch2_fs_inconsistent_on(cond, c, ...)				\
+({									\
+	int _ret = !!(cond);						\
+									\
+	if (_ret)							\
+		bch2_fs_inconsistent(c, __VA_ARGS__);			\
+	_ret;								\
+})
+
+/*
+ * Later we might want to mark only the particular device inconsistent, not the
+ * entire filesystem:
+ */
+
+/* Log against device @ca, then apply the error policy to its filesystem. */
+#define bch2_dev_inconsistent(ca, ...)					\
+do {									\
+	bch_err(ca, __VA_ARGS__);					\
+	bch2_inconsistent_error((ca)->fs);				\
+} while (0)
+
+/* bch2_dev_inconsistent() if @cond is true; evaluates to !!(cond). */
+#define bch2_dev_inconsistent_on(cond, ca, ...)				\
+({									\
+	int _ret = !!(cond);						\
+									\
+	if (_ret)							\
+		bch2_dev_inconsistent(ca, __VA_ARGS__);			\
+	_ret;								\
+})
+
+/*
+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally
+ * be able to repair:
+ */
+
+/*
+ * Fsck result codes. __fsck_err() stores BCH_FSCK_ERRORS_NOT_FIXED in the
+ * caller's ret when it gives up.
+ */
+enum {
+	BCH_FSCK_OK			= 0,
+	BCH_FSCK_ERRORS_NOT_FIXED	= 1,
+	BCH_FSCK_REPAIR_UNIMPLEMENTED	= 2,
+	BCH_FSCK_REPAIR_IMPOSSIBLE	= 3,
+	BCH_FSCK_UNKNOWN_VERSION	= 4,
+};
+
+/* Values of c->opts.fix_errors as tested by bch2_fsck_err(): */
+enum fsck_err_opts {
+	FSCK_OPT_EXIT,	/* log the error and do not fix */
+	FSCK_OPT_YES,	/* fix errors that can be fixed */
+	FSCK_OPT_NO,
+	FSCK_OPT_ASK,	/* decide via ask_yn() */
+};
+
+/* Return values of bch2_fsck_err(): */
+enum fsck_err_ret {
+	FSCK_ERR_IGNORE	= 0,
+	FSCK_ERR_FIX	= 1,
+	FSCK_ERR_EXIT	= 2,
+};
+
+/* Ratelimit state for one fsck error format, linked on c->fsck_errors. */
+struct fsck_err_state {
+	struct list_head	list;
+	/* format string; entries are matched by pointer equality */
+	const char		*fmt;
+	/* number of times this error has been seen */
+	u64			nr;
+	/* most recently formatted message */
+	char			buf[512];
+};
+
+/* Bits for the flags argument of bch2_fsck_err(): */
+#define FSCK_CAN_FIX		(1 << 0)
+#define FSCK_CAN_IGNORE		(1 << 1)
+#define FSCK_NEED_FSCK		(1 << 2)
+
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
+				unsigned, const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+/*
+ * Report an fsck error. The calling function must have a local variable
+ * named ret and a label named fsck_err: on FSCK_ERR_EXIT this sets ret to
+ * BCH_FSCK_ERRORS_NOT_FIXED and jumps there. Otherwise it evaluates to the
+ * bch2_fsck_err() result.
+ */
+#define __fsck_err(c, _flags, msg, ...)					\
+({									\
+	int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
+									\
+	if (_fix == FSCK_ERR_EXIT) {					\
+		bch_err(c, "Unable to continue, halting");		\
+		ret = BCH_FSCK_ERRORS_NOT_FIXED;			\
+		goto fsck_err;						\
+	}								\
+									\
+	_fix;								\
+})
+
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+/* __fsck_err() if @cond is true, otherwise false: */
+#define __fsck_err_on(cond, c, _flags, ...)				\
+	((cond) ? __fsck_err(c, _flags,	##__VA_ARGS__) : false)
+
+/* Cannot be fixed here, may be ignored: */
+#define need_fsck_err_on(cond, c, ...)					\
+	__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
+#define need_fsck_err(c, ...)						\
+	__fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
+/* Can be fixed, may not be ignored: */
+#define mustfix_fsck_err(c, ...)					\
+	__fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, ...)				\
+	__fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+
+/* Can be fixed, may also be ignored: */
+#define fsck_err(c, ...)						\
+	__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+
+#define fsck_err_on(cond, c, ...)					\
+	__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+
+/*
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
+ * mode - pretty much just due to metadata IO errors:
+ */
+
+void bch2_fatal_error(struct bch_fs *);
+
+/* Log a message, then call bch2_fatal_error(). */
+#define bch2_fs_fatal_error(c, ...)					\
+do {									\
+	bch_err(c, __VA_ARGS__);					\
+	bch2_fatal_error(c);						\
+} while (0)
+
+/* bch2_fs_fatal_error() if @cond is true; evaluates to !!(cond). */
+#define bch2_fs_fatal_err_on(cond, c, ...)				\
+({									\
+	int _ret = !!(cond);						\
+									\
+	if (_ret)							\
+		bch2_fs_fatal_error(c, __VA_ARGS__);			\
+	_ret;								\
+})
+
+/*
+ * IO errors: either recoverable metadata IO (because we have replicas), or data
+ * IO - we need to log it and print out a message, but we don't (necessarily)
+ * want to shut down the fs:
+ */
+
+void bch2_io_error_work(struct work_struct *);
+
+/* Does the error handling without logging a message */
+void bch2_io_error(struct bch_dev *);
+
+/* Logs message and handles the error: */
+/*
+ * Print a ratelimited "IO error on <device name> for ..." message, then call
+ * bch2_io_error(). @fmt must be a string literal: it is concatenated onto
+ * the message prefix.
+ */
+#define bch2_dev_io_error(ca, fmt, ...)					\
+do {									\
+	printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs,			\
+		"IO error on %s for " fmt),				\
+		(ca)->name, ##__VA_ARGS__);				\
+	bch2_io_error(ca);						\
+} while (0)
+
+/* bch2_dev_io_error() if @cond is true; evaluates to @cond as a bool. */
+#define bch2_dev_io_err_on(cond, ca, ...)				\
+({									\
+	bool _ret = (cond);						\
+									\
+	if (_ret)							\
+		bch2_dev_io_error(ca, __VA_ARGS__);			\
+	_ret;								\
+})
+
+/* kill? */
+
+/* Ratelimited "IO error: ..." message; @fmt must be a string literal. */
+#define __bcache_io_error(c, fmt, ...)					\
+	printk_ratelimited(KERN_ERR bch2_fmt(c,				\
+			"IO error: " fmt), ##__VA_ARGS__)
+
+/* Log via __bcache_io_error() and set @bio's status to BLK_STS_IOERR. */
+#define bcache_io_error(c, bio, fmt, ...)				\
+do {									\
+	__bcache_io_error(c, fmt, ##__VA_ARGS__);			\
+	(bio)->bi_status = BLK_STS_IOERR;					\
+} while (0)
+
+#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
new file mode 100644
index 000000000000..4cc2a4b13199
--- /dev/null
+++ b/fs/bcachefs/extents.c
@@ -0,0 +1,1752 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Code for managing the extent btree and dynamically updating the writeback
+ * dirty sector count.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "dirent.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "util.h"
+#include "xattr.h"
+
+#include <trace/events/bcachefs.h>
+
+/* Count the pointers in @k. */
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	unsigned count = 0;
+
+	bkey_for_each_ptr(ptrs, ptr)
+		count += 1;
+
+	return count;
+}
+
+/*
+ * Number of dirty (non cached) pointers in @k. For a reservation this is
+ * its nr_replicas; for key types not listed below it is 0. Keys that carry
+ * pointers must have at least one dirty pointer.
+ */
+unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k)
+{
+	unsigned nr_dirty = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_reservation:
+		nr_dirty = bkey_s_c_to_reservation(k).v->nr_replicas;
+		break;
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v: {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+		const struct bch_extent_ptr *ptr;
+
+		bkey_for_each_ptr(ptrs, ptr)
+			if (!ptr->cached)
+				nr_dirty++;
+
+		BUG_ON(!nr_dirty);
+		break;
+	}
+	}
+
+	return nr_dirty;
+}
+
+/*
+ * Durability contributed by one decoded pointer: 0 for cached pointers,
+ * otherwise the larger of the device's durability (ignored when the device
+ * is in the failed state) and, if the pointer is erasure coded, the
+ * stripe's nr_redundant.
+ */
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+					   struct extent_ptr_decoded p)
+{
+	unsigned durability = 0;
+	struct bch_dev *ca;
+	struct stripe *s;
+
+	if (p.ptr.cached)
+		return 0;
+
+	ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+	if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+		durability = max_t(unsigned, durability, ca->mi.durability);
+
+	if (!p.has_ec)
+		return durability;
+
+	s = genradix_ptr(&c->stripes[0], p.ec.idx);
+	if (WARN_ON(!s))
+		return durability;
+
+	return max_t(unsigned, durability, s->nr_redundant);
+}
+
+/* Sum of the per-pointer durability over all pointers of @k. */
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned total = 0;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		total += bch2_extent_ptr_durability(c, p);
+	}
+
+	return total;
+}
+
+/* Find the failure record for device @dev in @f, or NULL if there is none. */
+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
+						   unsigned dev)
+{
+	unsigned i;
+
+	for (i = 0; i < f->nr; i++)
+		if (f->devs[i].dev == dev)
+			return &f->devs[i];
+
+	return NULL;
+}
+
+/*
+ * Record a failed read of pointer @p in @failed. A repeated failure at the
+ * same idx bumps nr_failed; a new device or a different idx (re)starts the
+ * record with nr_failed = 1 and nr_retries = 0.
+ */
+void bch2_mark_io_failure(struct bch_io_failures *failed,
+			  struct extent_ptr_decoded *p)
+{
+	struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
+
+	if (f && p->idx == f->idx) {
+		f->nr_failed++;
+		return;
+	}
+
+	if (!f) {
+		/* First failure on this device: claim a new slot */
+		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
+
+		f = &failed->devs[failed->nr++];
+		f->dev		= p->ptr.dev;
+	}
+
+	f->idx		= p->idx;
+	f->nr_failed	= 1;
+	f->nr_retries	= 0;
+}
+
+/*
+ * Returns true if p1 should be preferred over p2.
+ *
+ * When either pointer has a nonzero idx the comparison is on idx alone:
+ * lower wins, or higher when force_reconstruct_read() is set. Otherwise the
+ * choice is random, weighted by the devices' current read latencies.
+ */
+static inline bool ptr_better(struct bch_fs *c,
+			      const struct extent_ptr_decoded p1,
+			      const struct extent_ptr_decoded p2)
+{
+	struct bch_dev *dev1, *dev2;
+	u64 l1, l2;
+
+	if (unlikely(p1.idx || p2.idx)) {
+		if (force_reconstruct_read(c))
+			return p1.idx > p2.idx;
+
+		return p1.idx < p2.idx;
+	}
+
+	dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+	dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+
+	l1 = atomic64_read(&dev1->cur_latency[READ]);
+	l2 = atomic64_read(&dev2->cur_latency[READ]);
+
+	/* Pick at random, biased in favor of the faster device: */
+	return bch2_rand_range(l1 + l2) > l1;
+}
+
+/*
+ * Pick a pointer of @k to read from and store it, decoded, in @pick.
+ *
+ * Stale cached pointers are skipped. @failed may be NULL; when it is not, a
+ * device with a failure record has its pointer's idx set from that record.
+ * A pointer whose idx goes beyond what it supports (0 without erasure
+ * coding, up to 1 with it) is skipped. Among the remaining candidates
+ * ptr_better() decides.
+ *
+ * Returns 1 if a pointer was picked, 0 if none was picked and @k has no
+ * dirty pointers, and -EIO if @k is an error key or has dirty pointers but
+ * none could be picked.
+ */
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
+			       struct bch_io_failures *failed,
+			       struct extent_ptr_decoded *pick)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_dev_io_failures *f;
+	struct bch_dev *ca;
+	int ret = 0;
+
+	if (k.k->type == KEY_TYPE_error)
+		return -EIO;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+		/*
+		 * If there are any dirty pointers it's an error if we can't
+		 * read:
+		 */
+		if (!ret && !p.ptr.cached)
+			ret = -EIO;
+
+		if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+			continue;
+
+		/* Stay on the recorded idx while retries remain, else move on: */
+		f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+		if (f)
+			p.idx = f->nr_failed < f->nr_retries
+				? f->idx
+				: f->idx + 1;
+
+		/* A direct read (idx 0) is not possible from an unreadable device: */
+		if (!p.idx &&
+		    !bch2_dev_is_readable(ca))
+			p.idx++;
+
+		if (force_reconstruct_read(c) &&
+		    !p.idx && p.has_ec)
+			p.idx++;
+
+		/* No options left for this pointer: */
+		if (p.idx >= (unsigned) p.has_ec + 1)
+			continue;
+
+		/* Keep the current pick unless this candidate is better: */
+		if (ret > 0 && !ptr_better(c, p, *pick))
+			continue;
+
+		*pick = p;
+		ret = 1;
+	}
+
+	return ret;
+}
+
+/*
+ * Append @ptr to the end of @k's value and grow the key by one u64. Only
+ * btree pointer and extent keys are accepted; any other type is a BUG().
+ */
+void bch2_bkey_append_ptr(struct bkey_i *k,
+			  struct bch_extent_ptr ptr)
+{
+	EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
+
+	if (k->k.type != KEY_TYPE_btree_ptr &&
+	    k->k.type != KEY_TYPE_extent)
+		BUG();
+
+	EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+	/* Tag the entry as a pointer before copying it in: */
+	ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+
+	memcpy((void *) &k->v + bkey_val_bytes(&k->k),
+	       &ptr,
+	       sizeof(ptr));
+	k->u64s++;
+}
+
+/* Drop every pointer of @k that points to device @dev. */
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+	struct bch_extent_ptr *p;
+
+	bch2_bkey_drop_ptrs(k, p, p->dev == dev);
+}
+
+/* Return the first pointer of @k on device @dev, or NULL if there is none. */
+const struct bch_extent_ptr *
+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+{
+	struct bkey_ptrs_c key_ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *cur;
+
+	bkey_for_each_ptr(key_ptrs, cur) {
+		if (cur->dev == dev)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * True if @k has a pointer to a device in @target that is either dirty or a
+ * cached pointer that is not stale.
+ */
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		if (bch2_dev_in_target(c, ptr->dev, target)) {
+			if (!ptr->cached)
+				return true;
+
+			if (!ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/* extent specific utility code */
+
+/* Return the first pointer of extent @e on device @dev, or NULL. */
+const struct bch_extent_ptr *
+bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
+{
+	const struct bch_extent_ptr *cur;
+
+	extent_for_each_ptr(e, cur) {
+		if (cur->dev == dev)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the first pointer of extent @e whose device belongs to @group, or
+ * NULL. A device's mi.group of 0 means it has no group; otherwise it holds
+ * the group number plus one.
+ */
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+{
+	const struct bch_extent_ptr *ptr;
+
+	extent_for_each_ptr(e, ptr) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+		if (ca->mi.group) {
+			if (ca->mi.group - 1 == group)
+				return ptr;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Sum of compressed_size over the dirty pointers of @k whose data is
+ * compressed; 0 if there are none.
+ */
+unsigned bch2_extent_is_compressed(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned compressed = 0;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if (p.ptr.cached)
+			continue;
+
+		if (p.crc.compression_type != BCH_COMPRESSION_NONE)
+			compressed += p.crc.compressed_size;
+	}
+
+	return compressed;
+}
+
+/*
+ * True if @k has a pointer with the same device and generation as @m that
+ * lines up with it: the pointer's offset (plus its crc offset) relative to
+ * the start of @k must equal @m's offset relative to @offset.
+ */
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+			   struct bch_extent_ptr m, u64 offset)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		u64 key_rel, match_rel;
+
+		if (p.ptr.dev != m.dev ||
+		    p.ptr.gen != m.gen)
+			continue;
+
+		key_rel		= (s64) p.ptr.offset + p.crc.offset -
+				  bkey_start_offset(k.k);
+		match_rel	= (s64) m.offset - offset;
+
+		if (key_rel == match_rel)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return the entry immediately before @entry, or NULL if @entry is the
+ * first one. Entries can only be walked forwards, so this scans from
+ * ptrs.start.
+ */
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+					  union bch_extent_entry *entry)
+{
+	union bch_extent_entry *cur = ptrs.start;
+	union bch_extent_entry *next;
+
+	if (cur == entry)
+		return NULL;
+
+	for (next = extent_entry_next(cur);
+	     next != entry;
+	     next = extent_entry_next(cur))
+		cur = next;
+
+	return cur;
+}
+
+/*
+ * Remove @ptr from @k, together with the entries in front of it that are no
+ * longer needed once it is gone, and shrink the key accordingly.
+ *
+ * Returns the position the removed range started at, which is where the
+ * entries that followed @ptr now begin.
+ */
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+					   struct bch_extent_ptr *ptr)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+	union bch_extent_entry *dst, *src, *prev;
+	bool drop_crc = true;
+
+	EBUG_ON(ptr < &ptrs.start->ptr ||
+		ptr >= &ptrs.end->ptr);
+	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+
+	/*
+	 * If the entry after @ptr exists and is not a crc, a preceding crc
+	 * entry is kept:
+	 */
+	src = extent_entry_next(to_entry(ptr));
+	if (src != ptrs.end &&
+	    !extent_entry_is_crc(src))
+		drop_crc = false;
+
+	/*
+	 * Walk backwards from @ptr, extending the range to remove over
+	 * non-pointer entries; stop at the previous pointer or at a crc
+	 * entry (which is only included when drop_crc is set):
+	 */
+	dst = to_entry(ptr);
+	while ((prev = extent_entry_prev(ptrs, dst))) {
+		if (extent_entry_is_ptr(prev))
+			break;
+
+		if (extent_entry_is_crc(prev)) {
+			if (drop_crc)
+				dst = prev;
+			break;
+		}
+
+		dst = prev;
+	}
+
+	/* Close the gap [dst, src) and account for the removed u64s: */
+	memmove_u64s_down(dst, src,
+			  (u64 *) ptrs.end - (u64 *) src);
+	k.k->u64s -= (u64 *) src - (u64 *) dst;
+
+	return dst;
+}
+
+/*
+ * True if crc @u can be replaced by @n: @u must be uncompressed and
+ * checksummed, must cover more than the live data, and must agree with @n
+ * on whether the checksum type is an encryption type.
+ */
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+				  struct bch_extent_crc_unpacked n)
+{
+	if (u.compression_type)
+		return false;
+
+	if (!u.csum_type)
+		return false;
+
+	if (u.uncompressed_size <= u.live_size)
+		return false;
+
+	return bch2_csum_type_is_encryption(u.csum_type) ==
+		bch2_csum_type_is_encryption(n.csum_type);
+}
+
+/*
+ * True if @n has a checksum and at least one crc entry of @k could be
+ * narrowed to it (see can_narrow_crc()).
+ */
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
+				 struct bch_extent_crc_unpacked n)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bch_extent_crc_unpacked cur;
+	const union bch_extent_entry *entry;
+
+	if (!n.csum_type)
+		return false;
+
+	bkey_for_each_crc(k.k, ptrs, cur, entry) {
+		if (can_narrow_crc(cur, n))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ *
+ * @n is the new checksum; if it has no checksum type, an existing crc entry
+ * of @k that already covers exactly the live data is used in its place.
+ *
+ * Returns true if at least one pointer was narrowed.
+ */
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+	struct bch_extent_crc_unpacked u;
+	struct extent_ptr_decoded p;
+	union bch_extent_entry *i;
+	bool ret = false;
+
+	/* Find a checksum entry that covers only live data: */
+	if (!n.csum_type) {
+		bkey_for_each_crc(&k->k, ptrs, u, i)
+			if (!u.compression_type &&
+			    u.csum_type &&
+			    u.live_size == u.uncompressed_size) {
+				n = u;
+				goto found;
+			}
+		return false;
+	}
+found:
+	/* The replacement crc must describe exactly the key's live data: */
+	BUG_ON(n.compression_type);
+	BUG_ON(n.offset);
+	BUG_ON(n.live_size != k->k.size);
+
+	/*
+	 * Each narrowing drops a pointer and appends a replacement, which
+	 * changes the entries being walked, so the scan starts over after
+	 * every change:
+	 */
+restart_narrow_pointers:
+	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
+		if (can_narrow_crc(p.crc, n)) {
+			bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+			/* Point directly at the live data: */
+			p.ptr.offset += p.crc.offset;
+			p.crc = n;
+			bch2_extent_ptr_decoded_append(k, &p);
+			ret = true;
+			goto restart_narrow_pointers;
+		}
+
+	return ret;
+}
+
+/* Field by field comparison of two unpacked crcs; returns true if they differ. */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+					 struct bch_extent_crc_unpacked r)
+{
+	if (l.csum_type != r.csum_type)
+		return true;
+	if (l.compression_type != r.compression_type)
+		return true;
+	if (l.compressed_size != r.compressed_size)
+		return true;
+	if (l.uncompressed_size != r.uncompressed_size)
+		return true;
+	if (l.offset != r.offset)
+		return true;
+	if (l.live_size != r.live_size)
+		return true;
+	if (l.nonce != r.nonce)
+		return true;
+
+	return bch2_crc_cmp(l.csum, r.csum);
+}
+
+/*
+ * Byte swap the value of packed key @k: first every 64 bit word of the
+ * value, then the checksum fields of each crc entry.
+ */
+void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+{
+	u64 *val = (u64 *) bkeyp_val(f, k);
+	union bch_extent_entry *entry;
+	unsigned word;
+
+	for (word = 0; word < bkeyp_val_u64s(f, k); word++)
+		val[word] = swab64(val[word]);
+
+	entry = (union bch_extent_entry *) val;
+	while (entry < (union bch_extent_entry *) (val + bkeyp_val_u64s(f, k))) {
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			/* nothing beyond the word swap above */
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+			entry->crc32.csum = swab32(entry->crc32.csum);
+			break;
+		case BCH_EXTENT_ENTRY_crc64:
+			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+			break;
+		case BCH_EXTENT_ENTRY_crc128:
+			entry->crc128.csum.hi = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.hi);
+			entry->crc128.csum.lo = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.lo);
+			break;
+		}
+
+		entry = extent_entry_next(entry);
+	}
+}
+
+/*
+ * Print every extent entry of @k to @out, separated by single spaces.
+ * Printing stops at the first entry of unknown type.
+ */
+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	bool need_space = false;
+
+	bkey_extent_entry_for_each(ptrs, entry) {
+		if (need_space)
+			pr_buf(out, " ");
+
+		switch (__extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr: {
+			const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+			struct bch_dev *ca = NULL;
+
+			/* Only look the device up if it is known to exist: */
+			if (ptr->dev < c->sb.nr_devices && c->devs[ptr->dev])
+				ca = bch_dev_bkey_exists(c, ptr->dev);
+
+			pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
+			       (u64) ptr->offset, ptr->gen,
+			       ptr->cached ? " cached" : "",
+			       ca && ptr_stale(ca, ptr)
+			       ? " stale" : "");
+			break;
+		}
+		case BCH_EXTENT_ENTRY_crc32:
+		case BCH_EXTENT_ENTRY_crc64:
+		case BCH_EXTENT_ENTRY_crc128: {
+			struct bch_extent_crc_unpacked crc =
+				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+			pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+			       crc.compressed_size,
+			       crc.uncompressed_size,
+			       crc.offset, crc.nonce,
+			       crc.csum_type,
+			       crc.compression_type);
+			break;
+		}
+		case BCH_EXTENT_ENTRY_stripe_ptr: {
+			const struct bch_extent_stripe_ptr *ec =
+				&entry->stripe_ptr;
+
+			pr_buf(out, "ec: idx %llu block %u",
+			       (u64) ec->idx, ec->block);
+			break;
+		}
+		default:
+			pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+			return;
+		}
+
+		need_space = true;
+	}
+}
+
+/*
+ * Validate a single pointer of @k. @size_ondisk is the amount of data the
+ * pointer covers on disk; @metadata is currently unused.
+ *
+ * Returns NULL if the pointer is fine, otherwise a string describing the
+ * problem.
+ */
+static const char *extent_ptr_invalid(const struct bch_fs *c,
+				      struct bkey_s_c k,
+				      const struct bch_extent_ptr *ptr,
+				      unsigned size_ondisk,
+				      bool metadata)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *other;
+	struct bch_dev *ca;
+
+	ca = bch2_dev_exists2(c, ptr->dev)
+		? bch_dev_bkey_exists(c, ptr->dev)
+		: NULL;
+	if (!ca)
+		return "pointer to invalid device";
+
+	bkey_for_each_ptr(ptrs, other) {
+		if (ptr != other && ptr->dev == other->dev)
+			return "multiple pointers to same device";
+	}
+
+	if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
+		return "offset past end of device";
+
+	if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
+		return "offset before first bucket";
+
+	if (bucket_remainder(ca, ptr->offset) +
+	    size_ondisk > ca->mi.bucket_size)
+		return "spans multiple buckets";
+
+	return NULL;
+}
+
+/*
+ * Validate every extent entry of @k. Returns NULL if all entries are valid,
+ * otherwise a string describing the first problem found.
+ */
+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct bch_extent_crc_unpacked crc;
+	/* on disk size for the pointers that follow; updated by each crc entry */
+	unsigned size_ondisk = k.k->size;
+	const char *reason;
+	/* UINT_MAX means no encrypted crc entry has been seen yet */
+	unsigned nonce = UINT_MAX;
+
+	if (k.k->type == KEY_TYPE_btree_ptr)
+		size_ondisk = c->opts.btree_node_size;
+
+	bkey_extent_entry_for_each(ptrs, entry) {
+		if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+			return "invalid extent entry type";
+
+		if (k.k->type == KEY_TYPE_btree_ptr &&
+		    !extent_entry_is_ptr(entry))
+			return "has non ptr field";
+
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+			reason = extent_ptr_invalid(c, k, &entry->ptr,
+						    size_ondisk, false);
+			if (reason)
+				return reason;
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+		case BCH_EXTENT_ENTRY_crc64:
+		case BCH_EXTENT_ENTRY_crc128:
+			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+			if (crc.offset + crc.live_size >
+			    crc.uncompressed_size)
+				return "checksum offset + key size > uncompressed size";
+
+			size_ondisk = crc.compressed_size;
+
+			if (!bch2_checksum_type_valid(c, crc.csum_type))
+				return "invalid checksum type";
+
+			if (crc.compression_type >= BCH_COMPRESSION_NR)
+				return "invalid compression type";
+
+			/* All encrypted crc entries must agree on offset + nonce: */
+			if (bch2_csum_type_is_encryption(crc.csum_type)) {
+				if (nonce == UINT_MAX)
+					nonce = crc.offset + crc.nonce;
+				else if (nonce != crc.offset + crc.nonce)
+					return "incorrect nonce";
+			}
+			break;
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+/* Btree ptrs */
+
+/*
+ * Validate a btree pointer key: reject a value larger than
+ * BKEY_BTREE_PTR_VAL_U64s_MAX, then defer to the generic pointer checks.
+ *
+ * Returns NULL if the key is valid, otherwise a string describing the problem.
+ */
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	return bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX
+		? "value too big"
+		: bch2_bkey_ptrs_invalid(c, k);
+}
+
+/*
+ * Debug-time consistency check for a btree pointer key.
+ *
+ * Reports a filesystem bug if the key's replicas are not marked (unless
+ * BCH_FS_REBUILD_REPLICAS is set), and - once BCH_FS_INITIAL_GC_DONE is set -
+ * if the bucket mark of any pointer is stale or does not describe btree data
+ * of at least one btree node in size.
+ */
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	const char *err;
+	char buf[160];
+	struct bucket_mark mark;
+	struct bch_dev *ca;
+
+	bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+		       !bch2_bkey_replicas_marked(c, k, false), c,
+		       "btree key bad (replicas not marked in superblock):\n%s",
+		       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
+	/* The bucket mark checks below are skipped until initial gc is done: */
+	if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+		return;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		ca = bch_dev_bkey_exists(c, ptr->dev);
+
+		mark = ptr_bucket_mark(ca, ptr);
+
+		/* err is set before each check so the label knows which failed: */
+		err = "stale";
+		if (gen_after(mark.gen, ptr->gen))
+			goto err;
+
+		err = "inconsistent";
+		if (mark.data_type != BCH_DATA_BTREE ||
+		    mark.dirty_sectors < c->opts.btree_node_size)
+			goto err;
+	}
+
+	return;
+err:
+	/* ca, ptr and mark still hold the values of the failing iteration: */
+	bch2_bkey_val_to_text(&PBUF(buf), c, k);
+	bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
+		    err, buf, PTR_BUCKET_NR(ca, ptr),
+		    mark.gen, (unsigned) mark.v.counter);
+}
+
+/* Print a btree pointer key to @out; simply forwards to the generic pointer printer. */
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+/* Extents */
+
+/*
+ * Trim the front of @k so that it starts at @where.
+ *
+ * Does nothing if @where is at or before the current start of @k. Otherwise
+ * the key's size shrinks by the number of sectors trimmed, and the value is
+ * adjusted to keep referring to the same data. A key trimmed down to zero
+ * size becomes KEY_TYPE_deleted. Unknown key types hit BUG().
+ */
+void __bch2_cut_front(struct bpos where, struct bkey_s k)
+{
+	u64 trimmed;
+
+	if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+		return;
+
+	EBUG_ON(bkey_cmp(where, k.k->p) > 0);
+
+	trimmed = where.offset - bkey_start_offset(k.k);
+	k.k->size -= trimmed;
+
+	if (!k.k->size)
+		k.k->type = KEY_TYPE_deleted;
+
+	switch (k.k->type) {
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v: {
+		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+		union bch_extent_entry *entry;
+		bool past_crc = false;
+
+		bkey_extent_entry_for_each(ptrs, entry) {
+			switch (extent_entry_type(entry)) {
+			case BCH_EXTENT_ENTRY_crc32:
+				entry->crc32.offset += trimmed;
+				break;
+			case BCH_EXTENT_ENTRY_crc64:
+				entry->crc64.offset += trimmed;
+				break;
+			case BCH_EXTENT_ENTRY_crc128:
+				entry->crc128.offset += trimmed;
+				break;
+			case BCH_EXTENT_ENTRY_ptr:
+				/*
+				 * Only pointers that come before any crc entry
+				 * are moved; once a crc entry has been seen the
+				 * trim is recorded in the crc offset instead.
+				 */
+				if (!past_crc)
+					entry->ptr.offset += trimmed;
+				break;
+			case BCH_EXTENT_ENTRY_stripe_ptr:
+				break;
+			}
+
+			if (extent_entry_is_crc(entry))
+				past_crc = true;
+		}
+
+		break;
+	}
+	case KEY_TYPE_reflink_p:
+		/* Advance the index into the reflink btree by the same amount: */
+		le64_add_cpu(&bkey_s_to_reflink_p(k).v->idx, trimmed);
+		break;
+	case KEY_TYPE_deleted:
+	case KEY_TYPE_discard:
+	case KEY_TYPE_error:
+	case KEY_TYPE_cookie:
+	case KEY_TYPE_reservation:
+		/* Nothing in the value depends on the start position. */
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Trim the back of @k so that it ends at @where.
+ *
+ * Returns false (leaving @k untouched) if @where is at or past the current
+ * end of @k; otherwise updates the key's position and size and returns true.
+ * A key trimmed down to zero size becomes KEY_TYPE_deleted.
+ */
+bool bch2_cut_back(struct bpos where, struct bkey *k)
+{
+	u64 new_size;
+
+	if (bkey_cmp(where, k->p) >= 0)
+		return false;
+
+	EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
+
+	/* Must be computed before p/size are changed below: */
+	new_size = where.offset - bkey_start_offset(k);
+
+	k->size = new_size;
+	k->p = where;
+
+	if (new_size == 0)
+		k->type = KEY_TYPE_deleted;
+
+	return true;
+}
+
+/* Count the entries of @k that are either pointers or stripe pointers. */
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	unsigned nr = 0;
+
+	bkey_extent_entry_for_each(ptrs, entry) {
+		unsigned type = __extent_entry_type(entry);
+
+		if (type == BCH_EXTENT_ENTRY_ptr ||
+		    type == BCH_EXTENT_ENTRY_stripe_ptr)
+			nr++;
+	}
+
+	return nr;
+}
+
+/*
+ * Account for the work key @k contributes to an extent update, and shrink
+ * *@end if the running total reaches @max_iters.
+ *
+ * @offset:	only used for reflink pointers: offset into the range @k points
+ *		to at which the walk of the reflink btree starts
+ * @end:	in/out; only ever moved backwards (via bpos_min())
+ * @nr_iters:	in/out running total
+ * @overwrite:	not used by this function's body
+ *
+ * Returns 1 if the limit was reached and *@end was clamped, 0 if not;
+ * otherwise whatever for_each_btree_key() left in ret - callers treat negative
+ * values as errors.
+ */
+static int count_iters_for_insert(struct btree_trans *trans,
+				  struct bkey_s_c k,
+				  unsigned offset,
+				  struct bpos *end,
+				  unsigned *nr_iters,
+				  unsigned max_iters,
+				  bool overwrite)
+{
+	int ret = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		*nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+		if (*nr_iters >= max_iters) {
+			/* Stop the atomic range at the end of this key: */
+			*end = bpos_min(*end, k.k->p);
+			ret = 1;
+		}
+
+		break;
+	case KEY_TYPE_reflink_p: {
+		struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+		u64 idx = le64_to_cpu(p.v->idx);
+		/* Sectors of @k that lie before *@end: */
+		unsigned sectors = bpos_min(*end, p.k->p).offset -
+			bkey_start_offset(p.k);
+		struct btree_iter *iter;
+		struct bkey_s_c r_k;
+
+		/* Walk the keys in the reflink btree that @k points at: */
+		for_each_btree_key(trans, iter,
+				   BTREE_ID_REFLINK, POS(0, idx + offset),
+				   BTREE_ITER_SLOTS, r_k, ret) {
+			if (bkey_cmp(bkey_start_pos(r_k.k),
+				     POS(0, idx + sectors)) >= 0)
+				break;
+
+			*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
+
+			if (*nr_iters >= max_iters) {
+				/*
+				 * Translate the end of r_k back into the
+				 * position space of @k:
+				 */
+				struct bpos pos = bkey_start_pos(k.k);
+				pos.offset += r_k.k->p.offset - idx;
+
+				*end = bpos_min(*end, pos);
+				ret = 1;
+				break;
+			}
+		}
+
+		bch2_trans_iter_put(trans, iter);
+		break;
+	}
+	}
+
+	return ret;
+}
+
+/* Budget used by bch2_extent_atomic_end() when counting work per update: */
+#define EXTENT_ITERS_MAX	(BTREE_ITER_MAX / 3)
+
+/*
+ * Compute in *@end how far the insert of @insert at @iter may extend.
+ *
+ * *@end starts as the smaller of the end of @insert and the end of the leaf
+ * node @iter points to, and is then clamped further by
+ * count_iters_for_insert(): first for @insert itself (with half of
+ * EXTENT_ITERS_MAX as limit), then for each existing key in the node that
+ * starts before *@end (with the full EXTENT_ITERS_MAX).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_extent_atomic_end(struct btree_iter *iter,
+			   struct bkey_i *insert,
+			   struct bpos *end)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree *b;
+	struct btree_node_iter	node_iter;
+	struct bkey_packed	*_k;
+	unsigned		nr_iters = 0;
+	int ret;
+
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		return ret;
+
+	b = iter->l[0].b;
+	/* Work on a copy so the iterator's own node position is not moved: */
+	node_iter = iter->l[0].iter;
+
+	BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
+
+	*end = bpos_min(insert->k.p, b->key.k.p);
+
+	ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
+				     &nr_iters, EXTENT_ITERS_MAX / 2, false);
+	if (ret < 0)
+		return ret;
+
+	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+						      KEY_TYPE_discard))) {
+		struct bkey	unpacked;
+		struct bkey_s_c	k = bkey_disassemble(b, _k, &unpacked);
+		unsigned offset = 0;
+
+		if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
+			break;
+
+		/* If @insert starts inside k, skip the part of k before it: */
+		if (bkey_cmp(bkey_start_pos(&insert->k),
+			     bkey_start_pos(k.k)) > 0)
+			offset = bkey_start_offset(&insert->k) -
+				bkey_start_offset(k.k);
+
+		ret = count_iters_for_insert(trans, k, offset, end,
+					&nr_iters, EXTENT_ITERS_MAX, true);
+		if (ret)
+			break;
+
+		bch2_btree_node_iter_advance(&node_iter, b);
+	}
+
+	/* A positive ret only means "limit reached", which is not an error: */
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Cut the back of @k down to the end position computed by
+ * bch2_extent_atomic_end(). Returns 0 on success, or the error from
+ * bch2_extent_atomic_end() (in which case @k is left untouched).
+ */
+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+{
+	struct bpos atomic_end;
+	int err = bch2_extent_atomic_end(iter, k, &atomic_end);
+
+	if (!err)
+		bch2_cut_back(atomic_end, &k->k);
+
+	return err;
+}
+
+/*
+ * Check whether @k already ends exactly at the position computed by
+ * bch2_extent_atomic_end().
+ *
+ * Returns 1 if so, 0 if not, or the (nonzero) error from
+ * bch2_extent_atomic_end().
+ */
+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
+{
+	struct bpos atomic_end;
+	int err = bch2_extent_atomic_end(iter, k, &atomic_end);
+
+	return err ? err : !bkey_cmp(atomic_end, k->k.p);
+}
+
+/*
+ * Pre-insert check for an extent update: grows *@u64s by the extra space the
+ * insert may need and, when the insert would split a compressed extent in
+ * two, adds the extra sectors to the transaction's disk reservation.
+ *
+ * Returns BTREE_INSERT_OK, or BTREE_INSERT_ENOSPC if that reservation failed
+ * with -ENOSPC. Any other reservation error is a BUG().
+ */
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_trans *trans,
+		       struct btree_insert_entry *insert,
+		       unsigned *u64s)
+{
+	struct btree_iter_level *l = &insert->iter->l[0];
+	struct btree_node_iter node_iter = l->iter;
+	struct bkey_packed *_k;
+	struct bkey unpacked;
+	struct bkey_s_c k;
+	int sectors, flags, err;
+
+	/*
+	 * We avoid creating whiteouts whenever possible when deleting, but
+	 * those optimizations mean we may potentially insert two whiteouts
+	 * instead of one (when we overlap with the front of one extent and the
+	 * back of another):
+	 */
+	if (bkey_whiteout(&insert->k->k))
+		*u64s += BKEY_U64s;
+
+	_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+					      KEY_TYPE_discard);
+	if (!_k)
+		return BTREE_INSERT_OK;
+
+	k = bkey_disassemble(l->b, _k, &unpacked);
+
+	/* Only an insert landing in the middle of k needs anything more: */
+	if (bch2_extent_overlap(&insert->k->k, k.k) !=
+	    BCH_EXTENT_OVERLAP_MIDDLE)
+		return BTREE_INSERT_OK;
+
+	/* account for having to split existing extent: */
+	*u64s += _k->u64s;
+
+	sectors = bch2_extent_is_compressed(k);
+	if (!sectors)
+		return BTREE_INSERT_OK;
+
+	flags = trans->flags & BTREE_INSERT_NOFAIL
+		? BCH_DISK_RESERVATION_NOFAIL : 0;
+
+	err = bch2_disk_reservation_add(trans->c, trans->disk_res,
+					sectors, flags);
+	if (err == -ENOSPC)
+		return BTREE_INSERT_ENOSPC;
+	if (err)
+		BUG();
+
+	return BTREE_INSERT_OK;
+}
+
+/*
+ * Debug-only check (CONFIG_BCACHEFS_DEBUG plus expensive_debug_checks()) that
+ * @insert does not overlap its neighbours at @_iter in node @b. A no-op
+ * otherwise.
+ */
+static void verify_extent_nonoverlapping(struct bch_fs *c,
+					 struct btree *b,
+					 struct btree_node_iter *_iter,
+					 struct bkey_i *insert)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct btree_node_iter iter;
+	struct bkey_packed *k;
+	struct bkey uk;
+
+	if (!expensive_debug_checks(c))
+		return;
+
+	/* The previous key must not extend past the start of @insert: */
+	iter = *_iter;
+	k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
+	if (k) {
+		uk = bkey_unpack_key(b, k);
+		BUG_ON(bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0);
+	}
+
+	/* The next key must not start before the end of @insert: */
+	iter = *_iter;
+	k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
+	if (!k)
+		return;
+
+	uk = bkey_unpack_key(b, k);
+	if (bkey_cmp(insert->k.p, bkey_start_pos(&uk)) > 0) {
+		char buf1[100];
+		char buf2[100];
+
+		bch2_bkey_to_text(&PBUF(buf1), &insert->k);
+		bch2_bkey_to_text(&PBUF(buf2), &uk);
+
+		/* Dump the node before panicking to aid debugging: */
+		bch2_dump_btree_node(b);
+		panic("insert > next :\n"
+		      "insert %s\n"
+		      "next   %s\n",
+		      buf1, buf2);
+	}
+#endif
+}
+
+/*
+ * Insert @insert into the last bset of the leaf node @iter points to, at the
+ * iterator's current position in that bset, then fix up iterators.
+ *
+ * The caller must have made sure there is room: running out of space is a
+ * BUG().
+ */
+static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
+			       struct bkey_i *insert)
+{
+	struct btree_iter_level *l = &iter->l[0];
+	struct btree *b = l->b;
+	struct bkey_packed *where =
+		bch2_btree_node_iter_bset_pos(&l->iter, b, bset_tree_last(b));
+
+	BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
+
+	EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+	verify_extent_nonoverlapping(c, b, &l->iter, insert);
+
+	if (debug_check_bkeys(c))
+		bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(insert));
+
+	bch2_bset_insert(b, &l->iter, where, insert, 0);
+	/* where->u64s is read after the insert, i.e. from the new key: */
+	bch2_btree_node_iter_fix(iter, b, &l->iter, where, 0, where->u64s);
+}
+
+/*
+ * Resolve the overlap between @insert and an existing key in the leaf node.
+ *
+ * @_k is the existing key as stored in the node, @k its unpacked form;
+ * @overlap says how @insert overlaps it. Depending on @overlap the existing
+ * key is trimmed at the front, trimmed at the back, dropped entirely, or
+ * split in two around @insert.
+ */
+static void
+extent_squash(struct bch_fs *c, struct btree_iter *iter,
+	      struct bkey_i *insert,
+	      struct bkey_packed *_k, struct bkey_s k,
+	      enum bch_extent_overlap overlap)
+{
+	struct btree_iter_level *l = &iter->l[0];
+
+	switch (overlap) {
+	case BCH_EXTENT_OVERLAP_FRONT:
+		/* insert overlaps with start of k: */
+		__bch2_cut_front(insert->k.p, k);
+		EBUG_ON(bkey_deleted(k.k));
+		/* write the modified unpacked key back over _k: */
+		extent_save(l->b, _k, k.k);
+		bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+		break;
+
+	case BCH_EXTENT_OVERLAP_BACK:
+		/* insert overlaps with end of k: */
+		bch2_cut_back(bkey_start_pos(&insert->k), k.k);
+		EBUG_ON(bkey_deleted(k.k));
+		extent_save(l->b, _k, k.k);
+
+		/*
+		 * As the auxiliary tree is indexed by the end of the
+		 * key and we've just changed the end, update the
+		 * auxiliary tree.
+		 */
+		bch2_bset_fix_invalidated_key(l->b, _k);
+		bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+					 _k, _k->u64s, _k->u64s);
+		break;
+
+	case BCH_EXTENT_OVERLAP_ALL: {
+		/* The insert key completely covers k, invalidate k */
+		if (!bkey_whiteout(k.k))
+			btree_account_key_drop(l->b, _k);
+
+		k.k->size = 0;
+		k.k->type = KEY_TYPE_deleted;
+
+		/*
+		 * A key in the last bset is physically removed; a key in an
+		 * earlier bset is only overwritten in place as deleted:
+		 */
+		if (_k >= btree_bset_last(l->b)->start) {
+			/* saved because bch2_bset_delete() invalidates _k: */
+			unsigned u64s = _k->u64s;
+
+			bch2_bset_delete(l->b, _k, _k->u64s);
+			bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+						 _k, u64s, 0);
+		} else {
+			extent_save(l->b, _k, k.k);
+			bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+		}
+
+		break;
+	}
+	case BCH_EXTENT_OVERLAP_MIDDLE: {
+		BKEY_PADDED(k) split;
+		/*
+		 * The insert key falls 'in the middle' of k
+		 * The insert key splits k in 3:
+		 * - start only in k, preserve
+		 * - middle common section, invalidate in k
+		 * - end only in k, preserve
+		 *
+		 * We update the old key to preserve the end (it is cut at the
+		 * front, in place), insert will be the new common section,
+		 * and we manually insert a copy (split) holding the start
+		 * that we are preserving.
+		 *
+		 * modify k _before_ doing the insert (which will move
+		 * what k points to)
+		 */
+		bkey_reassemble(&split.k, k.s_c);
+		split.k.k.needs_whiteout |= bkey_written(l->b, _k);
+
+		/* split keeps the part of k before @insert: */
+		bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
+		BUG_ON(bkey_deleted(&split.k.k));
+
+		/* k keeps the part of k after @insert: */
+		__bch2_cut_front(insert->k.p, k);
+		BUG_ON(bkey_deleted(k.k));
+		extent_save(l->b, _k, k.k);
+		bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+
+		extent_bset_insert(c, iter, &split.k);
+		break;
+	}
+	}
+}
+
+/**
+ * bch2_insert_fixup_extent - insert a new extent and deal with overlaps
+ *
+ * this may result in not actually doing the insert, or inserting some subset
+ * of the insert key. For cmpxchg operations this is where that logic lives.
+ *
+ * NOTE(review): this comment used to name the function
+ * bch_extent_insert_fixup, and to talk about
+ * bch2_btree_insert_and_journal(), about @b or @res filling up and about a
+ * boolean return value. None of that matches the code below: the function
+ * returns void, adds keys to the btree node with extent_bset_insert() and
+ * journals with bch2_btree_journal_key().
+ *
+ * BSET INVARIANTS: this function is responsible for maintaining all the
+ * invariants for bsets of extents in memory. things get really hairy with 0
+ * size extents
+ *
+ * within one bset:
+ *
+ * bkey_start_pos(bkey_next(k)) >= k
+ * or bkey_start_offset(bkey_next(k)) >= k->offset
+ *
+ * i.e. strict ordering, no overlapping extents.
+ *
+ * multiple bsets (i.e. full btree node):
+ *
+ * ∀ k, j
+ *   k.size != 0 ∧ j.size != 0 →
+ *     ¬ (k > bkey_start_pos(j) ∧ k < j)
+ *
+ * i.e. no two overlapping keys _of nonzero size_
+ *
+ * We can't realistically maintain this invariant for zero size keys because of
+ * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
+ * there may be another 0 size key between them in another bset, and it will
+ * thus overlap with the merged key.
+ *
+ * In addition, the end of iter->pos indicates how much has been processed.
+ * If the end of iter->pos is not the same as the end of insert, then
+ * key insertion needs to continue/be retried.
+ */
+void bch2_insert_fixup_extent(struct btree_trans *trans,
+			      struct btree_insert_entry *insert_entry)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter	= insert_entry->iter;
+	struct bkey_i *insert	= insert_entry->k;
+	struct btree_iter_level *l = &iter->l[0];
+	/* node iterator position to restore into l->iter after the loop: */
+	struct btree_node_iter node_iter = l->iter;
+	bool deleting		= bkey_whiteout(&insert->k);
+	/* a plain insert always updates both; a delete only if needed: */
+	bool update_journal	= !deleting;
+	bool update_btree	= !deleting;
+	/* copy of the insert key that is journalled when deleting: */
+	struct bkey_i whiteout	= *insert;
+	struct bkey_packed *_k;
+	struct bkey unpacked;
+	BKEY_PADDED(k) tmp;
+
+	EBUG_ON(iter->level);
+	EBUG_ON(!insert->k.size);
+	EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+	/* Walk the existing keys that overlap @insert: */
+	while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
+						      KEY_TYPE_discard))) {
+		struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
+		struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
+		enum bch_extent_overlap overlap =
+			bch2_extent_overlap(&insert->k, k.k);
+
+		if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+			break;
+
+		if (!bkey_whiteout(k.k))
+			update_journal = true;
+
+		/*
+		 * Deleting, and nothing but whiteouts seen so far: there is
+		 * nothing to record for this range, so trim it off the front
+		 * of both the insert key and the whiteout.
+		 */
+		if (!update_journal) {
+			bch2_cut_front(cur_end, insert);
+			bch2_cut_front(cur_end, &whiteout);
+			bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
+			goto next;
+		}
+
+		/*
+		 * When deleting, if possible just do it by switching the type
+		 * of the key we're deleting, instead of creating and inserting
+		 * a new whiteout:
+		 */
+		if (deleting &&
+		    !update_btree &&
+		    !bkey_cmp(insert->k.p, k.k->p) &&
+		    !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
+			if (!bkey_whiteout(k.k)) {
+				btree_account_key_drop(l->b, _k);
+				_k->type = KEY_TYPE_discard;
+				reserve_whiteout(l->b, _k);
+				bch2_btree_iter_fix_key_modified(iter,
+								 l->b, _k);
+			}
+			break;
+		}
+
+		/* From here on a key must be added to the btree node: */
+		if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
+			insert->k.needs_whiteout = true;
+			update_btree = true;
+		}
+
+		/* A whiteout about to be completely overwritten is released: */
+		if (update_btree &&
+		    overlap == BCH_EXTENT_OVERLAP_ALL &&
+		    bkey_whiteout(k.k) &&
+		    k.k->needs_whiteout) {
+			unreserve_whiteout(l->b, _k);
+			_k->needs_whiteout = false;
+		}
+
+		extent_squash(c, iter, insert, _k, k, overlap);
+
+		if (!update_btree)
+			bch2_cut_front(cur_end, insert);
+next:
+		node_iter = l->iter;
+
+		/* In these two cases k extends past the end of @insert: */
+		if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
+		    overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+			break;
+	}
+
+	l->iter = node_iter;
+	bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
+
+	if (update_btree) {
+		bkey_copy(&tmp.k, insert);
+
+		if (deleting)
+			tmp.k.k.type = KEY_TYPE_discard;
+
+		EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
+
+		extent_bset_insert(c, iter, &tmp.k);
+	}
+
+	if (update_journal) {
+		/* when deleting, journal the whiteout copy, not @insert: */
+		bkey_copy(&tmp.k, !deleting ? insert : &whiteout);
+
+		if (deleting)
+			tmp.k.k.type = KEY_TYPE_discard;
+
+		EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
+
+		bch2_btree_journal_key(trans, iter, &tmp.k);
+	}
+
+	/* Everything has been processed: trim @insert down to nothing. */
+	bch2_cut_front(insert->k.p, insert);
+}
+
+/*
+ * Validate an extent key; simply forwards to bch2_bkey_ptrs_invalid().
+ * Returns NULL if valid, otherwise a string describing the problem.
+ */
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	return bch2_bkey_ptrs_invalid(c, k);
+}
+
+/*
+ * Debug-time consistency check for an extent key.
+ *
+ * Reports a filesystem bug if the key's replicas are not marked in the
+ * superblock (checked only if mark_lock could be taken without blocking, and
+ * unless BCH_FS_REBUILD_REPLICAS is set). Once journal replay is done, each
+ * pointer is also checked against the mark of the bucket it points into:
+ * dirty pointers must not be stale, no pointer may be more than 96
+ * generations stale, and a non-stale pointer's bucket must be marked as user
+ * data with at least as many sectors as the pointer covers.
+ */
+void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	char buf[160];
+
+	/*
+	 * XXX: we should be doing most/all of these checks at startup time,
+	 * where we check bch2_bkey_invalid() in btree_node_read_done()
+	 *
+	 * But note that we can't check for stale pointers or incorrect gc marks
+	 * until after journal replay is done (it might be an extent that's
+	 * going to get overwritten during replay)
+	 */
+
+	if (percpu_down_read_trylock(&c->mark_lock)) {
+		bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+			       !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
+			       "extent key bad (replicas not marked in superblock):\n%s",
+			       (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
+		percpu_up_read(&c->mark_lock);
+	}
+	/*
+	 * If journal replay hasn't finished, we might be seeing keys
+	 * that will be overwritten by the time journal replay is done:
+	 */
+	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+		return;
+
+	extent_for_each_ptr_decode(e, p, entry) {
+		struct bch_dev *ca	= bch_dev_bkey_exists(c, p.ptr.dev);
+		struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
+		unsigned stale		= gen_after(mark.gen, p.ptr.gen);
+		unsigned disk_sectors	= ptr_disk_sectors(p);
+		/* compare against the counter matching the pointer's kind: */
+		unsigned mark_sectors	= p.ptr.cached
+			? mark.cached_sectors
+			: mark.dirty_sectors;
+
+		/* message previously lacked its closing parenthesis: */
+		bch2_fs_bug_on(stale && !p.ptr.cached, c,
+			       "stale dirty pointer (ptr gen %u bucket %u)",
+			       p.ptr.gen, mark.gen);
+
+		/* stale is unsigned, so print it with %u rather than %i: */
+		bch2_fs_bug_on(stale > 96, c, "key too stale: %u", stale);
+
+		bch2_fs_bug_on(!stale &&
+			       (mark.data_type != BCH_DATA_USER ||
+				mark_sectors < disk_sectors), c,
+			       "extent pointer not marked: %s:\n"
+			       "type %u sectors %u < %u",
+			       (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
+			       mark.data_type,
+			       mark_sectors, disk_sectors);
+	}
+}
+
+/* Print an extent key to @out; simply forwards to the generic pointer printer. */
+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
+			 struct bkey_s_c k)
+{
+	bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+/*
+ * Largest value the size fields of each crc entry format can hold, indexed by
+ * extent entry type. Read-only lookup table, hence const.
+ */
+static const unsigned bch2_crc_field_size_max[] = {
+	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
+	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
+	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
+};
+
+/*
+ * Encode the unpacked crc @src into the on-disk crc entry @dst.
+ *
+ * The entry's type field must already be set in @dst: it selects which of
+ * the three formats is written. Any other type is a BUG().
+ */
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
+				 struct bch_extent_crc_unpacked src)
+{
+/* Fields shared by all three formats; sizes are stored minus one: */
+#define set_common_fields(_dst, _src)					\
+		_dst.csum_type		= _src.csum_type,		\
+		_dst.compression_type	= _src.compression_type,	\
+		_dst._compressed_size	= _src.compressed_size - 1,	\
+		_dst._uncompressed_size	= _src.uncompressed_size - 1,	\
+		_dst.offset		= _src.offset
+
+	switch (extent_entry_type(to_entry(dst))) {
+	case BCH_EXTENT_ENTRY_crc32:
+		/* no nonce is stored in this format: */
+		set_common_fields(dst->crc32, src);
+		/* only the first 32 bits of the checksum are stored: */
+		dst->crc32.csum	 = *((__le32 *) &src.csum.lo);
+		break;
+	case BCH_EXTENT_ENTRY_crc64:
+		set_common_fields(dst->crc64, src);
+		dst->crc64.nonce	= src.nonce;
+		dst->crc64.csum_lo	= src.csum.lo;
+		/* plus the first 16 bits of the high word: */
+		dst->crc64.csum_hi	= *((__le16 *) &src.csum.hi);
+		break;
+	case BCH_EXTENT_ENTRY_crc128:
+		set_common_fields(dst->crc128, src);
+		dst->crc128.nonce	= src.nonce;
+		dst->crc128.csum	= src.csum;
+		break;
+	default:
+		BUG();
+	}
+#undef set_common_fields
+}
+
+/*
+ * Append a crc entry describing @new to the end of the value of @k.
+ *
+ * The smallest of the three crc formats whose checksum, size and nonce fields
+ * can hold @new is chosen; it is a BUG() if none can. The caller must have
+ * made sure there is room after the current end of the value - nothing here
+ * checks the space available behind @k.
+ */
+void bch2_extent_crc_append(struct bkey_i *k,
+			    struct bch_extent_crc_unpacked new)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+	/* the new entry is written right at the current end of the value: */
+	union bch_extent_crc *crc = (void *) ptrs.end;
+
+	if (bch_crc_bytes[new.csum_type]	<= 4 &&
+	    new.uncompressed_size - 1		<= CRC32_SIZE_MAX &&
+	    new.nonce				<= CRC32_NONCE_MAX)
+		crc->type = 1 << BCH_EXTENT_ENTRY_crc32;
+	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
+		   new.uncompressed_size - 1	<= CRC64_SIZE_MAX &&
+		   new.nonce			<= CRC64_NONCE_MAX)
+		crc->type = 1 << BCH_EXTENT_ENTRY_crc64;
+	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
+		   new.uncompressed_size - 1	<= CRC128_SIZE_MAX &&
+		   new.nonce			<= CRC128_NONCE_MAX)
+		crc->type = 1 << BCH_EXTENT_ENTRY_crc128;
+	else
+		BUG();
+
+	/* relies on crc->type having been set above: */
+	bch2_extent_crc_pack(crc, new);
+
+	/* ptrs.end now holds the new entry, so its size can be read back: */
+	k->k.u64s += extent_entry_u64s(ptrs.end);
+
+	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
+}
+
+/*
+ * Insert a copy of entry @new into the value of @k at position @dst, moving
+ * everything from @dst to the end of the value up to make room and growing
+ * the key accordingly.
+ */
+static inline void __extent_entry_insert(struct bkey_i *k,
+					 union bch_extent_entry *dst,
+					 union bch_extent_entry *new)
+{
+	union bch_extent_entry *val_end = bkey_val_end(bkey_i_to_s(k));
+	u64 *from = (u64 *) dst;
+
+	/* Open a gap of the new entry's size at dst: */
+	memmove_u64s_up_small(from + extent_entry_u64s(new),
+			      dst, (u64 *) val_end - from);
+	k->k.u64s += extent_entry_u64s(new);
+
+	/* ... and fill it: */
+	memcpy(dst, new, extent_entry_bytes(new));
+}
+
+/*
+ * Add the decoded pointer @p (and its stripe pointer, if p->has_ec) to @k.
+ *
+ * The pointer is placed where its crc applies: at the very start of the
+ * value if p->crc equals what bch2_extent_crc_unpack() yields for no crc
+ * entry at all, directly after an existing crc entry equal to p->crc if
+ * there is one, and otherwise after a crc entry newly appended for it.
+ *
+ * Note that the type fields of p->ptr and p->ec are written as a side effect.
+ */
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
+				    struct extent_ptr_decoded *p)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+	struct bch_extent_crc_unpacked crc =
+		bch2_extent_crc_unpack(&k->k, NULL);
+	union bch_extent_entry *pos;
+
+	/* matches the crc obtained without any crc entry: go to the front */
+	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+		pos = ptrs.start;
+		goto found;
+	}
+
+	/* look for an existing crc entry to share: */
+	bkey_for_each_crc(&k->k, ptrs, crc, pos)
+		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+			pos = extent_entry_next(pos);
+			goto found;
+		}
+
+	/* none found: append a new crc entry, the pointer goes after it */
+	bch2_extent_crc_append(k, p->crc);
+	pos = bkey_val_end(bkey_i_to_s(k));
+found:
+	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+	__extent_entry_insert(k, pos, to_entry(&p->ptr));
+
+	if (p->has_ec) {
+		/*
+		 * Inserted at the same position, so the stripe pointer ends
+		 * up in front of the pointer inserted just above:
+		 */
+		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+		__extent_entry_insert(k, pos, to_entry(&p->ec));
+	}
+}
+
+/*
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Drops every pointer of @k that is both cached and stale. If that leaves the
+ * key without a value, its type is changed to KEY_TYPE_discard.
+ *
+ * Returns true if @k should be dropped entirely (i.e. it is a whiteout)
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+{
+	struct bch_extent_ptr *ptr;
+
+	bch2_bkey_drop_ptrs(k, ptr,
+		ptr->cached &&
+		ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+
+	/* will only happen if all pointers were cached: */
+	if (!bkey_val_u64s(k.k))
+		k.k->type = KEY_TYPE_discard;
+
+	return bkey_whiteout(k.k);
+}
+
+/*
+ * Turn surplus replicas of @k into cached pointers.
+ *
+ * The surplus is the key's durability minus @nr_desired_replicas. Pointers
+ * are marked cached as long as their own durability still fits within the
+ * remaining surplus: in a first pass (only if @target is nonzero) just
+ * pointers to devices outside @target, in a second pass any pointer.
+ */
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
+				    unsigned target,
+				    unsigned nr_desired_replicas)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+	union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	int surplus = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
+
+	/* Pass 1: prefer dropping replicas that are not on the target. */
+	if (target && surplus > 0) {
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+			int durability = bch2_extent_ptr_durability(c, p);
+
+			if (durability && durability <= surplus &&
+			    !bch2_dev_in_target(c, p.ptr.dev, target)) {
+				entry->ptr.cached = true;
+				surplus -= durability;
+			}
+		}
+	}
+
+	/* Pass 2: still too many replicas, take them from anywhere. */
+	if (surplus > 0) {
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+			int durability = bch2_extent_ptr_durability(c, p);
+
+			if (durability && durability <= surplus) {
+				entry->ptr.cached = true;
+				surplus -= durability;
+			}
+		}
+	}
+}
+
+/*
+ * Try to merge extent @_r into extent @_l.
+ *
+ * The first loop only validates: both values must have the same size and the
+ * same sequence of entry types, every pointer of @_r must continue the
+ * corresponding pointer of @_l on disk within the same bucket, and crc
+ * entries must be compatible and mergeable. Only if all checks pass does the
+ * second loop modify @_l, merging checksums and sizes, before @_l is resized
+ * to cover both keys.
+ *
+ * Returns BCH_MERGE_MERGE on success, BCH_MERGE_NOMERGE (with both keys
+ * unmodified) otherwise.
+ */
+enum merge_result bch2_extent_merge(struct bch_fs *c,
+				    struct bkey_s _l, struct bkey_s _r)
+{
+	struct bkey_s_extent l = bkey_s_to_extent(_l);
+	struct bkey_s_extent r = bkey_s_to_extent(_r);
+	union bch_extent_entry *en_l = l.v->start;
+	union bch_extent_entry *en_r = r.v->start;
+	struct bch_extent_crc_unpacked crc_l, crc_r;
+
+	if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k))
+		return BCH_MERGE_NOMERGE;
+
+	/* crc that applies to pointers preceding any crc entry: */
+	crc_l = bch2_extent_crc_unpack(l.k, NULL);
+
+	extent_for_each_entry(l, en_l) {
+		/* entry of r at the same u64 offset as en_l has in l: */
+		en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
+
+		if (extent_entry_type(en_l) != extent_entry_type(en_r))
+			return BCH_MERGE_NOMERGE;
+
+		switch (extent_entry_type(en_l)) {
+		case BCH_EXTENT_ENTRY_ptr: {
+			const struct bch_extent_ptr *lp = &en_l->ptr;
+			const struct bch_extent_ptr *rp = &en_r->ptr;
+			struct bch_dev *ca;
+
+			/* r's data must start right where l's data ends: */
+			if (lp->offset + crc_l.compressed_size != rp->offset ||
+			    lp->dev			!= rp->dev ||
+			    lp->gen			!= rp->gen)
+				return BCH_MERGE_NOMERGE;
+
+			/* We don't allow extents to straddle buckets: */
+			ca = bch_dev_bkey_exists(c, lp->dev);
+
+			if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
+				return BCH_MERGE_NOMERGE;
+
+			break;
+		}
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			if (en_l->stripe_ptr.block	!= en_r->stripe_ptr.block ||
+			    en_l->stripe_ptr.idx	!= en_r->stripe_ptr.idx)
+				return BCH_MERGE_NOMERGE;
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+		case BCH_EXTENT_ENTRY_crc64:
+		case BCH_EXTENT_ENTRY_crc128:
+			crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+			crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+			if (crc_l.csum_type		!= crc_r.csum_type ||
+			    crc_l.compression_type	!= crc_r.compression_type ||
+			    crc_l.nonce			!= crc_r.nonce)
+				return BCH_MERGE_NOMERGE;
+
+			/*
+			 * l's live data must run to the end of its checksummed
+			 * region and r's must start at the beginning of its:
+			 */
+			if (crc_l.offset + crc_l.live_size != crc_l.compressed_size ||
+			    crc_r.offset)
+				return BCH_MERGE_NOMERGE;
+
+			if (!bch2_checksum_mergeable(crc_l.csum_type))
+				return BCH_MERGE_NOMERGE;
+
+			/* compressed extents are never merged: */
+			if (crc_l.compression_type)
+				return BCH_MERGE_NOMERGE;
+
+			if (crc_l.csum_type &&
+			    crc_l.uncompressed_size +
+			    crc_r.uncompressed_size > c->sb.encoded_extent_max)
+				return BCH_MERGE_NOMERGE;
+
+			/* the merged size must still fit this crc format: */
+			if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 >
+			    bch2_crc_field_size_max[extent_entry_type(en_l)])
+				return BCH_MERGE_NOMERGE;
+
+			break;
+		default:
+			return BCH_MERGE_NOMERGE;
+		}
+	}
+
+	/* All checks passed; from here on l is modified. */
+	extent_for_each_entry(l, en_l) {
+		/* NOTE: these shadow the function-scope crc_l/crc_r */
+		struct bch_extent_crc_unpacked crc_l, crc_r;
+
+		en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
+
+		if (!extent_entry_is_crc(en_l))
+			continue;
+
+		crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+		crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+		/* the length of r is passed in bytes (sectors << 9): */
+		crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+						 crc_l.csum,
+						 crc_r.csum,
+						 crc_r.uncompressed_size << 9);
+
+		crc_l.uncompressed_size	+= crc_r.uncompressed_size;
+		crc_l.compressed_size	+= crc_r.compressed_size;
+
+		bch2_extent_crc_pack(entry_to_crc(en_l), crc_l);
+	}
+
+	bch2_key_resize(l.k, l.k->size + r.k->size);
+
+	return BCH_MERGE_MERGE;
+}
+
+/*
+ * Check whether every key in the extents btree that covers the range of
+ * @size sectors starting at @pos has at least @nr_replicas allocated
+ * pointers, as counted by bch2_bkey_nr_ptrs_allocated().
+ *
+ * Returns true only if the whole range was walked successfully and no key
+ * fell short. If iterating the btree fails the range could not be verified,
+ * so false is returned rather than claiming it is allocated.
+ */
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+			       unsigned nr_replicas)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bpos end = pos;
+	struct bkey_s_c k;
+	bool ret = true;
+	int err = 0;
+
+	end.offset += size;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
+			   BTREE_ITER_SLOTS, k, err) {
+		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+			break;
+
+		if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
+			ret = false;
+			break;
+		}
+	}
+
+	/* An aborted walk proves nothing about the rest of the range: */
+	if (err)
+		ret = false;
+
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+/*
+ * Number of replicas @k has allocated space for: for an extent, the number of
+ * pointers that are neither cached nor compressed; for a reservation, its
+ * nr_replicas; for any other key type, 0.
+ */
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+	struct bkey_s_c_extent e;
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned nr = 0;
+
+	if (k.k->type == KEY_TYPE_reservation)
+		return bkey_s_c_to_reservation(k).v->nr_replicas;
+
+	if (k.k->type != KEY_TYPE_extent)
+		return 0;
+
+	e = bkey_s_c_to_extent(k);
+
+	extent_for_each_ptr_decode(e, p, entry) {
+		if (!p.ptr.cached &&
+		    p.crc.compression_type == BCH_COMPRESSION_NONE)
+			nr++;
+	}
+
+	return nr;
+}
+
+/* KEY_TYPE_reservation: */
+
+/*
+ * Validate a reservation key: the value must be exactly one struct
+ * bch_reservation and nr_replicas must be in 1..BCH_REPLICAS_MAX.
+ *
+ * Returns NULL if valid, otherwise a string describing the problem.
+ */
+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+	if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+		return "incorrect value size";
+
+	if (r.v->nr_replicas == 0 ||
+	    r.v->nr_replicas > BCH_REPLICAS_MAX)
+		return "invalid nr_replicas";
+
+	return NULL;
+}
+
+/* Print the generation and replica count of reservation key @k to @out. */
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+			      struct bkey_s_c k)
+{
+	const struct bch_reservation *v = bkey_s_c_to_reservation(k).v;
+
+	pr_buf(out, "generation %u replicas %u",
+	       le32_to_cpu(v->generation),
+	       v->nr_replicas);
+}
+
+/*
+ * Try to merge reservation @_r into reservation @_l.
+ *
+ * Only reservations with equal generation and nr_replicas merge. If the
+ * combined size would exceed KEY_SIZE_MAX, @_l is grown to KEY_SIZE_MAX and
+ * the front of @_r is cut off at the new end of @_l.
+ *
+ * Returns BCH_MERGE_NOMERGE, BCH_MERGE_PARTIAL or BCH_MERGE_MERGE.
+ */
+enum merge_result bch2_reservation_merge(struct bch_fs *c,
+					 struct bkey_s _l, struct bkey_s _r)
+{
+	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
+	struct bkey_s_reservation r = bkey_s_to_reservation(_r);
+	bool same_kind = l.v->generation == r.v->generation &&
+			 l.v->nr_replicas == r.v->nr_replicas;
+
+	if (!same_kind)
+		return BCH_MERGE_NOMERGE;
+
+	/* The sum is formed in 64 bits so that it cannot wrap: */
+	if ((u64) l.k->size + r.k->size <= KEY_SIZE_MAX) {
+		bch2_key_resize(l.k, l.k->size + r.k->size);
+		return BCH_MERGE_MERGE;
+	}
+
+	bch2_key_resize(l.k, KEY_SIZE_MAX);
+	__bch2_cut_front(l.k->p, r.s);
+
+	return BCH_MERGE_PARTIAL;
+}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
new file mode 100644
index 000000000000..cc7ee9067b50
--- /dev/null
+++ b/fs/bcachefs/extents.h
@@ -0,0 +1,582 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_H
+#define _BCACHEFS_EXTENTS_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "extents_types.h"
+
+struct bch_fs;
+struct btree_trans;
+struct btree_insert_entry;
+
+/* extent entries: */
+
+/*
+ * End of @_e's value as reported by bkey_val_end(), cast to the type of a
+ * pointer to @_e's first entry.
+ */
+#define extent_entry_last(_e)						\
+	((typeof(&(_e).v->start[0])) bkey_val_end(_e))
+
+/*
+ * Cast an extent entry pointer to a struct bch_extent_ptr pointer. The result
+ * is const-qualified exactly when @_entry is a
+ * const union bch_extent_entry *. EBUG_ON() checks that a non-NULL @_entry
+ * really is a ptr entry.
+ */
+#define entry_to_ptr(_entry)						\
+({									\
+	EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));		\
+									\
+	__builtin_choose_expr(						\
+		type_is_exact(_entry, const union bch_extent_entry *),	\
+		(const struct bch_extent_ptr *) (_entry),		\
+		(struct bch_extent_ptr *) (_entry));			\
+})
+
+/*
+ * Convert a pointer to one specific entry kind (crc, ptr or stripe_ptr) into
+ * a pointer to the generic union bch_extent_entry, preserving const. Passing
+ * any other pointer type fails the build via BUILD_BUG_ON().
+ */
+#define to_entry(_entry)						\
+({									\
+	BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&	\
+		     !type_is(_entry, struct bch_extent_ptr *) &&	\
+		     !type_is(_entry, struct bch_extent_stripe_ptr *));	\
+									\
+	__builtin_choose_expr(						\
+		(type_is_exact(_entry, const union bch_extent_crc *) ||	\
+		 type_is_exact(_entry, const struct bch_extent_ptr *) ||\
+		 type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
+		(const union bch_extent_entry *) (_entry),		\
+		(union bch_extent_entry *) (_entry));			\
+})
+
+/*
+ * Entry type of @e, taken as the index of the lowest set bit of e->type.
+ * A zero type field has no set bit and is reported as BCH_EXTENT_ENTRY_MAX.
+ */
+static inline unsigned
+__extent_entry_type(const union bch_extent_entry *e)
+{
+	if (!e->type)
+		return BCH_EXTENT_ENTRY_MAX;
+
+	return __ffs(e->type);
+}
+
+/*
+ * Entry type of @e, for entries that are expected to be valid.
+ *
+ * Goes through __extent_entry_type() so that a zero type field is reported as
+ * BCH_EXTENT_ENTRY_MAX (and trips the debug assertion) instead of being
+ * passed to __ffs(), whose result is undefined for 0.
+ */
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+	unsigned ret = __extent_entry_type(e);
+
+	EBUG_ON(ret >= BCH_EXTENT_ENTRY_MAX);
+
+	return ret;
+}
+
+/*
+ * Size in bytes of @entry: sizeof() the struct bch_extent_* that matches its
+ * type. Hits BUG() for a type not listed in BCH_EXTENT_ENTRY_TYPES().
+ */
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
+{
+	switch (extent_entry_type(entry)) {
+	/* one case per entry type, generated from the x-macro list: */
+#define x(f, n)						\
+	case BCH_EXTENT_ENTRY_##f:			\
+		return sizeof(struct bch_extent_##f);
+	BCH_EXTENT_ENTRY_TYPES()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+/* Size of @entry in units of u64 (byte size divided by sizeof(u64)). */
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
+{
+	size_t bytes = extent_entry_bytes(entry);
+
+	return bytes / sizeof(u64);
+}
+
+/* True iff @e is a BCH_EXTENT_ENTRY_ptr entry. */
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+	return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+/* True iff @e is one of the crc entry types (crc32, crc64 or crc128). */
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+	enum bch_extent_entry_type type = extent_entry_type(e);
+
+	return type == BCH_EXTENT_ENTRY_crc32 ||
+	       type == BCH_EXTENT_ENTRY_crc64 ||
+	       type == BCH_EXTENT_ENTRY_crc128;
+}
+
+/*
+ * Any of the three crc entry layouts, overlaid with a leading type byte;
+ * decoded by bch2_extent_crc_unpack() according to the entry type.
+ */
+union bch_extent_crc {
+	u8				type;
+	struct bch_extent_crc32		crc32;
+	struct bch_extent_crc64		crc64;
+	struct bch_extent_crc128	crc128;
+};
+
+/*
+ * Cast an extent entry pointer to a union bch_extent_crc pointer, const
+ * exactly when @_entry is a const union bch_extent_entry *. No type check.
+ */
+#define __entry_to_crc(_entry)						\
+	__builtin_choose_expr(						\
+		type_is_exact(_entry, const union bch_extent_entry *),	\
+		(const union bch_extent_crc *) (_entry),		\
+		(union bch_extent_crc *) (_entry))
+
+/*
+ * As __entry_to_crc(), but EBUG_ON() checks that a non-NULL @_entry really
+ * is a crc entry.
+ */
+#define entry_to_crc(_entry)						\
+({									\
+	EBUG_ON((_entry) && !extent_entry_is_crc(_entry));		\
+									\
+	__entry_to_crc(_entry);						\
+})
+
+/*
+ * Unpack a crc entry into the fixed-layout struct bch_extent_crc_unpacked.
+ *
+ * @k:   key the entry belongs to; k->size becomes live_size
+ * @crc: entry to unpack, or NULL for "no crc entry", in which case the
+ *	 compressed, uncompressed and live sizes are all k->size and every
+ *	 other field is zero
+ *
+ * The _compressed_size/_uncompressed_size fields of an entry hold the size
+ * minus one, hence the "+ 1" in common_fields(). Hits BUG() on an entry type
+ * that is not crc32, crc64 or crc128.
+ */
+static inline struct bch_extent_crc_unpacked
+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
+{
+#define common_fields(_crc)						\
+		.csum_type		= _crc.csum_type,		\
+		.compression_type	= _crc.compression_type,	\
+		.compressed_size	= _crc._compressed_size + 1,	\
+		.uncompressed_size	= _crc._uncompressed_size + 1,	\
+		.offset			= _crc.offset,			\
+		.live_size		= k->size
+
+	if (!crc)
+		return (struct bch_extent_crc_unpacked) {
+			.compressed_size	= k->size,
+			.uncompressed_size	= k->size,
+			.live_size		= k->size,
+		};
+
+	switch (extent_entry_type(to_entry(crc))) {
+	case BCH_EXTENT_ENTRY_crc32: {
+		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+			common_fields(crc->crc32),
+		};
+
+		/*
+		 * The 32 bit checksum goes in the first bytes of csum.lo; a
+		 * single memcpy() does this without a type-punned store.
+		 */
+		memcpy(&ret.csum.lo, &crc->crc32.csum,
+		       sizeof(crc->crc32.csum));
+
+		return ret;
+	}
+	case BCH_EXTENT_ENTRY_crc64: {
+		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+			common_fields(crc->crc64),
+			.nonce			= crc->crc64.nonce,
+			.csum.lo		= (__force __le64) crc->crc64.csum_lo,
+		};
+
+		/* csum_hi supplies the first 16 bits of csum.hi: */
+		*((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
+
+		return ret;
+	}
+	case BCH_EXTENT_ENTRY_crc128: {
+		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+			common_fields(crc->crc128),
+			.nonce			= crc->crc128.nonce,
+			.csum			= crc->crc128.csum,
+		};
+
+		return ret;
+	}
+	default:
+		BUG();
+	}
+#undef common_fields
+}
+
+/* bkey_ptrs: generically over any key type that has ptrs */
+
+/* Half-open range [start, end) of extent entries inside a key; const view. */
+struct bkey_ptrs_c {
+	const union bch_extent_entry	*start;
+	const union bch_extent_entry	*end;
+};
+
+/* As struct bkey_ptrs_c, but the entries may be modified. */
+struct bkey_ptrs {
+	union bch_extent_entry	*start;
+	union bch_extent_entry	*end;
+};
+
+/* iterate over bkey ptrs */
+
+/*
+ * Entry following @_entry: advances by extent_entry_bytes(_entry), so
+ * @_entry must have a valid type. The result has the same type as @_entry.
+ */
+#define extent_entry_next(_entry)					\
+	((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+/* Loop header: walk @_entry over every entry in [@_start, @_end). */
+#define __bkey_extent_entry_for_each_from(_start, _end, _entry)		\
+	for ((_entry) = (_start);					\
+	     (_entry) < (_end);						\
+	     (_entry) = extent_entry_next(_entry))
+
+/*
+ * Starting at @_ptr (inclusive), find the first entry before @_end that is a
+ * ptr entry. Evaluates to that entry as a bch_extent_ptr pointer, or to NULL
+ * if @_end is reached first.
+ */
+#define __bkey_ptr_next(_ptr, _end)					\
+({									\
+	typeof(_end) _entry;						\
+									\
+	__bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry)	\
+		if (extent_entry_is_ptr(_entry))			\
+			break;						\
+									\
+	_entry < (_end) ? entry_to_ptr(_entry) : NULL;			\
+})
+
+/* Walk @_entry over the entries of @_p from @_start up to (_p).end. */
+#define bkey_extent_entry_for_each_from(_p, _entry, _start)		\
+	__bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
+
+/*
+ * Walk @_entry over every entry of @_p, a struct bkey_ptrs or bkey_ptrs_c.
+ * @_p is parenthesized so that an expression argument (e.g. *ptrs) expands
+ * correctly.
+ */
+#define bkey_extent_entry_for_each(_p, _entry)				\
+	bkey_extent_entry_for_each_from(_p, _entry, (_p).start)
+
+/*
+ * Loop header: visit every ptr entry in [@_start, @_end), skipping entries of
+ * other types. @_ptr is NULL when the loop finishes without a break.
+ */
+#define __bkey_for_each_ptr(_start, _end, _ptr)				\
+	for ((_ptr) = (_start);						\
+	     ((_ptr) = __bkey_ptr_next(_ptr, _end));			\
+	     (_ptr)++)
+
+/* First ptr entry of @_p at or after @_ptr, or NULL if there is none. */
+#define bkey_ptr_next(_p, _ptr)						\
+	__bkey_ptr_next(_ptr, (_p).end)
+
+/* Visit every ptr entry of @_p, a struct bkey_ptrs or bkey_ptrs_c. */
+#define bkey_for_each_ptr(_p, _ptr)					\
+	__bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
+
+/*
+ * Advance @_entry from its current position until it reaches a ptr entry or
+ * @_end, decoding into @_ptr (a struct extent_ptr_decoded) on the way:
+ *  - crc entries are unpacked into (_ptr).crc
+ *  - stripe_ptr entries are copied to (_ptr).ec and set (_ptr).has_ec
+ *  - the ptr entry itself is copied to (_ptr).ptr, ending the scan
+ *
+ * idx and has_ec are reset on every call; (_ptr).crc is not, so it keeps the
+ * value from the most recent crc entry seen on an earlier call.
+ *
+ * Evaluates to true if a ptr entry was found before @_end.
+ */
+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry)			\
+({									\
+	__label__ out;							\
+									\
+	(_ptr).idx	= 0;						\
+	(_ptr).has_ec	= false;					\
+									\
+	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
+		switch (extent_entry_type(_entry)) {			\
+		case BCH_EXTENT_ENTRY_ptr:				\
+			(_ptr).ptr		= _entry->ptr;		\
+			goto out;					\
+		case BCH_EXTENT_ENTRY_crc32:				\
+		case BCH_EXTENT_ENTRY_crc64:				\
+		case BCH_EXTENT_ENTRY_crc128:				\
+			(_ptr).crc = bch2_extent_crc_unpack(_k,		\
+					entry_to_crc(_entry));		\
+			break;						\
+		case BCH_EXTENT_ENTRY_stripe_ptr:			\
+			(_ptr).ec = _entry->stripe_ptr;			\
+			(_ptr).has_ec	= true;				\
+			break;						\
+		}							\
+out:									\
+	_entry < (_end);						\
+})
+
+/*
+ * Loop header: visit every ptr entry in [@_start, @_end) with @_ptr holding
+ * the decoded pointer and @_entry the raw entry. (_ptr).crc starts out as the
+ * "no crc entry" value from bch2_extent_crc_unpack(_k, NULL).
+ */
+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry)	\
+	for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL),		\
+	     (_entry) = _start;						\
+	     __bkey_ptr_next_decode(_k, _end, _ptr, _entry);		\
+	     (_entry) = extent_entry_next(_entry))
+
+/* As above, over all entries of @_p (a struct bkey_ptrs or bkey_ptrs_c). */
+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry)			\
+	__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,		\
+				   _ptr, _entry)
+
+/*
+ * Advance @_iter from its current position to the next crc entry before
+ * @_end and unpack it into @_crc. Evaluates to true if one was found.
+ * @_start is not used.
+ */
+#define bkey_crc_next(_k, _start, _end, _crc, _iter)			\
+({									\
+	__bkey_extent_entry_for_each_from(_iter, _end, _iter)		\
+		if (extent_entry_is_crc(_iter)) {			\
+			(_crc) = bch2_extent_crc_unpack(_k,		\
+						entry_to_crc(_iter));	\
+			break;						\
+		}							\
+									\
+	(_iter) < (_end);						\
+})
+
+/*
+ * Loop header: visit every crc entry in [@_start, @_end), with @_crc holding
+ * the unpacked entry and @_iter the raw entry.
+ */
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter)		\
+	for ((_crc) = bch2_extent_crc_unpack(_k, NULL),			\
+	     (_iter) = (_start);					\
+	     bkey_crc_next(_k, _start, _end, _crc, _iter);		\
+	     (_iter) = extent_entry_next(_iter))
+
+/* As above, over all entries of @_p (a struct bkey_ptrs or bkey_ptrs_c). */
+#define bkey_for_each_crc(_k, _p, _crc, _iter)				\
+	__bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
+
+/* utility code common to all keys with pointers: */
+
+/*
+ * Return the range of extent entries held in @k's value, for the key types
+ * that carry pointers (btree_ptr, extent, stripe, reflink_v). Any other key
+ * type yields the empty range { NULL, NULL }.
+ */
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr: {
+		struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+		return (struct bkey_ptrs_c) {
+			to_entry(&e.v->start[0]),
+			to_entry(extent_entry_last(e))
+		};
+	}
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		return (struct bkey_ptrs_c) {
+			e.v->start,
+			extent_entry_last(e)
+		};
+	}
+	case KEY_TYPE_stripe: {
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+		/* a stripe has exactly nr_blocks pointers: */
+		return (struct bkey_ptrs_c) {
+			to_entry(&s.v->ptrs[0]),
+			to_entry(&s.v->ptrs[s.v->nr_blocks]),
+		};
+	}
+	case KEY_TYPE_reflink_v: {
+		struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+		return (struct bkey_ptrs_c) {
+			r.v->start,
+			bkey_val_end(r),
+		};
+	}
+	default:
+		return (struct bkey_ptrs_c) { NULL, NULL };
+	}
+}
+
+/*
+ * Mutable counterpart of bch2_bkey_ptrs_c(): the same entry range for @k,
+ * with const dropped from both bounds.
+ */
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+	struct bkey_ptrs_c ro = bch2_bkey_ptrs_c(k.s_c);
+	struct bkey_ptrs rw;
+
+	rw.start	= (void *) ro.start;
+	rw.end		= (void *) ro.end;
+
+	return rw;
+}
+
+/*
+ * List the device of every pointer in @k, in pointer order.
+ * NOTE(review): nothing here bounds nr against the capacity of devs[].
+ */
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bch_devs_list devs = (struct bch_devs_list) { 0 };
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		devs.devs[devs.nr] = ptr->dev;
+		devs.nr++;
+	}
+
+	return devs;
+}
+
+/* List the device of every pointer in @k that is not marked cached. */
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bch_devs_list devs = (struct bch_devs_list) { 0 };
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		if (!ptr->cached) {
+			devs.devs[devs.nr] = ptr->dev;
+			devs.nr++;
+		}
+	}
+
+	return devs;
+}
+
+/* List the device of every pointer in @k that is marked cached. */
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bch_devs_list devs = (struct bch_devs_list) { 0 };
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->cached) {
+			devs.devs[devs.nr] = ptr->dev;
+			devs.nr++;
+		}
+	}
+
+	return devs;
+}
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+void bch2_mark_io_failure(struct bch_io_failures *,
+			  struct extent_ptr_decoded *);
+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
+			       struct bch_io_failures *,
+			       struct extent_ptr_decoded *);
+
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
+
+/* bch_btree_ptr: */
+
+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+
+/* struct bkey_ops initializer wiring up the btree_ptr handlers declared above */
+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) {		\
+	.key_invalid	= bch2_btree_ptr_invalid,		\
+	.key_debugcheck	= bch2_btree_ptr_debugcheck,		\
+	.val_to_text	= bch2_btree_ptr_to_text,		\
+	.swab		= bch2_ptr_swab,			\
+}
+
+/* bch_extent: */
+
+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c);
+void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+enum merge_result bch2_extent_merge(struct bch_fs *,
+				    struct bkey_s, struct bkey_s);
+
+/*
+ * struct bkey_ops initializer wiring up the extent handlers declared above;
+ * unlike btree_ptr, extents also provide normalize and merge.
+ */
+#define bch2_bkey_ops_extent (struct bkey_ops) {		\
+	.key_invalid	= bch2_extent_invalid,			\
+	.key_debugcheck	= bch2_extent_debugcheck,		\
+	.val_to_text	= bch2_extent_to_text,			\
+	.swab		= bch2_ptr_swab,			\
+	.key_normalize	= bch2_extent_normalize,		\
+	.key_merge	= bch2_extent_merge,			\
+}
+
+/* bch_reservation: */
+
+const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+enum merge_result bch2_reservation_merge(struct bch_fs *,
+					 struct bkey_s, struct bkey_s);
+
+/* struct bkey_ops initializer wiring up the reservation handlers declared above */
+#define bch2_bkey_ops_reservation (struct bkey_ops) {		\
+	.key_invalid	= bch2_reservation_invalid,		\
+	.val_to_text	= bch2_reservation_to_text,		\
+	.key_merge	= bch2_reservation_merge,		\
+}
+
+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
+			   struct bpos *);
+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
+		       unsigned *);
+void bch2_insert_fixup_extent(struct btree_trans *,
+			      struct btree_insert_entry *);
+
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
+				    unsigned, unsigned);
+
+const struct bch_extent_ptr *
+bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
+
+unsigned bch2_extent_is_compressed(struct bkey_s_c);
+
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+			   struct bch_extent_ptr, u64);
+
+/* True for key types btree_ptr, extent and reflink_v. */
+static inline bool bkey_extent_is_direct_data(const struct bkey *k)
+{
+	return k->type == KEY_TYPE_btree_ptr ||
+	       k->type == KEY_TYPE_extent ||
+	       k->type == KEY_TYPE_reflink_v;
+}
+
+/* True for the direct data key types and additionally for reflink_p. */
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+	if (k->type == KEY_TYPE_reflink_p)
+		return true;
+
+	return bkey_extent_is_direct_data(k);
+}
+
+/*
+ * Does this key count towards inode->i_sectors? True for extent,
+ * reservation, reflink_p and reflink_v keys.
+ */
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+	return k->type == KEY_TYPE_extent ||
+	       k->type == KEY_TYPE_reservation ||
+	       k->type == KEY_TYPE_reflink_p ||
+	       k->type == KEY_TYPE_reflink_v;
+}
+
+/* Extent entry iteration: */
+
+/*
+ * The macros below are the extent-typed flavours of the bkey_* iterators:
+ * @_e is a bkey_s_extent/bkey_s_c_extent style wrapper, and the range walked
+ * is (_e).v->start up to extent_entry_last(_e).
+ */
+
+/* Walk @_entry over the entries of @_e, beginning at @_start. */
+#define extent_for_each_entry_from(_e, _entry, _start)			\
+	__bkey_extent_entry_for_each_from(_start,			\
+				extent_entry_last(_e),_entry)
+
+/* Walk @_entry over every entry of @_e. */
+#define extent_for_each_entry(_e, _entry)				\
+	extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+/* First ptr entry of @_e at or after @_ptr, or NULL if there is none. */
+#define extent_ptr_next(_e, _ptr)					\
+	__bkey_ptr_next(_ptr, extent_entry_last(_e))
+
+/* Visit every ptr entry of @_e. */
+#define extent_for_each_ptr(_e, _ptr)					\
+	__bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
+
+/* Visit every ptr entry of @_e, decoded into @_ptr (see __bkey_ptr_next_decode). */
+#define extent_for_each_ptr_decode(_e, _ptr, _entry)			\
+	__bkey_for_each_ptr_decode((_e).k, (_e).v->start,		\
+				   extent_entry_last(_e), _ptr, _entry)
+
+void bch2_extent_crc_append(struct bkey_i *,
+			    struct bch_extent_crc_unpacked);
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
+				    struct extent_ptr_decoded *);
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
+				 struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
+					   struct bch_extent_ptr *);
+
+/*
+ * Drop every pointer of @_k for which @_cond is true.
+ *
+ * @_ptr is a caller-supplied struct bch_extent_ptr * used as the cursor;
+ * @_cond is an expression evaluated once per pointer and may refer to @_ptr.
+ * After a drop, scanning resumes from the value bch2_bkey_drop_ptr()
+ * returned, and the entry range is fetched again because the key has changed.
+ */
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond)				\
+do {									\
+	struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k);			\
+									\
+	_ptr = &_ptrs.start->ptr;					\
+									\
+	while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) {			\
+		if (_cond) {						\
+			_ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr);	\
+			_ptrs = bch2_bkey_ptrs(_k);			\
+			continue;					\
+		}							\
+									\
+		(_ptr)++;						\
+	}								\
+} while (0)
+
+void __bch2_cut_front(struct bpos, struct bkey_s);
+
+/* __bch2_cut_front() for a struct bkey_i: wraps @k with bkey_i_to_s() first. */
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
+{
+	struct bkey_s s = bkey_i_to_s(k);
+
+	__bch2_cut_front(where, s);
+}
+
+bool bch2_cut_back(struct bpos, struct bkey *);
+
+/**
+ * bch2_key_resize - adjust size of @k
+ * @k:		key to resize
+ * @new_size:	new value for k->size
+ *
+ * The start of @k (its end position minus its size) stays where it is; only
+ * the end position, k->p.offset, moves.
+ */
+static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
+{
+	k->p.offset	= k->p.offset - k->size + new_size;
+	k->size		= new_size;
+}
+
+/*
+ * extent_sort_fix_overlapping(), insert_fixup_extent() and
+ * extent_merge_inline() modify keys in place that may be packed: they unpack
+ * the key and edit the unpacked copy. This writes the result back to @dst -
+ * a plain copy when @dst is stored unpacked, otherwise a repack with @b's
+ * format, which must succeed.
+ */
+static inline void extent_save(struct btree *b, struct bkey_packed *dst,
+			       struct bkey *src)
+{
+	struct bkey_i *dst_unpacked = packed_to_bkey(dst);
+
+	if (dst_unpacked) {
+		dst_unpacked->k = *src;
+		return;
+	}
+
+	BUG_ON(!bch2_bkey_pack_key(dst, src, &b->format));
+}
+
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
+
+#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
new file mode 100644
index 000000000000..43d6c341ecca
--- /dev/null
+++ b/fs/bcachefs/extents_types.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_TYPES_H
+#define _BCACHEFS_EXTENTS_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * Decoded form of a crc32/crc64/crc128 extent entry, as produced by
+ * bch2_extent_crc_unpack().
+ */
+struct bch_extent_crc_unpacked {
+	/* stored entry fields hold size - 1; these hold the actual sizes: */
+	u32			compressed_size;
+	u32			uncompressed_size;
+	/* size of the key the entry was unpacked for: */
+	u32			live_size;
+
+	u8			csum_type;
+	u8			compression_type;
+
+	u16			offset;
+
+	/* only set for crc64 and crc128 entries: */
+	u16			nonce;
+
+	struct bch_csum		csum;
+};
+
+/*
+ * One pointer of a key together with the crc and stripe entries that apply
+ * to it, as filled in by the *_for_each_ptr_decode() iterators.
+ */
+struct extent_ptr_decoded {
+	unsigned			idx;
+	/* true iff @ec was filled from a stripe_ptr entry: */
+	bool				has_ec;
+	struct bch_extent_crc_unpacked	crc;
+	struct bch_extent_ptr		ptr;
+	struct bch_extent_stripe_ptr	ec;
+};
+
+/*
+ * Per-device failure bookkeeping, passed to bch2_mark_io_failure() and
+ * bch2_bkey_pick_read_device().
+ */
+struct bch_io_failures {
+	/* presumably the number of devs[] slots in use - TODO confirm */
+	u8			nr;
+	struct bch_dev_io_failures {
+		u8		dev;
+		u8		idx;
+		u8		nr_failed;
+		u8		nr_retries;
+	}			devs[BCH_REPLICAS_MAX];
+};
+
+#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
new file mode 100644
index 000000000000..26d5cad7e6a5
--- /dev/null
+++ b/fs/bcachefs/eytzinger.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
+
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "util.h"
+
+/*
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
+ * array
+ */
+
+/*
+ * One based indexing version:
+ *
+ * With one based indexing each level of the tree starts at a power of two -
+ * good for cacheline alignment:
+ *
+ * Size parameter is treated as if we were using 0 based indexing, however:
+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there
+ * are actually size - 1 elements
+ */
+
+/* Index of child @child (0 = left, 1 = right) of node @i, one based layout. */
+static inline unsigned eytzinger1_child(unsigned i, unsigned child)
+{
+	unsigned left = i << 1;
+
+	EBUG_ON(child > 1);
+
+	return left + child;
+}
+
+/* Left child of node @i: 2 * i. */
+static inline unsigned eytzinger1_left_child(unsigned i)
+{
+	return eytzinger1_child(i, 0);
+}
+
+/* Right child of node @i: 2 * i + 1. */
+static inline unsigned eytzinger1_right_child(unsigned i)
+{
+	return eytzinger1_child(i, 1);
+}
+
+/*
+ * Node that eytzinger1_for_each() starts from, i.e. the first node in
+ * traversal order: the largest power of two that is <= size - 1.
+ */
+static inline unsigned eytzinger1_first(unsigned size)
+{
+	return rounddown_pow_of_two(size - 1);
+}
+
+/*
+ * Last node in traversal order (the counterpart of eytzinger1_first()):
+ * one less than the largest power of two that is <= size.
+ */
+static inline unsigned eytzinger1_last(unsigned size)
+{
+	return rounddown_pow_of_two(size) - 1;
+}
+
+/*
+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that
+ *
+ * eytzinger1_next(0) == eytzinger1_first()
+ * eytzinger1_prev(0) == eytzinger1_last()
+ *
+ * eytzinger1_prev(eytzinger1_first()) == 0
+ * eytzinger1_next(eytzinger1_last()) == 0
+ */
+
+/*
+ * Node following @i in traversal order, or 0 if @i is the last node.
+ * Passing 0 yields the first node.
+ */
+static inline unsigned eytzinger1_next(unsigned i, unsigned size)
+{
+	EBUG_ON(i >= size);
+
+	if (eytzinger1_right_child(i) < size) {
+		/* step to the right child, then all the way down to the left: */
+		i = eytzinger1_right_child(i);
+
+		i <<= __fls(size) - __fls(i);
+		/* went one level too deep if that node doesn't exist: */
+		i >>= i >= size;
+	} else {
+		/* no right child: strip the trailing one bits plus one more bit */
+		i >>= ffz(i) + 1;
+	}
+
+	return i;
+}
+
+/*
+ * Node preceding @i in traversal order, or 0 if @i is the first node.
+ * Passing 0 yields the last node. Mirror image of eytzinger1_next().
+ */
+static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
+{
+	EBUG_ON(i >= size);
+
+	if (eytzinger1_left_child(i) < size) {
+		/* step to the left child, then all the way down to the right: */
+		i = eytzinger1_left_child(i) + 1;
+
+		i <<= __fls(size) - __fls(i);
+		i -= 1;
+		/* went one level too deep if that node doesn't exist: */
+		i >>= i >= size;
+	} else {
+		/* no left child: strip the trailing zero bits plus one more bit */
+		i >>= __ffs(i) + 1;
+	}
+
+	return i;
+}
+
+/*
+ * The @extra parameter of __eytzinger1_to_inorder() and
+ * __inorder_to_eytzinger1() for a tree of @size. It depends on @size only,
+ * so it can be computed once and reused across conversions.
+ */
+static inline unsigned eytzinger1_extra(unsigned size)
+{
+	return (size - rounddown_pow_of_two(size - 1)) << 1;
+}
+
+/*
+ * Convert eytzinger index @i (1 <= i < size) to its inorder index.
+ * @extra must be eytzinger1_extra(size).
+ */
+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
+					      unsigned extra)
+{
+	/* b: position of i's top set bit; shift: its distance from the top bit of size - 1 */
+	unsigned b = __fls(i);
+	unsigned shift = __fls(size - 1) - b;
+	int s;
+
+	EBUG_ON(!i || i >= size);
+
+	/* clear the top bit, append a 1 bit, then scale up by @shift: */
+	i  ^= 1U << b;
+	i <<= 1;
+	i  |= 1;
+	i <<= shift;
+
+	/*
+	 * sign bit trick:
+	 *
+	 * if (i > extra)
+	 *	i -= (i - extra) >> 1;
+	 */
+	s = extra - i;
+	i += (s >> 1) & (s >> 31);
+
+	return i;
+}
+
+/*
+ * Convert inorder index @i (1 <= i < size) to its eytzinger index; the
+ * inverse of __eytzinger1_to_inorder(). @extra must be
+ * eytzinger1_extra(size).
+ */
+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
+					       unsigned extra)
+{
+	unsigned shift;
+	int s;
+
+	EBUG_ON(!i || i >= size);
+
+	/*
+	 * sign bit trick:
+	 *
+	 * if (i > extra)
+	 *	i += i - extra;
+	 */
+	s = extra - i;
+	i -= s & (s >> 31);
+
+	/* drop the trailing zeros and the lowest set bit, then set the top bit: */
+	shift = __ffs(i);
+
+	i >>= shift + 1;
+	i  |= 1U << (__fls(size - 1) - shift);
+
+	return i;
+}
+
+/* __eytzinger1_to_inorder() with @extra computed from @size on every call. */
+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
+{
+	return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
+}
+
+/* __inorder_to_eytzinger1() with @extra computed from @size on every call. */
+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
+{
+	return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
+}
+
+/*
+ * Loop header: walk @_i over the nodes of a one based tree of @_size, from
+ * eytzinger1_first() along eytzinger1_next() until it returns 0.
+ */
+#define eytzinger1_for_each(_i, _size)			\
+	for ((_i) = eytzinger1_first((_size));		\
+	     (_i) != 0;					\
+	     (_i) = eytzinger1_next((_i), (_size)))
+
+/* Zero based indexing version: */
+
+/* Index of child @child (0 = left, 1 = right) of node @i, zero based layout. */
+static inline unsigned eytzinger0_child(unsigned i, unsigned child)
+{
+	unsigned left = (i << 1) + 1;
+
+	EBUG_ON(child > 1);
+
+	return left + child;
+}
+
+/* Left child of node @i: 2 * i + 1. */
+static inline unsigned eytzinger0_left_child(unsigned i)
+{
+	return eytzinger0_child(i, 0);
+}
+
+/* Right child of node @i: 2 * i + 2. */
+static inline unsigned eytzinger0_right_child(unsigned i)
+{
+	return eytzinger0_child(i, 1);
+}
+
+/*
+ * The zero based helpers below map onto the one based versions by shifting
+ * both the index and the size by one. As a consequence "no node" is -1
+ * (as an unsigned value) here, where the one based versions use 0.
+ */
+
+/* First node in traversal order of a zero based tree with @size nodes. */
+static inline unsigned eytzinger0_first(unsigned size)
+{
+	return eytzinger1_first(size + 1) - 1;
+}
+
+/* Last node in traversal order of a zero based tree with @size nodes. */
+static inline unsigned eytzinger0_last(unsigned size)
+{
+	return eytzinger1_last(size + 1) - 1;
+}
+
+/* Node following @i in traversal order, or -1 if @i is the last node. */
+static inline unsigned eytzinger0_next(unsigned i, unsigned size)
+{
+	return eytzinger1_next(i + 1, size + 1) - 1;
+}
+
+/* Node preceding @i in traversal order, or -1 if @i is the first node. */
+static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
+{
+	return eytzinger1_prev(i + 1, size + 1) - 1;
+}
+
+/* @extra parameter for the zero based conversions of a tree with @size nodes. */
+static inline unsigned eytzinger0_extra(unsigned size)
+{
+	return eytzinger1_extra(size + 1);
+}
+
+/*
+ * Zero based eytzinger index -> zero based inorder index.
+ * @extra must be eytzinger0_extra(size).
+ */
+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
+					       unsigned extra)
+{
+	return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
+}
+
+/*
+ * Zero based inorder index -> zero based eytzinger index.
+ * @extra must be eytzinger0_extra(size).
+ */
+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
+					       unsigned extra)
+{
+	return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
+}
+
+/* __eytzinger0_to_inorder() with @extra computed from @size on every call. */
+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
+{
+	return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
+}
+
+/* __inorder_to_eytzinger0() with @extra computed from @size on every call. */
+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
+{
+	return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
+}
+
+/*
+ * Loop header: walk @_i over the nodes of a zero based tree of @_size, from
+ * eytzinger0_first() along eytzinger0_next() until it returns -1.
+ */
+#define eytzinger0_for_each(_i, _size)			\
+	for ((_i) = eytzinger0_first((_size));		\
+	     (_i) != -1;				\
+	     (_i) = eytzinger0_next((_i), (_size)))
+
+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
+
+/*
+ * return greatest node <= @search, or -1 if not found
+ *
+ * @base:   zero based eytzinger array of @nr elements, each @size bytes
+ * @cmp:    called as cmp(search, element, size)
+ */
+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
+					 eytzinger_cmp_fn cmp, const void *search)
+{
+	unsigned i, n = 0;
+
+	if (!nr)
+		return -1;
+
+	/* descend to a leaf, going right whenever @search >= the node: */
+	do {
+		i = n;
+		n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+	} while (n < nr);
+
+	/* left children have odd indices, so odd @n means the last step went left */
+	if (n & 1) {
+		/* @i was greater than @search, return previous node: */
+
+		if (i == eytzinger0_first(nr))
+			return -1;
+
+		return eytzinger0_prev(i, nr);
+	} else {
+		return i;
+	}
+}
+
+/*
+ * Search a zero based eytzinger array for an element that compares equal to
+ * @search. @_cmp is called as _cmp(search, element, size); a positive result
+ * descends to the right child, a negative one to the left.
+ *
+ * Evaluates to the index of the matching element, or to an index >= @nr if
+ * there is none.
+ */
+#define eytzinger0_find(base, nr, size, _cmp, search)			\
+({									\
+	void *_base	= (base);					\
+	void *_search	= (search);					\
+	size_t _nr	= (nr);						\
+	size_t _size	= (size);					\
+	size_t _i	= 0;						\
+	int _res;							\
+									\
+	while (_i < _nr &&						\
+	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
+		_i = eytzinger0_child(_i, _res > 0);			\
+	_i;								\
+})
+
+void eytzinger0_sort(void *, size_t, size_t,
+		    int (*cmp_func)(const void *, const void *, size_t),
+		    void (*swap_func)(void *, void *, size_t));
+
+#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
new file mode 100644
index 000000000000..cdb272708a4b
--- /dev/null
+++ b/fs/bcachefs/fifo.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FIFO_H
+#define _BCACHEFS_FIFO_H
+
+#include "util.h"
+
+/*
+ * Anonymous struct type for a fifo of @type.
+ *
+ * @front and @back are free-running indices: they are only reduced with
+ * @mask when indexing @data, and back - front is the number of elements
+ * held. @size is the capacity; @mask is the power-of-two-rounded size minus
+ * one.
+ */
+#define FIFO(type)							\
+struct {								\
+	size_t front, back, size, mask;					\
+	type *data;							\
+}
+
+/* Declare a fifo of @type called @name. */
+#define DECLARE_FIFO(type, name)	FIFO(type) name
+
+/*
+ * Size in bytes of the data buffer: size rounded up to a power of two
+ * elements, or 0 for a fifo of size 0.
+ */
+#define fifo_buf_size(fifo)						\
+	((fifo)->size							\
+	 ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])	\
+	 : 0)
+
+/*
+ * Initialize @fifo as empty with capacity @_size and allocate its buffer
+ * with kvpmalloc() using @_gfp. Evaluates to the new data pointer, i.e.
+ * whatever kvpmalloc() returned.
+ */
+#define init_fifo(fifo, _size, _gfp)					\
+({									\
+	(fifo)->front	= (fifo)->back = 0;				\
+	(fifo)->size	= (_size);					\
+	(fifo)->mask	= (fifo)->size					\
+		? roundup_pow_of_two((fifo)->size) - 1			\
+		: 0;							\
+	(fifo)->data	= kvpmalloc(fifo_buf_size(fifo), (_gfp));	\
+})
+
+/* Free @fifo's buffer with kvpfree() and clear the data pointer. */
+#define free_fifo(fifo)							\
+do {									\
+	kvpfree((fifo)->data, fifo_buf_size(fifo));			\
+	(fifo)->data = NULL;						\
+} while (0)
+
+/* Exchange the complete state (indices, size, mask and buffer) of @l and @r. */
+#define fifo_swap(l, r)							\
+do {									\
+	swap((l)->front, (r)->front);					\
+	swap((l)->back, (r)->back);					\
+	swap((l)->size, (r)->size);					\
+	swap((l)->mask, (r)->mask);					\
+	swap((l)->data, (r)->data);					\
+} while (0)
+
+/*
+ * Move elements from the front of @src to the back of @dest until @dest is
+ * full or @src is empty. Fullness is checked first, so nothing is popped
+ * from @src that cannot be pushed.
+ */
+#define fifo_move(dest, src)						\
+do {									\
+	typeof(*((dest)->data)) _t;					\
+	while (!fifo_full(dest) &&					\
+	       fifo_pop(src, _t))					\
+		fifo_push(dest, _t);					\
+} while (0)
+
+/* Number of elements currently held / number of free slots: */
+#define fifo_used(fifo)		(((fifo)->back - (fifo)->front))
+#define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo)	((fifo)->front == (fifo)->back)
+#define fifo_full(fifo)		(fifo_used(fifo) == (fifo)->size)
+
+/* The oldest / newest element, as an lvalue; no check that the fifo is non-empty: */
+#define fifo_peek_front(fifo)	((fifo)->data[(fifo)->front & (fifo)->mask])
+#define fifo_peek_back(fifo)	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+/*
+ * Convert @p, a pointer into the data buffer, to an index in the same
+ * free-running space as front and back: the bits above mask are taken from
+ * front if @p is at or after the front slot, otherwise from back, and @p's
+ * offset within the buffer is added.
+ */
+#define fifo_entry_idx_abs(fifo, p)					\
+	((((p) >= &fifo_peek_front(fifo)				\
+	   ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) +		\
+	   (((p) - (fifo)->data)))
+
+/* Position of @p counted from the front / element at position @i from the front: */
+#define fifo_entry_idx(fifo, p)	(((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+#define fifo_idx_entry(fifo, i)	(fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
+
+/*
+ * Reserve a slot at the back / front and evaluate to a pointer to it for the
+ * caller to fill in, or to NULL if the fifo is full.
+ */
+#define fifo_push_back_ref(f)						\
+	(fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
+
+#define fifo_push_front_ref(f)						\
+	(fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
+
+/*
+ * Append @new at the back. Evaluates to true on success, false if the fifo
+ * was full (in which case @new is not evaluated).
+ */
+#define fifo_push_back(fifo, new)					\
+({									\
+	typeof((fifo)->data) _r = fifo_push_back_ref(fifo);		\
+	if (_r)								\
+		*_r = (new);						\
+	_r != NULL;							\
+})
+
+/* As fifo_push_back(), but inserts at the front. */
+#define fifo_push_front(fifo, new)					\
+({									\
+	typeof((fifo)->data) _r = fifo_push_front_ref(fifo);		\
+	if (_r)								\
+		*_r = (new);						\
+	_r != NULL;							\
+})
+
+/*
+ * Remove the element at the front and store it in @i. Evaluates to true on
+ * success, false if the fifo was empty (in which case @i is left untouched).
+ */
+#define fifo_pop_front(fifo, i)						\
+({									\
+	bool _r = !fifo_empty((fifo));					\
+	if (_r)								\
+		(i) = (fifo)->data[(fifo)->front++ & (fifo)->mask];	\
+	_r;								\
+})
+
+/* As fifo_pop_front(), but removes the element at the back. */
+#define fifo_pop_back(fifo, i)						\
+({									\
+	bool _r = !fifo_empty((fifo));					\
+	if (_r)								\
+		(i) = (fifo)->data[--(fifo)->back & (fifo)->mask];	\
+	_r;								\
+})
+
+/* Default queue discipline: push at the back, pop and peek at the front. */
+#define fifo_push_ref(fifo)	fifo_push_back_ref(fifo)
+#define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))
+#define fifo_peek(fifo)		fifo_peek_front(fifo)
+
+/*
+ * Loop header: visit every element from front to back without removing it.
+ * @_entry receives a copy of each element; @_iter is the free-running index
+ * and must have the same type as the fifo's front member (enforced by
+ * typecheck()).
+ */
+#define fifo_for_each_entry(_entry, _fifo, _iter)			\
+	for (typecheck(typeof((_fifo)->front), _iter),			\
+	     (_iter) = (_fifo)->front;					\
+	     ((_iter != (_fifo)->back) &&				\
+	      (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true));	\
+	     (_iter)++)
+
+/* As fifo_for_each_entry(), but @_ptr points at each element in place. */
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)			\
+	for (typecheck(typeof((_fifo)->front), _iter),			\
+	     (_iter) = (_fifo)->front;					\
+	     ((_iter != (_fifo)->back) &&				\
+	      (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true));	\
+	     (_iter)++)
+
+#endif /* _BCACHEFS_FIFO_H */
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
new file mode 100644
index 000000000000..a4497eeb1f1b
--- /dev/null
+++ b/fs/bcachefs/fs-common.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fs-common.h"
+#include "inode.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+/*
+ * bch2_create_trans - create a new inode and, optionally, link it into a
+ * directory, within btree transaction @trans.
+ *
+ * @dir_inum:	inode number of the parent directory
+ * @dir_u:	filled in with the parent directory's unpacked inode
+ * @new_inode:	unpacked inode being created; finished with
+ *		bch2_inode_init_late() and then passed to bch2_inode_create()
+ * @name:	name of the new directory entry; if NULL no dirent is created
+ *		and the inode is flagged BCH_INODE_UNLINKED
+ * @uid, @gid, @mode, @rdev: attributes passed to bch2_inode_init_late()
+ * @default_acl, @acl: optional ACLs to set on the new inode (may be NULL)
+ *
+ * Returns 0 on success or the first error returned by a step. Nothing is
+ * committed here; presumably the caller commits @trans - confirm at callers.
+ */
+int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
+		      struct bch_inode_unpacked *dir_u,
+		      struct bch_inode_unpacked *new_inode,
+		      const struct qstr *name,
+		      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+		      struct posix_acl *default_acl,
+		      struct posix_acl *acl)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *dir_iter;
+	struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
+	u64 now = bch2_current_time(trans->c);
+	int ret;
+
+	dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
+	if (IS_ERR(dir_iter))
+		return PTR_ERR(dir_iter);
+
+	bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+	/* No name means no directory entry will point at this inode: */
+	if (!name)
+		new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+	ret = bch2_inode_create(trans, new_inode,
+				BLOCKDEV_INODE_MAX, 0,
+				&c->unused_inode_hint);
+	if (ret)
+		return ret;
+
+	if (default_acl) {
+		ret = bch2_set_acl_trans(trans, new_inode, &hash,
+					 default_acl, ACL_TYPE_DEFAULT);
+		if (ret)
+			return ret;
+	}
+
+	if (acl) {
+		ret = bch2_set_acl_trans(trans, new_inode, &hash,
+					 acl, ACL_TYPE_ACCESS);
+		if (ret)
+			return ret;
+	}
+
+	if (name) {
+		struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
+		dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+		/* A new subdirectory adds a link to its parent: */
+		if (S_ISDIR(new_inode->bi_mode))
+			dir_u->bi_nlink++;
+
+		ret = bch2_inode_write(trans, dir_iter, dir_u);
+		if (ret)
+			return ret;
+
+		ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+					 mode_to_type(new_inode->bi_mode),
+					 name, new_inode->bi_inum,
+					 BCH_HASH_SET_MUST_CREATE);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * bch2_link_trans - add a new directory entry @name in directory @dir_inum
+ * that points at existing inode @inum (hard link), within @trans.
+ *
+ * @inode_u is filled in with the target inode; its ctime is set to the
+ * current time and its link count is bumped with bch2_inode_nlink_inc().
+ * The directory inode is only read (to obtain its hash info) and its
+ * iterator is released again before the dirent is created.
+ *
+ * Returns 0 on success, or the error from the inode lookups, from
+ * bch2_dirent_create() (called with BCH_HASH_SET_MUST_CREATE), or from
+ * writing the updated inode.
+ */
+int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
+		    u64 inum, struct bch_inode_unpacked *inode_u,
+		    const struct qstr *name)
+{
+	struct btree_iter *dir_iter, *inode_iter;
+	struct bch_inode_unpacked dir_u;
+	struct bch_hash_info dir_hash;
+	u64 now = bch2_current_time(trans->c);
+
+	inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
+	if (IS_ERR(inode_iter))
+		return PTR_ERR(inode_iter);
+
+	inode_u->bi_ctime = now;
+	bch2_inode_nlink_inc(inode_u);
+
+	/* Directory inode is not modified here, so no intent flag: */
+	dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0);
+	if (IS_ERR(dir_iter))
+		return PTR_ERR(dir_iter);
+
+	/* XXX: shouldn't we be updating mtime/ctime on the directory? */
+
+	dir_hash = bch2_hash_info_init(trans->c, &dir_u);
+	bch2_trans_iter_put(trans, dir_iter);
+
+	/* GNU "?:" chain: stop at the first step that returns nonzero */
+	return bch2_dirent_create(trans, dir_inum, &dir_hash,
+				  mode_to_type(inode_u->bi_mode),
+				  name, inum, BCH_HASH_SET_MUST_CREATE) ?:
+		bch2_inode_write(trans, inode_iter, inode_u);
+}
+
+/*
+ * bch2_unlink_trans - remove directory entry @name from directory @dir_inum
+ * and drop a link on the inode it points to, within @trans.
+ *
+ * @dir_u:	filled in with the directory inode; mtime/ctime are updated and
+ *		its link count is decremented when the target is a directory
+ * @inode_u:	filled in with the target inode; ctime is updated and its link
+ *		count is dropped with bch2_inode_nlink_dec()
+ *
+ * If the target is a directory, bch2_empty_dir_trans() must succeed first.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int bch2_unlink_trans(struct btree_trans *trans,
+		      u64 dir_inum, struct bch_inode_unpacked *dir_u,
+		      struct bch_inode_unpacked *inode_u,
+		      const struct qstr *name)
+{
+	struct btree_iter *dir_iter, *dirent_iter, *inode_iter;
+	struct bch_hash_info dir_hash;
+	u64 inum, now = bch2_current_time(trans->c);
+	struct bkey_s_c k;
+	int ret;
+
+	dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
+	if (IS_ERR(dir_iter))
+		return PTR_ERR(dir_iter);
+
+	dir_hash = bch2_hash_info_init(trans->c, dir_u);
+
+	dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
+						 name, BTREE_ITER_INTENT);
+	if (IS_ERR(dirent_iter))
+		return PTR_ERR(dirent_iter);
+
+	k = bch2_btree_iter_peek_slot(dirent_iter);
+	/* The key may carry an error; check before dereferencing it: */
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+
+	inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
+	if (IS_ERR(inode_iter))
+		return PTR_ERR(inode_iter);
+
+	dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
+	/* Removing a subdirectory drops a link on its parent: */
+	dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
+	bch2_inode_nlink_dec(inode_u);
+
+	/* GNU "?:" chain: stop at the first step that returns nonzero */
+	return  (S_ISDIR(inode_u->bi_mode)
+		 ? bch2_empty_dir_trans(trans, inum)
+		 : 0) ?:
+		bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
+		bch2_inode_write(trans, dir_iter, dir_u) ?:
+		bch2_inode_write(trans, inode_iter, inode_u);
+}
+
+/*
+ * bch2_reinherit_attrs - copy inheritable inode options from @src_u to @dst_u
+ *
+ * Walks every inode option id below Inode_opt_nr. Options whose bit is set in
+ * @dst_u->bi_fields_set are skipped; for the rest, @dst_u takes @src_u's
+ * value whenever the two differ.
+ *
+ * Returns true if at least one option in @dst_u was changed.
+ */
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
+			  struct bch_inode_unpacked *src_u)
+{
+	bool changed = false;
+	unsigned opt;
+
+	for (opt = 0; opt < Inode_opt_nr; opt++) {
+		u64 inherited, existing;
+
+		if (dst_u->bi_fields_set & (1 << opt))
+			continue;
+
+		inherited	= bch2_inode_opt_get(src_u, opt);
+		existing	= bch2_inode_opt_get(dst_u, opt);
+
+		if (inherited != existing) {
+			bch2_inode_opt_set(dst_u, opt, inherited);
+			changed = true;
+		}
+	}
+
+	return changed;
+}
+
+/*
+ * bch2_rename_trans - rename/exchange a directory entry within @trans.
+ *
+ * @src_dir, @dst_dir:		inode numbers of source and destination dirs
+ * @src_dir_u, @dst_dir_u:	filled in with the directory inodes; when both
+ *				directories are the same, @dst_dir_u is
+ *				repointed at @src_dir_u and only one inode is
+ *				read and written
+ * @src_inode_u:		filled in with the inode being renamed
+ * @dst_inode_u:		filled in with the inode currently at
+ *				@dst_name, if bch2_dirent_rename() reports one
+ * @src_name, @dst_name:	old and new names
+ * @mode:			BCH_RENAME_OVERWRITE and BCH_RENAME_EXCHANGE
+ *				get special handling below; other values only
+ *				move the entry
+ *
+ * Returns 0 on success or a negative error. In addition to errors from the
+ * btree operations this returns -ENOTDIR when overwriting between a
+ * directory and a non-directory, -ENOTEMPTY when bch2_empty_dir_trans()
+ * returns nonzero for a directory being overwritten, and -EXDEV when a
+ * directory would have its inherited options changed by the move.
+ */
+int bch2_rename_trans(struct btree_trans *trans,
+		      u64 src_dir, struct bch_inode_unpacked *src_dir_u,
+		      u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
+		      struct bch_inode_unpacked *src_inode_u,
+		      struct bch_inode_unpacked *dst_inode_u,
+		      const struct qstr *src_name,
+		      const struct qstr *dst_name,
+		      enum bch_rename_mode mode)
+{
+	struct btree_iter *src_dir_iter, *dst_dir_iter = NULL;
+	struct btree_iter *src_inode_iter, *dst_inode_iter = NULL;
+	struct bch_hash_info src_hash, dst_hash;
+	u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+	int ret;
+
+	src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
+				       BTREE_ITER_INTENT);
+	if (IS_ERR(src_dir_iter))
+		return PTR_ERR(src_dir_iter);
+
+	src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+
+	if (dst_dir != src_dir) {
+		dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
+					       BTREE_ITER_INTENT);
+		if (IS_ERR(dst_dir_iter))
+			return PTR_ERR(dst_dir_iter);
+
+		dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+	} else {
+		/* Same directory: alias the destination to the source */
+		dst_dir_u = src_dir_u;
+		dst_hash = src_hash;
+	}
+
+	/* Updates the dirents and reports the inode numbers involved: */
+	ret = bch2_dirent_rename(trans,
+				 src_dir, &src_hash,
+				 dst_dir, &dst_hash,
+				 src_name, &src_inode,
+				 dst_name, &dst_inode,
+				 mode);
+	if (ret)
+		return ret;
+
+	src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
+					 BTREE_ITER_INTENT);
+	if (IS_ERR(src_inode_iter))
+		return PTR_ERR(src_inode_iter);
+
+	if (dst_inode) {
+		dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
+						 BTREE_ITER_INTENT);
+		if (IS_ERR(dst_inode_iter))
+			return PTR_ERR(dst_inode_iter);
+	}
+
+	/*
+	 * NOTE(review): the checks below read *dst_inode_u, which is only
+	 * filled in when dst_inode is nonzero - presumably guaranteed by
+	 * bch2_dirent_rename() for OVERWRITE/EXCHANGE; confirm.
+	 */
+	if (mode == BCH_RENAME_OVERWRITE) {
+		if (S_ISDIR(src_inode_u->bi_mode) !=
+		    S_ISDIR(dst_inode_u->bi_mode))
+			return -ENOTDIR;
+
+		if (S_ISDIR(dst_inode_u->bi_mode) &&
+		    bch2_empty_dir_trans(trans, dst_inode))
+			return -ENOTEMPTY;
+	}
+
+	/*
+	 * Re-inherit options from the new parent; a directory whose options
+	 * would change is refused with -EXDEV:
+	 */
+	if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+	    S_ISDIR(src_inode_u->bi_mode))
+		return -EXDEV;
+
+	if (mode == BCH_RENAME_EXCHANGE &&
+	    bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+	    S_ISDIR(dst_inode_u->bi_mode))
+		return -EXDEV;
+
+	/* Moving a directory moves one link between the two parents: */
+	if (S_ISDIR(src_inode_u->bi_mode)) {
+		src_dir_u->bi_nlink--;
+		dst_dir_u->bi_nlink++;
+	}
+
+	/*
+	 * A directory at the destination leaves dst_dir; on exchange it
+	 * arrives in src_dir:
+	 */
+	if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
+		dst_dir_u->bi_nlink--;
+		src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
+	}
+
+	if (mode == BCH_RENAME_OVERWRITE)
+		bch2_inode_nlink_dec(dst_inode_u);
+
+	src_dir_u->bi_mtime		= now;
+	src_dir_u->bi_ctime		= now;
+
+	if (src_dir != dst_dir) {
+		dst_dir_u->bi_mtime	= now;
+		dst_dir_u->bi_ctime	= now;
+	}
+
+	src_inode_u->bi_ctime		= now;
+
+	if (dst_inode)
+		dst_inode_u->bi_ctime	= now;
+
+	/* GNU "?:" chain: stop at the first write that returns nonzero */
+	return  bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
+		(src_dir != dst_dir
+		 ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
+		 : 0 ) ?:
+		bch2_inode_write(trans, src_inode_iter, src_inode_u) ?:
+		(dst_inode
+		 ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
+		 : 0 );
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
new file mode 100644
index 000000000000..c1621485a526
--- /dev/null
+++ b/fs/bcachefs/fs-common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_COMMON_H
+#define _BCACHEFS_FS_COMMON_H
+
+struct posix_acl;
+
+/*
+ * Create an inode and optionally its directory entry.
+ * Arguments: trans, dir_inum, dir_u, new_inode, name,
+ * uid, gid, mode, rdev, default_acl, acl.
+ */
+int bch2_create_trans(struct btree_trans *, u64,
+		      struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      const struct qstr *,
+		      uid_t, gid_t, umode_t, dev_t,
+		      struct posix_acl *,
+		      struct posix_acl *);
+
+/*
+ * Add a hard link. Arguments: trans, dir_inum, inum, inode_u, name.
+ */
+int bch2_link_trans(struct btree_trans *, u64,
+		    u64, struct bch_inode_unpacked *,
+		    const struct qstr *);
+
+/*
+ * Remove a directory entry. Arguments: trans, dir_inum, dir_u, inode_u, name.
+ */
+int bch2_unlink_trans(struct btree_trans *,
+		      u64, struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      const struct qstr *);
+
+/*
+ * Rename or exchange directory entries. Arguments: trans,
+ * src_dir, src_dir_u, dst_dir, dst_dir_u, src_inode_u, dst_inode_u,
+ * src_name, dst_name, mode.
+ */
+int bch2_rename_trans(struct btree_trans *,
+		      u64, struct bch_inode_unpacked *,
+		      u64, struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      const struct qstr *,
+		      const struct qstr *,
+		      enum bch_rename_mode);
+
+/*
+ * Copy options not explicitly set on the destination from the source.
+ * Arguments: dst_u, src_u. Returns true if anything changed.
+ */
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
+			  struct bch_inode_unpacked *);
+
+#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
new file mode 100644
index 000000000000..6d0045793bf1
--- /dev/null
+++ b/fs/bcachefs/fs-io.c
@@ -0,0 +1,3157 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "io.h"
+#include "keylist.h"
+#include "quota.h"
+#include "reflink.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/pagevec.h>
+#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+
+#include <trace/events/bcachefs.h>
+#include <trace/events/writeback.h>
+
+/* Quota reservation held by a caller, counted in sectors. */
+struct quota_res {
+	u64				sectors;
+};
+
+/*
+ * State for one writepage(s) write: a closure, the inode being written, and
+ * the embedded write op.
+ */
+struct bch_writepage_io {
+	struct closure			cl;
+	struct bch_inode_info		*inode;
+
+	/* must be last: */
+	struct bch_write_op		op;
+};
+
+/*
+ * State for a direct IO write. The field users are outside this view, so
+ * only what the declarations show is described here.
+ */
+struct dio_write {
+	struct completion		done;
+	struct kiocb			*req;
+	struct mm_struct		*mm;
+	/* one-bit state flags */
+	unsigned			loop:1,
+					sync:1,
+					free_iov:1;
+	struct quota_res		quota_res;
+
+	struct iov_iter			iter;
+	/* presumably backing storage for short iovecs - confirm at users */
+	struct iovec			inline_vecs[2];
+
+	/* must be last: */
+	struct bch_write_op		op;
+};
+
+/* State for a direct IO read: closure, originating kiocb, result, read bio. */
+struct dio_read {
+	struct closure			cl;
+	struct kiocb			*req;
+	long				ret;
+	struct bch_read_bio		rbio;
+};
+
+/*
+ * Stub version: add @pages to @mapping's page cache and LRU one at a time,
+ * at consecutive indices starting from @offset, stopping at the first
+ * failure.
+ *
+ * Returns the number of pages added if at least one was added; otherwise the
+ * error from the first attempt (0 if @nr_pages was 0).
+ */
+static int add_to_page_cache_lru_vec(struct address_space *mapping,
+				     struct page **pages,
+				     unsigned nr_pages,
+				     pgoff_t offset, gfp_t gfp_mask)
+{
+	int nr_added = 0, err = 0;
+
+	while (nr_added < nr_pages) {
+		err = add_to_page_cache_lru(pages[nr_added], mapping,
+					    offset + nr_added, gfp_mask);
+		if (err)
+			break;
+
+		nr_added++;
+	}
+
+	if (nr_added)
+		return nr_added;
+
+	return err;
+}
+
+/*
+ * Write back and then invalidate the page cache of @mapping over the byte
+ * range [@start, @end]; repeats for as long as invalidation reports -EBUSY.
+ *
+ * pagecache_block must be held.
+ *
+ * Returns 0 once nothing is left cached, or the first error other than
+ * -EBUSY from writeback or invalidation.
+ */
+static int write_invalidate_inode_pages_range(struct address_space *mapping,
+					      loff_t start, loff_t end)
+{
+	int ret;
+
+	/*
+	 * XXX: the way this is currently implemented, we can spin if a process
+	 * is continually redirtying a specific page
+	 */
+	for (;;) {
+		if (!mapping->nrpages &&
+		    !mapping->nrexceptional)
+			return 0;
+
+		ret = filemap_write_and_wait_range(mapping, start, end);
+		if (ret)
+			return ret;
+
+		if (!mapping->nrpages)
+			return 0;
+
+		ret = invalidate_inode_pages2_range(mapping,
+				start >> PAGE_SHIFT,
+				end >> PAGE_SHIFT);
+		if (ret != -EBUSY)
+			return ret;
+	}
+}
+
+/* quotas */
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+/*
+ * Release the quota reservation held in @res (no-op if it is empty).
+ *
+ * Under ei_quota_lock, the reserved sectors are subtracted from the quota
+ * accounting via bch2_quota_acct() and from inode->ei_quota_reserved, then
+ * @res is reset to zero.
+ */
+static void bch2_quota_reservation_put(struct bch_fs *c,
+				       struct bch_inode_info *inode,
+				       struct quota_res *res)
+{
+	if (!res->sectors)
+		return;
+
+	mutex_lock(&inode->ei_quota_lock);
+	/* A reservation can never exceed what the inode has on record: */
+	BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+	inode->ei_quota_reserved -= res->sectors;
+	mutex_unlock(&inode->ei_quota_lock);
+
+	res->sectors = 0;
+}
+
+/*
+ * Add @sectors to the quota reservation in @res.
+ *
+ * Under ei_quota_lock, charges @sectors through bch2_quota_acct(), using
+ * KEY_TYPE_QUOTA_PREALLOC when @check_enospc is set and
+ * KEY_TYPE_QUOTA_NOCHECK otherwise. On success both the inode's
+ * ei_quota_reserved and @res grow by @sectors.
+ *
+ * Returns 0 on success or the error from bch2_quota_acct().
+ */
+static int bch2_quota_reservation_add(struct bch_fs *c,
+				      struct bch_inode_info *inode,
+				      struct quota_res *res,
+				      unsigned sectors,
+				      bool check_enospc)
+{
+	int ret;
+
+	mutex_lock(&inode->ei_quota_lock);
+	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+	if (likely(!ret)) {
+		inode->ei_quota_reserved += sectors;
+		res->sectors += sectors;
+	}
+	mutex_unlock(&inode->ei_quota_lock);
+
+	return ret;
+}
+
+#else
+
+/* Build without CONFIG_BCACHEFS_QUOTA: nothing to release. */
+static void bch2_quota_reservation_put(struct bch_fs *c,
+				       struct bch_inode_info *inode,
+				       struct quota_res *res)
+{
+}
+
+/* Build without CONFIG_BCACHEFS_QUOTA: reservations always succeed. */
+static int bch2_quota_reservation_add(struct bch_fs *c,
+				      struct bch_inode_info *inode,
+				      struct quota_res *res,
+				      unsigned sectors,
+				      bool check_enospc)
+{
+	return 0;
+}
+
+#endif
+
+/* i_size updates: */
+
+/* Argument bundle passed to inode_set_size() through bch2_write_inode(). */
+struct inode_new_size {
+	loff_t		new_size;	/* value to store in bi_size */
+	u64		now;		/* timestamp for the selected times */
+	unsigned	fields;		/* ATTR_ATIME/MTIME/CTIME selection */
+};
+
+/*
+ * Inode update callback: store the new size from the struct inode_new_size
+ * passed in @p, and stamp atime/mtime/ctime with its timestamp for whichever
+ * of ATTR_ATIME, ATTR_MTIME and ATTR_CTIME are selected in ->fields.
+ *
+ * Always returns 0.
+ */
+static int inode_set_size(struct bch_inode_info *inode,
+			  struct bch_inode_unpacked *bi,
+			  void *p)
+{
+	struct inode_new_size *update = p;
+	unsigned fields = update->fields;
+
+	bi->bi_size = update->new_size;
+
+	if (fields & ATTR_ATIME)
+		bi->bi_atime = update->now;
+
+	if (fields & ATTR_MTIME)
+		bi->bi_mtime = update->now;
+
+	if (fields & ATTR_CTIME)
+		bi->bi_ctime = update->now;
+
+	return 0;
+}
+
+/*
+ * bch2_write_inode_size - update an inode's size (and selected timestamps).
+ *
+ * Packages @new_size, the current time and @fields into a struct
+ * inode_new_size and hands it to bch2_write_inode() with inode_set_size() as
+ * the update callback. @fields selects which of ATTR_ATIME, ATTR_MTIME and
+ * ATTR_CTIME are set to the current time.
+ *
+ * Returns whatever bch2_write_inode() returns; the result must be checked.
+ */
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+				       struct bch_inode_info *inode,
+				       loff_t new_size, unsigned fields)
+{
+	struct inode_new_size s = {
+		.new_size	= new_size,
+		.now		= bch2_current_time(c),
+		.fields		= fields,
+	};
+
+	return bch2_write_inode(c, inode, inode_set_size, &s, fields);
+}
+
+/*
+ * Account a change of @sectors (may be negative) in the inode's sector count.
+ *
+ * Under ei_quota_lock, inode->v.i_blocks is adjusted by @sectors. With
+ * CONFIG_BCACHEFS_QUOTA: a positive change with a @quota_res supplied is
+ * taken out of that reservation (and out of ei_quota_reserved); any other
+ * change is charged directly with KEY_TYPE_QUOTA_WARN.
+ */
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+			   struct quota_res *quota_res, s64 sectors)
+{
+	if (!sectors)
+		return;
+
+	mutex_lock(&inode->ei_quota_lock);
+#ifdef CONFIG_BCACHEFS_QUOTA
+	if (quota_res && sectors > 0) {
+		/* Must not consume more than was reserved: */
+		BUG_ON(sectors > quota_res->sectors);
+		BUG_ON(sectors > inode->ei_quota_reserved);
+
+		quota_res->sectors -= sectors;
+		inode->ei_quota_reserved -= sectors;
+	} else {
+		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
+	}
+#endif
+	inode->v.i_blocks += sectors;
+	mutex_unlock(&inode->ei_quota_lock);
+}
+
+/* page state: */
+
+/* stored in page->private: */
+
+/* Per-sector state kept for each sector of a cached page. */
+struct bch_page_sector {
+	/* Uncompressed, fully allocated replicas: */
+	unsigned		nr_replicas:3;
+
+	/* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
+	unsigned		replicas_reserved:3;
+
+	/* i_sectors: */
+	/*
+	 * Values are ordered: bch2_set_page_dirty() raises the state to at
+	 * least SECTOR_DIRTY using max_t().
+	 */
+	enum {
+		SECTOR_UNALLOCATED,
+		SECTOR_RESERVED,
+		SECTOR_DIRTY,
+		SECTOR_ALLOCATED,
+	}			state:2;
+};
+
+/*
+ * Per-page state stored in page->private: a spinlock, a write counter and
+ * one bch_page_sector for each of the PAGE_SECTORS sectors in the page.
+ */
+struct bch_page_state {
+	spinlock_t		lock;
+	atomic_t		write_count;
+	struct bch_page_sector	s[PAGE_SECTORS];
+};
+
+/*
+ * Return the bch_page_state attached to @page through page->private, or NULL
+ * if the page has no private data. Does not check that the page is locked.
+ */
+static inline struct bch_page_state *__bch2_page_state(struct page *page)
+{
+	if (!page_has_private(page))
+		return NULL;
+
+	return (struct bch_page_state *) page_private(page);
+}
+
+/*
+ * Locked-page variant of __bch2_page_state(): same lookup, with an
+ * EBUG_ON() assertion that @page is locked.
+ */
+static inline struct bch_page_state *bch2_page_state(struct page *page)
+{
+	EBUG_ON(!PageLocked(page));
+
+	return __bch2_page_state(page);
+}
+
+/* for newly allocated pages: */
+/*
+ * Detach and free @page's bch_page_state, if it has one, without requiring
+ * the page lock. Undoes __bch2_page_state_create(): clears PagePrivate and
+ * page->private, drops the page reference taken at creation and frees the
+ * state.
+ */
+static void __bch2_page_state_release(struct page *page)
+{
+	struct bch_page_state *s = __bch2_page_state(page);
+
+	if (!s)
+		return;
+
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	/* pairs with get_page() in __bch2_page_state_create() */
+	put_page(page);
+	kfree(s);
+}
+
+/*
+ * Detach and free the bch_page_state of a locked @page, if it has one.
+ * Identical to __bch2_page_state_release() apart from asserting (EBUG_ON)
+ * that the page is locked.
+ */
+static void bch2_page_state_release(struct page *page)
+{
+	EBUG_ON(!PageLocked(page));
+
+	__bch2_page_state_release(page);
+}
+
+/* for newly allocated pages: */
+/*
+ * Allocate a zeroed bch_page_state and attach it to @page via page->private.
+ * @gfp is OR-ed into GFP_NOFS for the allocation. Does not check for an
+ * existing state and does not require the page lock.
+ *
+ * Returns the new state, or NULL if the allocation failed.
+ */
+static struct bch_page_state *__bch2_page_state_create(struct page *page,
+						       gfp_t gfp)
+{
+	struct bch_page_state *s;
+
+	s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
+	if (!s)
+		return NULL;
+
+	spin_lock_init(&s->lock);
+	/*
+	 * migrate_page_move_mapping() assumes that pages with private data
+	 * have their count elevated by 1.
+	 */
+	get_page(page);
+	set_page_private(page, (unsigned long) s);
+	SetPagePrivate(page);
+	return s;
+}
+
+/*
+ * Return the bch_page_state of a locked @page, allocating and attaching one
+ * (with @gfp OR-ed into the allocation flags) if the page has none yet.
+ *
+ * Returns NULL only if a new state was needed and could not be allocated.
+ */
+static struct bch_page_state *bch2_page_state_create(struct page *page,
+						     gfp_t gfp)
+{
+	struct bch_page_state *existing = bch2_page_state(page);
+
+	if (existing)
+		return existing;
+
+	return __bch2_page_state_create(page, gfp);
+}
+
+/*
+ * Number of data replicas to use for @inode: the inode's own
+ * bi_data_replicas minus one when that field is nonzero, otherwise the
+ * filesystem-wide data_replicas option.
+ */
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
+	/* XXX: this should not be open coded */
+	if (inode->ei_inode.bi_data_replicas)
+		return inode->ei_inode.bi_data_replicas - 1;
+
+	return c->opts.data_replicas;
+}
+
+/*
+ * How many more replicas must be reserved for sector @s to reach
+ * @nr_replicas, given what is already allocated (nr_replicas) and already
+ * reserved (replicas_reserved). Never negative.
+ */
+static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
+						  unsigned nr_replicas)
+{
+	int shortfall = (int) nr_replicas -
+		s->nr_replicas -
+		s->replicas_reserved;
+
+	return shortfall > 0 ? shortfall : 0;
+}
+
+/*
+ * Make sure every sector of @page has enough disk space reserved for the
+ * inode's replica count.
+ *
+ * Sums sectors_to_reserve() over all sectors of the page, takes one disk
+ * reservation for the total (BCH_DISK_RESERVATION_NOFAIL unless
+ * @check_enospc), and records the per-sector share in replicas_reserved. The
+ * local disk_res is not released here: from then on the page state carries
+ * the reservation.
+ *
+ * Returns 0 on success, -ENOMEM if the page state cannot be allocated, or
+ * the error from bch2_disk_reservation_get().
+ */
+static int bch2_get_page_disk_reservation(struct bch_fs *c,
+				struct bch_inode_info *inode,
+				struct page *page, bool check_enospc)
+{
+	struct bch_page_state *s = bch2_page_state_create(page, 0);
+	unsigned nr_replicas = inode_nr_replicas(c, inode);
+	struct disk_reservation disk_res = { 0 };
+	unsigned i, disk_res_sectors = 0;
+	int ret;
+
+	if (!s)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
+		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+	/* Already fully reserved/allocated: */
+	if (!disk_res_sectors)
+		return 0;
+
+	ret = bch2_disk_reservation_get(c, &disk_res,
+					disk_res_sectors, 1,
+					!check_enospc
+					? BCH_DISK_RESERVATION_NOFAIL
+					: 0);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
+		s->s[i].replicas_reserved +=
+			sectors_to_reserve(&s->s[i], nr_replicas);
+
+	return 0;
+}
+
+/* A disk space reservation and a quota reservation, held together. */
+struct bch2_page_reservation {
+	struct disk_reservation	disk;
+	struct quota_res	quota;
+};
+
+/*
+ * Zero @res and set its disk reservation's replica count from
+ * inode_nr_replicas().
+ */
+static void bch2_page_reservation_init(struct bch_fs *c,
+			struct bch_inode_info *inode,
+			struct bch2_page_reservation *res)
+{
+	memset(res, 0, sizeof(*res));
+
+	res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+/* Release both the disk and the quota part of @res. */
+static void bch2_page_reservation_put(struct bch_fs *c,
+			struct bch_inode_info *inode,
+			struct bch2_page_reservation *res)
+{
+	bch2_disk_reservation_put(c, &res->disk);
+	bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+/*
+ * Reserve the disk space and quota needed to dirty bytes
+ * [@offset, @offset + @len) of @page, accumulating into @res.
+ *
+ * The byte range is widened to filesystem block boundaries and converted to
+ * sector indices. Disk sectors needed come from sectors_to_reserve() for
+ * each sector; quota is needed for each sector currently in
+ * SECTOR_UNALLOCATED state. If the quota reservation fails, the disk sectors
+ * added by this call are given back.
+ *
+ * Returns 0 on success, -ENOMEM if the page state cannot be allocated, or
+ * the error from the disk or quota reservation. @check_enospc selects
+ * whether the reservations may fail for lack of space/quota.
+ */
+static int bch2_page_reservation_get(struct bch_fs *c,
+			struct bch_inode_info *inode, struct page *page,
+			struct bch2_page_reservation *res,
+			unsigned offset, unsigned len, bool check_enospc)
+{
+	struct bch_page_state *s = bch2_page_state_create(page, 0);
+	unsigned i, disk_sectors = 0, quota_sectors = 0;
+	int ret;
+
+	if (!s)
+		return -ENOMEM;
+
+	/* ">> 9" converts bytes to 512-byte sectors */
+	for (i = round_down(offset, block_bytes(c)) >> 9;
+	     i < round_up(offset + len, block_bytes(c)) >> 9;
+	     i++) {
+		disk_sectors += sectors_to_reserve(&s->s[i],
+						res->disk.nr_replicas);
+		quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
+	}
+
+	if (disk_sectors) {
+		ret = bch2_disk_reservation_add(c, &res->disk,
+						disk_sectors,
+						!check_enospc
+						? BCH_DISK_RESERVATION_NOFAIL
+						: 0);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	if (quota_sectors) {
+		ret = bch2_quota_reservation_add(c, inode, &res->quota,
+						 quota_sectors,
+						 check_enospc);
+		if (unlikely(ret)) {
+			/* Undo only the disk sectors added above: */
+			struct disk_reservation tmp = {
+				.sectors = disk_sectors
+			};
+
+			bch2_disk_reservation_put(c, &tmp);
+			res->disk.sectors -= disk_sectors;
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Drop all bcachefs state attached to a locked @page that is not under
+ * writeback: return the disk reservation recorded in the per-sector
+ * replicas_reserved counts, reset SECTOR_DIRTY sectors to SECTOR_UNALLOCATED
+ * (subtracting them from the inode's sector count), and free the page state.
+ * No-op if the page has no state.
+ */
+static void bch2_clear_page_bits(struct page *page)
+{
+	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_page_state *s = bch2_page_state(page);
+	struct disk_reservation disk_res = { 0 };
+	int i, dirty_sectors = 0;
+
+	if (!s)
+		return;
+
+	EBUG_ON(!PageLocked(page));
+	EBUG_ON(PageWriteback(page));
+
+	for (i = 0; i < ARRAY_SIZE(s->s); i++) {
+		/* Collect per-sector reservations so they go back in one put: */
+		disk_res.sectors += s->s[i].replicas_reserved;
+		s->s[i].replicas_reserved = 0;
+
+		if (s->s[i].state == SECTOR_DIRTY) {
+			dirty_sectors++;
+			s->s[i].state = SECTOR_UNALLOCATED;
+		}
+	}
+
+	bch2_disk_reservation_put(c, &disk_res);
+
+	if (dirty_sectors)
+		i_sectors_acct(c, inode, NULL, -dirty_sectors);
+
+	bch2_page_state_release(page);
+}
+
+/*
+ * Mark bytes [@offset, @offset + @len) of @page dirty, moving reservation
+ * from @res into the page state.
+ *
+ * The range is widened to filesystem block boundaries. For each sector in
+ * it, as many reserved replicas as are still needed (and still available in
+ * @res->disk) are transferred to the sector, and its state is raised to at
+ * least SECTOR_DIRTY. Sectors that were SECTOR_UNALLOCATED are added to the
+ * inode's sector count, consuming @res->quota. Finally the page itself is
+ * marked dirty if it was not already.
+ *
+ * The page state is dereferenced without a NULL check; visible callers
+ * obtain a reservation with bch2_page_reservation_get() first, which creates
+ * it.
+ */
+static void bch2_set_page_dirty(struct bch_fs *c,
+			struct bch_inode_info *inode, struct page *page,
+			struct bch2_page_reservation *res,
+			unsigned offset, unsigned len)
+{
+	struct bch_page_state *s = bch2_page_state(page);
+	unsigned i, dirty_sectors = 0;
+
+	/* Dirtying beyond (block-rounded) i_size is unexpected: */
+	WARN_ON((u64) page_offset(page) + offset + len >
+		round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+	spin_lock(&s->lock);
+
+	for (i = round_down(offset, block_bytes(c)) >> 9;
+	     i < round_up(offset + len, block_bytes(c)) >> 9;
+	     i++) {
+		unsigned sectors = sectors_to_reserve(&s->s[i],
+						res->disk.nr_replicas);
+
+		/*
+		 * This can happen if we race with the error path in
+		 * bch2_writepage_io_done():
+		 */
+		sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+		s->s[i].replicas_reserved += sectors;
+		res->disk.sectors -= sectors;
+
+		if (s->s[i].state == SECTOR_UNALLOCATED)
+			dirty_sectors++;
+
+		s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
+	}
+
+	spin_unlock(&s->lock);
+
+	if (dirty_sectors)
+		i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+	if (!PageDirty(page))
+		__set_page_dirty_nobuffers(page);
+}
+
+/*
+ * bch2_page_fault - fault handler for mapped bcachefs files.
+ *
+ * Wraps filemap_fault() in the inode's pagecache "add" lock
+ * (bch2_pagecache_add_get/put on ei_pagecache_lock) and returns its result
+ * unchanged. The result is held in a vm_fault_t rather than a plain int so
+ * the fault code keeps its proper type.
+ */
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+	struct file *file = vmf->vma->vm_file;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	vm_fault_t ret;
+
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+	ret = filemap_fault(vmf);
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+	return ret;
+}
+
+/*
+ * bch2_page_mkwrite - handler for a mapped page about to become writable.
+ *
+ * Reserves disk space and quota for the part of the page that lies within
+ * i_size and marks it dirty.
+ *
+ * Returns:
+ *   VM_FAULT_LOCKED  on success; the page is returned locked
+ *   VM_FAULT_NOPAGE  if the page no longer belongs to this mapping or lies
+ *                    at or beyond i_size (page unlocked)
+ *   VM_FAULT_SIGBUS  if the reservation could not be obtained (page unlocked)
+ *
+ * The result is held in a vm_fault_t rather than a plain int so the fault
+ * code keeps its proper type.
+ */
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct file *file = vmf->vma->vm_file;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct address_space *mapping = file->f_mapping;
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_page_reservation res;
+	unsigned len;
+	loff_t isize;
+	vm_fault_t ret = VM_FAULT_LOCKED;
+
+	bch2_page_reservation_init(c, inode, &res);
+
+	sb_start_pagefault(inode->v.i_sb);
+	file_update_time(file);
+
+	/*
+	 * Not strictly necessary, but helps avoid dio writes livelocking in
+	 * write_invalidate_inode_pages_range() - can drop this if/when we get
+	 * a write_invalidate_inode_pages_range() that works without dropping
+	 * page lock before invalidating page
+	 */
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+	lock_page(page);
+	isize = i_size_read(&inode->v);
+
+	/* Page was truncated or invalidated while we waited for the lock: */
+	if (page->mapping != mapping || page_offset(page) >= isize) {
+		unlock_page(page);
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+
+	/* Only the part of the page inside i_size needs a reservation: */
+	len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
+
+	if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
+		unlock_page(page);
+		ret = VM_FAULT_SIGBUS;
+		goto out;
+	}
+
+	bch2_set_page_dirty(c, inode, page, &res, 0, len);
+	/* Give back whatever the page did not take over: */
+	bch2_page_reservation_put(c, inode, &res);
+
+	wait_for_stable_page(page);
+out:
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+	sb_end_pagefault(inode->v.i_sb);
+
+	return ret;
+}
+
+/*
+ * bch2_invalidatepage - drop bcachefs page state on invalidation.
+ *
+ * Only acts when the whole page is invalidated (@offset is 0 and @length
+ * covers at least PAGE_SIZE); partial invalidations leave the state alone.
+ */
+void bch2_invalidatepage(struct page *page, unsigned int offset,
+			 unsigned int length)
+{
+	bool whole_page = !offset && length >= PAGE_SIZE;
+
+	if (whole_page)
+		bch2_clear_page_bits(page);
+}
+
+/*
+ * bch2_releasepage - try to release bcachefs private state from @page.
+ *
+ * Returns 0 (refuse) if the page is dirty; otherwise clears the page's
+ * bcachefs state and returns 1. @gfp_mask is not used.
+ */
+int bch2_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	if (PageDirty(page))
+		return 0;
+
+	bch2_clear_page_bits(page);
+	return 1;
+}
+
+#ifdef CONFIG_MIGRATION
+/*
+ * bch2_migrate_page - move @page to @newpage, carrying the bcachefs page
+ * state along.
+ *
+ * After migrate_page_move_mapping() succeeds, the private pointer (and the
+ * extra page reference that accompanies it) is transferred from @page to
+ * @newpage. Page contents are copied with migrate_page_copy() unless @mode
+ * is MIGRATE_SYNC_NO_COPY, in which case only page state is transferred.
+ *
+ * Returns MIGRATEPAGE_SUCCESS, or the failure code from
+ * migrate_page_move_mapping().
+ */
+int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
+		      struct page *page, enum migrate_mode mode)
+{
+	int ret;
+
+	EBUG_ON(!PageLocked(page));
+	EBUG_ON(!PageLocked(newpage));
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		/* the reference that goes with private data moves too */
+		get_page(newpage);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		put_page(page);
+		SetPagePrivate(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
+/* readpage(s): */
+
+/*
+ * Read completion: for every page in @bio, set it uptodate on success or
+ * clear uptodate and flag an error on failure, then unlock it. Drops the bio
+ * reference at the end.
+ */
+static void bch2_readpages_end_io(struct bio *bio)
+{
+	struct bvec_iter_all iter;
+	struct bio_vec *bv;
+
+	bio_for_each_segment_all(bv, bio, iter) {
+		struct page *page = bv->bv_page;
+
+		if (!bio->bi_status) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		unlock_page(page);
+	}
+
+	bio_put(bio);
+}
+
+/*
+ * Set the PagePrivate flag on @page with page->private cleared to 0.
+ * NOTE(review): no caller is visible in this part of the file - confirm
+ * whether this helper is still used.
+ */
+static inline void page_state_init_for_read(struct page *page)
+{
+	SetPagePrivate(page);
+	page->private = 0;
+}
+
+/*
+ * Cursor over the pages handed to ->readpages.
+ * Invariant asserted in readpage_iter_next(): idx <= nr_added <= nr_pages.
+ */
+struct readpages_iter {
+	struct address_space	*mapping;
+	struct page		**pages;	/* array of nr_pages pages */
+	unsigned		nr_pages;
+	unsigned		nr_added;	/* pages processed for page cache insertion */
+	unsigned		idx;		/* next page to hand out */
+	pgoff_t			offset;		/* page index of pages[0] */
+};
+
+/*
+ * Initialise @iter from the ->readpages page list.
+ *
+ * Pages are taken from the tail of @pages (the tail entry's index becomes
+ * iter->offset), each gets a bch_page_state allocated with __GFP_NOFAIL, and
+ * is moved from the list into the iter->pages array.
+ *
+ * Returns 0 on success or -ENOMEM if the array cannot be allocated; in that
+ * case the list has not been modified.
+ */
+static int readpages_iter_init(struct readpages_iter *iter,
+			       struct address_space *mapping,
+			       struct list_head *pages, unsigned nr_pages)
+{
+	memset(iter, 0, sizeof(*iter));
+
+	iter->mapping	= mapping;
+	iter->offset	= list_last_entry(pages, struct page, lru)->index;
+
+	iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
+	if (!iter->pages)
+		return -ENOMEM;
+
+	while (!list_empty(pages)) {
+		struct page *page = list_last_entry(pages, struct page, lru);
+
+		__bch2_page_state_create(page, __GFP_NOFAIL);
+
+		iter->pages[iter->nr_pages++] = page;
+		list_del(&page->lru);
+	}
+
+	return 0;
+}
+
+/*
+ * Return the page at iter->idx, inserting further pages into the page cache
+ * when needed, or NULL once all pages are consumed.
+ *
+ * iter->idx is not advanced for a returned page; the visible callers
+ * increment it themselves once they have taken the page.
+ *
+ * If the current page has not been added yet, the remaining pages are
+ * inserted as a batch. When not even the first one can be added, that page
+ * is skipped: its state is released, a reference is dropped, and both idx
+ * and nr_added step past it before retrying.
+ */
+static inline struct page *readpage_iter_next(struct readpages_iter *iter)
+{
+	struct page *page;
+	unsigned i;
+	int ret;
+
+	BUG_ON(iter->idx > iter->nr_added);
+	BUG_ON(iter->nr_added > iter->nr_pages);
+
+	/* Current page is already in the page cache: */
+	if (iter->idx < iter->nr_added)
+		goto out;
+
+	while (1) {
+		if (iter->idx == iter->nr_pages)
+			return NULL;
+
+		ret = add_to_page_cache_lru_vec(iter->mapping,
+				iter->pages	+ iter->nr_added,
+				iter->nr_pages	- iter->nr_added,
+				iter->offset	+ iter->nr_added,
+				GFP_NOFS);
+		if (ret > 0)
+			break;
+
+		/* First remaining page could not be added; drop and skip it: */
+		page = iter->pages[iter->nr_added];
+		iter->idx++;
+		iter->nr_added++;
+
+		__bch2_page_state_release(page);
+		put_page(page);
+	}
+
+	iter->nr_added += ret;
+
+	/*
+	 * Drop one reference on each newly added page - presumably the
+	 * reference that came with the readpages list; confirm.
+	 */
+	for (i = iter->idx; i < iter->nr_added; i++)
+		put_page(iter->pages[i]);
+out:
+	EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
+
+	return iter->pages[iter->idx];
+}
+
+/*
+ * Record, in the page state of every page covered by @bio, what extent @k
+ * says about those sectors.
+ *
+ * Each covered sector gets nr_replicas set to the key's allocated pointer
+ * count (0 for KEY_TYPE_reflink_v) and its state set to SECTOR_RESERVED for
+ * reservation keys or SECTOR_ALLOCATED otherwise. Every page in the bio is
+ * expected to have a page state already (it is dereferenced unchecked).
+ */
+static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
+{
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+		? 0 : bch2_bkey_nr_ptrs_allocated(k);
+	unsigned state = k.k->type == KEY_TYPE_reservation
+		? SECTOR_RESERVED
+		: SECTOR_ALLOCATED;
+
+	bio_for_each_segment(bv, bio, iter) {
+		struct bch_page_state *s = bch2_page_state(bv.bv_page);
+		unsigned i;
+
+		/* byte offsets within the page -> sector indices */
+		for (i = bv.bv_offset >> 9;
+		     i < (bv.bv_offset + bv.bv_len) >> 9;
+		     i++) {
+			s->s[i].nr_replicas = nr_ptrs;
+			s->s[i].state = state;
+		}
+	}
+}
+
+/*
+ * Grow @bio with further pages so it can cover up to @sectors_this_extent
+ * sectors, as long as the bio has room for more vecs.
+ *
+ * Pages come from @iter while its next page is the one directly following
+ * the bio. When @iter is exhausted and @get_more is set, new pages are
+ * allocated and inserted into the page cache, stopping at the first index
+ * that already holds a page (non-value entry) or on any allocation or
+ * insertion failure.
+ */
+static void readpage_bio_extend(struct readpages_iter *iter,
+				struct bio *bio,
+				unsigned sectors_this_extent,
+				bool get_more)
+{
+	while (bio_sectors(bio) < sectors_this_extent &&
+	       bio->bi_vcnt < bio->bi_max_vecs) {
+		pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
+		struct page *page = readpage_iter_next(iter);
+		int ret;
+
+		if (page) {
+			/* Next readahead page is not contiguous with the bio: */
+			if (iter->offset + iter->idx != page_offset)
+				break;
+
+			iter->idx++;
+		} else {
+			if (!get_more)
+				break;
+
+			page = xa_load(&iter->mapping->i_pages, page_offset);
+			if (page && !xa_is_value(page))
+				break;
+
+			page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
+			if (!page)
+				break;
+
+			if (!__bch2_page_state_create(page, 0)) {
+				put_page(page);
+				break;
+			}
+
+			ret = add_to_page_cache_lru(page, iter->mapping,
+						    page_offset, GFP_NOFS);
+			if (ret) {
+				__bch2_page_state_release(page);
+				put_page(page);
+				break;
+			}
+
+			/* drop the allocation reference after insertion */
+			put_page(page);
+		}
+
+		BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
+	}
+}
+
+/*
+ * Issue the reads needed to fill @rbio for inode @inum, one extent at a time.
+ *
+ * Starting at the bio's current sector, each pass looks up the extent slot
+ * at that position, optionally extends the bio with more pages from
+ * @readpages_iter (may be NULL), temporarily shrinks the bio to the fragment
+ * covered by this extent and submits it with bch2_read_extent(). The pass
+ * whose fragment covers all that remains of the bio is submitted with
+ * BCH_READ_LAST_FRAGMENT and ends the function.
+ *
+ * A lookup error of -EINTR restarts the loop; any other error is reported as
+ * an IO error and the bio is completed with bio_endio().
+ */
+static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
+		       struct bch_read_bio *rbio, u64 inum,
+		       struct readpages_iter *readpages_iter)
+{
+	struct bch_fs *c = trans->c;
+	int flags = BCH_READ_RETRY_IF_STALE|
+		BCH_READ_MAY_PROMOTE;
+	int ret = 0;
+
+	rbio->c = c;
+	rbio->start_time = local_clock();
+retry:
+	while (1) {
+		BKEY_PADDED(k) tmp;
+		struct bkey_s_c k;
+		unsigned bytes, sectors, offset_into_extent;
+
+		bch2_btree_iter_set_pos(iter,
+				POS(inum, rbio->bio.bi_iter.bi_sector));
+
+		k = bch2_btree_iter_peek_slot(iter);
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		/* Work on a private copy of the key from here on: */
+		bkey_reassemble(&tmp.k, k);
+		k = bkey_i_to_s_c(&tmp.k);
+
+		offset_into_extent = iter->pos.offset -
+			bkey_start_offset(k.k);
+		sectors = k.k->size - offset_into_extent;
+
+		ret = bch2_read_indirect_extent(trans,
+					&offset_into_extent, &tmp.k);
+		if (ret)
+			break;
+
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		bch2_trans_unlock(trans);
+
+		if (readpages_iter) {
+			bool want_full_extent = false;
+
+			/*
+			 * Ask for pages beyond the readahead window when any
+			 * pointer is checksummed or compressed:
+			 */
+			if (bkey_extent_is_data(k.k)) {
+				struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+				const union bch_extent_entry *i;
+				struct extent_ptr_decoded p;
+
+				bkey_for_each_ptr_decode(k.k, ptrs, p, i)
+					want_full_extent |= ((p.crc.csum_type != 0) |
+							     (p.crc.compression_type != 0));
+			}
+
+			readpage_bio_extend(readpages_iter, &rbio->bio,
+					    sectors, want_full_extent);
+		}
+
+		/*
+		 * Shrink the bio to this fragment; after the swap "bytes"
+		 * holds the original remaining size.
+		 */
+		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+
+		if (rbio->bio.bi_iter.bi_size == bytes)
+			flags |= BCH_READ_LAST_FRAGMENT;
+
+		if (bkey_extent_is_allocation(k.k))
+			bch2_add_page_sectors(&rbio->bio, k);
+
+		bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+
+		if (flags & BCH_READ_LAST_FRAGMENT)
+			return;
+
+		/* Restore the full size, then step past this fragment: */
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+		bio_advance(&rbio->bio, bytes);
+	}
+
+	if (ret == -EINTR)
+		goto retry;
+
+	bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+	bio_endio(&rbio->bio);
+}
+
+/*
+ * bch2_readpages - ->readpages implementation: read a batch of pages.
+ *
+ * Moves the pages from @pages into a readpages_iter, then repeatedly takes
+ * the next page, builds a read bio sized for the pages still outstanding
+ * (capped at BIO_MAX_PAGES) and hands it to bchfs_read(), which may pull
+ * further contiguous pages from the iterator into the same bio.
+ *
+ * Returns 0 once the reads have been issued, or -ENOMEM if the iterator's
+ * page array cannot be allocated. In the failure case @pages has not been
+ * touched, so the caller still owns every page on the list.
+ */
+int bch2_readpages(struct file *file, struct address_space *mapping,
+		   struct list_head *pages, unsigned nr_pages)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct page *page;
+	struct readpages_iter readpages_iter;
+	int ret;
+
+	/* Allocation failure is recoverable here; don't crash the kernel: */
+	ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+	if (ret)
+		return ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
+				   BTREE_ITER_SLOTS);
+
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+	while ((page = readpage_iter_next(&readpages_iter))) {
+		pgoff_t index = readpages_iter.offset + readpages_iter.idx;
+		unsigned n = min_t(unsigned,
+				   readpages_iter.nr_pages -
+				   readpages_iter.idx,
+				   BIO_MAX_PAGES);
+		struct bch_read_bio *rbio =
+			rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+				  opts);
+
+		/* readpage_iter_next() does not consume the page itself: */
+		readpages_iter.idx++;
+
+		bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
+		rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
+		rbio->bio.bi_end_io = bch2_readpages_end_io;
+		BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
+
+		bchfs_read(&trans, iter, rbio, inode->v.i_ino,
+			   &readpages_iter);
+	}
+
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+	bch2_trans_exit(&trans);
+	kfree(readpages_iter.pages);
+
+	return 0;
+}
+
+/*
+ * Read a single @page of inode @inum using the caller-prepared @rbio.
+ *
+ * Ensures the page has a bch_page_state (__GFP_NOFAIL), sets the bio up as a
+ * synchronous-priority read (REQ_SYNC) of that page, and runs bchfs_read()
+ * in a fresh btree transaction. Completion is signalled through the bio's
+ * end_io callback, which the caller must have set.
+ */
+static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
+			     u64 inum, struct page *page)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+
+	bch2_page_state_create(page, __GFP_NOFAIL);
+
+	bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
+	rbio->bio.bi_iter.bi_sector =
+		(sector_t) page->index << PAGE_SECTOR_SHIFT;
+	BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
+
+	bch2_trans_init(&trans, c, 0, 0);
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
+				   BTREE_ITER_SLOTS);
+
+	bchfs_read(&trans, iter, rbio, inum, NULL);
+
+	bch2_trans_exit(&trans);
+}
+
+/*
+ * Read a single page: allocate a one-vec read bio, route completion to
+ * bch2_readpages_end_io and submit it without waiting. Always returns 0.
+ */
+int bch2_readpage(struct file *file, struct page *page)
+{
+	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+	struct bio *bio = bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read);
+	struct bch_read_bio *rbio = rbio_init(bio, opts);
+
+	rbio->bio.bi_end_io = bch2_readpages_end_io;
+	__bchfs_readpage(c, rbio, inode->v.i_ino, page);
+
+	return 0;
+}
+
+/* bio completion: wake the waiter whose completion is stashed in bi_private */
+static void bch2_read_single_page_end_io(struct bio *bio)
+{
+	struct completion *done = bio->bi_private;
+
+	complete(done);
+}
+
+/*
+ * Read @page in and wait for the read to finish.
+ *
+ * Returns 0 after marking the page uptodate, or the negative errno derived
+ * from the bio's status if the read failed (the page is then left not
+ * uptodate).
+ */
+static int bch2_read_single_page(struct page *page,
+				 struct address_space *mapping)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_read_bio *rbio;
+	int err;
+	DECLARE_COMPLETION_ONSTACK(read_done);
+
+	rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
+			 io_opts(c, &inode->ei_inode));
+	rbio->bio.bi_end_io = bch2_read_single_page_end_io;
+	rbio->bio.bi_private = &read_done;
+
+	__bchfs_readpage(c, rbio, inode->v.i_ino, page);
+	wait_for_completion(&read_done);
+
+	/* pick up the status before dropping our reference on the bio */
+	err = blk_status_to_errno(rbio->bio.bi_status);
+	bio_put(&rbio->bio);
+
+	if (err >= 0) {
+		SetPageUptodate(page);
+		err = 0;
+	}
+
+	return err;
+}
+
+/* writepages: */
+
+/*
+ * State carried across the pages of one writepage(s) call; passed to
+ * __bch2_writepage() as its opaque data pointer.
+ */
+struct bch_writepage_state {
+	/* write currently being assembled, or NULL when none is pending */
+	struct bch_writepage_io	*io;
+	/* IO options of the inode being written, captured at init time */
+	struct bch_io_opts	opts;
+};
+
+/* Build a writepage state for @inode: no pending io, the inode's IO options */
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+								  struct bch_inode_info *inode)
+{
+	struct bch_writepage_state w = {
+		.io	= NULL,
+		.opts	= io_opts(c, &inode->ei_inode),
+	};
+
+	return w;
+}
+
+/*
+ * Closure destructor for a bch_writepage_io: drop the reference on the bio
+ * that the io is embedded around.
+ */
+static void bch2_writepage_io_free(struct closure *cl)
+{
+	struct bch_writepage_io *io =
+		container_of(cl, struct bch_writepage_io, cl);
+	struct bio *bio = &io->op.wbio.bio;
+
+	bio_put(bio);
+}
+
+/*
+ * Continuation run for a bch_writepage_io once its write is done (installed
+ * by bch2_writepage_do_io()).
+ *
+ * On error every page in the bio is flagged, the error is recorded on the
+ * mapping and the per-sector nr_replicas is reset to 0. Then the i_sectors
+ * delta is applied to the inode, each page's write_count is dropped (ending
+ * writeback on the page when it hits zero), and the io is freed via
+ * bch2_writepage_io_free().
+ */
+static void bch2_writepage_io_done(struct closure *cl)
+{
+	struct bch_writepage_io *io = container_of(cl,
+					struct bch_writepage_io, cl);
+	struct bch_fs *c = io->op.c;
+	struct bio *bio = &io->op.wbio.bio;
+	struct bvec_iter_all iter;
+	struct bio_vec *bvec;
+	unsigned i;
+
+	if (io->op.error) {
+		bio_for_each_segment_all(bvec, bio, iter) {
+			struct bch_page_state *s;
+
+			SetPageError(bvec->bv_page);
+			mapping_set_error(bvec->bv_page->mapping, -EIO);
+
+			/* the write failed: forget the replicas we recorded */
+			s = __bch2_page_state(bvec->bv_page);
+			spin_lock(&s->lock);
+			for (i = 0; i < PAGE_SECTORS; i++)
+				s->s[i].nr_replicas = 0;
+			spin_unlock(&s->lock);
+		}
+	}
+
+	/*
+	 * racing with fallocate can cause us to add fewer sectors than
+	 * expected - but we shouldn't add more sectors than expected:
+	 */
+	BUG_ON(io->op.i_sectors_delta > 0);
+
+	/*
+	 * (error (due to going RO) halfway through a page can screw that up
+	 * slightly)
+	 * XXX wtf?
+	   BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
+	 */
+
+	/*
+	 * PageWriteback is effectively our ref on the inode - fixup i_blocks
+	 * before calling end_page_writeback:
+	 */
+	i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+
+	bio_for_each_segment_all(bvec, bio, iter) {
+		struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
+
+		/*
+		 * one write_count reference was taken per segment added in
+		 * __bch2_writepage(); the last one to drop ends writeback
+		 */
+		if (atomic_dec_and_test(&s->write_count))
+			end_page_writeback(bvec->bv_page);
+	}
+
+	closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
+}
+
+/*
+ * Submit the write pending in @w and detach it from @w, so the next page
+ * starts a fresh io. Must only be called with w->io non-NULL.
+ *
+ * bch2_write() is started on the write op's closure with io->cl as its parent,
+ * and bch2_writepage_io_done() is installed as the continuation of io->cl.
+ */
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+	struct bch_writepage_io *io = w->io;
+
+	w->io = NULL;
+	closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
+	continue_at(&io->cl, bch2_writepage_io_done, NULL);
+}
+
+/*
+ * Allocate a new bch_writepage_io into w->io and initialise its write op to
+ * start at @sector of @inode with @nr_replicas replicas.
+ *
+ * No page is added here and an existing w->io is not reused: deciding whether
+ * to keep appending to the current io or to submit it and start a new one is
+ * done by the caller (__bch2_writepage()).
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+				    struct bch_writepage_state *w,
+				    struct bch_inode_info *inode,
+				    u64 sector,
+				    unsigned nr_replicas)
+{
+	struct bch_write_op *op;
+
+	/* the io is embedded around the bio handed out by writepage_bioset */
+	w->io = container_of(bio_alloc_bioset(GFP_NOFS,
+					      BIO_MAX_PAGES,
+					      &c->writepage_bioset),
+			     struct bch_writepage_io, op.wbio.bio);
+
+	closure_init(&w->io->cl, NULL);
+	w->io->inode		= inode;
+
+	op			= &w->io->op;
+	bch2_write_op_init(op, c, w->opts);
+	op->target		= w->opts.foreground_target;
+	op_journal_seq_set(op, &inode->ei_journal_seq);
+	op->nr_replicas		= nr_replicas;
+	op->res.nr_replicas	= nr_replicas;
+	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
+	op->pos			= POS(inode->v.i_ino, sector);
+	op->wbio.bio.bi_iter.bi_sector = sector;
+}
+
+/*
+ * Write out one locked, dirty page; shared by bch2_writepage() and, as the
+ * write_cache_pages() callback, by bch2_writepages().
+ *
+ * @data is the struct bch_writepage_state for this writeback pass. Each
+ * contiguous run of dirty sectors in the page is appended to the pending io in
+ * that state; the pending io is submitted first and a new one allocated
+ * whenever the run cannot be appended to it.
+ *
+ * The page is unlocked on every path. Always returns 0: a failure to get the
+ * disk reservation is reported through the page and mapping error state.
+ */
+static int __bch2_writepage(struct page *page,
+			    struct writeback_control *wbc,
+			    void *data)
+{
+	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_writepage_state *w = data;
+	struct bch_page_state *s, orig;
+	unsigned i, offset, nr_replicas_this_write = U32_MAX;
+	loff_t i_size = i_size_read(&inode->v);
+	pgoff_t end_index = i_size >> PAGE_SHIFT;
+	int ret;
+
+	EBUG_ON(!PageUptodate(page));
+
+	/* Is the page fully inside i_size? */
+	if (page->index < end_index)
+		goto do_io;
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_SIZE - 1);
+	if (page->index > end_index || !offset) {
+		unlock_page(page);
+		return 0;
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_SIZE);
+do_io:
+	s = bch2_page_state_create(page, __GFP_NOFAIL);
+
+	ret = bch2_get_page_disk_reservation(c, inode, page, true);
+	if (ret) {
+		SetPageError(page);
+		mapping_set_error(page->mapping, ret);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* Before unlocking the page, get copy of reservations: */
+	orig = *s;
+
+	/*
+	 * The whole page is written with a single replica count: the minimum,
+	 * over the dirty sectors, of existing plus reserved replicas.
+	 */
+	for (i = 0; i < PAGE_SECTORS; i++) {
+		if (s->s[i].state < SECTOR_DIRTY)
+			continue;
+
+		nr_replicas_this_write =
+			min_t(unsigned, nr_replicas_this_write,
+			      s->s[i].nr_replicas +
+			      s->s[i].replicas_reserved);
+	}
+
+	/*
+	 * Update the live page state for the sectors about to be written;
+	 * the reservations being consumed were saved in @orig above.
+	 */
+	for (i = 0; i < PAGE_SECTORS; i++) {
+		if (s->s[i].state < SECTOR_DIRTY)
+			continue;
+
+		s->s[i].nr_replicas = w->opts.compression
+			? 0 : nr_replicas_this_write;
+
+		s->s[i].replicas_reserved = 0;
+		s->s[i].state = SECTOR_ALLOCATED;
+	}
+
+	/*
+	 * Hold one write_count reference for the duration of this function so
+	 * that writeback cannot be ended while runs are still being queued;
+	 * it is dropped at the bottom.
+	 */
+	BUG_ON(atomic_read(&s->write_count));
+	atomic_set(&s->write_count, 1);
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	unlock_page(page);
+
+	/* from here on @offset counts sectors within the page */
+	offset = 0;
+	while (1) {
+		unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+		u64 sector;
+
+		/* skip sectors that were not dirty: */
+		while (offset < PAGE_SECTORS &&
+		       orig.s[offset].state < SECTOR_DIRTY)
+			offset++;
+
+		if (offset == PAGE_SECTORS)
+			break;
+
+		sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
+
+		/* extend the run over the following dirty sectors: */
+		while (offset + sectors < PAGE_SECTORS &&
+		       orig.s[offset + sectors].state >= SECTOR_DIRTY)
+			sectors++;
+
+		for (i = offset; i < offset + sectors; i++) {
+			reserved_sectors += orig.s[i].replicas_reserved;
+			dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
+		}
+
+		/*
+		 * Submit the pending io if this run can't be appended to it:
+		 * different replica count, bio full or too big, or the run is
+		 * not contiguous with the end of the bio.
+		 */
+		if (w->io &&
+		    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+		     bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
+		     w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
+		     bio_end_sector(&w->io->op.wbio.bio) != sector))
+			bch2_writepage_do_io(w);
+
+		if (!w->io)
+			bch2_writepage_io_alloc(c, w, inode, sector,
+						nr_replicas_this_write);
+
+		/* one reference per bio segment; dropped in the io_done path */
+		atomic_inc(&s->write_count);
+
+		BUG_ON(inode != w->io->inode);
+		BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page,
+				     sectors << 9, offset << 9));
+
+		/* Check for writing past i_size: */
+		WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+			round_up(i_size, block_bytes(c)));
+
+		w->io->op.res.sectors += reserved_sectors;
+		w->io->op.i_sectors_delta -= dirty_sectors;
+		w->io->op.new_i_size = i_size;
+
+		if (wbc->sync_mode == WB_SYNC_ALL)
+			w->io->op.wbio.bio.bi_opf |= REQ_SYNC;
+
+		offset += sectors;
+	}
+
+	/* drop the reference taken before unlocking the page */
+	if (atomic_dec_and_test(&s->write_count))
+		end_page_writeback(page);
+
+	return 0;
+}
+
+/*
+ * Write back the dirty pages of @mapping: feed each page to
+ * __bch2_writepage() under a block plug, then submit whatever io is still
+ * pending. Returns the result of write_cache_pages().
+ */
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_writepage_state state = bch_writepage_state_init(c, inode);
+	struct blk_plug plug;
+	int err;
+
+	blk_start_plug(&plug);
+
+	err = write_cache_pages(mapping, wbc, __bch2_writepage, &state);
+
+	/* flush the io left over from the last page(s) */
+	if (state.io)
+		bch2_writepage_do_io(&state);
+
+	blk_finish_plug(&plug);
+
+	return err;
+}
+
+/*
+ * Write back a single page: run __bch2_writepage() on it with a fresh
+ * writepage state and submit the io it left pending, if any.
+ */
+int bch2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *vinode = page->mapping->host;
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_writepage_state state =
+		bch_writepage_state_init(c, to_bch_ei(vinode));
+	int err = __bch2_writepage(page, wbc, &state);
+
+	if (state.io)
+		bch2_writepage_do_io(&state);
+
+	return err;
+}
+
+/* buffered writes: */
+
+/*
+ * Prepare the page at @pos for a buffered write of @len bytes.
+ *
+ * On success returns 0 with the locked page in *@pagep, a heap allocated page
+ * reservation in *@fsdata and the pagecache add lock held; bch2_write_end()
+ * releases all of these. On failure returns a negative errno with everything
+ * released and *@fsdata set to NULL.
+ *
+ * The page is only read in when that cannot be avoided: not when it is
+ * already uptodate, when the whole page is being overwritten, or when the
+ * part not being written lies beyond i_size and can simply be zeroed.
+ */
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+		     loff_t pos, unsigned len, unsigned flags,
+		     struct page **pagep, void **fsdata)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_page_reservation *res;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	unsigned offset = pos & (PAGE_SIZE - 1);
+	struct page *page;
+	int ret = -ENOMEM;
+
+	res = kmalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	bch2_page_reservation_init(c, inode, res);
+	*fsdata = res;
+
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+	/* ret is still -ENOMEM if this fails */
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		goto err_unlock;
+
+	if (PageUptodate(page))
+		goto out;
+
+	/* If we're writing entire page, don't need to read it in first: */
+	if (len == PAGE_SIZE)
+		goto out;
+
+	/* write starts at the page boundary and runs to or past i_size: */
+	if (!offset && pos + len >= inode->v.i_size) {
+		zero_user_segment(page, len, PAGE_SIZE);
+		flush_dcache_page(page);
+		goto out;
+	}
+
+	/* page is beyond i_size: zero the parts not being written */
+	if (index > inode->v.i_size >> PAGE_SHIFT) {
+		zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
+		flush_dcache_page(page);
+		goto out;
+	}
+readpage:
+	ret = bch2_read_single_page(page, mapping);
+	if (ret)
+		goto err;
+out:
+	ret = bch2_page_reservation_get(c, inode, page, res,
+					offset, len, true);
+	if (ret) {
+		if (!PageUptodate(page)) {
+			/*
+			 * If the page hasn't been read in, we won't know if we
+			 * actually need a reservation - we don't actually need
+			 * to read here, we just need to check if the page is
+			 * fully backed by uncompressed data:
+			 */
+			goto readpage;
+		}
+
+		/*
+		 * The page is uptodate (possibly because we just read it in
+		 * above), so the retry loop ends here.
+		 */
+		goto err;
+	}
+
+	*pagep = page;
+	return 0;
+err:
+	unlock_page(page);
+	put_page(page);
+	*pagep = NULL;
+err_unlock:
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+	kfree(res);
+	*fsdata = NULL;
+	return ret;
+}
+
+/*
+ * Finish a buffered write started by bch2_write_begin(): @copied of the @len
+ * bytes requested were copied into @page at @pos.
+ *
+ * Extends i_size if needed, dirties the written range, and releases the page,
+ * the pagecache add lock and the reservation passed in @fsdata. Returns the
+ * number of bytes accepted, which is 0 if a short copy hit a page that was
+ * not uptodate.
+ */
+int bch2_write_end(struct file *file, struct address_space *mapping,
+		   loff_t pos, unsigned len, unsigned copied,
+		   struct page *page, void *fsdata)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_page_reservation *res = fsdata;
+	unsigned pg_offset = pos & (PAGE_SIZE - 1);
+	loff_t write_end;
+
+	lockdep_assert_held(&inode->v.i_rwsem);
+
+	if (unlikely(copied < len && !PageUptodate(page))) {
+		/*
+		 * A short copy into a page that was never read in: reading it
+		 * now would clobber what was copied, so throw the partial
+		 * write away and let userspace retry it.
+		 */
+		zero_user(page, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		copied = 0;
+	}
+
+	write_end = pos + copied;
+
+	spin_lock(&inode->v.i_lock);
+	if (write_end > inode->v.i_size)
+		i_size_write(&inode->v, write_end);
+	spin_unlock(&inode->v.i_lock);
+
+	if (!copied)
+		goto release;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	bch2_set_page_dirty(c, inode, page, res, pg_offset, copied);
+
+	inode->ei_last_dirtied = (unsigned long) current;
+release:
+	unlock_page(page);
+	put_page(page);
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+	bch2_page_reservation_put(c, inode, res);
+	kfree(res);
+
+	return copied;
+}
+
+/* Maximum number of pages one __bch2_buffered_write() call operates on */
+#define WRITE_BATCH_PAGES	32
+
+/*
+ * Copy up to @len bytes from @iter into the page cache of @inode at @pos,
+ * spanning at most WRITE_BATCH_PAGES pages.
+ *
+ * Steps: grab and lock the pages; read in (or zero) the partially written
+ * first and last page; take page reservations; copy from userspace with
+ * faults disabled; extend i_size; dirty and release the pages written.
+ *
+ * Returns the number of bytes written, which may be short (including 0 if
+ * nothing could be copied), or a negative errno if nothing was written
+ * because of an error. On return @iter has been advanced by exactly the
+ * number of bytes reported as written.
+ */
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+				 struct address_space *mapping,
+				 struct iov_iter *iter,
+				 loff_t pos, unsigned len)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct page *pages[WRITE_BATCH_PAGES];
+	struct bch2_page_reservation res;
+	unsigned long index = pos >> PAGE_SHIFT;
+	unsigned offset = pos & (PAGE_SIZE - 1);
+	unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+	unsigned i, reserved = 0, set_dirty = 0;
+	unsigned copied = 0, nr_pages_copied = 0;
+	int ret = 0;
+
+	BUG_ON(!len);
+	BUG_ON(nr_pages > ARRAY_SIZE(pages));
+
+	bch2_page_reservation_init(c, inode, &res);
+
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
+		if (!pages[i]) {
+			/* make do with the pages we did get, if any: */
+			nr_pages = i;
+			if (!i) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			len = min_t(unsigned, len,
+				    nr_pages * PAGE_SIZE - offset);
+			break;
+		}
+	}
+
+	/* first page only partially overwritten: need its current contents */
+	if (offset && !PageUptodate(pages[0])) {
+		ret = bch2_read_single_page(pages[0], mapping);
+		if (ret)
+			goto out;
+	}
+
+	/* same for the last page, unless it lies entirely beyond i_size: */
+	if ((pos + len) & (PAGE_SIZE - 1) &&
+	    !PageUptodate(pages[nr_pages - 1])) {
+		if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
+			zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
+		} else {
+			ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
+			if (ret)
+				goto out;
+		}
+	}
+
+	while (reserved < len) {
+		struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+		unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
+		unsigned pg_len = min_t(unsigned, len - reserved,
+					PAGE_SIZE - pg_offset);
+retry_reservation:
+		ret = bch2_page_reservation_get(c, inode, page, &res,
+						pg_offset, pg_len, true);
+
+		/*
+		 * The reservation may only have been needed because the page
+		 * state is unknown; read the page in and try once more:
+		 */
+		if (ret && !PageUptodate(page)) {
+			ret = bch2_read_single_page(page, mapping);
+			if (!ret)
+				goto retry_reservation;
+		}
+
+		if (ret)
+			goto out;
+
+		reserved += pg_len;
+	}
+
+	if (mapping_writably_mapped(mapping))
+		for (i = 0; i < nr_pages; i++)
+			flush_dcache_page(pages[i]);
+
+	while (copied < len) {
+		struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
+		unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
+		unsigned pg_len = min_t(unsigned, len - copied,
+					PAGE_SIZE - pg_offset);
+		unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
+						iter, pg_offset, pg_len);
+
+		if (!pg_copied)
+			break;
+
+		flush_dcache_page(page);
+		iov_iter_advance(iter, pg_copied);
+		copied += pg_copied;
+	}
+
+	if (!copied)
+		goto out;
+
+	if (copied < len &&
+	    ((offset + copied) & (PAGE_SIZE - 1))) {
+		struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
+
+		if (!PageUptodate(page)) {
+			/*
+			 * The copy stopped part way through a page that was
+			 * never read in, so that page can't be marked
+			 * uptodate: throw away what was copied into it.
+			 *
+			 * @iter was already advanced past those bytes above;
+			 * rewind it so that it stays in step with the byte
+			 * count we return and the caller retries them rather
+			 * than silently skipping them.
+			 */
+			unsigned discard = (offset + copied) & (PAGE_SIZE - 1);
+
+			zero_user(page, 0, PAGE_SIZE);
+			iov_iter_revert(iter, discard);
+			copied -= discard;
+		}
+	}
+
+	spin_lock(&inode->v.i_lock);
+	if (pos + copied > inode->v.i_size)
+		i_size_write(&inode->v, pos + copied);
+	spin_unlock(&inode->v.i_lock);
+
+	while (set_dirty < copied) {
+		struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
+		unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
+		unsigned pg_len = min_t(unsigned, copied - set_dirty,
+					PAGE_SIZE - pg_offset);
+
+		if (!PageUptodate(page))
+			SetPageUptodate(page);
+
+		bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
+		unlock_page(page);
+		put_page(page);
+
+		set_dirty += pg_len;
+	}
+
+	/* pages below this index were released by the loop above */
+	nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
+	inode->ei_last_dirtied = (unsigned long) current;
+out:
+	for (i = nr_pages_copied; i < nr_pages; i++) {
+		unlock_page(pages[i]);
+		put_page(pages[i]);
+	}
+
+	bch2_page_reservation_put(c, inode, &res);
+
+	return copied ?: ret;
+}
+
+/*
+ * Buffered write loop: write the contents of @iter at iocb->ki_pos in
+ * batches of up to WRITE_BATCH_PAGES pages, holding the pagecache add lock
+ * throughout.
+ *
+ * Returns the number of bytes written if any were, otherwise the error (or
+ * 0). iocb->ki_pos is not updated here; the caller does that.
+ */
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	loff_t pos = iocb->ki_pos;
+	ssize_t written = 0;
+	int ret = 0;
+
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+	do {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		/* never more than one batch of pages, counting the offset: */
+		unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
+			      PAGE_SIZE * WRITE_BATCH_PAGES - offset);
+again:
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+			/* retry with at most the rest of the current page */
+			bytes = min_t(unsigned long, iov_iter_count(iter),
+				      PAGE_SIZE - offset);
+
+			if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+				ret = -EFAULT;
+				break;
+			}
+		}
+
+		if (unlikely(fatal_signal_pending(current))) {
+			ret = -EINTR;
+			break;
+		}
+
+		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+		if (unlikely(ret < 0))
+			break;
+
+		cond_resched();
+
+		if (unlikely(ret == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_SIZE - offset,
+				      iov_iter_single_seg_count(iter));
+			goto again;
+		}
+		pos += ret;
+		written += ret;
+
+		balance_dirty_pages_ratelimited(mapping);
+	} while (iov_iter_count(iter));
+
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+	return written ? written : ret;
+}
+
+/* O_DIRECT reads */
+
+/*
+ * Closure function installed by bch2_direct_IO_read() for async requests:
+ * report dio->ret to the kiocb, then hand the first bio (the one the dio_read
+ * is embedded around) to bio_check_pages_dirty().
+ */
+static void bch2_dio_read_complete(struct closure *cl)
+{
+	struct dio_read *dio = container_of(cl, struct dio_read, cl);
+
+	dio->req->ki_complete(dio->req, dio->ret, 0);
+	bio_check_pages_dirty(&dio->rbio.bio);	/* transfers ownership */
+}
+
+/*
+ * Completion for the bios of a direct read: record a failure in the dio_read
+ * found in bi_private and drop one reference on its closure.
+ */
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+	struct dio_read *dio = bio->bi_private;
+	blk_status_t status = bio->bi_status;
+
+	if (status)
+		dio->ret = blk_status_to_errno(status);
+
+	closure_put(&dio->cl);
+}
+
+/*
+ * Completion for the additional bios allocated by bch2_direct_IO_read() after
+ * the first one: same accounting as bch2_direct_IO_read_endio(), and the bio
+ * itself is handed to bio_check_pages_dirty() here.
+ */
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+	bch2_direct_IO_read_endio(bio);
+	bio_check_pages_dirty(bio);	/* transfers ownership */
+}
+
+/*
+ * O_DIRECT read of @iter at req->ki_pos.
+ *
+ * Offset and length must be multiples of the filesystem block size, otherwise
+ * -EINVAL is returned. The read is clamped to i_size (rounded up to a whole
+ * block for the IO itself). The first bio comes from dio_read_bioset and has
+ * the struct dio_read embedded around it; if @iter does not fit in one bio,
+ * further bios are allocated from bio_read.
+ *
+ * For a sync kiocb, waits and returns the byte count or a negative errno; for
+ * an async one, returns -EIOCBQUEUED and completion is reported by
+ * bch2_dio_read_complete(). Returns 0 when reading at or past i_size.
+ */
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+	struct file *file = req->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+	struct dio_read *dio;
+	struct bio *bio;
+	loff_t offset = req->ki_pos;
+	bool sync = is_sync_kiocb(req);
+	size_t shorten;
+	ssize_t ret;
+
+	if ((offset|iter->count) & (block_bytes(c) - 1))
+		return -EINVAL;
+
+	/* bytes that will be reported to the caller: clamp to i_size */
+	ret = min_t(loff_t, iter->count,
+		    max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+	if (!ret)
+		return ret;
+
+	/*
+	 * Temporarily trim the iterator to the block-rounded length actually
+	 * read; the trimmed amount is added back after the bios are issued.
+	 */
+	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+	iter->count -= shorten;
+
+	bio = bio_alloc_bioset(GFP_KERNEL,
+			       iov_iter_npages(iter, BIO_MAX_PAGES),
+			       &c->dio_read_bioset);
+
+	bio->bi_end_io = bch2_direct_IO_read_endio;
+
+	dio = container_of(bio, struct dio_read, rbio.bio);
+	closure_init(&dio->cl, NULL);
+
+	/*
+	 * this is a _really_ horrible hack just to avoid an atomic sub at the
+	 * end:
+	 */
+	if (!sync) {
+		set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+		atomic_set(&dio->cl.remaining,
+			   CLOSURE_REMAINING_INITIALIZER -
+			   CLOSURE_RUNNING +
+			   CLOSURE_DESTRUCTOR);
+	} else {
+		atomic_set(&dio->cl.remaining,
+			   CLOSURE_REMAINING_INITIALIZER + 1);
+	}
+
+	dio->req	= req;
+	dio->ret	= ret;
+
+	/*
+	 * The first bio is already allocated: jump into the loop body past
+	 * the allocation that the later iterations need.
+	 */
+	goto start;
+	while (iter->count) {
+		bio = bio_alloc_bioset(GFP_KERNEL,
+				       iov_iter_npages(iter, BIO_MAX_PAGES),
+				       &c->bio_read);
+		bio->bi_end_io		= bch2_direct_IO_read_split_endio;
+start:
+		bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+		bio->bi_iter.bi_sector	= offset >> 9;
+		bio->bi_private		= dio;
+
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (ret < 0) {
+			/* XXX: fault inject this path */
+			bio->bi_status = BLK_STS_RESOURCE;
+			bio_endio(bio);
+			break;
+		}
+
+		offset += bio->bi_iter.bi_size;
+		bio_set_pages_dirty(bio);
+
+		/* another bio will follow: take a closure ref for it now */
+		if (iter->count)
+			closure_get(&dio->cl);
+
+		bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+	}
+
+	iter->count += shorten;
+
+	if (sync) {
+		closure_sync(&dio->cl);
+		closure_debug_destroy(&dio->cl);
+		ret = dio->ret;
+		bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+		return ret;
+	} else {
+		return -EIOCBQUEUED;
+	}
+}
+
+/*
+ * read_iter: buffered reads go through generic_file_read_iter() under the
+ * pagecache add lock; O_DIRECT reads first write back and wait on the range
+ * being read, then go through bch2_direct_IO_read() under a block plug.
+ */
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct address_space *mapping = file->f_mapping;
+	size_t count = iov_iter_count(iter);
+	struct blk_plug plug;
+	ssize_t ret;
+
+	if (!count)
+		return 0; /* skip atime */
+
+	if (!(iocb->ki_flags & IOCB_DIRECT)) {
+		bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+		ret = generic_file_read_iter(iocb, iter);
+		bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+		return ret;
+	}
+
+	ret = filemap_write_and_wait_range(mapping,
+					   iocb->ki_pos,
+					   iocb->ki_pos + count - 1);
+	if (ret < 0)
+		return ret;
+
+	file_accessed(file);
+
+	blk_start_plug(&plug);
+	ret = bch2_direct_IO_read(iocb, iter);
+	blk_finish_plug(&plug);
+
+	if (ret >= 0)
+		iocb->ki_pos += ret;
+
+	return ret;
+}
+
+/* O_DIRECT writes */
+
+/*
+ * Core of O_DIRECT writes: repeatedly pin user pages into the dio's bio and
+ * submit them with bch2_write() until the iterator is empty or an error
+ * occurs, then tear the dio down.
+ *
+ * This function is entered in two ways: from bch2_direct_write() with
+ * dio->loop false, and - for async dios - again from
+ * bch2_dio_write_loop_async() each time a write completes, in which case
+ * dio->loop is true and execution resumes at the "loop" label to do the
+ * post-write accounting.
+ *
+ * Returns bytes written or a negative errno for sync dios; for async dios
+ * returns -EIOCBQUEUED, with the final result delivered via ki_complete().
+ */
+static long bch2_dio_write_loop(struct dio_write *dio)
+{
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	struct bch_fs *c = dio->op.c;
+	struct kiocb *req = dio->req;
+	struct address_space *mapping = req->ki_filp->f_mapping;
+	struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
+	struct bio *bio = &dio->op.wbio.bio;
+	struct bvec_iter_all iter;
+	struct bio_vec *bv;
+	unsigned unaligned;
+	u64 new_i_size;
+	bool sync;
+	long ret;
+
+	/* re-entered after a write completed: finish that iteration first */
+	if (dio->loop)
+		goto loop;
+
+	while (1) {
+		/*
+		 * When running in a kernel thread, borrow the submitter's mm
+		 * (saved in dio->mm) while pinning the user pages:
+		 */
+		if (kthread)
+			kthread_use_mm(dio->mm);
+		BUG_ON(current->faults_disabled_mapping);
+		current->faults_disabled_mapping = mapping;
+
+		ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+		current->faults_disabled_mapping = NULL;
+		if (kthread)
+			kthread_unuse_mm(dio->mm);
+
+		if (unlikely(ret < 0))
+			goto err;
+
+		/* only write whole blocks; give the remainder back: */
+		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+		bio->bi_iter.bi_size -= unaligned;
+		iov_iter_revert(&dio->iter, unaligned);
+
+		if (!bio->bi_iter.bi_size) {
+			/*
+			 * bio_iov_iter_get_pages was only able to get <
+			 * blocksize worth of pages:
+			 */
+			bio_for_each_segment_all(bv, bio, iter)
+				put_page(bv->bv_page);
+			ret = -EFAULT;
+			goto err;
+		}
+
+		dio->op.pos = POS(inode->v.i_ino,
+				  (req->ki_pos >> 9) + dio->op.written);
+
+		task_io_account_write(bio->bi_iter.bi_size);
+
+		/*
+		 * First pass of an async dio with data left over: take a
+		 * private copy of the iovec array (inline if it fits), since
+		 * the iterator will be used again after this call returns.
+		 * If the copy can't be allocated, fall back to a sync write.
+		 */
+		if (!dio->sync && !dio->loop && dio->iter.count) {
+			struct iovec *iov = dio->inline_vecs;
+
+			if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+				iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
+					      GFP_KERNEL);
+				if (unlikely(!iov)) {
+					dio->sync = true;
+					goto do_io;
+				}
+
+				dio->free_iov = true;
+			}
+
+			memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
+			dio->iter.iov = iov;
+		}
+do_io:
+		dio->loop = true;
+		closure_call(&dio->op.cl, bch2_write, NULL, NULL);
+
+		if (dio->sync)
+			wait_for_completion(&dio->done);
+		else
+			return -EIOCBQUEUED;
+loop:
+		i_sectors_acct(c, inode, &dio->quota_res,
+			       dio->op.i_sectors_delta);
+		dio->op.i_sectors_delta = 0;
+
+		/* dio->op.written is in 512 byte sectors */
+		new_i_size = req->ki_pos + ((u64) dio->op.written << 9);
+
+		spin_lock(&inode->v.i_lock);
+		if (new_i_size > inode->v.i_size)
+			i_size_write(&inode->v, new_i_size);
+		spin_unlock(&inode->v.i_lock);
+
+		/* release the user pages pinned for this iteration: */
+		bio_for_each_segment_all(bv, bio, iter)
+			put_page(bv->bv_page);
+		if (!dio->iter.count || dio->op.error)
+			break;
+
+		bio_reset(bio);
+		reinit_completion(&dio->done);
+	}
+
+	ret = dio->op.error ?: ((long) dio->op.written << 9);
+err:
+	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+	bch2_disk_reservation_put(c, &dio->op.res);
+	bch2_quota_reservation_put(c, inode, &dio->quota_res);
+
+	if (dio->free_iov)
+		kfree(dio->iter.iov);
+
+	/* @dio is embedded around @bio: read ->sync before the put below */
+	sync = dio->sync;
+	bio_put(bio);
+
+	/* inode->i_dio_count is our ref on inode and thus bch_fs */
+	inode_dio_end(&inode->v);
+
+	if (!sync) {
+		req->ki_complete(req, ret, 0);
+		ret = -EIOCBQUEUED;
+	}
+	return ret;
+}
+
+/*
+ * end_io hook of a dio write op: a sync dio just wakes the waiter blocked in
+ * bch2_dio_write_loop(); an async dio re-enters the loop from here.
+ */
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
+{
+	struct dio_write *dio = container_of(op, struct dio_write, op);
+
+	if (!dio->sync) {
+		bch2_dio_write_loop(dio);
+		return;
+	}
+
+	complete(&dio->done);
+}
+
+/*
+ * O_DIRECT write of @iter at req->ki_pos.
+ *
+ * After the generic write checks, the position and length must both be
+ * multiples of the filesystem block size, otherwise -EINVAL is returned.
+ * A dio_write is then set up (embedded around a bio from dio_write_bioset),
+ * quota and disk reservations are taken, the page cache over the range is
+ * written back and invalidated, and the write is driven by
+ * bch2_dio_write_loop().
+ *
+ * The inode lock is kept across the whole write only when it extends the
+ * file; extending writes are also always done synchronously.
+ *
+ * Returns bytes written (req->ki_pos is advanced by that amount),
+ * -EIOCBQUEUED for a queued async write, or a negative errno.
+ */
+static noinline
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+	struct file *file = req->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+	struct dio_write *dio;
+	struct bio *bio;
+	bool locked = true, extending;
+	ssize_t ret;
+
+	prefetch(&c->opts);
+	prefetch((void *) &c->opts + 64);
+	prefetch(&inode->ei_inode);
+	prefetch((void *) &inode->ei_inode + 64);
+
+	inode_lock(&inode->v);
+
+	ret = generic_write_checks(req, iter);
+	if (unlikely(ret <= 0))
+		goto err;
+
+	ret = file_remove_privs(file);
+	if (unlikely(ret))
+		goto err;
+
+	ret = file_update_time(file);
+	if (unlikely(ret))
+		goto err;
+
+	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+		/*
+		 * ret is 0 at this point; without setting an error a
+		 * misaligned write would be reported as a successful write of
+		 * zero bytes. Same error as the direct read path:
+		 */
+		ret = -EINVAL;
+		goto err;
+	}
+
+	inode_dio_begin(&inode->v);
+	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+	/* only writes that grow the file need to keep the inode locked: */
+	extending = req->ki_pos + iter->count > inode->v.i_size;
+	if (!extending) {
+		inode_unlock(&inode->v);
+		locked = false;
+	}
+
+	bio = bio_alloc_bioset(GFP_KERNEL,
+			       iov_iter_npages(iter, BIO_MAX_PAGES),
+			       &c->dio_write_bioset);
+	dio = container_of(bio, struct dio_write, op.wbio.bio);
+	init_completion(&dio->done);
+	dio->req		= req;
+	dio->mm			= current->mm;
+	dio->loop		= false;
+	dio->sync		= is_sync_kiocb(req) || extending;
+	dio->free_iov		= false;
+	dio->quota_res.sectors	= 0;
+	dio->iter		= *iter;
+
+	bch2_write_op_init(&dio->op, c, opts);
+	dio->op.end_io		= bch2_dio_write_loop_async;
+	dio->op.target		= opts.foreground_target;
+	op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
+	dio->op.write_point	= writepoint_hashed((unsigned long) current);
+	dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION;
+
+	if ((req->ki_flags & IOCB_DSYNC) &&
+	    !c->opts.journal_flush_disabled)
+		dio->op.flags |= BCH_WRITE_FLUSH;
+
+	ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+					 iter->count >> 9, true);
+	if (unlikely(ret))
+		goto err_put_bio;
+
+	dio->op.nr_replicas	= dio->op.opts.data_replicas;
+
+	/*
+	 * Failing to get a disk reservation is only fatal if the range isn't
+	 * already fully allocated with enough replicas:
+	 */
+	ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9,
+					dio->op.opts.data_replicas, 0);
+	if (unlikely(ret) &&
+	    !bch2_check_range_allocated(c, POS(inode->v.i_ino,
+					       req->ki_pos >> 9),
+					iter->count >> 9,
+					dio->op.opts.data_replicas))
+		goto err_put_bio;
+
+	ret = write_invalidate_inode_pages_range(mapping,
+					req->ki_pos,
+					req->ki_pos + iter->count - 1);
+	if (unlikely(ret))
+		goto err_put_bio;
+
+	/* from here on bch2_dio_write_loop() owns and releases the dio */
+	ret = bch2_dio_write_loop(dio);
+err:
+	if (locked)
+		inode_unlock(&inode->v);
+	if (ret > 0)
+		req->ki_pos += ret;
+	return ret;
+err_put_bio:
+	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+	bch2_disk_reservation_put(c, &dio->op.res);
+	bch2_quota_reservation_put(c, inode, &dio->quota_res);
+	bio_put(bio);
+	inode_dio_end(&inode->v);
+	goto err;
+}
+
+/*
+ * write_iter: O_DIRECT writes are handed to bch2_direct_write(); everything
+ * else is a buffered write done under the inode lock, followed by
+ * generic_write_sync() when any bytes were written.
+ */
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	ssize_t ret;
+
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return bch2_direct_write(iocb, from);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(&inode->v);
+	inode_lock(&inode->v);
+
+	ret = generic_write_checks(iocb, from);
+	if (ret > 0) {
+		ret = file_remove_privs(file);
+		if (!ret)
+			ret = file_update_time(file);
+
+		if (!ret) {
+			ret = bch2_buffered_write(iocb, from);
+			if (likely(ret > 0))
+				iocb->ki_pos += ret;
+		}
+	}
+
+	inode_unlock(&inode->v);
+	current->backing_dev_info = NULL;
+
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+
+	return ret;
+}
+
+/* fsync: */
+
+/*
+ * fsync/fdatasync: write back and wait on the data range, sync the inode's
+ * metadata (skipped for datasync when I_DIRTY_DATASYNC is not set), flush the
+ * journal up to the inode's journal sequence number unless journal flushing
+ * is disabled, and finally pick up any writeback error recorded on the file.
+ */
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	int ret, wb_err;
+
+	ret = file_write_and_wait_range(file, start, end);
+	if (ret)
+		return ret;
+
+	if (!datasync || (inode->v.i_state & I_DIRTY_DATASYNC)) {
+		ret = sync_inode_metadata(&inode->v, 1);
+		if (ret)
+			return ret;
+	}
+
+	if (!c->opts.journal_flush_disabled)
+		ret = bch2_journal_flush_seq(&c->journal,
+					     inode->ei_journal_seq);
+
+	wb_err = file_check_and_advance_wb_err(file);
+
+	return ret ? ret : wb_err;
+}
+
+/* truncate: */
+
+/*
+ * Check whether the extents btree has any data extent starting before @end,
+ * scanning from @start.
+ *
+ * Returns 1 if a data extent was found, 0 if not, or a nonzero error from the
+ * iteration or from bch2_trans_exit().
+ */
+static inline int range_has_data(struct bch_fs *c,
+				  struct bpos start,
+				  struct bpos end)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) {
+		/* this key starts at or after @end: stop looking */
+		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+			break;
+
+		if (bkey_extent_is_data(k.k)) {
+			ret = 1;
+			break;
+		}
+	}
+
+	/* an error from tearing down the transaction takes precedence */
+	return bch2_trans_exit(&trans) ?: ret;
+}
+
+/*
+ * Zero the part of page @index that falls inside the byte range
+ * [@start, @end) when that range begins or ends part way through the page,
+ * and redirty the page.
+ *
+ * Nothing is done (and 0 is returned) if the range covers the page on page
+ * boundaries, if the page lies at or above i_size, or if the page is not in
+ * the page cache and has no data on disk. Otherwise the page is read in if
+ * necessary, the fully covered blocks are marked unallocated in the page
+ * state, and the range is zeroed.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __bch2_truncate_page(struct bch_inode_info *inode,
+				pgoff_t index, loff_t start, loff_t end)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct address_space *mapping = inode->v.i_mapping;
+	struct bch_page_state *s;
+	unsigned start_offset = start & (PAGE_SIZE - 1);
+	/* in 1..PAGE_SIZE: an @end on a page boundary maps to PAGE_SIZE */
+	unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+	unsigned i;
+	struct page *page;
+	int ret = 0;
+
+	/* Page boundary? Nothing to do */
+	if (!((index == start >> PAGE_SHIFT && start_offset) ||
+	      (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
+		return 0;
+
+	/* Above i_size? */
+	if (index << PAGE_SHIFT >= inode->v.i_size)
+		return 0;
+
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		/*
+		 * XXX: we're doing two index lookups when we end up reading the
+		 * page
+		 */
+		ret = range_has_data(c,
+				POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
+				POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
+		/* error, or nothing on disk under this page: */
+		if (ret <= 0)
+			return ret;
+
+		page = find_or_create_page(mapping, index, GFP_KERNEL);
+		if (unlikely(!page)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	s = bch2_page_state_create(page, 0);
+	if (!s) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	if (!PageUptodate(page)) {
+		ret = bch2_read_single_page(page, mapping);
+		if (ret)
+			goto unlock;
+	}
+
+	/* clamp the range to this page: */
+	if (index != start >> PAGE_SHIFT)
+		start_offset = 0;
+	if (index != end >> PAGE_SHIFT)
+		end_offset = PAGE_SIZE;
+
+	/* only blocks lying entirely inside the range are deallocated: */
+	for (i = round_up(start_offset, block_bytes(c)) >> 9;
+	     i < round_down(end_offset, block_bytes(c)) >> 9;
+	     i++) {
+		s->s[i].nr_replicas	= 0;
+		s->s[i].state		= SECTOR_UNALLOCATED;
+	}
+
+	zero_user_segment(page, start_offset, end_offset);
+
+	/*
+	 * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+	 *
+	 * XXX: because we aren't currently tracking whether the page has actual
+	 * data in it (vs. just 0s, or only partially written) this wrong. ick.
+	 */
+	ret = bch2_get_page_disk_reservation(c, inode, page, false);
+	BUG_ON(ret);
+
+	__set_page_dirty_nobuffers(page);
+unlock:
+	unlock_page(page);
+	put_page(page);
+out:
+	return ret;
+}
+
+/*
+ * Truncate helper: hand the byte range from @from up to the next page
+ * boundary to __bch2_truncate_page() for the page that contains @from.
+ */
+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
+{
+	loff_t index	= from >> PAGE_SHIFT;
+	loff_t page_end	= round_up(from, PAGE_SIZE);
+
+	return __bch2_truncate_page(inode, index, from, page_end);
+}
+
+/*
+ * Grow @inode to iattr->ia_size.
+ *
+ * Dirty pagecache from the on-disk i_size (@inode_u->bi_size) onwards is
+ * written back first; then the in-memory size and attributes are updated and
+ * the new size is written to the inode.
+ *
+ * Returns 0 on success, or the error from writeback or the inode update.
+ */
+static int bch2_extend(struct bch_inode_info *inode,
+		       struct bch_inode_unpacked *inode_u,
+		       struct iattr *iattr)
+{
+	struct inode *vinode = &inode->v;
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	int err;
+
+	/* Sync appends - this has to be done _before_ extending i_size: */
+	err = filemap_write_and_wait_range(vinode->i_mapping,
+					   inode_u->bi_size, S64_MAX);
+	if (err)
+		return err;
+
+	truncate_setsize(vinode, iattr->ia_size);
+	setattr_copy(vinode, iattr);
+
+	mutex_lock(&inode->ei_update_lock);
+	err = bch2_write_inode_size(c, inode, vinode->i_size,
+				    ATTR_MTIME|ATTR_CTIME);
+	mutex_unlock(&inode->ei_update_lock);
+
+	return err;
+}
+
+/*
+ * Inode update callback run once a truncate has finished: stamps ctime and
+ * mtime with the current time and clears BCH_INODE_I_SIZE_DIRTY. @p is unused.
+ */
+static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
+				   struct bch_inode_unpacked *bi,
+				   void *p)
+{
+	bi->bi_ctime = bch2_current_time(inode->v.i_sb->s_fs_info);
+	bi->bi_mtime = bi->bi_ctime;
+	bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+
+	return 0;
+}
+
+/*
+ * Inode update callback run at the start of a truncate: records the new size
+ * (@p points to a u64) and sets BCH_INODE_I_SIZE_DIRTY.
+ */
+static int bch2_truncate_start_fn(struct bch_inode_info *inode,
+				  struct bch_inode_unpacked *bi, void *p)
+{
+	const u64 *size = p;
+
+	bi->bi_size = *size;
+	bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
+
+	return 0;
+}
+
+/*
+ * Handle a size-changing setattr on @inode: set the file size to
+ * iattr->ia_size.
+ *
+ * Requests that grow the file past the current in-memory i_size are handed to
+ * bch2_extend(). Otherwise the new size is first recorded in the inode with
+ * BCH_INODE_I_SIZE_DIRTY set (bch2_truncate_start_fn), extents past the new
+ * size are dropped with bch2_fpunch(), and the flag is cleared again once that
+ * has finished (bch2_truncate_finish_fn).
+ *
+ * Returns 0 on success or a nonzero error from one of the steps.
+ */
+int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct address_space *mapping = inode->v.i_mapping;
+	struct bch_inode_unpacked inode_u;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	u64 new_i_size = iattr->ia_size;
+	s64 i_sectors_delta = 0;
+	int ret = 0;
+
+	inode_dio_wait(&inode->v);
+	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+	/*
+	 * fetch current on disk i_size: inode is locked, i_size can only
+	 * increase underneath us:
+	 */
+	bch2_trans_init(&trans, c, 0, 0);
+	iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
+	ret = PTR_ERR_OR_ZERO(iter);
+	bch2_trans_exit(&trans);
+
+	if (ret)
+		goto err;
+
+	BUG_ON(inode->v.i_size < inode_u.bi_size);
+
+	/* Growing past the in-memory size is a separate, simpler path: */
+	if (iattr->ia_size > inode->v.i_size) {
+		ret = bch2_extend(inode, &inode_u, iattr);
+		goto err;
+	}
+
+	/* Deal with the partial page the new size falls in: */
+	ret = bch2_truncate_page(inode, iattr->ia_size);
+	if (unlikely(ret))
+		goto err;
+
+	/*
+	 * When extending, we're going to write the new i_size to disk
+	 * immediately so we need to flush anything above the current on disk
+	 * i_size first:
+	 *
+	 * Also, when extending we need to flush the page that i_size currently
+	 * straddles - if it's mapped to userspace, we need to ensure that
+	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
+	 * again to allocate the part of the page that was extended.
+	 *
+	 * Note: on this path "extending" is relative to the on disk i_size
+	 * (inode_u.bi_size); the new size is never above the in-memory i_size
+	 * here. When the new size is not above the on disk size, only the page
+	 * it falls in is flushed, and only if it is not page aligned.
+	 */
+	if (iattr->ia_size > inode_u.bi_size)
+		ret = filemap_write_and_wait_range(mapping,
+				inode_u.bi_size,
+				iattr->ia_size - 1);
+	else if (iattr->ia_size & (PAGE_SIZE - 1))
+		ret = filemap_write_and_wait_range(mapping,
+				round_down(iattr->ia_size, PAGE_SIZE),
+				iattr->ia_size - 1);
+	if (ret)
+		goto err;
+
+	/* Record the new size and mark i_size dirty before dropping extents: */
+	mutex_lock(&inode->ei_update_lock);
+	ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
+			       &new_i_size, 0);
+	mutex_unlock(&inode->ei_update_lock);
+
+	if (unlikely(ret))
+		goto err;
+
+	truncate_setsize(&inode->v, iattr->ia_size);
+
+	/*
+	 * Drop everything from the new size (rounded up to a whole block, in
+	 * 512 byte sectors) to the end of the inode. The sector count change is
+	 * accounted before ret is checked, i.e. even if the punch failed:
+	 */
+	ret = bch2_fpunch(c, inode->v.i_ino,
+			round_up(iattr->ia_size, block_bytes(c)) >> 9,
+			U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
+	i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+	if (unlikely(ret))
+		goto err;
+
+	setattr_copy(&inode->v, iattr);
+
+	/* Truncate is complete: clear the dirty flag and update the times: */
+	mutex_lock(&inode->ei_update_lock);
+	ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL,
+			       ATTR_MTIME|ATTR_CTIME);
+	mutex_unlock(&inode->ei_update_lock);
+err:
+	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+	return ret;
+}
+
+/* fallocate: */
+
+/*
+ * Hole punch over the byte range [@offset, @offset + @len).
+ *
+ * The partial pages at either end of the range are passed to
+ * __bch2_truncate_page(), the pagecache for the range is dropped, and the
+ * whole filesystem blocks inside the range are removed with bch2_fpunch().
+ *
+ * Returns 0 on success or a nonzero error from one of the steps.
+ */
+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	loff_t end = offset + len;
+	/* Only whole blocks inside the range are discarded (512 byte sectors): */
+	u64 first_sector = round_up(offset, block_bytes(c)) >> 9;
+	u64 end_sector = round_down(end, block_bytes(c)) >> 9;
+	int ret;
+
+	inode_lock(&inode->v);
+	inode_dio_wait(&inode->v);
+	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+	ret = __bch2_truncate_page(inode, offset >> PAGE_SHIFT, offset, end);
+
+	/* The range ends in a different page than the one it starts in: */
+	if (!ret && offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+		ret = __bch2_truncate_page(inode, end >> PAGE_SHIFT,
+					   offset, end);
+	if (unlikely(ret))
+		goto out;
+
+	truncate_pagecache_range(&inode->v, offset, end - 1);
+
+	if (first_sector < end_sector) {
+		s64 sectors_delta = 0;
+
+		ret = bch2_fpunch(c, inode->v.i_ino,
+				  first_sector, end_sector,
+				  &inode->ei_journal_seq,
+				  &sectors_delta);
+		i_sectors_acct(c, inode, NULL, sectors_delta);
+	}
+out:
+	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+	inode_unlock(&inode->v);
+
+	return ret;
+}
+
+/*
+ * Implements FALLOC_FL_INSERT_RANGE (@insert true) and
+ * FALLOC_FL_COLLAPSE_RANGE (@insert false).
+ *
+ * Extents are moved by @len bytes, one extent (or one atomically updatable
+ * piece of an extent) per transaction commit:
+ *
+ *  - insert: i_size is grown first, then keys are walked backwards from the
+ *    end of the inode down to @offset and shifted up;
+ *  - collapse: [@offset, @offset + @len) is punched out first, then keys are
+ *    walked forwards from @offset + @len and shifted down, and i_size is
+ *    shrunk last.
+ *
+ * @offset and @len must both be multiples of the filesystem block size,
+ * otherwise -EINVAL is returned.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
+				   loff_t offset, loff_t len,
+				   bool insert)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct address_space *mapping = inode->v.i_mapping;
+	struct btree_trans trans;
+	struct btree_iter *src, *dst, *del = NULL;
+	loff_t shift, new_size;
+	u64 src_start;
+	int ret;
+
+	if ((offset | len) & (block_bytes(c) - 1))
+		return -EINVAL;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
+
+	/*
+	 * We need i_mutex to keep the page cache consistent with the extents
+	 * btree, and the btree consistent with i_size - we don't need outside
+	 * locking for the extents btree itself, because we're using linked
+	 * iterators
+	 */
+	inode_lock(&inode->v);
+	inode_dio_wait(&inode->v);
+	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+	if (insert) {
+		ret = -EFBIG;
+		if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
+			goto err;
+
+		ret = -EINVAL;
+		if (offset >= inode->v.i_size)
+			goto err;
+
+		/* Start from the very end of the inode and walk backwards: */
+		src_start	= U64_MAX;
+		shift		= len;
+	} else {
+		ret = -EINVAL;
+		if (offset + len >= inode->v.i_size)
+			goto err;
+
+		src_start	= offset + len;
+		shift		= -len;
+	}
+
+	new_size = inode->v.i_size + shift;
+
+	ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+	if (ret)
+		goto err;
+
+	if (insert) {
+		i_size_write(&inode->v, new_size);
+		mutex_lock(&inode->ei_update_lock);
+		ret = bch2_write_inode_size(c, inode, new_size,
+					    ATTR_MTIME|ATTR_CTIME);
+		mutex_unlock(&inode->ei_update_lock);
+	} else {
+		s64 i_sectors_delta = 0;
+
+		ret = bch2_fpunch(c, inode->v.i_ino,
+				  offset >> 9, (offset + len) >> 9,
+				  &inode->ei_journal_seq,
+				  &i_sectors_delta);
+		i_sectors_acct(c, inode, NULL, i_sectors_delta);
+	}
+
+	/*
+	 * Don't start moving extents if the size update (insert) or the punch
+	 * (collapse) failed:
+	 */
+	if (ret)
+		goto err;
+
+	src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+			POS(inode->v.i_ino, src_start >> 9),
+			BTREE_ITER_INTENT);
+	BUG_ON(IS_ERR_OR_NULL(src));
+
+	dst = bch2_trans_copy_iter(&trans, src);
+	BUG_ON(IS_ERR_OR_NULL(dst));
+
+	while (1) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(c, 0);
+		BKEY_PADDED(k) copy;
+		struct bkey_i delete;
+		struct bkey_s_c k;
+		struct bpos next_pos;
+		struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
+		struct bpos atomic_end;
+		unsigned commit_flags = BTREE_INSERT_NOFAIL|
+			BTREE_INSERT_ATOMIC|
+			BTREE_INSERT_USE_RESERVE;
+
+		k = insert
+			? bch2_btree_iter_peek_prev(src)
+			: bch2_btree_iter_peek(src);
+		if ((ret = bkey_err(k)))
+			goto bkey_err;
+
+		/* Ran off the end of this inode's extents: */
+		if (!k.k || k.k->p.inode != inode->v.i_ino)
+			break;
+
+		BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
+
+		/* Insert: everything at or above @offset has been moved: */
+		if (insert &&
+		    bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
+			break;
+reassemble:
+		bkey_reassemble(&copy.k, k);
+
+		/* Insert: only the part of the extent above move_pos moves: */
+		if (insert &&
+		    bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) {
+			bch2_cut_front(move_pos, &copy.k);
+			bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k.k));
+		}
+
+		copy.k.k.p.offset += shift >> 9;
+		bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k.k));
+
+		ret = bch2_extent_atomic_end(dst, &copy.k, &atomic_end);
+		if (ret)
+			goto bkey_err;
+
+		/*
+		 * The shifted key extends past atomic_end: trim it to the part
+		 * that can go in with a single commit. For insert that means
+		 * keeping the tail (redo with a higher move_pos), for collapse
+		 * the head.
+		 */
+		if (bkey_cmp(atomic_end, copy.k.k.p)) {
+			if (insert) {
+				move_pos = atomic_end;
+				move_pos.offset -= shift >> 9;
+				goto reassemble;
+			} else {
+				bch2_cut_back(atomic_end, &copy.k.k);
+			}
+		}
+
+		/* Deletion covering the old location of what is being moved: */
+		bkey_init(&delete.k);
+		delete.k.p = src->pos;
+		bch2_key_resize(&delete.k, copy.k.k.size);
+
+		next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
+
+		/*
+		 * If the new and old keys overlap (because we're moving an
+		 * extent that's bigger than the amount we're collapsing by),
+		 * we need to trim the delete key here so they don't overlap
+		 * because overlaps on insertions aren't handled before
+		 * triggers are run, so the overwrite will get double counted
+		 * by the triggers machinery:
+		 */
+		if (insert &&
+		    bkey_cmp(bkey_start_pos(&copy.k.k), delete.k.p) < 0) {
+			bch2_cut_back(bkey_start_pos(&copy.k.k), &delete.k);
+		} else if (!insert &&
+			   bkey_cmp(copy.k.k.p,
+				    bkey_start_pos(&delete.k)) > 0) {
+			bch2_cut_front(copy.k.k.p, &delete);
+
+			del = bch2_trans_copy_iter(&trans, src);
+			BUG_ON(IS_ERR_OR_NULL(del));
+
+			bch2_btree_iter_set_pos(del,
+				bkey_start_pos(&delete.k));
+		}
+
+		bch2_trans_update(&trans, dst, &copy.k);
+		bch2_trans_update(&trans, del ?: src, &delete);
+
+		if (copy.k.k.size == k.k->size) {
+			/*
+			 * If we're moving the entire extent, we can skip
+			 * running triggers:
+			 */
+			commit_flags |= BTREE_INSERT_NOMARK;
+		} else {
+			/* We might end up splitting compressed extents: */
+			unsigned nr_ptrs =
+				bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k));
+
+			ret = bch2_disk_reservation_get(c, &disk_res,
+					copy.k.k.size, nr_ptrs,
+					BCH_DISK_RESERVATION_NOFAIL);
+			BUG_ON(ret);
+		}
+
+		ret = bch2_trans_commit(&trans, &disk_res,
+					&inode->ei_journal_seq,
+					commit_flags);
+		bch2_disk_reservation_put(c, &disk_res);
+bkey_err:
+		if (del)
+			bch2_trans_iter_put(&trans, del);
+		del = NULL;
+
+		/* Only advance past the extent once it has actually moved: */
+		if (!ret)
+			bch2_btree_iter_set_pos(src, next_pos);
+
+		/* -EINTR: go around again from the current position */
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			goto err;
+
+		bch2_trans_cond_resched(&trans);
+	}
+	bch2_trans_unlock(&trans);
+
+	if (!insert) {
+		i_size_write(&inode->v, new_size);
+		mutex_lock(&inode->ei_update_lock);
+		ret = bch2_write_inode_size(c, inode, new_size,
+					    ATTR_MTIME|ATTR_CTIME);
+		mutex_unlock(&inode->ei_update_lock);
+	}
+err:
+	bch2_trans_exit(&trans);
+	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+	inode_unlock(&inode->v);
+	return ret;
+}
+
+/*
+ * Preallocate (and with FALLOC_FL_ZERO_RANGE, zero) the byte range
+ * [@offset, @offset + @len). Reached from bch2_fallocate_dispatch() when
+ * @mode has no bits other than FALLOC_FL_KEEP_SIZE and FALLOC_FL_ZERO_RANGE.
+ *
+ * The range, widened to whole filesystem blocks, is walked slot by slot in
+ * the extents btree and KEY_TYPE_reservation keys are inserted where needed.
+ * Afterwards the inode size is written out, and i_size is extended unless
+ * FALLOC_FL_KEEP_SIZE was given.
+ *
+ * Returns 0 on success or a nonzero error from one of the steps.
+ */
+static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct address_space *mapping = inode->v.i_mapping;
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bpos end_pos;
+	loff_t end		= offset + len;
+	loff_t block_start	= round_down(offset,	block_bytes(c));
+	loff_t block_end	= round_up(end,		block_bytes(c));
+	unsigned sectors;
+	unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
+	int ret;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	inode_lock(&inode->v);
+	inode_dio_wait(&inode->v);
+	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+	/* Only check the new size if this call is going to change i_size: */
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+		ret = inode_newsize_ok(&inode->v, end);
+		if (ret)
+			goto err;
+	}
+
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		/* Partial pages at both ends, then drop the pagecache: */
+		ret = __bch2_truncate_page(inode,
+					   offset >> PAGE_SHIFT,
+					   offset, end);
+
+		if (!ret &&
+		    offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+			ret = __bch2_truncate_page(inode,
+						   end >> PAGE_SHIFT,
+						   offset, end);
+
+		if (unlikely(ret))
+			goto err;
+
+		truncate_pagecache_range(&inode->v, offset, end - 1);
+	}
+
+	/* Extent positions are in 512 byte sectors: */
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+			POS(inode->v.i_ino, block_start >> 9),
+			BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	end_pos = POS(inode->v.i_ino, block_end >> 9);
+
+	while (bkey_cmp(iter->pos, end_pos) < 0) {
+		s64 i_sectors_delta = 0;
+		struct disk_reservation disk_res = { 0 };
+		struct quota_res quota_res = { 0 };
+		struct bkey_i_reservation reservation;
+		struct bkey_s_c k;
+
+		k = bch2_btree_iter_peek_slot(iter);
+		if ((ret = bkey_err(k)))
+			goto bkey_err;
+
+		/* already reserved */
+		if (k.k->type == KEY_TYPE_reservation &&
+		    bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
+			bch2_btree_iter_next_slot(iter);
+			continue;
+		}
+
+		/* Existing data is only replaced when zeroing the range: */
+		if (bkey_extent_is_data(k.k) &&
+		    !(mode & FALLOC_FL_ZERO_RANGE)) {
+			bch2_btree_iter_next_slot(iter);
+			continue;
+		}
+
+		/*
+		 * Build a reservation key covering this slot, trimmed to start
+		 * at the iterator position and to end no later than end_pos:
+		 */
+		bkey_reservation_init(&reservation.k_i);
+		reservation.k.type	= KEY_TYPE_reservation;
+		reservation.k.p		= k.k->p;
+		reservation.k.size	= k.k->size;
+
+		bch2_cut_front(iter->pos, &reservation.k_i);
+		bch2_cut_back(end_pos, &reservation.k);
+
+		sectors = reservation.k.size;
+		reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k);
+
+		/* Quota is only charged when the slot is not an allocation: */
+		if (!bkey_extent_is_allocation(k.k)) {
+			ret = bch2_quota_reservation_add(c, inode,
+					&quota_res,
+					sectors, true);
+			if (unlikely(ret))
+				goto bkey_err;
+		}
+
+		/*
+		 * Take a fresh disk reservation if the existing key has fewer
+		 * replicas than wanted, or is compressed:
+		 */
+		if (reservation.v.nr_replicas < replicas ||
+		    bch2_extent_is_compressed(k)) {
+			ret = bch2_disk_reservation_get(c, &disk_res, sectors,
+							replicas, 0);
+			if (unlikely(ret))
+				goto bkey_err;
+
+			reservation.v.nr_replicas = disk_res.nr_replicas;
+		}
+
+		bch2_trans_begin_updates(&trans);
+
+		ret = bch2_extent_update(&trans, iter, &reservation.k_i,
+				&disk_res, &inode->ei_journal_seq,
+				0, &i_sectors_delta);
+		i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+bkey_err:
+		/* Reached on success too: both reservations are always put */
+		bch2_quota_reservation_put(c, inode, &quota_res);
+		bch2_disk_reservation_put(c, &disk_res);
+		/* -EINTR: go around the loop again from the current position */
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			goto err;
+	}
+
+	/*
+	 * Do we need to extend the file?
+	 *
+	 * If we zeroed up to the end of the file, we dropped whatever writes
+	 * were going to write out the current i_size, so we have to extend
+	 * manually even if FL_KEEP_SIZE was set:
+	 */
+	if (end >= inode->v.i_size &&
+	    (!(mode & FALLOC_FL_KEEP_SIZE) ||
+	     (mode & FALLOC_FL_ZERO_RANGE))) {
+		struct btree_iter *inode_iter;
+		struct bch_inode_unpacked inode_u;
+
+		/* Fetch the inode to get the on disk i_size: */
+		do {
+			bch2_trans_begin(&trans);
+			inode_iter = bch2_inode_peek(&trans, &inode_u,
+						     inode->v.i_ino, 0);
+			ret = PTR_ERR_OR_ZERO(inode_iter);
+		} while (ret == -EINTR);
+
+		bch2_trans_unlock(&trans);
+
+		if (ret)
+			goto err;
+
+		/*
+		 * Sync existing appends before extending i_size,
+		 * as in bch2_extend():
+		 */
+		ret = filemap_write_and_wait_range(mapping,
+					inode_u.bi_size, S64_MAX);
+		if (ret)
+			goto err;
+
+		/*
+		 * With KEEP_SIZE the in-memory i_size stays as it is and that
+		 * value is what gets written to the inode; otherwise i_size
+		 * becomes the end of the range.
+		 */
+		if (mode & FALLOC_FL_KEEP_SIZE)
+			end = inode->v.i_size;
+		else
+			i_size_write(&inode->v, end);
+
+		mutex_lock(&inode->ei_update_lock);
+		ret = bch2_write_inode_size(c, inode, end, 0);
+		mutex_unlock(&inode->ei_update_lock);
+	}
+err:
+	bch2_trans_exit(&trans);
+	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+	inode_unlock(&inode->v);
+	return ret;
+}
+
+/*
+ * fallocate() entry point: picks the implementation matching @mode.
+ *
+ * A reference on c->writes is held for the duration of the operation; if it
+ * cannot be taken, -EROFS is returned. Mode combinations not handled here get
+ * -EOPNOTSUPP.
+ */
+long bch2_fallocate_dispatch(struct file *file, int mode,
+			     loff_t offset, loff_t len)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	long ret;
+
+	if (!percpu_ref_tryget(&c->writes))
+		return -EROFS;
+
+	switch (mode) {
+	case FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE:
+		ret = bchfs_fpunch(inode, offset, len);
+		break;
+	case FALLOC_FL_INSERT_RANGE:
+		ret = bchfs_fcollapse_finsert(inode, offset, len, true);
+		break;
+	case FALLOC_FL_COLLAPSE_RANGE:
+		ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+		break;
+	default:
+		/* Plain allocation: any mix of KEEP_SIZE and ZERO_RANGE */
+		if (mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))
+			ret = -EOPNOTSUPP;
+		else
+			ret = bchfs_fallocate(inode, mode, offset, len);
+		break;
+	}
+
+	percpu_ref_put(&c->writes);
+
+	return ret;
+}
+
+/*
+ * For every page currently in @inode's pagecache that overlaps the byte range
+ * [@start, @end), reset the nr_replicas count of all of its sectors to 0 in
+ * the page's bch_page_state (pages without one are left alone).
+ */
+static void mark_range_unallocated(struct bch_inode_info *inode,
+				   loff_t start, loff_t end)
+{
+	struct address_space *mapping = inode->v.i_mapping;
+	pgoff_t last_index = (end - 1) >> PAGE_SHIFT;
+	pgoff_t index = start >> PAGE_SHIFT;
+	struct pagevec pvec;
+
+	pagevec_init(&pvec);
+
+	for (;;) {
+		unsigned nr, pg, sector;
+
+		/* The lookup advances index past the pages it returned: */
+		nr = pagevec_lookup_range(&pvec, mapping, &index, last_index);
+		if (!nr)
+			break;
+
+		for (pg = 0; pg < nr; pg++) {
+			struct page *page = pvec.pages[pg];
+			struct bch_page_state *state;
+
+			lock_page(page);
+
+			state = bch2_page_state(page);
+			if (state) {
+				spin_lock(&state->lock);
+				for (sector = 0; sector < PAGE_SECTORS; sector++)
+					state->s[sector].nr_replicas = 0;
+				spin_unlock(&state->lock);
+			}
+
+			unlock_page(page);
+		}
+
+		pagevec_release(&pvec);
+
+		if (index > last_index)
+			break;
+	}
+}
+
+/*
+ * Remap (reflink) @len bytes from @file_src at @pos_src to @file_dst at
+ * @pos_dst.
+ *
+ * Only REMAP_FILE_ADVISORY is accepted in @remap_flags: REMAP_FILE_DEDUP gets
+ * -EOPNOTSUPP and any other flag -EINVAL. Both positions must be aligned to
+ * the filesystem block size, and within a single inode the source and
+ * destination ranges may not overlap.
+ *
+ * Returns the number of bytes remapped (at most @len), or a negative error
+ * code.
+ */
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+			     struct file *file_dst, loff_t pos_dst,
+			     loff_t len, unsigned remap_flags)
+{
+	struct bch_inode_info *src = file_bch_inode(file_src);
+	struct bch_inode_info *dst = file_bch_inode(file_dst);
+	struct bch_fs *c = src->v.i_sb->s_fs_info;
+	s64 i_sectors_delta = 0;
+	loff_t ret = 0;
+	loff_t aligned_len;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (remap_flags & REMAP_FILE_DEDUP)
+		return -EOPNOTSUPP;
+
+	if ((pos_src & (block_bytes(c) - 1)) ||
+	    (pos_dst & (block_bytes(c) - 1)))
+		return -EINVAL;
+
+	/* Overlapping ranges within the same inode are rejected: */
+	if (src == dst &&
+	    abs(pos_src - pos_dst) < len)
+		return -EINVAL;
+
+	bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+	file_update_time(file_dst);
+
+	inode_dio_wait(&src->v);
+	inode_dio_wait(&dst->v);
+
+	/* May adjust len; a resulting length of 0 returns 0 from here: */
+	ret = generic_remap_file_range_prep(file_src, pos_src,
+					    file_dst, pos_dst,
+					    &len, remap_flags);
+	if (ret < 0 || len == 0)
+		goto err;
+
+	/* Extents are remapped in whole blocks: */
+	aligned_len = round_up(len, block_bytes(c));
+
+	ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+				pos_dst, pos_dst + aligned_len);
+	if (ret)
+		goto err;
+
+	mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+
+	/* Positions and length are passed in 512 byte sectors: */
+	ret = bch2_remap_range(c,
+			       POS(dst->v.i_ino, pos_dst >> 9),
+			       POS(src->v.i_ino, pos_src >> 9),
+			       aligned_len >> 9,
+			       &dst->ei_journal_seq,
+			       pos_dst + len, &i_sectors_delta);
+	if (ret < 0)
+		goto err;
+
+	/* Convert the returned sector count to bytes: */
+	ret <<= 9;
+	/*
+	 * due to alignment, we might have remapped slightly more than requested
+	 */
+	ret = min(ret, len);
+
+	/* XXX get a quota reservation */
+	i_sectors_acct(c, dst, NULL, i_sectors_delta);
+
+	/*
+	 * NOTE(review): i_size is advanced to pos_dst + len even when fewer
+	 * than len bytes were remapped (ret < len) - confirm this is intended.
+	 */
+	spin_lock(&dst->v.i_lock);
+	if (pos_dst + len > dst->v.i_size)
+		i_size_write(&dst->v, pos_dst + len);
+	spin_unlock(&dst->v.i_lock);
+err:
+	bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+	return ret;
+}
+
+/* fseek: */
+
+/*
+ * Returns the byte offset within @page of the first sector, at or after byte
+ * @offset, whose state is SECTOR_DIRTY or greater. Returns -1 if there is no
+ * such sector or the page has no bch_page_state.
+ */
+static int page_data_offset(struct page *page, unsigned offset)
+{
+	struct bch_page_state *state = bch2_page_state(page);
+	unsigned sector = offset >> 9;
+
+	if (!state)
+		return -1;
+
+	while (sector < PAGE_SECTORS) {
+		if (state->s[sector].state >= SECTOR_DIRTY)
+			return sector << 9;
+		sector++;
+	}
+
+	return -1;
+}
+
+/*
+ * Search the pagecache of @vinode for data between @start_offset and
+ * @end_offset.
+ *
+ * Returns the file offset of the first sector found by page_data_offset(),
+ * clamped to [@start_offset, @end_offset], or @end_offset if the pages in the
+ * range hold none.
+ */
+static loff_t bch2_seek_pagecache_data(struct inode *vinode,
+				       loff_t start_offset,
+				       loff_t end_offset)
+{
+	struct address_space *mapping = vinode->i_mapping;
+	pgoff_t first_index	= start_offset >> PAGE_SHIFT;
+	pgoff_t last_index	= end_offset >> PAGE_SHIFT;
+	pgoff_t index		= first_index;
+	loff_t found		= end_offset;
+	struct page *page;
+
+	while (index <= last_index &&
+	       find_get_pages_range(mapping, &index, last_index, 1, &page)) {
+		unsigned search_from;
+		int pg_offset;
+
+		lock_page(page);
+
+		/* Only the first page of the range is searched from mid-page: */
+		search_from = page->index == first_index
+			? start_offset & (PAGE_SIZE - 1)
+			: 0;
+
+		pg_offset = page_data_offset(page, search_from);
+		if (pg_offset >= 0)
+			found = clamp(((loff_t) page->index << PAGE_SHIFT) +
+				      pg_offset,
+				      start_offset, end_offset);
+
+		unlock_page(page);
+		put_page(page);
+
+		if (pg_offset >= 0)
+			break;
+	}
+
+	return found;
+}
+
+/*
+ * SEEK_DATA: move the file position to the first data at or after @offset.
+ *
+ * The extents btree is searched first for the next data extent; the pagecache
+ * between @offset and that extent is then searched as well, since it may hold
+ * data that is not in the btree yet.
+ *
+ * Returns the new file position, -ENXIO if @offset is at or past i_size or no
+ * data is found before i_size, or an error from the btree lookup.
+ */
+static loff_t bch2_seek_data(struct file *file, u64 offset)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 isize, next_data = MAX_LFS_FILESIZE;
+	int ret;
+
+	isize = i_size_read(&inode->v);
+	if (offset >= isize)
+		return -ENXIO;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	/* Key offsets are in 512 byte sectors, file offsets in bytes: */
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+			   POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
+		if (k.k->p.inode != inode->v.i_ino) {
+			break;
+		} else if (bkey_extent_is_data(k.k)) {
+			next_data = max(offset, bkey_start_offset(k.k) << 9);
+			break;
+		} else if (k.k->p.offset << 9 > isize) {
+			/* Non-data key ending past i_size: stop looking */
+			break;
+		}
+	}
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret)
+		return ret;
+
+	/* Data may be sitting in the pagecache before the extent we found: */
+	if (next_data > offset)
+		next_data = bch2_seek_pagecache_data(&inode->v,
+						     offset, next_data);
+
+	if (next_data >= isize)
+		return -ENXIO;
+
+	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
+
+/*
+ * Returns the byte offset within @page of the first sector, at or after byte
+ * @offset, whose state is below SECTOR_DIRTY, or -1 if every remaining sector
+ * is at least SECTOR_DIRTY. A page without a bch_page_state yields 0.
+ */
+static int __page_hole_offset(struct page *page, unsigned offset)
+{
+	struct bch_page_state *state = bch2_page_state(page);
+	unsigned sector = offset >> 9;
+
+	if (!state)
+		return 0;
+
+	while (sector < PAGE_SECTORS) {
+		if (state->s[sector].state < SECTOR_DIRTY)
+			return sector << 9;
+		sector++;
+	}
+
+	return -1;
+}
+
+/*
+ * Look for a hole in the pagecache page of @mapping that contains @offset.
+ *
+ * Returns @offset itself if that page is not in the pagecache, the file
+ * offset of the first hole found by __page_hole_offset() at or after @offset
+ * within the page, or -1 if the rest of the page has none.
+ */
+static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
+{
+	pgoff_t index = offset >> PAGE_SHIFT;
+	struct page *page;
+	int pg_offset;
+	loff_t ret = -1;
+
+	page = find_lock_page(mapping, index);
+	if (!page)
+		return offset;
+
+	pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
+	if (pg_offset >= 0)
+		ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
+
+	unlock_page(page);
+	/* find_lock_page() also took a reference on the page - drop it: */
+	put_page(page);
+
+	return ret;
+}
+
+/*
+ * Search the pagecache of @vinode, one page at a time, for a hole between
+ * @start_offset and @end_offset.
+ *
+ * Returns the file offset of the first hole found (never below
+ * @start_offset), or @end_offset if there is none.
+ */
+static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+				       loff_t start_offset,
+				       loff_t end_offset)
+{
+	struct address_space *mapping = vinode->i_mapping;
+	loff_t offset = start_offset, hole;
+
+	while (offset < end_offset) {
+		hole = page_hole_offset(mapping, offset);
+		if (hole >= 0 && hole <= end_offset)
+			return max(start_offset, hole);
+
+		/*
+		 * Advance to the start of the next page. round_down() rather
+		 * than "&= PAGE_MASK": PAGE_MASK is an unsigned long, so where
+		 * that is 32 bits wide the mask would clear the upper half of
+		 * the 64 bit offset.
+		 */
+		offset = round_down(offset + PAGE_SIZE, PAGE_SIZE);
+	}
+
+	return end_offset;
+}
+
+/*
+ * SEEK_HOLE: move the file position to the first hole at or after @offset.
+ *
+ * Walks the extents btree slot by slot; ranges not covered by a data extent
+ * are additionally checked against the pagecache, since dirty pages there
+ * are not holes. The result is capped at i_size.
+ *
+ * Returns the new file position, -ENXIO if @offset is at or past i_size, or
+ * an error from the btree lookup.
+ */
+static loff_t bch2_seek_hole(struct file *file, u64 offset)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	u64 inum = inode->v.i_ino;
+	u64 isize = i_size_read(&inode->v);
+	u64 next_hole = MAX_LFS_FILESIZE;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	if (offset >= isize)
+		return -ENXIO;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	/* Key offsets are in 512 byte sectors, file offsets in bytes: */
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+			   POS(inum, offset >> 9),
+			   BTREE_ITER_SLOTS, k, ret) {
+		if (k.k->p.inode != inum) {
+			/* Past this inode's keys: only the pagecache is left */
+			next_hole = bch2_seek_pagecache_hole(&inode->v,
+					offset, MAX_LFS_FILESIZE);
+			break;
+		}
+
+		if (bkey_extent_is_data(k.k)) {
+			offset = max(offset, bkey_start_offset(k.k) << 9);
+		} else {
+			u64 slot_end = k.k->p.offset << 9;
+
+			next_hole = bch2_seek_pagecache_hole(&inode->v,
+					max(offset, bkey_start_offset(k.k) << 9),
+					slot_end);
+
+			/* Found a hole before the end of this slot: */
+			if (next_hole < slot_end)
+				break;
+		}
+	}
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret)
+		return ret;
+
+	if (next_hole > isize)
+		next_hole = isize;
+
+	return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
+}
+
+/*
+ * llseek entry point: SEEK_DATA and SEEK_HOLE are handled here, SEEK_SET,
+ * SEEK_CUR and SEEK_END go to generic_file_llseek(), and any other @whence
+ * gets -EINVAL.
+ */
+loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
+{
+	if (whence == SEEK_DATA)
+		return bch2_seek_data(file, offset);
+
+	if (whence == SEEK_HOLE)
+		return bch2_seek_hole(file, offset);
+
+	if (whence == SEEK_SET ||
+	    whence == SEEK_CUR ||
+	    whence == SEEK_END)
+		return generic_file_llseek(file, offset, whence);
+
+	return -EINVAL;
+}
+
+/*
+ * Tear down the biosets set up by bch2_fs_fsio_init(), in reverse order of
+ * their initialization.
+ */
+void bch2_fs_fsio_exit(struct bch_fs *c)
+{
+	struct bio_set *biosets[] = {
+		&c->dio_write_bioset,
+		&c->dio_read_bioset,
+		&c->writepage_bioset,
+	};
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(biosets); i++)
+		bioset_exit(biosets[i]);
+}
+
+/*
+ * Initialize the biosets used for page writeback and for direct IO reads and
+ * writes. Initialization stops at the first failure.
+ *
+ * Returns 0 on success, -ENOMEM if any bioset could not be initialized.
+ */
+int bch2_fs_fsio_init(struct bch_fs *c)
+{
+	int ret = -ENOMEM;
+
+	pr_verbose_init(c->opts, "");
+
+	if (bioset_init(&c->writepage_bioset,
+			4, offsetof(struct bch_writepage_io, op.wbio.bio),
+			BIOSET_NEED_BVECS))
+		goto out;
+
+	if (bioset_init(&c->dio_read_bioset,
+			4, offsetof(struct dio_read, rbio.bio),
+			BIOSET_NEED_BVECS))
+		goto out;
+
+	if (bioset_init(&c->dio_write_bioset,
+			4, offsetof(struct dio_write, op.wbio.bio),
+			BIOSET_NEED_BVECS))
+		goto out;
+
+	ret = 0;
+out:
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
new file mode 100644
index 000000000000..7063556d289b
--- /dev/null
+++ b/fs/bcachefs/fs-io.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_H
+#define _BCACHEFS_FS_IO_H
+
+#ifndef NO_BCACHEFS_FS
+
+#include "buckets.h"
+#include "io_types.h"
+
+#include <linux/uio.h>
+
+struct quota_res;
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+				       struct bch_inode_info *,
+				       loff_t, unsigned);
+
+int bch2_writepage(struct page *, struct writeback_control *);
+int bch2_readpage(struct file *, struct page *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+int bch2_readpages(struct file *, struct address_space *,
+		   struct list_head *, unsigned);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
+		     unsigned, unsigned, struct page **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+		   unsigned, unsigned, struct page *, void *);
+
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+int bch2_fsync(struct file *, loff_t, loff_t, int);
+
+int bch2_truncate(struct bch_inode_info *, struct iattr *);
+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+			     loff_t, loff_t, unsigned);
+
+loff_t bch2_llseek(struct file *, loff_t, int);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
+int bch2_releasepage(struct page *, gfp_t);
+int bch2_migrate_page(struct address_space *, struct page *,
+		      struct page *, enum migrate_mode);
+
+void bch2_fs_fsio_exit(struct bch_fs *);
+int bch2_fs_fsio_init(struct bch_fs *);
+#else
+/* NO_BCACHEFS_FS is defined: no-op stand-ins for the fs-io init/exit hooks */
+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
new file mode 100644
index 000000000000..031e6d931171
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-ioctl.h"
+#include "quota.h"
+
+#include <linux/compat.h>
+#include <linux/mount.h>
+
+#define FS_IOC_GOINGDOWN	     _IOR('X', 125, __u32)
+
+/* Argument passed to the inode update callbacks of the flag-setting ioctls: */
+struct flags_set {
+	/* bcachefs inode flag bits the calling ioctl is able to change */
+	unsigned		mask;
+	/* new values for the flag bits, already mapped to bcachefs flags */
+	unsigned		flags;
+
+	/* project ID to store in the inode; only set by FS_IOC_FSSETXATTR */
+	unsigned		projid;
+};
+
+/*
+ * Inode update callback: replace the flag bits selected by the mask in the
+ * struct flags_set at @p with its new flag values, and update ctime.
+ *
+ * Returns -EPERM if the append or immutable flag would change and the caller
+ * lacks CAP_LINUX_IMMUTABLE, -EINVAL if flags other than nodump/noatime are
+ * requested on something that is neither a regular file nor a directory, and
+ * 0 otherwise.
+ */
+static int bch2_inode_flags_set(struct bch_inode_info *inode,
+				struct bch_inode_unpacked *bi,
+				void *p)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	/*
+	 * We're relying on btree locking here for exclusion with other ioctl
+	 * calls - use the flags in the btree (@bi), not inode->i_flags:
+	 */
+	struct flags_set *set = p;
+	unsigned want = set->flags;
+	unsigned changed = want ^ (bi->bi_flags & set->mask);
+
+	if ((changed & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	if (!S_ISREG(bi->bi_mode) &&
+	    !S_ISDIR(bi->bi_mode) &&
+	    (want & ~(BCH_INODE_NODUMP|BCH_INODE_NOATIME)))
+		return -EINVAL;
+
+	bi->bi_flags = (bi->bi_flags & ~set->mask) | want;
+	bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
+
+	return 0;
+}
+
+/*
+ * FS_IOC_GETFLAGS: translate the inode's bcachefs flags through
+ * bch_flags_to_uflags and store the result at @arg.
+ */
+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
+{
+	unsigned uflags = 0;
+
+	set_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags, uflags);
+
+	return put_user(uflags, arg);
+}
+
+/*
+ * FS_IOC_SETFLAGS: read the flags word at @arg, translate it to bcachefs
+ * inode flags and store them in the inode.
+ *
+ * Returns -EFAULT if @arg cannot be read, -EOPNOTSUPP if it contains flags
+ * that have no bcachefs equivalent, -EACCES if the caller is neither the
+ * owner nor suitably privileged, or the result of the inode update.
+ */
+static int bch2_ioc_setflags(struct bch_fs *c,
+			     struct file *file,
+			     struct bch_inode_info *inode,
+			     void __user *arg)
+{
+	struct flags_set s = { 0 };
+	unsigned uflags;
+	int ret;
+
+	s.mask = map_defined(bch_flags_to_uflags);
+
+	if (get_user(uflags, (int __user *) arg))
+		return -EFAULT;
+
+	/*
+	 * map_flags_rev() clears every bit it translates out of uflags, so
+	 * whatever is left afterwards is a flag we don't know about:
+	 */
+	s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
+	if (uflags)
+		return -EOPNOTSUPP;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	inode_lock(&inode->v);
+
+	if (inode_owner_or_capable(&inode->v)) {
+		mutex_lock(&inode->ei_update_lock);
+		ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
+				       ATTR_CTIME);
+		mutex_unlock(&inode->ei_update_lock);
+	} else {
+		ret = -EACCES;
+	}
+
+	inode_unlock(&inode->v);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+/*
+ * FS_IOC_FSGETXATTR: fill a struct fsxattr with the inode's flags (translated
+ * through bch_flags_to_xflags) and its project quota ID, and copy it to @arg.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be written.
+ */
+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
+			       struct fsxattr __user *arg)
+{
+	struct fsxattr fa = { 0 };
+
+	fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+	fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+
+	/*
+	 * copy_to_user() returns the number of bytes it could NOT copy, which
+	 * must not be handed back to userspace as the ioctl result:
+	 */
+	if (copy_to_user(arg, &fa, sizeof(fa)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Inode update callback for FS_IOC_FSSETXATTR: store the project ID from the
+ * struct flags_set at @p if it differs from the inode's, then apply the flag
+ * changes via bch2_inode_flags_set().
+ */
+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
+				      struct bch_inode_unpacked *bi,
+				      void *p)
+{
+	const struct flags_set *set = p;
+
+	if (bi->bi_project == set->projid)
+		return bch2_inode_flags_set(inode, bi, p);
+
+	bi->bi_project = set->projid;
+	bi->bi_fields_set |= 1U << Inode_opt_project;
+
+	return bch2_inode_flags_set(inode, bi, p);
+}
+
+/*
+ * FS_IOC_FSSETXATTR: read a struct fsxattr from @arg and apply its flags and
+ * project ID to the inode.
+ *
+ * Returns -EFAULT if @arg cannot be read, -EOPNOTSUPP if fsx_xflags contains
+ * flags that have no bcachefs equivalent, -EINVAL if fsx_projid is U32_MAX,
+ * -EACCES if the caller is neither the owner nor suitably privileged, or the
+ * result of the project ID change / inode update.
+ */
+static int bch2_ioc_fssetxattr(struct bch_fs *c,
+			       struct file *file,
+			       struct bch_inode_info *inode,
+			       struct fsxattr __user *arg)
+{
+	struct flags_set s = { 0 };
+	struct fsxattr fa;
+	int ret;
+
+	s.mask = map_defined(bch_flags_to_xflags);
+
+	if (copy_from_user(&fa, arg, sizeof(fa)))
+		return -EFAULT;
+
+	/*
+	 * map_flags_rev() clears every bit it translates out of fsx_xflags,
+	 * so whatever is left afterwards is a flag we don't know about:
+	 */
+	s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
+	if (fa.fsx_xflags)
+		return -EOPNOTSUPP;
+
+	if (fa.fsx_projid >= U32_MAX)
+		return -EINVAL;
+
+	/* The inode stores the project ID offset by one: */
+	s.projid = fa.fsx_projid + 1;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	inode_lock(&inode->v);
+
+	if (!inode_owner_or_capable(&inode->v)) {
+		ret = -EACCES;
+	} else {
+		mutex_lock(&inode->ei_update_lock);
+
+		ret = bch2_set_projid(c, inode, s.projid);
+		if (!ret)
+			ret = bch2_write_inode(c, inode,
+					       fssetxattr_inode_update_fn, &s,
+					       ATTR_CTIME);
+
+		mutex_unlock(&inode->ei_update_lock);
+	}
+
+	inode_unlock(&inode->v);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+/*
+ * Inode update callback: re-inherit attributes from the directory inode
+ * passed in @p. Returns 0 when bch2_reinherit_attrs() reports a nonzero
+ * result and 1 when it reports zero.
+ */
+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode,
+				   struct bch_inode_unpacked *bi,
+				   void *p)
+{
+	struct bch_inode_info *parent = p;
+
+	if (bch2_reinherit_attrs(bi, &parent->ei_inode))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * BCHFS_IOC_REINHERIT_ATTRS: look up the entry @name (a userspace string) in
+ * the directory @src and make that inode re-inherit its attributes from @src.
+ *
+ * Returns 1 if work was done and 0 if not (see the inversion of ret below),
+ * -ENOENT if @name is not found in @src, or another negative error code.
+ */
+static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
+				    struct file *file,
+				    struct bch_inode_info *src,
+				    const char __user *name)
+{
+	struct bch_inode_info *dst;
+	struct inode *vinode = NULL;
+	char *kname = NULL;
+	struct qstr qstr;
+	int ret = 0;
+	u64 inum;
+
+	kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+	if (!kname)
+		return -ENOMEM;
+
+	/* On success the return value is the length of the copied name: */
+	ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
+	if (unlikely(ret < 0))
+		goto err1;
+
+	qstr.len	= ret;
+	qstr.name	= kname;
+
+	/* A lookup result of 0 is treated as "no such entry": */
+	ret = -ENOENT;
+	inum = bch2_dirent_lookup(c, src->v.i_ino,
+				  &src->ei_str_hash,
+				  &qstr);
+	if (!inum)
+		goto err1;
+
+	vinode = bch2_vfs_inode_get(c, inum);
+	ret = PTR_ERR_OR_ZERO(vinode);
+	if (ret)
+		goto err1;
+
+	dst = to_bch_ei(vinode);
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto err2;
+
+	bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+	/* Move dst's quota accounting if its project is going to change: */
+	if (inode_attr_changing(src, dst, Inode_opt_project)) {
+		ret = bch2_fs_quota_transfer(c, dst,
+					     src->ei_qid,
+					     1 << QTYP_PRJ,
+					     KEY_TYPE_QUOTA_PREALLOC);
+		if (ret)
+			goto err3;
+	}
+
+	ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
+err3:
+	bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+	/*
+	 * return true if we did work
+	 *
+	 * (negative error codes pass through unchanged; 0 becomes 1 and any
+	 * positive value becomes 0)
+	 */
+	if (ret >= 0)
+		ret = !ret;
+
+	mnt_drop_write_file(file);
+err2:
+	iput(vinode);
+err1:
+	kfree(kname);
+
+	return ret;
+}
+
+/*
+ * ioctl entry point for files: handles the inode flag, fsxattr, reinherit
+ * and "going down" ioctls here, answers the inode version ioctls with
+ * -ENOTTY, and passes everything else on to bch2_fs_ioctl().
+ */
+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct super_block *sb = inode->v.i_sb;
+	struct bch_fs *c = sb->s_fs_info;
+	void __user *uarg = (void __user *) arg;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		return bch2_ioc_getflags(inode, uarg);
+	case FS_IOC_SETFLAGS:
+		return bch2_ioc_setflags(c, file, inode, uarg);
+
+	case FS_IOC_FSGETXATTR:
+		return bch2_ioc_fsgetxattr(inode, uarg);
+	case FS_IOC_FSSETXATTR:
+		return bch2_ioc_fssetxattr(c, file, inode, uarg);
+
+	case BCHFS_IOC_REINHERIT_ATTRS:
+		return bch2_ioc_reinherit_attrs(c, file, inode, uarg);
+
+	case FS_IOC_GETVERSION:
+	case FS_IOC_SETVERSION:
+		return -ENOTTY;
+
+	case FS_IOC_GOINGDOWN:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		down_write(&sb->s_umount);
+		sb->s_flags |= SB_RDONLY;
+		if (bch2_fs_emergency_read_only(c))
+			bch_err(c, "emergency read only due to ioctl");
+		up_write(&sb->s_umount);
+		return 0;
+
+	default:
+		break;
+	}
+
+	/* Not a per-file ioctl: try the filesystem-wide ones */
+	return bch2_fs_ioctl(c, cmd, uarg);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: translates the 32 bit variants of the get/set
+ * flags ioctls to the native command numbers and forwards them to
+ * bch2_fs_file_ioctl(). Any other command gets -ENOIOCTLCMD.
+ */
+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
new file mode 100644
index 000000000000..f201980ef2c3
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IOCTL_H
+#define _BCACHEFS_FS_IOCTL_H
+
+/* Inode flags: */
+
+/* bcachefs inode flag bit number (__BCH_INODE_*, the array index) -> VFS S_* inode flag: */
+static const unsigned bch_flags_to_vfs[] = {
+	[__BCH_INODE_SYNC]	= S_SYNC,
+	[__BCH_INODE_IMMUTABLE]	= S_IMMUTABLE,
+	[__BCH_INODE_APPEND]	= S_APPEND,
+	[__BCH_INODE_NOATIME]	= S_NOATIME,
+};
+
+/* bcachefs inode flag bit number (the array index) -> FS_*_FL flag, for FS_IOC_GETFLAGS: */
+static const unsigned bch_flags_to_uflags[] = {
+	[__BCH_INODE_SYNC]	= FS_SYNC_FL,
+	[__BCH_INODE_IMMUTABLE]	= FS_IMMUTABLE_FL,
+	[__BCH_INODE_APPEND]	= FS_APPEND_FL,
+	[__BCH_INODE_NODUMP]	= FS_NODUMP_FL,
+	[__BCH_INODE_NOATIME]	= FS_NOATIME_FL,
+};
+
+/* bcachefs inode flag bit number (the array index) -> FS_XFLAG_* flag, for FS_IOC_FSGETXATTR: */
+static const unsigned bch_flags_to_xflags[] = {
+	[__BCH_INODE_SYNC]	= FS_XFLAG_SYNC,
+	[__BCH_INODE_IMMUTABLE]	= FS_XFLAG_IMMUTABLE,
+	[__BCH_INODE_APPEND]	= FS_XFLAG_APPEND,
+	[__BCH_INODE_NODUMP]	= FS_XFLAG_NODUMP,
+	[__BCH_INODE_NOATIME]	= FS_XFLAG_NOATIME,
+	//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
+};
+
+#define set_flags(_map, _in, _out)					\
+do {									\
+	unsigned _i;							\
+	/* per index: set _map[_i] in _out if bit _i of _in is set, else clear it */ \
+	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\
+		if ((_in) & (1 << _i))					\
+			(_out) |= _map[_i];				\
+		else							\
+			(_out) &= ~_map[_i];				\
+} while (0)
+
+#define map_flags(_map, _in)						\
+({									\
+	unsigned _out = 0;						\
+	/* starts from 0, so the value is exactly the mapped bits of _in */ \
+	set_flags(_map, _in, _out);					\
+	_out;								\
+})
+
+#define map_flags_rev(_map, _in)					\
+({									\
+	unsigned _i, _out = 0;						\
+	/* reverse of map_flags(); matched bits are cleared from _in (an lvalue) */ \
+	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\
+		if ((_in) & _map[_i]) {					\
+			(_out) |= 1 << _i;				\
+			(_in) &= ~_map[_i];				\
+		}							\
+	(_out);								\
+})
+
+#define map_defined(_map)						\
+({									\
+	unsigned _in = ~0;						\
+	/* all-ones through map_flags_rev(): the index bits that _map has an entry for */ \
+	map_flags_rev(_map, _in);					\
+})
+
+/* Sync inode->v.i_flags with ei_inode.bi_flags; only flags listed in bch_flags_to_vfs change: */
+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+{
+	set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+}
+
+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
+
+#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
new file mode 100644
index 000000000000..9775a9825c5b
--- /dev/null
+++ b/fs/bcachefs/fs.c
@@ -0,0 +1,1602 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-io.h"
+#include "fs-ioctl.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "quota.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/exportfs.h>
+#include <linux/fiemap.h>
+#include <linux/module.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+#include <linux/statfs.h>
+#include <linux/xattr.h>
+
+static struct kmem_cache *bch2_inode_cache;
+
+static void bch2_vfs_inode_init(struct bch_fs *,
+				struct bch_inode_info *,
+				struct bch_inode_unpacked *);
+
+static void journal_seq_copy(struct bch_inode_info *dst, u64 journal_seq)
+{
+	u64 expected, seen = READ_ONCE(dst->ei_journal_seq);
+
+	/* raise ei_journal_seq to at least journal_seq; it is never moved backwards */
+	while (seen < journal_seq) {
+		expected = seen;
+		seen = cmpxchg(&dst->ei_journal_seq, expected, journal_seq);
+		if (seen == expected)
+			break;
+	}
+}
+
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
+{
+	BUG_ON(!atomic_long_read(&lock->v));
+	/* subtract our count (callers pass 1 or -1); reaching 0 wakes every waiter */
+	if (!atomic_long_sub_return_release(i, &lock->v))
+		wake_up_all(&lock->wait);
+}
+
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
+{
+	long cur = atomic_long_read(&lock->v), seen;
+	/* the sign of the count says which side holds the lock; the other side fails */
+	for (;;) {
+		if ((i > 0 && cur < 0) || (i <= 0 && cur > 0))
+			return false;
+		seen = atomic_long_cmpxchg_acquire(&lock->v, cur, cur + i);
+		if (seen == cur)
+			return true;
+		cur = seen;
+	}
+}
+/* Blocking acquire: sleeps on lock->wait until __pagecache_lock_tryget() succeeds. */
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
+{
+	wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
+}
+/* Release the "add" side: subtracts 1 from the lock count. */
+void bch2_pagecache_add_put(struct pagecache_lock *lock)
+{
+	__pagecache_lock_put(lock, 1);
+}
+/* Take the "add" side: adds 1 to the lock count, waiting while the count is negative. */
+void bch2_pagecache_add_get(struct pagecache_lock *lock)
+{
+	__pagecache_lock_get(lock, 1);
+}
+/* Release the "block" side: subtracts -1, i.e. moves the lock count back up by 1. */
+void bch2_pagecache_block_put(struct pagecache_lock *lock)
+{
+	__pagecache_lock_put(lock, -1);
+}
+/* Take the "block" side: adds -1 to the lock count, waiting while the count is positive. */
+void bch2_pagecache_block_get(struct pagecache_lock *lock)
+{
+	__pagecache_lock_get(lock, -1);
+}
+
+void bch2_inode_update_after_write(struct bch_fs *c,
+				   struct bch_inode_info *inode,
+				   struct bch_inode_unpacked *bi,
+				   unsigned fields)
+{
+	struct inode *vi = &inode->v;
+	/* refresh the VFS inode from @bi; a timestamp only if its ATTR_* bit is in @fields */
+	set_nlink(vi, bch2_inode_nlink_get(bi));
+	i_uid_write(vi, bi->bi_uid);
+	i_gid_write(vi, bi->bi_gid);
+	vi->i_mode = bi->bi_mode;
+	if (fields & ATTR_ATIME)
+		vi->i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+	if (fields & ATTR_MTIME)
+		vi->i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+	if (fields & ATTR_CTIME)
+		vi->i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
+	/* keep a copy of the unpacked inode, then derive i_flags from its bi_flags */
+	inode->ei_inode = *bi;
+	bch2_inode_flags_to_vfs(inode);
+}
+
+int __must_check bch2_write_inode(struct bch_fs *c,
+				  struct bch_inode_info *inode,
+				  inode_set_fn set,
+				  void *p, unsigned fields)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bch_inode_unpacked inode_u;
+	int ret;
+	/* Read-modify-write of the on-disk inode: peek, apply @set (may be NULL), write, commit. */
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+	/* the ?: chain stops at the first nonzero result; -EINTR restarts from "retry" */
+	iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
+			       BTREE_ITER_INTENT);
+	ret   = PTR_ERR_OR_ZERO(iter) ?:
+		(set ? set(inode, &inode_u, p) : 0) ?:
+		bch2_inode_write(&trans, iter, &inode_u) ?:
+		bch2_trans_commit(&trans, NULL,
+				  &inode->ei_journal_seq,
+				  BTREE_INSERT_ATOMIC|
+				  BTREE_INSERT_NOUNLOCK|
+				  BTREE_INSERT_NOFAIL);
+	if (ret == -EINTR)
+		goto retry;
+
+	/*
+	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
+	 * this is important for inode updates via bchfs_write_index_update
+	 */
+	if (!ret)
+		bch2_inode_update_after_write(c, inode, &inode_u, fields);
+
+	bch2_trans_exit(&trans);
+	return ret < 0 ? ret : 0;
+}
+
+int bch2_fs_quota_transfer(struct bch_fs *c,
+			   struct bch_inode_info *inode,
+			   struct bch_qid new_qid,
+			   unsigned qtypes,
+			   enum quota_acct_mode mode)
+{
+	unsigned t;
+	int ret;
+
+	/* only enabled quota types whose id actually changes need a transfer */
+	qtypes &= enabled_qtypes(c);
+	for (t = 0; t < QTYP_NR; t++) {
+		if (new_qid.q[t] == inode->ei_qid.q[t])
+			qtypes &= ~(1U << t);
+	}
+	if (!qtypes)
+		return 0;
+
+	mutex_lock(&inode->ei_quota_lock);
+	/* the amount transferred covers i_blocks plus the inode's quota reservation */
+	ret = bch2_quota_transfer(c, qtypes, new_qid, inode->ei_qid,
+				  inode->v.i_blocks + inode->ei_quota_reserved,
+				  mode);
+	if (ret)
+		goto unlock;
+
+	for (t = 0; t < QTYP_NR; t++) {
+		if (qtypes & (1 << t))
+			inode->ei_qid.q[t] = new_qid.q[t];
+	}
+unlock:
+	mutex_unlock(&inode->ei_quota_lock);
+	return ret;
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+{
+	struct bch_inode_unpacked unpacked;
+	struct bch_inode_info *ei;
+	int err;
+
+	ei = to_bch_ei(iget_locked(c->vfs_sb, inum));
+	if (unlikely(!ei))
+		return ERR_PTR(-ENOMEM);
+
+	/* not I_NEW: the inode was already set up, return it as it is */
+	if (!(ei->v.i_state & I_NEW))
+		return &ei->v;
+
+	/* new inode: look it up by number and initialise it before unlocking */
+	err = bch2_inode_find_by_inum(c, inum, &unpacked);
+	if (err) {
+		iget_failed(&ei->v);
+		return ERR_PTR(err);
+	}
+
+	bch2_vfs_inode_init(c, ei, &unpacked);
+	ei->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
+	unlock_new_inode(&ei->v);
+	return &ei->v;
+}
+
+static int inum_test(struct inode *inode, void *p)
+{
+	/* match callback for inode_insert5(): @p points at the wanted inode number */
+	const unsigned long *wanted = p;
+	return inode->i_ino == *wanted;
+}
+
+static struct bch_inode_info *
+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
+	      umode_t mode, dev_t rdev, bool tmpfile)
+{
+	struct bch_fs *c = dir->v.i_sb->s_fs_info;
+	struct user_namespace *ns = dir->v.i_sb->s_user_ns;
+	struct btree_trans trans;
+	struct bch_inode_unpacked dir_u;
+	struct bch_inode_info *inode, *old;
+	struct bch_inode_unpacked inode_u;
+	struct posix_acl *default_acl = NULL, *acl = NULL;
+	u64 journal_seq = 0;
+	int ret;
+	/* Create a new inode, named by @dentry unless @tmpfile; returns it or an ERR_PTR(). */
+	/*
+	 * preallocate acls + vfs inode before btree transaction, so that
+	 * nothing can fail after the transaction succeeds:
+	 */
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
+	if (ret)
+		return ERR_PTR(ret);
+#endif
+	inode = to_bch_ei(new_inode(c->vfs_sb));
+	if (unlikely(!inode)) {
+		inode = ERR_PTR(-ENOMEM);
+		goto err;
+	}
+
+	bch2_inode_init_early(c, &inode_u);
+	/* the directory is only locked (and later updated) when a name is being created */
+	if (!tmpfile)
+		mutex_lock(&dir->ei_update_lock);
+
+	bch2_trans_init(&trans, c, 8, 1024);
+retry:
+	bch2_trans_begin(&trans);
+
+	ret   = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
+				  !tmpfile ? &dentry->d_name : NULL,
+				  from_kuid(ns, current_fsuid()),
+				  from_kgid(ns, current_fsgid()),
+				  mode, rdev,
+				  default_acl, acl) ?:
+		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
+				KEY_TYPE_QUOTA_PREALLOC);
+	if (unlikely(ret))
+		goto err_before_quota;
+	/* the inode quota charged above is given back below if the commit fails */
+	ret   = bch2_trans_commit(&trans, NULL, &journal_seq,
+				  BTREE_INSERT_ATOMIC|
+				  BTREE_INSERT_NOUNLOCK);
+	if (unlikely(ret)) {
+		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
+				KEY_TYPE_QUOTA_WARN);
+err_before_quota:
+		if (ret == -EINTR)
+			goto retry;
+		goto err_trans;
+	}
+	/* committed: bring the in-memory directory inode up to date and unlock it */
+	if (!tmpfile) {
+		bch2_inode_update_after_write(c, dir, &dir_u,
+					      ATTR_MTIME|ATTR_CTIME);
+		journal_seq_copy(dir, journal_seq);
+		mutex_unlock(&dir->ei_update_lock);
+	}
+
+	bch2_vfs_inode_init(c, inode, &inode_u);
+	journal_seq_copy(inode, journal_seq);
+
+	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+	/*
+	 * we must insert the new inode into the inode cache before calling
+	 * bch2_trans_exit() and dropping locks, else we could race with another
+	 * thread pulling the inode in and modifying it:
+	 */
+
+	inode->v.i_state |= I_CREATING;
+	old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
+				      inum_test, NULL, &inode->v.i_ino));
+	BUG_ON(!old);
+
+	if (unlikely(old != inode)) {
+		/*
+		 * We raced, another process pulled the new inode into cache
+		 * before us:
+		 */
+		journal_seq_copy(old, journal_seq);
+		make_bad_inode(&inode->v);
+		iput(&inode->v);
+
+		inode = old;
+	} else {
+		/*
+		 * we really don't want insert_inode_locked2() to be setting
+		 * I_NEW...
+		 */
+		unlock_new_inode(&inode->v);
+	}
+
+	bch2_trans_exit(&trans);
+err:
+	posix_acl_release(default_acl);
+	posix_acl_release(acl);
+	return inode;
+err_trans:
+	if (!tmpfile)
+		mutex_unlock(&dir->ei_update_lock);
+
+	bch2_trans_exit(&trans);
+	make_bad_inode(&inode->v);
+	iput(&inode->v);
+	inode = ERR_PTR(ret);
+	goto err;
+}
+
+/* methods */
+
+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct bch_inode_info *parent = to_bch_ei(vdir);
+	struct bch_fs *fs = vdir->i_sb->s_fs_info;
+	struct inode *found = NULL;
+	u64 ino;
+
+	ino = bch2_dirent_lookup(fs, parent->v.i_ino, &parent->ei_str_hash,
+				 &dentry->d_name);
+
+	/* ino == 0 means no entry: d_splice_alias() is then handed a NULL inode */
+	if (ino)
+		found = bch2_vfs_inode_get(fs, ino);
+
+	return d_splice_alias(found, dentry);
+}
+
+static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
+		      umode_t mode, dev_t rdev)
+{
+	struct bch_inode_info *dir = to_bch_ei(vdir), *created;
+
+	/* tmpfile == false: the new inode is created under @dentry's name */
+	created = __bch2_create(dir, dentry, mode, rdev, false);
+	if (IS_ERR(created))
+		return PTR_ERR(created);
+	d_instantiate(dentry, &created->v);
+	return 0;
+}
+/* ->create: a regular file via bch2_mknod() with S_IFREG and rdev 0; @excl is not used. */
+static int bch2_create(struct inode *vdir, struct dentry *dentry,
+		       umode_t mode, bool excl)
+{
+	return bch2_mknod(vdir, dentry, mode|S_IFREG, 0);
+}
+
+static int __bch2_link(struct bch_fs *c,
+		       struct bch_inode_info *inode,
+		       struct bch_inode_info *dir,
+		       struct dentry *dentry)
+{
+	struct btree_trans trans;
+	struct bch_inode_unpacked inode_u;
+	int ret;
+	/* Link @inode into @dir as dentry->d_name; inode->ei_update_lock is held throughout. */
+	mutex_lock(&inode->ei_update_lock);
+	bch2_trans_init(&trans, c, 4, 1024);
+	/* the transaction is simply started over for as long as it returns -EINTR */
+	do {
+		bch2_trans_begin(&trans);
+		ret   = bch2_link_trans(&trans,
+					dir->v.i_ino,
+					inode->v.i_ino, &inode_u,
+					&dentry->d_name) ?:
+			bch2_trans_commit(&trans, NULL,
+					&inode->ei_journal_seq,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOUNLOCK);
+	} while (ret == -EINTR);
+
+	if (likely(!ret))
+		bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+
+	bch2_trans_exit(&trans);
+	mutex_unlock(&inode->ei_update_lock);
+	return ret;
+}
+
+static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
+		     struct dentry *dentry)
+{
+	struct bch_inode_info *target = to_bch_ei(old_dentry->d_inode);
+	struct bch_inode_info *parent = to_bch_ei(vdir);
+	struct bch_fs *fs = vdir->i_sb->s_fs_info;
+	int err;
+
+	lockdep_assert_held(&target->v.i_rwsem);
+
+	err = __bch2_link(fs, target, parent, dentry);
+	if (likely(!err)) {
+		/* take an inode reference for the dentry being instantiated */
+		ihold(&target->v);
+		d_instantiate(dentry, &target->v);
+	}
+	return err;
+}
+
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+	struct bch_fs *c = vdir->i_sb->s_fs_info;
+	struct bch_inode_info *dir = to_bch_ei(vdir);
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct bch_inode_unpacked dir_u, inode_u;
+	struct btree_trans trans;
+	int ret;
+	/* Both the directory and the victim stay update-locked across the transaction. */
+	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
+	bch2_trans_init(&trans, c, 4, 1024);
+
+	do {
+		bch2_trans_begin(&trans);
+
+		ret   = bch2_unlink_trans(&trans,
+					  dir->v.i_ino, &dir_u,
+					  &inode_u, &dentry->d_name) ?:
+			bch2_trans_commit(&trans, NULL,
+					  &dir->ei_journal_seq,
+					  BTREE_INSERT_ATOMIC|
+					  BTREE_INSERT_NOUNLOCK|
+					  BTREE_INSERT_NOFAIL);
+	} while (ret == -EINTR);
+
+	if (likely(!ret)) {
+		BUG_ON(inode_u.bi_inum != inode->v.i_ino);
+		/* the commit stored its sequence in dir->ei_journal_seq; copy it to the inode */
+		journal_seq_copy(inode, dir->ei_journal_seq);
+		bch2_inode_update_after_write(c, dir, &dir_u,
+					      ATTR_MTIME|ATTR_CTIME);
+		bch2_inode_update_after_write(c, inode, &inode_u,
+					      ATTR_MTIME);
+	}
+
+	bch2_trans_exit(&trans);
+	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+
+	return ret;
+}
+
+static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
+			const char *symname)
+{
+	struct bch_fs *c = vdir->i_sb->s_fs_info;
+	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
+	int ret;
+	/* created with tmpfile == true; the name is only added by __bch2_link() below */
+	inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
+	if (unlikely(IS_ERR(inode)))
+		return PTR_ERR(inode);
+
+	inode_lock(&inode->v);
+	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
+	inode_unlock(&inode->v);
+
+	if (unlikely(ret))
+		goto err;
+	/* the link target is written out and waited for before the name is linked in */
+	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
+	if (unlikely(ret))
+		goto err;
+
+	journal_seq_copy(dir, inode->ei_journal_seq);
+
+	ret = __bch2_link(c, inode, dir, dentry);
+	if (unlikely(ret))
+		goto err;
+
+	d_instantiate(dentry, &inode->v);
+	return 0;
+err:
+	iput(&inode->v);
+	return ret;
+}
+/* ->mkdir: a directory is created via bch2_mknod() with S_IFDIR and rdev 0. */
+static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
+{
+	return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0);
+}
+
+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
+			struct inode *dst_vdir, struct dentry *dst_dentry,
+			unsigned flags)
+{
+	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
+	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
+	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
+	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
+	struct bch_inode_unpacked dst_dir_u, src_dir_u;
+	struct bch_inode_unpacked src_inode_u, dst_inode_u;
+	struct btree_trans trans;
+	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
+		? BCH_RENAME_EXCHANGE
+		: dst_dentry->d_inode
+		? BCH_RENAME_OVERWRITE : BCH_RENAME;
+	u64 journal_seq = 0;
+	int ret;
+	/* mode: exchange if requested, overwrite if the target exists, else a plain rename */
+	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if (mode == BCH_RENAME_OVERWRITE) {
+		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
+						   0, LLONG_MAX);
+		if (ret)
+			return ret;
+	}
+
+	bch2_trans_init(&trans, c, 8, 2048);
+
+	bch2_lock_inodes(INODE_UPDATE_LOCK,
+			 src_dir,
+			 dst_dir,
+			 src_inode,
+			 dst_inode);
+	/* project quota is transferred up front (PREALLOC mode), before the btree update */
+	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
+		ret = bch2_fs_quota_transfer(c, src_inode,
+					     dst_dir->ei_qid,
+					     1 << QTYP_PRJ,
+					     KEY_TYPE_QUOTA_PREALLOC);
+		if (ret)
+			goto err;
+	}
+
+	if (mode == BCH_RENAME_EXCHANGE &&
+	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
+		ret = bch2_fs_quota_transfer(c, dst_inode,
+					     src_dir->ei_qid,
+					     1 << QTYP_PRJ,
+					     KEY_TYPE_QUOTA_PREALLOC);
+		if (ret)
+			goto err;
+	}
+
+retry:
+	bch2_trans_begin(&trans);
+	ret   = bch2_rename_trans(&trans,
+				  src_dir->v.i_ino, &src_dir_u,
+				  dst_dir->v.i_ino, &dst_dir_u,
+				  &src_inode_u,
+				  &dst_inode_u,
+				  &src_dentry->d_name,
+				  &dst_dentry->d_name,
+				  mode) ?:
+		bch2_trans_commit(&trans, NULL,
+				  &journal_seq,
+				  BTREE_INSERT_ATOMIC|
+				  BTREE_INSERT_NOUNLOCK);
+	if (ret == -EINTR)
+		goto retry;
+	if (unlikely(ret))
+		goto err;
+
+	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
+	BUG_ON(dst_inode &&
+	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
+
+	bch2_inode_update_after_write(c, src_dir, &src_dir_u,
+				      ATTR_MTIME|ATTR_CTIME);
+	journal_seq_copy(src_dir, journal_seq);
+
+	if (src_dir != dst_dir) {
+		bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
+					      ATTR_MTIME|ATTR_CTIME);
+		journal_seq_copy(dst_dir, journal_seq);
+	}
+
+	bch2_inode_update_after_write(c, src_inode, &src_inode_u,
+				      ATTR_CTIME);
+	journal_seq_copy(src_inode, journal_seq);
+
+	if (dst_inode) {
+		bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
+					      ATTR_CTIME);
+		journal_seq_copy(dst_inode, journal_seq);
+	}
+err:
+	bch2_trans_exit(&trans);
+	/* success or failure: bring ei_qid back in line with the project held in ei_inode */
+	bch2_fs_quota_transfer(c, src_inode,
+			       bch_qid(&src_inode->ei_inode),
+			       1 << QTYP_PRJ,
+			       KEY_TYPE_QUOTA_NOCHECK);
+	if (dst_inode)
+		bch2_fs_quota_transfer(c, dst_inode,
+				       bch_qid(&dst_inode->ei_inode),
+				       1 << QTYP_PRJ,
+				       KEY_TYPE_QUOTA_NOCHECK);
+
+	bch2_unlock_inodes(INODE_UPDATE_LOCK,
+			   src_dir,
+			   dst_dir,
+			   src_inode,
+			   dst_inode);
+
+	return ret;
+}
+
+void bch2_setattr_copy(struct bch_inode_info *inode,
+		       struct bch_inode_unpacked *bi,
+		       struct iattr *attr)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	unsigned int valid = attr->ia_valid;
+	umode_t mode;
+	kgid_t gid;
+	/* copy every attribute flagged in ia_valid into the unpacked inode @bi */
+	if (valid & ATTR_UID)
+		bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid);
+	if (valid & ATTR_GID)
+		bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid);
+
+	if (valid & ATTR_ATIME)
+		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
+	if (valid & ATTR_MTIME)
+		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
+	if (valid & ATTR_CTIME)
+		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
+
+	if (!(valid & ATTR_MODE))
+		return;
+	/* setgid is dropped unless the caller is in the group or has CAP_FSETID */
+	mode = attr->ia_mode;
+	gid = (valid & ATTR_GID) ? attr->ia_gid : inode->v.i_gid;
+	if (!in_group_p(gid) &&
+	    !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
+		mode &= ~S_ISGID;
+	bi->bi_mode = mode;
+}
+
+static int bch2_setattr_nonsize(struct bch_inode_info *inode,
+				struct iattr *attr)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_qid qid;
+	struct btree_trans trans;
+	struct btree_iter *inode_iter;
+	struct bch_inode_unpacked inode_u;
+	struct posix_acl *acl = NULL;
+	int ret;
+	/* Apply every attribute change except size; runs under inode->ei_update_lock. */
+	mutex_lock(&inode->ei_update_lock);
+	/* uid/gid changes transfer quota first; only then is the inode rewritten */
+	qid = inode->ei_qid;
+
+	if (attr->ia_valid & ATTR_UID)
+		qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
+
+	if (attr->ia_valid & ATTR_GID)
+		qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
+
+	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
+				     KEY_TYPE_QUOTA_PREALLOC);
+	if (ret)
+		goto err;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+	kfree(acl);	/* a retry discards the ACL built by the previous attempt */
+	acl = NULL;
+
+	inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
+				     BTREE_ITER_INTENT);
+	ret = PTR_ERR_OR_ZERO(inode_iter);
+	if (ret)
+		goto btree_err;
+
+	bch2_setattr_copy(inode, &inode_u, attr);
+
+	if (attr->ia_valid & ATTR_MODE) {
+		ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl);
+		if (ret)
+			goto btree_err;
+	}
+
+	ret =   bch2_inode_write(&trans, inode_iter, &inode_u) ?:
+		bch2_trans_commit(&trans, NULL,
+				  &inode->ei_journal_seq,
+				  BTREE_INSERT_ATOMIC|
+				  BTREE_INSERT_NOUNLOCK|
+				  BTREE_INSERT_NOFAIL);
+btree_err:
+	if (ret == -EINTR)
+		goto retry;
+	if (unlikely(ret))
+		goto err_trans;
+
+	bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
+	/* set_cached_acl() takes its own reference; ours is dropped at err_trans */
+	if (acl)
+		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+	bch2_trans_exit(&trans);
+	posix_acl_release(acl);
+err:
+	mutex_unlock(&inode->ei_update_lock);
+	return ret;
+}
+
+static int bch2_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned query_flags)
+{
+	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct inode *vi = &inode->v;
+	/* the basic stat fields are taken from the in-memory VFS inode */
+	stat->dev	= vi->i_sb->s_dev;
+	stat->ino	= vi->i_ino;
+	stat->mode	= vi->i_mode;
+	stat->nlink	= vi->i_nlink;
+	stat->uid	= vi->i_uid;
+	stat->gid	= vi->i_gid;
+	stat->rdev	= vi->i_rdev;
+	stat->size	= i_size_read(vi);
+	stat->atime	= vi->i_atime;
+	stat->mtime	= vi->i_mtime;
+	stat->ctime	= vi->i_ctime;
+	stat->blksize	= block_bytes(c);
+	stat->blocks	= vi->i_blocks;
+	/* birth time (bi_otime) is only reported when STATX_BTIME was requested */
+	if (request_mask & STATX_BTIME) {
+		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
+		stat->result_mask |= STATX_BTIME;
+	}
+
+	if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+		stat->attributes |= STATX_ATTR_IMMUTABLE;
+	if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+		stat->attributes |= STATX_ATTR_APPEND;
+	if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+		stat->attributes |= STATX_ATTR_NODUMP;
+	return 0;
+}
+
+static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct bch_inode_info *ei = to_bch_ei(dentry->d_inode);
+	int err;
+
+	lockdep_assert_held(&ei->v.i_rwsem);
+
+	err = setattr_prepare(dentry, iattr);
+	if (err)
+		return err;
+	/* a size change takes the truncate path; everything else is a plain update */
+	if (iattr->ia_valid & ATTR_SIZE)
+		return bch2_truncate(ei, iattr);
+	return bch2_setattr_nonsize(ei, iattr);
+}
+
+static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
+{
+	struct bch_inode_info *dir = to_bch_ei(vdir), *created;
+
+	/* tmpfile == true: no name is passed down to the create transaction */
+	created = __bch2_create(dir, dentry, mode, 0, true);
+	if (IS_ERR(created))
+		return PTR_ERR(created);
+	d_mark_tmpfile(dentry, &created->v);
+	d_instantiate(dentry, &created->v);
+	return 0;
+}
+
+static int bch2_fill_extent(struct bch_fs *c,
+			    struct fiemap_extent_info *info,
+			    struct bkey_s_c k, unsigned flags)
+{
+	if (bkey_extent_is_data(k.k)) {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		int ret;
+		/* data extents: one fiemap record is emitted per decoded pointer */
+		if (k.k->type == KEY_TYPE_reflink_v)
+			flags |= FIEMAP_EXTENT_SHARED;
+
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+			int flags2 = 0;
+			u64 offset = p.ptr.offset;
+			/* compressed: flag as ENCODED; otherwise add the crc offset to the pointer offset */
+			if (p.crc.compression_type)
+				flags2 |= FIEMAP_EXTENT_ENCODED;
+			else
+				offset += p.crc.offset;
+
+			if ((offset & (c->opts.block_size - 1)) ||
+			    (k.k->size & (c->opts.block_size - 1)))
+				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
+			/* sectors -> bytes; size is widened to u64 first so that << 9 cannot wrap */
+			ret = fiemap_fill_next_extent(info,
+						bkey_start_offset(k.k) << 9,
+						offset << 9,
+						(u64) k.k->size << 9, flags|flags2);
+			if (ret)
+				return ret;
+		}
+
+		return 0;
+	} else if (k.k->type == KEY_TYPE_reservation) {
+		return fiemap_fill_next_extent(info,
+					       bkey_start_offset(k.k) << 9,
+					       0, (u64) k.k->size << 9,
+					       flags|
+					       FIEMAP_EXTENT_DELALLOC|
+					       FIEMAP_EXTENT_UNWRITTEN);
+	} else {
+		BUG();
+	}
+}
+
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+		       u64 start, u64 len)
+{
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	BKEY_PADDED(k) cur, prev;
+	struct bpos end;
+	unsigned offset_into_extent, sectors;
+	bool have_extent = false;
+	int ret = 0;
+	/* ->fiemap: report the extents of @vinode from byte @start up to @start + @len. */
+	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+	/* fiemap_prep() may have clamped len, so the end position is derived only now */
+	if (start + len < start)
+		return -EINVAL;
+	end = POS(ei->v.i_ino, (start + len) >> 9);
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   POS(ei->v.i_ino, start >> 9), 0);
+retry:
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k)) &&
+	       bkey_cmp(iter->pos, end) < 0) {
+		if (!bkey_extent_is_data(k.k) &&
+		    k.k->type != KEY_TYPE_reservation) {
+			bch2_btree_iter_next(iter);
+			continue;
+		}
+
+		bkey_reassemble(&cur.k, k);
+		k = bkey_i_to_s_c(&cur.k);
+
+		offset_into_extent	= iter->pos.offset -
+			bkey_start_offset(k.k);
+		sectors			= k.k->size - offset_into_extent;
+
+		ret = bch2_read_indirect_extent(&trans,
+					&offset_into_extent, &cur.k);
+		if (ret)
+			break;
+
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		if (offset_into_extent)
+			bch2_cut_front(POS(k.k->p.inode,
+					   bkey_start_offset(k.k) +
+					   offset_into_extent),
+				       &cur.k);
+		bch2_key_resize(&cur.k.k, sectors);
+		cur.k.k.p = iter->pos;
+		cur.k.k.p.offset += cur.k.k.size;
+		/* reporting runs one extent behind, so the final one can carry FIEMAP_EXTENT_LAST */
+		if (have_extent) {
+			ret = bch2_fill_extent(c, info,
+					bkey_i_to_s_c(&prev.k), 0);
+			if (ret)
+				break;
+		}
+
+		bkey_copy(&prev.k, &cur.k);
+		have_extent = true;
+
+		if (k.k->type == KEY_TYPE_reflink_v)
+			bch2_btree_iter_set_pos(iter, k.k->p);
+		else
+			bch2_btree_iter_next(iter);
+	}
+
+	if (ret == -EINTR)
+		goto retry;
+
+	if (!ret && have_extent)
+		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
+				       FIEMAP_EXTENT_LAST);
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+	return ret < 0 ? ret : 0;
+}
+/* vm_operations installed on file mappings by bch2_mmap(): */
+static const struct vm_operations_struct bch_vm_ops = {
+	.fault		= bch2_page_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite   = bch2_page_mkwrite,
+};
+
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+	/* faults on this mapping go through bch_vm_ops; always succeeds */
+	vma->vm_ops = &bch_vm_ops;
+	return 0;
+}
+
+/* Directories: */
+/* Directory seek: generic llseek with both the maximum offset and the EOF position at S64_MAX. */
+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	return generic_file_llseek_size(file, offset, whence,
+					S64_MAX, S64_MAX);
+}
+
+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct bch_inode_info *dir = file_bch_inode(file);
+	struct bch_fs *fs = dir->v.i_sb->s_fs_info;
+
+	/* "." and ".." come from dir_emit_dots(); if that fails we are done for now */
+	if (dir_emit_dots(file, ctx))
+		return bch2_readdir(fs, dir->v.i_ino, ctx);
+	return 0;
+}
+/* file_operations for regular files (installed for S_IFREG by bch2_vfs_inode_init()): */
+static const struct file_operations bch_file_operations = {
+	.llseek		= bch2_llseek,
+	.read_iter	= bch2_read_iter,
+	.write_iter	= bch2_write_iter,
+	.mmap		= bch2_mmap,
+	.open		= generic_file_open,
+	.fsync		= bch2_fsync,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= bch2_fallocate_dispatch,
+	.unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= bch2_compat_fs_ioctl,
+#endif
+	.remap_file_range = bch2_remap_file_range,
+};
+/* inode_operations for regular files (installed for S_IFREG by bch2_vfs_inode_init()): */
+static const struct inode_operations bch_file_inode_operations = {
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.fiemap		= bch2_fiemap,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+/* inode_operations for directories; note that ->rmdir shares bch2_unlink() with ->unlink: */
+static const struct inode_operations bch_dir_inode_operations = {
+	.lookup		= bch2_lookup,
+	.create		= bch2_create,
+	.link		= bch2_link,
+	.unlink		= bch2_unlink,
+	.symlink	= bch2_symlink,
+	.mkdir		= bch2_mkdir,
+	.rmdir		= bch2_unlink,
+	.mknod		= bch2_mknod,
+	.rename		= bch2_rename2,
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.tmpfile	= bch2_tmpfile,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+/* file_operations for directories (installed for S_IFDIR by bch2_vfs_inode_init()): */
+static const struct file_operations bch_dir_file_operations = {
+	.llseek		= bch2_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate_shared	= bch2_vfs_readdir,
+	.fsync		= bch2_fsync,
+	.unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= bch2_compat_fs_ioctl,
+#endif
+};
+/* inode_operations for symlinks; the link target is read back via page_get_link(): */
+static const struct inode_operations bch_symlink_inode_operations = {
+	.get_link	= page_get_link,
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+/* inode_operations for all other file types (the default case in bch2_vfs_inode_init()): */
+static const struct inode_operations bch_special_inode_operations = {
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+/* address_space operations set on every inode's i_mapping by bch2_vfs_inode_init(): */
+static const struct address_space_operations bch_address_space_operations = {
+	.writepage	= bch2_writepage,
+	.readpage	= bch2_readpage,
+	.writepages	= bch2_writepages,
+	.readpages	= bch2_readpages,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.write_begin	= bch2_write_begin,
+	.write_end	= bch2_write_end,
+	.invalidatepage	= bch2_invalidatepage,
+	.releasepage	= bch2_releasepage,
+	.direct_IO	= noop_direct_IO,
+#ifdef CONFIG_MIGRATION
+	.migratepage	= bch2_migrate_page,
+#endif
+	.error_remove_page = generic_error_remove_page,
+};
+
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *found;
+
+	/* inode numbers below BCACHEFS_ROOT_INO are rejected as stale */
+	if (ino < BCACHEFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	found = bch2_vfs_inode_get(sb->s_fs_info, ino);
+	if (IS_ERR(found))
+		return ERR_CAST(found);
+
+	/* generation 0 is not checked; a nonzero one must match the inode's */
+	if (!generation || found->i_generation == generation)
+		return found;
+	iput(found);
+	return ERR_PTR(-ESTALE);
+}
+/* ->fh_to_dentry: the generic helper decodes the fid, bch2_nfs_get_inode() fetches the inode. */
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    bch2_nfs_get_inode);
+}
+/* ->fh_to_parent: same as above, through generic_fh_to_parent(). */
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    bch2_nfs_get_inode);
+}
+/* File-handle export operations; ->get_parent is left commented out (not provided). */
+static const struct export_operations bch_export_ops = {
+	.fh_to_dentry	= bch2_fh_to_dentry,
+	.fh_to_parent	= bch2_fh_to_parent,
+	//.get_parent	= bch2_get_parent,
+};
+
+static void bch2_vfs_inode_init(struct bch_fs *c,
+				struct bch_inode_info *inode,
+				struct bch_inode_unpacked *bi)
+{
+	struct inode *vi = &inode->v;
+
+	/* mode, ownership, nlink, all three timestamps, ei_inode and i_flags */
+	bch2_inode_update_after_write(c, inode, bi, ~0);
+
+	vi->i_blocks		= bi->bi_sectors;
+	vi->i_ino		= bi->bi_inum;
+	vi->i_rdev		= bi->bi_dev;
+	vi->i_generation	= bi->bi_generation;
+	vi->i_size		= bi->bi_size;
+
+	/* in-memory bookkeeping starts out with no journal seq and no quota reserved */
+	inode->ei_journal_seq	= 0;
+	inode->ei_quota_reserved = 0;
+	inode->ei_str_hash	= bch2_hash_info_init(c, bi);
+	inode->ei_qid		= bch_qid(bi);
+
+	vi->i_mapping->a_ops = &bch_address_space_operations;
+
+	/* the operation tables are chosen by file type */
+	if (S_ISREG(vi->i_mode)) {
+		vi->i_op	= &bch_file_inode_operations;
+		vi->i_fop	= &bch_file_operations;
+	} else if (S_ISDIR(vi->i_mode)) {
+		vi->i_op	= &bch_dir_inode_operations;
+		vi->i_fop	= &bch_dir_file_operations;
+	} else if (S_ISLNK(vi->i_mode)) {
+		inode_nohighmem(vi);
+		vi->i_op	= &bch_symlink_inode_operations;
+	} else {
+		init_special_inode(vi, vi->i_mode, vi->i_rdev);
+		vi->i_op	= &bch_special_inode_operations;
+	}
+}
+
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+	struct bch_inode_info *ei;
+
+	ei = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+	if (!ei)
+		return NULL;
+
+	/* the embedded VFS inode and our own locks are initialised on every allocation */
+	inode_init_once(&ei->v);
+	mutex_init(&ei->ei_update_lock);
+	mutex_init(&ei->ei_quota_lock);
+	pagecache_lock_init(&ei->ei_pagecache_lock);
+	ei->ei_journal_seq = 0;
+	return &ei->v;
+}
+
+static void bch2_i_callback(struct rcu_head *head)
+{	/* RCU callback queued by bch2_destroy_inode(): free the containing bch_inode_info */
+	struct inode *vinode = container_of(head, struct inode, i_rcu);
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+
+	kmem_cache_free(bch2_inode_cache, inode);
+}
+
+static void bch2_destroy_inode(struct inode *vinode)
+{	/* ->destroy_inode: the actual free happens in bch2_i_callback(), via call_rcu() */
+	call_rcu(&vinode->i_rcu, bch2_i_callback);
+}
+
+static int inode_update_times_fn(struct bch_inode_info *inode,
+				 struct bch_inode_unpacked *bi,
+				 void *p)
+{	/* inode_set_fn: copy the VFS a/m/ctime into @bi; @p is unused; always returns 0 */
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+	bi->bi_atime	= timespec_to_bch2_time(c, inode->v.i_atime);
+	bi->bi_mtime	= timespec_to_bch2_time(c, inode->v.i_mtime);
+	bi->bi_ctime	= timespec_to_bch2_time(c, inode->v.i_ctime);
+
+	return 0;
+}
+
+static int bch2_vfs_write_inode(struct inode *vinode,
+				struct writeback_control *wbc)
+{	/* ->write_inode: write the VFS timestamps back to the btree inode; @wbc is unused */
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	int ret;
+
+	mutex_lock(&inode->ei_update_lock);	/* held across the whole inode write */
+	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
+	mutex_unlock(&inode->ei_update_lock);
+
+	return ret;	/* result of bch2_write_inode() */
+}
+
+static void bch2_evict_inode(struct inode *vinode)
+{	/* ->evict_inode: drop the page cache; if no links remain, release quota and remove the inode */
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+
+	truncate_inode_pages_final(&inode->v.i_data);
+
+	clear_inode(&inode->v);
+	/* a good inode must not be evicted with a quota reservation still outstanding: */
+	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
+
+	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
+				KEY_TYPE_QUOTA_WARN);	/* give back i_blocks of space */
+		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
+				KEY_TYPE_QUOTA_WARN);	/* and one inode */
+		bch2_inode_rm(c, inode->v.i_ino);
+	}
+}
+
+static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{	/* ->statfs: fill @buf from bch2_fs_usage_read_short(); always returns 0 */
+	struct super_block *sb = dentry->d_sb;
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
+	unsigned shift = sb->s_blocksize_bits - 9;	/* presumably usage is in 512-byte sectors - confirm */
+	u64 fsid;
+
+	buf->f_type	= BCACHEFS_STATFS_MAGIC;
+	buf->f_bsize	= sb->s_blocksize;
+	buf->f_blocks	= usage.capacity >> shift;
+	buf->f_bfree	= (usage.capacity - usage.used) >> shift;
+	buf->f_bavail	= buf->f_bfree;	/* same value as f_bfree */
+	buf->f_files	= usage.nr_inodes;
+	buf->f_ffree	= U64_MAX;
+	/* fsid: XOR of the two little-endian u64s at user_uuid.b[0] and .b[8], split in 32-bit halves */
+	fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
+	       le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	buf->f_namelen	= BCH_NAME_MAX;
+
+	return 0;
+}
+
+static int bch2_sync_fs(struct super_block *sb, int wait)
+{	/* ->sync_fs: flush the journal, synchronously only when @wait is set */
+	struct bch_fs *c = sb->s_fs_info;
+
+	if (c->opts.journal_flush_disabled)
+		return 0;	/* option set: report success without flushing */
+
+	if (!wait) {
+		bch2_journal_flush_async(&c->journal, NULL);
+		return 0;	/* result of the async flush is not reported */
+	}
+
+	return bch2_journal_flush(&c->journal);
+}
+
+static struct bch_fs *bch2_path_to_fs(const char *path)
+{	/* map a block device path to its bch_fs; returns an ERR_PTR, never NULL, on failure */
+	struct bch_fs *c;
+	dev_t dev;
+	int ret;
+
+	ret = lookup_bdev(path, &dev);
+	if (ret)
+		return ERR_PTR(ret);
+
+	c = bch2_dev_to_fs(dev);
+	return c ?: ERR_PTR(-ENOENT);	/* NULL from bch2_dev_to_fs() becomes -ENOENT */
+}
+
+static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
+					       unsigned nr_devs, struct bch_opts opts)
+{	/* open the fs on @devs, or find it if already open; @dev_name is only used in the error message */
+	struct bch_fs *c, *c1, *c2;
+	size_t i;
+
+	if (!nr_devs)
+		return ERR_PTR(-EINVAL);
+
+	c = bch2_fs_open(devs, nr_devs, opts);
+
+	if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
+		/*
+		 * Already open?
+		 * Look up each block device, make sure they all belong to a
+		 * filesystem and they all belong to the _same_ filesystem
+		 */
+
+		c1 = bch2_path_to_fs(devs[0]);
+		if (IS_ERR(c1))
+			return c;	/* c is still the -EBUSY error */
+
+		for (i = 1; i < nr_devs; i++) {
+			c2 = bch2_path_to_fs(devs[i]);
+			if (!IS_ERR(c2))
+				closure_put(&c2->cl);	/* presumably drops the ref taken by the lookup - confirm */
+
+			if (c1 != c2) {	/* also true when c2 is an ERR_PTR */
+				closure_put(&c1->cl);
+				return c;
+			}
+		}
+
+		c = c1;	/* every device maps to the same, already open fs */
+	}
+
+	if (IS_ERR(c))
+		return c;
+
+	mutex_lock(&c->state_lock);
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+		mutex_unlock(&c->state_lock);
+		closure_put(&c->cl);
+		pr_err("err mounting %s: incomplete filesystem", dev_name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	mutex_unlock(&c->state_lock);
+	/* bch2_kill_sb() tests this flag to choose between bch2_fs_stop() and closure_put(): */
+	set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
+	return c;
+}
+
+static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
+					     struct bch_opts opts)
+{	/* split the ':' separated @_dev_name into device paths and open them as one fs */
+	char *dev_name = NULL, **devs = NULL, *s;
+	struct bch_fs *c = ERR_PTR(-ENOMEM);	/* result if an allocation below fails */
+	size_t i, nr_devs = 0;
+
+	dev_name = kstrdup(_dev_name, GFP_KERNEL);	/* private copy, split in place below */
+	if (!dev_name)
+		goto err;
+	/* count exactly as the split loop below walks, so a leading ':' cannot overflow devs[]: */
+	for (s = dev_name; s; (s = strchr(s, ':')) && s++)
+		nr_devs++;
+
+	devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
+	if (!devs)
+		goto err;
+	/* each ':' becomes a NUL; devs[] points at the pieces inside dev_name: */
+	for (i = 0, s = dev_name;
+	     s;
+	     (s = strchr(s, ':')) && (*s++ = '\0'))
+		devs[i++] = s;
+
+	c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
+err:
+	kfree(devs);
+	kfree(dev_name);
+	return c;	/* a bch_fs or an ERR_PTR */
+}
+
+static int bch2_remount(struct super_block *sb, int *flags, char *data)
+{	/* ->remount_fs: apply a read_only change and a new errors option, nothing else */
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_opts opts = bch2_opts_empty();
+	int ret;
+
+	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
+	ret = bch2_parse_mount_opts(&opts, data);
+	if (ret)
+		return ret;
+
+	if (opts.read_only != c->opts.read_only) {
+		mutex_lock(&c->state_lock);
+
+		if (opts.read_only) {
+			bch2_fs_read_only(c);
+
+			sb->s_flags |= SB_RDONLY;
+		} else {
+			ret = bch2_fs_read_write(c);
+			if (ret) {
+				bch_err(c, "error going rw: %i", ret);
+				mutex_unlock(&c->state_lock);
+				return -EINVAL;	/* the code in ret is only logged */
+			}
+
+			sb->s_flags &= ~SB_RDONLY;
+		}
+
+		c->opts.read_only = opts.read_only;
+
+		mutex_unlock(&c->state_lock);
+	}
+
+	if (opts.errors >= 0)	/* written without taking state_lock */
+		c->opts.errors = opts.errors;
+
+	return ret;	/* 0 on every path that reaches here */
+}
+
+static int bch2_show_options(struct seq_file *seq, struct dentry *root)
+{	/* ->show_options: emit ",<option>" for each mount option that differs from the default */
+	struct bch_fs *c = root->d_sb->s_fs_info;
+	enum bch_opt_id i;
+	char buf[512];
+
+	for (i = 0; i < bch2_opts_nr; i++) {
+		const struct bch_option *opt = &bch2_opt_table[i];
+		u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+		if (!(opt->mode & OPT_MOUNT))
+			continue;	/* not a mount option */
+
+		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+			continue;	/* same as the default: not shown */
+
+		bch2_opt_to_text(&PBUF(buf), c, opt, v,
+				 OPT_SHOW_MOUNT_STYLE);
+		seq_putc(seq, ',');
+		seq_puts(seq, buf);
+	}
+
+	return 0;
+
+}
+
+static const struct super_operations bch_super_operations = {	/* set as sb->s_op in bch2_mount() */
+	.alloc_inode	= bch2_alloc_inode,
+	.destroy_inode	= bch2_destroy_inode,
+	.write_inode	= bch2_vfs_write_inode,
+	.evict_inode	= bch2_evict_inode,
+	.sync_fs	= bch2_sync_fs,
+	.statfs		= bch2_statfs,
+	.show_options	= bch2_show_options,
+	.remount_fs	= bch2_remount,
+#if 0	/* compiled out: these ops are not provided */
+	.put_super	= bch2_put_super,
+	.freeze_fs	= bch2_freeze,
+	.unfreeze_fs	= bch2_unfreeze,
+#endif
+};
+
+static int bch2_test_super(struct super_block *s, void *data)
+{	/* sget() test callback: a superblock matches when it already belongs to @data (the bch_fs) */
+	return s->s_fs_info == data;
+}
+
+static int bch2_set_super(struct super_block *s, void *data)
+{	/* sget() set callback: attach @data (the bch_fs) to a new superblock */
+	s->s_fs_info = data;
+	return 0;	/* cannot fail */
+}
+
+static struct dentry *bch2_mount(struct file_system_type *fs_type,
+				 int flags, const char *dev_name, void *data)
+{	/* ->mount: open the fs named by @dev_name and return a reference to its root dentry */
+	struct bch_fs *c;
+	struct bch_dev *ca;
+	struct super_block *sb;
+	struct inode *vinode;
+	struct bch_opts opts = bch2_opts_empty();
+	unsigned i;
+	int ret;
+
+	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
+
+	ret = bch2_parse_mount_opts(&opts, data);
+	if (ret)
+		return ERR_PTR(ret);
+
+	c = bch2_open_as_blockdevs(dev_name, opts);
+	if (IS_ERR(c))
+		return ERR_CAST(c);
+	/* find the superblock already attached to @c (bch2_test_super) or get a new one: */
+	sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c);
+	if (IS_ERR(sb)) {
+		closure_put(&c->cl);
+		return ERR_CAST(sb);
+	}
+
+	BUG_ON(sb->s_fs_info != c);
+
+	if (sb->s_root) {	/* superblock was already set up by an earlier mount */
+		closure_put(&c->cl);
+
+		if ((flags ^ sb->s_flags) & SB_RDONLY) {	/* ro/rw mismatch with the existing mount */
+			ret = -EBUSY;
+			goto err_put_super;
+		}
+		goto out;
+	}
+	/* new superblock: fill it in from @c */
+	sb->s_blocksize		= block_bytes(c);
+	sb->s_blocksize_bits	= ilog2(block_bytes(c));
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_op		= &bch_super_operations;
+	sb->s_export_op		= &bch_export_ops;
+#ifdef CONFIG_BCACHEFS_QUOTA
+	sb->s_qcop		= &bch2_quotactl_operations;
+	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
+#endif
+	sb->s_xattr		= bch2_xattr_handlers;
+	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
+	sb->s_time_gran		= c->sb.time_precision;
+	c->vfs_sb		= sb;
+	strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+
+	ret = super_setup_bdi(sb);
+	if (ret)
+		goto err_put_super;
+
+	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
+	/* only the first online member is used: the loop body always breaks */
+	for_each_online_member(ca, c, i) {
+		struct block_device *bdev = ca->disk_sb.bdev;
+
+		/* XXX: create an anonymous device for multi device filesystems */
+		sb->s_bdev	= bdev;
+		sb->s_dev	= bdev->bd_dev;
+		percpu_ref_put(&ca->io_ref);	/* presumably the ref held by the iterator - confirm */
+		break;
+	}
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	if (c->opts.acl)
+		sb->s_flags	|= SB_POSIXACL;
+#endif
+
+	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
+	if (IS_ERR(vinode)) {
+		bch_err(c, "error mounting: error getting root inode %i",
+			(int) PTR_ERR(vinode));
+		ret = PTR_ERR(vinode);
+		goto err_put_super;
+	}
+
+	sb->s_root = d_make_root(vinode);
+	if (!sb->s_root) {
+		bch_err(c, "error mounting: error allocating root dentry");
+		ret = -ENOMEM;
+		goto err_put_super;
+	}
+
+	sb->s_flags |= SB_ACTIVE;
+out:
+	return dget(sb->s_root);
+
+err_put_super:
+	deactivate_locked_super(sb);
+	return ERR_PTR(ret);
+}
+
+static void bch2_kill_sb(struct super_block *sb)
+{	/* ->kill_sb: shut the superblock down, then stop the fs or drop our reference */
+	struct bch_fs *c = sb->s_fs_info;
+
+	generic_shutdown_super(sb);
+	/* BCH_FS_BDEV_MOUNTED is set by __bch2_open_as_blockdevs(): */
+	if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
+		bch2_fs_stop(c);
+	else
+		closure_put(&c->cl);
+}
+
+static struct file_system_type bcache_fs_type = {	/* registered in bch2_vfs_init() */
+	.owner		= THIS_MODULE,
+	.name		= "bcachefs",
+	.mount		= bch2_mount,
+	.kill_sb	= bch2_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+MODULE_ALIAS_FS("bcachefs");
+
+void bch2_vfs_exit(void)
+{	/* undo bch2_vfs_init(); also called from its error path, so state may be partial */
+	unregister_filesystem(&bcache_fs_type);
+	if (bch2_inode_cache)
+		kmem_cache_destroy(bch2_inode_cache);
+}
+
+int __init bch2_vfs_init(void)
+{	/* create the inode slab cache and register the filesystem type; 0 or a negative errno */
+	int ret = -ENOMEM;	/* reported if the cache cannot be created */
+
+	bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
+	if (!bch2_inode_cache)
+		goto err;
+
+	ret = register_filesystem(&bcache_fs_type);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	bch2_vfs_exit();	/* tears down whatever was set up */
+	return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
new file mode 100644
index 000000000000..eda903a45325
--- /dev/null
+++ b/fs/bcachefs/fs.h
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_H
+#define _BCACHEFS_FS_H
+
+#include "inode.h"
+#include "opts.h"
+#include "str_hash.h"
+#include "quota_types.h"
+
+#include <linux/seqlock.h>
+#include <linux/stat.h>
+
+/*
+ * Two-state lock: it can be taken for "add" or for "block". Each state is
+ * shared among its holders, like the read side of an rwsem, but the two
+ * states conflict with each other:
+ */
+struct pagecache_lock {
+	atomic_long_t		v;	/* initialised to 0 by pagecache_lock_init() */
+	wait_queue_head_t	wait;
+};
+
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{	/* reset @lock: counter to 0 and an empty wait queue */
+	atomic_long_set(&lock->v, 0);
+	init_waitqueue_head(&lock->wait);
+}
+
+void bch2_pagecache_add_put(struct pagecache_lock *);
+void bch2_pagecache_add_get(struct pagecache_lock *);
+void bch2_pagecache_block_put(struct pagecache_lock *);
+void bch2_pagecache_block_get(struct pagecache_lock *);
+
+struct bch_inode_info {
+	struct inode		v;	/* embedded VFS inode; to_bch_ei() maps back from it */
+
+	struct mutex		ei_update_lock;	/* one of the locks bch2_lock_inodes() can take */
+	u64			ei_journal_seq;
+	u64			ei_quota_reserved;
+	unsigned long		ei_last_dirtied;
+
+	struct pagecache_lock	ei_pagecache_lock;
+
+	struct mutex		ei_quota_lock;
+	struct bch_qid		ei_qid;	/* starting point for the new qid in bch2_set_projid() */
+
+	struct bch_hash_info	ei_str_hash;
+
+	/* copy of inode in btree: */
+	struct bch_inode_unpacked ei_inode;
+};
+
+#define to_bch_ei(_inode)	/* struct inode * -> containing bch_inode_info */	\
+	container_of_or_null(_inode, struct bch_inode_info, v)
+
+static inline int ptrcmp(void *l, void *r)
+{	/* compare two pointers by value; sort callback for bch2_lock_inodes()/bch2_unlock_inodes() */
+	return cmp_int(l, r);
+}
+
+enum bch_inode_lock_op {	/* bitmask: the _locks argument of bch2_lock_inodes()/bch2_unlock_inodes() */
+	INODE_LOCK		= (1U << 0),	/* v.i_rwsem, taken for write */
+	INODE_PAGECACHE_BLOCK	= (1U << 1),	/* ei_pagecache_lock, "block" state */
+	INODE_UPDATE_LOCK	= (1U << 2),	/* ei_update_lock */
+};
+
+#define bch2_lock_inodes(_locks, ...)					\
+do {									\
+	struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };		\
+	unsigned i;							\
+	/* sort by pointer value so that duplicates become adjacent: */	\
+	bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);			\
+	/* a[0] is NULL; an entry equal to its predecessor is skipped: */	\
+	for (i = 1; i < ARRAY_SIZE(a); i++)				\
+		if (a[i] != a[i - 1]) {					\
+			if ((_locks) & INODE_LOCK)			\
+				down_write_nested(&a[i]->v.i_rwsem, i);	\
+			if ((_locks) & INODE_PAGECACHE_BLOCK)		\
+				bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
+			if ((_locks) & INODE_UPDATE_LOCK)			\
+				mutex_lock_nested(&a[i]->ei_update_lock, i);\
+		}							\
+} while (0)
+
+#define bch2_unlock_inodes(_locks, ...)					\
+do {									\
+	struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };		\
+	unsigned i;							\
+	/* same sort and skip rule as in bch2_lock_inodes(): */		\
+	bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);			\
+	/* a[0] is NULL; an entry equal to its predecessor is skipped: */	\
+	for (i = 1; i < ARRAY_SIZE(a); i++)				\
+		if (a[i] != a[i - 1]) {					\
+			if ((_locks) & INODE_LOCK)			\
+				up_write(&a[i]->v.i_rwsem);		\
+			if ((_locks) & INODE_PAGECACHE_BLOCK)		\
+				bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
+			if ((_locks) & INODE_UPDATE_LOCK)			\
+				mutex_unlock(&a[i]->ei_update_lock);	\
+		}							\
+} while (0)
+
+static inline struct bch_inode_info *file_bch_inode(struct file *file)
+{	/* the bch_inode_info behind an open file */
+	return to_bch_ei(file_inode(file));
+}
+
+static inline bool inode_attr_changing(struct bch_inode_info *dir,
+				struct bch_inode_info *inode,
+				enum inode_opt_id id)
+{	/* true if option @id is not flagged in @inode's bi_fields_set and @dir's value differs */
+	return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
+		bch2_inode_opt_get(&dir->ei_inode, id) !=
+		bch2_inode_opt_get(&inode->ei_inode, id);
+}
+
+static inline bool inode_attrs_changing(struct bch_inode_info *dir,
+				 struct bch_inode_info *inode)
+{	/* true if inode_attr_changing() holds for any option id below Inode_opt_nr */
+	unsigned id;
+
+	for (id = 0; id < Inode_opt_nr; id++)
+		if (inode_attr_changing(dir, inode, id))
+			return true;
+
+	return false;
+}
+
+struct bch_inode_unpacked;
+
+#ifndef NO_BCACHEFS_FS
+
+int bch2_fs_quota_transfer(struct bch_fs *,
+			   struct bch_inode_info *,
+			   struct bch_qid,
+			   unsigned,
+			   enum quota_acct_mode);
+
+static inline int bch2_set_projid(struct bch_fs *c,
+				  struct bch_inode_info *inode,
+				  u32 projid)
+{	/* transfer @inode's project quota to @projid; returns bch2_fs_quota_transfer()'s result */
+	struct bch_qid qid = inode->ei_qid;	/* copy: only the project id changes */
+
+	qid.q[QTYP_PRJ] = projid;
+
+	return bch2_fs_quota_transfer(c, inode, qid,
+				      1 << QTYP_PRJ,
+				      KEY_TYPE_QUOTA_PREALLOC);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
+
+/* returns 0 if we want to do the update, or error is passed up */
+typedef int (*inode_set_fn)(struct bch_inode_info *,
+			    struct bch_inode_unpacked *, void *);
+
+void bch2_inode_update_after_write(struct bch_fs *,
+				   struct bch_inode_info *,
+				   struct bch_inode_unpacked *,
+				   unsigned);
+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
+				  inode_set_fn, void *, unsigned);
+
+void bch2_vfs_exit(void);
+int bch2_vfs_init(void);
+
+#else
+
+static inline void bch2_vfs_exit(void) {}	/* NO_BCACHEFS_FS: no-op stub */
+static inline int bch2_vfs_init(void) { return 0; }	/* NO_BCACHEFS_FS: stub, always succeeds */
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
new file mode 100644
index 000000000000..3cced2b99f3f
--- /dev/null
+++ b/fs/bcachefs/fsck.c
@@ -0,0 +1,1436 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "inode.h"
+#include "keylist.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/dcache.h> /* struct qstr */
+#include <linux/generic-radix-tree.h>
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }	/* struct qstr initializer; n must be NUL-terminated */
+
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
+{	/* sum k->size over @inum's extents for which bkey_extent_is_allocation() is true */
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 sectors = 0;
+	int ret;
+
+	for_each_btree_key(trans, iter, BTREE_ID_EXTENTS,
+			   POS(inum, 0), 0, k, ret) {
+		if (k.k->p.inode != inum)
+			break;	/* walked past this inode's keys */
+
+		if (bkey_extent_is_allocation(k.k))
+			sectors += k.k->size;
+	}
+
+	bch2_trans_iter_free(trans, iter);
+
+	return ret ?: sectors;	/* nonzero ret from the iteration wins over the total */
+}
+
+static int remove_dirent(struct btree_trans *trans,
+			 struct bkey_s_c_dirent dirent)
+{	/* delete @dirent by name from its directory; note that this unlocks @trans */
+	struct bch_fs *c = trans->c;
+	struct qstr name;
+	struct bch_inode_unpacked dir_inode;
+	struct bch_hash_info dir_hash_info;
+	u64 dir_inum = dirent.k->p.inode;
+	int ret;
+	char *buf;
+
+	name.len = bch2_dirent_name_bytes(dirent);
+	buf = kmalloc(name.len + 1, GFP_KERNEL);	/* +1 for the NUL added below */
+	if (!buf)
+		return -ENOMEM;
+
+	memcpy(buf, dirent.v->d_name, name.len);
+	buf[name.len] = '\0';
+	name.name = buf;
+
+	/* Unlock so we don't deadlock, after copying name: */
+	bch2_trans_unlock(trans);
+
+	ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
+	if (ret) {
+		bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
+		goto err;
+	}
+	/* the directory inode is only needed for its hash info: */
+	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+	ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
+	if (ret)
+		bch_err(c, "remove_dirent: err %i deleting dirent", ret);
+err:
+	kfree(buf);
+	return ret;
+}
+
+static int reattach_inode(struct bch_fs *c,
+			  struct bch_inode_unpacked *lostfound_inode,
+			  u64 inum)
+{	/* link inode @inum into lost+found under its decimal inode number; 0 or an error */
+	struct bch_inode_unpacked inode_u;
+	char name_buf[21];	/* a u64 prints as up to 20 digits, plus the NUL */
+	struct qstr name;
+	int ret;
+
+	snprintf(name_buf, sizeof(name_buf), "%llu", inum);
+	name = (struct qstr) QSTR(name_buf);
+
+	ret = bch2_trans_do(c, NULL,
+			    BTREE_INSERT_ATOMIC|
+			    BTREE_INSERT_LAZY_RW,
+		bch2_link_trans(&trans, lostfound_inode->bi_inum,
+				inum, &inode_u, &name));
+	if (ret)
+		bch_err(c, "error %i reattaching inode %llu", ret, inum);
+
+	return ret;
+}
+
+struct inode_walker {	/* one-entry inode cache for the btree walks; updated by walk_inode() */
+	bool			first_this_inode;	/* last walk_inode() call switched to a new inum */
+	bool			have_inode;	/* lookup of cur_inum succeeded */
+	u64			cur_inum;
+	struct bch_inode_unpacked inode;	/* only meaningful when have_inode is set */
+};
+
+static struct inode_walker inode_walker_init(void)
+{	/* fresh walker: no inode cached yet */
+	return (struct inode_walker) {
+		.cur_inum	= -1,	/* i.e. U64_MAX once converted to u64 */
+		.have_inode	= false,
+	};
+}
+
+static int walk_inode(struct btree_trans *trans,
+		      struct inode_walker *w, u64 inum)
+{	/* make @w describe @inum, looking the inode up only when @inum changes */
+	if (inum != w->cur_inum) {
+		int ret = bch2_inode_find_by_inum_trans(trans, inum,
+							&w->inode);
+
+		if (ret && ret != -ENOENT)
+			return ret;	/* real error; @w is left unchanged */
+
+		w->have_inode	= !ret;	/* -ENOENT is recorded, not returned */
+		w->cur_inum	= inum;
+		w->first_this_inode = true;
+	} else {
+		w->first_this_inode = false;
+	}
+
+	return 0;
+}
+
+struct hash_check {	/* state carried across keys while checking one inode's hash table */
+	struct bch_hash_info	info;	/* set per inode by hash_check_set_inode() */
+
+	/* start of current chain of hash collisions: */
+	struct btree_iter	*chain;
+
+	/* next offset in current chain of hash collisions: */
+	u64			chain_end;
+};
+
+static void hash_check_init(struct hash_check *h)
+{	/* start with no chain; h->info is left untouched */
+	h->chain = NULL;
+	h->chain_end = 0;
+}
+
+static void hash_stop_chain(struct btree_trans *trans,
+			    struct hash_check *h)
+{	/* end the current collision chain, freeing its iterator if there is one */
+	if (h->chain)
+		bch2_trans_iter_free(trans, h->chain);
+	h->chain = NULL;
+}
+
+static void hash_check_set_inode(struct btree_trans *trans,
+				 struct hash_check *h,
+				 const struct bch_inode_unpacked *bi)
+{	/* switch @h to inode @bi: new hash info, and any open chain is ended */
+	h->info = bch2_hash_info_init(trans->c, bi);
+	hash_stop_chain(trans, h);
+}
+
+static int hash_redo_key(const struct bch_hash_desc desc,
+			 struct btree_trans *trans, struct hash_check *h,
+			 struct btree_iter *k_iter, struct bkey_s_c k,
+			 u64 hashed)
+{	/* delete @k at @k_iter and re-insert a copy via bch2_hash_set(); @hashed is unused */
+	struct bkey_i *tmp;
+	int ret = 0;
+
+	tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL);	/* copy of @k, taken before the delete */
+	if (!tmp)
+		return -ENOMEM;
+
+	bkey_reassemble(tmp, k);
+
+	ret = bch2_btree_delete_at(trans, k_iter, 0);
+	if (ret)
+		goto err;
+	/* only commit if the re-insert itself succeeded: */
+	ret = bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
+			    tmp, BCH_HASH_SET_MUST_CREATE) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BTREE_INSERT_NOFAIL|
+				  BTREE_INSERT_LAZY_RW);
+err:
+	kfree(tmp);
+	return ret;	/* 0 or a negative error */
+}
+
+static int fsck_hash_delete_at(struct btree_trans *trans,
+			       const struct bch_hash_desc desc,
+			       struct bch_hash_info *info,
+			       struct btree_iter *iter)
+{	/* delete the hash table entry at @iter and commit, retrying on -EINTR */
+	int ret;
+retry:
+	ret   = bch2_hash_delete_at(trans, desc, info, iter) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BTREE_INSERT_ATOMIC|
+				  BTREE_INSERT_NOFAIL|
+				  BTREE_INSERT_LAZY_RW);
+	if (ret == -EINTR) {
+		ret = bch2_btree_iter_traverse(iter);	/* re-traverse before trying again */
+		if (!ret)
+			goto retry;
+	}
+
+	return ret;
+}
+
+static int hash_check_duplicates(struct btree_trans *trans,
+			const struct bch_hash_desc desc, struct hash_check *h,
+			struct btree_iter *k_iter, struct bkey_s_c k)
+{	/* look for an earlier key in the chain equal to @k; 1 if @k was deleted, 0 if not, <0 on error */
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter;
+	struct bkey_s_c k2;
+	char buf[200];
+	int ret = 0;
+
+	if (!bkey_cmp(h->chain->pos, k_iter->pos))
+		return 0;	/* @k starts the chain: nothing before it to compare with */
+
+	iter = bch2_trans_copy_iter(trans, h->chain);
+	BUG_ON(IS_ERR(iter));
+	/* scan from the chain start up to, but not including, @k: */
+	for_each_btree_key_continue(iter, 0, k2, ret) {
+		if (bkey_cmp(k2.k->p, k.k->p) >= 0)
+			break;
+
+		if (fsck_err_on(k2.k->type == desc.key_type &&
+				!desc.cmp_bkey(k, k2), c,
+				"duplicate hash table keys:\n%s",
+				(bch2_bkey_val_to_text(&PBUF(buf), c,
+						       k), buf))) {
+			ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter);
+			if (ret)
+				break;	/* fall through so that @iter is still freed */
+			ret = 1;
+			break;
+		}
+	}
+fsck_err:
+	bch2_trans_iter_free(trans, iter);
+	return ret;
+}
+
+static void hash_set_chain_start(struct btree_trans *trans,
+			const struct bch_hash_desc desc,
+			struct hash_check *h,
+			struct btree_iter *k_iter, struct bkey_s_c k)
+{	/* track the current collision chain: restart it at holes and at gaps in the offsets */
+	bool hole = (k.k->type != KEY_TYPE_whiteout &&
+		     k.k->type != desc.key_type);	/* neither a whiteout nor a key of this table */
+
+	if (hole || k.k->p.offset > h->chain_end + 1)
+		hash_stop_chain(trans, h);
+
+	if (!hole) {
+		if (!h->chain) {	/* @k starts a new chain */
+			h->chain = bch2_trans_copy_iter(trans, k_iter);
+			BUG_ON(IS_ERR(h->chain));
+		}
+
+		h->chain_end = k.k->p.offset;
+	}
+}
+
+static bool key_has_correct_hash(struct btree_trans *trans,
+			const struct bch_hash_desc desc,
+			struct hash_check *h,
+			struct btree_iter *k_iter, struct bkey_s_c k)
+{	/* true if @k's hash lies between the chain start and @k's own offset (updates the chain) */
+	u64 hash;
+
+	hash_set_chain_start(trans, desc, h, k_iter, k);
+
+	if (k.k->type != desc.key_type)
+		return true;	/* not a key of this hash table: nothing to check */
+
+	hash = desc.hash_bkey(&h->info, k);
+
+	return hash >= h->chain->pos.offset &&
+		hash <= k.k->p.offset;
+}
+
+static int hash_check_key(struct btree_trans *trans,
+			const struct bch_hash_desc desc, struct hash_check *h,
+			struct btree_iter *k_iter, struct bkey_s_c k)
+{	/* check @k's offset against its hash, then for duplicates; 1 if @k was moved or deleted */
+	struct bch_fs *c = trans->c;
+	char buf[200];
+	u64 hashed;
+	int ret = 0;
+
+	hash_set_chain_start(trans, desc, h, k_iter, k);
+
+	if (k.k->type != desc.key_type)
+		return 0;	/* not a key of this hash table */
+
+	hashed = desc.hash_bkey(&h->info, k);
+	/* same range test as key_has_correct_hash(): */
+	if (fsck_err_on(hashed < h->chain->pos.offset ||
+			hashed > k.k->p.offset, c,
+			"hash table key at wrong offset: btree %u, %llu, "
+			"hashed to %llu chain starts at %llu\n%s",
+			desc.btree_id, k.k->p.offset,
+			hashed, h->chain->pos.offset,
+			(bch2_bkey_val_to_text(&PBUF(buf), c,
+					       k), buf))) {
+		ret = hash_redo_key(desc, trans, h, k_iter, k, hashed);
+		if (ret) {
+			bch_err(c, "hash_redo_key err %i", ret);
+			return ret;
+		}
+		return 1;
+	}
+
+	ret = hash_check_duplicates(trans, desc, h, k_iter, k);
+fsck_err:
+	return ret;
+}
+
+static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h,
+			     struct btree_iter *iter, struct bkey_s_c *k)
+{	/* repair a mis-hashed dirent: trim junk off the name, else re-insert; 1 if re-inserted */
+	struct bch_fs *c = trans->c;
+	struct bkey_i_dirent *d = NULL;
+	int ret = -EINVAL;	/* returned when a reported error is not fixed */
+	char buf[200], key_buf[200];	/* name copy for messages / text form of the key */
+	unsigned len;
+	u64 hash;
+
+	if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k))
+		return 0;
+
+	len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k));
+	BUG_ON(!len);
+	/* bounded copy: the name may be longer than buf, which is only used in messages */
+	snprintf(buf, sizeof(buf), "%.*s", (int) len,
+		 bkey_s_c_to_dirent(*k).v->d_name);
+
+	d = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+	if (!d) {
+		bch_err(c, "memory allocation failure");
+		return -ENOMEM;
+	}
+
+	bkey_reassemble(&d->k_i, *k);
+	/* shorten the name one byte at a time until the key hashes into the chain: */
+	do {
+		--len;
+		if (!len)
+			goto err_redo;
+
+		d->k.u64s = BKEY_U64s + dirent_val_u64s(len);
+
+		BUG_ON(bkey_val_bytes(&d->k) <
+		       offsetof(struct bch_dirent, d_name) + len);
+
+		memset(d->v.d_name + len, 0,
+		       bkey_val_bytes(&d->k) -
+		       offsetof(struct bch_dirent, d_name) - len);
+
+		hash = bch2_dirent_hash_desc.hash_bkey(&h->info,
+						bkey_i_to_s_c(&d->k_i));
+	} while (hash < h->chain->pos.offset ||
+		 hash > k->k->p.offset);
+
+	if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)",
+		     buf, strlen(buf), d->v.d_name, len)) {
+		bch2_trans_update(trans, iter, &d->k_i);
+
+		ret = bch2_trans_commit(trans, NULL, NULL,
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_LAZY_RW);
+		if (ret)
+			goto err;
+
+		*k = bch2_btree_iter_peek(iter);	/* hand the rewritten key back to the caller */
+
+		BUG_ON(k->k->type != KEY_TYPE_dirent);
+	}
+err:
+fsck_err:
+	kfree(d);
+	return ret;
+err_redo:
+	hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k);
+	/* key text goes to key_buf: buf still holds the name printed by the first %s */
+	if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n"
+		     "hash table key at wrong offset: btree %u, offset %llu, "
+		     "hashed to %llu chain starts at %llu\n%s",
+		     buf, strlen(buf), BTREE_ID_DIRENTS,
+		     k->k->p.offset, hash, h->chain->pos.offset,
+		     (bch2_bkey_val_to_text(&PBUF(key_buf), c,
+					    *k), key_buf))) {
+		ret = hash_redo_key(bch2_dirent_hash_desc, trans,
+				    h, iter, *k, hash);
+		if (ret)
+			bch_err(c, "hash_redo_key err %i", ret);
+		else
+			ret = 1;
+	}
+
+	goto err;
+}
+
+static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
+{	/* delete @inode_nr's extents from @new_size (rounded up to a block) to the end of the inode */
+	return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+			POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9),	/* bytes -> 512-byte units */
+			POS(inode_nr + 1, 0), NULL);
+}
+
+/*
+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and
+ * that i_size and i_sectors are consistent
+ */
+noinline_for_stack
+static int check_extents(struct bch_fs *c)
+{	/* returns 0 or a negative error; restarts the walk on -EINTR */
+	struct inode_walker w = inode_walker_init();
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 i_sectors;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	bch_verbose(c, "checking extents");
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   POS(BCACHEFS_ROOT_INO, 0), 0);
+retry:
+	for_each_btree_key_continue(iter, 0, k, ret) {
+		ret = walk_inode(&trans, &w, k.k->p.inode);
+		if (ret)
+			break;
+		/* extent without an inode, or in an inode that is neither a regular file nor a symlink: */
+		if (fsck_err_on(!w.have_inode, c,
+			"extent type %u for missing inode %llu",
+			k.k->type, k.k->p.inode) ||
+		    fsck_err_on(w.have_inode &&
+			!S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
+			"extent type %u for non regular file, inode %llu mode %o",
+			k.k->type, k.k->p.inode, w.inode.bi_mode)) {
+			bch2_trans_unlock(&trans);
+
+			ret = bch2_inode_truncate(c, k.k->p.inode, 0);	/* drop all of its extents */
+			if (ret)
+				goto err;
+			continue;
+		}
+		/* once per inode: compare bi_sectors with a fresh count (which also sets i_sectors): */
+		if (fsck_err_on(w.first_this_inode &&
+			w.have_inode &&
+			!(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
+			w.inode.bi_sectors !=
+			(i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)),
+			c, "i_sectors wrong: got %llu, should be %llu",
+			w.inode.bi_sectors, i_sectors)) {
+			struct bkey_inode_buf p;
+
+			w.inode.bi_sectors = i_sectors;
+
+			bch2_trans_unlock(&trans);
+
+			bch2_inode_pack(&p, &w.inode);
+
+			ret = bch2_btree_insert(c, BTREE_ID_INODES,
+						&p.inode.k_i, NULL, NULL,
+						BTREE_INSERT_NOFAIL|
+						BTREE_INSERT_LAZY_RW);
+			if (ret) {
+				bch_err(c, "error in fsck: error %i updating inode", ret);
+				goto err;
+			}
+
+			/* revalidate iterator: */
+			k = bch2_btree_iter_peek(iter);
+		}
+		/* extent ending past i_size (rounded up to a block); reservations are exempt: */
+		if (fsck_err_on(w.have_inode &&
+			!(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+			k.k->type != KEY_TYPE_reservation &&
+			k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
+			"extent type %u offset %llu past end of inode %llu, i_size %llu",
+			k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
+			bch2_trans_unlock(&trans);
+
+			ret = bch2_inode_truncate(c, k.k->p.inode,
+						  w.inode.bi_size);
+			if (ret)
+				goto err;
+			continue;
+		}
+	}
+err:
+fsck_err:
+	if (ret == -EINTR)
+		goto retry;
+	return bch2_trans_exit(&trans) ?: ret;	/* an error from bch2_trans_exit() takes precedence */
+}
+
+/*
+ * Walk dirents: verify that each lives in an S_ISDIR inode, has a sane name
+ * and hash, and points at an existing inode with a matching d_type
+ */
+noinline_for_stack
+static int check_dirents(struct bch_fs *c)
+{	/* returns 0 or a negative error; restarts the walk on -EINTR */
+	struct inode_walker w = inode_walker_init();
+	struct hash_check h;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	unsigned name_len;
+	char buf[200];
+	int ret = 0;
+
+	bch_verbose(c, "checking dirents");
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	hash_check_init(&h);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+				   POS(BCACHEFS_ROOT_INO, 0), 0);
+retry:
+	for_each_btree_key_continue(iter, 0, k, ret) {
+		struct bkey_s_c_dirent d;
+		struct bch_inode_unpacked target;
+		bool have_target;
+		u64 d_inum;
+
+		ret = walk_inode(&trans, &w, k.k->p.inode);
+		if (ret)
+			break;
+
+		if (fsck_err_on(!w.have_inode, c,
+				"dirent in nonexisting directory:\n%s",
+				(bch2_bkey_val_to_text(&PBUF(buf), c,
+						       k), buf)) ||
+		    fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
+				"dirent in non directory inode type %u:\n%s",
+				mode_to_type(w.inode.bi_mode),
+				(bch2_bkey_val_to_text(&PBUF(buf), c,
+						       k), buf))) {
+			ret = bch2_btree_delete_at(&trans, iter, 0);
+			if (ret)
+				goto err;
+			continue;
+		}
+
+		if (w.first_this_inode && w.have_inode)
+			hash_check_set_inode(&trans, &h, &w.inode);
+
+		ret = check_dirent_hash(&trans, &h, iter, &k);
+		if (ret > 0) {
+			ret = 0;
+			continue;
+		}
+		/*
+		 * Anything else that is nonzero is an error from check_dirent_hash():
+		 */
+		if (ret)
+			goto fsck_err;
+
+		if (k.k->type != KEY_TYPE_dirent)
+			continue;
+
+		d = bkey_s_c_to_dirent(k);
+		d_inum = le64_to_cpu(d.v->d_inum);
+
+		name_len = bch2_dirent_name_bytes(d);
+
+		/*
+		 * Empty names, "." and "..", and names containing '/' are removed:
+		 */
+		if (fsck_err_on(!name_len, c, "empty dirent") ||
+		    fsck_err_on(name_len == 1 &&
+				!memcmp(d.v->d_name, ".", 1), c,
+				". dirent") ||
+		    fsck_err_on(name_len == 2 &&
+				!memcmp(d.v->d_name, "..", 2), c,
+				".. dirent") ||
+		    fsck_err_on(memchr(d.v->d_name, '/', name_len), c,
+				"dirent name has invalid chars")) {
+			ret = remove_dirent(&trans, d);
+			if (ret)
+				goto err;
+			continue;
+		}
+
+		if (fsck_err_on(d_inum == d.k->p.inode, c,
+				"dirent points to own directory:\n%s",
+				(bch2_bkey_val_to_text(&PBUF(buf), c,
+						       k), buf))) {
+			ret = remove_dirent(&trans, d);
+			if (ret)
+				goto err;
+			continue;
+		}
+
+		ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target);
+		if (ret && ret != -ENOENT)
+			break;
+
+		have_target = !ret;
+		ret = 0;
+
+		if (fsck_err_on(!have_target, c,
+				"dirent points to missing inode:\n%s",
+				(bch2_bkey_val_to_text(&PBUF(buf), c,
+						       k), buf))) {
+			ret = remove_dirent(&trans, d);
+			if (ret)
+				goto err;
+			continue;
+		}
+
+		if (fsck_err_on(have_target &&
+				d.v->d_type !=
+				mode_to_type(target.bi_mode), c,
+				"incorrect d_type: should be %u:\n%s",
+				mode_to_type(target.bi_mode),
+				(bch2_bkey_val_to_text(&PBUF(buf), c,
+						       k), buf))) {
+			struct bkey_i_dirent *n;
+
+			n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
+			if (!n) {
+				ret = -ENOMEM;
+				goto err;
+			}
+
+			bkey_reassemble(&n->k_i, d.s_c);
+			n->v.d_type = mode_to_type(target.bi_mode);
+
+			bch2_trans_update(&trans, iter, &n->k_i);
+
+			ret = bch2_trans_commit(&trans, NULL, NULL,
+						BTREE_INSERT_NOFAIL|
+						BTREE_INSERT_LAZY_RW);
+			kfree(n);
+			if (ret)
+				goto err;
+
+		}
+	}
+
+	hash_stop_chain(&trans, &h);
+err:
+fsck_err:
+	if (ret == -EINTR)
+		goto retry;
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
+
+/*
+ * Walk xattrs: verify that they all have a corresponding inode, and that
+ * each key passes hash_check_key()
+ */
+noinline_for_stack
+static int check_xattrs(struct bch_fs *c)
+{	/* restarts the walk on -EINTR */
+	struct inode_walker w = inode_walker_init();
+	struct hash_check h;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	bch_verbose(c, "checking xattrs");
+
+	hash_check_init(&h);
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
+				   POS(BCACHEFS_ROOT_INO, 0), 0);
+retry:
+	for_each_btree_key_continue(iter, 0, k, ret) {
+		ret = walk_inode(&trans, &w, k.k->p.inode);
+		if (ret)
+			break;
+
+		if (fsck_err_on(!w.have_inode, c,
+				"xattr for missing inode %llu",
+				k.k->p.inode)) {
+			ret = bch2_btree_delete_at(&trans, iter, 0);
+			if (ret)
+				goto err;
+			continue;
+		}
+
+		if (w.first_this_inode && w.have_inode)
+			hash_check_set_inode(&trans, &h, &w.inode);
+
+		ret = hash_check_key(&trans, bch2_xattr_hash_desc,
+				     &h, iter, k);
+		if (ret)	/* NOTE(review): also taken for the positive "key was fixed" return - confirm intended */
+			goto fsck_err;
+	}
+err:
+fsck_err:
+	if (ret == -EINTR)
+		goto retry;
+	return bch2_trans_exit(&trans) ?: ret;
+}
+
+/*
+ * Get root directory, create if it doesn't exist:
+ *
+ * Looks up inode BCACHEFS_ROOT_INO into @root_inode. Lookup errors other than
+ * -ENOENT are returned as is. If the inode is missing or is not a directory,
+ * and the corresponding fsck_err_on() evaluates true, @root_inode is
+ * reinitialised as a 0755 directory and inserted into the inodes btree.
+ *
+ * Returns 0 when an existing root directory is accepted, otherwise the result
+ * of the insert or the value of ret at the fsck_err label.
+ */
+static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+{
+	struct bkey_inode_buf packed;
+	int ret;
+
+	bch_verbose(c, "checking root directory");
+
+	ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
+	if (ret && ret != -ENOENT)
+		return ret;
+
+	/* Only -ENOENT can still be in ret here */
+	if (fsck_err_on(ret, c, "root directory missing"))
+		goto create_root;
+
+	if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
+			"root inode not a directory"))
+		goto create_root;
+
+	return 0;
+fsck_err:
+	/* Not referenced directly in this function; presumably a jump target of the fsck_err_on() macro */
+	return ret;
+create_root:
+	/* uid 0, gid 0, no rdev, no parent */
+	bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
+			0, NULL);
+	root_inode->bi_inum = BCACHEFS_ROOT_INO;
+
+	bch2_inode_pack(&packed, root_inode);
+
+	return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+				 NULL, NULL,
+				 BTREE_INSERT_NOFAIL|
+				 BTREE_INSERT_LAZY_RW);
+}
+
+/*
+ * Get lost+found, create if it doesn't exist:
+ *
+ * Looks up the "lost+found" dirent in the root directory and loads the inode
+ * it names into @lostfound_inode. If the dirent lookup yields 0, or the inode
+ * is missing or not a directory (and the corresponding fsck_err_on()
+ * evaluates true), a new 0755 directory named "lost+found" is created under
+ * the root via bch2_create_trans().
+ *
+ * Returns 0 when an existing lost+found directory is accepted, otherwise the
+ * lookup error or the result of the create.
+ */
+static int check_lostfound(struct bch_fs *c,
+			   struct bch_inode_unpacked *root_inode,
+			   struct bch_inode_unpacked *lostfound_inode)
+{
+	struct qstr lostfound = QSTR("lost+found");
+	struct bch_hash_info root_hash_info =
+		bch2_hash_info_init(c, root_inode);
+	u64 inum;
+	int ret;
+
+	bch_verbose(c, "checking lost+found");
+
+	inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
+				 &lostfound);
+	/* A zero inum is treated as "no such dirent": create without raising an fsck error */
+	if (!inum) {
+		bch_notice(c, "creating lost+found");
+		goto create_lostfound;
+	}
+
+	ret = bch2_inode_find_by_inum(c, inum, lostfound_inode);
+	if (ret && ret != -ENOENT)
+		return ret;
+
+	/* Only -ENOENT can still be in ret here */
+	if (fsck_err_on(ret, c, "lost+found missing"))
+		goto create_lostfound;
+
+	if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c,
+			"lost+found inode not a directory"))
+		goto create_lostfound;
+
+	return 0;
+fsck_err:
+	/* Not referenced directly in this function; presumably a jump target of the fsck_err_on() macro */
+	return ret;
+create_lostfound:
+	bch2_inode_init_early(c, lostfound_inode);
+
+	/* The expression below names "trans", which is not declared here - presumably supplied by bch2_trans_do() */
+	ret = bch2_trans_do(c, NULL,
+			    BTREE_INSERT_ATOMIC|
+			    BTREE_INSERT_NOFAIL|
+			    BTREE_INSERT_LAZY_RW,
+		bch2_create_trans(&trans,
+				  BCACHEFS_ROOT_INO, root_inode,
+				  lostfound_inode, &lostfound,
+				  0, 0, S_IFDIR|0755, 0, NULL, NULL));
+	if (ret)
+		bch_err(c, "error creating lost+found: %i", ret);
+
+	return ret;
+}
+
+/* Growable bitmap indexed by inode number */
+struct inode_bitmap {
+	/* Bit storage; NULL until the first inode_bitmap_set() */
+	unsigned long	*bits;
+	/* Capacity in bits (the allocation is size / 8 bytes) */
+	size_t		size;
+};
+
+/* Report whether bit @nr is set; anything beyond the current capacity reads as clear */
+static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
+{
+	if (nr >= b->size)
+		return false;
+
+	return test_bit(nr, b->bits);
+}
+
+/*
+ * Set bit @nr, growing the bitmap first if @nr is beyond its capacity.
+ *
+ * The new capacity is the largest of PAGE_SIZE * 8 bits, twice the old
+ * capacity and @nr + 1, rounded up to a power of two. The storage is
+ * reallocated with GFP_KERNEL|__GFP_ZERO.
+ *
+ * Returns 0 on success or -ENOMEM, in which case the bitmap is unchanged.
+ */
+static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+{
+	size_t grown;
+	void *bits;
+
+	if (nr < b->size)
+		goto set;
+
+	grown = max_t(size_t, PAGE_SIZE * 8, b->size * 2);
+	grown = max_t(size_t, grown, nr + 1);
+	grown = roundup_pow_of_two(grown);
+
+	/* Capacity is counted in bits, the allocation in bytes */
+	bits = krealloc(b->bits, grown / 8, GFP_KERNEL|__GFP_ZERO);
+	if (!bits)
+		return -ENOMEM;
+
+	b->bits = bits;
+	b->size = grown;
+set:
+	__set_bit(nr, b->bits);
+	return 0;
+}
+
+/* Growable stack of directories used by the depth-first walk in check_directory_structure() */
+struct pathbuf {
+	/* Number of entries currently on the stack */
+	size_t		nr;
+	/* Number of entries allocated */
+	size_t		size;
+
+	struct pathbuf_entry {
+		/* Inode number of the directory at this depth */
+		u64	inum;
+		/* Offset of the last dirent visited in this directory (0 when pushed) */
+		u64	offset;
+	}		*entries;
+};
+
+/*
+ * Push directory @inum onto the path stack with its scan offset at 0.
+ *
+ * A full stack is grown to the larger of 256 entries and twice its current
+ * size. Returns 0 on success, -ENOMEM if the stack could not be grown.
+ */
+static int path_down(struct pathbuf *p, u64 inum)
+{
+	struct pathbuf_entry *slot;
+
+	if (p->nr == p->size) {
+		size_t grown = max_t(size_t, 256UL, p->size * 2);
+		void *entries = krealloc(p->entries,
+					 grown * sizeof(p->entries[0]),
+					 GFP_KERNEL);
+
+		if (!entries)
+			return -ENOMEM;
+
+		p->entries = entries;
+		p->size = grown;
+	}
+
+	slot = &p->entries[p->nr++];
+	slot->inum = inum;
+	slot->offset = 0;
+	return 0;
+}
+
+/*
+ * Check that the directory tree is a tree and that every non-empty directory
+ * is reachable from the root.
+ *
+ * Phase 1 is a depth-first walk of the dirents btree starting at
+ * BCACHEFS_ROOT_INO. Each directory reached is recorded in the dirs_done
+ * bitmap; a DT_DIR dirent pointing at a directory that is already recorded is
+ * reported as a multiple hardlink and, if fsck_err_on() evaluates true,
+ * removed.
+ *
+ * Phase 2 walks the inodes btree. A directory inode for which
+ * bch2_empty_dir_trans() returns nonzero and which is not in dirs_done is
+ * reported as unreachable and reattached via reattach_inode() using
+ * @lostfound_inode. If anything was reattached, all state is reset and both
+ * phases run again.
+ *
+ * Returns bch2_trans_exit()'s result if nonzero, otherwise ret.
+ */
+noinline_for_stack
+static int check_directory_structure(struct bch_fs *c,
+				     struct bch_inode_unpacked *lostfound_inode)
+{
+	struct inode_bitmap dirs_done = { NULL, 0 };
+	struct pathbuf path = { 0, 0, NULL };
+	struct pathbuf_entry *e;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_dirent dirent;
+	bool had_unreachable;
+	u64 d_inum;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	bch_verbose(c, "checking directory structure");
+
+	/* DFS: */
+restart_dfs:
+	had_unreachable = false;
+
+	ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
+	if (ret) {
+		bch_err(c, "memory allocation failure in inode_bitmap_set()");
+		goto err;
+	}
+
+	ret = path_down(&path, BCACHEFS_ROOT_INO);
+	if (ret)
+		goto err;
+
+	while (path.nr) {
+next:
+		/* Always work on the deepest directory on the stack */
+		e = &path.entries[path.nr - 1];
+
+		/* e->offset + 1 below would wrap; nothing can follow this offset */
+		if (e->offset == U64_MAX)
+			goto up;
+
+		/* Resume scanning this directory just past the last dirent visited */
+		for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+				   POS(e->inum, e->offset + 1), 0, k, ret) {
+			/* Ran off the end of this directory's dirents */
+			if (k.k->p.inode != e->inum)
+				break;
+
+			e->offset = k.k->p.offset;
+
+			if (k.k->type != KEY_TYPE_dirent)
+				continue;
+
+			dirent = bkey_s_c_to_dirent(k);
+
+			if (dirent.v->d_type != DT_DIR)
+				continue;
+
+			d_inum = le64_to_cpu(dirent.v->d_inum);
+
+			if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
+					"directory %llu has multiple hardlinks",
+					d_inum)) {
+				ret = remove_dirent(&trans, dirent);
+				if (ret)
+					goto err;
+				continue;
+			}
+
+			ret = inode_bitmap_set(&dirs_done, d_inum);
+			if (ret) {
+				bch_err(c, "memory allocation failure in inode_bitmap_set()");
+				goto err;
+			}
+
+			/* Descend: push the subdirectory and restart from the new top of stack */
+			ret = path_down(&path, d_inum);
+			if (ret) {
+				goto err;
+			}
+
+			ret = bch2_trans_iter_free(&trans, iter);
+			if (ret) {
+				bch_err(c, "btree error %i in fsck", ret);
+				goto err;
+			}
+			goto next;
+		}
+		ret = bch2_trans_iter_free(&trans, iter) ?: ret;
+		if (ret) {
+			bch_err(c, "btree error %i in fsck", ret);
+			goto err;
+		}
+up:
+		/* Finished with this directory: pop it */
+		path.nr--;
+	}
+
+	/* Second phase: look for directories the DFS never reached */
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0);
+retry:
+	for_each_btree_key_continue(iter, 0, k, ret) {
+		if (k.k->type != KEY_TYPE_inode)
+			continue;
+
+		if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
+			continue;
+
+		/*
+		 * NOTE(review): a zero return presumably means "directory is
+		 * empty" - confirm against bch2_empty_dir_trans()
+		 */
+		ret = bch2_empty_dir_trans(&trans, k.k->p.inode);
+		if (ret == -EINTR)
+			goto retry;
+		if (!ret)
+			continue;
+
+		if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
+				"unreachable directory found (inum %llu)",
+				k.k->p.inode)) {
+			/* reattach_inode() is passed c, not this transaction, so unlock first */
+			bch2_trans_unlock(&trans);
+
+			ret = reattach_inode(c, lostfound_inode, k.k->p.inode);
+			if (ret) {
+				goto err;
+			}
+
+			had_unreachable = true;
+		}
+	}
+	bch2_trans_iter_free(&trans, iter);
+	if (ret)
+		goto err;
+
+	if (had_unreachable) {
+		bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
+		/* Drop all DFS state; restart_dfs rebuilds it from scratch */
+		kfree(dirs_done.bits);
+		kfree(path.entries);
+		memset(&dirs_done, 0, sizeof(dirs_done));
+		memset(&path, 0, sizeof(path));
+		goto restart_dfs;
+	}
+err:
+fsck_err:
+	ret = bch2_trans_exit(&trans) ?: ret;
+	kfree(dirs_done.bits);
+	kfree(path.entries);
+	return ret;
+}
+
+/* Per-inode link counts gathered from the dirents btree by inc_link() */
+struct nlink {
+	/* Number of dirents whose d_inum names this inode (plus one for the root) */
+	u32	count;
+	/* Number of DT_DIR dirents contained in this inode's directory */
+	u32	dir_count;
+};
+
+/* Table of struct nlink; callers index it by (inode number - range_start) */
+typedef GENRADIX(struct nlink) nlink_table;
+
+/*
+ * Count one link for @inum: dir_count if @dir is set, count otherwise.
+ *
+ * Inode numbers outside [range_start, *range_end) are ignored. If the table
+ * slot cannot be allocated, *range_end is lowered to @inum so that this inode
+ * and everything above it are left for a later pass.
+ */
+static void inc_link(struct bch_fs *c, nlink_table *links,
+		     u64 range_start, u64 *range_end,
+		     u64 inum, bool dir)
+{
+	struct nlink *entry;
+
+	if (inum < range_start)
+		return;
+	if (inum >= *range_end)
+		return;
+
+	entry = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
+	if (entry) {
+		if (dir)
+			entry->dir_count++;
+		else
+			entry->count++;
+		return;
+	}
+
+	bch_verbose(c, "allocation failed during fsck - will need another pass");
+	*range_end = inum;
+}
+
+/*
+ * Walk every key in the dirents btree and accumulate link counts into @links
+ * for inodes in [range_start, *range_end).
+ *
+ * The root inode is given one link up front. Each dirent adds a link to the
+ * inode it names; a DT_DIR dirent additionally adds a dir_count to the
+ * directory containing it. inc_link() may lower *range_end.
+ *
+ * Returns 0 or a btree error, which is also logged.
+ */
+noinline_for_stack
+static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
+			       u64 range_start, u64 *range_end)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) {
+		if (k.k->type == KEY_TYPE_dirent) {
+			struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
+			u64 target = le64_to_cpu(dirent.v->d_inum);
+
+			/* A subdirectory entry is counted against its parent */
+			if (dirent.v->d_type == DT_DIR)
+				inc_link(c, links, range_start, range_end,
+					 dirent.k->p.inode, true);
+
+			inc_link(c, links, range_start, range_end,
+				 target, false);
+		}
+
+		bch2_trans_cond_resched(&trans);
+	}
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret)
+		bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+
+	return ret;
+}
+
+/*
+ * Compare an inode's stored link count with the counts gathered from the
+ * dirents btree, and fix up @u in memory if they disagree.
+ *
+ * @lostfound_inode: passed to reattach_inode() for unreachable inodes
+ * @u:               unpacked inode being checked; its nlink may be rewritten
+ * @link:            counts gathered for this inode; count is set to 1 after
+ *                   a reattach
+ * @do_update:       set to true when @u was modified and must be written back
+ *
+ * Directories with multiple hardlinks, unreachable directories and
+ * non-directories with subdirectories are only flagged with need_fsck_err()
+ * and left alone. Any fsck_err() that returns FSCK_ERR_IGNORE also leaves
+ * the inode untouched.
+ *
+ * Returns 0, or the error from reattach_inode().
+ */
+static int check_inode_nlink(struct bch_fs *c,
+			     struct bch_inode_unpacked *lostfound_inode,
+			     struct bch_inode_unpacked *u,
+			     struct nlink *link,
+			     bool *do_update)
+{
+	u32 i_nlink = bch2_inode_nlink_get(u);
+	u32 real_i_nlink =
+		link->count * nlink_bias(u->bi_mode) +
+		link->dir_count;
+	int ret = 0;
+
+	/*
+	 * These should have been caught/fixed by earlier passes, we don't
+	 * repair them here:
+	 */
+	if (S_ISDIR(u->bi_mode) && link->count > 1) {
+		need_fsck_err(c, "directory %llu with multiple hardlinks: %u",
+			      u->bi_inum, link->count);
+		return 0;
+	}
+
+	if (S_ISDIR(u->bi_mode) && !link->count) {
+		need_fsck_err(c, "unreachable directory found (inum %llu)",
+			      u->bi_inum);
+		return 0;
+	}
+
+	if (!S_ISDIR(u->bi_mode) && link->dir_count) {
+		/* The inode number was passed but never printed: add the specifier */
+		need_fsck_err(c, "non directory with subdirectories (inum %llu)",
+			      u->bi_inum);
+		return 0;
+	}
+
+	/* No dirent references this inode, yet it isn't flagged as unlinked */
+	if (!link->count &&
+	    !(u->bi_flags & BCH_INODE_UNLINKED) &&
+	    (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+		if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)",
+			     u->bi_inum, mode_to_type(u->bi_mode)) ==
+		    FSCK_ERR_IGNORE)
+			return 0;
+
+		ret = reattach_inode(c, lostfound_inode, u->bi_inum);
+		if (ret)
+			return ret;
+
+		/* The reattach gave it exactly one dirent */
+		link->count = 1;
+		real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count;
+		goto set_i_nlink;
+	}
+
+	if (i_nlink < link->count) {
+		if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)",
+			     u->bi_inum, i_nlink, link->count,
+			     mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE)
+			return 0;
+		goto set_i_nlink;
+	}
+
+	if (i_nlink != real_i_nlink &&
+	    c->sb.clean) {
+		if (fsck_err(c, "filesystem marked clean, "
+			     "but inode %llu has wrong i_nlink "
+			     "(type %u i_nlink %u, should be %u)",
+			     u->bi_inum, mode_to_type(u->bi_mode),
+			     i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
+			return 0;
+		goto set_i_nlink;
+	}
+
+	if (i_nlink != real_i_nlink &&
+	    (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+		if (fsck_err(c, "inode %llu has wrong i_nlink "
+			     "(type %u i_nlink %u, should be %u)",
+			     u->bi_inum, mode_to_type(u->bi_mode),
+			     i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
+			return 0;
+		goto set_i_nlink;
+	}
+
+	/* Remaining mismatches are corrected without raising an fsck error */
+	if (real_i_nlink && i_nlink != real_i_nlink)
+		bch_verbose(c, "setting inode %llu nlink from %u to %u",
+			    u->bi_inum, i_nlink, real_i_nlink);
+set_i_nlink:
+	if (i_nlink != real_i_nlink) {
+		bch2_inode_nlink_set(u, real_i_nlink);
+		*do_update = true;
+	}
+fsck_err:
+	return ret;
+}
+
+/*
+ * Check and repair a single inode key.
+ *
+ * @trans:           transaction owning @iter; unlocked right after unpacking
+ * @lostfound_inode: forwarded to check_inode_nlink(); unused when @link is NULL
+ * @iter:            iterator positioned at the inode, used for the update
+ * @inode:           the inode key being checked
+ * @link:            link counts for this inode, or NULL to skip the nlink check
+ *
+ * In order: verifies link counts (if @link), deletes the inode if it is
+ * flagged BCH_INODE_UNLINKED, truncates to bi_size if BCH_INODE_I_SIZE_DIRTY,
+ * recounts sectors if BCH_INODE_I_SECTORS_DIRTY, and commits the repacked
+ * inode if anything changed. On a filesystem marked clean each of the three
+ * flag-driven repairs only runs if its fsck_err() evaluates true.
+ *
+ * Returns 0 or the first error encountered.
+ */
+static int check_inode(struct btree_trans *trans,
+		       struct bch_inode_unpacked *lostfound_inode,
+		       struct btree_iter *iter,
+		       struct bkey_s_c_inode inode,
+		       struct nlink *link)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked u;
+	bool do_update = false;
+	int ret = 0;
+
+	ret = bch2_inode_unpack(inode, &u);
+
+	/*
+	 * NOTE(review): presumably unlocked because several helpers below
+	 * take c rather than this transaction - confirm
+	 */
+	bch2_trans_unlock(trans);
+
+	if (bch2_fs_inconsistent_on(ret, c,
+			 "error unpacking inode %llu in fsck",
+			 inode.k->p.inode))
+		return ret;
+
+	if (link) {
+		ret = check_inode_nlink(c, lostfound_inode, &u, link,
+					&do_update);
+		if (ret)
+			return ret;
+	}
+
+	if (u.bi_flags & BCH_INODE_UNLINKED &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
+		      u.bi_inum))) {
+		bch_verbose(c, "deleting inode %llu", u.bi_inum);
+
+		/* The inode is gone after this; none of the later checks apply */
+		ret = bch2_inode_rm(c, u.bi_inum);
+		if (ret)
+			bch_err(c, "error in fsck: error %i while deleting inode", ret);
+		return ret;
+	}
+
+	if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
+		      u.bi_inum))) {
+		bch_verbose(c, "truncating inode %llu", u.bi_inum);
+
+		/*
+		 * XXX: need to truncate partial blocks too here - or ideally
+		 * just switch units to bytes and that issue goes away
+		 */
+
+		ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size);
+		if (ret) {
+			bch_err(c, "error in fsck: error %i truncating inode", ret);
+			return ret;
+		}
+
+		/*
+		 * We truncated without our normal sector accounting hook, just
+		 * make sure we recalculate it:
+		 */
+		u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+
+		u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+		do_update = true;
+	}
+
+	if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
+		      u.bi_inum))) {
+		s64 sectors;
+
+		bch_verbose(c, "recounting sectors for inode %llu",
+			    u.bi_inum);
+
+		/* A negative result is an error code */
+		sectors = bch2_count_inode_sectors(trans, u.bi_inum);
+		if (sectors < 0) {
+			bch_err(c, "error in fsck: error %i recounting inode sectors",
+				(int) sectors);
+			return sectors;
+		}
+
+		u.bi_sectors = sectors;
+		u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+		do_update = true;
+	}
+
+	/* Write back every in-memory change made above in one commit */
+	if (do_update) {
+		struct bkey_inode_buf p;
+
+		bch2_inode_pack(&p, &u);
+		bch2_trans_update(trans, iter, &p.inode.k_i);
+
+		ret = bch2_trans_commit(trans, NULL, NULL,
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_LAZY_RW);
+		if (ret && ret != -EINTR)
+			bch_err(c, "error in fsck: error %i "
+				"updating inode", ret);
+	}
+fsck_err:
+	return ret;
+}
+
+/*
+ * Walk the inodes btree from @range_start in lockstep with the @links table
+ * and run check_inode() on every inode key.
+ *
+ * Table slot i corresponds to inode number range_start + i. Table entries
+ * with a nonzero count but no matching inode key are flagged with
+ * need_fsck_err_on(); inodes with no table entry are checked against
+ * zero_links.
+ *
+ * Returns the first check_inode() error, else the btree iteration error
+ * (which is also logged), else 0.
+ */
+noinline_for_stack
+static int bch2_gc_walk_inodes(struct bch_fs *c,
+			       struct bch_inode_unpacked *lostfound_inode,
+			       nlink_table *links,
+			       u64 range_start, u64 range_end)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct nlink *link, zero_links = { 0, 0 };
+	struct genradix_iter nlinks_iter;
+	int ret = 0, ret2 = 0;
+	u64 nlinks_pos;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
+				   POS(range_start, 0), 0);
+	nlinks_iter = genradix_iter_init(links, 0);
+
+	/* ret2 carries btree iteration errors, ret carries check_inode() errors */
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret2 = bkey_err(k))) {
+peek_nlinks:	link = genradix_iter_peek(&nlinks_iter, links);
+
+		/* Table exhausted and the btree is past this pass's range: done */
+		if (!link && (!k.k || iter->pos.inode >= range_end))
+			break;
+
+		nlinks_pos = range_start + nlinks_iter.pos;
+		if (iter->pos.inode > nlinks_pos) {
+			/* Should have been caught by dirents pass: */
+			need_fsck_err_on(link && link->count, c,
+				"missing inode %llu (nlink %u)",
+				nlinks_pos, link->count);
+			/* The table is behind the btree: catch it up */
+			genradix_iter_advance(&nlinks_iter, links);
+			goto peek_nlinks;
+		}
+
+		/* No table entry for this inode: treat it as having no links */
+		if (iter->pos.inode < nlinks_pos || !link)
+			link = &zero_links;
+
+		if (k.k && k.k->type == KEY_TYPE_inode) {
+			ret = check_inode(&trans, lostfound_inode, iter,
+					  bkey_s_c_to_inode(k), link);
+			BUG_ON(ret == -EINTR);
+			if (ret)
+				break;
+		} else {
+			/* Should have been caught by dirents pass: */
+			need_fsck_err_on(link->count, c,
+				"missing inode %llu (nlink %u)",
+				nlinks_pos, link->count);
+		}
+
+		if (nlinks_pos == iter->pos.inode)
+			genradix_iter_advance(&nlinks_iter, links);
+
+		bch2_btree_iter_next(iter);
+		bch2_trans_cond_resched(&trans);
+	}
+fsck_err:
+	bch2_trans_exit(&trans);
+
+	if (ret2)
+		bch_err(c, "error in fsck: btree error %i while walking inodes", ret2);
+
+	return ret ?: ret2;
+}
+
+/*
+ * Verify inode link counts, in as many passes as memory requires.
+ *
+ * Each pass counts links from the dirents btree for inode numbers starting
+ * at range_start, then checks the inodes in that range. The dirent walk may
+ * lower range_end; the next pass starts there. The loop ends when a pass
+ * leaves range_end at U64_MAX or a walk returns an error.
+ */
+noinline_for_stack
+static int check_inode_nlinks(struct bch_fs *c,
+			      struct bch_inode_unpacked *lostfound_inode)
+{
+	nlink_table links;
+	u64 range_start, range_end = 0;
+	int ret = 0;
+
+	bch_verbose(c, "checking inode nlinks");
+
+	genradix_init(&links);
+
+	for (;;) {
+		range_start = range_end;
+		range_end = U64_MAX;
+
+		ret = bch2_gc_walk_dirents(c, &links, range_start, &range_end);
+		if (!ret)
+			ret = bch2_gc_walk_inodes(c, lostfound_inode, &links,
+						  range_start, range_end);
+		if (ret)
+			break;
+
+		genradix_free(&links);
+
+		if (range_end == U64_MAX)
+			break;
+	}
+
+	genradix_free(&links);
+
+	return ret;
+}
+
+/*
+ * Full fsck: run every pass in order - extents, dirents, xattrs, root
+ * directory, lost+found, directory structure, inode link counts - and stop
+ * at the first pass that returns nonzero.
+ *
+ * Returns 0 if all passes succeed, otherwise the failing pass's result.
+ */
+int bch2_fsck_full(struct bch_fs *c)
+{
+	struct bch_inode_unpacked root_inode, lostfound_inode;
+	int ret;
+
+	ret = check_extents(c);
+	if (ret)
+		return ret;
+
+	ret = check_dirents(c);
+	if (ret)
+		return ret;
+
+	ret = check_xattrs(c);
+	if (ret)
+		return ret;
+
+	ret = check_root(c, &root_inode);
+	if (ret)
+		return ret;
+
+	ret = check_lostfound(c, &root_inode, &lostfound_inode);
+	if (ret)
+		return ret;
+
+	ret = check_directory_structure(c, &lostfound_inode);
+	if (ret)
+		return ret;
+
+	return check_inode_nlinks(c, &lostfound_inode);
+}
+
+/*
+ * Reduced fsck: locate (or create) the root and lost+found directories, then
+ * check inode link counts. Stops at the first step that returns nonzero.
+ */
+int bch2_fsck_inode_nlink(struct bch_fs *c)
+{
+	struct bch_inode_unpacked root_inode, lostfound_inode;
+	int ret;
+
+	ret = check_root(c, &root_inode);
+	if (ret)
+		return ret;
+
+	ret = check_lostfound(c, &root_inode, &lostfound_inode);
+	if (ret)
+		return ret;
+
+	return check_inode_nlinks(c, &lostfound_inode);
+}
+
+/*
+ * Minimal fsck: walk the inodes btree and run check_inode() (without link
+ * count verification) on every inode that has BCH_INODE_I_SIZE_DIRTY,
+ * BCH_INODE_I_SECTORS_DIRTY or BCH_INODE_UNLINKED set.
+ *
+ * Returns bch2_trans_exit()'s result if nonzero, otherwise the first error
+ * from the walk or from check_inode(), otherwise 0.
+ */
+int bch2_fsck_walk_inodes_only(struct bch_fs *c)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_inode inode;
+	int ret;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) {
+		if (k.k->type != KEY_TYPE_inode)
+			continue;
+
+		inode = bkey_s_c_to_inode(k);
+
+		/*
+		 * bi_flags is stored little endian in the key; convert it
+		 * before testing it against the CPU-endian flag constants:
+		 */
+		if (le32_to_cpu(inode.v->bi_flags) &
+		    (BCH_INODE_I_SIZE_DIRTY|
+		     BCH_INODE_I_SECTORS_DIRTY|
+		     BCH_INODE_UNLINKED)) {
+			/* No lost+found inode and no link table: flag repairs only */
+			ret = check_inode(&trans, NULL, iter, inode, NULL);
+			BUG_ON(ret == -EINTR);
+			if (ret)
+				break;
+		}
+	}
+	BUG_ON(ret == -EINTR);
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
new file mode 100644
index 000000000000..9e4af02bde1e
--- /dev/null
+++ b/fs/bcachefs/fsck.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FSCK_H
+#define _BCACHEFS_FSCK_H
+
+/* Run every fsck pass; returns 0 or the first failing pass's result */
+int bch2_fsck_full(struct bch_fs *);
+/* Check root, lost+found and inode link counts only */
+int bch2_fsck_inode_nlink(struct bch_fs *);
+/* Only process inodes flagged unlinked, i_size dirty or i_sectors dirty */
+int bch2_fsck_walk_inodes_only(struct bch_fs *);
+
+#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
new file mode 100644
index 000000000000..c0642ff46ba0
--- /dev/null
+++ b/fs/bcachefs/inode.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "str_hash.h"
+
+#include <linux/random.h>
+
+#include <asm/unaligned.h>
+
+/* NULL-terminated list of option names, one string per BCH_INODE_OPTS() entry */
+const char * const bch2_inode_opts[] = {
+#define x(name, ...)	#name,
+	BCH_INODE_OPTS()
+#undef  x
+	NULL,
+};
+
+/*
+ * Variable length field encoding tables, indexed by (shift - 1) where shift
+ * runs from 1 to 8:
+ *
+ * byte_table[] is the encoded length in bytes for that shift. bits_table[]
+ * is that length in bits minus shift; the encoder picks the first shift
+ * whose bits_table entry is strictly greater than the value's bit width.
+ */
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
+static const u8 bits_table[8] = {
+	1  * 8 - 1,
+	2  * 8 - 2,
+	3  * 8 - 3,
+	4  * 8 - 4,
+	6  * 8 - 5,
+	8  * 8 - 6,
+	10 * 8 - 7,
+	13 * 8 - 8,
+};
+
+/*
+ * Encode the 128 bit value @hi:@lo at @out in the variable length format.
+ *
+ * The value is written big endian in the shortest length from byte_table[]
+ * whose bits_table[] entry exceeds the value's bit width, and a marker bit
+ * is ORed into the first byte to identify that length.
+ *
+ * BUGs if no length fits or if the encoding would run past @end.
+ * Returns the number of bytes written.
+ */
+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
+{
+	__be64 be[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
+	unsigned width, len, idx;
+
+	width = likely(!hi)
+		? fls64(lo)
+		: fls64(hi) + 64;
+
+	for (idx = 0; idx < 8; idx++)
+		if (width < bits_table[idx])
+			break;
+
+	if (idx == 8)
+		BUG();
+
+	len = byte_table[idx];
+
+	BUG_ON(out + len > end);
+
+	/* Copy the low @len bytes of the 16 byte big endian value */
+	memcpy(out, (u8 *) be + 16 - len, len);
+	/* Marker bit: bit 7 for the shortest length, bit 0 for the longest */
+	*out |= (1 << 8) >> (idx + 1);
+
+	return len;
+}
+
+/*
+ * Decode one variable length field starting at @in.
+ *
+ * @out receives the value as { high 64 bits, low 64 bits }; @out_bits
+ * receives its bit width.
+ *
+ * Returns the number of bytes consumed, or -1 if @in is at or past @end, the
+ * first byte is zero, or the field would extend past @end.
+ */
+static int inode_decode_field(const u8 *in, const u8 *end,
+			      u64 out[2], unsigned *out_bits)
+{
+	__be64 raw[2] = { 0, 0 };
+	unsigned len, shift;
+	u8 *dst;
+
+	if (in >= end || !*in)
+		return -1;
+
+	/*
+	 * The highest set bit of the first byte selects the length; shift
+	 * (1 <= shift <= 8) is also the position of the marker bit to strip:
+	 */
+	shift	= 8 - __fls(*in);
+	len	= byte_table[shift - 1];
+
+	if (in + len > end)
+		return -1;
+
+	/* Right-align the bytes within the 16 byte big endian buffer */
+	dst = (u8 *) raw + 16 - len;
+	memcpy(dst, in, len);
+	*dst ^= (1 << 8) >> shift;
+
+	out[0] = be64_to_cpu(raw[0]);
+	out[1] = be64_to_cpu(raw[1]);
+	*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
+
+	return len;
+}
+
+/*
+ * Pack @inode into the on-disk key format in @packed.
+ *
+ * The inode number, hash seed, flags and mode are stored directly. Every
+ * BCH_INODE_FIELDS() member is then appended with inode_encode_field(), and
+ * trailing fields that are zero are dropped again: the value size and the
+ * stored field count only cover fields up to the last nonzero one.
+ *
+ * With CONFIG_BCACHEFS_DEBUG the result is unpacked again and compared with
+ * @inode, BUGging on any mismatch.
+ */
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+		     const struct bch_inode_unpacked *inode)
+{
+	u8 *out = packed->inode.v.fields;
+	/* Encoding may use everything up to the end of the buffer struct */
+	u8 *end = (void *) &packed[1];
+	u8 *last_nonzero_field = out;
+	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+	unsigned bytes;
+
+	bkey_inode_init(&packed->inode.k_i);
+	packed->inode.k.p.inode		= inode->bi_inum;
+	packed->inode.v.bi_hash_seed	= inode->bi_hash_seed;
+	packed->inode.v.bi_flags	= cpu_to_le32(inode->bi_flags);
+	packed->inode.v.bi_mode		= cpu_to_le16(inode->bi_mode);
+
+	/* Encode each field; remember where the last nonzero one ended */
+#define x(_name, _bits)					\
+	out += inode_encode_field(out, end, 0, inode->_name);		\
+	nr_fields++;							\
+									\
+	if (inode->_name) {						\
+		last_nonzero_field = out;				\
+		last_nonzero_fieldnr = nr_fields;			\
+	}
+
+	BCH_INODE_FIELDS()
+#undef  x
+
+	/* Truncate back to just after the last nonzero field */
+	out = last_nonzero_field;
+	nr_fields = last_nonzero_fieldnr;
+
+	bytes = out - (u8 *) &packed->inode.v;
+	set_bkey_val_bytes(&packed->inode.k, bytes);
+	memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+	SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+
+	/* Debug builds: verify that packing round-trips */
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+		struct bch_inode_unpacked unpacked;
+
+		int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
+					   &unpacked);
+		BUG_ON(ret);
+		BUG_ON(unpacked.bi_inum		!= inode->bi_inum);
+		BUG_ON(unpacked.bi_hash_seed	!= inode->bi_hash_seed);
+		BUG_ON(unpacked.bi_mode		!= inode->bi_mode);
+
+#define x(_name, _bits)	BUG_ON(unpacked._name != inode->_name);
+		BCH_INODE_FIELDS()
+#undef  x
+	}
+}
+
+/*
+ * Unpack the on-disk inode key @inode into @unpacked.
+ *
+ * The inode number, hash seed, flags and mode are copied directly. The
+ * variable length fields are decoded in BCH_INODE_FIELDS() order; once the
+ * key's stored field count is reached, the remaining members of @unpacked
+ * are zeroed.
+ *
+ * Returns 0 on success, or a negative value if a field fails to decode or
+ * is wider than the member it is stored into.
+ */
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+		      struct bch_inode_unpacked *unpacked)
+{
+	const u8 *in = inode.v->fields;
+	const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+	u64 field[2];
+	unsigned fieldnr = 0, field_bits;
+	int ret;
+
+	unpacked->bi_inum	= inode.k->p.inode;
+	unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+	unpacked->bi_flags	= le32_to_cpu(inode.v->bi_flags);
+	unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
+
+	/*
+	 * The memset clears from the current member to the end of the
+	 * struct, so it relies on the BCH_INODE_FIELDS() members being the
+	 * last members of struct bch_inode_unpacked, in the same order:
+	 */
+#define x(_name, _bits)					\
+	if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {			\
+		memset(&unpacked->_name, 0,				\
+		       sizeof(*unpacked) -				\
+		       offsetof(struct bch_inode_unpacked, _name));	\
+		return 0;						\
+	}								\
+									\
+	ret = inode_decode_field(in, end, field, &field_bits);		\
+	if (ret < 0)							\
+		return ret;						\
+									\
+	if (field_bits > sizeof(unpacked->_name) * 8)			\
+		return -1;						\
+									\
+	unpacked->_name = field[1];					\
+	in += ret;
+
+	BCH_INODE_FIELDS()
+#undef  x
+
+	/* XXX: signal if there were more fields than expected? */
+
+	return 0;
+}
+
+/*
+ * Look up inode @inum and unpack it into @inode.
+ *
+ * Returns the iterator positioned at the inode (obtained with
+ * BTREE_ITER_SLOTS|@flags) on success. On failure the iterator is put and an
+ * ERR_PTR is returned: the btree error, -EIO if the slot does not hold an
+ * inode key, or the unpack error.
+ */
+struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
+				   struct bch_inode_unpacked *inode,
+				   u64 inum, unsigned flags)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
+				   BTREE_ITER_SLOTS|flags);
+	if (IS_ERR(iter))
+		return iter;
+
+	k = bch2_btree_iter_peek_slot(iter);
+
+	ret = bkey_err(k);
+	if (!ret && k.k->type != KEY_TYPE_inode)
+		ret = -EIO;
+	if (!ret)
+		ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+	if (!ret)
+		return iter;
+
+	bch2_trans_iter_put(trans, iter);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Queue an update of @inode at @iter in @trans.
+ *
+ * The packed key is allocated with bch2_trans_kmalloc(). Returns 0, or the
+ * allocation error.
+ */
+int bch2_inode_write(struct btree_trans *trans,
+		     struct btree_iter *iter,
+		     struct bch_inode_unpacked *inode)
+{
+	struct bkey_inode_buf *packed =
+		bch2_trans_kmalloc(trans, sizeof(*packed));
+
+	if (IS_ERR(packed))
+		return PTR_ERR(packed);
+
+	bch2_inode_pack(packed, inode);
+	bch2_trans_update(trans, iter, &packed->inode.k_i);
+	return 0;
+}
+
+/*
+ * Validate an inode key.
+ *
+ * Returns NULL if the key is acceptable, otherwise a static string naming
+ * the first check that failed.
+ */
+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+	struct bch_inode_unpacked unpacked;
+
+	if (k.k->p.offset)
+		return "nonzero offset";
+
+	if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
+		return "incorrect value size";
+
+	if (k.k->p.inode < BLOCKDEV_INODE_MAX)
+		return "fs inode in blockdev range";
+
+	if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+		return "invalid str hash type";
+
+	if (bch2_inode_unpack(inode, &unpacked))
+		return "invalid variable length fields";
+
+	if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+		return "invalid data checksum type";
+
+	/* Previously reported with the checksum message by mistake */
+	if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+		return "invalid compression type";
+
+	if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+	    unpacked.bi_nlink != 0)
+		return "flagged as unlinked but bi_nlink != 0";
+
+	return NULL;
+}
+
+/*
+ * Print every BCH_INODE_FIELDS() member of inode key @k to @out as
+ * "name: value ", or "(unpack error)" if the key cannot be unpacked.
+ */
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
+		       struct bkey_s_c k)
+{
+	struct bch_inode_unpacked u;
+
+	if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &u)) {
+#define x(_name, _bits)						\
+		pr_buf(out, #_name ": %llu ", (u64) u._name);
+		BCH_INODE_FIELDS()
+#undef  x
+		return;
+	}
+
+	pr_buf(out, "(unpack error)");
+}
+
+/*
+ * Validate an inode_generation key: the offset must be zero and the value
+ * must be exactly sizeof(struct bch_inode_generation).
+ *
+ * Returns NULL if valid, otherwise a static string naming the failed check.
+ */
+const char *bch2_inode_generation_invalid(const struct bch_fs *c,
+					  struct bkey_s_c k)
+{
+	if (k.k->p.offset)
+		return "nonzero offset";
+
+	return bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)
+		? "incorrect value size"
+		: NULL;
+}
+
+/* Print the generation number stored in inode_generation key @k to @out */
+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
+				   struct bkey_s_c k)
+{
+	u32 generation =
+		le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+
+	pr_buf(out, "generation: %u", generation);
+}
+
+/*
+ * First stage of inode initialisation: zero @inode_u, record the
+ * filesystem's string hash type in bi_flags and pick a random hash seed.
+ */
+void bch2_inode_init_early(struct bch_fs *c,
+			   struct bch_inode_unpacked *inode_u)
+{
+	enum bch_str_hash_type hash_type;
+
+	memset(inode_u, 0, sizeof(*inode_u));
+
+	hash_type = bch2_str_hash_opt_to_type(c, c->opts.str_hash);
+
+	/* ick */
+	inode_u->bi_flags |= hash_type << INODE_STR_HASH_OFFSET;
+	get_random_bytes(&inode_u->bi_hash_seed,
+			 sizeof(inode_u->bi_hash_seed));
+}
+
+/*
+ * Second stage of inode initialisation: fill in mode, ownership, device and
+ * all four timestamps (set to @now).
+ *
+ * If @parent is given and has S_ISGID set, the new inode takes the parent's
+ * gid, and S_ISGID as well when it is a directory. Every BCH_INODE_OPTS()
+ * option is copied from @parent when there is one.
+ */
+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+			  uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+			  struct bch_inode_unpacked *parent)
+{
+	inode_u->bi_mode	= mode;
+	inode_u->bi_uid		= uid;
+	inode_u->bi_gid		= gid;
+	inode_u->bi_dev		= rdev;
+	inode_u->bi_atime	= now;
+	inode_u->bi_mtime	= now;
+	inode_u->bi_ctime	= now;
+	inode_u->bi_otime	= now;
+
+	if (!parent)
+		return;
+
+	if (parent->bi_mode & S_ISGID) {
+		inode_u->bi_gid = parent->bi_gid;
+		if (S_ISDIR(mode))
+			inode_u->bi_mode |= S_ISGID;
+	}
+
+#define x(_name, ...)	inode_u->bi_##_name = parent->bi_##_name;
+	BCH_INODE_OPTS()
+#undef x
+}
+
+/*
+ * Fully initialise @inode_u: bch2_inode_init_early() followed by
+ * bch2_inode_init_late() with the current filesystem time.
+ */
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+		     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+		     struct bch_inode_unpacked *parent)
+{
+	u64 now;
+
+	bch2_inode_init_early(c, inode_u);
+
+	now = bch2_current_time(c);
+	bch2_inode_init_late(inode_u, now, uid, gid, mode, rdev, parent);
+}
+
+/*
+ * Generation number to give a new inode created in the slot holding @k: the
+ * stored value for an inode_generation key, 0 for any other key type.
+ * Must not be called on a live inode key (BUGs).
+ */
+static inline u32 bkey_generation(struct bkey_s_c k)
+{
+	if (k.k->type == KEY_TYPE_inode)
+		BUG();
+
+	if (k.k->type != KEY_TYPE_inode_generation)
+		return 0;
+
+	return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+}
+
+/*
+ * Allocate an inode number for @inode_u and queue its creation in @trans.
+ *
+ * @min, @max: range of inode numbers to search; a @max of 0 means
+ *             ULLONG_MAX, and the inodes_32bit option caps it at U32_MAX
+ * @hint:      where to start searching; updated to the number chosen
+ *
+ * Scans slots in the inodes btree starting at *@hint (or @min if the hint is
+ * out of range) for one that does not hold an inode key. If the scan reaches
+ * @max and did not start at @min, it is retried once from @min.
+ *
+ * Returns 0 with bi_inum and bi_generation filled in and the update queued,
+ * -ENOSPC if no free slot was found, or a btree/allocation error.
+ */
+int bch2_inode_create(struct btree_trans *trans,
+		      struct bch_inode_unpacked *inode_u,
+		      u64 min, u64 max, u64 *hint)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_inode_buf *inode_p;
+	struct btree_iter *iter;
+	u64 start;
+	int ret;
+
+	if (!max)
+		max = ULLONG_MAX;
+
+	if (c->opts.inodes_32bit)
+		max = min_t(u64, max, U32_MAX);
+
+	start = READ_ONCE(*hint);
+
+	if (start >= max || start < min)
+		start = min;
+
+	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+	if (IS_ERR(inode_p))
+		return PTR_ERR(inode_p);
+
+	iter = bch2_trans_get_iter(trans,
+			BTREE_ID_INODES, POS(start, 0),
+			BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+again:
+	while (1) {
+		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+
+		ret = bkey_err(k);
+		if (ret)
+			return ret;
+
+		switch (k.k->type) {
+		case KEY_TYPE_inode:
+			/* slot used */
+			if (iter->pos.inode >= max)
+				goto out;
+
+			bch2_btree_iter_next_slot(iter);
+			break;
+
+		default:
+			/* Free slot: claim it, carrying over any stored generation */
+			*hint			= k.k->p.inode;
+			inode_u->bi_inum	= k.k->p.inode;
+			inode_u->bi_generation	= bkey_generation(k);
+
+			bch2_inode_pack(inode_p, inode_u);
+			bch2_trans_update(trans, iter, &inode_p->inode.k_i);
+			return 0;
+		}
+	}
+out:
+	/* Reached max: wrap around once if the scan didn't start at min */
+	if (start != min) {
+		/* Retry from start */
+		start = min;
+		bch2_btree_iter_set_pos(iter, POS(start, 0));
+		goto again;
+	}
+
+	return -ENOSPC;
+}
+
+/*
+ * Delete inode @inode_nr together with all of its extents, xattrs and
+ * dirents.
+ *
+ * The inode key is replaced either by a plain deleted key or, when a nonzero
+ * generation needs to be remembered, by an inode_generation key. The commit
+ * is retried for as long as it returns -EINTR.
+ *
+ * Returns 0 or the first error encountered.
+ */
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_i_inode_generation delete;
+	struct bpos start = POS(inode_nr, 0);
+	struct bpos end = POS(inode_nr + 1, 0);
+	int ret;
+
+	/*
+	 * If this was a directory, there shouldn't be any real dirents left -
+	 * but there could be whiteouts (from hash collisions) that we should
+	 * delete:
+	 *
+	 * XXX: the dirent code ideally would delete whiteouts when they're no
+	 * longer needed
+	 */
+	ret   = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+					start, end, NULL) ?:
+		bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+					start, end, NULL) ?:
+		bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+					start, end, NULL);
+	if (ret)
+		return ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0),
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	do {
+		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+		u32 bi_generation = 0;
+
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+					"inode %llu not found when deleting",
+					inode_nr);
+
+		switch (k.k->type) {
+		case KEY_TYPE_inode: {
+			struct bch_inode_unpacked inode_u;
+
+			/* Bump the generation so that a reused number can be told apart */
+			if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
+				bi_generation = inode_u.bi_generation + 1;
+			break;
+		}
+		case KEY_TYPE_inode_generation: {
+			/* Already deleted: keep the stored generation as is */
+			struct bkey_s_c_inode_generation g =
+				bkey_s_c_to_inode_generation(k);
+			bi_generation = le32_to_cpu(g.v->bi_generation);
+			break;
+		}
+		}
+
+		if (!bi_generation) {
+			/* Nothing to remember: write a plain deleted key */
+			bkey_init(&delete.k);
+			delete.k.p.inode = inode_nr;
+		} else {
+			bkey_inode_generation_init(&delete.k_i);
+			delete.k.p.inode = inode_nr;
+			delete.v.bi_generation = cpu_to_le32(bi_generation);
+		}
+
+		bch2_trans_update(&trans, iter, &delete.k_i);
+
+		ret = bch2_trans_commit(&trans, NULL, NULL,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOFAIL);
+	} while (ret == -EINTR);
+
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
+/*
+ * Look up inode @inode_nr within @trans and unpack it into @inode.
+ *
+ * Returns 0 on success, -ENOENT if the slot does not hold an inode key, the
+ * unpack error, or a btree error. The iterator is released on every path
+ * once it has been obtained.
+ */
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+				  struct bch_inode_unpacked *inode)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
+			POS(inode_nr, 0), BTREE_ITER_SLOTS);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto out;
+
+	ret = k.k->type == KEY_TYPE_inode
+		? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
+		: -ENOENT;
+out:
+	/* Previously skipped when the peek failed */
+	bch2_trans_iter_put(trans, iter);
+
+	return ret;
+}
+
+/*
+ * Look up inode @inode_nr and unpack it into @inode, running
+ * bch2_inode_find_by_inum_trans() through bch2_trans_do().
+ */
+int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+			    struct bch_inode_unpacked *inode)
+{
+	int ret;
+
+	ret = bch2_trans_do(c, NULL, 0,
+		bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
+
+	return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/*
+ * Pack a set of test inodes whose fields hold their maximum values. The
+ * checking itself is done by the CONFIG_BCACHEFS_DEBUG round-trip
+ * verification inside bch2_inode_pack().
+ */
+void bch2_inode_pack_test(void)
+{
+	struct bch_inode_unpacked test_inodes[] = {
+		{
+			.bi_atime	= U64_MAX,
+			.bi_ctime	= U64_MAX,
+			.bi_mtime	= U64_MAX,
+			.bi_otime	= U64_MAX,
+			.bi_size	= U64_MAX,
+			.bi_sectors	= U64_MAX,
+			.bi_uid		= U32_MAX,
+			.bi_gid		= U32_MAX,
+			.bi_nlink	= U32_MAX,
+			.bi_generation	= U32_MAX,
+			.bi_dev		= U32_MAX,
+		},
+	};
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(test_inodes); i++) {
+		struct bkey_inode_buf packed;
+
+		bch2_inode_pack(&packed, &test_inodes[i]);
+	}
+}
+#endif
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
new file mode 100644
index 000000000000..bb759a46dc41
--- /dev/null
+++ b/fs/bcachefs/inode.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_H
+#define _BCACHEFS_INODE_H
+
+#include "opts.h"
+
+extern const char * const bch2_inode_opts[];
+
+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+/*
+ * struct bkey_ops compound literal for inode keys: wires up the validity
+ * check and the text formatter declared just above.
+ */
+#define bch2_bkey_ops_inode (struct bkey_ops) {		\
+	.key_invalid	= bch2_inode_invalid,		\
+	.val_to_text	= bch2_inode_to_text,		\
+}
+
+const char *bch2_inode_generation_invalid(const struct bch_fs *,
+					  struct bkey_s_c);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
+				   struct bkey_s_c);
+
+/*
+ * struct bkey_ops compound literal for inode_generation keys: wires up the
+ * validity check and the text formatter declared just above.
+ */
+#define bch2_bkey_ops_inode_generation (struct bkey_ops) {	\
+	.key_invalid	= bch2_inode_generation_invalid,	\
+	.val_to_text	= bch2_inode_generation_to_text,	\
+}
+
+/*
+ * Unpacked form of an inode, as produced by bch2_inode_unpack() and consumed
+ * by bch2_inode_pack(): four fixed members followed by one u<_bits> member
+ * for every entry of BCH_INODE_FIELDS().
+ */
+struct bch_inode_unpacked {
+	u64			bi_inum;
+	__le64			bi_hash_seed;
+	u32			bi_flags;
+	u16			bi_mode;
+
+	/* expands each (_name, _bits) entry to "u<_bits> _name;" */
+#define x(_name, _bits)	u##_bits _name;
+	BCH_INODE_FIELDS()
+#undef  x
+};
+
+/*
+ * Buffer that bch2_inode_pack() writes into: the bkey_i_inode header followed
+ * by padding whose size is summed over BCH_INODE_FIELDS(), each field
+ * contributing 8 + _bits / 8 bytes.
+ */
+struct bkey_inode_buf {
+	struct bkey_i_inode	inode;
+
+	/* turns the field list into the constant expression 0 + (8 + bits/8) + ... */
+#define x(_name, _bits)		+ 8 + _bits / 8
+	u8		_pad[0 + BCH_INODE_FIELDS()];
+#undef  x
+} __attribute__((packed, aligned(8)));
+
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+
+struct btree_iter *bch2_inode_peek(struct btree_trans *,
+			struct bch_inode_unpacked *, u64, unsigned);
+int bch2_inode_write(struct btree_trans *, struct btree_iter *,
+		     struct bch_inode_unpacked *);
+
+void bch2_inode_init_early(struct bch_fs *,
+			   struct bch_inode_unpacked *);
+void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+			  uid_t, gid_t, umode_t, dev_t,
+			  struct bch_inode_unpacked *);
+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
+		     uid_t, gid_t, umode_t, dev_t,
+		     struct bch_inode_unpacked *);
+
+int bch2_inode_create(struct btree_trans *,
+		      struct bch_inode_unpacked *,
+		      u64, u64, u64 *);
+
+int bch2_inode_rm(struct bch_fs *, u64);
+
+int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
+				  struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
+
+/*
+ * Build a bch_io_opts from the per-inode option fields of @inode.
+ *
+ * For every entry of BCH_INODE_OPTS(), a non-zero inode->bi_<name> sets
+ * option <name> to that value minus one; a zero field leaves the option
+ * unset in the zero-initialised result.
+ */
+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
+{
+	struct bch_io_opts ret = { 0 };
+
+#define x(_name, _bits)					\
+	if (inode->bi_##_name)						\
+		opt_set(ret, _name, inode->bi_##_name - 1);
+	BCH_INODE_OPTS()
+#undef x
+	return ret;
+}
+
+/*
+ * Store @v into the inode field selected by @id.
+ *
+ * One case is generated per BCH_INODE_OPTS() entry, assigning @v to
+ * inode->bi_<name> unchanged; any other @id hits BUG().
+ */
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+				      enum inode_opt_id id, u64 v)
+{
+	switch (id) {
+#define x(_name, ...)							\
+	case Inode_opt_##_name:						\
+		inode->bi_##_name = v;					\
+		break;
+	BCH_INODE_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Return the raw value of the inode field selected by @id.
+ *
+ * One case is generated per BCH_INODE_OPTS() entry, returning
+ * inode->bi_<name> as stored; any other @id hits BUG().
+ */
+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
+				     enum inode_opt_id id)
+{
+	switch (id) {
+#define x(_name, ...)							\
+	case Inode_opt_##_name:						\
+		return inode->bi_##_name;
+	BCH_INODE_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+/*
+ * IO options for @inode: start from the filesystem options of @c converted
+ * with bch2_opts_to_inode_opts(), then apply the options set on the inode
+ * itself.
+ */
+static inline struct bch_io_opts
+io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode)
+{
+	struct bch_io_opts result = bch2_opts_to_inode_opts(c->opts);
+	struct bch_io_opts from_inode = bch2_inode_opts_get(inode);
+
+	bch2_io_opts_apply(&result, from_inode);
+
+	return result;
+}
+
+/* Extract bits 12..15 of @mode as a 4 bit value. */
+static inline u8 mode_to_type(umode_t mode)
+{
+	return (mode >> 12) & 0xf;
+}
+
+/* i_nlink: */
+
+/*
+ * Offset between the stored bi_nlink and the link count reported by
+ * bch2_inode_nlink_get(): 2 for directories, 1 for everything else.
+ */
+static inline unsigned nlink_bias(umode_t mode)
+{
+	if (S_ISDIR(mode))
+		return 2;
+
+	return 1;
+}
+
+/*
+ * Add one link: an inode flagged BCH_INODE_UNLINKED just loses the flag,
+ * otherwise the stored bi_nlink is incremented.
+ */
+static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+	if (!(bi->bi_flags & BCH_INODE_UNLINKED)) {
+		bi->bi_nlink++;
+		return;
+	}
+
+	bi->bi_flags &= ~BCH_INODE_UNLINKED;
+}
+
+/*
+ * Drop one link: decrement the stored bi_nlink while it is non-zero, and once
+ * it is already zero flag the inode BCH_INODE_UNLINKED instead.  Must not be
+ * called on an inode that is already flagged unlinked.
+ */
+static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi)
+{
+	BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED);
+
+	if (!bi->bi_nlink) {
+		bi->bi_flags |= BCH_INODE_UNLINKED;
+		return;
+	}
+
+	bi->bi_nlink--;
+}
+
+/*
+ * Link count of @bi: 0 when the inode is flagged BCH_INODE_UNLINKED,
+ * otherwise the stored bi_nlink plus nlink_bias() for the inode's mode.
+ */
+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
+{
+	if (bi->bi_flags & BCH_INODE_UNLINKED)
+		return 0;
+
+	return bi->bi_nlink + nlink_bias(bi->bi_mode);
+}
+
+/*
+ * Set the link count of @bi to @nlink - the inverse of
+ * bch2_inode_nlink_get(): zero is stored as bi_nlink 0 plus the
+ * BCH_INODE_UNLINKED flag, anything else as @nlink minus nlink_bias() with
+ * the flag cleared.
+ */
+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
+					unsigned nlink)
+{
+	if (!nlink) {
+		bi->bi_nlink = 0;
+		bi->bi_flags |= BCH_INODE_UNLINKED;
+		return;
+	}
+
+	bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
+	bi->bi_flags &= ~BCH_INODE_UNLINKED;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void);
+#else
+static inline void bch2_inode_pack_test(void) {}
+#endif
+
+#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
new file mode 100644
index 000000000000..836004b128f0
--- /dev/null
+++ b/fs/bcachefs/io.c
@@ -0,0 +1,2210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "compress.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "rebalance.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/blkdev.h>
+#include <linux/random.h>
+
+#include <trace/events/bcachefs.h>
+
+/*
+ * Decide, with a random element, whether @target should be treated as
+ * congested.
+ *
+ * For every device set in the target's mask that is present in c->devs[],
+ * the device's ->congested counter is read, reduced by the time elapsed since
+ * ->congested_last (shifted right by 12), clamped at zero and summed.  The
+ * result is true when bch2_rand_range(nr * CONGESTED_MAX) is below that sum,
+ * where nr is the number of devices counted.
+ *
+ * A @target of 0 is never congested.
+ */
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+	const struct bch_devs_mask *devs;
+	unsigned d, nr = 0, total = 0;
+	u64 now = local_clock(), last;
+	s64 congested;
+	struct bch_dev *ca;
+
+	if (!target)
+		return false;
+
+	/* c->devs[] entries are read with rcu_dereference() below */
+	rcu_read_lock();
+	devs = bch2_target_to_mask(c, target);
+	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+		ca = rcu_dereference(c->devs[d]);
+		if (!ca)
+			continue;
+
+		congested = atomic_read(&ca->congested);
+		last = READ_ONCE(ca->congested_last);
+		/* decay the counter by the time since it was last bumped */
+		if (time_after64(now, last))
+			congested -= (now - last) >> 12;
+
+		total += max(congested, 0LL);
+		nr++;
+	}
+	rcu_read_unlock();
+
+	return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+/*
+ * Update @ca's congestion counter after an IO that took @io_latency.
+ *
+ * The threshold is the QUANTILE_IDX(1) entry of the device's latency
+ * quantiles for @rw, multiplied by 4 for reads and by 8 otherwise.  An IO
+ * slower than a non-zero threshold bumps ->congested (unless it has already
+ * reached CONGESTED_MAX) and records @now in ->congested_last; any other IO
+ * decrements ->congested while it is positive.
+ */
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+				       u64 now, int rw)
+{
+	u64 latency_capable =
+		ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+	/* ideally we'd be taking into account the device's variance here: */
+	u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+	s64 latency_over = io_latency - latency_threshold;
+
+	if (latency_threshold && latency_over > 0) {
+		/*
+		 * bump up congested by approximately latency_over * 4 /
+		 * latency_threshold - we don't need much accuracy here so don't
+		 * bother with the divide:
+		 */
+		if (atomic_read(&ca->congested) < CONGESTED_MAX)
+			atomic_add(latency_over >>
+				   max_t(int, ilog2(latency_threshold) - 2, 0),
+				   &ca->congested);
+
+		ca->congested_last = now;
+	} else if (atomic_read(&ca->congested) > 0) {
+		atomic_dec(&ca->congested);
+	}
+}
+
+/*
+ * Account the latency of a completed IO on @ca.
+ * @ca:		device the IO was issued to
+ * @submit_time:	local_clock() value taken when the IO was submitted
+ * @rw:		index into the per-direction latency state (READ or WRITE)
+ *
+ * Folds the measured latency into ca->cur_latency[rw] with ewma_add(), then
+ * updates the congestion counter and the io_latency time stats.  The latency
+ * is taken as 0 if @submit_time is not before the current clock.
+ */
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
+{
+	atomic64_t *latency = &ca->cur_latency[rw];
+	u64 now = local_clock();
+	u64 io_latency = time_after64(now, submit_time)
+		? now - submit_time
+		: 0;
+	u64 old, new, v = atomic64_read(latency);
+
+	do {
+		old = v;
+
+		/*
+		 * If the io latency was reasonably close to the current
+		 * latency, skip doing the update and atomic operation - most of
+		 * the time:
+		 *
+		 * (the mask is built from ~0U rather than ~0: left shifting a
+		 * negative int is undefined behaviour in ISO C)
+		 */
+		if (abs((int) (old - io_latency)) < (old >> 1) &&
+		    now & ~(~0U << 5))
+			break;
+
+		new = ewma_add(old, io_latency, 5);
+	} while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+	bch2_congested_acct(ca, io_latency, now, rw);
+
+	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+}
+
+/* Allocate, free from mempool: */
+
+/*
+ * Give every page of @bio back to c->bio_bounce_pages, skipping
+ * ZERO_PAGE(0), and reset the bio's vector count.
+ */
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
+{
+	struct bvec_iter_all iter;
+	struct bio_vec *bv;
+
+	bio_for_each_segment_all(bv, bio, iter) {
+		if (bv->bv_page == ZERO_PAGE(0))
+			continue;
+
+		mempool_free(bv->bv_page, &c->bio_bounce_pages);
+	}
+
+	bio->bi_vcnt = 0;
+}
+
+/*
+ * Allocate one page for a bounce bio.
+ *
+ * While *@using_mempool is false a plain alloc_page(GFP_NOIO) is tried
+ * first.  The first time that fails, c->bio_bounce_pages_lock is taken,
+ * *@using_mempool is set, and this and all later calls allocate from the
+ * c->bio_bounce_pages mempool.  The caller drops the lock when
+ * *@using_mempool ended up true.
+ */
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
+{
+	if (likely(!*using_mempool)) {
+		struct page *page = alloc_page(GFP_NOIO);
+
+		if (likely(page))
+			return page;
+
+		/* fall back to the mempool from here on */
+		mutex_lock(&c->bio_bounce_pages_lock);
+		*using_mempool = true;
+	}
+
+	return mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+}
+
+/*
+ * Add @size bytes worth of pages to @bio, at most PAGE_SIZE per page, using
+ * __bio_alloc_page_pool() for each one.  If the mempool fallback was used,
+ * c->bio_bounce_pages_lock is released before returning.
+ */
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
+			       size_t size)
+{
+	bool pool_locked = false;
+	unsigned len;
+
+	for (; size; size -= len) {
+		struct page *page = __bio_alloc_page_pool(c, &pool_locked);
+
+		len = min(PAGE_SIZE, size);
+		BUG_ON(!bio_add_page(bio, page, len, 0));
+	}
+
+	if (pool_locked)
+		mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Extent update path: */
+
+/*
+ * Walk the existing keys that inserting @new would overwrite.
+ * @trans:		transaction the temporary iterator is taken from
+ * @extent_iter:	iterator positioned for the insert; only a copy is advanced
+ * @new:		key about to be inserted
+ * @may_allocate:	when false, fail with -ENOSPC if an existing key has
+ *			fewer allocated pointers than @new has dirty pointers
+ * @maybe_extending:	set to true on entry; cleared when a data extent of the
+ *			same inode is found at or past the end of @new
+ * @delta:		set to the sum, over each overlapped key, of the overlap
+ *			length times the difference in bkey_extent_is_allocation()
+ *			between @new and that key
+ *
+ * Returns 0, -ENOSPC as described above, or an error from copying or
+ * advancing the iterator.
+ */
+static int sum_sector_overwrites(struct btree_trans *trans,
+				 struct btree_iter *extent_iter,
+				 struct bkey_i *new,
+				 bool may_allocate,
+				 bool *maybe_extending,
+				 s64 *delta)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c old;
+	int ret = 0;
+
+	*maybe_extending = true;
+	*delta = 0;
+
+	iter = bch2_trans_copy_iter(trans, extent_iter);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
+		if (!may_allocate &&
+		    bch2_bkey_nr_ptrs_allocated(old) <
+		    bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) {
+			ret = -ENOSPC;
+			break;
+		}
+
+		/* overlap of [start, end) of new and old, signed by allocation change */
+		*delta += (min(new->k.p.offset,
+			      old.k->p.offset) -
+			  max(bkey_start_offset(&new->k),
+			      bkey_start_offset(old.k))) *
+			(bkey_extent_is_allocation(&new->k) -
+			 bkey_extent_is_allocation(old.k));
+
+		if (bkey_cmp(old.k->p, new->k.p) >= 0) {
+			/*
+			 * Check if there's already data above where we're
+			 * going to be writing to - this means we're definitely
+			 * not extending the file:
+			 *
+			 * Note that it's not sufficient to check if there's
+			 * data up to the sector offset we're going to be
+			 * writing to, because i_size could be up to one block
+			 * less:
+			 */
+			if (!bkey_cmp(old.k->p, new->k.p))
+				old = bch2_btree_iter_next(iter);
+
+			if (old.k && !bkey_err(old) &&
+			    old.k->p.inode == extent_iter->pos.inode &&
+			    bkey_extent_is_data(old.k))
+				*maybe_extending = false;
+
+			break;
+		}
+	}
+
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+/*
+ * Insert extent @k at @iter and, in the same commit, update the owning
+ * inode's size and sector count.
+ * @trans:		btree transaction
+ * @iter:		extents iterator positioned for the insert
+ * @k:			key to insert; first trimmed by bch2_extent_trim_atomic()
+ * @disk_res:		disk reservation passed to the commit; a NULL or empty
+ *			reservation makes sum_sector_overwrites() refuse inserts
+ *			that would need more allocated pointers (-ENOSPC)
+ * @journal_seq:	passed through to bch2_trans_commit()
+ * @new_i_size:	candidate i_size in bytes; only used if the insert may be
+ *			extending the file, and capped at the end of @k
+ * @i_sectors_delta:	if non-NULL, incremented by the change in allocated
+ *			sectors after a successful commit
+ *
+ * Returns 0 on success or the error from trimming, scanning overwritten keys,
+ * peeking the inode or committing.
+ */
+int bch2_extent_update(struct btree_trans *trans,
+		       struct btree_iter *iter,
+		       struct bkey_i *k,
+		       struct disk_reservation *disk_res,
+		       u64 *journal_seq,
+		       u64 new_i_size,
+		       s64 *i_sectors_delta)
+{
+	/* this must live until after bch2_trans_commit(): */
+	struct bkey_inode_buf inode_p;
+	bool extending = false;
+	s64 delta = 0;
+	int ret;
+
+	ret = bch2_extent_trim_atomic(k, iter);
+	if (ret)
+		return ret;
+
+	ret = sum_sector_overwrites(trans, iter, k,
+			disk_res && disk_res->sectors != 0,
+			&extending, &delta);
+	if (ret)
+		return ret;
+
+	/* k->k.p.offset is in sectors; << 9 converts to bytes */
+	new_i_size = extending
+		? min(k->k.p.offset << 9, new_i_size)
+		: 0;
+
+	if (delta || new_i_size) {
+		struct btree_iter *inode_iter;
+		struct bch_inode_unpacked inode_u;
+
+		inode_iter = bch2_inode_peek(trans, &inode_u,
+				k->k.p.inode, BTREE_ITER_INTENT);
+		if (IS_ERR(inode_iter))
+			return PTR_ERR(inode_iter);
+
+		/*
+		 * XXX:
+		 * writeback can race a bit with truncate, because truncate
+		 * first updates the inode then truncates the pagecache. This is
+		 * ugly, but lets us preserve the invariant that the in memory
+		 * i_size is always >= the on disk i_size.
+		 *
+		BUG_ON(new_i_size > inode_u.bi_size &&
+		       (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
+		 */
+		BUG_ON(new_i_size > inode_u.bi_size && !extending);
+
+		/* only ever grow bi_size here, and not while I_SIZE_DIRTY is set */
+		if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+		    new_i_size > inode_u.bi_size)
+			inode_u.bi_size = new_i_size;
+		else
+			new_i_size = 0;
+
+		inode_u.bi_sectors += delta;
+
+		/* skip the inode update if neither size nor sectors changed */
+		if (delta || new_i_size) {
+			bch2_inode_pack(&inode_p, &inode_u);
+			bch2_trans_update(trans, inode_iter,
+					  &inode_p.inode.k_i);
+		}
+
+		bch2_trans_iter_put(trans, inode_iter);
+	}
+
+	bch2_trans_update(trans, iter, k);
+
+	ret = bch2_trans_commit(trans, disk_res, journal_seq,
+				BTREE_INSERT_NOCHECK_RW|
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_USE_RESERVE);
+	if (!ret && i_sectors_delta)
+		*i_sectors_delta += delta;
+
+	return ret;
+}
+
+/*
+ * Delete extents from @iter's current position up to @end.
+ * @trans:		btree transaction
+ * @iter:		extents iterator; advanced as keys are deleted
+ * @end:		position to stop at
+ * @journal_seq:	passed through to bch2_extent_update()
+ * @i_sectors_delta:	passed through to bch2_extent_update()
+ *
+ * Each pass inserts one deletion key starting at iter->pos, sized to the
+ * largest block-aligned key size and cut back to @end.  -EINTR from an
+ * individual step does not stop the loop; it is remembered and returned only
+ * if nothing else failed.  Any other error stops the loop.
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+		   struct bpos end, u64 *journal_seq,
+		   s64 *i_sectors_delta)
+{
+	struct bch_fs *c	= trans->c;
+	/*
+	 * KEY_SIZE_MAX rounded down to a multiple of the block size; the mask
+	 * is built from ~0U because left shifting a negative int is undefined
+	 * behaviour in ISO C:
+	 */
+	unsigned max_sectors	= KEY_SIZE_MAX & (~0U << c->block_bits);
+	struct bkey_s_c k;
+	int ret = 0, ret2 = 0;
+
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       bkey_cmp(iter->pos, end) < 0) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(c, 0);
+		struct bkey_i delete;
+
+		ret = bkey_err(k);
+		if (ret)
+			goto btree_err;
+
+		bkey_init(&delete.k);
+		delete.k.p = iter->pos;
+
+		/* create the biggest key we can */
+		bch2_key_resize(&delete.k, max_sectors);
+		bch2_cut_back(end, &delete.k);
+
+		bch2_trans_begin_updates(trans);
+
+		ret = bch2_extent_update(trans, iter, &delete,
+				&disk_res, journal_seq,
+				0, i_sectors_delta);
+		bch2_disk_reservation_put(c, &disk_res);
+btree_err:
+		/* remember -EINTR for the caller, but keep going */
+		if (ret == -EINTR) {
+			ret2 = ret;
+			ret = 0;
+		}
+		if (ret)
+			break;
+	}
+
+	if (bkey_cmp(iter->pos, end) > 0) {
+		bch2_btree_iter_set_pos(iter, end);
+		ret = bch2_btree_iter_traverse(iter);
+	}
+
+	return ret ?: ret2;
+}
+
+/*
+ * Delete the extents of inode @inum between offsets @start and @end, using a
+ * transaction private to this call.
+ * @c:			filesystem
+ * @inum:		inode number
+ * @start:		first offset to delete
+ * @end:		offset to stop at
+ * @journal_seq:	passed through to bch2_fpunch_at()
+ * @i_sectors_delta:	passed through to bch2_fpunch_at()
+ *
+ * Returns 0 on success or a negative error code.  -EINTR from
+ * bch2_fpunch_at() is reported as success; a failure to obtain the iterator
+ * is returned as is.
+ */
+int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
+		u64 *journal_seq, s64 *i_sectors_delta)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   POS(inum, start),
+				   BTREE_ITER_INTENT);
+	/* don't hand an ERR_PTR to bch2_fpunch_at(), which dereferences it */
+	if (IS_ERR(iter)) {
+		ret = PTR_ERR(iter);
+		goto out;
+	}
+
+	ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
+			     journal_seq, i_sectors_delta);
+	if (ret == -EINTR)
+		ret = 0;
+out:
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+/*
+ * Insert the keys of op->insert_keys into the extents btree.
+ * @op:	write operation; its keylist is consumed from the front as keys (or
+ *		parts of keys) are inserted
+ *
+ * Each pass copies the front key into a local buffer and inserts it with
+ * bch2_extent_update().  If iter->pos afterwards is before the end of the
+ * front key, the inserted part is cut off its front; otherwise the key is
+ * popped.  -EINTR retries the same key.
+ *
+ * Returns 0 once the keylist is empty, or the first error other than -EINTR
+ * (including a failure to obtain the iterator).
+ */
+int bch2_write_index_default(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct keylist *keys = &op->insert_keys;
+	struct bkey_i *k = bch2_keylist_front(keys);
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	int ret;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   bkey_start_pos(&k->k),
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	/* iter->pos is dereferenced below, so bail out on an ERR_PTR */
+	if (IS_ERR(iter)) {
+		ret = PTR_ERR(iter);
+		goto out;
+	}
+
+	do {
+		BKEY_PADDED(k) tmp;
+
+		/* insert a copy so the key in the keylist stays intact */
+		bkey_copy(&tmp.k, bch2_keylist_front(keys));
+
+		bch2_trans_begin_updates(&trans);
+
+		ret = bch2_extent_update(&trans, iter, &tmp.k,
+					 &op->res, op_journal_seq(op),
+					 op->new_i_size, &op->i_sectors_delta);
+		if (ret == -EINTR)
+			continue;
+		if (ret)
+			break;
+
+		if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
+			bch2_cut_front(iter->pos, bch2_keylist_front(keys));
+		else
+			bch2_keylist_pop_front(keys);
+	} while (!bch2_keylist_empty(keys));
+out:
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+/* Writes */
+
+/*
+ * Submit @wbio once for every pointer in key @k.
+ * @wbio:	write bio carrying the data
+ * @c:		filesystem
+ * @type:	data type the written sectors are accounted under
+ * @k:		key whose pointers name the target devices and offsets
+ *
+ * For every pointer except the last, a clone of @wbio is allocated from the
+ * device's replica_set and chained to @wbio via bio_inc_remaining(); the last
+ * pointer reuses @wbio itself.  If no IO ref can be taken on the device the
+ * bio is completed immediately with BLK_STS_REMOVED.
+ */
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+			       enum bch_data_type type,
+			       const struct bkey_i *k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+	const struct bch_extent_ptr *ptr;
+	struct bch_write_bio *n;
+	struct bch_dev *ca;
+
+	BUG_ON(c->opts.nochanges);
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+		       !c->devs[ptr->dev]);
+
+		ca = bch_dev_bkey_exists(c, ptr->dev);
+
+		/* not the last pointer: send a clone, keep @wbio for the last */
+		if (to_entry(ptr + 1) < ptrs.end) {
+			n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
+						   &ca->replica_set));
+
+			n->bio.bi_end_io	= wbio->bio.bi_end_io;
+			n->bio.bi_private	= wbio->bio.bi_private;
+			n->parent		= wbio;
+			n->split		= true;
+			n->bounce		= false;
+			n->put_bio		= true;
+			n->bio.bi_opf		= wbio->bio.bi_opf;
+			bio_inc_remaining(&wbio->bio);
+		} else {
+			n = wbio;
+			n->split		= false;
+		}
+
+		n->c			= c;
+		n->dev			= ptr->dev;
+		n->have_ioref		= bch2_dev_get_ioref(ca, WRITE);
+		n->submit_time		= local_clock();
+		n->bio.bi_iter.bi_sector = ptr->offset;
+
+		if (!journal_flushes_device(ca))
+			n->bio.bi_opf |= REQ_FUA;
+
+		if (likely(n->have_ioref)) {
+			this_cpu_add(ca->io_done->sectors[WRITE][type],
+				     bio_sectors(&n->bio));
+
+			bio_set_dev(&n->bio, ca->disk_sb.bdev);
+			submit_bio(&n->bio);
+		} else {
+			/* couldn't get a ref on the device: fail this replica */
+			n->bio.bi_status	= BLK_STS_REMOVED;
+			bio_endio(&n->bio);
+		}
+	}
+}
+
+static void __bch2_write(struct closure *);
+
+/*
+ * Final stage of a write op (closure callback).
+ *
+ * For BCH_WRITE_FLUSH writes without an earlier error, picks up any journal
+ * error.  Then releases the disk reservation (unless
+ * BCH_WRITE_NOPUT_RESERVATION is set), the c->writes ref and the keylist,
+ * records the write time, calls op->end_io if set, and finishes the closure.
+ */
+static void bch2_write_done(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	struct bch_fs *c = op->c;
+
+	if (!op->error && (op->flags & BCH_WRITE_FLUSH))
+		op->error = bch2_journal_error(&c->journal);
+
+	if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+		bch2_disk_reservation_put(c, &op->res);
+	percpu_ref_put(&c->writes);
+	bch2_keylist_free(&op->insert_keys, op->inline_keys);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+
+	if (op->end_io)
+		op->end_io(op);
+	if (cl->parent)
+		closure_return(cl);
+	else
+		closure_debug_destroy(cl);
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op:	write operation whose insert_keys describe the data just written
+ *
+ * Drops from every key the pointers to devices recorded in op->failed; a key
+ * left with no pointers fails the whole op with -EIO and empties the
+ * keylist.  The remaining keys are passed to bch2_rebalance_add_key() and
+ * then to op->index_update_fn().  Finally the op's open buckets are
+ * released, after flagging those on failed devices.
+ *
+ * Errors are reported through op->error.
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct keylist *keys = &op->insert_keys;
+	struct bch_extent_ptr *ptr;
+	struct bkey_i *src, *dst = keys->keys, *n, *k;
+	unsigned dev;
+	int ret;
+
+	/* compact the keylist in place, dropping pointers to failed devices */
+	for (src = keys->keys; src != keys->top; src = n) {
+		n = bkey_next(src);
+		bkey_copy(dst, src);
+
+		bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
+			test_bit(ptr->dev, op->failed.d));
+
+		if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
+			ret = -EIO;
+			goto err;
+		}
+
+		dst = bkey_next(dst);
+	}
+
+	keys->top = dst;
+
+	/*
+	 * probably not the ideal place to hook this in, but I don't
+	 * particularly want to plumb io_opts all the way through the btree
+	 * update stack right now
+	 */
+	for_each_keylist_key(keys, k)
+		bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
+
+	if (!bch2_keylist_empty(keys)) {
+		u64 sectors_start = keylist_sectors(keys);
+
+		ret = op->index_update_fn(op);
+
+		BUG_ON(ret == -EINTR);
+		BUG_ON(keylist_sectors(keys) && !ret);
+
+		/* whatever left the keylist has been written to the index */
+		op->written += sectors_start - keylist_sectors(keys);
+
+		if (ret) {
+			__bcache_io_error(c, "btree IO error %i", ret);
+			op->error = ret;
+		}
+	}
+out:
+	/* If a bucket wasn't written, we can't erasure code it: */
+	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+		bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
+	bch2_open_buckets_put(c, &op->open_buckets);
+	return;
+err:
+	keys->top = keys->keys;
+	op->error = ret;
+	goto out;
+}
+
+/*
+ * Closure callback: run the index update for @cl's write op, then continue
+ * to bch2_write_done().
+ *
+ * For an error-free BCH_WRITE_FLUSH write, an asynchronous journal flush of
+ * the op's journal sequence number is started on the closure first and
+ * bch2_write_done() is scheduled on index_update_wq(op).
+ */
+static void bch2_write_index(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	struct bch_fs *c = op->c;
+
+	__bch2_write_index(op);
+
+	if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
+		bch2_journal_flush_seq_async(&c->journal,
+					     *op_journal_seq(op),
+					     cl);
+		continue_at(cl, bch2_write_done, index_update_wq(op));
+	} else {
+		continue_at_nobarrier(cl, bch2_write_done, NULL);
+	}
+}
+
+/*
+ * bi_end_io handler for data write bios.
+ *
+ * Marks the device in op->failed on an IO error, accounts latency and drops
+ * the device IO ref if one was taken, frees bounce pages and the bio as
+ * flagged on the wbio.  Then: a split bio completes its parent; otherwise
+ * the closure ref is dropped, or - when BCH_WRITE_SKIP_CLOSURE_PUT is set -
+ * the op continues straight to bch2_write_index().
+ */
+static void bch2_write_endio(struct bio *bio)
+{
+	struct closure *cl		= bio->bi_private;
+	struct bch_write_op *op		= container_of(cl, struct bch_write_op, cl);
+	struct bch_write_bio *wbio	= to_wbio(bio);
+	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
+	struct bch_fs *c		= wbio->c;
+	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
+		set_bit(wbio->dev, op->failed.d);
+
+	if (wbio->have_ioref) {
+		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+		percpu_ref_put(&ca->io_ref);
+	}
+
+	if (wbio->bounce)
+		bch2_bio_free_pages_pool(c, bio);
+
+	if (wbio->put_bio)
+		bio_put(bio);
+
+	if (parent)
+		bio_endio(&parent->bio);
+	else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
+		closure_put(cl);
+	else
+		continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
+}
+
+/*
+ * Append one extent key to op->insert_keys for data about to be written to
+ * write point @wp.
+ * @op:		write operation; op->pos is advanced by crc.uncompressed_size
+ * @wp:		write point the space comes from
+ * @version:	version stored in the key
+ * @crc:	checksum/compression info; appended as a crc entry only if a
+ *		checksum type, compression type or nonce is set
+ *
+ * One pointer is added per open bucket of @wp, at the bucket's current write
+ * offset, and crc.compressed_size sectors are consumed from the write point
+ * and from each open bucket.
+ */
+static void init_append_extent(struct bch_write_op *op,
+			       struct write_point *wp,
+			       struct bversion version,
+			       struct bch_extent_crc_unpacked crc)
+{
+	struct bch_fs *c = op->c;
+	struct bkey_i_extent *e;
+	struct open_bucket *ob;
+	unsigned i;
+
+	BUG_ON(crc.compressed_size > wp->sectors_free);
+	wp->sectors_free -= crc.compressed_size;
+	op->pos.offset += crc.uncompressed_size;
+
+	/* the key's position is the end of the extent, hence the advance above */
+	e = bkey_extent_init(op->insert_keys.top);
+	e->k.p		= op->pos;
+	e->k.size	= crc.uncompressed_size;
+	e->k.version	= version;
+
+	if (crc.csum_type ||
+	    crc.compression_type ||
+	    crc.nonce)
+		bch2_extent_crc_append(&e->k_i, crc);
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+		union bch_extent_entry *end =
+			bkey_val_end(bkey_i_to_s(&e->k_i));
+
+		end->ptr = ob->ptr;
+		end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+		end->ptr.cached = !ca->mi.durability ||
+			(op->flags & BCH_WRITE_CACHED) != 0;
+		/* offset within the bucket = sectors already used in it */
+		end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
+
+		e->k.u64s++;
+
+		BUG_ON(crc.compressed_size > ob->sectors_free);
+		ob->sectors_free -= crc.compressed_size;
+	}
+
+	bch2_keylist_push(&op->insert_keys);
+}
+
+/*
+ * Allocate the destination bio for a bounced write.
+ * @c:			filesystem
+ * @wp:			write point; limits the size to wp->sectors_free
+ * @src:		source bio; limits the size and supplies bi_opf
+ * @page_alloc_failed:	set when the extra pages beyond the mempool portion
+ *			could not all be allocated (left untouched otherwise)
+ * @buf:		if non-NULL, buffer to map into the bio instead of
+ *			allocating bounce pages
+ *
+ * The bio comes from c->bio_write and is marked put_bio.  Without @buf it is
+ * also marked bounce and filled with pages: up to c->sb.encoded_extent_max
+ * sectors from the bounce pool, the remainder via bch2_bio_alloc_pages().
+ */
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+					struct write_point *wp,
+					struct bio *src,
+					bool *page_alloc_failed,
+					void *buf)
+{
+	struct bch_write_bio *wbio;
+	struct bio *bio;
+	unsigned output_available =
+		min(wp->sectors_free << 9, src->bi_iter.bi_size);
+	/* when mapping @buf, account for its offset within the first page */
+	unsigned pages = DIV_ROUND_UP(output_available +
+				      (buf
+				       ? ((unsigned long) buf & (PAGE_SIZE - 1))
+				       : 0), PAGE_SIZE);
+
+	bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
+	wbio			= wbio_init(bio);
+	wbio->put_bio		= true;
+	/* copy WRITE_SYNC flag */
+	wbio->bio.bi_opf	= src->bi_opf;
+
+	if (buf) {
+		bch2_bio_map(bio, buf, output_available);
+		return bio;
+	}
+
+	wbio->bounce		= true;
+
+	/*
+	 * We can't use mempool for more than c->sb.encoded_extent_max
+	 * worth of pages, but we'd like to allocate more if we can:
+	 */
+	bch2_bio_alloc_pages_pool(c, bio,
+				  min_t(unsigned, output_available,
+					c->sb.encoded_extent_max << 9));
+
+	if (bio->bi_iter.bi_size < output_available)
+		*page_alloc_failed =
+			bch2_bio_alloc_pages(bio,
+					     output_available -
+					     bio->bi_iter.bi_size,
+					     GFP_NOFS) != 0;
+
+	return bio;
+}
+
+/*
+ * Recompute the checksum of @op's already-encoded data so that it covers
+ * only the live range, then narrow the bio to that range.
+ * @c:			filesystem
+ * @op:			write op; op->crc is replaced on success
+ * @new_csum_type:	requested checksum type; ignored in favour of the
+ *			existing type if switching would require encrypting or
+ *			decrypting
+ *
+ * Returns 0 on success or the error from bch2_rechecksum_bio().
+ */
+static int bch2_write_rechecksum(struct bch_fs *c,
+				 struct bch_write_op *op,
+				 unsigned new_csum_type)
+{
+	struct bio *bio = &op->wbio.bio;
+	struct bch_extent_crc_unpacked new_crc;
+	int ret;
+
+	/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+	if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+	    bch2_csum_type_is_encryption(new_csum_type))
+		new_csum_type = op->crc.csum_type;
+
+	ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+				  NULL, &new_crc,
+				  op->crc.offset, op->crc.live_size,
+				  new_csum_type);
+	if (ret)
+		return ret;
+
+	/* skip the dead data in front and trim the bio to the live size */
+	bio_advance(bio, op->crc.offset << 9);
+	bio->bi_iter.bi_size = op->crc.live_size << 9;
+	op->crc = new_crc;
+	return 0;
+}
+
+/*
+ * Decrypt @op's data in place if its current checksum type is an encryption
+ * type; otherwise do nothing.
+ *
+ * The existing checksum is verified first.  On success the op's crc is left
+ * with no checksum type and a zeroed checksum.
+ *
+ * Returns 0 on success (or when there was nothing to do), -EIO if the
+ * existing checksum does not match.
+ */
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct nonce nonce = extent_nonce(op->version, op->crc);
+	struct bch_csum csum;
+
+	if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+		return 0;
+
+	/*
+	 * If we need to decrypt data in the write path, we'll no longer be able
+	 * to verify the existing checksum (poly1305 mac, in this case) after
+	 * it's decrypted - this is the last point we'll be able to reverify the
+	 * checksum:
+	 */
+	csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+	if (bch2_crc_cmp(op->crc.csum, csum))
+		return -EIO;
+
+	/* NOTE(review): bch2_encrypt_bio() is used here to decrypt - presumably the cipher is symmetric; confirm */
+	bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+	op->crc.csum_type = 0;
+	op->crc.csum = (struct bch_csum) { 0, 0 };
+	return 0;
+}
+
+/*
+ * Prepare data that arrives already encoded (BCH_WRITE_DATA_ENCODED) for
+ * being written through write point @wp.
+ *
+ * Return values:
+ *  PREP_ENCODED_OK		- not encoded data, or the data has been
+ *				  brought into a state the normal write path
+ *				  can handle
+ *  PREP_ENCODED_DO_WRITE	- the whole extent can be written as is
+ *  PREP_ENCODED_CHECKSUM_ERR	- rechecksumming, decrypting or verifying the
+ *				  existing checksum failed
+ *  PREP_ENCODED_ERR		- in-place decompression failed
+ */
+static enum prep_encoded_ret {
+	PREP_ENCODED_OK,
+	PREP_ENCODED_ERR,
+	PREP_ENCODED_CHECKSUM_ERR,
+	PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+	struct bch_fs *c = op->c;
+	struct bio *bio = &op->wbio.bio;
+
+	if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+		return PREP_ENCODED_OK;
+
+	BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
+
+	/* Can we just write the entire extent as is? */
+	if (op->crc.uncompressed_size == op->crc.live_size &&
+	    op->crc.compressed_size <= wp->sectors_free &&
+	    op->crc.compression_type == op->compression_type) {
+		/* uncompressed data may still need its checksum type changed */
+		if (!op->crc.compression_type &&
+		    op->csum_type != op->crc.csum_type &&
+		    bch2_write_rechecksum(c, op, op->csum_type))
+			return PREP_ENCODED_CHECKSUM_ERR;
+
+		return PREP_ENCODED_DO_WRITE;
+	}
+
+	/*
+	 * If the data is compressed and we couldn't write the entire extent as
+	 * is, we have to decompress it:
+	 */
+	if (op->crc.compression_type) {
+		struct bch_csum csum;
+
+		if (bch2_write_decrypt(op))
+			return PREP_ENCODED_CHECKSUM_ERR;
+
+		/* Last point we can still verify checksum: */
+		csum = bch2_checksum_bio(c, op->crc.csum_type,
+					 extent_nonce(op->version, op->crc),
+					 bio);
+		if (bch2_crc_cmp(op->crc.csum, csum))
+			return PREP_ENCODED_CHECKSUM_ERR;
+
+		if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+			return PREP_ENCODED_ERR;
+	}
+
+	/*
+	 * No longer have compressed data after this point - data might be
+	 * encrypted:
+	 */
+
+	/*
+	 * If the data is checksummed and we're only writing a subset,
+	 * rechecksum and adjust bio to point to currently live data:
+	 */
+	if ((op->crc.live_size != op->crc.uncompressed_size ||
+	     op->crc.csum_type != op->csum_type) &&
+	    bch2_write_rechecksum(c, op, op->csum_type))
+		return PREP_ENCODED_CHECKSUM_ERR;
+
+	/*
+	 * If we want to compress the data, it has to be decrypted:
+	 */
+	if ((op->compression_type ||
+	     bch2_csum_type_is_encryption(op->crc.csum_type) !=
+	     bch2_csum_type_is_encryption(op->csum_type)) &&
+	    bch2_write_decrypt(op))
+		return PREP_ENCODED_CHECKSUM_ERR;
+
+	return PREP_ENCODED_OK;
+}
+
+/*
+ * Encode as much of @op's remaining data as fits in write point @wp and
+ * append the matching extent keys to op->insert_keys.
+ * @op:	write operation; op->wbio.bio is the source and is advanced past
+ *		the data consumed
+ * @wp:	write point providing the space
+ * @_dst:	on success, set to the bio that should be submitted
+ *
+ * Data is bounced into a newly allocated bio when an erasure coding buffer
+ * is in use, when compressing, when checksumming pages that are not stable,
+ * or when encrypting pages the op does not own; otherwise the source bio is
+ * written directly (split off if only part of it was consumed).
+ *
+ * Returns 1 if source data remains to be written, 0 if the source was fully
+ * consumed, or -EIO on a checksum or decompression error.
+ */
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+			     struct bio **_dst)
+{
+	struct bch_fs *c = op->c;
+	struct bio *src = &op->wbio.bio, *dst = src;
+	struct bvec_iter saved_iter;
+	void *ec_buf;
+	struct bpos ec_pos = op->pos;
+	unsigned total_output = 0, total_input = 0;
+	bool bounce = false;
+	bool page_alloc_failed = false;
+	int ret, more = 0;
+
+	BUG_ON(!bio_sectors(src));
+
+	ec_buf = bch2_writepoint_ec_buf(c, wp);
+
+	switch (bch2_write_prep_encoded_data(op, wp)) {
+	case PREP_ENCODED_OK:
+		break;
+	case PREP_ENCODED_ERR:
+		ret = -EIO;
+		goto err;
+	case PREP_ENCODED_CHECKSUM_ERR:
+		goto csum_err;
+	case PREP_ENCODED_DO_WRITE:
+		/* XXX look for bug here */
+		if (ec_buf) {
+			dst = bch2_write_bio_alloc(c, wp, src,
+						   &page_alloc_failed,
+						   ec_buf);
+			bio_copy_data(dst, src);
+			bounce = true;
+		}
+		init_append_extent(op, wp, op->version, op->crc);
+		goto do_write;
+	}
+
+	if (ec_buf ||
+	    op->compression_type ||
+	    (op->csum_type &&
+	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+	    (bch2_csum_type_is_encryption(op->csum_type) &&
+	     !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+		dst = bch2_write_bio_alloc(c, wp, src,
+					   &page_alloc_failed,
+					   ec_buf);
+		bounce = true;
+	}
+
+	/* dst is advanced inside the loop; restored from this afterwards */
+	saved_iter = dst->bi_iter;
+
+	/* each iteration produces one extent key */
+	do {
+		struct bch_extent_crc_unpacked crc =
+			(struct bch_extent_crc_unpacked) { 0 };
+		struct bversion version = op->version;
+		size_t dst_len, src_len;
+
+		if (page_alloc_failed &&
+		    bio_sectors(dst) < wp->sectors_free &&
+		    bio_sectors(dst) < c->sb.encoded_extent_max)
+			break;
+
+		BUG_ON(op->compression_type &&
+		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
+		       bch2_csum_type_is_encryption(op->crc.csum_type));
+		BUG_ON(op->compression_type && !bounce);
+
+		crc.compression_type = op->compression_type
+			?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+					     op->compression_type)
+			: 0;
+		/* not compressed: copy (if bouncing) as much as fits */
+		if (!crc.compression_type) {
+			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+			dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+			if (op->csum_type)
+				dst_len = min_t(unsigned, dst_len,
+						c->sb.encoded_extent_max << 9);
+
+			if (bounce) {
+				/* temporarily limit dst to dst_len for the copy */
+				swap(dst->bi_iter.bi_size, dst_len);
+				bio_copy_data(dst, src);
+				swap(dst->bi_iter.bi_size, dst_len);
+			}
+
+			src_len = dst_len;
+		}
+
+		BUG_ON(!src_len || !dst_len);
+
+		if (bch2_csum_type_is_encryption(op->csum_type)) {
+			if (bversion_zero(version)) {
+				version.lo = atomic64_inc_return(&c->key_version) + 1;
+			} else {
+				crc.nonce = op->nonce;
+				op->nonce += src_len >> 9;
+			}
+		}
+
+		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+		    !crc.compression_type &&
+		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
+		    bch2_csum_type_is_encryption(op->csum_type)) {
+			/*
+			 * Note: when we're using rechecksum(), we need to be
+			 * checksumming @src because it has all the data our
+			 * existing checksum covers - if we bounced (because we
+			 * were trying to compress), @dst will only have the
+			 * part of the data the new checksum will cover.
+			 *
+			 * But normally we want to be checksumming post bounce,
+			 * because part of the reason for bouncing is so the
+			 * data can't be modified (by userspace) while it's in
+			 * flight.
+			 */
+			if (bch2_rechecksum_bio(c, src, version, op->crc,
+					&crc, &op->crc,
+					src_len >> 9,
+					bio_sectors(src) - (src_len >> 9),
+					op->csum_type))
+				goto csum_err;
+		} else {
+			if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+			    bch2_rechecksum_bio(c, src, version, op->crc,
+					NULL, &op->crc,
+					src_len >> 9,
+					bio_sectors(src) - (src_len >> 9),
+					op->crc.csum_type))
+				goto csum_err;
+
+			crc.compressed_size	= dst_len >> 9;
+			crc.uncompressed_size	= src_len >> 9;
+			crc.live_size		= src_len >> 9;
+
+			/* encrypt and checksum only the dst_len bytes of this extent */
+			swap(dst->bi_iter.bi_size, dst_len);
+			bch2_encrypt_bio(c, op->csum_type,
+					 extent_nonce(version, crc), dst);
+			crc.csum = bch2_checksum_bio(c, op->csum_type,
+					 extent_nonce(version, crc), dst);
+			crc.csum_type = op->csum_type;
+			swap(dst->bi_iter.bi_size, dst_len);
+		}
+
+		init_append_extent(op, wp, version, crc);
+
+		if (dst != src)
+			bio_advance(dst, dst_len);
+		bio_advance(src, src_len);
+		total_output	+= dst_len;
+		total_input	+= src_len;
+	} while (dst->bi_iter.bi_size &&
+		 src->bi_iter.bi_size &&
+		 wp->sectors_free &&
+		 !bch2_keylist_realloc(&op->insert_keys,
+				      op->inline_keys,
+				      ARRAY_SIZE(op->inline_keys),
+				      BKEY_EXTENT_U64s_MAX));
+
+	more = src->bi_iter.bi_size != 0;
+
+	dst->bi_iter = saved_iter;
+
+	/* writing straight from src but not all of it: split off what we consumed */
+	if (dst == src && more) {
+		BUG_ON(total_output != total_input);
+
+		dst = bio_split(src, total_input >> 9,
+				GFP_NOIO, &c->bio_write);
+		wbio_init(dst)->put_bio	= true;
+		/* copy WRITE_SYNC flag */
+		dst->bi_opf		= src->bi_opf;
+	}
+
+	dst->bi_iter.bi_size = total_output;
+do_write:
+	/* might have done a realloc... */
+	bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
+
+	*_dst = dst;
+	return more;
+csum_err:
+	bch_err(c, "error verifying existing checksum while "
+		"rewriting existing data (memory corruption?)");
+	ret = -EIO;
+err:
+	if (to_wbio(dst)->bounce)
+		bch2_bio_free_pages_pool(c, dst);
+	if (to_wbio(dst)->put_bio)
+		bio_put(dst);
+
+	return ret;
+}
+
+/*
+ * Main loop of a write op (closure callback).
+ *
+ * Repeatedly allocates space from a write point, has bch2_write_extent()
+ * encode the next chunk, and submits the resulting bio to every replica,
+ * until the source data is used up.
+ *
+ * When no more open buckets or keylist space are available, or the
+ * allocator returns -EAGAIN, the code waits for outstanding IO
+ * (closure_sync), runs the index update for the keys gathered so far and
+ * starts over.
+ *
+ * If the first extent consumes the whole write, the closure ref is not taken
+ * for the bio; BCH_WRITE_SKIP_CLOSURE_PUT is set instead and the endio
+ * handler continues to bch2_write_index() directly.
+ */
+static void __bch2_write(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	struct bch_fs *c = op->c;
+	struct write_point *wp;
+	struct bio *bio;
+	bool skip_put = true;
+	int ret;
+again:
+	memset(&op->failed, 0, sizeof(op->failed));
+
+	do {
+		struct bkey_i *key_to_write;
+		/* remembered as an offset: the keylist may be reallocated below */
+		unsigned key_to_write_offset = op->insert_keys.top_p -
+			op->insert_keys.keys_p;
+
+		/* +1 for possible cache device: */
+		if (op->open_buckets.nr + op->nr_replicas + 1 >
+		    ARRAY_SIZE(op->open_buckets.v))
+			goto flush_io;
+
+		if (bch2_keylist_realloc(&op->insert_keys,
+					op->inline_keys,
+					ARRAY_SIZE(op->inline_keys),
+					BKEY_EXTENT_U64s_MAX))
+			goto flush_io;
+
+		wp = bch2_alloc_sectors_start(c,
+			op->target,
+			op->opts.erasure_code,
+			op->write_point,
+			&op->devs_have,
+			op->nr_replicas,
+			op->nr_replicas_required,
+			op->alloc_reserve,
+			op->flags,
+			(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+		EBUG_ON(!wp);
+
+		if (unlikely(IS_ERR(wp))) {
+			if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+				ret = PTR_ERR(wp);
+				goto err;
+			}
+
+			goto flush_io;
+		}
+
+		bch2_open_bucket_get(c, wp, &op->open_buckets);
+		ret = bch2_write_extent(op, wp, &bio);
+		bch2_alloc_sectors_done(c, wp);
+
+		if (ret < 0)
+			goto err;
+
+		/* ret > 0: more data to come, so more than one bio in flight */
+		if (ret)
+			skip_put = false;
+
+		bio->bi_end_io	= bch2_write_endio;
+		bio->bi_private	= &op->cl;
+		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+		if (!skip_put)
+			closure_get(bio->bi_private);
+		else
+			op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+
+		key_to_write = (void *) (op->insert_keys.keys_p +
+					 key_to_write_offset);
+
+		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+					  key_to_write);
+	} while (ret);
+
+	if (!skip_put)
+		continue_at(cl, bch2_write_index, index_update_wq(op));
+	return;
+err:
+	op->error = ret;
+
+	continue_at(cl, bch2_write_index, index_update_wq(op));
+	return;
+flush_io:
+	/* wait for the bios submitted so far before updating the index */
+	closure_sync(cl);
+
+	if (!bch2_keylist_empty(&op->insert_keys)) {
+		__bch2_write_index(op);
+
+		if (op->error) {
+			continue_at_nobarrier(cl, bch2_write_done, NULL);
+			return;
+		}
+	}
+
+	goto again;
+}
+
+/**
+ * bch2_write - handle a write to a cache device or flash only volume
+ * @cl:	closure embedded in the struct bch_write_op describing the write
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ *
+ * NOTE(review): the references to bch_journal and op->discard above look
+ * inherited from bcache; nothing in this function's body uses them - confirm
+ * and trim.
+ *
+ * On a misaligned bio or a read-only filesystem, op->error is set (-EIO or
+ * -EROFS respectively), the disk reservation is dropped unless
+ * BCH_WRITE_NOPUT_RESERVATION is set, and the closure returns without any IO.
+ */
+void bch2_write(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	struct bio *bio = &op->wbio.bio;
+	struct bch_fs *c = op->c;
+
+	BUG_ON(!op->nr_replicas);
+	BUG_ON(!op->write_point.v);
+	BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+
+	/* the mask test assumes block_size is a power of two, in sectors */
+	if (bio_sectors(bio) & (c->opts.block_size - 1)) {
+		__bcache_io_error(c, "misaligned write");
+		op->error = -EIO;
+		goto err;
+	}
+
+	op->start_time = local_clock();
+
+	bch2_keylist_init(&op->insert_keys, op->inline_keys);
+	wbio_init(bio)->put_bio = false;
+
+	/* the write holds a ref on c->writes only if the tryget succeeds */
+	if (c->opts.nochanges ||
+	    !percpu_ref_tryget(&c->writes)) {
+		__bcache_io_error(c, "read only");
+		op->error = -EROFS;
+		goto err;
+	}
+
+	bch2_increment_clock(c, bio_sectors(bio), WRITE);
+
+	continue_at_nobarrier(cl, __bch2_write, NULL);
+	return;
+err:
+	if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+		bch2_disk_reservation_put(c, &op->res);
+	closure_return(cl);
+}
+
+/* Cache promotion on read */
+
+/*
+ * State for one in-flight promote.  Allocated by __promote_alloc() with room
+ * for a trailing bio_vec array, and tracked in c->promote_table keyed by @pos.
+ */
+struct promote_op {
+	struct closure		cl;	/* parent closure of the promote write */
+	struct rcu_head		rcu;	/* used by kfree_rcu() in promote_free() */
+	u64			start_time; /* local_clock() at allocation */
+
+	struct rhash_head	hash;	/* linkage in c->promote_table */
+	struct bpos		pos;	/* hash key */
+
+	struct migrate_write	write;
+	/*
+	 * NOTE(review): presumably backs the inline vecs of the bio embedded
+	 * at the end of @write - confirm against struct migrate_write layout.
+	 */
+	struct bio_vec		bi_inline_vecs[0]; /* must be last */
+};
+
+/* Hash table layout for c->promote_table: promote_ops keyed by their bpos. */
+static const struct rhashtable_params bch_promote_params = {
+	.head_offset	= offsetof(struct promote_op, hash),
+	.key_offset	= offsetof(struct promote_op, pos),
+	.key_len	= sizeof(struct bpos),
+};
+
+/*
+ * Decide whether a read of @k at @pos should also start a promote.
+ *
+ * Returns false unless BCH_READ_MAY_PROMOTE is set and a promote target is
+ * configured; also false when bch2_bkey_has_target() or
+ * bch2_target_congested() report true for that target, or when an entry for
+ * @pos is already present in c->promote_table.
+ */
+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
+				  struct bpos pos,
+				  struct bch_io_opts opts,
+				  unsigned flags)
+{
+	bool requested = (flags & BCH_READ_MAY_PROMOTE) &&
+		opts.promote_target;
+
+	if (!requested ||
+	    bch2_bkey_has_target(c, k, opts.promote_target))
+		return false;
+
+	/* XXX trace this */
+	if (bch2_target_congested(c, opts.promote_target))
+		return false;
+
+	return !rhashtable_lookup_fast(&c->promote_table, &pos,
+				       bch_promote_params);
+}
+
+/*
+ * Tear down a promote_op: unhash it from c->promote_table (it must be
+ * present), drop the c->writes ref and free the op after an RCU grace period.
+ */
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+	int err = rhashtable_remove_fast(&c->promote_table, &op->hash,
+					 bch_promote_params);
+
+	BUG_ON(err);
+
+	percpu_ref_put(&c->writes);
+	kfree_rcu(op, rcu);
+}
+
+/*
+ * Closure destructor for a promote: account the elapsed time, give the write
+ * bio's pages back to the pool and free the promote_op.
+ */
+static void promote_done(struct closure *cl)
+{
+	struct promote_op *promote = container_of(cl, struct promote_op, cl);
+	struct bch_write_op *wop = &promote->write.op;
+	struct bch_fs *c = wop->c;
+
+	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+			       promote->start_time);
+
+	bch2_bio_free_pages_pool(c, &wop->wbio.bio);
+	promote_free(c, promote);
+}
+
+/*
+ * Kick off the write half of a promote once the read into @rbio's bounce
+ * pages has completed: the write bio takes over @rbio's bio_vecs, and
+ * bch2_write() is then run on c->wq under op->cl, whose destructor is
+ * promote_done().
+ */
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+	struct bch_fs *c = rbio->c;
+	struct closure *cl = &op->cl;
+	struct bio *bio = &op->write.op.wbio.bio;
+
+	trace_promote(&rbio->bio);
+
+	/* we now own pages: */
+	BUG_ON(!rbio->bounce);
+	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+	/*
+	 * After the swap rbio holds the write bio's previous vec count.
+	 * NOTE(review): presumably 0, so freeing the rbio later releases no
+	 * pages - confirm.
+	 */
+	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+
+	bch2_migrate_read_done(&op->write, rbio);
+
+	closure_init(cl, NULL);
+	closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+	closure_return_with_destructor(cl, promote_done);
+}
+
+/*
+ * Allocate everything a promote needs: a promote_op, a kmalloc'd bounce read
+ * bio (returned through @rbio) with pages for @sectors of data, and an
+ * initialized migrate write targeting opts.promote_target.
+ *
+ * Holds a ref on c->writes for the lifetime of the returned op.  Returns NULL
+ * and leaves *@rbio NULL if the ref could not be taken, an allocation failed,
+ * or the op could not be inserted into c->promote_table; everything acquired
+ * so far is released on those paths.
+ *
+ * @pick is not used by this function.
+ */
+noinline
+static struct promote_op *__promote_alloc(struct bch_fs *c,
+					  enum btree_id btree_id,
+					  struct bpos pos,
+					  struct extent_ptr_decoded *pick,
+					  struct bch_io_opts opts,
+					  unsigned sectors,
+					  struct bch_read_bio **rbio)
+{
+	struct promote_op *op = NULL;
+	struct bio *bio;
+	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+	int ret;
+
+	if (!percpu_ref_tryget(&c->writes))
+		return NULL;
+
+	/* the trailing bio_vec array is sized for @pages vecs */
+	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+	if (!op)
+		goto err;
+
+	op->start_time = local_clock();
+	op->pos = pos;
+
+	/*
+	 * We don't use the mempool here because extents that aren't
+	 * checksummed or compressed can be too big for the mempool:
+	 */
+	*rbio = kzalloc(sizeof(struct bch_read_bio) +
+			sizeof(struct bio_vec) * pages,
+			GFP_NOIO);
+	if (!*rbio)
+		goto err;
+
+	rbio_init(&(*rbio)->bio, opts);
+	bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
+
+	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+				 GFP_NOIO))
+		goto err;
+
+	/* kmalloc tells bch2_rbio_free() to kfree() rather than bio_put() */
+	(*rbio)->bounce		= true;
+	(*rbio)->split		= true;
+	(*rbio)->kmalloc	= true;
+
+	/* failure here includes a promote for the same pos already existing */
+	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+					  bch_promote_params))
+		goto err;
+
+	bio = &op->write.op.wbio.bio;
+	bio_init(bio, bio->bi_inline_vecs, pages);
+
+	ret = bch2_migrate_write_init(c, &op->write,
+			writepoint_hashed((unsigned long) current),
+			opts,
+			DATA_PROMOTE,
+			(struct data_opts) {
+				.target = opts.promote_target
+			},
+			btree_id,
+			bkey_s_c_null);
+	BUG_ON(ret);
+
+	return op;
+err:
+	/* *rbio is either NULL or a fully bio_init()ed bio at this point */
+	if (*rbio)
+		bio_free_pages(&(*rbio)->bio);
+	kfree(*rbio);
+	*rbio = NULL;
+	kfree(op);
+	percpu_ref_put(&c->writes);
+	return NULL;
+}
+
+/*
+ * Set up a promote for the read described by @iter/@k, if should_promote()
+ * allows one.
+ *
+ * When the whole extent is promoted (caller already reads it in full, or
+ * c->promote_whole_extents is set) the promote covers the extent from its
+ * start; otherwise it covers just the range being read.
+ *
+ * On success *@rbio holds the bounce bio to read into, *@bounce is set and
+ * *@read_full reflects whether the full extent must be read.  Returns NULL,
+ * leaving @bounce and @read_full untouched, when no promote was set up.
+ */
+static inline struct promote_op *promote_alloc(struct bch_fs *c,
+					       struct bvec_iter iter,
+					       struct bkey_s_c k,
+					       struct extent_ptr_decoded *pick,
+					       struct bch_io_opts opts,
+					       unsigned flags,
+					       struct bch_read_bio **rbio,
+					       bool *bounce,
+					       bool *read_full)
+{
+	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+	struct promote_op *promote;
+	enum btree_id btree_id;
+	struct bpos pos;
+	unsigned sectors;
+
+	if (promote_full) {
+		/* data might have to be decompressed in the write path: */
+		sectors	= max(pick->crc.compressed_size, pick->crc.live_size);
+		pos	= bkey_start_pos(k.k);
+	} else {
+		sectors	= bvec_iter_sectors(iter);
+		pos	= POS(k.k->p.inode, iter.bi_sector);
+	}
+
+	if (!should_promote(c, k, pos, opts, flags))
+		return NULL;
+
+	if (k.k->type == KEY_TYPE_reflink_v)
+		btree_id = BTREE_ID_REFLINK;
+	else
+		btree_id = BTREE_ID_EXTENTS;
+
+	promote = __promote_alloc(c, btree_id, pos, pick, opts, sectors, rbio);
+	if (!promote)
+		return NULL;
+
+	*bounce		= true;
+	*read_full	= promote_full;
+	return promote;
+}
+
+/* Read */
+
+/*
+ * Values stored in rbio->retry by bch2_rbio_error() and, in
+ * BCH_READ_IN_RETRY mode, returned from __bch2_read_extent():
+ */
+/* retry, after marking the pointer that was picked as failed */
+#define READ_RETRY_AVOID	1
+/* retry the read */
+#define READ_RETRY		2
+/* give up; the error is reported to the parent bio */
+#define READ_ERR		3
+
+/*
+ * Ordered execution contexts for read completion: bch2_rbio_punt() runs the
+ * work inline when the requested context is <= rbio->context, and otherwise
+ * queues it on the supplied workqueue.
+ */
+enum rbio_context {
+	RBIO_CONTEXT_NULL,
+	RBIO_CONTEXT_HIGHPRI,
+	RBIO_CONTEXT_UNBOUND,
+};
+
+/* Return the rbio a split was made from, or @rbio itself if it is not a split. */
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+	if (rbio->split)
+		return rbio->parent;
+
+	return rbio;
+}
+
+/*
+ * Run @fn on @rbio's work item in at least @context: called directly if the
+ * rbio is already in that context (or a higher one), otherwise queued on @wq
+ * with rbio->context updated to @context.
+ */
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+			   enum rbio_context context,
+			   struct workqueue_struct *wq)
+{
+	if (context > rbio->context) {
+		rbio->work.func		= fn;
+		rbio->context		= context;
+		queue_work(wq, &rbio->work);
+		return;
+	}
+
+	fn(&rbio->work);
+}
+
+/*
+ * Release the resources attached to @rbio: a pending promote, bounce pages,
+ * and - for a split - the rbio itself.  Returns the parent when @rbio was a
+ * split, otherwise @rbio.
+ */
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+	struct bch_read_bio *parent;
+
+	BUG_ON(rbio->bounce && !rbio->split);
+
+	if (rbio->promote)
+		promote_free(rbio->c, rbio->promote);
+	rbio->promote = NULL;
+
+	if (rbio->bounce)
+		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+	if (!rbio->split)
+		return rbio;
+
+	/* grab the parent before the split goes away */
+	parent = rbio->parent;
+
+	if (rbio->kmalloc)
+		kfree(rbio);
+	else
+		bio_put(&rbio->bio);
+
+	return parent;
+}
+
+/*
+ * Complete an entire read request.  Must only be called on a top level
+ * bch_read_bio, never on a split.  Read latency is accounted when the rbio
+ * carries a start time.
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+	struct bio *bio = &rbio->bio;
+
+	if (rbio->start_time)
+		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+				       rbio->start_time);
+
+	bio_endio(bio);
+}
+
+/*
+ * Retry a BCH_READ_NODECODE read: look the extent up again at rbio->pos and,
+ * if it still contains the pointer that was originally picked, reissue the
+ * read through __bch2_read_extent().  If the extent no longer matches,
+ * rbio->hole is set instead.  The rbio is always completed before returning.
+ *
+ * @inode is not used by this function.
+ */
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+				     struct bvec_iter bvec_iter, u64 inode,
+				     struct bch_io_failures *failed,
+				     unsigned flags)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	BKEY_PADDED(k) tmp;
+	struct bkey_s_c k;
+	int ret;
+
+	flags &= ~BCH_READ_LAST_FRAGMENT;
+	flags |= BCH_READ_MUST_CLONE;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   rbio->pos, BTREE_ITER_SLOTS);
+retry:
+	rbio->bio.bi_status = 0;
+
+	k = bch2_btree_iter_peek_slot(iter);
+	if (bkey_err(k))
+		goto err;
+
+	/* work from a private copy so btree locks can be dropped before IO */
+	bkey_reassemble(&tmp.k, k);
+	k = bkey_i_to_s_c(&tmp.k);
+	bch2_trans_unlock(&trans);
+
+	if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k),
+				   rbio->pick.ptr,
+				   rbio->pos.offset -
+				   rbio->pick.crc.offset)) {
+		/* extent we wanted to read no longer exists: */
+		rbio->hole = true;
+		goto out;
+	}
+
+	ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
+	if (ret == READ_RETRY)
+		goto retry;
+	if (ret)
+		goto err;
+out:
+	bch2_rbio_done(rbio);
+	bch2_trans_exit(&trans);
+	return;
+err:
+	rbio->bio.bi_status = BLK_STS_IOERR;
+	goto out;
+}
+
+/*
+ * Retry a (decoding) read: walk the extents of @inode starting at
+ * bvec_iter.bi_sector and reissue each fragment synchronously through
+ * __bch2_read_extent() until @bvec_iter is exhausted.
+ *
+ * READ_RETRY from a fragment restarts the walk with the current @bvec_iter;
+ * READ_ERR, or a btree error other than -EINTR, fails the bio with
+ * BLK_STS_IOERR.  The rbio is always completed before returning.
+ */
+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
+			    struct bvec_iter bvec_iter, u64 inode,
+			    struct bch_io_failures *failed, unsigned flags)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	flags &= ~BCH_READ_LAST_FRAGMENT;
+	flags |= BCH_READ_MUST_CLONE;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+			   POS(inode, bvec_iter.bi_sector),
+			   BTREE_ITER_SLOTS, k, ret) {
+		BKEY_PADDED(k) tmp;
+		unsigned bytes, sectors, offset_into_extent;
+
+		/* work from a private copy of the key */
+		bkey_reassemble(&tmp.k, k);
+		k = bkey_i_to_s_c(&tmp.k);
+
+		offset_into_extent = iter->pos.offset -
+			bkey_start_offset(k.k);
+		sectors = k.k->size - offset_into_extent;
+
+		ret = bch2_read_indirect_extent(&trans,
+					&offset_into_extent, &tmp.k);
+		if (ret)
+			break;
+
+		/* limited by both the original and the indirect extent */
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		bch2_trans_unlock(&trans);
+
+		/* temporarily shrink the iterator to this fragment */
+		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+		swap(bvec_iter.bi_size, bytes);
+
+		ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+				offset_into_extent, failed, flags);
+		switch (ret) {
+		case READ_RETRY:
+			goto retry;
+		case READ_ERR:
+			goto err;
+		}
+
+		/* the fragment covered everything that was left: done */
+		if (bytes == bvec_iter.bi_size)
+			goto out;
+
+		swap(bvec_iter.bi_size, bytes);
+		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+	}
+
+	if (ret == -EINTR)
+		goto retry;
+	/*
+	 * If we get here, it better have been because there was an error
+	 * reading a btree node
+	 */
+	BUG_ON(!ret);
+	__bcache_io_error(c, "btree IO error: %i", ret);
+err:
+	rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+	bch2_trans_exit(&trans);
+	bch2_rbio_done(rbio);
+}
+
+/*
+ * Work function that retries a failed read.  Everything needed from the
+ * failed rbio is copied out first, because bch2_rbio_free() may free it (when
+ * it is a split) and hand back the parent; the retry then runs on that
+ * returned rbio with BCH_READ_IN_RETRY set and promotion disabled.
+ */
+static void bch2_rbio_retry(struct work_struct *work)
+{
+	struct bch_read_bio *rbio =
+		container_of(work, struct bch_read_bio, work);
+	struct bch_fs *c	= rbio->c;
+	struct bvec_iter iter	= rbio->bvec_iter;
+	unsigned flags		= rbio->flags;
+	u64 inode		= rbio->pos.inode;
+	struct bch_io_failures failed = { .nr = 0 };
+
+	trace_read_retry(&rbio->bio);
+
+	/* record the pointer just tried as failed before retrying */
+	if (rbio->retry == READ_RETRY_AVOID)
+		bch2_mark_io_failure(&failed, &rbio->pick);
+
+	rbio->bio.bi_status = 0;
+
+	/* from here on rbio is whatever bch2_rbio_free() returned */
+	rbio = bch2_rbio_free(rbio);
+
+	flags |= BCH_READ_IN_RETRY;
+	flags &= ~BCH_READ_MAY_PROMOTE;
+
+	if (flags & BCH_READ_NODECODE)
+		bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
+	else
+		bch2_read_retry(c, rbio, iter, inode, &failed, flags);
+}
+
+/*
+ * Record a read failure on @rbio.  @retry is one of the READ_* codes.
+ *
+ * With BCH_READ_IN_RETRY set only rbio->retry is updated.  Otherwise READ_ERR
+ * frees the rbio and completes the returned (parent) rbio with @error, and
+ * any other code punts bch2_rbio_retry() to system_unbound_wq.
+ */
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+			    blk_status_t error)
+{
+	rbio->retry = retry;
+
+	if (rbio->flags & BCH_READ_IN_RETRY)
+		return;
+
+	if (retry != READ_ERR) {
+		bch2_rbio_punt(rbio, bch2_rbio_retry,
+			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+		return;
+	}
+
+	rbio = bch2_rbio_free(rbio);
+
+	rbio->bio.bi_status = error;
+	bch2_rbio_done(rbio);
+}
+
+/*
+ * After a successful read of an uncompressed extent, try to rewrite the
+ * extent's key with a checksum computed over only the range the key still
+ * covers.
+ *
+ * Gives up silently if the key found at rbio->pos has a different version,
+ * no longer contains the pointer that was read, extends outside the data that
+ * was read, if re-checksumming fails, or if bch2_bkey_narrow_crcs() reports
+ * nothing to do.  A commit result other than -EINTR is not acted on.
+ */
+static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+	struct bch_fs *c = rbio->c;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	BKEY_PADDED(k) new;
+	struct bch_extent_crc_unpacked new_crc;
+	/* offset within the file at which the checksummed region starts */
+	u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
+	int ret;
+
+	if (rbio->pick.crc.compression_type)
+		return;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_slot(iter);
+	if (IS_ERR_OR_NULL(k.k))
+		goto out;
+
+	/* modify a private copy of the key */
+	bkey_reassemble(&new.k, k);
+	k = bkey_i_to_s_c(&new.k);
+
+	if (bversion_cmp(k.k->version, rbio->version) ||
+	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+		goto out;
+
+	/* Extent was merged? */
+	if (bkey_start_offset(k.k) < data_offset ||
+	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+		goto out;
+
+	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+			rbio->pick.crc, NULL, &new_crc,
+			bkey_start_offset(k.k) - data_offset, k.k->size,
+			rbio->pick.crc.csum_type)) {
+		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+		goto out;
+	}
+
+	if (!bch2_bkey_narrow_crcs(&new.k, new_crc))
+		goto out;
+
+	bch2_trans_update(&trans, iter, &new.k);
+	ret = bch2_trans_commit(&trans, NULL, NULL,
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_NOWAIT);
+	if (ret == -EINTR)
+		goto retry;
+out:
+	bch2_trans_exit(&trans);
+}
+
+/*
+ * Inner part that may run in process context
+ *
+ * Verifies the checksum of the data just read, optionally narrows the
+ * extent's checksum, then (unless BCH_READ_NODECODE) decrypts and
+ * decompresses or copies the requested range into the parent bio, starts a
+ * pending promote, and finally completes the read.
+ *
+ * Checksum errors are retried (bounced, if the read went straight into user
+ * mapped pages; otherwise avoiding the device just read from); decompression
+ * errors are fatal.
+ */
+static void __bch2_read_endio(struct work_struct *work)
+{
+	struct bch_read_bio *rbio =
+		container_of(work, struct bch_read_bio, work);
+	struct bch_fs *c	= rbio->c;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+	struct bio *src		= &rbio->bio;
+	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
+	struct bvec_iter dst_iter = rbio->bvec_iter;
+	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+	struct nonce nonce = extent_nonce(rbio->version, crc);
+	struct bch_csum csum;
+
+	/* Reset iterator for checksumming and copying bounced data: */
+	if (rbio->bounce) {
+		src->bi_iter.bi_size		= crc.compressed_size << 9;
+		src->bi_iter.bi_idx		= 0;
+		src->bi_iter.bi_bvec_done	= 0;
+	} else {
+		src->bi_iter			= rbio->bvec_iter;
+	}
+
+	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+	if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+		goto csum_err;
+
+	if (unlikely(rbio->narrow_crcs))
+		bch2_rbio_narrow_crcs(rbio);
+
+	/* NODECODE reads skip all decoding below */
+	if (rbio->flags & BCH_READ_NODECODE)
+		goto nodecode;
+
+	/* Adjust crc to point to subset of data we want: */
+	crc.offset     += rbio->offset_into_extent;
+	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
+
+	if (crc.compression_type != BCH_COMPRESSION_NONE) {
+		/* compressed data has to be decrypted in full first */
+		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
+			goto decompression_err;
+	} else {
+		/* don't need to decrypt the entire bio: */
+		nonce = nonce_add(nonce, crc.offset << 9);
+		bio_advance(src, crc.offset << 9);
+
+		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+		src->bi_iter.bi_size = dst_iter.bi_size;
+
+		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+
+		/* bounced reads still have to be copied to the parent bio */
+		if (rbio->bounce) {
+			struct bvec_iter src_iter = src->bi_iter;
+			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+		}
+	}
+
+	if (rbio->promote) {
+		/*
+		 * Re encrypt data we decrypted, so it's consistent with
+		 * rbio->crc:
+		 */
+		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		promote_start(rbio->promote, rbio);
+		/* cleared so bch2_rbio_free() does not free the promote */
+		rbio->promote = NULL;
+	}
+nodecode:
+	/* in retry mode the caller frees and completes the rbio itself */
+	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+		rbio = bch2_rbio_free(rbio);
+		bch2_rbio_done(rbio);
+	}
+	return;
+csum_err:
+	/*
+	 * Checksum error: if the bio wasn't bounced, we may have been
+	 * reading into buffers owned by userspace (that userspace can
+	 * scribble over) - retry the read, bouncing it this time:
+	 */
+	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+		rbio->flags |= BCH_READ_MUST_BOUNCE;
+		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+		return;
+	}
+
+	bch2_dev_io_error(ca,
+		"data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
+		rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+		csum.hi, csum.lo, crc.csum_type);
+	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+	return;
+decompression_err:
+	__bcache_io_error(c, "decompression error, inode %llu offset %llu",
+			  rbio->pos.inode,
+			  (u64) rbio->bvec_iter.bi_sector);
+	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	return;
+}
+
+/*
+ * bi_end_io for read bios: drops the device ioref, handles device errors and
+ * stale cached pointers, then hands the rest of completion to
+ * __bch2_read_endio() in a context chosen by how much work remains
+ * (checksum only: highpri; narrowing, decompression or decryption: unbound).
+ */
+static void bch2_read_endio(struct bio *bio)
+{
+	struct bch_read_bio *rbio =
+		container_of(bio, struct bch_read_bio, bio);
+	struct bch_fs *c	= rbio->c;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+	struct workqueue_struct *wq = NULL;
+	enum rbio_context context = RBIO_CONTEXT_NULL;
+
+	if (rbio->have_ioref) {
+		bch2_latency_acct(ca, rbio->submit_time, READ);
+		percpu_ref_put(&ca->io_ref);
+	}
+
+	/* a non-split rbio is the caller's bio: put its end_io back */
+	if (!rbio->split)
+		rbio->bio.bi_end_io = rbio->end_io;
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+		return;
+	}
+
+	if (rbio->pick.ptr.cached &&
+	    (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+	     ptr_stale(ca, &rbio->pick.ptr))) {
+		atomic_long_inc(&c->read_realloc_races);
+
+		bch2_rbio_error(rbio,
+				(rbio->flags & BCH_READ_RETRY_IF_STALE)
+				? READ_RETRY
+				: READ_ERR,
+				BLK_STS_AGAIN);
+		return;
+	}
+
+	if (rbio->narrow_crcs ||
+	    rbio->pick.crc.compression_type ||
+	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) {
+		context	= RBIO_CONTEXT_UNBOUND;
+		wq	= system_unbound_wq;
+	} else if (rbio->pick.crc.csum_type) {
+		context	= RBIO_CONTEXT_HIGHPRI;
+		wq	= system_highpri_wq;
+	}
+
+	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
+/*
+ * Resolve a reflink pointer: look up the indirect extent that @orig_k (a
+ * reflink_p key) refers to, at the position corresponding to
+ * *@offset_into_extent.
+ *
+ * On success @orig_k is overwritten with the indirect extent's key and
+ * *@offset_into_extent is updated to the offset within that extent.
+ * Returns 0 on success, a btree error, or -EIO if no reflink_v key exists at
+ * the target position.
+ */
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+				unsigned *offset_into_extent,
+				struct bkey_i *orig_k)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 reflink_offset;
+	int ret;
+
+	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
+		*offset_into_extent;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+				   POS(0, reflink_offset),
+				   BTREE_ITER_SLOTS);
+	ret = PTR_ERR_OR_ZERO(iter);
+	if (ret)
+		return ret;
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != KEY_TYPE_reflink_v) {
+		__bcache_io_error(trans->c,
+				"pointer to nonexistent indirect extent");
+		ret = -EIO;
+		goto err;
+	}
+
+	*offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+	bkey_reassemble(orig_k, k);
+err:
+	/* reached on success as well: the iterator is always released */
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+/*
+ * Read the part of extent @k described by @iter / @offset_into_extent into
+ * @orig.
+ *
+ * Picks a device to read from, decides whether the read must be bounced
+ * and/or cover the whole checksummed/compressed extent, optionally sets up a
+ * promote, then builds and submits the rbio (or performs a reconstruct read
+ * when the picked pointer has a nonzero idx).
+ *
+ * Holes and reservations (no device picked) are zero filled.
+ *
+ * Returns 0 in the normal asynchronous case.  With BCH_READ_IN_RETRY the IO
+ * is done synchronously and the return value is 0, READ_RETRY or READ_ERR;
+ * READ_RETRY_AVOID is reported as READ_RETRY after the picked pointer has
+ * been recorded in @failed.
+ */
+int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+		       struct bvec_iter iter, struct bkey_s_c k,
+		       unsigned offset_into_extent,
+		       struct bch_io_failures *failed, unsigned flags)
+{
+	struct extent_ptr_decoded pick;
+	struct bch_read_bio *rbio = NULL;
+	struct bch_dev *ca;
+	struct promote_op *promote = NULL;
+	bool bounce = false, read_full = false, narrow_crcs = false;
+	struct bpos pos = bkey_start_pos(k.k);
+	int pick_ret;
+
+	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+	/* hole or reservation - just zero fill: */
+	if (!pick_ret)
+		goto hole;
+
+	if (pick_ret < 0) {
+		__bcache_io_error(c, "no device to read from");
+		goto err;
+	}
+
+	/* always true here: zero and negative were handled above */
+	if (pick_ret > 0)
+		ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	if (flags & BCH_READ_NODECODE) {
+		/*
+		 * can happen if we retry, and the extent we were going to read
+		 * has been merged in the meantime:
+		 */
+		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+			goto hole;
+
+		iter.bi_size	= pick.crc.compressed_size << 9;
+		goto noclone;
+	}
+
+	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+	    bio_flagged(&orig->bio, BIO_CHAIN))
+		flags |= BCH_READ_MUST_CLONE;
+
+	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+		bch2_can_narrow_extent_crcs(k, pick.crc);
+
+	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+		flags |= BCH_READ_MUST_BOUNCE;
+
+	BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+	/*
+	 * Compressed data, and checksummed data that is read partially, is
+	 * encrypted into user mapped pages, or must be bounced, has to be
+	 * read in full into a bounce buffer:
+	 */
+	if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
+	    (pick.crc.csum_type != BCH_CSUM_NONE &&
+	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+	       (flags & BCH_READ_USER_MAPPED)) ||
+	      (flags & BCH_READ_MUST_BOUNCE)))) {
+		read_full = true;
+		bounce = true;
+	}
+
+	promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+				&rbio, &bounce, &read_full);
+
+	if (!read_full) {
+		EBUG_ON(pick.crc.compression_type);
+		EBUG_ON(pick.crc.csum_type &&
+			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+			 bvec_iter_sectors(iter) != pick.crc.live_size ||
+			 pick.crc.offset ||
+			 offset_into_extent));
+
+		/* narrow pos, pointer and crc down to just the range read */
+		pos.offset += offset_into_extent;
+		pick.ptr.offset += pick.crc.offset +
+			offset_into_extent;
+		offset_into_extent		= 0;
+		pick.crc.compressed_size	= bvec_iter_sectors(iter);
+		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
+		pick.crc.offset			= 0;
+		pick.crc.live_size		= bvec_iter_sectors(iter);
+	}
+
+	if (rbio) {
+		/*
+		 * promote already allocated bounce rbio:
+		 * promote needs to allocate a bio big enough for uncompressing
+		 * data in the write path, but we're not going to use it all
+		 * here:
+		 */
+		BUG_ON(rbio->bio.bi_iter.bi_size <
+		       pick.crc.compressed_size << 9);
+		rbio->bio.bi_iter.bi_size =
+			pick.crc.compressed_size << 9;
+	} else if (bounce) {
+		unsigned sectors = pick.crc.compressed_size;
+
+		rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
+						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
+						  &c->bio_read_split),
+				 orig->opts);
+
+		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+		rbio->bounce	= true;
+		rbio->split	= true;
+	} else if (flags & BCH_READ_MUST_CLONE) {
+		/*
+		 * Have to clone if there were any splits, due to error
+		 * reporting issues (if a split errored, and retrying didn't
+		 * work, when it reports the error to its parent (us) we don't
+		 * know if the error was from our bio, and we should retry, or
+		 * from the whole bio, in which case we don't want to retry and
+		 * lose the error)
+		 */
+		rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
+						&c->bio_read_split),
+				 orig->opts);
+		rbio->bio.bi_iter = iter;
+		rbio->split	= true;
+	} else {
+noclone:
+		/* read directly into the caller's bio */
+		rbio = orig;
+		rbio->bio.bi_iter = iter;
+		BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+	}
+
+	BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+	rbio->c			= c;
+	rbio->submit_time	= local_clock();
+	if (rbio->split)
+		rbio->parent	= orig;
+	else
+		rbio->end_io	= orig->bio.bi_end_io;
+	rbio->bvec_iter		= iter;
+	rbio->offset_into_extent= offset_into_extent;
+	rbio->flags		= flags;
+	rbio->have_ioref	= pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+	rbio->narrow_crcs	= narrow_crcs;
+	rbio->hole		= 0;
+	rbio->retry		= 0;
+	rbio->context		= 0;
+	rbio->devs_have		= bch2_bkey_devs(k);
+	rbio->pick		= pick;
+	rbio->pos		= pos;
+	rbio->version		= k.k->version;
+	rbio->promote		= promote;
+	INIT_WORK(&rbio->work, NULL);
+
+	rbio->bio.bi_opf	= orig->bio.bi_opf;
+	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+	rbio->bio.bi_end_io	= bch2_read_endio;
+
+	if (rbio->bounce)
+		trace_read_bounce(&rbio->bio);
+
+	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+	percpu_down_read(&c->mark_lock);
+	bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
+	percpu_up_read(&c->mark_lock);
+
+	/* every fragment but the last holds an extra ref on the parent bio */
+	if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
+		bio_inc_remaining(&orig->bio);
+		trace_read_split(&orig->bio);
+	}
+
+	if (!rbio->pick.idx) {
+		if (!rbio->have_ioref) {
+			__bcache_io_error(c, "no device to read from");
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
+		}
+
+		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+			     bio_sectors(&rbio->bio));
+		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+		if (likely(!(flags & BCH_READ_IN_RETRY)))
+			submit_bio(&rbio->bio);
+		else
+			submit_bio_wait(&rbio->bio);
+	} else {
+		/* Attempting reconstruct read: */
+		if (bch2_ec_read_extent(c, rbio)) {
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
+		}
+
+		if (likely(!(flags & BCH_READ_IN_RETRY)))
+			bio_endio(&rbio->bio);
+	}
+out:
+	if (likely(!(flags & BCH_READ_IN_RETRY))) {
+		return 0;
+	} else {
+		int ret;
+
+		/* run completion inline: nothing gets punted to a workqueue */
+		rbio->context = RBIO_CONTEXT_UNBOUND;
+		bch2_read_endio(&rbio->bio);
+
+		ret = rbio->retry;
+		rbio = bch2_rbio_free(rbio);
+
+		if (ret == READ_RETRY_AVOID) {
+			bch2_mark_io_failure(failed, &pick);
+			ret = READ_RETRY;
+		}
+
+		return ret;
+	}
+
+err:
+	if (flags & BCH_READ_IN_RETRY)
+		return READ_ERR;
+
+	orig->bio.bi_status = BLK_STS_IOERR;
+	goto out_read_done;
+
+hole:
+	/*
+	 * won't normally happen in the BCH_READ_NODECODE
+	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
+	 * to read no longer exists we have to signal that:
+	 */
+	if (flags & BCH_READ_NODECODE)
+		orig->hole = true;
+
+	zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+	if (flags & BCH_READ_LAST_FRAGMENT)
+		bch2_rbio_done(orig);
+	return 0;
+}
+
+/*
+ * Top level read entry point: read the range covered by @rbio's bio from
+ * @inode.
+ *
+ * Walks the extents btree from the bio's starting sector, issuing one
+ * bch2_read_extent() per extent (following indirect extents) and advancing
+ * the bio until the final fragment has been issued.  Reads are issued with
+ * BCH_READ_RETRY_IF_STALE, BCH_READ_MAY_PROMOTE and BCH_READ_USER_MAPPED.
+ *
+ * On a btree error other than -EINTR the bio is failed and completed here.
+ */
+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	unsigned flags = BCH_READ_RETRY_IF_STALE|
+		BCH_READ_MAY_PROMOTE|
+		BCH_READ_USER_MAPPED;
+	int ret;
+
+	BUG_ON(rbio->_state);
+	BUG_ON(flags & BCH_READ_NODECODE);
+	BUG_ON(flags & BCH_READ_IN_RETRY);
+
+	rbio->c = c;
+	rbio->start_time = local_clock();
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   POS(inode, rbio->bio.bi_iter.bi_sector),
+				   BTREE_ITER_SLOTS);
+	while (1) {
+		BKEY_PADDED(k) tmp;
+		unsigned bytes, sectors, offset_into_extent;
+
+		/* the bio has been advanced past what was already issued */
+		bch2_btree_iter_set_pos(iter,
+				POS(inode, rbio->bio.bi_iter.bi_sector));
+
+		k = bch2_btree_iter_peek_slot(iter);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		/* work from a private copy of the key */
+		bkey_reassemble(&tmp.k, k);
+		k = bkey_i_to_s_c(&tmp.k);
+
+		offset_into_extent = iter->pos.offset -
+			bkey_start_offset(k.k);
+		sectors = k.k->size - offset_into_extent;
+
+		ret = bch2_read_indirect_extent(&trans,
+					&offset_into_extent, &tmp.k);
+		if (ret)
+			goto err;
+
+		/*
+		 * With indirect extents, the amount of data to read is the min
+		 * of the original extent and the indirect extent:
+		 */
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		/*
+		 * Unlock the iterator while the btree node's lock is still in
+		 * cache, before doing the IO:
+		 */
+		bch2_trans_unlock(&trans);
+
+		/* temporarily shrink the bio to this fragment */
+		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+
+		/* nothing was trimmed: this fragment is the rest of the bio */
+		if (rbio->bio.bi_iter.bi_size == bytes)
+			flags |= BCH_READ_LAST_FRAGMENT;
+
+		bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+
+		if (flags & BCH_READ_LAST_FRAGMENT)
+			break;
+
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+		bio_advance(&rbio->bio, bytes);
+	}
+out:
+	bch2_trans_exit(&trans);
+	return;
+err:
+	if (ret == -EINTR)
+		goto retry;
+
+	bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
+	bch2_rbio_done(rbio);
+	goto out;
+}
+
+/*
+ * Release what bch2_fs_io_init() set up: the promote table (only if it was
+ * initialized), the bounce page mempool and the three biosets.
+ */
+void bch2_fs_io_exit(struct bch_fs *c)
+{
+	/* tbl is non-NULL only once rhashtable_init() has succeeded */
+	if (c->promote_table.tbl)
+		rhashtable_destroy(&c->promote_table);
+	mempool_exit(&c->bio_bounce_pages);
+	bioset_exit(&c->bio_write);
+	bioset_exit(&c->bio_read_split);
+	bioset_exit(&c->bio_read);
+}
+
+/*
+ * Set up the IO path's per-filesystem resources: the read, read-split and
+ * write biosets, the bounce page mempool (sized for the larger of a btree
+ * node and the maximum encoded extent) and the promote hash table.
+ *
+ * Returns 0 on success or -ENOMEM if any step fails; nothing is torn down
+ * here on failure.
+ */
+int bch2_fs_io_init(struct bch_fs *c)
+{
+	unsigned bounce_pages = max_t(unsigned,
+				      c->opts.btree_node_size,
+				      c->sb.encoded_extent_max) /
+		PAGE_SECTORS;
+
+	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -ENOMEM;
+
+	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -ENOMEM;
+
+	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -ENOMEM;
+
+	if (mempool_init_page_pool(&c->bio_bounce_pages, bounce_pages, 0))
+		return -ENOMEM;
+
+	if (rhashtable_init(&c->promote_table, &bch_promote_params))
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
new file mode 100644
index 000000000000..91aaa58fce4e
--- /dev/null
+++ b/fs/bcachefs/io.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_H
+#define _BCACHEFS_IO_H
+
+#include "checksum.h"
+#include "io_types.h"
+
+/* Map a struct bio back to the bch_write_bio it is embedded in: */
+#define to_wbio(_bio)			\
+	container_of((_bio), struct bch_write_bio, bio)
+
+/* Map a struct bio back to the bch_read_bio it is embedded in: */
+#define to_rbio(_bio)			\
+	container_of((_bio), struct bch_read_bio, bio)
+
+/*
+ * Helpers implemented outside this header.
+ *
+ * NOTE(review): the "_pages_pool" pair presumably allocates/frees bio pages
+ * from c->bio_bounce_pages - confirm against the implementation.
+ */
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+void bch2_latency_acct(struct bch_dev *, u64, int);
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+			       enum bch_data_type, const struct bkey_i *);
+
+/* Additional blk_status_t value (128), force-cast from a plain integer: */
+#define BLK_STS_REMOVED		((__force blk_status_t)128)
+
+/*
+ * Bits for bch_write_op->flags (a u16, so at most 16 flags fit).
+ *
+ * BCH_WRITE_JOURNAL_SEQ_PTR is set by op_journal_seq_set() and selects which
+ * member of the journal_seq/journal_seq_p union op_journal_seq() returns.
+ */
+enum bch_write_flags {
+	BCH_WRITE_ALLOC_NOWAIT		= (1 << 0),
+	BCH_WRITE_CACHED		= (1 << 1),
+	BCH_WRITE_FLUSH			= (1 << 2),
+	BCH_WRITE_DATA_ENCODED		= (1 << 3),
+	BCH_WRITE_PAGES_STABLE		= (1 << 4),
+	BCH_WRITE_PAGES_OWNED		= (1 << 5),
+	BCH_WRITE_ONLY_SPECIFIED_DEVS	= (1 << 6),
+	BCH_WRITE_NOPUT_RESERVATION	= (1 << 7),
+
+	/* Internal: */
+	BCH_WRITE_JOURNAL_SEQ_PTR	= (1 << 8),
+	BCH_WRITE_SKIP_CLOSURE_PUT	= (1 << 9),
+};
+
+/*
+ * Location of this write op's journal sequence number: the caller-supplied
+ * pointer when BCH_WRITE_JOURNAL_SEQ_PTR is set, otherwise the op's own
+ * embedded journal_seq field.
+ */
+static inline u64 *op_journal_seq(struct bch_write_op *op)
+{
+	if (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
+		return op->journal_seq_p;
+
+	return &op->journal_seq;
+}
+
+/*
+ * Make @op report its journal sequence number through @journal_seq instead of
+ * its embedded field: flag the op and stash the pointer in the shared union.
+ */
+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
+{
+	op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+	op->journal_seq_p = journal_seq;
+}
+
+/*
+ * Workqueue to use for @op's index update: the copygc workqueue for ops
+ * allocating from RESERVE_MOVINGGC, the filesystem's main workqueue otherwise.
+ */
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+
+	if (op->alloc_reserve == RESERVE_MOVINGGC)
+		return c->copygc_wq;
+
+	return c->wq;
+}
+
+/*
+ * Extent update / punch helpers, implemented outside this header.
+ *
+ * bch2_write_index_default() is the function bch2_write_op_init() installs
+ * as op->index_update_fn.
+ */
+int bch2_extent_update(struct btree_trans *, struct btree_iter *,
+		       struct bkey_i *, struct disk_reservation *,
+		       u64 *, u64, s64 *);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+		   struct bpos, u64 *, s64 *);
+int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
+
+int bch2_write_index_default(struct bch_write_op *);
+
+/*
+ * Reset @op to its defaults for a write on filesystem @c using IO options
+ * @opts. Checksum and compression types are derived from @opts; everything
+ * else gets a fixed default.
+ *
+ * Not touched here: cl, start_time, nonce, crc, failed, insert_keys,
+ * inline_keys and the embedded wbio.
+ */
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+				      struct bch_io_opts opts)
+{
+	/* ownership, completion and status: */
+	op->c			= c;
+	op->end_io		= NULL;
+	op->index_update_fn	= bch2_write_index_default;
+	op->flags		= 0;
+	op->error		= 0;
+	op->written		= 0;
+
+	/* data encoding, derived from the IO options: */
+	op->opts		= opts;
+	op->csum_type		= bch2_data_checksum_type(c, opts.data_checksum);
+	op->compression_type	= bch2_compression_opt_to_type[opts.compression];
+
+	/* replication and allocation: */
+	op->nr_replicas		= 0;
+	op->nr_replicas_required = c->opts.data_replicas_required;
+	op->alloc_reserve	= RESERVE_NONE;
+	op->target		= 0;
+	op->devs_have.nr	= 0;
+	op->open_buckets.nr	= 0;
+	op->write_point		= (struct write_point_specifier) { 0 };
+	op->res			= (struct disk_reservation) { 0 };
+
+	/* position and version of the data being written: */
+	op->pos			= POS_MAX;
+	op->version		= ZERO_VERSION;
+
+	/* journal and inode accounting: */
+	op->journal_seq		= 0;
+	op->new_i_size		= U64_MAX;
+	op->i_sectors_delta	= 0;
+}
+
+/* Write entry point, implemented outside this header: */
+void bch2_write(struct closure *);
+
+/*
+ * Given the bio embedded in a bch_write_bio, zero every field of the wrapper
+ * that precedes the bio (the bio itself is left untouched) and return the
+ * wrapper.
+ */
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+	struct bch_write_bio *w = container_of(bio, struct bch_write_bio, bio);
+	size_t header_bytes = offsetof(struct bch_write_bio, bio);
+
+	memset(w, 0, header_bytes);
+
+	return w;
+}
+
+/* Forward declarations of types defined elsewhere: */
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+/* Out-of-line part of bch2_read_indirect_extent(), used for reflink_p keys: */
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+				struct bkey_i *);
+
+/*
+ * If @k is a KEY_TYPE_reflink_p key, hand it to
+ * __bch2_read_indirect_extent() and return its result; any other key type is
+ * left alone and 0 is returned.
+ */
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+					    unsigned *offset_into_extent,
+					    struct bkey_i *k)
+{
+	if (k->k.type != KEY_TYPE_reflink_p)
+		return 0;
+
+	return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+}
+
+/*
+ * Flag bits passed as the @flags argument of bch2_read_extent() /
+ * __bch2_read_extent().
+ *
+ * BCH_READ_LAST_FRAGMENT: set by the read loop in io.c when the fragment
+ * being submitted covers everything that is left in the bio; the loop stops
+ * after submitting that fragment.
+ */
+enum bch_read_flags {
+	BCH_READ_RETRY_IF_STALE		= 1 << 0,
+	BCH_READ_MAY_PROMOTE		= 1 << 1,
+	BCH_READ_USER_MAPPED		= 1 << 2,
+	BCH_READ_NODECODE		= 1 << 3,
+	BCH_READ_LAST_FRAGMENT		= 1 << 4,
+
+	/* internal: */
+	BCH_READ_MUST_BOUNCE		= 1 << 5,
+	BCH_READ_MUST_CLONE		= 1 << 6,
+	BCH_READ_IN_RETRY		= 1 << 7,
+};
+
+/* Full-argument form; bch2_read_extent() below fills in the common defaults: */
+int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *,
+		       struct bvec_iter, struct bkey_s_c, unsigned,
+		       struct bch_io_failures *, unsigned);
+
+/*
+ * Convenience wrapper around __bch2_read_extent(): uses the bio's current
+ * bi_iter as the iterator, passes no bch_io_failures list, and discards the
+ * return value.
+ */
+static inline void bch2_read_extent(struct bch_fs *c,
+				    struct bch_read_bio *rbio,
+				    struct bkey_s_c k,
+				    unsigned offset_into_extent,
+				    unsigned flags)
+{
+	struct bvec_iter iter = rbio->bio.bi_iter;
+
+	__bch2_read_extent(c, rbio, iter, k, offset_into_extent, NULL, flags);
+}
+
+/* Read entry point, implemented outside this header: */
+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+
+/*
+ * Given the bio embedded in a bch_read_bio, reset the wrapper's state bits
+ * (all at once, via _state) and promote pointer, record @opts, and return the
+ * wrapper. Other fields are not initialised here.
+ */
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+					     struct bch_io_opts opts)
+{
+	struct bch_read_bio *r = container_of(bio, struct bch_read_bio, bio);
+
+	r->opts		= opts;
+	r->promote	= NULL;
+	r->_state	= 0;
+
+	return r;
+}
+
+/* Filesystem-wide IO path teardown/setup, implemented in io.c: */
+void bch2_fs_io_exit(struct bch_fs *);
+int bch2_fs_io_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_H */
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
new file mode 100644
index 000000000000..c37b7d7401e9
--- /dev/null
+++ b/fs/bcachefs/io_types.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_TYPES_H
+#define _BCACHEFS_IO_TYPES_H
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+#include "keylist_types.h"
+#include "opts.h"
+#include "super_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+/*
+ * Per-read state wrapped around a struct bio.
+ *
+ * The bio is the last member: the read biosets are created with a front pad
+ * of offsetof(struct bch_read_bio, bio), and to_rbio() recovers this struct
+ * from the bio with container_of().
+ */
+struct bch_read_bio {
+	struct bch_fs		*c;
+	u64			start_time;
+	u64			submit_time;
+
+	/*
+	 * Reads will often have to be split, and if the extent being read from
+	 * was checksummed or compressed we'll also have to allocate bounce
+	 * buffers and copy the data back into the original bio.
+	 *
+	 * If we didn't have to split, we have to save and restore the original
+	 * bi_end_io - @split below indicates which:
+	 */
+	union {
+	struct bch_read_bio	*parent;
+	bio_end_io_t		*end_io;
+	};
+
+	/*
+	 * Saved copy of bio->bi_iter, from submission time - allows us to
+	 * resubmit on IO error, and also to copy data back to the original bio
+	 * when we're bouncing:
+	 */
+	struct bvec_iter	bvec_iter;
+
+	unsigned		offset_into_extent;
+
+	/* NOTE(review): presumably holds enum bch_read_flags bits - confirm */
+	u16			flags;
+	/*
+	 * _state overlays all of the bitfields below, so that rbio_init() can
+	 * clear every one of them with a single store:
+	 */
+	union {
+	struct {
+	u16			bounce:1,
+				split:1,
+				kmalloc:1,
+				have_ioref:1,
+				narrow_crcs:1,
+				hole:1,
+				retry:2,
+				context:2;
+	};
+	u16			_state;
+	};
+
+	struct bch_devs_list	devs_have;
+
+	struct extent_ptr_decoded pick;
+	/* start pos of data we read (may not be pos of data we want) */
+	struct bpos		pos;
+	struct bversion		version;
+
+	/* reset to NULL by rbio_init(): */
+	struct promote_op	*promote;
+
+	/* IO options, recorded by rbio_init(): */
+	struct bch_io_opts	opts;
+
+	struct work_struct	work;
+
+	/* must stay last - see the comment at the top of this struct: */
+	struct bio		bio;
+};
+
+/*
+ * Per-write-bio state wrapped around a struct bio.
+ *
+ * The bio is the last member: the write bioset is created with a front pad of
+ * offsetof(struct bch_write_bio, bio), to_wbio() recovers this struct from
+ * the bio, and wbio_init() zeroes everything that precedes the bio.
+ */
+struct bch_write_bio {
+	struct bch_fs		*c;
+	struct bch_write_bio	*parent;
+
+	u64			submit_time;
+
+	struct bch_devs_list	failed;
+	u8			order;
+	u8			dev;
+
+	unsigned		split:1,
+				bounce:1,
+				put_bio:1,
+				have_ioref:1,
+				used_mempool:1;
+
+	/* must stay last - see the comment at the top of this struct: */
+	struct bio		bio;
+};
+
+/*
+ * State for one write operation. bch2_write_op_init() sets most fields to
+ * their defaults; it does not touch cl, start_time, nonce, crc, failed,
+ * insert_keys, inline_keys or wbio.
+ */
+struct bch_write_op {
+	struct closure		cl;
+	struct bch_fs		*c;
+	void			(*end_io)(struct bch_write_op *);
+	u64			start_time;
+
+	unsigned		written; /* sectors */
+	/* enum bch_write_flags bits: */
+	u16			flags;
+	s16			error; /* dio write path expects it to hold -ERESTARTSYS... */
+
+	unsigned		csum_type:4;
+	unsigned		compression_type:4;
+	unsigned		nr_replicas:4;
+	unsigned		nr_replicas_required:4;
+	/* compared against RESERVE_MOVINGGC by index_update_wq(): */
+	unsigned		alloc_reserve:4;
+
+	struct bch_devs_list	devs_have;
+	u16			target;
+	u16			nonce;
+	struct bch_io_opts	opts;
+
+	struct bpos		pos;
+	struct bversion		version;
+
+	/* For BCH_WRITE_DATA_ENCODED: */
+	struct bch_extent_crc_unpacked crc;
+
+	struct write_point_specifier write_point;
+
+	struct disk_reservation	res;
+
+	struct open_buckets	open_buckets;
+
+	/*
+	 * If caller wants to flush but hasn't passed us a journal_seq ptr, we
+	 * still need to stash the journal_seq somewhere:
+	 *
+	 * (BCH_WRITE_JOURNAL_SEQ_PTR in flags says which member is live - see
+	 * op_journal_seq())
+	 */
+	union {
+		u64			*journal_seq_p;
+		u64			journal_seq;
+	};
+	u64			new_i_size;
+	s64			i_sectors_delta;
+
+	/* defaults to bch2_write_index_default(): */
+	int			(*index_update_fn)(struct bch_write_op *);
+
+	struct bch_devs_mask	failed;
+
+	struct keylist		insert_keys;
+	u64			inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+	/* Must be last: */
+	struct bch_write_bio	wbio;
+};
+
+#endif /* _BCACHEFS_IO_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
new file mode 100644
index 000000000000..5c3e146e3942
--- /dev/null
+++ b/fs/bcachefs/journal.c
@@ -0,0 +1,1253 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+
+#include <trace/events/bcachefs.h>
+
+/*
+ * A reservation state describes an open entry when its cur_entry_offset is
+ * below JOURNAL_ENTRY_CLOSED_VAL.
+ */
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+	if (state.cur_entry_offset >= JOURNAL_ENTRY_CLOSED_VAL)
+		return false;
+
+	return true;
+}
+
+/* True if the journal's current reservation state describes an open entry. */
+static bool journal_entry_is_open(struct journal *j)
+{
+	union journal_res_state state = j->reservations;
+
+	return __journal_entry_is_open(state);
+}
+
+/*
+ * Advance the journal sequence number and push a matching, freshly
+ * initialised pin list onto j->pin, with its refcount set to @count.
+ */
+static void journal_pin_new_entry(struct journal *j, int count)
+{
+	struct journal_entry_pin_list *pin_list;
+
+	/*
+	 * The sequence number bump and the fifo push have to go together,
+	 * otherwise journal_last_seq() would be computed incorrectly:
+	 */
+	atomic64_inc(&j->seq);
+	pin_list = fifo_push_ref(&j->pin);
+
+	INIT_LIST_HEAD(&pin_list->list);
+	INIT_LIST_HEAD(&pin_list->flushed);
+	atomic_set(&pin_list->count, count);
+	pin_list->devs.nr = 0;
+}
+
+/*
+ * Prepare the current journal buffer for a new entry: clear its has_inode
+ * bitmap, zero the entry header, and stamp the header with the current
+ * sequence number and an empty payload.
+ */
+static void bch2_journal_buf_init(struct journal *j)
+{
+	struct journal_buf *cur = journal_cur_buf(j);
+
+	memset(cur->has_inode, 0, sizeof(cur->has_inode));
+
+	/* only the header (sizeof(*cur->data)) is zeroed, not the payload: */
+	memset(cur->data, 0, sizeof(*cur->data));
+
+	cur->data->seq	= cpu_to_le64(journal_cur_seq(j));
+	cur->data->u64s	= 0;
+}
+
+/*
+ * Put the journal into the error state: atomically replace cur_entry_offset
+ * with JOURNAL_ENTRY_ERROR_VAL, then wake waiters on the journal and on the
+ * current buffer. Does nothing (and wakes nobody) if the error value is
+ * already set.
+ */
+void bch2_journal_halt(struct journal *j)
+{
+	union journal_res_state old, new;
+	u64 v = atomic64_read(&j->reservations.counter);
+
+	for (;;) {
+		old.v = new.v = v;
+
+		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+			return;
+
+		new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+
+		/* retry with the freshly observed value if we raced: */
+		v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v);
+		if (v == old.v)
+			break;
+	}
+
+	journal_wake(j);
+	closure_wake_up(&journal_cur_buf(j)->wait);
+}
+
+/* journal entry close/open: */
+
+/*
+ * Kick off the write of a journal buffer via the j->io closure.
+ *
+ * If JOURNAL_NEED_WRITE was already pending (i.e. not set just now by the
+ * caller, per @need_write_just_set), the time since j->need_write_time is
+ * accounted to j->delay_time first. The flag is cleared either way.
+ */
+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+{
+	if (!need_write_just_set) {
+		if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
+			bch2_time_stats_update(j->delay_time,
+					       j->need_write_time);
+	}
+
+	clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+	closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+}
+
+/*
+ * Close the current journal entry and switch to the other buffer.
+ *
+ * Must be called with j->lock held.
+ *
+ * Returns true if journal entry is now closed (including the cases where it
+ * was already closed, or the journal is in the error state); returns false
+ * if the entry could not be closed because the previous buffer is still
+ * unwritten. Note that JOURNAL_NEED_WRITE gets set even when false is
+ * returned.
+ */
+static bool __journal_entry_close(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_buf *buf = journal_cur_buf(j);
+	union journal_res_state old, new;
+	u64 v = atomic64_read(&j->reservations.counter);
+	bool set_need_write = false;
+	unsigned sectors;
+
+	lockdep_assert_held(&j->lock);
+
+	do {
+		old.v = new.v = v;
+		if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
+			return true;
+
+		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
+			/* this entry will never be written: */
+			closure_wake_up(&buf->wait);
+			return true;
+		}
+
+		/* record when a write first became necessary, for delay stats: */
+		if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+			set_bit(JOURNAL_NEED_WRITE, &j->flags);
+			j->need_write_time = local_clock();
+			set_need_write = true;
+		}
+
+		/* can't switch buffers while the previous one is unwritten: */
+		if (new.prev_buf_unwritten)
+			return false;
+
+		/* mark closed, flip to the other buffer, old buf now pending: */
+		new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
+		new.idx++;
+		new.prev_buf_unwritten = 1;
+
+		BUG_ON(journal_state_count(new, new.idx));
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
+
+	/* the offset at close time is the final size of the entry: */
+	buf->data->u64s		= cpu_to_le32(old.cur_entry_offset);
+
+	/* trim buf->sectors down to what the entry actually needs: */
+	sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+				      buf->u64s_reserved) << c->block_bits;
+	BUG_ON(sectors > buf->sectors);
+	buf->sectors = sectors;
+
+	bkey_extent_init(&buf->key);
+
+	/*
+	 * We have to set last_seq here, _before_ opening a new journal entry:
+	 *
+	 * A thread may replace an old pin with a new pin on its current
+	 * journal reservation - the expectation being that the journal will
+	 * contain either what the old pin protected or what the new pin
+	 * protects.
+	 *
+	 * After the old pin is dropped journal_last_seq() won't include the old
+	 * pin, so we can only write the updated last_seq on the entry that
+	 * contains whatever the new pin protects.
+	 *
+	 * Restated, we can _not_ update last_seq for a given entry if there
+	 * could be a newer entry open with reservations/pins that have been
+	 * taken against it.
+	 *
+	 * Hence, we want to update/set last_seq on the current journal entry
+	 * right before we open a new one:
+	 */
+	buf->data->last_seq	= cpu_to_le64(journal_last_seq(j));
+
+	if (journal_entry_empty(buf->data))
+		clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
+	else
+		set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+
+	/* new sequence number + pin list, then reset the (new) current buf: */
+	journal_pin_new_entry(j, 1);
+
+	bch2_journal_buf_init(j);
+
+	/* the entry is being written now, the delayed write is moot: */
+	cancel_delayed_work(&j->write_work);
+
+	bch2_journal_space_available(j);
+
+	/* old.idx is the buffer we just closed: */
+	bch2_journal_buf_put(j, old.idx, set_need_write);
+	return true;
+}
+
+/*
+ * Locked wrapper around __journal_entry_close(): takes j->lock for the
+ * duration of the call and passes the result through.
+ */
+static bool journal_entry_close(struct journal *j)
+{
+	bool closed;
+
+	spin_lock(&j->lock);
+	closed = __journal_entry_close(j);
+	spin_unlock(&j->lock);
+
+	return closed;
+}
+
+/*
+ * Open the current journal buffer for reservations. Must be called with
+ * j->lock held, and only while no entry is open.
+ *
+ * Intended to be called when we actually want a journal reservation -
+ * journal entry is open means journal is dirty. In this file it is called
+ * from __journal_res_get() and bch2_journal_open_seq_async().
+ *
+ * returns:
+ * 0:		success
+ * -ENOSPC:	journal currently full, must invoke reclaim
+ * -EAGAIN:	journal blocked, must wait
+ * -EROFS:	insufficient rw devices or journal error
+ *
+ * (any nonzero j->cur_entry_error is returned as-is)
+ */
+static int journal_entry_open(struct journal *j)
+{
+	struct journal_buf *buf = journal_cur_buf(j);
+	union journal_res_state old, new;
+	int u64s;
+	u64 v;
+
+	lockdep_assert_held(&j->lock);
+	BUG_ON(journal_entry_is_open(j));
+
+	if (j->blocked)
+		return -EAGAIN;
+
+	if (j->cur_entry_error)
+		return j->cur_entry_error;
+
+	BUG_ON(!j->cur_entry_sectors);
+
+	/* size the entry: limited by both on-disk space and the buffer size */
+	buf->u64s_reserved	= j->entry_u64s_reserved;
+	buf->disk_sectors	= j->cur_entry_sectors;
+	buf->sectors		= min(buf->disk_sectors, buf->buf_size >> 9);
+
+	/* usable u64s = entry size minus overhead, kept below the CLOSED sentinel */
+	u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+		journal_entry_overhead(j);
+	u64s  = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+
+	/* no room beyond what has already been added to this entry: */
+	if (u64s <= le32_to_cpu(buf->data->u64s))
+		return -ENOSPC;
+
+	/*
+	 * Must be set before marking the journal entry as open:
+	 */
+	j->cur_entry_u64s = u64s;
+
+	v = atomic64_read(&j->reservations.counter);
+	do {
+		old.v = new.v = v;
+
+		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+			return -EROFS;
+
+		/* Handle any already added entries */
+		new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+
+		EBUG_ON(journal_state_count(new, new.idx));
+		journal_state_inc(&new);
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
+
+	/* account how long reservations were blocked, then reset the timer: */
+	if (j->res_get_blocked_start)
+		bch2_time_stats_update(j->blocked_time,
+				       j->res_get_blocked_start);
+	j->res_get_blocked_start = 0;
+
+	/* (re)arm the delayed write that closes this entry (journal_write_work): */
+	mod_delayed_work(system_freezable_wq,
+			 &j->write_work,
+			 msecs_to_jiffies(j->write_delay_ms));
+	journal_wake(j);
+	return 0;
+}
+
+/*
+ * True when no journal entry is open and the previous buffer has been
+ * written. When that is not yet the case, try to close the current entry as
+ * a side effect so that a later check can succeed.
+ */
+static bool journal_quiesced(struct journal *j)
+{
+	union journal_res_state state = READ_ONCE(j->reservations);
+
+	if (!state.prev_buf_unwritten && !__journal_entry_is_open(state))
+		return true;
+
+	journal_entry_close(j);
+	return false;
+}
+
+/*
+ * Sleep on j->wait until journal_quiesced() reports true. Each evaluation of
+ * that condition also tries to close the current entry.
+ */
+static void journal_quiesce(struct journal *j)
+{
+	wait_event(j->wait, journal_quiesced(j));
+}
+
+/*
+ * Work handler for j->write_work (the delayed work armed by
+ * journal_entry_open()): closes the current journal entry.
+ */
+static void journal_write_work(struct work_struct *work)
+{
+	struct journal *j = container_of(work, struct journal, write_work.work);
+
+	journal_entry_close(j);
+}
+
+/*
+ * Look up @inode in the has_inode bitmaps of the two journal buffers.
+ *
+ * Returns the current sequence number if the inode's bit is set in the
+ * current buffer, the previous sequence number if it is only set in the
+ * previous buffer, and 0 if it is set in neither - i.e. the journal sequence
+ * number that needs to be flushed for this inode, or 0 for none.
+ */
+u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
+{
+	size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
+	u64 seq;
+
+	/* unlocked check first: skip taking the lock if neither buf has the bit */
+	if (!(test_bit(h, j->buf[0].has_inode) ||
+	      test_bit(h, j->buf[1].has_inode)))
+		return 0;
+
+	spin_lock(&j->lock);
+
+	if (test_bit(h, journal_cur_buf(j)->has_inode))
+		seq = journal_cur_seq(j);
+	else if (test_bit(h, journal_prev_buf(j)->has_inode))
+		seq = journal_cur_seq(j) - 1;
+	else
+		seq = 0;
+
+	spin_unlock(&j->lock);
+
+	return seq;
+}
+
+/*
+ * One attempt at the slow path of getting a journal reservation.
+ *
+ * Tries the lockless fast path; if that fails, takes j->lock and tries to
+ * close the current (full) entry and open a new one, then retries the fast
+ * path.
+ *
+ * Returns 0 once a reservation was obtained, -EROFS if the journal is in the
+ * error state, -EAGAIN if the caller has to wait and retry (-ENOSPC is
+ * converted to -EAGAIN before returning), or any other error passed up from
+ * journal_entry_open().
+ */
+static int __journal_res_get(struct journal *j, struct journal_res *res,
+			     unsigned flags)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_buf *buf;
+	bool can_discard;
+	int ret;
+retry:
+	if (journal_res_get_fast(j, res, flags))
+		return 0;
+
+	if (bch2_journal_error(j))
+		return -EROFS;
+
+	spin_lock(&j->lock);
+
+	/*
+	 * Recheck after taking the lock, so we don't race with another thread
+	 * that just did journal_entry_open() and call journal_entry_close()
+	 * unnecessarily
+	 */
+	if (journal_res_get_fast(j, res, flags)) {
+		spin_unlock(&j->lock);
+		return 0;
+	}
+
+	if (!(flags & JOURNAL_RES_GET_RESERVED) &&
+	    !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+		/*
+		 * Don't want to close current journal entry, just need to
+		 * invoke reclaim:
+		 */
+		ret = -ENOSPC;
+		goto unlock;
+	}
+
+	/*
+	 * If we couldn't get a reservation because the current buf filled up,
+	 * and we had room for a bigger entry on disk, signal that we want to
+	 * realloc the journal bufs:
+	 */
+	buf = journal_cur_buf(j);
+	if (journal_entry_is_open(j) &&
+	    buf->buf_size >> 9 < buf->disk_sectors &&
+	    buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+		j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
+
+	if (journal_entry_is_open(j) &&
+	    !__journal_entry_close(j)) {
+		/*
+		 * We failed to get a reservation on the current open journal
+		 * entry because it's full, and we can't close it because
+		 * there's still a previous one in flight:
+		 */
+		trace_journal_entry_full(c);
+		ret = -EAGAIN;
+	} else {
+		ret = journal_entry_open(j);
+	}
+unlock:
+	/* start the "blocked" timer; "?: 1" keeps 0 meaning "not blocked": */
+	if ((ret == -EAGAIN || ret == -ENOSPC) &&
+	    !j->res_get_blocked_start)
+		j->res_get_blocked_start = local_clock() ?: 1;
+
+	/* sample under the lock, used after dropping it: */
+	can_discard = j->can_discard;
+	spin_unlock(&j->lock);
+
+	/* a new entry was opened - go back and take the reservation: */
+	if (!ret)
+		goto retry;
+
+	if (ret == -ENOSPC) {
+		BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED));
+
+		/*
+		 * Journal is full - can't rely on reclaim from work item due to
+		 * freezing:
+		 */
+		trace_journal_full(c);
+
+		if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
+			if (can_discard) {
+				bch2_journal_do_discards(j);
+				goto retry;
+			}
+
+			/* run reclaim ourselves unless someone else already is: */
+			if (mutex_trylock(&j->reclaim_lock)) {
+				bch2_journal_reclaim(j);
+				mutex_unlock(&j->reclaim_lock);
+			}
+		}
+
+		ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
+/*
+ * Essentially the entry function to the journaling code. When bcachefs is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * Journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the next
+ * write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ *
+ * Repeats __journal_res_get() via closure_wait_event() on j->async_wait
+ * until it returns something other than -EAGAIN, or after the first attempt
+ * if JOURNAL_RES_GET_NONBLOCK is set in @flags (in which case -EAGAIN may be
+ * returned to the caller).
+ */
+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+				  unsigned flags)
+{
+	int ret;
+
+	closure_wait_event(&j->async_wait,
+		   (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
+		   (flags & JOURNAL_RES_GET_NONBLOCK));
+	return ret;
+}
+
+/* journal_preres: */
+
+/*
+ * Try to take a pre-reservation through the fast path. On failure, run
+ * journal reclaim synchronously in the caller's context before reporting
+ * false, so that a retry has a chance of succeeding.
+ */
+static bool journal_preres_available(struct journal *j,
+				     struct journal_preres *res,
+				     unsigned new_u64s)
+{
+	if (bch2_journal_preres_get_fast(j, res, new_u64s))
+		return true;
+
+	bch2_journal_reclaim_work(&j->reclaim_work.work);
+	return false;
+}
+
+/*
+ * Slow path for taking a journal pre-reservation: wait on j->preres_wait
+ * until either the journal reports an error or the pre-reservation can be
+ * taken.
+ *
+ * Returns the journal error if there is one, 0 otherwise (the error check
+ * comes first in the condition, so ret is 0 whenever the reservation
+ * succeeded).
+ */
+int __bch2_journal_preres_get(struct journal *j,
+			      struct journal_preres *res,
+			      unsigned new_u64s)
+{
+	int ret;
+
+	closure_wait_event(&j->preres_wait,
+		   (ret = bch2_journal_error(j)) ||
+		   journal_preres_available(j, res, new_u64s));
+	return ret;
+}
+
+/* journal_entry_res: */
+
+/*
+ * Change the number of u64s that @res reserves in every journal entry to
+ * @new_u64s.
+ *
+ * Shrinking (or no change) only adjusts the counters. Growing also has to
+ * take the extra space away from the currently open entry; if the entry has
+ * already used too much for that, it is closed instead.
+ */
+void bch2_journal_entry_res_resize(struct journal *j,
+				   struct journal_entry_res *res,
+				   unsigned new_u64s)
+{
+	union journal_res_state state;
+	/* delta, in u64s; negative when shrinking: */
+	int d = new_u64s - res->u64s;
+
+	spin_lock(&j->lock);
+
+	j->entry_u64s_reserved += d;
+	if (d <= 0)
+		goto out;
+
+	/* lower the limit first, then look at how much is already in use: */
+	j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
+	smp_mb();
+	state = READ_ONCE(j->reservations);
+
+	if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
+	    state.cur_entry_offset > j->cur_entry_u64s) {
+		/*
+		 * NOTE(review): if the max_t() above clamped at 0, this does
+		 * not restore the exact previous value - confirm that is
+		 * harmless given the entry is closed right below.
+		 */
+		j->cur_entry_u64s += d;
+		/*
+		 * Not enough room in current journal entry, have to flush it:
+		 */
+		__journal_entry_close(j);
+	} else {
+		journal_cur_buf(j)->u64s_reserved += d;
+	}
+out:
+	spin_unlock(&j->lock);
+	/* updated outside j->lock: */
+	res->u64s += d;
+}
+
+/* journal flushing: */
+
+/*
+ * Returns the current journal sequence number, or the one before it if the
+ * previous buffer has not been written yet. Sampled under j->lock.
+ */
+u64 bch2_journal_last_unwritten_seq(struct journal *j)
+{
+	u64 unwritten;
+
+	spin_lock(&j->lock);
+
+	unwritten = journal_cur_seq(j);
+	if (j->reservations.prev_buf_unwritten)
+		unwritten -= 1;
+
+	spin_unlock(&j->lock);
+
+	return unwritten;
+}
+
+/**
+ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
+ * open yet, or wait if we cannot
+ *
+ * used by the btree interior update machinery, when it needs to write a new
+ * btree root - every journal entry contains the roots of all the btrees, so it
+ * doesn't need to bother with getting a journal reservation
+ *
+ * Returns 0 if @seq is already in the past or an entry is (now) open,
+ * -EAGAIN if @cl was put on j->async_wait and the caller should retry once
+ * woken (-ENOSPC is converted to -EAGAIN after kicking reclaim), or another
+ * error from journal_entry_open().
+ */
+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	int ret;
+
+	spin_lock(&j->lock);
+
+	/*
+	 * Can't try to open more than one sequence number ahead:
+	 */
+	BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
+
+	if (journal_cur_seq(j) > seq ||
+	    journal_entry_is_open(j)) {
+		spin_unlock(&j->lock);
+		return 0;
+	}
+
+	/*
+	 * NOTE(review): given the BUG_ON() and the early return above, the
+	 * entry is closed and journal_cur_seq(j) == seq at this point, so the
+	 * first branch below looks unreachable - confirm.
+	 */
+	if (journal_cur_seq(j) < seq &&
+	    !__journal_entry_close(j)) {
+		/* haven't finished writing out the previous one: */
+		trace_journal_entry_full(c);
+		ret = -EAGAIN;
+	} else {
+		BUG_ON(journal_cur_seq(j) != seq);
+
+		ret = journal_entry_open(j);
+	}
+
+	/* start the "blocked" timer; "?: 1" keeps 0 meaning "not blocked": */
+	if ((ret == -EAGAIN || ret == -ENOSPC) &&
+	    !j->res_get_blocked_start)
+		j->res_get_blocked_start = local_clock() ?: 1;
+
+	/* queue the caller's closure while still holding j->lock: */
+	if (ret == -EAGAIN || ret == -ENOSPC)
+		closure_wait(&j->async_wait, cl);
+
+	spin_unlock(&j->lock);
+
+	if (ret == -ENOSPC) {
+		trace_journal_full(c);
+		/* run reclaim synchronously, then have the caller retry: */
+		bch2_journal_reclaim_work(&j->reclaim_work.work);
+		ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
+/*
+ * Error status for journal entry @seq:
+ *  - for the current entry, whatever bch2_journal_error() reports;
+ *  - for the immediately preceding entry, -EIO if its buffer is no longer
+ *    marked unwritten yet @seq is still newer than j->seq_ondisk;
+ *  - 0 in every other case.
+ */
+static int journal_seq_error(struct journal *j, u64 seq)
+{
+	union journal_res_state state = READ_ONCE(j->reservations);
+
+	if (seq == journal_cur_seq(j))
+		return bch2_journal_error(j);
+
+	/* only the entry right before the current one can be judged here: */
+	if (seq + 1 != journal_cur_seq(j))
+		return 0;
+
+	if (state.prev_buf_unwritten)
+		return 0;
+
+	if (seq <= j->seq_ondisk)
+		return 0;
+
+	return -EIO;
+}
+
+/*
+ * Map journal sequence number @seq to the in-memory buffer holding it: the
+ * current buffer for the current sequence number, the previous buffer for
+ * the preceding one while that buffer is still unwritten, NULL otherwise.
+ *
+ * BUGs if @seq is in the future, or is the current sequence number while the
+ * current entry is in the closed state.
+ */
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+	/* seq must refer to a journal entry that has been opened: */
+	BUG_ON(seq > journal_cur_seq(j));
+	BUG_ON(seq == journal_cur_seq(j) &&
+	       j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+	if (seq == journal_cur_seq(j))
+		return journal_cur_buf(j);
+
+	if (seq + 1 != journal_cur_seq(j) ||
+	    !j->reservations.prev_buf_unwritten)
+		return NULL;
+
+	return journal_prev_buf(j);
+}
+
+/**
+ * bch2_journal_wait_on_seq - wait for a journal entry to be written
+ *
+ * Adds @parent to the wait list of the buffer that holds @seq, if that entry
+ * is still in memory; if it isn't, this is a no-op.
+ *
+ * This does _not_ trigger the write of @seq: absent other activity filling
+ * or flushing the entry, the wait can last for an arbitrary amount of time
+ * (up to @j->write_delay_ms, which is configurable).
+ */
+void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
+			      struct closure *parent)
+{
+	struct journal_buf *buf;
+
+	spin_lock(&j->lock);
+
+	buf = journal_seq_to_buf(j, seq);
+	if (!buf)
+		goto unlock;
+
+	if (!closure_wait(&buf->wait, parent))
+		BUG();
+
+	if (seq == journal_cur_seq(j)) {
+		/*
+		 * If the journal is in the error state, wake the list we just
+		 * joined so the waiter isn't left sleeping:
+		 */
+		smp_mb();
+		if (bch2_journal_error(j))
+			closure_wake_up(&buf->wait);
+	}
+unlock:
+	spin_unlock(&j->lock);
+}
+
+/**
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ *
+ * Like bch2_journal_wait_on_seq(), except that if @seq is the current entry
+ * it is closed right away so that it gets written. @parent may be NULL, in
+ * which case nothing is queued and only the close happens.
+ */
+void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+				  struct closure *parent)
+{
+	struct journal_buf *buf = NULL;
+
+	spin_lock(&j->lock);
+
+	/* the buffer lookup is only done when there is someone to queue: */
+	if (parent)
+		buf = journal_seq_to_buf(j, seq);
+
+	if (buf && !closure_wait(&buf->wait, parent))
+		BUG();
+
+	if (seq == journal_cur_seq(j))
+		__journal_entry_close(j);
+
+	spin_unlock(&j->lock);
+}
+
+/*
+ * Wait condition for bch2_journal_flush_seq(): returns 1 once @seq is on
+ * disk, a negative error if journal_seq_error() reports one, and 0 while the
+ * caller should keep waiting. If @seq is the current entry, it is closed as
+ * a side effect so that it gets written.
+ */
+static int journal_seq_flushed(struct journal *j, u64 seq)
+{
+	int status;
+
+	spin_lock(&j->lock);
+
+	if (seq <= j->seq_ondisk)
+		status = 1;
+	else
+		status = journal_seq_error(j, seq);
+
+	if (seq == journal_cur_seq(j))
+		__journal_entry_close(j);
+
+	spin_unlock(&j->lock);
+
+	return status;
+}
+
+/*
+ * Synchronously wait (killable) until journal entry @seq has been written,
+ * and account the time spent to j->flush_seq_time.
+ *
+ * Returns a nonzero wait_event_killable() result if the wait was
+ * interrupted, else the negative error from journal_seq_flushed() if there
+ * was one, else 0.
+ */
+int bch2_journal_flush_seq(struct journal *j, u64 seq)
+{
+	u64 start_time = local_clock();
+	int wait_ret, flushed;
+
+	wait_ret = wait_event_killable(j->wait,
+			(flushed = journal_seq_flushed(j, seq)));
+
+	bch2_time_stats_update(j->flush_seq_time, start_time);
+
+	if (wait_ret)
+		return wait_ret;
+
+	/* a positive value just means "flushed": */
+	return flushed < 0 ? flushed : 0;
+}
+
+/**
+ * bch2_journal_meta_async - force a journal entry to be written
+ *
+ * Takes and immediately releases an empty (jset_u64s(0)) journal reservation
+ * to find out which entry is current, then asks for that entry to be flushed,
+ * with @parent (which may be NULL) waiting on the write.
+ *
+ * If the reservation cannot be obtained there is no entry to flush, so this
+ * returns without queuing @parent - as with bch2_journal_meta(), the failed
+ * reservation is neither released nor used.
+ */
+void bch2_journal_meta_async(struct journal *j, struct closure *parent)
+{
+	struct journal_res res;
+
+	memset(&res, 0, sizeof(res));
+
+	/*
+	 * Don't put a reservation we never got, and don't flush the bogus
+	 * res.seq == 0 that a failed get leaves behind:
+	 */
+	if (bch2_journal_res_get(j, &res, jset_u64s(0), 0))
+		return;
+
+	bch2_journal_res_put(j, &res);
+
+	bch2_journal_flush_seq_async(j, res.seq, parent);
+}
+
+/*
+ * Synchronous counterpart of bch2_journal_meta_async(): take and release an
+ * empty journal reservation, then wait for the entry it landed in to be
+ * written.
+ *
+ * Returns the error from getting the reservation, or otherwise the result of
+ * bch2_journal_flush_seq().
+ */
+int bch2_journal_meta(struct journal *j)
+{
+	struct journal_res res;
+	int err;
+
+	memset(&res, 0, sizeof(res));
+
+	err = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+	if (!err) {
+		bch2_journal_res_put(j, &res);
+		err = bch2_journal_flush_seq(j, res.seq);
+	}
+
+	return err;
+}
+
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * still being written, write it and wait for the write to complete
+ *
+ * Flushes the current sequence number if an entry is open, otherwise the
+ * previous one; does nothing when the current sequence number is 0 and no
+ * entry is open.
+ */
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+	bool have_seq = true;
+	u64 seq;
+
+	spin_lock(&j->lock);
+
+	seq = journal_cur_seq(j);
+	if (!journal_entry_is_open(j)) {
+		if (seq)
+			seq--;
+		else
+			have_seq = false;
+	}
+
+	spin_unlock(&j->lock);
+
+	if (!have_seq)
+		return;
+
+	bch2_journal_flush_seq_async(j, seq, parent);
+}
+
+/*
+ * Synchronous counterpart of bch2_journal_flush_async(): pick the current
+ * sequence number if an entry is open, otherwise the previous one, and wait
+ * for it with bch2_journal_flush_seq().
+ *
+ * Returns 0 without waiting when the current sequence number is 0 and no
+ * entry is open.
+ */
+int bch2_journal_flush(struct journal *j)
+{
+	bool have_seq = true;
+	u64 seq;
+
+	spin_lock(&j->lock);
+
+	seq = journal_cur_seq(j);
+	if (!journal_entry_is_open(j)) {
+		if (seq)
+			seq--;
+		else
+			have_seq = false;
+	}
+
+	spin_unlock(&j->lock);
+
+	if (!have_seq)
+		return 0;
+
+	return bch2_journal_flush_seq(j, seq);
+}
+
+/* block/unblock the journal: */
+
+/*
+ * Drop one bch2_journal_block() reference (j->blocked is a counter, modified
+ * under j->lock) and wake journal waiters. journal_entry_open() returns
+ * -EAGAIN for as long as j->blocked is nonzero.
+ */
+void bch2_journal_unblock(struct journal *j)
+{
+	spin_lock(&j->lock);
+	j->blocked--;
+	spin_unlock(&j->lock);
+
+	journal_wake(j);
+}
+
+/*
+ * Take a block reference on the journal (j->blocked++ under j->lock), which
+ * makes journal_entry_open() return -EAGAIN, then wait until the journal has
+ * quiesced: no entry open and no previous buffer still unwritten.
+ */
+void bch2_journal_block(struct journal *j)
+{
+	spin_lock(&j->lock);
+	j->blocked++;
+	spin_unlock(&j->lock);
+
+	journal_quiesce(j);
+}
+
+/* allocate journal on a device: */
+
+/*
+ * Grow the journal on device @ca to @nr buckets.
+ *
+ * @new_fs: allocate buckets with bch2_bucket_alloc_new_fs() instead of the
+ *	    regular allocator
+ * @cl:     closure passed to bch2_bucket_alloc(); if non-NULL, an allocation
+ *	    failure is reported as -EAGAIN rather than -ENOSPC
+ *
+ * Shrinking is not supported: @nr <= the current count is a successful no-op.
+ *
+ * Returns 0 on success, -ENOMEM if the new arrays or the superblock field
+ * could not be (re)allocated, -ENOSPC/-EAGAIN if a bucket could not be
+ * allocated. On a bucket allocation failure, buckets that were already added
+ * stay in place (ja->nr reflects them).
+ */
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+					 bool new_fs, struct closure *cl)
+{
+	struct bch_fs *c = ca->fs;
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets;
+	u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+	int ret = 0;
+
+	/* don't handle reducing nr of buckets yet: */
+	if (nr <= ja->nr)
+		return 0;
+
+	ret = -ENOMEM;
+	/* kcalloc() checks nr * sizeof(u64) for overflow, unlike an open-coded multiply: */
+	new_buckets	= kcalloc(nr, sizeof(u64), GFP_KERNEL);
+	new_bucket_seq	= kcalloc(nr, sizeof(u64), GFP_KERNEL);
+	if (!new_buckets || !new_bucket_seq)
+		goto err;
+
+	journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
+						 nr + sizeof(*journal_buckets) / sizeof(u64));
+	if (!journal_buckets)
+		goto err;
+
+	/*
+	 * We may be called from the device add path, before the new device has
+	 * actually been added to the running filesystem:
+	 */
+	if (c)
+		spin_lock(&c->journal.lock);
+
+	/*
+	 * Copy the existing entries into the bigger arrays and swap them in;
+	 * after the swap new_buckets/new_bucket_seq point at the old arrays,
+	 * which is what gets freed at the end:
+	 */
+	memcpy(new_buckets,	ja->buckets,	ja->nr * sizeof(u64));
+	memcpy(new_bucket_seq,	ja->bucket_seq,	ja->nr * sizeof(u64));
+	swap(new_buckets,	ja->buckets);
+	swap(new_bucket_seq,	ja->bucket_seq);
+
+	if (c)
+		spin_unlock(&c->journal.lock);
+
+	while (ja->nr < nr) {
+		struct open_bucket *ob = NULL;
+		unsigned pos;
+		long bucket;
+
+		if (new_fs) {
+			bucket = bch2_bucket_alloc_new_fs(ca);
+			if (bucket < 0) {
+				ret = -ENOSPC;
+				goto err;
+			}
+		} else {
+			ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
+					       false, cl);
+			if (IS_ERR(ob)) {
+				ret = cl ? -EAGAIN : -ENOSPC;
+				goto err;
+			}
+
+			bucket = sector_to_bucket(ca, ob->ptr.offset);
+		}
+
+		if (c) {
+			percpu_down_read(&c->mark_lock);
+			spin_lock(&c->journal.lock);
+		}
+
+		/* insert the new bucket right after the current one: */
+		pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
+		__array_insert_item(ja->buckets,		ja->nr, pos);
+		__array_insert_item(ja->bucket_seq,		ja->nr, pos);
+		__array_insert_item(journal_buckets->buckets,	ja->nr, pos);
+		ja->nr++;
+
+		ja->buckets[pos] = bucket;
+		ja->bucket_seq[pos] = 0;
+		journal_buckets->buckets[pos] = cpu_to_le64(bucket);
+
+		/* every index at or past the insertion point shifts up by one: */
+		if (pos <= ja->discard_idx)
+			ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
+		if (pos <= ja->dirty_idx_ondisk)
+			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
+		if (pos <= ja->dirty_idx)
+			ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
+		if (pos <= ja->cur_idx)
+			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+
+		bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
+					  ca->mi.bucket_size,
+					  gc_phase(GC_PHASE_SB),
+					  0);
+
+		if (c) {
+			spin_unlock(&c->journal.lock);
+			percpu_up_read(&c->mark_lock);
+		}
+
+		if (!new_fs)
+			bch2_open_bucket_put(c, ob);
+	}
+
+	ret = 0;
+err:
+	kfree(new_bucket_seq);
+	kfree(new_buckets);
+
+	return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works.
+ *
+ * Grows the journal on @ca to @nr buckets, under c->sb_lock, retrying for as
+ * long as the allocator asks us to wait (-EAGAIN). The superblock is written
+ * out whenever the number of journal buckets actually changed.
+ *
+ * Shrinking is not supported: if @nr is not larger than the current number
+ * of buckets this is a no-op that returns 0.
+ *
+ * Returns 0 on success, -ENOSPC if the disk reservation could not be
+ * obtained, or the error from __bch2_set_nr_journal_buckets().
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+				unsigned nr)
+{
+	struct journal_device *ja = &ca->journal;
+	struct closure cl;
+	unsigned current_nr;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	do {
+		struct disk_reservation disk_res = { 0, 0 };
+
+		/* wait for whatever the previous iteration queued us on: */
+		closure_sync(&cl);
+
+		mutex_lock(&c->sb_lock);
+		current_nr = ja->nr;
+
+		/*
+		 * Nothing to grow. This has to be checked before computing
+		 * nr - ja->nr below: both are unsigned, so the subtraction
+		 * would wrap around and we'd ask for an enormous reservation.
+		 */
+		if (nr <= current_nr) {
+			mutex_unlock(&c->sb_lock);
+			return 0;
+		}
+
+		/*
+		 * note: journal buckets aren't really counted as _sectors_ used yet, so
+		 * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+		 * when space used goes up without a reservation - but we do need the
+		 * reservation to ensure we'll actually be able to allocate:
+		 */
+
+		if (bch2_disk_reservation_get(c, &disk_res,
+					      bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+			mutex_unlock(&c->sb_lock);
+			return -ENOSPC;
+		}
+
+		ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+		bch2_disk_reservation_put(c, &disk_res);
+
+		/* persist partial progress too, not just complete success: */
+		if (ja->nr != current_nr)
+			bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+	} while (ret == -EAGAIN);
+
+	return ret;
+}
+
+/*
+ * Allocate the initial journal for device @ca using the new-filesystem
+ * bucket allocator.
+ *
+ * The size is 1/256th of the device's buckets, clamped to at least
+ * BCH_JOURNAL_BUCKETS_MIN and at most 1024 buckets or 512MB worth of
+ * buckets, whichever is smaller.
+ *
+ * Returns -ENOMEM if the "bcachefs:add:journal_alloc" fault injection point
+ * fires, otherwise the result of __bch2_set_nr_journal_buckets().
+ */
+int bch2_dev_journal_alloc(struct bch_dev *ca)
+{
+	unsigned nr_max, nr;
+
+	if (dynamic_fault("bcachefs:add:journal_alloc"))
+		return -ENOMEM;
+
+	/* upper bound: 1024 buckets, or 1 << 20 sectors (512MB) of buckets */
+	nr_max = min(1 << 10,
+		     (1 << 20) / ca->mi.bucket_size);
+
+	nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+		     BCH_JOURNAL_BUCKETS_MIN,
+		     nr_max);
+
+	return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+}
+
+/* startup/shutdown: */
+
+/*
+ * True if the previous journal buffer is still unwritten and its key has a
+ * pointer to device @dev_idx. Evaluated under j->lock.
+ */
+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
+{
+	union journal_res_state state;
+	struct journal_buf *prev;
+	bool writing = false;
+
+	spin_lock(&j->lock);
+
+	state = READ_ONCE(j->reservations);
+	/* state.idx selects the current buffer; the other one is the previous: */
+	prev = &j->buf[!state.idx];
+
+	if (state.prev_buf_unwritten)
+		writing = bch2_extent_has_device(bkey_i_to_s_c_extent(&prev->key),
+						 dev_idx);
+
+	spin_unlock(&j->lock);
+
+	return writing;
+}
+
+/*
+ * Sleep on j->wait until there is no unwritten previous journal buffer whose
+ * key points at device @ca.
+ */
+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
+{
+	wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
+}
+
+/*
+ * Shut the journal down: flush all pins, close the current entry, write one
+ * more entry if the last one was non-empty or btree roots are dirty, wait
+ * for the journal to quiesce, and finally cancel the delayed write and
+ * reclaim work.
+ */
+void bch2_fs_journal_stop(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+	bch2_journal_flush_all_pins(j);
+
+	/* retried on each wakeup until the current entry could be closed: */
+	wait_event(j->wait, journal_entry_close(j));
+
+	/* do we need to write another journal entry? */
+	if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
+	    c->btree_roots_dirty)
+		bch2_journal_meta(j);
+
+	journal_quiesce(j);
+
+	/* unless the journal errored out, the last entry written must be empty: */
+	BUG_ON(!bch2_journal_error(j) &&
+	       test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+
+	cancel_delayed_work_sync(&j->write_work);
+	cancel_delayed_work_sync(&j->reclaim_work);
+}
+
+/*
+ * Bring the in-memory journal up at sequence number @cur_seq.
+ *
+ * @journal_entries is a list of struct journal_replay. If it is non-empty the
+ * sequence number of its first entry becomes last_seq, otherwise last_seq is
+ * @cur_seq. The pin fifo is reallocated when it has fewer than
+ * (cur_seq - last_seq) + 1 slots.
+ *
+ * Returns 0 on success, -ENOMEM if the pin fifo could not be reallocated.
+ */
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
+			  struct list_head *journal_entries)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_entry_pin_list *p;
+	struct journal_replay *i;
+	u64 last_seq = cur_seq, nr, seq;
+
+	if (!list_empty(journal_entries))
+		last_seq = le64_to_cpu(list_first_entry(journal_entries,
+							struct journal_replay,
+							list)->j.seq);
+
+	/* number of sequence numbers that still have to be replayed: */
+	nr = cur_seq - last_seq;
+
+	if (nr + 1 > j->pin.size) {
+		free_fifo(&j->pin);
+		init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
+		if (!j->pin.data) {
+			bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+			return -ENOMEM;
+		}
+	}
+
+	j->replay_journal_seq	= last_seq;
+	j->replay_journal_seq_end = cur_seq;
+	j->last_seq_ondisk	= last_seq;
+	j->pin.front		= last_seq;
+	j->pin.back		= cur_seq;
+	/* journal_cur_seq() asserts that j->seq == j->pin.back - 1: */
+	atomic64_set(&j->seq, cur_seq - 1);
+
+	/* every pin list currently in the fifo starts empty, with a count of 1: */
+	fifo_for_each_entry_ptr(p, &j->pin, seq) {
+		INIT_LIST_HEAD(&p->list);
+		INIT_LIST_HEAD(&p->flushed);
+		atomic_set(&p->count, 1);
+		p->devs.nr = 0;
+	}
+
+	/* copy each replay entry's device list into the pin list for its seq: */
+	list_for_each_entry(i, journal_entries, list) {
+		seq = le64_to_cpu(i->j.seq);
+
+		BUG_ON(seq < last_seq || seq >= cur_seq);
+
+		journal_seq_pin(j, seq)->devs = i->devs;
+	}
+
+	spin_lock(&j->lock);
+
+	set_bit(JOURNAL_STARTED, &j->flags);
+
+	journal_pin_new_entry(j, 1);
+	bch2_journal_buf_init(j);
+
+	c->last_bucket_seq_cleanup = journal_cur_seq(j);
+
+	bch2_journal_space_available(j);
+	spin_unlock(&j->lock);
+
+	return 0;
+}
+
+/* init/exit: */
+
+/*
+ * Free the per-device journal allocations (bio, bucket array, per-bucket
+ * sequence numbers) and clear the pointers so that a repeated call is
+ * harmless.
+ */
+void bch2_dev_journal_exit(struct bch_dev *ca)
+{
+	struct journal_device *ja = &ca->journal;
+
+	kfree(ja->bio);
+	ja->bio		= NULL;
+
+	kfree(ja->buckets);
+	ja->buckets	= NULL;
+
+	kfree(ja->bucket_seq);
+	ja->bucket_seq	= NULL;
+}
+
+/*
+ * Set up the in-memory journal state of device @ca from the journal field of
+ * superblock @sb: allocate the per-bucket sequence number array, the journal
+ * bio and the bucket array, then copy the bucket numbers out of the
+ * superblock.
+ *
+ * Returns 0 on success or -ENOMEM. On failure, earlier allocations are not
+ * freed here.
+ * NOTE(review): presumably the caller cleans up via bch2_dev_journal_exit() -
+ * confirm.
+ */
+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
+{
+	struct bch_sb_field_journal *sb_journal = bch2_sb_get_journal(sb);
+	struct journal_device *ja = &ca->journal;
+	unsigned idx;
+
+	ja->nr = bch2_nr_journal_buckets(sb_journal);
+
+	ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->bucket_seq)
+		return -ENOMEM;
+
+	/* bio sized for the largest possible journal entry: */
+	ja->bio = bio_kmalloc(GFP_KERNEL,
+			      DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
+	if (!ja->bio)
+		return -ENOMEM;
+
+	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->buckets)
+		return -ENOMEM;
+
+	for (idx = 0; idx < ja->nr; idx++)
+		ja->buckets[idx] = le64_to_cpu(sb_journal->buckets[idx]);
+
+	return 0;
+}
+
+/*
+ * Release what bch2_fs_journal_init() allocated: both journal buffers and the
+ * pin fifo.
+ */
+void bch2_fs_journal_exit(struct journal *j)
+{
+	kvpfree(j->buf[1].data, j->buf[1].buf_size);
+	kvpfree(j->buf[0].data, j->buf[0].buf_size);
+	free_fifo(&j->pin);
+}
+
+/*
+ * Initialize a journal: locks, waitqueues, delayed work items, default
+ * delays and buffer sizes, the reservation state, the pin fifo and both
+ * journal buffers.
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails. Nothing is freed
+ * here on failure.
+ * NOTE(review): presumably the caller cleans up via bch2_fs_journal_exit() -
+ * confirm.
+ */
+int bch2_fs_journal_init(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	static struct lock_class_key res_key;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
+
+	spin_lock_init(&j->lock);
+	spin_lock_init(&j->err_lock);
+	init_waitqueue_head(&j->wait);
+	INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+	INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
+	init_waitqueue_head(&j->pin_flush_wait);
+	mutex_init(&j->reclaim_lock);
+	mutex_init(&j->discard_lock);
+
+	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+	/* both buffers start at the minimum journal entry size: */
+	j->buf[0].buf_size	= JOURNAL_ENTRY_SIZE_MIN;
+	j->buf[1].buf_size	= JOURNAL_ENTRY_SIZE_MIN;
+	j->write_delay_ms	= 1000;
+	j->reclaim_delay_ms	= 100;
+
+	/* Btree roots: */
+	j->entry_u64s_reserved +=
+		BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
+
+	/* the reservation state starts out with the current entry closed: */
+	atomic64_set(&j->reservations.counter,
+		((union journal_res_state)
+		 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	    !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
+	    !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	j->pin.front = j->pin.back = 1;
+out:
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
+}
+
+/* debug: */
+
+/*
+ * Format a human readable summary of the journal's state into @buf, which is
+ * treated as PAGE_SIZE bytes long.
+ *
+ * Prints the global sequence numbers and reservation state, followed by one
+ * section for each device in c->rw_devs[BCH_DATA_JOURNAL] that has journal
+ * buckets. The whole summary is produced under rcu_read_lock() and j->lock.
+ *
+ * Returns the number of bytes written (out.pos - buf).
+ */
+ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	union journal_res_state s;
+	struct bch_dev *ca;
+	unsigned iter;
+
+	rcu_read_lock();
+	spin_lock(&j->lock);
+	s = READ_ONCE(j->reservations);
+
+	pr_buf(&out,
+	       "active journal entries:\t%llu\n"
+	       "seq:\t\t\t%llu\n"
+	       "last_seq:\t\t%llu\n"
+	       "last_seq_ondisk:\t%llu\n"
+	       "prereserved:\t\t%u/%u\n"
+	       "current entry sectors:\t%u\n"
+	       "current entry:\t\t",
+	       fifo_used(&j->pin),
+	       journal_cur_seq(j),
+	       journal_last_seq(j),
+	       j->last_seq_ondisk,
+	       j->prereserved.reserved,
+	       j->prereserved.remaining,
+	       j->cur_entry_sectors);
+
+	/* cur_entry_offset doubles as a state marker for error/closed: */
+	switch (s.cur_entry_offset) {
+	case JOURNAL_ENTRY_ERROR_VAL:
+		pr_buf(&out, "error\n");
+		break;
+	case JOURNAL_ENTRY_CLOSED_VAL:
+		pr_buf(&out, "closed\n");
+		break;
+	default:
+		pr_buf(&out, "%u/%u\n",
+		       s.cur_entry_offset,
+		       j->cur_entry_u64s);
+		break;
+	}
+
+	pr_buf(&out,
+	       "current entry refs:\t%u\n"
+	       "prev entry unwritten:\t",
+	       journal_state_count(s, s.idx));
+
+	if (s.prev_buf_unwritten)
+		pr_buf(&out, "yes, ref %u sectors %u\n",
+		       journal_state_count(s, !s.idx),
+		       journal_prev_buf(j)->sectors);
+	else
+		pr_buf(&out, "no\n");
+
+	pr_buf(&out,
+	       "need write:\t\t%i\n"
+	       "replay done:\t\t%i\n",
+	       test_bit(JOURNAL_NEED_WRITE,	&j->flags),
+	       test_bit(JOURNAL_REPLAY_DONE,	&j->flags));
+
+	for_each_member_device_rcu(ca, c, iter,
+				   &c->rw_devs[BCH_DATA_JOURNAL]) {
+		struct journal_device *ja = &ca->journal;
+
+		/* devices without journal buckets have nothing to report: */
+		if (!ja->nr)
+			continue;
+
+		pr_buf(&out,
+		       "dev %u:\n"
+		       "\tnr\t\t%u\n"
+		       "\tavailable\t%u:%u\n"
+		       "\tdiscard_idx\t\t%u\n"
+		       "\tdirty_idx_ondisk\t%u (seq %llu)\n"
+		       "\tdirty_idx\t\t%u (seq %llu)\n"
+		       "\tcur_idx\t\t%u (seq %llu)\n",
+		       iter, ja->nr,
+		       bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
+		       ja->sectors_free,
+		       ja->discard_idx,
+		       ja->dirty_idx_ondisk,	ja->bucket_seq[ja->dirty_idx_ondisk],
+		       ja->dirty_idx,		ja->bucket_seq[ja->dirty_idx],
+		       ja->cur_idx,		ja->bucket_seq[ja->cur_idx]);
+	}
+
+	spin_unlock(&j->lock);
+	rcu_read_unlock();
+
+	return out.pos - buf;
+}
+
+/*
+ * Format the contents of the journal pin fifo into @buf, which is treated as
+ * PAGE_SIZE bytes long.
+ *
+ * For every pin list in the fifo this prints its sequence number and count,
+ * then each pin on its ->list, then (under a "flushed:" heading) each pin on
+ * its ->flushed list. Runs under j->lock.
+ *
+ * Returns the number of bytes written (out.pos - buf).
+ */
+ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	struct journal_entry_pin_list *pin_list;
+	struct journal_entry_pin *pin;
+	u64 i;
+
+	spin_lock(&j->lock);
+	fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
+		pr_buf(&out, "%llu: count %u\n",
+		       i, atomic_read(&pin_list->count));
+
+		/* each pin is printed as its address and its flush callback: */
+		list_for_each_entry(pin, &pin_list->list, list)
+			pr_buf(&out, "\t%p %pf\n",
+			       pin, pin->flush);
+
+		if (!list_empty(&pin_list->flushed))
+			pr_buf(&out, "flushed:\n");
+
+		list_for_each_entry(pin, &pin_list->flushed, list)
+			pr_buf(&out, "\t%p %pf\n",
+			       pin, pin->flush);
+	}
+	spin_unlock(&j->lock);
+
+	return out.pos - buf;
+}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
new file mode 100644
index 000000000000..f0da2c52581c
--- /dev/null
+++ b/fs/bcachefs/journal.h
@@ -0,0 +1,495 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_H
+#define _BCACHEFS_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in the same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal
+ * write to complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after a delay has elapsed - 1000 ms by default (the
+ * write_delay_ms field in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->bucket_seq) of, for each journal bucket, the highest
+ * sequence number of any journal entry it contains. Then, by comparing that
+ * against last_seq we can determine whether that journal bucket contains dirty
+ * journal entries or not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
+
+#include <linux/hash.h>
+
+#include "journal_types.h"
+
+struct bch_fs;
+
+/*
+ * Wake up everything that may be waiting on the journal: the j->wait
+ * waitqueue and the closures waiting on j->async_wait and j->preres_wait.
+ */
+static inline void journal_wake(struct journal *j)
+{
+	wake_up(&j->wait);
+	closure_wake_up(&j->async_wait);
+	closure_wake_up(&j->preres_wait);
+}
+
+/* The journal buffer selected by the current reservation state's idx. */
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+	return &j->buf[j->reservations.idx];
+}
+
+/* The other journal buffer, i.e. the one journal_cur_buf() does not return. */
+static inline struct journal_buf *journal_prev_buf(struct journal *j)
+{
+	return &j->buf[!j->reservations.idx];
+}
+
+/*
+ * Sequence number of oldest dirty journal entry: this is the front of the
+ * pin fifo.
+ */
+
+static inline u64 journal_last_seq(struct journal *j)
+{
+	return j->pin.front;
+}
+
+/*
+ * Sequence number of the current journal entry: one less than the back of the
+ * pin fifo. BUG()s if that disagrees with the atomic j->seq counter.
+ */
+static inline u64 journal_cur_seq(struct journal *j)
+{
+	BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+	return j->pin.back - 1;
+}
+
+u64 bch2_inode_journal_seq(struct journal *, u64);
+
+/*
+ * Reference count held in reservation state @s for buffer @idx: buf0_count
+ * when @idx is 0, buf1_count for any other value.
+ */
+static inline int journal_state_count(union journal_res_state s, int idx)
+{
+	if (idx == 0)
+		return s.buf0_count;
+
+	return s.buf1_count;
+}
+
+/*
+ * Take one reference, in @s, on the buffer that s->idx currently selects.
+ * Only the local copy @s is modified; publishing it is up to the caller.
+ */
+static inline void journal_state_inc(union journal_res_state *s)
+{
+	if (s->idx == 0)
+		s->buf0_count += 1;
+
+	if (s->idx == 1)
+		s->buf1_count += 1;
+}
+
+/*
+ * Mark inode number @inum in the has_inode bitmap of the journal buffer that
+ * reservation @res refers to. The bit index is a hash of @inum sized to the
+ * bitmap, so different inode numbers may share a bit.
+ */
+static inline void bch2_journal_set_has_inode(struct journal *j,
+					      struct journal_res *res,
+					      u64 inum)
+{
+	struct journal_buf *jbuf = j->buf + res->idx;
+	unsigned long nr = hash_64(inum, ilog2(sizeof(jbuf->has_inode) * 8));
+
+	/* test first, so the atomic set_bit() is only issued when needed */
+	if (likely(test_bit(nr, jbuf->has_inode)))
+		return;
+
+	set_bit(nr, jbuf->has_inode);
+}
+
+/*
+ * Amount of space, in u64s, that @u64s u64s of keys will take up in the
+ * journal: the keys themselves plus one struct jset_entry header.
+ */
+static inline unsigned jset_u64s(unsigned u64s)
+{
+	unsigned entry_header_u64s = sizeof(struct jset_entry) / sizeof(u64);
+
+	return u64s + entry_header_u64s;
+}
+
+/*
+ * Per journal entry overhead in u64s: the struct jset header plus the u64s
+ * the journal keeps reserved in every entry (j->entry_u64s_reserved).
+ */
+static inline int journal_entry_overhead(struct journal *j)
+{
+	unsigned jset_header_u64s = sizeof(struct jset) / sizeof(u64);
+
+	return jset_header_u64s + j->entry_u64s_reserved;
+}
+
+/*
+ * Append a new jset_entry with room for @u64s u64s of payload to the end of
+ * the jset in @buf, without going through a journal reservation.
+ *
+ * The entry header is zeroed and its u64s field set; the jset's own u64s
+ * count is grown by jset_u64s(@u64s). Returns the new entry - filling in the
+ * remaining header fields and the payload is left to the caller.
+ */
+static inline struct jset_entry *
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
+{
+	struct jset *js = buf->data;
+	struct jset_entry *tail = vstruct_idx(js, le32_to_cpu(js->u64s));
+
+	memset(tail, 0, sizeof(*tail));
+	tail->u64s = cpu_to_le16(u64s);
+
+	/* account for the new entry in the jset: */
+	le32_add_cpu(&js->u64s, jset_u64s(u64s));
+
+	return tail;
+}
+
+/*
+ * Write one jset_entry, carrying @u64s u64s copied from @data, into the space
+ * reserved by @res, and advance the reservation past it.
+ *
+ * The entry consumes jset_u64s(@u64s) u64s of the reservation: the payload
+ * plus the jset_entry header. @type, @id and @level are stored in the entry
+ * header.
+ */
+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+					  unsigned type, enum btree_id id,
+					  unsigned level,
+					  const void *data, unsigned u64s)
+{
+	struct journal_buf *jbuf = j->buf + res->idx;
+	struct jset_entry *dst = vstruct_idx(jbuf->data, res->offset);
+	unsigned consumed = jset_u64s(u64s);
+
+	EBUG_ON(!res->ref);
+	EBUG_ON(consumed > res->u64s);
+
+	/* header first, then the payload: */
+	memset(dst, 0, sizeof(*dst));
+	dst->type	= type;
+	dst->btree_id	= id;
+	dst->level	= level;
+	dst->u64s	= cpu_to_le16(u64s);
+	memcpy_u64s(dst->_data, data, u64s);
+
+	/* take the space we just used out of the reservation: */
+	res->u64s	-= consumed;
+	res->offset	+= consumed;
+}
+
+/*
+ * Add key @k for btree @id to the journal, in the space reserved by @res, as
+ * a BCH_JSET_ENTRY_btree_keys entry at level 0.
+ */
+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
+					enum btree_id id, const struct bkey_i *k)
+{
+	bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
+			       id, 0, k, k->k.u64s);
+}
+
+/*
+ * Returns true if journal entry @j has seq == last_seq and none of its
+ * btree_keys entries has a nonzero u64s count.
+ */
+static inline bool journal_entry_empty(struct jset *j)
+{
+	struct jset_entry *entry;
+
+	if (j->seq != j->last_seq)
+		return false;
+
+	vstruct_for_each(j, entry) {
+		/* only btree key entries count towards "not empty" */
+		if (entry->type != BCH_JSET_ENTRY_btree_keys)
+			continue;
+
+		if (entry->u64s)
+			return false;
+	}
+
+	return true;
+}
+
+void __bch2_journal_buf_put(struct journal *, bool);
+
+/*
+ * Drop one reference on journal buffer @idx in the reservation state.
+ *
+ * If that was the last reference, hand off to __bch2_journal_buf_put(),
+ * passing @need_write_just_set through. In that case the buffer must be the
+ * previous, still unwritten one (checked with EBUG_ON).
+ */
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
+				       bool need_write_just_set)
+{
+	union journal_res_state s;
+
+	/* atomically subtract 1 from whichever of buf0_count/buf1_count is @idx: */
+	s.v = atomic64_sub_return(((union journal_res_state) {
+				    .buf0_count = idx == 0,
+				    .buf1_count = idx == 1,
+				    }).v, &j->reservations.counter);
+	if (!journal_state_count(s, idx)) {
+		EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
+		__bch2_journal_buf_put(j, need_write_just_set);
+	}
+}
+
+/*
+ * This function releases the journal write structure so other threads can
+ * then proceed to add their keys as well.
+ *
+ * Does nothing if @res holds no reference. Any reserved space that was not
+ * used is filled with empty btree_keys entries before the buffer reference
+ * is dropped.
+ */
+static inline void bch2_journal_res_put(struct journal *j,
+				       struct journal_res *res)
+{
+	if (!res->ref)
+		return;
+
+	lock_release(&j->res_map, _THIS_IP_);
+
+	/* pad out the unused remainder of the reservation: */
+	while (res->u64s)
+		bch2_journal_add_entry(j, res,
+				       BCH_JSET_ENTRY_btree_keys,
+				       0, 0, NULL, 0);
+
+	bch2_journal_buf_put(j, res->idx, false);
+
+	res->ref = 0;
+}
+
+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
+				  unsigned);
+
+/* bch2_journal_preres_get(): return -EAGAIN instead of taking the slow path */
+#define JOURNAL_RES_GET_NONBLOCK	(1 << 0)
+/* journal_res_get_fast(): only check whether a reservation would succeed */
+#define JOURNAL_RES_GET_CHECK		(1 << 1)
+/* journal_res_get_fast(): skip the JOURNAL_MAY_GET_UNRESERVED check */
+#define JOURNAL_RES_GET_RESERVED	(1 << 2)
+
+/*
+ * Lockless fast path for getting a journal reservation of res->u64s u64s in
+ * the currently open journal entry.
+ *
+ * Returns 1 on success, with @res filled in (ref, idx, offset, seq). With
+ * JOURNAL_RES_GET_CHECK it returns 1 as soon as the checks pass, without
+ * taking anything. Returns 0 when the reservation cannot be satisfied here:
+ * not enough room in the current entry, or JOURNAL_MAY_GET_UNRESERVED is
+ * clear and JOURNAL_RES_GET_RESERVED was not passed.
+ */
+static inline int journal_res_get_fast(struct journal *j,
+				       struct journal_res *res,
+				       unsigned flags)
+{
+	union journal_res_state old, new;
+	u64 v = atomic64_read(&j->reservations.counter);
+
+	do {
+		old.v = new.v = v;
+
+		/*
+		 * Check if there is still room in the current journal
+		 * entry:
+		 */
+		if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
+			return 0;
+
+		EBUG_ON(!journal_state_count(new, new.idx));
+
+		if (!(flags & JOURNAL_RES_GET_RESERVED) &&
+		    !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
+			return 0;
+
+		if (flags & JOURNAL_RES_GET_CHECK)
+			return 1;
+
+		/* claim the space and take a ref on the current buffer: */
+		new.cur_entry_offset += res->u64s;
+		journal_state_inc(&new);
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
+
+	/* our space starts where the entry ended before we extended it: */
+	res->ref	= true;
+	res->idx	= old.idx;
+	res->offset	= old.cur_entry_offset;
+	res->seq	= le64_to_cpu(j->buf[old.idx].data->seq);
+	return 1;
+}
+
+/*
+ * Get a journal reservation of @u64s u64s, trying the lockless fast path
+ * first and falling back to bch2_journal_res_get_slowpath().
+ *
+ * Returns 0 on success, or the nonzero value returned by the slow path.
+ * Unless JOURNAL_RES_GET_CHECK was passed, a successful call leaves @res
+ * holding a reference that must be released with bch2_journal_res_put().
+ */
+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
+				       unsigned u64s, unsigned flags)
+{
+	EBUG_ON(res->ref);
+	EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+	res->u64s = u64s;
+
+	if (!journal_res_get_fast(j, res, flags)) {
+		int err = bch2_journal_res_get_slowpath(j, res, flags);
+
+		if (err)
+			return err;
+	}
+
+	if (!(flags & JOURNAL_RES_GET_CHECK)) {
+		lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
+		EBUG_ON(!res->ref);
+	}
+
+	return 0;
+}
+
+/* journal_preres: */
+
+/*
+ * Recompute whether unreserved journal reservations are currently allowed and
+ * bring the JOURNAL_MAY_GET_UNRESERVED flag in line with the answer.
+ *
+ * They are allowed when prereserved.reserved <= prereserved.remaining and the
+ * pin fifo has more than 8 free slots. Waiters are woken when the flag goes
+ * from clear to set. Must be called with j->lock held.
+ *
+ * Returns the newly computed value.
+ */
+static inline bool journal_check_may_get_unreserved(struct journal *j)
+{
+	union journal_preres_state s = READ_ONCE(j->prereserved);
+	bool may_get = s.reserved <= s.remaining &&
+		fifo_free(&j->pin) > 8;
+
+	lockdep_assert_held(&j->lock);
+
+	/* flag already matches: nothing to update */
+	if (may_get == test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
+		return may_get;
+
+	if (!may_get) {
+		clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
+	} else {
+		set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
+		journal_wake(j);
+	}
+
+	return may_get;
+}
+
+/*
+ * Give back a journal pre-reservation: subtract res->u64s from the global
+ * prereserved.reserved count, zero @res and wake j->preres_wait.
+ *
+ * Does nothing if @res holds no pre-reservation.
+ */
+static inline void bch2_journal_preres_put(struct journal *j,
+					   struct journal_preres *res)
+{
+	union journal_preres_state s = { .reserved = res->u64s };
+
+	if (!res->u64s)
+		return;
+
+	s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
+	res->u64s = 0;
+	closure_wake_up(&j->preres_wait);
+
+	/*
+	 * If releasing this may have made unreserved reservations possible
+	 * again, recheck under the lock:
+	 */
+	if (s.reserved <= s.remaining &&
+	    !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+		spin_lock(&j->lock);
+		journal_check_may_get_unreserved(j);
+		spin_unlock(&j->lock);
+	}
+}
+
+int __bch2_journal_preres_get(struct journal *,
+			struct journal_preres *, unsigned);
+
+/*
+ * Lockless fast path for resizing pre-reservation @res to @new_u64s.
+ *
+ * Adds the difference (new_u64s - res->u64s) to prereserved.reserved with a
+ * cmpxchg loop. Returns 1 on success, with res->u64s updated, or 0 - without
+ * changing anything - if that would push reserved above remaining.
+ */
+static inline int bch2_journal_preres_get_fast(struct journal *j,
+					       struct journal_preres *res,
+					       unsigned new_u64s)
+{
+	int d = new_u64s - res->u64s;
+	union journal_preres_state old, new;
+	u64 v = atomic64_read(&j->prereserved.counter);
+
+	do {
+		old.v = new.v = v;
+
+		new.reserved += d;
+
+		if (new.reserved > new.remaining)
+			return 0;
+	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
+				       old.v, new.v)) != old.v);
+
+	res->u64s += d;
+	return 1;
+}
+
+/*
+ * Make sure pre-reservation @res covers at least @new_u64s u64s.
+ *
+ * Returns 0 if @res is already big enough or the fast path succeeds. If the
+ * fast path fails, returns -EAGAIN when JOURNAL_RES_GET_NONBLOCK is set in
+ * @flags, otherwise the result of __bch2_journal_preres_get().
+ */
+static inline int bch2_journal_preres_get(struct journal *j,
+					  struct journal_preres *res,
+					  unsigned new_u64s,
+					  unsigned flags)
+{
+	if (new_u64s <= res->u64s ||
+	    bch2_journal_preres_get_fast(j, res, new_u64s))
+		return 0;
+
+	return flags & JOURNAL_RES_GET_NONBLOCK
+		? -EAGAIN
+		: __bch2_journal_preres_get(j, res, new_u64s);
+}
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *,
+				   struct journal_entry_res *,
+				   unsigned);
+
+u64 bch2_journal_last_unwritten_seq(struct journal *);
+int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
+
+void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
+void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch2_journal_flush_async(struct journal *, struct closure *);
+void bch2_journal_meta_async(struct journal *, struct closure *);
+
+int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush(struct journal *);
+int bch2_journal_meta(struct journal *);
+
+void bch2_journal_halt(struct journal *);
+
+/*
+ * Returns -EIO if the journal's reservation state holds the error marker
+ * (JOURNAL_ENTRY_ERROR_VAL), 0 otherwise.
+ */
+static inline int bch2_journal_error(struct journal *j)
+{
+	if (j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+		return -EIO;
+
+	return 0;
+}
+
+struct bch_dev;
+
+/* Currently returns true unconditionally; @ca is not examined. */
+static inline bool journal_flushes_device(struct bch_dev *ca)
+{
+	return true;
+}
+
+/*
+ * Set JOURNAL_REPLAY_DONE in the journal flags. BUG()s if the journal has
+ * not been started (JOURNAL_STARTED clear).
+ */
+static inline void bch2_journal_set_replay_done(struct journal *j)
+{
+	BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+	set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+}
+
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+
+ssize_t bch2_journal_print_debug(struct journal *, char *);
+ssize_t bch2_journal_print_pins(struct journal *, char *);
+
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+				unsigned nr);
+int bch2_dev_journal_alloc(struct bch_dev *);
+
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
+
+void bch2_fs_journal_stop(struct journal *);
+int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
+
+void bch2_dev_journal_exit(struct bch_dev *);
+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
+void bch2_fs_journal_exit(struct journal *);
+int bch2_fs_journal_init(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
new file mode 100644
index 000000000000..387377dadab5
--- /dev/null
+++ b/fs/bcachefs/journal_io.c
@@ -0,0 +1,1123 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+
+#include <trace/events/bcachefs.h>
+
+/*
+ * Context handed to journal_entry_add() while journal entries are collected
+ * for replay.
+ */
+struct journal_list {
+	struct closure		cl;
+	struct mutex		lock;
+	/* list of struct journal_replay, kept in ascending seq order */
+	struct list_head	*head;
+	int			ret;
+};
+
+/* return codes of journal_entry_add(): */
+#define JOURNAL_ENTRY_ADD_OK		0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ *
+ * The list is kept sorted by sequence number. Entries on the list that are
+ * older than the new entry's last_seq are dropped, and a new entry that is
+ * older than the last_seq of the newest entry on the list is not added at
+ * all. @ca, the device the entry was read from, is recorded in the entry's
+ * device list.
+ *
+ * Returns JOURNAL_ENTRY_ADD_OK, JOURNAL_ENTRY_ADD_OUT_OF_RANGE, -ENOMEM, or
+ * whatever the fsck_err_on() checks leave in ret when they jump to fsck_err.
+ */
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+			     struct journal_list *jlist, struct jset *j)
+{
+	struct journal_replay *i, *pos;
+	struct list_head *where;
+	size_t bytes = vstruct_bytes(j);
+	__le64 last_seq;
+	int ret;
+
+	/* last_seq of the newest entry on the list, or 0 if the list is empty: */
+	last_seq = !list_empty(jlist->head)
+		? list_last_entry(jlist->head, struct journal_replay,
+				  list)->j.last_seq
+		: 0;
+
+	/* Is this entry older than the range we need? */
+	if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
+		ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+		goto out;
+	}
+
+	/* Drop entries we don't need anymore */
+	list_for_each_entry_safe(i, pos, jlist->head, list) {
+		if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+			break;
+		list_del(&i->list);
+		kvpfree(i, offsetof(struct journal_replay, j) +
+			vstruct_bytes(&i->j));
+	}
+
+	/* walk newest to oldest, looking for a duplicate or the insert position: */
+	list_for_each_entry_reverse(i, jlist->head, list) {
+		/* Duplicate? */
+		if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+			fsck_err_on(bytes != vstruct_bytes(&i->j) ||
+				    memcmp(j, &i->j, bytes), c,
+				    "found duplicate but non identical journal entries (seq %llu)",
+				    le64_to_cpu(j->seq));
+			goto found;
+		}
+
+		if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+			where = &i->list;
+			goto add;
+		}
+	}
+
+	/* older than everything on the list (or list empty): goes at the front */
+	where = jlist->head;
+add:
+	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+	if (!i) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&i->list, where);
+	i->devs.nr = 0;
+	memcpy(&i->j, j, bytes);
+found:
+	/* i is now either the new copy or the existing duplicate: */
+	if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
+		bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
+	else
+		fsck_err_on(1, c, "duplicate journal entries on same device");
+	ret = JOURNAL_ENTRY_ADD_OK;
+out:
+fsck_err:
+	return ret;
+}
+
+/*
+ * Build the nonce for a journal entry: word 0 is zero, words 1 and 2 are the
+ * two 32 bit halves of jset->seq as stored (little endian), and word 3 is
+ * BCH_NONCE_JOURNAL.
+ */
+static struct nonce journal_nonce(const struct jset *jset)
+{
+	return (struct nonce) {{
+		[0] = 0,
+		[1] = ((__le32 *) &jset->seq)[0],
+		[2] = ((__le32 *) &jset->seq)[1],
+		[3] = BCH_NONCE_JOURNAL,
+	}};
+}
+
+/*
+ * Overwrite the range [@start, @end) with empty jset_entries: each header is
+ * zeroed in turn, and since a zeroed header has u64s == 0 the walk advances
+ * by exactly one header at a time.
+ */
+static void journal_entry_null_range(void *start, void *end)
+{
+	struct jset_entry *cur = start;
+
+	while (cur != end) {
+		memset(cur, 0, sizeof(*cur));
+		cur = vstruct_next(cur);
+	}
+}
+
+#define JOURNAL_ENTRY_REREAD	5
+#define JOURNAL_ENTRY_NONE	6
+#define JOURNAL_ENTRY_BAD	7
+
+/*
+ * Report a problem found while validating a journal entry; evaluates to true.
+ *
+ * Expects the enclosing function to provide an int "write" (READ or WRITE),
+ * an int "ret" and a "fsck_err" label. On READ the problem goes through
+ * mustfix_fsck_err(). On WRITE it is logged with bch_err() and, if
+ * bch2_fs_inconsistent() returns true, ret is set to
+ * BCH_FSCK_ERRORS_NOT_FIXED and control jumps to fsck_err.
+ */
+#define journal_entry_err(c, msg, ...)					\
+({									\
+	switch (write) {						\
+	case READ:							\
+		mustfix_fsck_err(c, msg, ##__VA_ARGS__);		\
+		break;							\
+	case WRITE:							\
+		bch_err(c, "corrupt metadata before write:\n"		\
+			msg, ##__VA_ARGS__);				\
+		if (bch2_fs_inconsistent(c)) {				\
+			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
+			goto fsck_err;					\
+		}							\
+		break;							\
+	}								\
+	true;								\
+})
+
+/* As journal_entry_err(), but only if @cond is true; else evaluates to false. */
+#define journal_entry_err_on(cond, c, msg, ...)				\
+	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+
+/*
+ * Validate one key @k inside jset_entry @entry of journal entry @jset.
+ *
+ * @key_type is the btree node type the key is checked against, @type is a
+ * short description used in error messages ("key", "btree root"), and @write
+ * is READ or WRITE.
+ *
+ * A bad key is repaired in place - either the entry is truncated at the key
+ * or the key is removed from the entry - and 0 is returned. ret only becomes
+ * nonzero through the error macros, which may jump to fsck_err.
+ */
+static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+				struct jset_entry *entry,
+				struct bkey_i *k, enum btree_node_type key_type,
+				const char *type, int write)
+{
+	/* end of the entry as it was on entry to this function: */
+	void *next = vstruct_next(entry);
+	const char *invalid;
+	unsigned version = le32_to_cpu(jset->version);
+	int ret = 0;
+
+	/* zero size key: truncate the entry here, null out what follows */
+	if (journal_entry_err_on(!k->k.u64s, c,
+			"invalid %s in journal: k->u64s 0", type)) {
+		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+		journal_entry_null_range(vstruct_next(entry), next);
+		return 0;
+	}
+
+	/* key runs past the end of the entry: truncate the entry here as well */
+	if (journal_entry_err_on((void *) bkey_next(k) >
+				(void *) vstruct_next(entry), c,
+			"invalid %s in journal: extends past end of journal entry",
+			type)) {
+		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+		journal_entry_null_range(vstruct_next(entry), next);
+		return 0;
+	}
+
+	/* bad format: remove just this key, moving the following keys down */
+	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
+			"invalid %s in journal: bad format %u",
+			type, k->k.format)) {
+		le16_add_cpu(&entry->u64s, -k->k.u64s);
+		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+		journal_entry_null_range(vstruct_next(entry), next);
+		return 0;
+	}
+
+	if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
+		bch2_bkey_swab(NULL, bkey_to_packed(k));
+
+	/* on read, old versions are renumbered before the key is checked: */
+	if (!write &&
+	    version < bcachefs_metadata_version_bkey_renumber)
+		bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+
+	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type);
+	if (invalid) {
+		char buf[160];
+
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
+		mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
+				 type, invalid, buf);
+
+		/* remove the invalid key, as for a bad format above: */
+		le16_add_cpu(&entry->u64s, -k->k.u64s);
+		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+		journal_entry_null_range(vstruct_next(entry), next);
+		return 0;
+	}
+
+	/* on write, the renumbering happens after the key has been checked: */
+	if (write &&
+	    version < bcachefs_metadata_version_bkey_renumber)
+		bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+fsck_err:
+	return ret;
+}
+
+/*
+ * Validate every key in a btree_keys jset_entry, using the btree node type
+ * derived from the entry's level and btree_id.
+ *
+ * Stops at, and returns, the first nonzero result of journal_validate_key();
+ * returns 0 if every key validated.
+ */
+static int journal_entry_validate_btree_keys(struct bch_fs *c,
+					     struct jset *jset,
+					     struct jset_entry *entry,
+					     int write)
+{
+	struct bkey_i *key;
+	int err = 0;
+
+	vstruct_for_each(entry, key) {
+		err = journal_validate_key(c, jset, entry, key,
+				__btree_node_type(entry->level,
+						  entry->btree_id),
+				"key", write);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * Validate a btree_root jset_entry, which must contain exactly one key.
+ *
+ * If the entry is empty or its size doesn't match that of its first key, the
+ * contents are dropped but the (now empty) entry itself is kept. Otherwise
+ * the key is checked by journal_validate_key() as a BKEY_TYPE_BTREE key.
+ */
+static int journal_entry_validate_btree_root(struct bch_fs *c,
+					     struct jset *jset,
+					     struct jset_entry *entry,
+					     int write)
+{
+	struct bkey_i *k = entry->start;
+	int ret = 0;
+
+	if (journal_entry_err_on(!entry->u64s ||
+				 le16_to_cpu(entry->u64s) != k->k.u64s, c,
+				 "invalid btree root journal entry: wrong number of keys")) {
+		void *next = vstruct_next(entry);
+		/*
+		 * we don't want to null out this jset_entry,
+		 * just the contents, so that later we can tell
+		 * we were _supposed_ to have a btree root
+		 */
+		entry->u64s = 0;
+		journal_entry_null_range(vstruct_next(entry), next);
+		return 0;
+	}
+
+	return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE,
+				    "btree root", write);
+fsck_err:
+	return ret;
+}
+
+/* prio_ptrs entries are obsolete: nothing is checked, always returns 0. */
+static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
+					    struct jset *jset,
+					    struct jset_entry *entry,
+					    int write)
+{
+	/* obsolete, don't care: */
+	return 0;
+}
+
+/*
+ * Validate a journal seq blacklist entry: its payload must be exactly one
+ * u64, otherwise the whole entry is nulled out.
+ */
+static int journal_entry_validate_blacklist(struct bch_fs *c,
+					    struct jset *jset,
+					    struct jset_entry *entry,
+					    int write)
+{
+	int ret = 0;
+
+	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
+		"invalid journal seq blacklist entry: bad size")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+	}
+fsck_err:
+	return ret;
+}
+
+/*
+ * Validate a v2 journal seq blacklist entry: its payload must be exactly two
+ * u64s and start must not be greater than end. An entry failing either check
+ * is nulled out.
+ */
+static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
+					       struct jset *jset,
+					       struct jset_entry *entry,
+					       int write)
+{
+	struct jset_entry_blacklist_v2 *bl_entry;
+	int ret = 0;
+
+	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
+		"invalid journal seq blacklist entry: bad size")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		goto out;
+	}
+
+	/* only look at start/end once the size is known to be right: */
+	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
+				 le64_to_cpu(bl_entry->end), c,
+		"invalid journal seq blacklist entry: start > end")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+	}
+out:
+fsck_err:
+	return ret;
+}
+
+/*
+ * Validate a usage entry: the entry (header plus payload) must be at least
+ * as big as struct jset_entry_usage, otherwise it is nulled out.
+ */
+static int journal_entry_validate_usage(struct bch_fs *c,
+					struct jset *jset,
+					struct jset_entry *entry,
+					int write)
+{
+	struct jset_entry_usage *u =
+		container_of(entry, struct jset_entry_usage, entry);
+	/* total size of the entry in bytes, header included: */
+	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	int ret = 0;
+
+	if (journal_entry_err_on(bytes < sizeof(*u),
+				 c,
+				 "invalid journal entry usage: bad size")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+fsck_err:
+	return ret;
+}
+
+/*
+ * Validate a data_usage entry: the entry (header plus payload) must be big
+ * enough for struct jset_entry_data_usage plus r.nr_devs further bytes,
+ * otherwise it is nulled out.
+ */
+static int journal_entry_validate_data_usage(struct bch_fs *c,
+					struct jset *jset,
+					struct jset_entry *entry,
+					int write)
+{
+	struct jset_entry_data_usage *u =
+		container_of(entry, struct jset_entry_data_usage, entry);
+	/* total size of the entry in bytes, header included: */
+	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	int ret = 0;
+
+	/* check the fixed size part first, so u->r.nr_devs is safe to read: */
+	if (journal_entry_err_on(bytes < sizeof(*u) ||
+				 bytes < sizeof(*u) + u->r.nr_devs,
+				 c,
+				 "invalid journal entry usage: bad size")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+fsck_err:
+	return ret;
+}
+
+struct jset_entry_ops {
+	int (*validate)(struct bch_fs *, struct jset *,
+			struct jset_entry *, int);
+};
+
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {
+#define x(f, nr)						\
+	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
+		.validate	= journal_entry_validate_##f,	\
+	},
+	BCH_JSET_ENTRY_TYPES()
+#undef x
+};
+
+static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
+				  struct jset_entry *entry, int write)
+{
+	return entry->type < BCH_JSET_ENTRY_NR
+		? bch2_jset_entry_ops[entry->type].validate(c, jset,
+							    entry, write)
+		: 0;
+}
+
+static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
+				 int write)
+{
+	struct jset_entry *entry;
+	int ret = 0;
+
+	vstruct_for_each(jset, entry) {
+		if (journal_entry_err_on(vstruct_next(entry) >
+					 vstruct_last(jset), c,
+				"journal entry extends past end of jset")) {
+			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
+			break;
+		}
+
+		ret = journal_entry_validate(c, jset, entry, write);
+		if (ret)
+			break;
+	}
+fsck_err:
+	return ret;
+}
+
+static int jset_validate(struct bch_fs *c,
+			 struct jset *jset, u64 sector,
+			 unsigned bucket_sectors_left,
+			 unsigned sectors_read,
+			 int write)
+{
+	size_t bytes = vstruct_bytes(jset);
+	struct bch_csum csum;
+	unsigned version;
+	int ret = 0;
+
+	if (le64_to_cpu(jset->magic) != jset_magic(c))
+		return JOURNAL_ENTRY_NONE;
+
+	version = le32_to_cpu(jset->version);
+	if ((version != BCH_JSET_VERSION_OLD &&
+	     version < bcachefs_metadata_version_min) ||
+	    version >= bcachefs_metadata_version_max) {
+		bch_err(c, "unknown journal entry version %u", version);
+		return BCH_FSCK_UNKNOWN_VERSION;
+	}
+
+	if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
+				 "journal entry too big (%zu bytes), sector %llu",
+				 bytes, sector)) {
+		/* XXX: note we might have missing journal entries */
+		return JOURNAL_ENTRY_BAD;
+	}
+
+	if (bytes > sectors_read << 9)
+		return JOURNAL_ENTRY_REREAD;
+
+	if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+			"journal entry with unknown csum type %llu sector %llu",
+			JSET_CSUM_TYPE(jset), sector))
+		return JOURNAL_ENTRY_BAD;
+
+	csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
+	if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
+				 "journal checksum bad, sector %llu", sector)) {
+		/* XXX: retry IO, when we start retrying checksum errors */
+		/* XXX: note we might have missing journal entries */
+		return JOURNAL_ENTRY_BAD;
+	}
+
+	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+		     jset->encrypted_start,
+		     vstruct_end(jset) - (void *) jset->encrypted_start);
+
+	if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+				 "invalid journal entry: last_seq > seq"))
+		jset->last_seq = jset->seq;
+
+	return 0;
+fsck_err:
+	return ret;
+}
+
+struct journal_read_buf {
+	void		*data;
+	size_t		size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+				    size_t new_size)
+{
+	void *n;
+
+	/* the bios are sized for this many pages, max: */
+	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+		return -ENOMEM;
+
+	new_size = roundup_pow_of_two(new_size);
+	n = kvpmalloc(new_size, GFP_KERNEL);
+	if (!n)
+		return -ENOMEM;
+
+	kvpfree(b->data, b->size);
+	b->data = n;
+	b->size = new_size;
+	return 0;
+}
+
+static int journal_read_bucket(struct bch_dev *ca,
+			       struct journal_read_buf *buf,
+			       struct journal_list *jlist,
+			       unsigned bucket)
+{
+	struct bch_fs *c = ca->fs;
+	struct journal_device *ja = &ca->journal;
+	struct jset *j = NULL;
+	unsigned sectors, sectors_read = 0;
+	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+	    end = offset + ca->mi.bucket_size;
+	bool saw_bad = false;
+	int ret = 0;
+
+	pr_debug("reading %u", bucket);
+
+	while (offset < end) {
+		if (!sectors_read) {
+			struct bio *bio;
+reread:
+			sectors_read = min_t(unsigned,
+				end - offset, buf->size >> 9);
+
+			bio = bio_kmalloc(GFP_KERNEL,
+				buf_pages(buf->data, sectors_read << 9));
+			if (!bio)
+				return -ENOMEM;
+			bio_set_dev(bio, ca->disk_sb.bdev);
+			bio->bi_iter.bi_sector	= offset;
+			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			bch2_bio_map(bio, buf->data, sectors_read << 9);
+
+			ret = submit_bio_wait(bio);
+			bio_put(bio);
+
+			if (bch2_dev_io_err_on(ret, ca,
+					       "journal read from sector %llu",
+					       offset) ||
+			    bch2_meta_read_fault("journal"))
+				return -EIO;
+
+			j = buf->data;
+		}
+
+		ret = jset_validate(c, j, offset,
+				    end - offset, sectors_read,
+				    READ);
+		switch (ret) {
+		case BCH_FSCK_OK:
+			break;
+		case JOURNAL_ENTRY_REREAD:
+			if (vstruct_bytes(j) > buf->size) {
+				ret = journal_read_buf_realloc(buf,
+							vstruct_bytes(j));
+				if (ret)
+					return ret;
+			}
+			goto reread;
+		case JOURNAL_ENTRY_NONE:
+			if (!saw_bad)
+				return 0;
+			sectors = c->opts.block_size;
+			goto next_block;
+		case JOURNAL_ENTRY_BAD:
+			saw_bad = true;
+			sectors = c->opts.block_size;
+			goto next_block;
+		default:
+			return ret;
+		}
+
+		/*
+		 * This happens sometimes if we don't have discards on -
+		 * when we've partially overwritten a bucket with new
+		 * journal entries. We don't need the rest of the
+		 * bucket:
+		 */
+		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+			return 0;
+
+		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+		mutex_lock(&jlist->lock);
+		ret = journal_entry_add(c, ca, jlist, j);
+		mutex_unlock(&jlist->lock);
+
+		switch (ret) {
+		case JOURNAL_ENTRY_ADD_OK:
+		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+			break;
+		default:
+			return ret;
+		}
+
+		sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+		pr_debug("next");
+		offset		+= sectors;
+		sectors_read	-= sectors;
+		j = ((void *) j) + (sectors << 9);
+	}
+
+	return 0;
+}
+
+static void bch2_journal_read_device(struct closure *cl)
+{
+	struct journal_device *ja =
+		container_of(cl, struct journal_device, read);
+	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+	struct journal_list *jlist =
+		container_of(cl->parent, struct journal_list, cl);
+	struct journal_read_buf buf = { NULL, 0 };
+	u64 min_seq = U64_MAX;
+	unsigned i;
+	int ret;
+
+	if (!ja->nr)
+		goto out;
+
+	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+	if (ret)
+		goto err;
+
+	pr_debug("%u journal buckets", ja->nr);
+
+	for (i = 0; i < ja->nr; i++) {
+		ret = journal_read_bucket(ca, &buf, jlist, i);
+		if (ret)
+			goto err;
+	}
+
+	/* Find the journal bucket with the highest sequence number: */
+	for (i = 0; i < ja->nr; i++) {
+		if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
+			ja->cur_idx = i;
+
+		min_seq = min(ja->bucket_seq[i], min_seq);
+	}
+
+	/*
+	 * If there's duplicate journal entries in multiple buckets (which
+	 * definitely isn't supposed to happen, but...) - make sure to start
+	 * cur_idx at the last of those buckets, so we don't deadlock trying to
+	 * allocate
+	 */
+	while (ja->bucket_seq[ja->cur_idx] > min_seq &&
+	       ja->bucket_seq[ja->cur_idx] >
+	       ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
+		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+
+	ja->sectors_free = 0;
+
+	/*
+	 * Set dirty_idx to indicate the entire journal is full and needs to be
+	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
+	 * pinned when it first runs:
+	 */
+	ja->discard_idx = ja->dirty_idx_ondisk =
+		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
+out:
+	kvpfree(buf.data, buf.size);
+	percpu_ref_put(&ca->io_ref);
+	closure_return(cl);
+	return;
+err:
+	mutex_lock(&jlist->lock);
+	jlist->ret = ret;
+	mutex_unlock(&jlist->lock);
+	goto out;
+}
+
+int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+{
+	struct journal_list jlist;
+	struct journal_replay *i;
+	struct bch_dev *ca;
+	unsigned iter;
+	size_t keys = 0, entries = 0;
+	bool degraded = false;
+	int ret = 0;
+
+	closure_init_stack(&jlist.cl);
+	mutex_init(&jlist.lock);
+	jlist.head = list;
+	jlist.ret = 0;
+
+	for_each_member_device(ca, c, iter) {
+		if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
+			continue;
+
+		if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
+		     ca->mi.state == BCH_MEMBER_STATE_RO) &&
+		    percpu_ref_tryget(&ca->io_ref))
+			closure_call(&ca->journal.read,
+				     bch2_journal_read_device,
+				     system_unbound_wq,
+				     &jlist.cl);
+		else
+			degraded = true;
+	}
+
+	closure_sync(&jlist.cl);
+
+	if (jlist.ret)
+		return jlist.ret;
+
+	list_for_each_entry(i, list, list) {
+		struct jset_entry *entry;
+		struct bkey_i *k, *_n;
+		struct bch_replicas_padded replicas;
+		char buf[80];
+
+		ret = jset_validate_entries(c, &i->j, READ);
+		if (ret)
+			goto fsck_err;
+
+		/*
+		 * If we're mounting in degraded mode - if we didn't read all
+		 * the devices - this is wrong:
+		 */
+
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
+		if (!degraded &&
+		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+		     fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+				 "superblock not marked as containing replicas %s",
+				 (bch2_replicas_entry_to_text(&PBUF(buf),
+							      &replicas.e), buf)))) {
+			ret = bch2_mark_replicas(c, &replicas.e);
+			if (ret)
+				return ret;
+		}
+
+		for_each_jset_key(k, _n, entry, &i->j)
+			keys++;
+		entries++;
+	}
+
+	if (!list_empty(list)) {
+		i = list_last_entry(list, struct journal_replay, list);
+
+		bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+			 keys, entries, le64_to_cpu(i->j.seq));
+	}
+fsck_err:
+	return ret;
+}
+
+/* journal write: */
+
+static void __journal_write_alloc(struct journal *j,
+				  struct journal_buf *w,
+				  struct dev_alloc_list *devs_sorted,
+				  unsigned sectors,
+				  unsigned *replicas,
+				  unsigned replicas_want)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_device *ja;
+	struct bch_dev *ca;
+	unsigned i;
+
+	if (*replicas >= replicas_want)
+		return;
+
+	for (i = 0; i < devs_sorted->nr; i++) {
+		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
+		if (!ca)
+			continue;
+
+		ja = &ca->journal;
+
+		/*
+		 * Check that we can use this device, and aren't already using
+		 * it:
+		 */
+		if (!ca->mi.durability ||
+		    ca->mi.state != BCH_MEMBER_STATE_RW ||
+		    !ja->nr ||
+		    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
+					 ca->dev_idx) ||
+		    sectors > ja->sectors_free)
+			continue;
+
+		bch2_dev_stripe_increment(c, ca, &j->wp.stripe);
+
+		bch2_bkey_append_ptr(&w->key,
+			(struct bch_extent_ptr) {
+				  .offset = bucket_to_sector(ca,
+					ja->buckets[ja->cur_idx]) +
+					ca->mi.bucket_size -
+					ja->sectors_free,
+				  .dev = ca->dev_idx,
+		});
+
+		ja->sectors_free -= sectors;
+		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+
+		*replicas += ca->mi.durability;
+
+		if (*replicas >= replicas_want)
+			break;
+	}
+}
+
+/**
+ * journal_write_alloc - allocate a journal write, advancing buckets if needed
+ */
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
+			       unsigned sectors)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_device *ja;
+	struct bch_dev *ca;
+	struct dev_alloc_list devs_sorted;
+	unsigned i, replicas = 0, replicas_want =
+		READ_ONCE(c->opts.metadata_replicas);
+
+	rcu_read_lock();
+
+	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
+					  &c->rw_devs[BCH_DATA_JOURNAL]);
+
+	__journal_write_alloc(j, w, &devs_sorted,
+			      sectors, &replicas, replicas_want);
+
+	if (replicas >= replicas_want)
+		goto done;
+
+	for (i = 0; i < devs_sorted.nr; i++) {
+		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+		if (!ca)
+			continue;
+
+		ja = &ca->journal;
+
+		if (sectors > ja->sectors_free &&
+		    sectors <= ca->mi.bucket_size &&
+		    bch2_journal_dev_buckets_available(j, ja,
+					journal_space_discarded)) {
+			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+			ja->sectors_free = ca->mi.bucket_size;
+
+			/*
+			 * ja->bucket_seq[ja->cur_idx] must always have
+			 * something sensible:
+			 */
+			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+		}
+	}
+
+	__journal_write_alloc(j, w, &devs_sorted,
+			      sectors, &replicas, replicas_want);
+done:
+	rcu_read_unlock();
+
+	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+}
+
+static void journal_write_compact(struct jset *jset)
+{
+	struct jset_entry *i, *next, *prev = NULL;
+
+	/*
+	 * Simple compaction, dropping empty jset_entries (from journal
+	 * reservations that weren't fully used) and merging jset_entries that
+	 * can be.
+	 *
+	 * If we wanted to be really fancy here, we could sort all the keys in
+	 * the jset and drop keys that were overwritten - probably not worth it:
+	 */
+	vstruct_for_each_safe(jset, i, next) {
+		unsigned u64s = le16_to_cpu(i->u64s);
+
+		/* Empty entry: */
+		if (!u64s)
+			continue;
+
+		/* Can we merge with previous entry? */
+		if (prev &&
+		    i->btree_id == prev->btree_id &&
+		    i->level	== prev->level &&
+		    i->type	== prev->type &&
+		    i->type	== BCH_JSET_ENTRY_btree_keys &&
+		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+			memmove_u64s_down(vstruct_next(prev),
+					  i->_data,
+					  u64s);
+			le16_add_cpu(&prev->u64s, u64s);
+			continue;
+		}
+
+		/* Couldn't merge, move i into new position (after prev): */
+		prev = prev ? vstruct_next(prev) : jset->start;
+		if (i != prev)
+			memmove_u64s_down(prev, i, jset_u64s(u64s));
+	}
+
+	prev = prev ? vstruct_next(prev) : jset->start;
+	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+	/* we aren't holding j->lock: */
+	unsigned new_size = READ_ONCE(j->buf_size_want);
+	void *new_buf;
+
+	if (buf->buf_size >= new_size)
+		return;
+
+	new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+	if (!new_buf)
+		return;
+
+	memcpy(new_buf, buf->data, buf->buf_size);
+	kvpfree(buf->data, buf->buf_size);
+	buf->data	= new_buf;
+	buf->buf_size	= new_size;
+}
+
+static void journal_write_done(struct closure *cl)
+{
+	struct journal *j = container_of(cl, struct journal, io);
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_buf *w = journal_prev_buf(j);
+	struct bch_devs_list devs =
+		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+	struct bch_replicas_padded replicas;
+	u64 seq = le64_to_cpu(w->data->seq);
+	u64 last_seq = le64_to_cpu(w->data->last_seq);
+
+	bch2_time_stats_update(j->write_time, j->write_start_time);
+
+	if (!devs.nr) {
+		bch_err(c, "unable to write journal to sufficient devices");
+		goto err;
+	}
+
+	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+
+	if (bch2_mark_replicas(c, &replicas.e))
+		goto err;
+
+	spin_lock(&j->lock);
+	if (seq >= j->pin.front)
+		journal_seq_pin(j, seq)->devs = devs;
+
+	j->seq_ondisk		= seq;
+	j->last_seq_ondisk	= last_seq;
+	bch2_journal_space_available(j);
+
+	/*
+	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+	 * more buckets:
+	 *
+	 * Must come before signaling write completion, for
+	 * bch2_fs_journal_stop():
+	 */
+	mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
+out:
+	/* also must come before signalling write completion: */
+	closure_debug_destroy(cl);
+
+	BUG_ON(!j->reservations.prev_buf_unwritten);
+	atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
+		     &j->reservations.counter);
+
+	closure_wake_up(&w->wait);
+	journal_wake(j);
+
+	if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
+		mod_delayed_work(system_freezable_wq, &j->write_work, 0);
+	spin_unlock(&j->lock);
+	return;
+err:
+	bch2_fatal_error(c);
+	spin_lock(&j->lock);
+	goto out;
+}
+
+static void journal_write_endio(struct bio *bio)
+{
+	struct bch_dev *ca = bio->bi_private;
+	struct journal *j = &ca->fs->journal;
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
+	    bch2_meta_write_fault("journal")) {
+		struct journal_buf *w = journal_prev_buf(j);
+		unsigned long flags;
+
+		spin_lock_irqsave(&j->err_lock, flags);
+		bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
+		spin_unlock_irqrestore(&j->err_lock, flags);
+	}
+
+	closure_put(&j->io);
+	percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+	struct journal *j = container_of(cl, struct journal, io);
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	struct journal_buf *w = journal_prev_buf(j);
+	struct jset_entry *start, *end;
+	struct jset *jset;
+	struct bio *bio;
+	struct bch_extent_ptr *ptr;
+	bool validate_before_checksum = false;
+	unsigned i, sectors, bytes, u64s;
+	int ret;
+
+	bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
+
+	journal_buf_realloc(j, w);
+	jset = w->data;
+
+	j->write_start_time = local_clock();
+
+	start	= vstruct_last(jset);
+	end	= bch2_journal_super_entries_add_common(c, start,
+						le64_to_cpu(jset->seq));
+	u64s	= (u64 *) end - (u64 *) start;
+	BUG_ON(u64s > j->entry_u64s_reserved);
+
+	le32_add_cpu(&jset->u64s, u64s);
+	BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
+
+	journal_write_compact(jset);
+
+	jset->read_clock	= cpu_to_le16(c->bucket_clock[READ].hand);
+	jset->write_clock	= cpu_to_le16(c->bucket_clock[WRITE].hand);
+	jset->magic		= cpu_to_le64(jset_magic(c));
+
+	jset->version		= c->sb.version < bcachefs_metadata_version_new_versioning
+		? cpu_to_le32(BCH_JSET_VERSION_OLD)
+		: cpu_to_le32(c->sb.version);
+
+	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+
+	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
+		validate_before_checksum = true;
+
+	if (le32_to_cpu(jset->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		validate_before_checksum = true;
+
+	if (validate_before_checksum &&
+	    jset_validate_entries(c, jset, WRITE))
+		goto err;
+
+	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+		    jset->encrypted_start,
+		    vstruct_end(jset) - (void *) jset->encrypted_start);
+
+	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+				  journal_nonce(jset), jset);
+
+	if (!validate_before_checksum &&
+	    jset_validate_entries(c, jset, WRITE))
+		goto err;
+
+	sectors = vstruct_sectors(jset, c->block_bits);
+	BUG_ON(sectors > w->sectors);
+
+	bytes = vstruct_bytes(jset);
+	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+
+	spin_lock(&j->lock);
+	ret = journal_write_alloc(j, w, sectors);
+
+	/*
+	 * write is allocated, no longer need to account for it in
+	 * bch2_journal_space_available():
+	 */
+	w->sectors = 0;
+
+	/*
+	 * journal entry has been compacted and allocated, recalculate space
+	 * available:
+	 */
+	bch2_journal_space_available(j);
+	spin_unlock(&j->lock);
+
+	if (ret) {
+		bch_err(c, "Unable to allocate journal write");
+		bch2_fatal_error(c);
+		continue_at(cl, journal_write_done, system_highpri_wq);
+		return;
+	}
+
+	/*
+	 * XXX: we really should just disable the entire journal in nochanges
+	 * mode
+	 */
+	if (c->opts.nochanges)
+		goto no_io;
+
+	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+		ca = bch_dev_bkey_exists(c, ptr->dev);
+		if (!percpu_ref_tryget(&ca->io_ref)) {
+			/* XXX: fix this */
+			bch_err(c, "missing device for journal write\n");
+			continue;
+		}
+
+		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
+			     sectors);
+
+		bio = ca->journal.bio;
+		bio_reset(bio);
+		bio_set_dev(bio, ca->disk_sb.bdev);
+		bio->bi_iter.bi_sector	= ptr->offset;
+		bio->bi_end_io		= journal_write_endio;
+		bio->bi_private		= ca;
+		bio_set_op_attrs(bio, REQ_OP_WRITE,
+				 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+		bch2_bio_map(bio, jset, sectors << 9);
+
+		trace_journal_write(bio);
+		closure_bio_submit(bio, cl);
+
+		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
+	}
+
+	for_each_rw_member(ca, c, i)
+		if (journal_flushes_device(ca) &&
+		    !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
+			percpu_ref_get(&ca->io_ref);
+
+			bio = ca->journal.bio;
+			bio_reset(bio);
+			bio_set_dev(bio, ca->disk_sb.bdev);
+			bio->bi_opf		= REQ_OP_FLUSH;
+			bio->bi_end_io		= journal_write_endio;
+			bio->bi_private		= ca;
+			closure_bio_submit(bio, cl);
+		}
+
+no_io:
+	bch2_bucket_seq_cleanup(c);
+
+	continue_at(cl, journal_write_done, system_highpri_wq);
+	return;
+err:
+	bch2_inconsistent_error(c);
+	continue_at(cl, journal_write_done, system_highpri_wq);
+}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
new file mode 100644
index 000000000000..72e575f360af
--- /dev/null
+++ b/fs/bcachefs/journal_io.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_IO_H
+#define _BCACHEFS_JOURNAL_IO_H
+
+/*
+ * Only used for holding the journal entries we read in bch2_journal_read();
+ * they are linked on the list_head passed to that function
+ */
+struct journal_replay {
+	struct list_head	list;
+	struct bch_devs_list	devs;
+	/* must be last: */
+	struct jset		j;
+};
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+					struct jset_entry *entry, unsigned type)
+{
+	while (entry < vstruct_last(jset)) {
+		if (entry->type == type)
+			return entry;
+
+		entry = vstruct_next(entry);
+	}
+
+	return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type)			\
+	for (entry = (jset)->start;					\
+	     (entry = __jset_entry_type_next(jset, entry, type));	\
+	     entry = vstruct_next(entry))
+
+#define for_each_jset_key(k, _n, entry, jset)				\
+	for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)	\
+		vstruct_for_each_safe(entry, k, _n)
+
+int bch2_journal_read(struct bch_fs *, struct list_head *);
+
+void bch2_journal_write(struct closure *);
+
+#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
new file mode 100644
index 000000000000..695b2c8ba03b
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "super.h"
+
+/* Free space calculations: */
+
+static unsigned journal_space_from(struct journal_device *ja,
+				   enum journal_space_from from)
+{
+	switch (from) {
+	case journal_space_discarded:
+		return ja->discard_idx;
+	case journal_space_clean_ondisk:
+		return ja->dirty_idx_ondisk;
+	case journal_space_clean:
+		return ja->dirty_idx;
+	default:
+		BUG();
+	}
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+					    struct journal_device *ja,
+					    enum journal_space_from from)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	unsigned available = (journal_space_from(ja, from) -
+			      ja->cur_idx - 1 + ja->nr) % ja->nr;
+
+	/*
+	 * Allocator startup needs some journal space before we can do journal
+	 * replay:
+	 */
+	if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
+		--available;
+
+	/*
+	 * Don't use the last bucket unless writing the new last_seq
+	 * will make another bucket available:
+	 */
+	if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
+		--available;
+
+	return available;
+}
+
+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+{
+	union journal_preres_state old, new;
+	u64 v = atomic64_read(&j->prereserved.counter);
+
+	do {
+		old.v = new.v = v;
+		new.remaining = u64s_remaining;
+	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
+				       old.v, new.v)) != old.v);
+}
+
+static struct journal_space {
+	unsigned	next_entry;
+	unsigned	remaining;
+} __journal_space_available(struct journal *j, unsigned nr_devs_want,
+			    enum journal_space_from from)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	unsigned sectors_next_entry	= UINT_MAX;
+	unsigned sectors_total		= UINT_MAX;
+	unsigned i, nr_devs = 0;
+	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
+		? journal_prev_buf(j)->sectors
+		: 0;
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i,
+				   &c->rw_devs[BCH_DATA_JOURNAL]) {
+		struct journal_device *ja = &ca->journal;
+		unsigned buckets_this_device, sectors_this_device;
+
+		if (!ja->nr)
+			continue;
+
+		buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
+		sectors_this_device = ja->sectors_free;
+
+		/*
+		 * Note that we don't allocate the space for a journal entry
+		 * until we write it out - thus, account for it here:
+		 */
+		if (unwritten_sectors >= sectors_this_device) {
+			if (!buckets_this_device)
+				continue;
+
+			buckets_this_device--;
+			sectors_this_device = ca->mi.bucket_size;
+		}
+
+		sectors_this_device -= unwritten_sectors;
+
+		if (sectors_this_device < ca->mi.bucket_size &&
+		    buckets_this_device) {
+			buckets_this_device--;
+			sectors_this_device = ca->mi.bucket_size;
+		}
+
+		if (!sectors_this_device)
+			continue;
+
+		sectors_next_entry = min(sectors_next_entry,
+					 sectors_this_device);
+
+		sectors_total = min(sectors_total,
+			buckets_this_device * ca->mi.bucket_size +
+			sectors_this_device);
+
+		nr_devs++;
+	}
+	rcu_read_unlock();
+
+	if (nr_devs < nr_devs_want)
+		return (struct journal_space) { 0, 0 };
+
+	return (struct journal_space) {
+		.next_entry	= sectors_next_entry,
+		.remaining	= max_t(int, 0, sectors_total - sectors_next_entry),
+	};
+}
+
+void bch2_journal_space_available(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	struct journal_space discarded, clean_ondisk, clean;
+	unsigned overhead, u64s_remaining = 0;
+	unsigned max_entry_size	 = min(j->buf[0].buf_size >> 9,
+				       j->buf[1].buf_size >> 9);
+	unsigned i, nr_online = 0, nr_devs_want;
+	bool can_discard = false;
+	int ret = 0;
+
+	lockdep_assert_held(&j->lock);
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i,
+				   &c->rw_devs[BCH_DATA_JOURNAL]) {
+		struct journal_device *ja = &ca->journal;
+
+		if (!ja->nr)
+			continue;
+
+		while (ja->dirty_idx != ja->cur_idx &&
+		       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
+			ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
+
+		while (ja->dirty_idx_ondisk != ja->dirty_idx &&
+		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
+			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
+
+		if (ja->discard_idx != ja->dirty_idx_ondisk)
+			can_discard = true;
+
+		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
+		nr_online++;
+	}
+	rcu_read_unlock();
+
+	j->can_discard = can_discard;
+
+	if (nr_online < c->opts.metadata_replicas_required) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	if (!fifo_free(&j->pin)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
+
+	discarded	= __journal_space_available(j, nr_devs_want, journal_space_discarded);
+	clean_ondisk	= __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
+	clean		= __journal_space_available(j, nr_devs_want, journal_space_clean);
+
+	if (!discarded.next_entry)
+		ret = -ENOSPC;
+
+	overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
+		journal_entry_overhead(j);
+	u64s_remaining = clean.remaining << 6;
+	u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
+	u64s_remaining /= 4;
+out:
+	j->cur_entry_sectors	= !ret ? discarded.next_entry : 0;
+	j->cur_entry_error	= ret;
+	journal_set_remaining(j, u64s_remaining);
+	journal_check_may_get_unreserved(j);
+
+	if (!ret)
+		journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+	bool ret;
+
+	spin_lock(&j->lock);
+	ret = ja->discard_idx != ja->dirty_idx_ondisk;
+	spin_unlock(&j->lock);
+
+	return ret;
+}
+
+/*
+ * Advance ja->discard_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+void bch2_journal_do_discards(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	unsigned iter;
+
+	mutex_lock(&j->discard_lock);
+
+	for_each_rw_member(ca, c, iter) {
+		struct journal_device *ja = &ca->journal;
+
+		while (should_discard_bucket(j, ja)) {
+			if (ca->mi.discard &&
+			    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+				blkdev_issue_discard(ca->disk_sb.bdev,
+					bucket_to_sector(ca,
+						ja->buckets[ja->discard_idx]),
+					ca->mi.bucket_size, GFP_NOIO, 0);
+
+			spin_lock(&j->lock);
+			ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
+
+			bch2_journal_space_available(j);
+			spin_unlock(&j->lock);
+		}
+	}
+
+	mutex_unlock(&j->discard_lock);
+}
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, holding it open to ensure it gets replayed during recovery:
+ */
+
+static void bch2_journal_reclaim_fast(struct journal *j)
+{
+	struct journal_entry_pin_list temp;
+	bool popped = false;
+
+	lockdep_assert_held(&j->lock);
+
+	/*
+	 * Unpin journal entries whose reference counts reached zero, meaning
+	 * all btree nodes got written out
+	 */
+	while (!fifo_empty(&j->pin) &&
+	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
+		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+		BUG_ON(!fifo_pop(&j->pin, temp));
+		popped = true;
+	}
+
+	if (popped)
+		bch2_journal_space_available(j);
+}
+
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+	if (atomic_dec_and_test(&pin_list->count)) {
+		spin_lock(&j->lock);
+		bch2_journal_reclaim_fast(j);
+		spin_unlock(&j->lock);
+	}
+}
+
+static inline void __journal_pin_add(struct journal *j,
+				     u64 seq,
+				     struct journal_entry_pin *pin,
+				     journal_pin_flush_fn flush_fn)
+{
+	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+	BUG_ON(journal_pin_active(pin));
+	BUG_ON(!atomic_read(&pin_list->count));
+
+	atomic_inc(&pin_list->count);
+	pin->seq	= seq;
+	pin->flush	= flush_fn;
+
+	list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
+
+	/*
+	 * If the journal is currently full,  we might want to call flush_fn
+	 * immediately:
+	 */
+	journal_wake(j);
+}
+
+void bch2_journal_pin_add(struct journal *j, u64 seq,
+			  struct journal_entry_pin *pin,
+			  journal_pin_flush_fn flush_fn)
+{
+	spin_lock(&j->lock);
+	__journal_pin_add(j, seq, pin, flush_fn);
+	spin_unlock(&j->lock);
+}
+
+static inline void __journal_pin_drop(struct journal *j,
+				      struct journal_entry_pin *pin)
+{
+	struct journal_entry_pin_list *pin_list;
+
+	if (!journal_pin_active(pin))
+		return;
+
+	pin_list = journal_seq_pin(j, pin->seq);
+	pin->seq = 0;
+	list_del_init(&pin->list);
+
+	/*
+	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
+	 * writing a new last_seq will now make another bucket available:
+	 */
+	if (atomic_dec_and_test(&pin_list->count) &&
+	    pin_list == &fifo_peek_front(&j->pin))
+		bch2_journal_reclaim_fast(j);
+	else if (fifo_used(&j->pin) == 1 &&
+		 atomic_read(&pin_list->count) == 1)
+		journal_wake(j);
+}
+
+void bch2_journal_pin_drop(struct journal *j,
+			   struct journal_entry_pin *pin)
+{
+	spin_lock(&j->lock);
+	__journal_pin_drop(j, pin);
+	spin_unlock(&j->lock);
+}
+
+/*
+ * Make @pin pin journal entry @seq.
+ *
+ * When @pin already refers to @seq it is only moved back onto that entry's
+ * unflushed list (its flush callback is left untouched); otherwise the old
+ * pin is dropped and a new one is taken with @flush_fn. Runs under j->lock.
+ */
+void bch2_journal_pin_update(struct journal *j, u64 seq,
+			     struct journal_entry_pin *pin,
+			     journal_pin_flush_fn flush_fn)
+{
+	spin_lock(&j->lock);
+
+	if (pin->seq == seq) {
+		/* Same entry: just requeue the pin for flushing */
+		list_move(&pin->list, &journal_seq_pin(j, seq)->list);
+	} else {
+		__journal_pin_drop(j, pin);
+		__journal_pin_add(j, seq, pin, flush_fn);
+	}
+
+	spin_unlock(&j->lock);
+}
+
+/*
+ * Make @pin pin the same journal entry as @src_pin, but only when @src_pin is
+ * active and @pin is either inactive or currently pins a newer sequence
+ * number. Runs under j->lock.
+ */
+void bch2_journal_pin_add_if_older(struct journal *j,
+				  struct journal_entry_pin *src_pin,
+				  struct journal_entry_pin *pin,
+				  journal_pin_flush_fn flush_fn)
+{
+	bool repin;
+
+	spin_lock(&j->lock);
+
+	repin = journal_pin_active(src_pin) &&
+		(!journal_pin_active(pin) || src_pin->seq < pin->seq);
+
+	if (repin) {
+		__journal_pin_drop(j, pin);
+		__journal_pin_add(j, src_pin->seq, pin, flush_fn);
+	}
+
+	spin_unlock(&j->lock);
+}
+
+/*
+ * Wait until @pin is no longer the pin whose flush callback is being run by
+ * journal_flush_pins(). @pin must already have been dropped (BUG otherwise).
+ */
+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+	BUG_ON(journal_pin_active(pin));
+
+	wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
+}
+
+/*
+ * Journal reclaim: flush references to open journal entries to reclaim space in
+ * the journal
+ *
+ * May be done by the journal code in the background as needed to free up space
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
+ * data off of a specific device:
+ */
+
+/*
+ * Pick the next pin to flush.
+ *
+ * Walks the pin fifo and stops at the first entry that either has a sequence
+ * number above @max_seq or has a pin waiting on its unflushed list. On return
+ * *@seq holds the sequence number the walk stopped at.
+ *
+ * If a pin was found it is moved to the entry's flushed list and recorded as
+ * j->flush_in_progress (the caller clears that again), and j->last_flushed is
+ * refreshed. Returns NULL when there is nothing to flush.
+ */
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
+{
+	struct journal_entry_pin_list *pin_list;
+	struct journal_entry_pin *ret = NULL;
+
+	spin_lock(&j->lock);
+
+	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
+		if (*seq > max_seq ||
+		    (ret = list_first_entry_or_null(&pin_list->list,
+				struct journal_entry_pin, list)))
+			break;
+
+	if (ret) {
+		/* pin_list still points at the entry the loop broke out on */
+		list_move(&ret->list, &pin_list->flushed);
+		BUG_ON(j->flush_in_progress);
+		j->flush_in_progress = ret;
+		j->last_flushed = jiffies;
+	}
+
+	spin_unlock(&j->lock);
+
+	return ret;
+}
+
+/*
+ * Run flush callbacks for pins on journal entries up to @seq_to_flush.
+ *
+ * While @min_nr is nonzero the sequence limit is ignored, so at least that
+ * many pins get flushed if any exist. Caller must hold j->reclaim_lock.
+ */
+static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
+			       unsigned min_nr)
+{
+	lockdep_assert_held(&j->reclaim_lock);
+
+	for (;;) {
+		struct journal_entry_pin *cur;
+		u64 limit = min_nr ? U64_MAX : seq_to_flush;
+		u64 cur_seq;
+
+		cur = journal_get_next_pin(j, limit, &cur_seq);
+		if (!cur)
+			break;
+
+		if (min_nr)
+			min_nr--;
+
+		cur->flush(j, cur, cur_seq);
+
+		/* Let bch2_journal_pin_flush() waiters know we are done: */
+		BUG_ON(j->flush_in_progress != cur);
+		j->flush_in_progress = NULL;
+		wake_up(&j->pin_flush_wait);
+	}
+}
+
+/**
+ * bch2_journal_reclaim - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ *
+ * Caller must hold j->reclaim_lock. Unless the journal is in an error state,
+ * this re-queues itself on c->journal_reclaim_wq after j->reclaim_delay_ms.
+ */
+void bch2_journal_reclaim(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	unsigned iter, min_nr = 0;
+	u64 seq_to_flush = 0;
+
+	lockdep_assert_held(&j->reclaim_lock);
+
+	bch2_journal_do_discards(j);
+
+	spin_lock(&j->lock);
+
+	/* Work out, per device, how far back pins must be flushed: */
+	for_each_rw_member(ca, c, iter) {
+		struct journal_device *ja = &ca->journal;
+		unsigned nr_buckets, bucket_to_flush;
+
+		if (!ja->nr)
+			continue;
+
+		/* Try to keep the journal at most half full: */
+		nr_buckets = ja->nr / 2;
+
+		/*
+		 * And include pre-reservations:
+		 * NOTE(review): "<< 6" presumably converts a bucket size in
+		 * 512-byte sectors to u64s - confirm.
+		 */
+		nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
+					   (ca->mi.bucket_size << 6) -
+					   journal_entry_overhead(j));
+
+		nr_buckets = min(nr_buckets, ja->nr);
+
+		bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
+		seq_to_flush = max(seq_to_flush,
+				   ja->bucket_seq[bucket_to_flush]);
+	}
+
+	/* Also flush if the pin fifo is more than half full */
+	seq_to_flush = max_t(s64, seq_to_flush,
+			     (s64) journal_cur_seq(j) -
+			     (j->pin.size >> 1));
+	spin_unlock(&j->lock);
+
+	/*
+	 * If it's been longer than j->reclaim_delay_ms since we last flushed,
+	 * make sure to flush at least one journal pin:
+	 */
+	if (time_after(jiffies, j->last_flushed +
+		       msecs_to_jiffies(j->reclaim_delay_ms)))
+		min_nr = 1;
+
+	/*
+	 * More than half of the remaining pre-reservation space is already
+	 * reserved: flush at least up to the oldest entry, and at least one pin
+	 */
+	if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
+		seq_to_flush = max(seq_to_flush, journal_last_seq(j));
+		min_nr = 1;
+	}
+
+	journal_flush_pins(j, seq_to_flush, min_nr);
+
+	if (!bch2_journal_error(j))
+		queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
+				   msecs_to_jiffies(j->reclaim_delay_ms));
+}
+
+/*
+ * Delayed-work entry point for j->reclaim_work: runs one pass of
+ * bch2_journal_reclaim() under j->reclaim_lock.
+ */
+void bch2_journal_reclaim_work(struct work_struct *work)
+{
+	struct journal *j = container_of(to_delayed_work(work),
+				struct journal, reclaim_work);
+
+	mutex_lock(&j->reclaim_lock);
+	bch2_journal_reclaim(j);
+	mutex_unlock(&j->reclaim_lock);
+}
+
+/*
+ * Wait condition for bch2_journal_flush_pins(): flushes pins up to
+ * @seq_to_flush and reports whether the waiter may stop.
+ *
+ * Returns a nonzero journal error code if the journal is in an error state,
+ * 1 if nothing more can or needs to be flushed, and 0 to keep waiting.
+ */
+static int journal_flush_done(struct journal *j, u64 seq_to_flush)
+{
+	int ret;
+
+	ret = bch2_journal_error(j);
+	if (ret)
+		return ret;
+
+	mutex_lock(&j->reclaim_lock);
+
+	journal_flush_pins(j, seq_to_flush, 0);
+
+	spin_lock(&j->lock);
+	/*
+	 * If journal replay hasn't completed, the unreplayed journal entries
+	 * hold refs on their corresponding sequence numbers
+	 */
+	ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+		journal_last_seq(j) > seq_to_flush ||
+		(fifo_used(&j->pin) == 1 &&
+		 atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+
+	spin_unlock(&j->lock);
+	mutex_unlock(&j->reclaim_lock);
+
+	return ret;
+}
+
+/*
+ * Block until journal_flush_done() reports that pins up to @seq_to_flush have
+ * been dealt with (or the journal hit an error). Does nothing if the journal
+ * has not been started.
+ */
+void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+{
+	if (!test_bit(JOURNAL_STARTED, &j->flags))
+		return;
+
+	closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
+}
+
+/*
+ * Flush journal pins that involve a given device, then rebuild the journal's
+ * replicas entries.
+ *
+ * With @dev_idx >= 0, flushes up to the last pinned entry whose device list
+ * contains @dev_idx; with @dev_idx < 0, up to the last entry that has fewer
+ * devices than c->opts.metadata_replicas. Afterwards every entry still in the
+ * pin fifo is re-marked inside a BCH_DATA_JOURNAL replicas gc pass.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_entry_pin_list *p;
+	u64 iter, seq = 0;
+	int ret = 0;
+
+	/* Find the last sequence number that needs flushing: */
+	spin_lock(&j->lock);
+	fifo_for_each_entry_ptr(p, &j->pin, iter)
+		if (dev_idx >= 0
+		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
+		    : p->devs.nr < c->opts.metadata_replicas)
+			seq = iter;
+	spin_unlock(&j->lock);
+
+	bch2_journal_flush_pins(j, seq);
+
+	ret = bch2_journal_error(j);
+	if (ret)
+		return ret;
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+	seq = 0;
+
+	spin_lock(&j->lock);
+	while (!ret && seq < j->pin.back) {
+		struct bch_replicas_padded replicas;
+
+		/* The front of the fifo may have advanced while unlocked: */
+		seq = max(seq, journal_last_seq(j));
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+					 journal_seq_pin(j, seq)->devs);
+		seq++;
+
+		/* j->lock is dropped around bch2_mark_replicas(): */
+		spin_unlock(&j->lock);
+		ret = bch2_mark_replicas(c, &replicas.e);
+		spin_lock(&j->lock);
+	}
+	spin_unlock(&j->lock);
+
+	ret = bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
new file mode 100644
index 000000000000..9bf982a17797
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
+#define _BCACHEFS_JOURNAL_RECLAIM_H
+
+#define JOURNAL_PIN	(32 * 1024)
+
+/*
+ * Selector passed to bch2_journal_dev_buckets_available() to choose which
+ * class of journal space is being asked about.
+ */
+enum journal_space_from {
+	journal_space_discarded,
+	journal_space_clean_ondisk,
+	journal_space_clean,
+};
+
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+					    struct journal_device *,
+					    enum journal_space_from);
+void bch2_journal_space_available(struct journal *);
+
+/* A pin is active iff it has a nonzero sequence number recorded in it. */
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+	return pin->seq != 0;
+}
+
+/*
+ * Look up the pin list for journal sequence number @seq. @seq must lie in
+ * [j->pin.front, j->pin.back); the fifo slot is selected by masking @seq.
+ */
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+	EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
+	return &j->pin.data[seq & j->pin.mask];
+}
+
+void bch2_journal_pin_put(struct journal *, u64);
+
+void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
+			  journal_pin_flush_fn);
+void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
+			     journal_pin_flush_fn);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+void bch2_journal_pin_add_if_older(struct journal *,
+				  struct journal_entry_pin *,
+				  struct journal_entry_pin *,
+				  journal_pin_flush_fn);
+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_do_discards(struct journal *);
+void bch2_journal_reclaim(struct journal *);
+void bch2_journal_reclaim_work(struct work_struct *);
+
+void bch2_journal_flush_pins(struct journal *, u64);
+
+/* Flush journal pins with no upper bound on the sequence number. */
+static inline void bch2_journal_flush_all_pins(struct journal *j)
+{
+	bch2_journal_flush_pins(j, U64_MAX);
+}
+
+int bch2_journal_flush_device_pins(struct journal *, int);
+
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
new file mode 100644
index 000000000000..787d9f7638d0
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "eytzinger.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) so that the btree code continues to ignore
+ * that bset, until that btree node is rewritten.
+ */
+
+/*
+ * Number of blacklist entries stored in superblock field @bl, derived from
+ * the field's size. A missing field (NULL) has zero entries.
+ */
+static unsigned
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
+{
+	size_t payload_bytes;
+
+	if (!bl)
+		return 0;
+
+	payload_bytes = vstruct_end(&bl->field) - (void *) &bl->start[0];
+
+	return payload_bytes / sizeof(struct journal_seq_blacklist_entry);
+}
+
+/* Size, in u64s, of a blacklist superblock field holding @nr entries. */
+static unsigned sb_blacklist_u64s(unsigned nr)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl;
+	size_t header_bytes  = sizeof(*bl);
+	size_t entries_bytes = sizeof(bl->start[0]) * nr;
+
+	return (header_bytes + entries_bytes) / sizeof(u64);
+}
+
+/*
+ * If entry @i overlaps or abuts entry @i + 1, fold entry @i + 1 into entry @i
+ * and shrink the superblock field by one entry.
+ *
+ * The caller must guarantee that @i + 1 is a valid index. Returns the field
+ * pointer to use from now on; the resize may hand back a different pointer.
+ */
+static struct bch_sb_field_journal_seq_blacklist *
+blacklist_entry_try_merge(struct bch_fs *c,
+			  struct bch_sb_field_journal_seq_blacklist *bl,
+			  unsigned i)
+{
+	unsigned nr = blacklist_nr_entries(bl);
+
+	if (le64_to_cpu(bl->start[i].end) >=
+	    le64_to_cpu(bl->start[i + 1].start)) {
+		/*
+		 * The caller may have widened entry @i past the end of entry
+		 * @i + 1, so keep whichever end is larger rather than
+		 * shrinking the merged range:
+		 */
+		bl->start[i].end =
+			cpu_to_le64(max_t(u64,
+					  le64_to_cpu(bl->start[i].end),
+					  le64_to_cpu(bl->start[i + 1].end)));
+		--nr;
+		/*
+		 * Entry @i now holds the merged range and must stay put; only
+		 * the entries after @i + 1 slide down over the hole. (Moving
+		 * from @i + 1 onto @i would overwrite the merged start.)
+		 */
+		memmove(&bl->start[i + 1],
+			&bl->start[i + 2],
+			sizeof(bl->start[0]) * (nr - (i + 1)));
+
+		bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
+							sb_blacklist_u64s(nr));
+		BUG_ON(!bl);
+	}
+
+	return bl;
+}
+
+/*
+ * Record the journal sequence range [@start, @end) as blacklisted in the
+ * superblock and write the superblock out.
+ *
+ * - An identical existing entry makes this a no-op (no superblock write).
+ * - An existing entry fully covered by the new range is widened in place and
+ *   then merged with its neighbours where they now touch.
+ * - Otherwise a new entry is appended at the end of the field.
+ *   NOTE(review): the append does not re-sort, while the validate hook
+ *   rejects out-of-order entries - confirm callers only add increasing ranges.
+ *
+ * Takes c->sb_lock. Returns 0 on success, -ENOMEM if the field could not be
+ * grown, or the result of bch2_write_super().
+ */
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl;
+	unsigned i, nr;
+	int ret = 0;
+
+	mutex_lock(&c->sb_lock);
+	bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+	nr = blacklist_nr_entries(bl);
+
+	if (bl) {
+		for (i = 0; i < nr; i++) {
+			struct journal_seq_blacklist_entry *e =
+				bl->start + i;
+
+			/* Exact duplicate: nothing to do */
+			if (start == le64_to_cpu(e->start) &&
+			    end   == le64_to_cpu(e->end))
+				goto out;
+
+			/* New range is a superset of this entry: widen it */
+			if (start <= le64_to_cpu(e->start) &&
+			    end   >= le64_to_cpu(e->end)) {
+				e->start = cpu_to_le64(start);
+				e->end	= cpu_to_le64(end);
+
+				/* bl may be reallocated by each merge */
+				if (i + 1 < nr)
+					bl = blacklist_entry_try_merge(c,
+								bl, i);
+				if (i)
+					bl = blacklist_entry_try_merge(c,
+								bl, i - 1);
+				goto out_write_sb;
+			}
+		}
+	}
+
+	/* No entry could absorb the range: append a new one */
+	bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
+					sb_blacklist_u64s(nr + 1));
+	if (!bl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	bl->start[nr].start	= cpu_to_le64(start);
+	bl->start[nr].end	= cpu_to_le64(end);
+out_write_sb:
+	c->disk_sb.sb->features[0] |=
+		1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3;
+
+	ret = bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/*
+ * Comparison callback for the eytzinger sort/search helpers: orders table
+ * entries by their start sequence number. @size is unused.
+ */
+static int journal_seq_blacklist_table_cmp(const void *_l,
+					   const void *_r, size_t size)
+{
+	const struct journal_seq_blacklist_table_entry *l = _l;
+	const struct journal_seq_blacklist_table_entry *r = _r;
+
+	return cmp_int(l->start, r->start);
+}
+
+/*
+ * Test whether journal sequence number @seq falls inside a blacklisted range
+ * of the in-memory table. Ranges are half-open: the end is not blacklisted.
+ *
+ * If @dirty is set and @seq is blacklisted, the matching table entry is
+ * flagged dirty. Returns false when no table has been built.
+ */
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
+				     bool dirty)
+{
+	struct journal_seq_blacklist_table *tbl = c->journal_seq_blacklist_table;
+	struct journal_seq_blacklist_table_entry key = { .start = seq };
+	struct journal_seq_blacklist_table_entry *hit;
+	int pos;
+
+	if (!tbl)
+		return false;
+
+	/* Last entry whose start is <= seq, if any */
+	pos = eytzinger0_find_le(tbl->entries, tbl->nr,
+				 sizeof(tbl->entries[0]),
+				 journal_seq_blacklist_table_cmp,
+				 &key);
+	if (pos < 0)
+		return false;
+
+	hit = &tbl->entries[pos];
+	BUG_ON(hit->start > seq);
+
+	if (seq >= hit->end)
+		return false;
+
+	if (dirty)
+		hit->dirty = true;
+	return true;
+}
+
+/*
+ * Build c->journal_seq_blacklist_table from the superblock blacklist field:
+ * entries are converted to CPU byte order and sorted into eytzinger layout.
+ *
+ * Must only be called while no table exists. Returns 0 on success (including
+ * when the superblock has no blacklist field) or -ENOMEM.
+ */
+int bch2_blacklist_table_initialize(struct bch_fs *c)
+{
+	struct bch_sb_field_journal_seq_blacklist *sb_bl =
+		bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+	struct journal_seq_blacklist_table *tbl;
+	unsigned idx, count = blacklist_nr_entries(sb_bl);
+
+	BUG_ON(c->journal_seq_blacklist_table);
+
+	if (!sb_bl)
+		return 0;
+
+	tbl = kzalloc(sizeof(*tbl) + sizeof(tbl->entries[0]) * count,
+		      GFP_KERNEL);
+	if (!tbl)
+		return -ENOMEM;
+
+	tbl->nr = count;
+
+	for (idx = 0; idx < count; idx++) {
+		const struct journal_seq_blacklist_entry *src =
+			&sb_bl->start[idx];
+
+		tbl->entries[idx].start	= le64_to_cpu(src->start);
+		tbl->entries[idx].end	= le64_to_cpu(src->end);
+	}
+
+	eytzinger0_sort(tbl->entries,
+			tbl->nr,
+			sizeof(tbl->entries[0]),
+			journal_seq_blacklist_table_cmp,
+			NULL);
+
+	c->journal_seq_blacklist_table = tbl;
+	return 0;
+}
+
+/*
+ * Superblock field validation hook: every entry must have start < end, and
+ * consecutive entries must not overlap (an entry's end may equal the next
+ * entry's start). Returns an error string, or NULL when the field is valid.
+ */
+static const char *
+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+				       struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		field_to_type(f, journal_seq_blacklist);
+	unsigned idx, nr = blacklist_nr_entries(bl);
+
+	for (idx = 0; idx < nr; idx++) {
+		const struct journal_seq_blacklist_entry *e = &bl->start[idx];
+
+		if (le64_to_cpu(e->start) >= le64_to_cpu(e->end))
+			return "entry start >= end";
+
+		if (idx + 1 < nr &&
+		    le64_to_cpu(e[0].end) > le64_to_cpu(e[1].start))
+			return "entries out of order";
+	}
+
+	return NULL;
+}
+
+/*
+ * Superblock field pretty-printer: emits the entries as space-separated
+ * "start-end" pairs into @out.
+ */
+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
+						  struct bch_sb *sb,
+						  struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		field_to_type(f, journal_seq_blacklist);
+	unsigned idx, nr = blacklist_nr_entries(bl);
+
+	for (idx = 0; idx < nr; idx++) {
+		const struct journal_seq_blacklist_entry *e = &bl->start[idx];
+
+		/* Separator goes between entries, not before the first */
+		if (idx)
+			pr_buf(out, " ");
+
+		pr_buf(out, "%llu-%llu",
+		       le64_to_cpu(e->start),
+		       le64_to_cpu(e->end));
+	}
+}
+
+/* Superblock field hooks (validate + print) for the journal seq blacklist. */
+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
+	.validate	= bch2_sb_journal_seq_blacklist_validate,
+	.to_text	= bch2_sb_journal_seq_blacklist_to_text
+};
+
+/*
+ * Work item (c->journal_seq_blacklist_gc_work): drop blacklist entries that
+ * are no longer needed.
+ *
+ * First walks every node of every btree; the loop body itself only checks for
+ * BCH_FS_STOPPING. NOTE(review): presumably reading the nodes is what marks
+ * still-referenced table entries dirty, via
+ * bch2_journal_seq_is_blacklisted(..., dirty = true) - confirm in the btree
+ * read path.
+ *
+ * Then, under c->sb_lock, keeps only the superblock entries whose in-memory
+ * table entry is dirty, shrinks the field and rewrites the superblock if
+ * anything was removed.
+ */
+void bch2_blacklist_entries_gc(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs,
+					journal_seq_blacklist_gc_work);
+	struct journal_seq_blacklist_table *t;
+	struct bch_sb_field_journal_seq_blacklist *bl;
+	struct journal_seq_blacklist_entry *src, *dst;
+	struct btree_trans trans;
+	unsigned i, nr, new_nr;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct btree_iter *iter;
+		struct btree *b;
+
+		for_each_btree_node(&trans, iter, i, POS_MIN,
+				    BTREE_ITER_PREFETCH, b)
+			if (test_bit(BCH_FS_STOPPING, &c->flags)) {
+				bch2_trans_exit(&trans);
+				return;
+			}
+		bch2_trans_iter_free(&trans, iter);
+	}
+
+	/* Don't prune anything if the walk did not complete cleanly: */
+	ret = bch2_trans_exit(&trans);
+	if (ret)
+		return;
+
+	mutex_lock(&c->sb_lock);
+	bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+	if (!bl)
+		goto out;
+
+	nr = blacklist_nr_entries(bl);
+	dst = bl->start;
+
+	t = c->journal_seq_blacklist_table;
+	BUG_ON(nr != t->nr);
+
+	/*
+	 * Walk the superblock entries in array order and the table in
+	 * eytzinger (sorted) order in lockstep; they must describe the same
+	 * ranges. Entries that are kept are compacted towards the front.
+	 */
+	for (src = bl->start, i = eytzinger0_first(t->nr);
+	     src < bl->start + nr;
+	     src++, i = eytzinger0_next(i, nr)) {
+		BUG_ON(t->entries[i].start	!= le64_to_cpu(src->start));
+		BUG_ON(t->entries[i].end	!= le64_to_cpu(src->end));
+
+		if (t->entries[i].dirty)
+			*dst++ = *src;
+	}
+
+	new_nr = dst - bl->start;
+
+	bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
+
+	if (new_nr != nr) {
+		/* A size of 0 is passed when no entries remain */
+		bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
+				new_nr ? sb_blacklist_u64s(new_nr) : 0);
+		BUG_ON(new_nr && !bl);
+
+		if (!new_nr)
+			c->disk_sb.sb->features[0] &=
+				~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3);
+
+		bch2_write_super(c);
+	}
+out:
+	mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
new file mode 100644
index 000000000000..03f4b97247fd
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
+int bch2_blacklist_table_initialize(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
+
+void bch2_blacklist_entries_gc(struct work_struct *);
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
new file mode 100644
index 000000000000..8eea12a03c06
--- /dev/null
+++ b/fs/bcachefs/journal_types.h
@@ -0,0 +1,276 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_TYPES_H
+#define _BCACHEFS_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "super_types.h"
+#include "fifo.h"
+
+struct journal_res;
+
+/*
+ * We put two of these in struct journal; we use them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_buf {
+	struct jset		*data;
+
+	BKEY_PADDED(key);
+
+	struct closure_waitlist	wait;
+
+	unsigned		buf_size;	/* size in bytes of @data */
+	unsigned		sectors;	/* maximum size for current entry */
+	unsigned		disk_sectors;	/* maximum size entry could have been, if
+						   buf_size was bigger */
+	unsigned		u64s_reserved;
+	/*
+	 * bloom filter:
+	 * NOTE(review): the array length divides by sizeof(unsigned long), not
+	 * by bits per long, so this holds 1024 / sizeof(long) longs.
+	 */
+	unsigned long		has_inode[1024 / sizeof(unsigned long)];
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+/* Per journal entry (one slot of journal->pin): the pins held on that entry. */
+struct journal_entry_pin_list {
+	/* pins that have a flush callback and have not been picked for flushing */
+	struct list_head		list;
+	/* pins without a flush callback, or already handed to their callback */
+	struct list_head		flushed;
+	/* reference count; each attached pin holds one reference */
+	atomic_t			count;
+	/* device list; used to build the entry's BCH_DATA_JOURNAL replicas entry */
+	struct bch_devs_list		devs;
+};
+
+struct journal;
+struct journal_entry_pin;
+/*
+ * Flush callback for a pin; journal reclaim invokes it as
+ * pin->flush(j, pin, seq).
+ */
+typedef void (*journal_pin_flush_fn)(struct journal *j,
+				struct journal_entry_pin *, u64);
+
+/* A single reference on a journal entry, held by whoever embeds this. */
+struct journal_entry_pin {
+	/* linkage on journal_entry_pin_list.list or .flushed */
+	struct list_head		list;
+	/* may be NULL, in which case the pin is never picked for flushing */
+	journal_pin_flush_fn		flush;
+	/* pinned sequence number; 0 means the pin is not active */
+	u64				seq;
+};
+
+/*
+ * A journal reservation.
+ * NOTE(review): field meanings below are inferred from their names only -
+ * confirm against the users in journal.h.
+ */
+struct journal_res {
+	bool			ref;
+	u8			idx;	/* presumably which journal->buf[] slot */
+	u16			u64s;	/* presumably reserved size, in u64s */
+	u32			offset;
+	u64			seq;	/* presumably the entry's sequence number */
+};
+
+/*
+ * For reserving space in the journal prior to getting a reservation on a
+ * particular journal entry:
+ */
+struct journal_preres {
+	unsigned		u64s;	/* size of the pre-reservation, in u64s */
+};
+
+/*
+ * Journal reservation state packed into a single 64-bit word. The three
+ * members are alternative views of the same storage: as an atomic64_t, as a
+ * plain u64, and as individual bitfields.
+ */
+union journal_res_state {
+	struct {
+		atomic64_t	counter;
+	};
+
+	struct {
+		u64		v;
+	};
+
+	/* 20 + 1 + 1 + 21 + 21 = 64 bits */
+	struct {
+		u64		cur_entry_offset:20,
+				idx:1,
+				prev_buf_unwritten:1,
+				buf0_count:21,
+				buf1_count:21;
+	};
+};
+
+/*
+ * Journal pre-reservation state packed into a single 64-bit word, viewable as
+ * an atomic64_t, a plain u64, or the two 32-bit counters.
+ */
+union journal_preres_state {
+	struct {
+		atomic64_t	counter;
+	};
+
+	struct {
+		u64		v;
+	};
+
+	/* journal reclaim compares reserved * 2 against remaining */
+	struct {
+		u32		reserved;
+		u32		remaining;
+	};
+};
+
+/* bytes: */
+#define JOURNAL_ENTRY_SIZE_MIN		(64U << 10) /* 64k */
+#define JOURNAL_ENTRY_SIZE_MAX		(4U  << 20) /* 4M */
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ * note - cur_entry_offset is in units of u64s
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)
+
+/*
+ * Bit numbers for journal->flags (used with test_bit() and friends).
+ *
+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
+ * either because something's waiting on the write to complete or because it's
+ * been dirty too long and the timer's expired.
+ */
+
+enum {
+	JOURNAL_REPLAY_DONE,
+	JOURNAL_STARTED,
+	JOURNAL_NEED_WRITE,
+	JOURNAL_NOT_EMPTY,
+	JOURNAL_MAY_GET_UNRESERVED,
+};
+
+/* Embedded in struct bch_fs */
+struct journal {
+	/* Fastpath stuff up front: */
+
+	/* JOURNAL_* flag bits */
+	unsigned long		flags;
+
+	union journal_res_state reservations;
+
+	/* Max size of current journal entry */
+	unsigned		cur_entry_u64s;
+	unsigned		cur_entry_sectors;
+
+	/*
+	 * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
+	 * insufficient devices:
+	 */
+	int			cur_entry_error;
+
+	union journal_preres_state prereserved;
+
+	/* Reserved space in journal entry to be used just prior to write */
+	unsigned		entry_u64s_reserved;
+
+	unsigned		buf_size_want;
+
+	/*
+	 * Two journal entries -- one is currently open for new entries, the
+	 * other is possibly being written out.
+	 */
+	struct journal_buf	buf[2];
+
+	/* held around pin list manipulation in journal_reclaim.c */
+	spinlock_t		lock;
+
+	/* if nonzero, we may not open a new journal entry: */
+	unsigned		blocked;
+
+	/* Used when waiting because the journal was full */
+	wait_queue_head_t	wait;
+	struct closure_waitlist	async_wait;
+	struct closure_waitlist	preres_wait;
+
+	struct closure		io;
+	struct delayed_work	write_work;
+
+	/* Sequence number of most recent journal entry (last entry in @pin) */
+	atomic64_t		seq;
+
+	/* seq, last_seq from the most recent journal entry successfully written */
+	u64			seq_ondisk;
+	u64			last_seq_ondisk;
+
+	/*
+	 * FIFO of journal entries whose btree updates have not yet been
+	 * written out.
+	 *
+	 * Each entry is a reference count. The position in the FIFO is the
+	 * entry's sequence number relative to @seq.
+	 *
+	 * The journal entry itself holds a reference count, put when the
+	 * journal entry is written out. Each btree node modified by the journal
+	 * entry also holds a reference count, put when the btree node is
+	 * written.
+	 *
+	 * When a reference count reaches zero, the journal entry is no longer
+	 * needed. When all journal entries in the oldest journal bucket are no
+	 * longer needed, the bucket can be discarded and reused.
+	 */
+	struct {
+		u64 front, back, size, mask;
+		struct journal_entry_pin_list *data;
+	}			pin;
+
+	u64			replay_journal_seq;
+	u64			replay_journal_seq_end;
+
+	struct write_point	wp;
+	spinlock_t		err_lock;
+
+	/* background reclaim, see bch2_journal_reclaim_work(): */
+	struct delayed_work	reclaim_work;
+	/* held while flushing journal pins */
+	struct mutex		reclaim_lock;
+	/* jiffies at the time a pin was last picked for flushing */
+	unsigned long		last_flushed;
+	/* pin whose flush callback is currently running, or NULL */
+	struct journal_entry_pin *flush_in_progress;
+	/* woken whenever flush_in_progress is cleared */
+	wait_queue_head_t	pin_flush_wait;
+
+	/* protects advancing ja->discard_idx: */
+	struct mutex		discard_lock;
+	bool			can_discard;
+
+	unsigned		write_delay_ms;
+	/* reclaim re-queue interval, and max time between pin flushes */
+	unsigned		reclaim_delay_ms;
+
+	u64			res_get_blocked_start;
+	u64			need_write_time;
+	u64			write_start_time;
+
+	struct time_stats	*write_time;
+	struct time_stats	*delay_time;
+	struct time_stats	*blocked_time;
+	struct time_stats	*flush_seq_time;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	res_map;
+#endif
+};
+
+/*
+ * Embedded in struct bch_dev. First three fields refer to the array of journal
+ * buckets, in bch_sb.
+ */
+struct journal_device {
+	/*
+	 * For each journal bucket, contains the max sequence number of the
+	 * journal writes it contains - so we know when a bucket can be reused.
+	 */
+	u64			*bucket_seq;
+
+	unsigned		sectors_free;
+
+	/*
+	 * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
+	 */
+	unsigned		discard_idx;		/* Next bucket to discard */
+	unsigned		dirty_idx_ondisk;
+	unsigned		dirty_idx;
+	unsigned		cur_idx;		/* Journal bucket we're currently writing to */
+	unsigned		nr;			/* number of journal buckets; 0 means the device has none */
+
+	u64			*buckets;
+
+	/* Bio for journal reads/writes to this device */
+	struct bio		*bio;
+
+	/* for bch_journal_read_device */
+	struct closure		read;
+};
+
+/*
+ * journal_entry_res - reserve space in every journal entry:
+ */
+struct journal_entry_res {
+	unsigned		u64s;	/* amount reserved, in u64s */
+};
+
+#endif /* _BCACHEFS_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
new file mode 100644
index 000000000000..5da54ced9cad
--- /dev/null
+++ b/fs/bcachefs/keylist.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "keylist.h"
+
+/*
+ * Make room in keylist @l for @new_u64s more u64s.
+ *
+ * @inline_u64s / @nr_inline_u64s describe the caller's inline buffer. The
+ * capacity is always a power of two: nothing is done if the rounded-up size
+ * still fits the inline buffer, or if the list is already on the heap and the
+ * rounded-up size is unchanged. Otherwise the keys move to (or grow within) a
+ * GFP_NOIO heap allocation.
+ *
+ * Returns 0 on success or -ENOMEM, in which case @l is left untouched.
+ */
+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
+			size_t nr_inline_u64s, size_t new_u64s)
+{
+	size_t used = bch_keylist_u64s(l);
+	size_t want = roundup_pow_of_two(used + new_u64s);
+	/* NULL while the keys still live in the caller's inline buffer */
+	u64 *heap_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
+	u64 *grown;
+
+	if (want <= nr_inline_u64s)
+		return 0;
+
+	if (heap_buf && roundup_pow_of_two(used) == want)
+		return 0;
+
+	grown = krealloc(heap_buf, sizeof(u64) * want, GFP_NOIO);
+	if (!grown)
+		return -ENOMEM;
+
+	/* First spill to the heap: bring the inline contents along */
+	if (!heap_buf)
+		memcpy_u64s(grown, inline_u64s, used);
+
+	l->keys_p = grown;
+	l->top_p = grown + used;
+
+	return 0;
+}
+
+/*
+ * Insert @insert into @l before the first key whose position is greater than
+ * @insert's, or at the end if there is none. No bounds check is done here:
+ * the caller must already have made room for insert->k.u64s more u64s.
+ */
+void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
+{
+	struct bkey_i *where;
+
+	/* On loop exit, @where is the insertion point (possibly l->top) */
+	for_each_keylist_key(l, where)
+		if (bkey_cmp(insert->k.p, where->k.p) < 0)
+			break;
+
+	/* Shift the tail up to open a gap the size of the new key */
+	memmove_u64s_up((u64 *) where + insert->k.u64s,
+			where,
+			((u64 *) l->top) - ((u64 *) where));
+
+	l->top_p += insert->k.u64s;
+	bkey_copy(where, insert);
+}
+
+/*
+ * Remove the first key from @l by sliding the remaining keys down over it.
+ * The list must not be empty.
+ */
+void bch2_keylist_pop_front(struct keylist *l)
+{
+	/* Shrink first, so bch_keylist_u64s() below is the remaining length */
+	l->top_p -= bch2_keylist_front(l)->k.u64s;
+
+	memmove_u64s_down(l->keys,
+			  bkey_next(l->keys),
+			  bch_keylist_u64s(l));
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/*
+ * Debug check: BUG unless the keys in @l are in strictly increasing position
+ * order.
+ */
+void bch2_verify_keylist_sorted(struct keylist *l)
+{
+	struct bkey_i *k;
+
+	/* Compare each key with its successor; the last key has none */
+	for_each_keylist_key(l, k)
+		BUG_ON(bkey_next(k) != l->top &&
+		       bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
+}
+#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
new file mode 100644
index 000000000000..a7ff86b08abc
--- /dev/null
+++ b/fs/bcachefs/keylist.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_H
+#define _BCACHEFS_KEYLIST_H
+
+#include "keylist_types.h"
+
+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
+void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
+void bch2_keylist_pop_front(struct keylist *);
+
+/* Reset @l to an empty list backed by the caller's @inline_keys buffer. */
+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
+{
+	l->top_p = l->keys_p = inline_keys;
+}
+
+/*
+ * Free @l's heap buffer, if it has grown beyond @inline_keys, and reset it to
+ * an empty list on the inline buffer.
+ */
+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
+{
+	if (l->keys_p != inline_keys)
+		kfree(l->keys_p);
+	bch2_keylist_init(l, inline_keys);
+}
+
+/* Commit the key that was written at l->top by advancing top past it. */
+static inline void bch2_keylist_push(struct keylist *l)
+{
+	l->top = bkey_next(l->top);
+}
+
+/*
+ * Append a copy of @k to @l. No bounds check is done here: the caller must
+ * already have made room (see bch2_keylist_realloc()).
+ */
+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
+{
+	bkey_copy(l->top, k);
+	bch2_keylist_push(l);
+}
+
+/* True when @l holds no keys. */
+static inline bool bch2_keylist_empty(struct keylist *l)
+{
+	return l->top == l->keys;
+}
+
+/* Space used by the keys in @l, in units of u64. */
+static inline size_t bch_keylist_u64s(struct keylist *l)
+{
+	return l->top_p - l->keys_p;
+}
+
+/* Space used by the keys in @l, in bytes. */
+static inline size_t bch2_keylist_bytes(struct keylist *l)
+{
+	return bch_keylist_u64s(l) * sizeof(u64);
+}
+
+/*
+ * First key in @l. Does not check for emptiness; on an empty list this
+ * returns l->top, which is not a valid key.
+ */
+static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
+{
+	return l->keys;
+}
+
+/*
+ * Iterate @_k over every key in @_keylist, front to back. After the loop runs
+ * to completion @_k equals the list's top pointer.
+ */
+#define for_each_keylist_key(_keylist, _k)			\
+	for (_k = (_keylist)->keys;				\
+	     _k != (_keylist)->top;				\
+	     _k = bkey_next(_k))
+
+/* Sum of the size fields (k.size) of every key in @keys. */
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+	struct bkey_i *cur = keys->keys;
+	u64 total = 0;
+
+	while (cur != keys->top) {
+		total += cur->k.size;
+		cur = bkey_next(cur);
+	}
+
+	return total;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *);
+#else
+static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
+#endif
+
+#endif /* _BCACHEFS_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
new file mode 100644
index 000000000000..4b3ff7d8a875
--- /dev/null
+++ b/fs/bcachefs/keylist_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_TYPES_H
+#define _BCACHEFS_KEYLIST_TYPES_H
+
+/*
+ * A contiguous buffer of variable-length keys. Each pointer is a union so it
+ * can be used either as a key pointer or as a raw u64 pointer for size
+ * arithmetic.
+ */
+struct keylist {
+	/* start of the buffer / first key */
+	union {
+		struct bkey_i		*keys;
+		u64			*keys_p;
+	};
+	/* one past the last key; equal to keys when the list is empty */
+	union {
+		struct bkey_i		*top;
+		u64			*top_p;
+	};
+};
+
+#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
new file mode 100644
index 000000000000..de8522f754e2
--- /dev/null
+++ b/fs/bcachefs/migrate.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for moving data off a device.
+ */
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+
+/*
+ * Drop every pointer to device @dev_idx from key @k, then check that what is
+ * left is acceptable under @flags.
+ *
+ * Returns -EINVAL if no durability remains and the matching
+ * BCH_FORCE_IF_*_LOST flag is not set, or if durability falls below the
+ * configured replica count and the matching BCH_FORCE_IF_*_DEGRADED flag is
+ * not set. @metadata selects the metadata or the data variants of both the
+ * replica count and the flags. @k is modified even when an error is returned.
+ */
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
+			 unsigned dev_idx, int flags, bool metadata)
+{
+	unsigned want_replicas, lost_flag, degraded_flag, durability;
+
+	if (metadata) {
+		want_replicas	= c->opts.metadata_replicas;
+		lost_flag	= BCH_FORCE_IF_METADATA_LOST;
+		degraded_flag	= BCH_FORCE_IF_METADATA_DEGRADED;
+	} else {
+		want_replicas	= c->opts.data_replicas;
+		lost_flag	= BCH_FORCE_IF_DATA_LOST;
+		degraded_flag	= BCH_FORCE_IF_DATA_DEGRADED;
+	}
+
+	bch2_bkey_drop_device(k, dev_idx);
+
+	durability = bch2_bkey_durability(c, k.s_c);
+
+	if (!durability && !(flags & lost_flag))
+		return -EINVAL;
+
+	if (durability < want_replicas && !(flags & degraded_flag))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Walk btree @btree_id and rewrite every key that points at device @dev_idx
+ * so that it no longer does. Keys that do not reference the device only have
+ * their replicas entry marked.
+ *
+ * @flags are BCH_FORCE_IF_* bits controlling whether lost or degraded data is
+ * tolerated (see drop_dev_ptrs()). Returns 0 or a negative error code, never
+ * -EINTR.
+ */
+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
+				   enum btree_id btree_id)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	BKEY_PADDED(key) tmp;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
+				   BTREE_ITER_PREFETCH);
+
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k))) {
+		if (!bch2_bkey_has_device(k, dev_idx)) {
+			ret = bch2_mark_bkey_replicas(c, k);
+			if (ret)
+				break;
+			bch2_btree_iter_next(iter);
+			continue;
+		}
+
+		/* Work on a private, mutable copy of the key */
+		bkey_reassemble(&tmp.key, k);
+
+		ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key),
+				    dev_idx, flags, false);
+		if (ret)
+			break;
+
+		/*
+		 * If the new extent no longer has any pointers, bch2_extent_normalize()
+		 * will do the appropriate thing with it (turning it into a
+		 * KEY_TYPE_error key, or just a discard if it was a cached extent)
+		 */
+		bch2_extent_normalize(c, bkey_i_to_s(&tmp.key));
+
+		bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k));
+
+		bch2_trans_update(&trans, iter, &tmp.key);
+
+		/*
+		 * The iterator is not advanced here: the next peek re-examines
+		 * this position, whether or not the commit went through.
+		 */
+		ret = bch2_trans_commit(&trans, NULL, NULL,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOFAIL);
+
+		/*
+		 * don't want to leave ret == -EINTR, since if we raced and
+		 * something else overwrote the key we could spuriously return
+		 * -EINTR below:
+		 */
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			break;
+	}
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	BUG_ON(ret == -EINTR);
+
+	return ret;
+}
+
+/*
+ * Drop pointers to device @dev_idx from the extents btree and then the
+ * reflink btree, stopping at the first error.
+ */
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	return  __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?:
+		__bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK);
+}
+
+/*
+ * Walk every node of every btree and rewrite the key of each node that has a
+ * pointer on device @dev_idx so that it no longer does, then wait for all
+ * pending interior btree updates to finish.
+ *
+ * BCH_FORCE_IF_METADATA_LOST is not supported and yields -EINVAL. Returns 0
+ * or a negative error code, never -EINTR.
+ */
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct closure cl;
+	struct btree *b;
+	unsigned id;
+	int ret;
+
+	/* don't handle this yet: */
+	if (flags & BCH_FORCE_IF_METADATA_LOST)
+		return -EINVAL;
+
+	bch2_trans_init(&trans, c, 0, 0);
+	closure_init_stack(&cl);
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		for_each_btree_node(&trans, iter, id, POS_MIN,
+				    BTREE_ITER_PREFETCH, b) {
+			__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+			struct bkey_i_btree_ptr *new_key;
+retry:
+			if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
+						  dev_idx)) {
+				/*
+				 * we might have found a btree node key we
+				 * needed to update, and then tried to update it
+				 * but got -EINTR after upgrading the iter, but
+				 * then raced and the node is now gone:
+				 */
+				bch2_btree_iter_downgrade(iter);
+
+				ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
+				if (ret)
+					goto err;
+			} else {
+				/* Edit a copy of the node's key, then swap it in */
+				bkey_copy(&tmp.k, &b->key);
+				new_key = bkey_i_to_btree_ptr(&tmp.k);
+
+				ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i),
+						    dev_idx, flags, true);
+				if (ret)
+					goto err;
+
+				ret = bch2_btree_node_update_key(c, iter, b, new_key);
+				if (ret == -EINTR) {
+					/* Re-fetch the node and re-evaluate it */
+					b = bch2_btree_iter_peek_node(iter);
+					goto retry;
+				}
+				if (ret)
+					goto err;
+			}
+		}
+		bch2_trans_iter_free(&trans, iter);
+	}
+
+	/* flush relevant btree updates */
+	while (1) {
+		closure_wait_event(&c->btree_interior_update_wait,
+				   !bch2_btree_interior_updates_nr_pending(c) ||
+				   c->btree_roots_dirty);
+		if (!bch2_btree_interior_updates_nr_pending(c))
+			break;
+		/* Woken because btree roots are dirty: do a journal write */
+		bch2_journal_meta(&c->journal);
+	}
+
+	ret = 0;
+err:
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	BUG_ON(ret == -EINTR);
+
+	return ret;
+}
+
+/*
+ * Remove all references to device @dev_idx: user data first, then btree
+ * metadata, then a replicas gc pass. Stops at, and returns, the first error.
+ */
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+		bch2_dev_metadata_drop(c, dev_idx, flags) ?:
+		bch2_replicas_gc2(c);
+}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
new file mode 100644
index 000000000000..027efaa0d575
--- /dev/null
+++ b/fs/bcachefs/migrate.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MIGRATE_H
+#define _BCACHEFS_MIGRATE_H
+
+/*
+ * Arguments are (fs, device index, flags). Runs the user data drop, the
+ * metadata drop and then bch2_replicas_gc2(), returning the first nonzero
+ * result, or 0.
+ */
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
+
+#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
new file mode 100644
index 000000000000..ab20e981145b
--- /dev/null
+++ b/fs/bcachefs/move.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "disk_groups.h"
+#include "inode.h"
+#include "io.h"
+#include "journal_reclaim.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "keylist.h"
+
+#include <linux/ioprio.h>
+#include <linux/kthread.h>
+
+#include <trace/events/bcachefs.h>
+
+#define SECTORS_IN_FLIGHT_PER_DEVICE	2048
+
+/*
+ * State for moving a single extent: one read followed by one write.
+ *
+ * Allocated by bch2_move_extent() together with the trailing bio_vec array,
+ * and freed by move_free().
+ */
+struct moving_io {
+	/* entry on moving_context.reads until the write is kicked off */
+	struct list_head	list;
+	struct closure		cl;
+	/* set by move_read_endio() once the read has finished */
+	bool			read_completed;
+
+	/* in flight accounting; both are initialized to the extent's size */
+	unsigned		read_sectors;
+	unsigned		write_sectors;
+
+	struct bch_read_bio	rbio;
+
+	struct migrate_write	write;
+	/* Must be last since it is variable size */
+	struct bio_vec		bi_inline_vecs[];
+};
+
+/* Shared state for one bch2_move_data() run; lives on that function's stack. */
+struct moving_context {
+	/* Closure for waiting on all reads and writes to complete */
+	struct closure		cl;
+
+	/* caller supplied counters, updated as extents are moved */
+	struct bch_move_stats	*stats;
+
+	/* moving_ios whose read has been issued, in submission order */
+	struct list_head	reads;
+
+	/* in flight sectors: */
+	atomic_t		read_sectors;
+	atomic_t		write_sectors;
+
+	/* woken by move_read_endio() and move_free() */
+	wait_queue_head_t	wait;
+};
+
+/*
+ * Index update hook for migrate writes; bch2_migrate_write_init() installs it
+ * as op->index_update_fn.
+ *
+ * Walks the existing keys covering op->insert_keys. Where the existing extent
+ * still matches what was read (same version, still has m->ptr at m->offset,
+ * and for DATA_REWRITE still has a pointer to rewrite_dev), the newly written
+ * pointers are appended to a copy of it and that copy is committed. Ranges
+ * that no longer match are counted as raced and skipped.
+ *
+ * Returns 0 or an error code; -EINTR is never returned.
+ */
+static int bch2_migrate_index_update(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct migrate_write *m =
+		container_of(op, struct migrate_write, op);
+	struct keylist *keys = &op->insert_keys;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	iter = bch2_trans_get_iter(&trans, m->btree_id,
+				   bkey_start_pos(&bch2_keylist_front(keys)->k),
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	while (1) {
+		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+		struct bkey_i *insert;
+		struct bkey_i_extent *new =
+			bkey_i_to_extent(bch2_keylist_front(keys));
+		BKEY_PADDED(k) _new, _insert;
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		bool did_work = false;
+		int nr;
+
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		/* existing key is no longer the one we read from? */
+		if (bversion_cmp(k.k->version, new->k.version) ||
+		    !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
+			goto nomatch;
+
+		if (m->data_cmd == DATA_REWRITE &&
+		    !bch2_bkey_has_device(k, m->data_opts.rewrite_dev))
+			goto nomatch;
+
+		/* work on local copies of the existing key and the new key: */
+		bkey_reassemble(&_insert.k, k);
+		insert = &_insert.k;
+
+		bkey_copy(&_new.k, bch2_keylist_front(keys));
+		new = bkey_i_to_extent(&_new.k);
+
+		/* trim both copies against each other and the iterator position: */
+		bch2_cut_front(iter->pos, insert);
+		bch2_cut_back(new->k.p, &insert->k);
+		bch2_cut_back(insert->k.p, &new->k);
+
+		if (m->data_cmd == DATA_REWRITE)
+			bch2_bkey_drop_device(bkey_i_to_s(insert),
+					      m->data_opts.rewrite_dev);
+
+		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
+			if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
+				/*
+				 * raced with another move op? extent already
+				 * has a pointer to the device we just wrote
+				 * data to
+				 */
+				continue;
+			}
+
+			bch2_extent_ptr_decoded_append(insert, &p);
+			did_work = true;
+		}
+
+		/* no new pointer was added, so there is nothing to commit: */
+		if (!did_work)
+			goto nomatch;
+
+		bch2_bkey_narrow_crcs(insert,
+				(struct bch_extent_crc_unpacked) { 0 });
+		bch2_extent_normalize(c, bkey_i_to_s(insert));
+		bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
+					       op->opts.background_target,
+					       op->opts.data_replicas);
+
+		/*
+		 * If we're not fully overwriting @k, and it's compressed, we
+		 * need a reservation for all the pointers in @insert
+		 */
+		nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
+			 m->nr_ptrs_reserved;
+
+		if (insert->k.size < k.k->size &&
+		    bch2_extent_is_compressed(k) &&
+		    nr > 0) {
+			ret = bch2_disk_reservation_add(c, &op->res,
+					keylist_sectors(keys) * nr, 0);
+			if (ret)
+				goto out;
+
+			m->nr_ptrs_reserved += nr;
+			goto next;
+		}
+
+		bch2_trans_update(&trans, iter, insert);
+
+		ret = bch2_trans_commit(&trans, &op->res,
+				op_journal_seq(op),
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_USE_RESERVE|
+				m->data_opts.btree_insert_flags);
+		if (!ret)
+			atomic_long_inc(&c->extent_migrate_done);
+		/* -EINTR is not treated as a failure; carry on with the loop: */
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			break;
+next:
+		/* drop keys from the keylist that iter->pos has reached the end of: */
+		while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) {
+			bch2_keylist_pop_front(keys);
+			if (bch2_keylist_empty(keys))
+				goto out;
+		}
+
+		bch2_cut_front(iter->pos, bch2_keylist_front(keys));
+		continue;
+nomatch:
+		/* account for the range we're skipping, then move to the next slot: */
+		if (m->ctxt)
+			atomic64_add(k.k->p.offset - iter->pos.offset,
+				     &m->ctxt->stats->sectors_raced);
+		atomic_long_inc(&c->extent_migrate_raced);
+		trace_move_race(&new->k);
+		bch2_btree_iter_next_slot(iter);
+		goto next;
+	}
+out:
+	bch2_trans_exit(&trans);
+	BUG_ON(ret == -EINTR);
+	return ret;
+}
+
+/*
+ * Called once the read for a migrate has completed: copies what the write
+ * path needs (position, version, crc, devices that already have the data)
+ * from @rbio into @m's write op.
+ */
+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
+{
+	struct bch_write_op *op = &m->op;
+
+	/* write bio must own pages: */
+	BUG_ON(!op->wbio.bio.bi_vcnt);
+
+	/* remember which pointer we read from, for the index update: */
+	m->ptr		= rbio->pick.ptr;
+	m->offset	= rbio->pos.offset - rbio->pick.crc.offset;
+
+	op->devs_have	= rbio->devs_have;
+	op->pos		= rbio->pos;
+	op->version	= rbio->version;
+	op->crc		= rbio->pick.crc;
+
+	/* the write bio covers the data exactly as it was read: */
+	op->wbio.bio.bi_iter.bi_size = op->crc.compressed_size << 9;
+
+	if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
+		op->nonce	= op->crc.nonce + op->crc.offset;
+		op->csum_type	= op->crc.csum_type;
+	}
+
+	/* when rewriting, the old device no longer counts as having the data: */
+	if (m->data_cmd == DATA_REWRITE)
+		bch2_dev_list_drop_dev(&op->devs_have, m->data_opts.rewrite_dev);
+}
+
+/*
+ * Initialize @m, and its embedded write op, for moving the extent @k.
+ *
+ * @wp:		write point to allocate from
+ * @io_opts:	io options the write op is initialized with
+ * @data_cmd:	DATA_ADD_REPLICAS, DATA_REWRITE or DATA_PROMOTE; anything else
+ *		hits BUG()
+ * @data_opts:	target, rewrite device and extra btree insert flags
+ * @btree_id:	btree that @k lives in
+ *
+ * Takes whatever disk reservation the command needs up front. Returns 0, or
+ * the error from getting the reservation.
+ */
+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
+			    struct write_point_specifier wp,
+			    struct bch_io_opts io_opts,
+			    enum data_cmd data_cmd,
+			    struct data_opts data_opts,
+			    enum btree_id btree_id,
+			    struct bkey_s_c k)
+{
+	int ret;
+
+	m->btree_id	= btree_id;
+	m->data_cmd	= data_cmd;
+	m->data_opts	= data_opts;
+	m->nr_ptrs_reserved = 0;
+
+	bch2_write_op_init(&m->op, c, io_opts);
+	/* background compression setting wins over the foreground one, if set: */
+	m->op.compression_type =
+		bch2_compression_opt_to_type[io_opts.background_compression ?:
+					     io_opts.compression];
+	m->op.target	= data_opts.target;
+	m->op.write_point = wp;
+
+	if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+		m->op.alloc_reserve = RESERVE_MOVINGGC;
+
+	m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
+		BCH_WRITE_PAGES_STABLE|
+		BCH_WRITE_PAGES_OWNED|
+		BCH_WRITE_DATA_ENCODED;
+
+	m->op.nr_replicas	= 1;
+	m->op.nr_replicas_required = 1;
+	m->op.index_update_fn	= bch2_migrate_index_update;
+
+	switch (data_cmd) {
+	case DATA_ADD_REPLICAS: {
+		/*
+		 * DATA_ADD_REPLICAS is used for moving data to a different
+		 * device in the background, and due to compression the new copy
+		 * might take up more space than the old copy:
+		 */
+#if 0
+		int nr = (int) io_opts.data_replicas -
+			bch2_bkey_nr_dirty_ptrs(k);
+#endif
+		int nr = (int) io_opts.data_replicas;
+
+		if (nr > 0) {
+			m->op.nr_replicas = m->nr_ptrs_reserved = nr;
+
+			ret = bch2_disk_reservation_get(c, &m->op.res,
+					k.k->size, m->op.nr_replicas, 0);
+			if (ret)
+				return ret;
+		}
+		break;
+	}
+	case DATA_REWRITE: {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		unsigned compressed_sectors = 0;
+
+		/*
+		 * Sum the compressed size of every dirty, compressed pointer
+		 * that is already in the target:
+		 */
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+			if (!p.ptr.cached &&
+			    p.crc.compression_type != BCH_COMPRESSION_NONE &&
+			    bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
+				compressed_sectors += p.crc.compressed_size;
+
+		if (compressed_sectors) {
+			ret = bch2_disk_reservation_add(c, &m->op.res,
+					compressed_sectors,
+					BCH_DISK_RESERVATION_NOFAIL);
+			if (ret)
+				return ret;
+		}
+		break;
+	}
+	case DATA_PROMOTE:
+		m->op.flags	|= BCH_WRITE_ALLOC_NOWAIT;
+		m->op.flags	|= BCH_WRITE_CACHED;
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+/*
+ * Closure destructor for a moving_io: releases its disk reservation and its
+ * pages, wakes anyone waiting on the context, then frees the moving_io.
+ */
+static void move_free(struct closure *cl)
+{
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct bch_write_op *op = &io->write.op;
+	/* fetched up front: @io is gone by the time we return */
+	struct moving_context *ctxt = io->write.ctxt;
+	struct bvec_iter_all iter;
+	struct bio_vec *bv;
+
+	bch2_disk_reservation_put(op->c, &op->res);
+
+	bio_for_each_segment_all(bv, &op->wbio.bio, iter) {
+		if (bv->bv_page)
+			__free_page(bv->bv_page);
+	}
+
+	wake_up(&ctxt->wait);
+
+	kfree(io);
+}
+
+/*
+ * Runs after the write has finished: the write's sectors are no longer in
+ * flight, and the moving_io can be torn down via move_free().
+ */
+static void move_write_done(struct closure *cl)
+{
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct moving_context *ctxt = io->write.ctxt;
+
+	atomic_sub(io->write_sectors, &ctxt->write_sectors);
+
+	closure_return_with_destructor(cl, move_free);
+}
+
+/*
+ * Second half of a move: if the read failed or hit a hole there is nothing to
+ * write and the moving_io is just freed; otherwise the write is started, with
+ * move_write_done() running when it completes.
+ */
+static void move_write(struct closure *cl)
+{
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct moving_context *ctxt = io->write.ctxt;
+	bool nothing_to_write = io->rbio.bio.bi_status || io->rbio.hole;
+
+	if (unlikely(nothing_to_write)) {
+		closure_return_with_destructor(cl, move_free);
+		return;
+	}
+
+	bch2_migrate_read_done(&io->write, &io->rbio);
+
+	/* dropped again by move_write_done(): */
+	atomic_add(io->write_sectors, &ctxt->write_sectors);
+
+	closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+	continue_at(cl, move_write_done, NULL);
+}
+
+/*
+ * Returns the oldest moving_io on @ctxt->reads if its read has completed,
+ * otherwise NULL (also when the list is empty).
+ */
+static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+{
+	struct moving_io *head =
+		list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
+
+	if (!head || !head->read_completed)
+		return NULL;
+
+	return head;
+}
+
+/*
+ * Completion handler for the read half of a move: updates the in flight read
+ * accounting, marks the moving_io as ready for its write, and wakes the
+ * waiter if the head of the reads list can now be written.
+ */
+static void move_read_endio(struct bio *bio)
+{
+	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+	struct moving_context *ctxt = io->write.ctxt;
+
+	atomic_sub(io->read_sectors, &ctxt->read_sectors);
+	io->read_completed = true;
+
+	if (next_pending_write(ctxt))
+		wake_up(&ctxt->wait);
+
+	/* pairs with the closure_get() in bch2_move_extent(): */
+	closure_put(&ctxt->cl);
+}
+
+/*
+ * Start the write for every moving_io at the head of @ctxt->reads whose read
+ * has completed, taking each one off the list first.
+ */
+static void do_pending_writes(struct moving_context *ctxt)
+{
+	for (;;) {
+		struct moving_io *io = next_pending_write(ctxt);
+
+		if (!io)
+			break;
+
+		list_del(&io->list);
+		closure_call(&io->cl, move_write, NULL, &ctxt->cl);
+	}
+}
+
+/*
+ * Wait until @_cond becomes true, calling do_pending_writes() before every
+ * check so that completed reads get their writes issued while we wait.
+ *
+ * @_cond is evaluated more than once.
+ */
+#define move_ctxt_wait_event(_ctxt, _cond)			\
+do {								\
+	do_pending_writes(_ctxt);				\
+								\
+	if (_cond)						\
+		break;						\
+	__wait_event((_ctxt)->wait,				\
+		     next_pending_write(_ctxt) || (_cond));	\
+} while (1)
+
+/*
+ * Wait for the number of in flight write sectors to change from what it was
+ * on entry, or for there to be no writes in flight at all.
+ */
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+{
+	unsigned in_flight_on_entry = atomic_read(&ctxt->write_sectors);
+
+	move_ctxt_wait_event(ctxt,
+		!atomic_read(&ctxt->write_sectors) ||
+		atomic_read(&ctxt->write_sectors) != in_flight_on_entry);
+}
+
+/*
+ * Start moving a single extent @k: allocates a moving_io with enough pages,
+ * sets up the write via bch2_migrate_write_init(), queues the moving_io on
+ * @ctxt->reads and issues the read. The write itself is kicked off later from
+ * do_pending_writes() once the read has completed.
+ *
+ * Returns 0 once the read has been submitted, -ENOMEM if an allocation failed,
+ * or the error from bch2_migrate_write_init().
+ */
+static int bch2_move_extent(struct bch_fs *c,
+			    struct moving_context *ctxt,
+			    struct write_point_specifier wp,
+			    struct bch_io_opts io_opts,
+			    enum btree_id btree_id,
+			    struct bkey_s_c k,
+			    enum data_cmd data_cmd,
+			    struct data_opts data_opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct moving_io *io;
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned sectors = k.k->size, pages;
+	int ret = -ENOMEM;
+
+	/* throttle on in flight writes, then on in flight reads: */
+	move_ctxt_wait_event(ctxt,
+		atomic_read(&ctxt->write_sectors) <
+		SECTORS_IN_FLIGHT_PER_DEVICE);
+
+	move_ctxt_wait_event(ctxt,
+		atomic_read(&ctxt->read_sectors) <
+		SECTORS_IN_FLIGHT_PER_DEVICE);
+
+	/* write path might have to decompress data: */
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
+
+	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+	io = kzalloc(sizeof(struct moving_io) +
+		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
+	if (!io)
+		goto err;
+
+	io->write.ctxt		= ctxt;
+	io->read_sectors	= k.k->size;
+	io->write_sectors	= k.k->size;
+
+	bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
+	bio_set_prio(&io->write.op.wbio.bio,
+		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+	/*
+	 * NOTE(review): if bch2_bio_alloc_pages() can fail after having added
+	 * some pages to the bio, err_free would not free them - confirm.
+	 */
+	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+				 GFP_KERNEL))
+		goto err_free;
+
+	/* the read bio uses the same bio_vec array as the write bio: */
+	io->rbio.c		= c;
+	io->rbio.opts		= io_opts;
+	bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+	io->rbio.bio.bi_vcnt = pages;
+	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+	io->rbio.bio.bi_iter.bi_size = sectors << 9;
+
+	bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
+	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
+	io->rbio.bio.bi_end_io		= move_read_endio;
+
+	ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
+				      data_cmd, data_opts, btree_id, k);
+	if (ret)
+		goto err_free_pages;
+
+	atomic64_inc(&ctxt->stats->keys_moved);
+	atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+
+	trace_move_extent(k.k);
+
+	atomic_add(io->read_sectors, &ctxt->read_sectors);
+	list_add_tail(&io->list, &ctxt->reads);
+
+	/*
+	 * dropped by move_read_endio() - guards against use after free of
+	 * ctxt when doing wakeup
+	 */
+	closure_get(&ctxt->cl);
+	bch2_read_extent(c, &io->rbio, k, 0,
+			 BCH_READ_NODECODE|
+			 BCH_READ_LAST_FRAGMENT);
+	return 0;
+err_free_pages:
+	bio_free_pages(&io->write.op.wbio.bio);
+err_free:
+	kfree(io);
+err:
+	trace_move_alloc_fail(k.k);
+	return ret;
+}
+
+/*
+ * Walk the keys of @btree_id from @start up to (not including) @end, ask
+ * @pred what to do with each extent, and start a move for every extent it
+ * doesn't skip.
+ *
+ * @rate, if non NULL, paces the walk: we sleep for whatever delay it asks for
+ * and charge it for each extent moved. When running as a kthread the walk
+ * stops as soon as kthread_should_stop() is true, and the nonzero result of
+ * that call is what gets returned.
+ *
+ * Moves are only started here; the caller waits for them to finish.
+ *
+ * Returns 0, an error from peeking the iterator or from bch2_trans_exit(), or
+ * nonzero when stopped as described above.
+ */
+static int __bch2_move_data(struct bch_fs *c,
+		struct moving_context *ctxt,
+		struct bch_ratelimit *rate,
+		struct write_point_specifier wp,
+		struct bpos start,
+		struct bpos end,
+		move_pred_fn pred, void *arg,
+		struct bch_move_stats *stats,
+		enum btree_id btree_id)
+{
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+	BKEY_PADDED(k) tmp;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct data_opts data_opts;
+	enum data_cmd data_cmd;
+	u64 delay, cur_inum = U64_MAX;
+	int ret = 0, ret2;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	stats->data_type = BCH_DATA_USER;
+	stats->btree_id	= btree_id;
+	stats->pos	= POS_MIN;
+
+	iter = bch2_trans_get_iter(&trans, btree_id, start,
+				   BTREE_ITER_PREFETCH);
+
+	if (rate)
+		bch2_ratelimit_reset(rate);
+
+	while (1) {
+		/* rate limiting: sleep, with btree locks dropped, until allowed to go on */
+		do {
+			delay = rate ? bch2_ratelimit_delay(rate) : 0;
+
+			if (delay) {
+				bch2_trans_unlock(&trans);
+				set_current_state(TASK_INTERRUPTIBLE);
+			}
+
+			if (kthread && (ret = kthread_should_stop())) {
+				__set_current_state(TASK_RUNNING);
+				goto out;
+			}
+
+			if (delay)
+				schedule_timeout(delay);
+
+			/* let all queued reads get their writes issued before freezing: */
+			if (unlikely(freezing(current))) {
+				bch2_trans_unlock(&trans);
+				move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+				try_to_freeze();
+			}
+		} while (delay);
+peek:
+		k = bch2_btree_iter_peek(iter);
+
+		stats->pos = iter->pos;
+
+		if (!k.k)
+			break;
+		ret = bkey_err(k);
+		if (ret)
+			break;
+		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+			break;
+
+		if (!bkey_extent_is_direct_data(k.k))
+			goto next_nondata;
+
+		/* in the extents btree, reload io options whenever the inode changes: */
+		if (btree_id == BTREE_ID_EXTENTS &&
+		    cur_inum != k.k->p.inode) {
+			struct bch_inode_unpacked inode;
+
+			/* don't hold btree locks while looking up inode: */
+			bch2_trans_unlock(&trans);
+
+			io_opts = bch2_opts_to_inode_opts(c->opts);
+			if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+				bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
+			cur_inum = k.k->p.inode;
+			/* locks were dropped, so look the key up again: */
+			goto peek;
+		}
+
+		switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
+		case DATA_SKIP:
+			goto next;
+		case DATA_SCRUB:
+			BUG();
+		case DATA_ADD_REPLICAS:
+		case DATA_REWRITE:
+		case DATA_PROMOTE:
+			break;
+		default:
+			BUG();
+		}
+
+		/* unlock before doing IO: */
+		bkey_reassemble(&tmp.k, k);
+		k = bkey_i_to_s_c(&tmp.k);
+		bch2_trans_unlock(&trans);
+
+		ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
+					data_cmd, data_opts);
+		if (ret2) {
+			if (ret2 == -ENOMEM) {
+				/* memory allocation failure, wait for some IO to finish */
+				bch2_move_ctxt_wait_for_io(ctxt);
+				continue;
+			}
+
+			/* XXX signal failure */
+			goto next;
+		}
+
+		if (rate)
+			bch2_ratelimit_increment(rate, k.k->size);
+next:
+		atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k),
+			     &stats->sectors_seen);
+next_nondata:
+		bch2_btree_iter_next(iter);
+		bch2_trans_cond_resched(&trans);
+	}
+out:
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	return ret;
+}
+
+/*
+ * Move user data between @start and @end, as directed by @pred: first the
+ * extents btree, then - provided that pass returned 0 - the reflink btree.
+ * Waits for all of the moves that were started to finish before returning.
+ *
+ * Returns the result of the last pass that ran.
+ */
+int bch2_move_data(struct bch_fs *c,
+		   struct bch_ratelimit *rate,
+		   struct write_point_specifier wp,
+		   struct bpos start,
+		   struct bpos end,
+		   move_pred_fn pred, void *arg,
+		   struct bch_move_stats *stats)
+{
+	struct moving_context ctxt = { .stats = stats };
+	int ret;
+
+	init_waitqueue_head(&ctxt.wait);
+	INIT_LIST_HEAD(&ctxt.reads);
+	closure_init_stack(&ctxt.cl);
+
+	stats->data_type = BCH_DATA_USER;
+
+	ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
+			       pred, arg, stats, BTREE_ID_EXTENTS);
+	if (!ret)
+		ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
+				       pred, arg, stats, BTREE_ID_REFLINK);
+
+	/* issue the remaining writes, then wait for everything in flight: */
+	move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+	closure_sync(&ctxt.cl);
+
+	EBUG_ON(atomic_read(&ctxt.write_sectors));
+
+	trace_move_data(c,
+			atomic64_read(&stats->sectors_moved),
+			atomic64_read(&stats->keys_moved));
+
+	return ret;
+}
+
+/*
+ * Walk every node of every btree and rewrite the nodes @pred doesn't skip.
+ * For btree nodes @pred may only answer DATA_SKIP, DATA_ADD_REPLICAS or
+ * DATA_REWRITE; anything else is a BUG.
+ *
+ * Errors don't stop the walk. Returns 0 if everything succeeded, otherwise
+ * the most recent nonzero result.
+ */
+static int bch2_move_btree(struct bch_fs *c,
+			   move_pred_fn pred,
+			   void *arg,
+			   struct bch_move_stats *stats)
+{
+	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct btree *b;
+	unsigned id;
+	struct data_opts data_opts;
+	enum data_cmd cmd;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	stats->data_type = BCH_DATA_BTREE;
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		stats->btree_id = id;
+
+		for_each_btree_node(&trans, iter, id, POS_MIN,
+				    BTREE_ITER_PREFETCH, b) {
+			stats->pos = iter->pos;
+
+			cmd = pred(c, arg, bkey_i_to_s_c(&b->key),
+				   &io_opts, &data_opts);
+
+			if (cmd != DATA_SKIP) {
+				BUG_ON(cmd != DATA_ADD_REPLICAS &&
+				       cmd != DATA_REWRITE);
+
+				ret = bch2_btree_node_rewrite(c, iter,
+						b->data->keys.seq, 0) ?: ret;
+			}
+
+			bch2_trans_cond_resched(&trans);
+		}
+
+		ret = bch2_trans_iter_free(&trans, iter) ?: ret;
+	}
+
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+/*
+ * Compiled out: a predicate that answers DATA_SCRUB for every key. The move
+ * paths in this file BUG() on DATA_SCRUB, so it can't be used as is.
+ */
+#if 0
+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
+				struct bkey_s_c k,
+				struct bch_io_opts *io_opts,
+				struct data_opts *data_opts)
+{
+	return DATA_SCRUB;
+}
+#endif
+
+/*
+ * Predicate for BCH_DATA_OP_REREPLICATE: asks for more replicas of any key
+ * whose durability is nonzero but below the number of replicas wanted for its
+ * type (metadata_replicas for btree pointers, data_replicas for extents).
+ * Everything else, including other key types, is skipped.
+ */
+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
+				      struct bkey_s_c k,
+				      struct bch_io_opts *io_opts,
+				      struct data_opts *data_opts)
+{
+	unsigned durability = bch2_bkey_durability(c, k);
+	unsigned wanted = 0;
+
+	if (k.k->type == KEY_TYPE_btree_ptr)
+		wanted = c->opts.metadata_replicas;
+	else if (k.k->type == KEY_TYPE_extent)
+		wanted = io_opts->data_replicas;
+
+	if (durability && durability < wanted) {
+		data_opts->target		= 0;
+		data_opts->btree_insert_flags	= 0;
+		return DATA_ADD_REPLICAS;
+	}
+
+	return DATA_SKIP;
+}
+
+/*
+ * Predicate for BCH_DATA_OP_MIGRATE: any key with a pointer to the device
+ * being migrated off of gets rewritten; @arg is the struct bch_ioctl_data.
+ */
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
+				  struct bkey_s_c k,
+				  struct bch_io_opts *io_opts,
+				  struct data_opts *data_opts)
+{
+	struct bch_ioctl_data *op = arg;
+
+	if (bch2_bkey_has_device(k, op->migrate.dev)) {
+		data_opts->target		= 0;
+		data_opts->btree_insert_flags	= 0;
+		data_opts->rewrite_dev		= op->migrate.dev;
+		return DATA_REWRITE;
+	}
+
+	return DATA_SKIP;
+}
+
+/*
+ * Run the data job described by @op, reporting progress through @stats.
+ *
+ * Both supported jobs flush journal pins, then process btree nodes, then user
+ * data between op.start and op.end, running replicas gc after the btree pass
+ * and again after the data pass. Every step runs even if an earlier one
+ * failed.
+ *
+ * Returns 0 or the result of the last step that failed; -EINVAL for an
+ * unknown op, or when the device to migrate doesn't exist.
+ */
+int bch2_data_job(struct bch_fs *c,
+		  struct bch_move_stats *stats,
+		  struct bch_ioctl_data op)
+{
+	int ret = 0;
+
+	if (op.op == BCH_DATA_OP_REREPLICATE) {
+		stats->data_type = BCH_DATA_JOURNAL;
+		ret = bch2_journal_flush_device_pins(&c->journal, -1);
+
+		ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+
+		/* wait for the btree node rewrites to finish: */
+		for (;;) {
+			closure_wait_event(&c->btree_interior_update_wait,
+					   !bch2_btree_interior_updates_nr_pending(c) ||
+					   c->btree_roots_dirty);
+			if (!bch2_btree_interior_updates_nr_pending(c))
+				break;
+			bch2_journal_meta(&c->journal);
+		}
+
+		ret = bch2_replicas_gc2(c) ?: ret;
+
+		ret = bch2_move_data(c, NULL,
+				     writepoint_hashed((unsigned long) current),
+				     op.start,
+				     op.end,
+				     rereplicate_pred, c, stats) ?: ret;
+		ret = bch2_replicas_gc2(c) ?: ret;
+	} else if (op.op == BCH_DATA_OP_MIGRATE) {
+		if (op.migrate.dev >= c->sb.nr_devices)
+			return -EINVAL;
+
+		stats->data_type = BCH_DATA_JOURNAL;
+		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
+
+		ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
+		ret = bch2_replicas_gc2(c) ?: ret;
+
+		ret = bch2_move_data(c, NULL,
+				     writepoint_hashed((unsigned long) current),
+				     op.start,
+				     op.end,
+				     migrate_pred, &op, stats) ?: ret;
+		ret = bch2_replicas_gc2(c) ?: ret;
+	} else {
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
new file mode 100644
index 000000000000..0acd1720d4f8
--- /dev/null
+++ b/fs/bcachefs/move.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_H
+#define _BCACHEFS_MOVE_H
+
+#include "btree_iter.h"
+#include "buckets.h"
+#include "io_types.h"
+#include "move_types.h"
+
+struct bch_read_bio;
+struct moving_context;
+
+/* What a move_pred_fn wants done with a key: */
+enum data_cmd {
+	/* leave the key alone */
+	DATA_SKIP,
+	/* not implemented: the move paths BUG() on this */
+	DATA_SCRUB,
+	/* write additional replicas */
+	DATA_ADD_REPLICAS,
+	/* write a new copy, replacing the pointer to data_opts.rewrite_dev */
+	DATA_REWRITE,
+	/* write a cached copy, without waiting on the allocator */
+	DATA_PROMOTE,
+};
+
+/* Filled in by a move_pred_fn for keys it wants moved: */
+struct data_opts {
+	/* becomes the write op's target */
+	u16		target;
+	/* DATA_REWRITE only: device whose pointer is being replaced */
+	unsigned	rewrite_dev;
+	/* extra BTREE_INSERT_* flags for the index update */
+	int		btree_insert_flags;
+};
+
+/* A write that moves an existing extent; see bch2_migrate_write_init(). */
+struct migrate_write {
+	enum btree_id		btree_id;
+	enum data_cmd		data_cmd;
+	struct data_opts	data_opts;
+
+	/* number of pointers the disk reservation in @op currently covers */
+	unsigned		nr_ptrs_reserved;
+
+	/* may be NULL; used for sectors_raced accounting when set */
+	struct moving_context	*ctxt;
+
+	/* what we read: */
+	struct bch_extent_ptr	ptr;
+	u64			offset;
+
+	struct bch_write_op	op;
+};
+
+/*
+ * bch2_migrate_write_init() sets a migrate_write up before the read is
+ * issued; bch2_migrate_read_done() finishes the setup from the completed
+ * read, just before the write is started.
+ */
+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
+			    struct write_point_specifier,
+			    struct bch_io_opts,
+			    enum data_cmd, struct data_opts,
+			    enum btree_id, struct bkey_s_c);
+
+/*
+ * Predicate deciding what to do with a key. Arguments are the filesystem, the
+ * caller's private pointer, the key, the io options in effect for it, and the
+ * data_opts to fill in when the answer isn't DATA_SKIP.
+ */
+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
+				struct bkey_s_c,
+				struct bch_io_opts *, struct data_opts *);
+
+/*
+ * Move user data in the given key range as directed by the predicate; the
+ * ratelimit may be NULL. Waits for the moves it started before returning.
+ */
+int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
+		   struct write_point_specifier,
+		   struct bpos, struct bpos,
+		   move_pred_fn, void *,
+		   struct bch_move_stats *);
+
+/* Run a rereplicate or migrate job; -EINVAL for any other op. */
+int bch2_data_job(struct bch_fs *,
+		  struct bch_move_stats *,
+		  struct bch_ioctl_data);
+
+
+#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
new file mode 100644
index 000000000000..6788170d3f95
--- /dev/null
+++ b/fs/bcachefs/move_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_TYPES_H
+#define _BCACHEFS_MOVE_TYPES_H
+
+/* Progress and counters for a move/data job: */
+struct bch_move_stats {
+	/* what is currently being processed, and where: */
+	enum bch_data_type	data_type;
+	enum btree_id		btree_id;
+	struct bpos		pos;
+
+	/* extents a move was started for, and their total size */
+	atomic64_t		keys_moved;
+	atomic64_t		sectors_moved;
+	/* size times number of dirty pointers, for every extent looked at */
+	atomic64_t		sectors_seen;
+	/* sectors skipped at index update time because the extent changed */
+	atomic64_t		sectors_raced;
+};
+
+#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
new file mode 100644
index 000000000000..710296044194
--- /dev/null
+++ b/fs/bcachefs/movinggc.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "extents.h"
+#include "eytzinger.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "movinggc.h"
+#include "super-io.h"
+
+#include <trace/events/bcachefs.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+#include <linux/wait.h>
+
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+/* i.e. half the size of @ca's RESERVE_MOVINGGC free fifo: */
+#define COPYGC_BUCKETS_PER_ITER(ca)					\
+	((ca)->free[RESERVE_MOVINGGC].size / 2)
+
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+/* COPYGC_BUCKETS_PER_ITER() expressed in sectors: */
+#define COPYGC_SECTORS_PER_ITER(ca)					\
+	((ca)->mi.bucket_size *	COPYGC_BUCKETS_PER_ITER(ca))
+
+/*
+ * Heap comparison: orders copygc heap entries by sectors used. @heap is
+ * unused. bch2_copygc() passes this to the heap macros negated.
+ */
+static inline int sectors_used_cmp(copygc_heap *heap,
+				   struct copygc_heap_entry l,
+				   struct copygc_heap_entry r)
+{
+	return cmp_int(l.sectors, r.sectors);
+}
+
+/*
+ * Comparison callback for the eytzinger sort/search helpers: orders copygc
+ * heap entries by bucket offset. @size is unused.
+ */
+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
+{
+	const struct copygc_heap_entry *l = _l;
+	const struct copygc_heap_entry *r = _r;
+
+	return cmp_int(l->offset, r->offset);
+}
+
+/*
+ * Does @k have a pointer to @ca that lands in one of the buckets copygc
+ * picked for this iteration? Looks up the last heap entry at or below the
+ * pointer's offset, then checks that the pointer falls inside that bucket and
+ * that the generations match.
+ */
+static bool __copygc_pred(struct bch_dev *ca,
+			  struct bkey_s_c k)
+{
+	const struct bch_extent_ptr *ptr =
+		bch2_bkey_has_device(k, ca->dev_idx);
+	copygc_heap *h = &ca->copygc_heap;
+	struct copygc_heap_entry search;
+	ssize_t i;
+
+	if (!ptr)
+		return false;
+
+	search = (struct copygc_heap_entry) { .offset = ptr->offset };
+
+	i = eytzinger0_find_le(h->data, h->used,
+			       sizeof(h->data[0]),
+			       bucket_offset_cmp, &search);
+	if (i < 0)
+		return false;
+
+	return ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+		ptr->gen == h->data[i].gen;
+}
+
+/*
+ * move_pred_fn for copygc; @arg is the device being garbage collected. Keys
+ * with data in one of the selected buckets are rewritten within the same
+ * device, using the btree reserve for the index update.
+ */
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+				 struct bkey_s_c k,
+				 struct bch_io_opts *io_opts,
+				 struct data_opts *data_opts)
+{
+	struct bch_dev *ca = arg;
+
+	if (__copygc_pred(ca, k)) {
+		data_opts->target		= dev_to_target(ca->dev_idx);
+		data_opts->btree_insert_flags	= BTREE_INSERT_USE_RESERVE;
+		data_opts->rewrite_dev		= ca->dev_idx;
+		return DATA_REWRITE;
+	}
+
+	return DATA_SKIP;
+}
+
+/*
+ * True once the copygc reserve fifo is full - or the allocator isn't running,
+ * in which case there is no point waiting for it to fill up.
+ */
+static bool have_copygc_reserve(struct bch_dev *ca)
+{
+	bool ready;
+
+	spin_lock(&ca->freelist_lock);
+	ready = fifo_full(&ca->free[RESERVE_MOVINGGC]);
+	if (!ready)
+		ready = ca->allocator_state != ALLOCATOR_RUNNING;
+	spin_unlock(&ca->freelist_lock);
+
+	return ready;
+}
+
+/*
+ * Run one iteration of copygc on @ca:
+ *
+ *  - wait for the copygc reserve to be available
+ *  - collect the partially used user data buckets into @ca's copygc heap
+ *  - drop entries until what is left fits in COPYGC_SECTORS_PER_ITER()
+ *  - sort the survivors by offset and move every extent that points into
+ *    one of them (see copygc_pred())
+ *  - report how much of the selected data is still there afterwards
+ */
+static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
+{
+	copygc_heap *h = &ca->copygc_heap;
+	struct copygc_heap_entry e, *i;
+	struct bucket_array *buckets;
+	struct bch_move_stats move_stats;
+	u64 sectors_to_move = 0, sectors_not_moved = 0;
+	u64 buckets_to_move, buckets_not_moved = 0;
+	size_t b;
+	int ret;
+
+	memset(&move_stats, 0, sizeof(move_stats));
+	closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+
+	/*
+	 * Find buckets with lowest sector counts, skipping completely
+	 * empty buckets, by building a maxheap sorted by sector count,
+	 * and repeatedly replacing the maximum element until all
+	 * buckets have been visited.
+	 */
+	h->used = 0;
+
+	/*
+	 * We need bucket marks to be up to date - gc can't be recalculating
+	 * them:
+	 */
+	down_read(&c->gc_lock);
+	down_read(&ca->bucket_lock);
+	buckets = bucket_array(ca);
+
+	for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
+		struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+
+		/* only partially full buckets of user data are candidates: */
+		if (m.owned_by_allocator ||
+		    m.data_type != BCH_DATA_USER ||
+		    !bucket_sectors_used(m) ||
+		    bucket_sectors_used(m) >= ca->mi.bucket_size)
+			continue;
+
+		e = (struct copygc_heap_entry) {
+			.gen		= m.gen,
+			.sectors	= bucket_sectors_used(m),
+			.offset		= bucket_to_sector(ca, b),
+		};
+		heap_add_or_replace(h, e, -sectors_used_cmp, NULL);
+	}
+	up_read(&ca->bucket_lock);
+	up_read(&c->gc_lock);
+
+	for (i = h->data; i < h->data + h->used; i++)
+		sectors_to_move += i->sectors;
+
+	/* too much for one iteration? drop entries until it fits: */
+	while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
+		BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL));
+		sectors_to_move -= e.sectors;
+	}
+
+	buckets_to_move = h->used;
+
+	if (!buckets_to_move)
+		return;
+
+	/* __copygc_pred() searches the heap by bucket offset: */
+	eytzinger0_sort(h->data, h->used,
+			sizeof(h->data[0]),
+			bucket_offset_cmp, NULL);
+
+	ret = bch2_move_data(c, &ca->copygc_pd.rate,
+			     writepoint_ptr(&ca->copygc_write_point),
+			     POS_MIN, POS_MAX,
+			     copygc_pred, ca,
+			     &move_stats);
+
+	/* count what is still sitting in the buckets we tried to empty: */
+	down_read(&ca->bucket_lock);
+	buckets = bucket_array(ca);
+	for (i = h->data; i < h->data + h->used; i++) {
+		size_t bucket = sector_to_bucket(ca, i->offset);
+		struct bucket_mark m = READ_ONCE(buckets->b[bucket].mark);
+
+		if (i->gen == m.gen && bucket_sectors_used(m)) {
+			sectors_not_moved += bucket_sectors_used(m);
+			buckets_not_moved++;
+		}
+	}
+	up_read(&ca->bucket_lock);
+
+	if (sectors_not_moved && !ret)
+		bch_warn_ratelimited(c,
+			"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
+			 sectors_not_moved, sectors_to_move,
+			 buckets_not_moved, buckets_to_move);
+
+	trace_copygc(ca,
+		     atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
+		     buckets_to_move, buckets_not_moved);
+}
+
+/*
+ * Per device copygc kthread; @arg is the struct bch_dev.
+ *
+ * Sleeps on the write io clock for as long as the device has more than
+ * copygc_threshold sectors available, or less than that much fragmented
+ * space; otherwise runs bch2_copygc(). Exits when asked to stop.
+ */
+static int bch2_copygc_thread(void *arg)
+{
+	struct bch_dev *ca = arg;
+	struct bch_fs *c = ca->fs;
+	struct io_clock *clock = &c->io_clock[WRITE];
+	struct bch_dev_usage usage;
+	unsigned long last;
+	u64 available, fragmented, reserve, next;
+
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		if (kthread_wait_freezable(c->copy_gc_enabled))
+			break;
+
+		last = atomic_long_read(&clock->now);
+
+		reserve = ca->copygc_threshold;
+
+		usage = bch2_dev_usage_read(c, ca);
+
+		/* plenty of space left: sleep until the excess has been written */
+		available = __dev_buckets_available(ca, usage) *
+			ca->mi.bucket_size;
+		if (available > reserve) {
+			next = last + available - reserve;
+			bch2_kthread_io_clock_wait(clock, next,
+					MAX_SCHEDULE_TIMEOUT);
+			continue;
+		}
+
+		/*
+		 * don't start copygc until there's more than half the copygc
+		 * reserve of fragmented space:
+		 *
+		 * NOTE(review): the check below compares against
+		 * copygc_threshold itself - confirm how that relates to
+		 * "half the copygc reserve".
+		 */
+		fragmented = usage.sectors_fragmented;
+		if (fragmented < reserve) {
+			next = last + reserve - fragmented;
+			bch2_kthread_io_clock_wait(clock, next,
+					MAX_SCHEDULE_TIMEOUT);
+			continue;
+		}
+
+		bch2_copygc(c, ca);
+	}
+
+	return 0;
+}
+
+/*
+ * Stop @ca's copygc thread, if there is one. The rate limit is lifted first
+ * so the thread isn't left sleeping on it.
+ */
+void bch2_copygc_stop(struct bch_dev *ca)
+{
+	struct task_struct *t;
+
+	ca->copygc_pd.rate.rate = UINT_MAX;
+	bch2_ratelimit_reset(&ca->copygc_pd.rate);
+
+	t = ca->copygc_thread;
+	if (t) {
+		kthread_stop(t);
+		/* drops the reference taken in bch2_copygc_start(): */
+		put_task_struct(t);
+	}
+
+	ca->copygc_thread = NULL;
+}
+
+/*
+ * Start the copygc thread for @ca. Does nothing (and returns 0) if it is
+ * already running or the filesystem was mounted with nochanges.
+ *
+ * Returns 0 on success, -ENOMEM on an injected init fault, or the error from
+ * kthread_create().
+ */
+int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct task_struct *t;
+
+	if (ca->copygc_thread || c->opts.nochanges)
+		return 0;
+
+	if (bch2_fs_init_fault("copygc_start"))
+		return -ENOMEM;
+
+	t = kthread_create(bch2_copygc_thread, ca,
+			   "bch_copygc[%s]", ca->name);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	/* hold a reference for bch2_copygc_stop(): */
+	get_task_struct(t);
+	ca->copygc_thread = t;
+
+	wake_up_process(ca->copygc_thread);
+
+	return 0;
+}
+
+/* Initialize @ca's copygc pd controller, with its d_term set to 0. */
+void bch2_dev_copygc_init(struct bch_dev *ca)
+{
+	bch2_pd_controller_init(&ca->copygc_pd);
+	ca->copygc_pd.d_term = 0;
+}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
new file mode 100644
index 000000000000..dcd479632cf1
--- /dev/null
+++ b/fs/bcachefs/movinggc.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVINGGC_H
+#define _BCACHEFS_MOVINGGC_H
+
+/* Stop/start a device's copygc thread; both are no-ops when already there. */
+void bch2_copygc_stop(struct bch_dev *);
+int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
+/* One time setup of the device's copygc rate controller. */
+void bch2_dev_copygc_init(struct bch_dev *);
+
+#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
new file mode 100644
index 000000000000..13a9a2fcd575
--- /dev/null
+++ b/fs/bcachefs/opts.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+
+#include "bcachefs.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "opts.h"
+#include "super-io.h"
+#include "util.h"
+
+/*
+ * String tables. Every table is NULL terminated; the ones used as OPT_STR()
+ * choices are searched by bch2_opt_parse() with match_string(choices, -1, ...),
+ * which relies on that terminator, and the index of the matching string is the
+ * option's value.
+ */
+
+/* Choices for the "errors" option */
+const char * const bch2_error_actions[] = {
+	"continue",
+	"remount-ro",
+	"panic",
+	NULL
+};
+
+/* Choices for the "metadata_checksum" and "data_checksum" options */
+const char * const bch2_csum_types[] = {
+	"none",
+	"crc32c",
+	"crc64",
+	NULL
+};
+
+/* Choices for the "compression" and "background_compression" options */
+const char * const bch2_compression_types[] = {
+	"none",
+	"lz4",
+	"gzip",
+	"zstd",
+	NULL
+};
+
+/* Choices for the "str_hash" option */
+const char * const bch2_str_hash_types[] = {
+	"crc32c",
+	"crc64",
+	"siphash",
+	NULL
+};
+
+/* Names of data types */
+const char * const bch2_data_types[] = {
+	"none",
+	"sb",
+	"journal",
+	"btree",
+	"data",
+	"cached",
+	NULL
+};
+
+/* Names of cache replacement policies */
+const char * const bch2_cache_replacement_policies[] = {
+	"lru",
+	"fifo",
+	"random",
+	NULL
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+const char * const bch2_cache_modes[] = {
+	"default",
+	"writethrough",
+	"writeback",
+	"writearound",
+	"none",
+	NULL
+};
+
+/* Names of device states */
+const char * const bch2_dev_state[] = {
+	"readwrite",
+	"readonly",
+	"failed",
+	"spare",
+	NULL
+};
+
+/*
+ * Copy every option that is defined in @src into @dst (marking it defined
+ * there as well). Options that are not defined in @src leave @dst untouched.
+ */
+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
+{
+#define x(_name, ...)						\
+	if (opt_defined(src, _name))					\
+		opt_set(*dst, _name, src._name);
+
+	BCH_OPTS()
+#undef x
+}
+
+/*
+ * Returns whether the option identified by @id is defined in @opts.
+ *
+ * BUG()s if @id does not name an option listed in BCH_OPTS().
+ */
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+	switch (id) {
+#define x(_name, ...)						\
+	case Opt_##_name:						\
+		return opt_defined(*opts, _name);
+	BCH_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Returns the stored value of the option identified by @id, widened to u64.
+ * The value is returned whether or not the option is marked defined.
+ *
+ * BUG()s if @id does not name an option listed in BCH_OPTS().
+ */
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+	switch (id) {
+#define x(_name, ...)						\
+	case Opt_##_name:						\
+		return opts->_name;
+	BCH_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Set the option identified by @id in @opts to @v and mark it defined.
+ * @v is converted to the option's in-memory type on assignment.
+ *
+ * BUG()s if @id does not name an option listed in BCH_OPTS().
+ */
+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
+{
+	switch (id) {
+#define x(_name, ...)						\
+	case Opt_##_name:						\
+		opt_set(*opts, _name, v);				\
+		break;
+	BCH_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ *
+ * Starts from bch2_opts_empty() and, for every option whose superblock
+ * accessor is not NO_SB_OPT, sets the option (and marks it defined) from the
+ * value read out of @sb. Options without a superblock field keep the all-zero
+ * state of bch2_opts_empty().
+ */
+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
+{
+	struct bch_opts opts = bch2_opts_empty();
+
+#define x(_name, _bits, _mode, _type, _sb_opt, ...)			\
+	if (_sb_opt != NO_SB_OPT)					\
+		opt_set(opts, _name, _sb_opt(sb));
+	BCH_OPTS()
+#undef x
+
+	return opts;
+}
+
+/*
+ * Table describing every option in BCH_OPTS(), indexed by enum bch_opt_id
+ * (Opt_<name>).
+ *
+ * The OPT_BOOL()/OPT_UINT()/OPT_SECTORS()/OPT_STR()/OPT_FN() helpers expand to
+ * the designated initializers for .type and the type-specific union members.
+ * The sysfs attribute is writeable (0644) only for options with OPT_RUNTIME
+ * in their mode, read-only (0444) otherwise.
+ */
+const struct bch_option bch2_opt_table[] = {
+#define OPT_BOOL()		.type = BCH_OPT_BOOL
+#define OPT_UINT(_min, _max)	.type = BCH_OPT_UINT, .min = _min, .max = _max
+#define OPT_SECTORS(_min, _max)	.type = BCH_OPT_SECTORS, .min = _min, .max = _max
+#define OPT_STR(_choices)	.type = BCH_OPT_STR, .choices = _choices
+#define OPT_FN(_fn)		.type = BCH_OPT_FN,			\
+				.parse = _fn##_parse,			\
+				.to_text = _fn##_to_text
+
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help)	\
+	[Opt_##_name] = {						\
+		.attr	= {						\
+			.name	= #_name,				\
+			.mode = (_mode) & OPT_RUNTIME ? 0644 : 0444,	\
+		},							\
+		.mode	= _mode,					\
+		.hint	= _hint,					\
+		.help	= _help,					\
+		.set_sb	= SET_##_sb_opt,				\
+		_type							\
+	},
+
+	BCH_OPTS()
+#undef x
+};
+
+/*
+ * Find an option by its exact name.
+ *
+ * Returns the option's index in bch2_opt_table (its enum bch_opt_id), or -1
+ * if no option has that name.
+ */
+int bch2_opt_lookup(const char *name)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(bch2_opt_table); idx++) {
+		const struct bch_option *opt = &bch2_opt_table[idx];
+
+		if (strcmp(name, opt->attr.name) == 0)
+			return idx;
+	}
+
+	return -1;
+}
+
+/* Alias for a mount option: @s1 is accepted in place of option name @s2 */
+struct synonym {
+	const char	*s1, *s2;
+};
+
+/* Aliases applied by bch2_mount_opt_lookup() before the option table lookup */
+static const struct synonym bch_opt_synonyms[] = {
+	{ "quota",	"usrquota" },
+};
+
+/*
+ * Like bch2_opt_lookup(), but first translates mount option aliases listed
+ * in bch_opt_synonyms to the real option name.
+ *
+ * Returns the option's index in bch2_opt_table, or -1 if not found.
+ */
+static int bch2_mount_opt_lookup(const char *name)
+{
+	unsigned n;
+
+	for (n = 0; n < ARRAY_SIZE(bch_opt_synonyms); n++) {
+		const struct synonym *syn = &bch_opt_synonyms[n];
+
+		if (strcmp(name, syn->s1) == 0)
+			name = syn->s2;
+	}
+
+	return bch2_opt_lookup(name);
+}
+
+/* True if @v lies in the half-open range [opt->min, opt->max) */
+static bool opt_val_in_range(const struct bch_option *opt, u64 v)
+{
+	return v >= opt->min && v < opt->max;
+}
+
+/*
+ * Parse the string @val as a value for option @opt, storing it in @res.
+ *
+ * - BCH_OPT_BOOL:    decimal 0 or 1
+ * - BCH_OPT_UINT:    decimal integer in [opt->min, opt->max)
+ * - BCH_OPT_SECTORS: size parsed by bch2_strtou64_h(); must be a multiple of
+ *                    512 and is stored divided by 512, range checked after
+ *                    the division
+ * - BCH_OPT_STR:     index of @val in opt->choices
+ * - BCH_OPT_FN:      delegated to opt->parse(); requires a non-NULL @c
+ *
+ * Returns 0 on success or a negative error code; -ERANGE for out of range
+ * values, -EINVAL for a misaligned size or a BCH_OPT_FN option without @c.
+ */
+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
+		   const char *val, u64 *res)
+{
+	ssize_t err;
+
+	switch (opt->type) {
+	case BCH_OPT_BOOL:
+		err = kstrtou64(val, 10, res);
+		if (err < 0)
+			return err;
+
+		return *res <= 1 ? 0 : -ERANGE;
+	case BCH_OPT_UINT:
+		err = kstrtou64(val, 10, res);
+		if (err < 0)
+			return err;
+
+		return opt_val_in_range(opt, *res) ? 0 : -ERANGE;
+	case BCH_OPT_SECTORS:
+		err = bch2_strtou64_h(val, res);
+		if (err < 0)
+			return err;
+
+		/* Must be a whole number of 512 byte sectors: */
+		if (*res & 511)
+			return -EINVAL;
+
+		*res >>= 9;
+
+		return opt_val_in_range(opt, *res) ? 0 : -ERANGE;
+	case BCH_OPT_STR:
+		err = match_string(opt->choices, -1, val);
+		if (err < 0)
+			return err;
+
+		*res = err;
+		return 0;
+	case BCH_OPT_FN:
+		return c ? opt->parse(c, val, res) : -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Print the value @v of option @opt to @out.
+ *
+ * @flags:
+ *  - OPT_SHOW_MOUNT_STYLE: print in mount option syntax - "name"/"noname" for
+ *    boolean options, "name=value" for everything else
+ *  - OPT_SHOW_FULL_LIST: for string options, print via
+ *    bch2_string_opt_to_text() instead of only the selected choice
+ *
+ * @c is only passed through to the to_text callback of BCH_OPT_FN options.
+ */
+void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
+		      const struct bch_option *opt, u64 v,
+		      unsigned flags)
+{
+	if (flags & OPT_SHOW_MOUNT_STYLE) {
+		if (opt->type == BCH_OPT_BOOL) {
+			pr_buf(out, "%s%s",
+			       v ? "" : "no",
+			       opt->attr.name);
+			return;
+		}
+
+		pr_buf(out, "%s=", opt->attr.name);
+	}
+
+	switch (opt->type) {
+	case BCH_OPT_BOOL:
+	case BCH_OPT_UINT:
+		/* v is unsigned: %llu, not %lli */
+		pr_buf(out, "%llu", v);
+		break;
+	case BCH_OPT_SECTORS:
+		/*
+		 * NOTE(review): bch2_opt_parse() stores these values in units
+		 * of 512 byte sectors; confirm that callers pass the unit
+		 * bch2_hprint() expects.
+		 */
+		bch2_hprint(out, v);
+		break;
+	case BCH_OPT_STR:
+		if (flags & OPT_SHOW_FULL_LIST)
+			bch2_string_opt_to_text(out, opt->choices, v);
+		else
+			/* Never pass a non-literal string as the format: */
+			pr_buf(out, "%s", opt->choices[v]);
+		break;
+	case BCH_OPT_FN:
+		opt->to_text(out, c, v);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Do whatever is required before option @id may be set to @v:
+ *
+ *  - compression options: bch2_check_set_has_compressed_data(), whose result
+ *    is returned
+ *  - erasure_code: when enabling it and the superblock does not yet have the
+ *    BCH_FEATURE_EC feature bit, set the bit and write the superblock
+ *
+ * All other options need no preparation. Returns 0 or the error from the
+ * compression check.
+ */
+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+{
+	switch (id) {
+	case Opt_compression:
+	case Opt_background_compression:
+		return bch2_check_set_has_compressed_data(c, v);
+	case Opt_erasure_code:
+		if (!v ||
+		    (c->sb.features & (1ULL << BCH_FEATURE_EC)))
+			return 0;
+
+		mutex_lock(&c->sb_lock);
+		c->disk_sb.sb->features[0] |=
+			cpu_to_le64(1ULL << BCH_FEATURE_EC);
+
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Run bch2_opt_check_may_set() for every option, using the value currently
+ * stored in c->opts. Stops at and returns the first error; 0 if none.
+ */
+int bch2_opts_check_may_set(struct bch_fs *c)
+{
+	unsigned id;
+
+	for (id = 0; id < bch2_opts_nr; id++) {
+		u64 v = bch2_opt_get_by_id(&c->opts, id);
+		int ret = bch2_opt_check_may_set(c, id, v);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse a comma separated mount option string into @opts.
+ *
+ * Each element is either "name=value", or a bare "name"/"noname" which sets a
+ * boolean option to 1/0. @options is modified in place (by strsep()). Options
+ * are parsed with a NULL filesystem, so BCH_OPT_FN options are rejected as
+ * invalid values here.
+ *
+ * Only options with OPT_MOUNT in their mode are accepted; acl and the
+ * usrquota/grpquota options additionally require the corresponding config
+ * option to be enabled.
+ *
+ * Returns 0 on success, -1 (after logging the reason) on the first bad
+ * option; options preceding the bad one have already been applied to @opts.
+ */
+int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
+{
+	char *tok, *name, *val;
+	int id;
+	u64 v;
+
+	for (tok = strsep(&options, ",");
+	     tok;
+	     tok = strsep(&options, ",")) {
+		name	= strsep(&tok, "=");
+		val	= tok;
+		id	= bch2_mount_opt_lookup(name);
+
+		if (val) {
+			if (id < 0)
+				goto bad_opt;
+
+			if (bch2_opt_parse(NULL, &bch2_opt_table[id],
+					   val, &v) < 0)
+				goto bad_val;
+		} else {
+			v = 1;
+
+			/* Not a known name: try it as "no<name>" */
+			if (id < 0 &&
+			    !strncmp("no", name, 2)) {
+				id = bch2_mount_opt_lookup(name + 2);
+				v = 0;
+			}
+
+			if (id < 0)
+				goto bad_opt;
+
+			/* Only booleans may be given without a value: */
+			if (bch2_opt_table[id].type != BCH_OPT_BOOL)
+				goto no_val;
+		}
+
+		if (!(bch2_opt_table[id].mode & OPT_MOUNT))
+			goto bad_opt;
+
+		if (!IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL) &&
+		    id == Opt_acl)
+			goto bad_opt;
+
+		if (!IS_ENABLED(CONFIG_BCACHEFS_QUOTA) &&
+		    (id == Opt_usrquota ||
+		     id == Opt_grpquota))
+			goto bad_opt;
+
+		bch2_opt_set_by_id(opts, id, v);
+	}
+
+	return 0;
+bad_opt:
+	pr_err("Bad mount option %s", name);
+	return -1;
+bad_val:
+	pr_err("Invalid value %s for mount option %s", val, name);
+	return -1;
+no_val:
+	pr_err("Mount option %s requires a value", name);
+	return -1;
+}
+
+/* io opts: */
+
+/*
+ * Build a struct bch_io_opts from @src: every option listed in
+ * BCH_INODE_OPTS() that is defined in @src is copied over and marked defined;
+ * the rest stay zero/undefined.
+ */
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+	struct bch_io_opts ret = { 0 };
+#define x(_name, _bits)					\
+	if (opt_defined(src, _name))					\
+		opt_set(ret, _name, src._name);
+	BCH_INODE_OPTS()
+#undef x
+	return ret;
+}
+
+/*
+ * Inverse of bch2_opts_to_inode_opts(): build a struct bch_opts that has
+ * exactly the options defined in @src defined; everything else stays
+ * zero/undefined.
+ */
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+{
+	struct bch_opts ret = { 0 };
+#define x(_name, _bits)					\
+	if (opt_defined(src, _name))					\
+		opt_set(ret, _name, src._name);
+	BCH_INODE_OPTS()
+#undef x
+	return ret;
+}
+
+/*
+ * Copy every inode option that is defined in @src into @dst, marking it
+ * defined there; options not defined in @src leave @dst untouched.
+ */
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+{
+#define x(_name, _bits)					\
+	if (opt_defined(src, _name))					\
+		opt_set(*dst, _name, src._name);
+	BCH_INODE_OPTS()
+#undef x
+}
+
+/* Returns true if option @id is one of the options in BCH_INODE_OPTS() */
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+	static const enum bch_opt_id inode_opt_list[] = {
+#define x(_name, _bits)	Opt_##_name,
+	BCH_INODE_OPTS()
+#undef x
+	};
+	const enum bch_opt_id *p;
+
+	for (p = inode_opt_list;
+	     p < inode_opt_list + ARRAY_SIZE(inode_opt_list);
+	     p++) {
+		if (*p == id)
+			return true;
+	}
+
+	return false;
+}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
new file mode 100644
index 000000000000..bd2058f1fe2b
--- /dev/null
+++ b/fs/bcachefs/opts.h
@@ -0,0 +1,403 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_OPTS_H
+#define _BCACHEFS_OPTS_H
+
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include "bcachefs_format.h"
+
+extern const char * const bch2_error_actions[];
+extern const char * const bch2_csum_types[];
+extern const char * const bch2_compression_types[];
+extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_data_types[];
+extern const char * const bch2_cache_replacement_policies[];
+extern const char * const bch2_cache_modes[];
+extern const char * const bch2_dev_state[];
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * Each option is stored together with a one-bit <name>_defined flag that
+ * records whether it has been set. This means we can pass the mount options to
+ * bch2_fs_alloc() as a whole struct, and then only apply the options from that
+ * struct that are defined.
+ */
+
+/* dummy option, for options that aren't stored in the superblock */
+LE64_BITMASK(NO_SB_OPT,		struct bch_sb, flags[0], 0, 0);
+
+/*
+ * When can be set:
+ *
+ * Bit flags, combined in the _mode argument of each BCH_OPTS() entry.
+ */
+enum opt_mode {
+	/* presumably: may be set when formatting - not checked in this file */
+	OPT_FORMAT	= (1 << 0),
+	/* accepted by bch2_parse_mount_opts() */
+	OPT_MOUNT	= (1 << 1),
+	/* sysfs attribute is created writeable (0644 instead of 0444) */
+	OPT_RUNTIME	= (1 << 2),
+	/* presumably: may be set per inode - TODO confirm against users */
+	OPT_INODE	= (1 << 3),
+	/* presumably: per device option - TODO confirm against users */
+	OPT_DEVICE	= (1 << 4),
+};
+
+/* How an option's value is parsed and printed, see bch2_opt_parse(): */
+enum opt_type {
+	/* 0 or 1 */
+	BCH_OPT_BOOL,
+	/* decimal integer in [min, max) */
+	BCH_OPT_UINT,
+	/* size that is a multiple of 512; stored divided by 512, in [min, max) */
+	BCH_OPT_SECTORS,
+	/* one of a NULL terminated list of strings; stored as its index */
+	BCH_OPT_STR,
+	/* parsed and printed by option specific callbacks */
+	BCH_OPT_FN,
+};
+
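+/**
+ * x(name, in mem type, mode, type, sb_opt, default, hint, help)
+ *
+ * @name	- name of mount option, sysfs attribute, and struct bch_opts
+ *		  member
+ *
+ * @in mem type	- C type of the struct bch_opts member
+ *
+ * @mode	- when opt may be set (mask of enum opt_mode flags)
+ *
+ * @type	- one of OPT_BOOL, OPT_UINT, OPT_SECTORS, OPT_STR, OPT_FN
+ *
+ * @sb_opt	- name of corresponding superblock option, or NO_SB_OPT
+ *
+ * @default	- default value, used to initialize bch2_opts_default
+ *
+ * @hint	- short placeholder describing the option's value, or NULL
+ *
+ * @help	- help text, or NULL
+ */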
+
+/*
+ * XXX: add fields for
+ *  - default value
+ *  - helptext
+ */
+
+#define BCH_OPTS()							\
+	x(block_size,			u16,				\
+	  OPT_FORMAT,							\
+	  OPT_SECTORS(1, 128),						\
+	  BCH_SB_BLOCK_SIZE,		8,				\
+	  "size",	NULL)						\
+	x(btree_node_size,		u16,				\
+	  OPT_FORMAT,							\
+	  OPT_SECTORS(1, 128),						\
+	  BCH_SB_BTREE_NODE_SIZE,	512,				\
+	  "size",	"Btree node size, default 256k")		\
+	x(errors,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_STR(bch2_error_actions),					\
+	  BCH_SB_ERROR_ACTION,		BCH_ON_ERROR_RO,		\
+	  NULL,		"Action to take on filesystem error")		\
+	x(metadata_replicas,		u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_META_REPLICAS_WANT,	1,				\
+	  "#",		"Number of metadata replicas")			\
+	x(data_replicas,		u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_DATA_REPLICAS_WANT,	1,				\
+	  "#",		"Number of data replicas")			\
+	x(metadata_replicas_required, u8,				\
+	  OPT_FORMAT|OPT_MOUNT,						\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_META_REPLICAS_REQ,	1,				\
+	  "#",		NULL)						\
+	x(data_replicas_required,	u8,				\
+	  OPT_FORMAT|OPT_MOUNT,						\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_DATA_REPLICAS_REQ,	1,				\
+	  "#",		NULL)						\
+	x(metadata_checksum,		u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_STR(bch2_csum_types),					\
+	  BCH_SB_META_CSUM_TYPE,	BCH_CSUM_OPT_CRC32C,		\
+	  NULL,		NULL)						\
+	x(data_checksum,		u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_STR(bch2_csum_types),					\
+	  BCH_SB_DATA_CSUM_TYPE,	BCH_CSUM_OPT_CRC32C,		\
+	  NULL,		NULL)						\
+	x(compression,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_STR(bch2_compression_types),				\
+	  BCH_SB_COMPRESSION_TYPE,	BCH_COMPRESSION_OPT_NONE,	\
+	  NULL,		NULL)						\
+	x(background_compression,	u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_STR(bch2_compression_types),				\
+	  BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE,	\
+	  NULL,		NULL)						\
+	x(str_hash,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_STR(bch2_str_hash_types),					\
+	  BCH_SB_STR_HASH_TYPE,		BCH_STR_HASH_OPT_SIPHASH,	\
+	  NULL,		"Hash function for directory entries and xattrs")\
+	x(foreground_target,		u16,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_FN(bch2_opt_target),					\
+	  BCH_SB_FOREGROUND_TARGET,	0,				\
+	  "(target)",	"Device or disk group for foreground writes")	\
+	x(background_target,		u16,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_FN(bch2_opt_target),					\
+	  BCH_SB_BACKGROUND_TARGET,	0,				\
+	  "(target)",	"Device or disk group to move data to in the background")\
+	x(promote_target,		u16,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_FN(bch2_opt_target),					\
+	  BCH_SB_PROMOTE_TARGET,	0,				\
+	  "(target)",	"Device or disk group to promote data to on read")\
+	x(erasure_code,			u16,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,			\
+	  OPT_BOOL(),							\
+	  BCH_SB_ERASURE_CODE,		false,				\
+	  NULL,		"Enable erasure coding (DO NOT USE YET)")	\
+	x(inodes_32bit,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_BOOL(),							\
+	  BCH_SB_INODE_32BIT,		false,				\
+	  NULL,		"Constrain inode numbers to 32 bits")		\
+	x(gc_reserve_percent,		u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_UINT(5, 21),						\
+	  BCH_SB_GC_RESERVE,		8,				\
+	  "%",		"Percentage of disk space to reserve for copygc")\
+	x(gc_reserve_bytes,		u64,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_SECTORS(0, U64_MAX),					\
+	  BCH_SB_GC_RESERVE_BYTES,	0,				\
+	  "size",	"Amount of disk space to reserve for copygc\n"	\
+			"Takes precedence over gc_reserve_percent if set")\
+	x(root_reserve_percent,		u8,				\
+	  OPT_FORMAT|OPT_MOUNT,						\
+	  OPT_UINT(0, 100),						\
+	  BCH_SB_ROOT_RESERVE,		0,				\
+	  "%",		"Percentage of disk space to reserve for superuser")\
+	x(wide_macs,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,				\
+	  OPT_BOOL(),							\
+	  BCH_SB_128_BIT_MACS,		false,				\
+	  NULL,		"Store full 128 bits of cryptographic MACs, instead of 80")\
+	x(acl,				u8,				\
+	  OPT_FORMAT|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH_SB_POSIX_ACL,		true,				\
+	  NULL,		"Enable POSIX acls")				\
+	x(usrquota,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH_SB_USRQUOTA,		false,				\
+	  NULL,		"Enable user quotas")				\
+	x(grpquota,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH_SB_GRPQUOTA,		false,				\
+	  NULL,		"Enable group quotas")				\
+	x(prjquota,			u8,				\
+	  OPT_FORMAT|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH_SB_PRJQUOTA,		false,				\
+	  NULL,		"Enable project quotas")			\
+	x(degraded,			u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Allow mounting in degraded mode")		\
+	x(discard,			u8,				\
+	  OPT_MOUNT|OPT_DEVICE,						\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Enable discard/TRIM support")			\
+	x(verbose,			u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Extra debugging information during mount/recovery")\
+	x(journal_flush_disabled,	u8,				\
+	  OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Disable journal flush on sync/fsync\n"		\
+			"If enabled, writes can be lost, but only since the\n"\
+			"last journal write (default 1 second)")	\
+	x(fsck,				u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Run fsck on mount")				\
+	x(fix_errors,			u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Fix errors during fsck without asking")	\
+	x(nochanges,			u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Super read only mode - no writes at all will be issued,\n"\
+			"even if we have to replay the journal")	\
+	x(norecovery,			u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Don't replay the journal")			\
+	x(noexcl,			u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Don't open device in exclusive mode")		\
+	x(sb,				u64,				\
+	  OPT_MOUNT,							\
+	  OPT_UINT(0, S64_MAX),						\
+	  NO_SB_OPT,			BCH_SB_SECTOR,			\
+	  "offset",	"Sector offset of superblock")			\
+	x(read_only,			u8,				\
+	  0,								\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		NULL)						\
+	x(nostart,			u8,				\
+	  0,								\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Don\'t start filesystem, only open devices")	\
+	x(reconstruct_alloc,		u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Reconstruct alloc btree")			\
+	x(version_upgrade,		u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Set superblock to latest version,\n"		\
+			"allowing any new features to be used")		\
+	x(project,			u8,				\
+	  OPT_INODE,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		NULL)						\
+	x(fs_size,			u64,				\
+	  OPT_DEVICE,							\
+	  OPT_SECTORS(0, S64_MAX),					\
+	  NO_SB_OPT,			0,				\
+	  "size",	"Size of filesystem on device")			\
+	x(bucket,			u32,				\
+	  OPT_DEVICE,							\
+	  OPT_SECTORS(0, S64_MAX),					\
+	  NO_SB_OPT,			0,				\
+	  "size",	"Size of buckets on device")			\
+	x(durability,			u8,				\
+	  OPT_DEVICE,							\
+	  OPT_UINT(0, BCH_REPLICAS_MAX),				\
+	  NO_SB_OPT,			1,				\
+	  "n",		"Data written to this device will be considered\n"\
+			"to have already been replicated n times")
+
+/*
+ * In-memory set of options, generated from BCH_OPTS(): for every option there
+ * is a one-bit <name>_defined flag (all flags are grouped first), followed by
+ * one value member per option with the type given in the option's entry.
+ *
+ * Access through opt_defined()/opt_get()/opt_set().
+ */
+struct bch_opts {
+#define x(_name, _bits, ...)	unsigned _name##_defined:1;
+	BCH_OPTS()
+#undef x
+
+#define x(_name, _bits, ...)	_bits	_name;
+	BCH_OPTS()
+#undef x
+};
+
+/*
+ * Every option defined and set to the default value from its BCH_OPTS() entry;
+ * opt_get() falls back to this for options that are not defined.
+ */
+static const struct bch_opts bch2_opts_default = {
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...)		\
+	._name##_defined = true,					\
+	._name = _default,						\
+
+	BCH_OPTS()
+#undef x
+};
+
+/* True if option @_name has been set in @_opts (a struct, not a pointer) */
+#define opt_defined(_opts, _name)	((_opts)._name##_defined)
+
+/*
+ * Value of option @_name in @_opts, or its default from bch2_opts_default if
+ * it is not defined there. Note that @_opts is evaluated more than once.
+ */
+#define opt_get(_opts, _name)						\
+	(opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
+
+/* Set option @_name in @_opts to @_v and mark it defined */
+#define opt_set(_opts, _name, _v)					\
+do {									\
+	(_opts)._name##_defined = true;					\
+	(_opts)._name = _v;						\
+} while (0)
+
+/* Returns an option set in which no option is defined (all members zero) */
+static inline struct bch_opts bch2_opts_empty(void)
+{
+	struct bch_opts empty = { 0 };
+
+	return empty;
+}
+
+void bch2_opts_apply(struct bch_opts *, struct bch_opts);
+
+/*
+ * One Opt_<name> identifier per BCH_OPTS() entry, in table order; used as the
+ * index into bch2_opt_table. bch2_opts_nr is the number of options.
+ */
+enum bch_opt_id {
+#define x(_name, ...)	Opt_##_name,
+	BCH_OPTS()
+#undef x
+	bch2_opts_nr
+};
+
+struct bch_fs;
+struct printbuf;
+
+/* Static description of one option; see bch2_opt_table */
+struct bch_option {
+	/* sysfs attribute; attr.name is the option's name */
+	struct attribute	attr;
+	/* setter for the option's superblock field (SET_<sb_opt>) */
+	void			(*set_sb)(struct bch_sb *, u64);
+	/* mask of enum opt_mode flags: when the option may be set */
+	enum opt_mode		mode;
+	/* selects which member of the union below is valid */
+	enum opt_type		type;
+
+	union {
+	/* BCH_OPT_UINT, BCH_OPT_SECTORS: valid values are min <= v < max */
+	struct {
+		u64		min, max;
+	};
+	/* BCH_OPT_STR: NULL terminated list of accepted strings */
+	struct {
+		const char * const *choices;
+	};
+	/* BCH_OPT_FN: option specific parse/print callbacks */
+	struct {
+		int (*parse)(struct bch_fs *, const char *, u64 *);
+		void (*to_text)(struct printbuf *, struct bch_fs *, u64);
+	};
+	};
+
+	/* short placeholder describing the value, may be NULL */
+	const char		*hint;
+	/* help text, may be NULL */
+	const char		*help;
+
+};
+
+extern const struct bch_option bch2_opt_table[];
+
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+
+struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+
+int bch2_opt_lookup(const char *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
+
+/* Flags for bch2_opt_to_text(): */
+/* string options: print via bch2_string_opt_to_text() rather than one choice */
+#define OPT_SHOW_FULL_LIST	(1 << 0)
+/* print as a mount option: "name"/"noname" for booleans, else "name=value" */
+#define OPT_SHOW_MOUNT_STYLE	(1 << 1)
+
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
+		      const struct bch_option *, u64, unsigned);
+
+int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opts_check_may_set(struct bch_fs *);
+int bch2_parse_mount_opts(struct bch_opts *, char *);
+
+/* inode opts: */
+
+/*
+ * The subset of options listed in BCH_INODE_OPTS(), laid out like
+ * struct bch_opts: a one-bit <name>_defined flag per option followed by the
+ * values. Here the second x() argument is a bit width, giving a u<bits>
+ * member.
+ */
+struct bch_io_opts {
+#define x(_name, _bits)	unsigned _name##_defined:1;
+	BCH_INODE_OPTS()
+#undef x
+
+#define x(_name, _bits)	u##_bits _name;
+	BCH_INODE_OPTS()
+#undef x
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
+#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
new file mode 100644
index 000000000000..0fa6f33c049b
--- /dev/null
+++ b/fs/bcachefs/quota.c
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "inode.h"
+#include "quota.h"
+#include "super-io.h"
+
+/*
+ * Validate the quota superblock field @f: its size must be exactly that of
+ * struct bch_sb_field_quota.
+ *
+ * Returns NULL if valid, otherwise a string describing the problem.
+ */
+static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	struct bch_sb_field_quota *sb_quota = field_to_type(f, quota);
+
+	return vstruct_bytes(&sb_quota->field) == sizeof(*sb_quota)
+		? NULL
+		: "invalid field quota: wrong size";
+}
+
+/* Superblock field operations for the quota field; only validation is set */
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+	.validate	= bch2_sb_validate_quota,
+};
+
+/*
+ * Validate a quota key: the inode field of its position must be a valid
+ * quota type (< QTYP_NR) and its value must be exactly a struct bch_quota.
+ *
+ * Returns NULL if valid, otherwise a string describing the problem.
+ */
+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	const char *err = NULL;
+
+	if (k.k->p.inode >= QTYP_NR)
+		err = "invalid quota type";
+	else if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
+		err = "incorrect value size";
+
+	return err;
+}
+
+/* Names of the quota counters, indexed by counter number (< Q_COUNTERS) */
+static const char * const bch2_quota_counters[] = {
+	"space",
+	"inodes",
+};
+
+/*
+ * Print the hard and soft limit of every counter of quota key @k to @out.
+ * The per-counter fragments are emitted back to back, without a separator.
+ */
+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
+			struct bkey_s_c k)
+{
+	struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
+	unsigned counter;
+
+	for (counter = 0; counter < Q_COUNTERS; counter++) {
+		u64 hard = le64_to_cpu(dq.v->c[counter].hardlimit);
+		u64 soft = le64_to_cpu(dq.v->c[counter].softlimit);
+
+		pr_buf(out, "%s hardlimit %llu softlimit %llu",
+		       bch2_quota_counters[counter], hard, soft);
+	}
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+
+/*
+ * Returns the index of the lowest bit set in @qtypes at position @i or above,
+ * or QTYP_NR if there is none.
+ */
+static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
+{
+	unsigned remaining = qtypes >> i;
+
+	if (!remaining)
+		return QTYP_NR;
+
+	return i + __ffs(remaining);
+}
+
+/*
+ * Iterate over the quota types whose bit is set in @_qtypes, in increasing
+ * order: @_i is the quota type and @_q points to (_c)->quotas[_i].
+ *
+ * Note that @_q is assigned before the termination check, so after the loop
+ * it holds the address of (_c)->quotas[QTYP_NR] and must not be dereferenced.
+ */
+#define for_each_set_qtype(_c, _i, _q, _qtypes)				\
+	for (_i = 0;							\
+	     (_i = __next_qtype(_i, _qtypes),				\
+	      _q = &(_c)->quotas[_i],					\
+	      _i < QTYP_NR);						\
+	     _i++)
+
+/*
+ * Returns true if limits should not be enforced for the current task, which
+ * is the case iff it has CAP_SYS_RESOURCE. @q is currently unused.
+ *
+ * The disabled block below is a more elaborate check that is compiled out.
+ */
+static bool ignore_hardlimit(struct bch_memquota_type *q)
+{
+	if (capable(CAP_SYS_RESOURCE))
+		return true;
+#if 0
+	struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+	return capable(CAP_SYS_RESOURCE) &&
+	       (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+		!(info->dqi_flags & DQF_ROOT_SQUASH));
+#endif
+	return false;
+}
+
+/*
+ * Kinds of quota messages; used as the first index into quota_nl[] and as bit
+ * numbers in memquota_counter.warning_issued.
+ */
+enum quota_msg {
+	SOFTWARN,	/* Softlimit reached */
+	SOFTLONGWARN,	/* Grace time expired */
+	HARDWARN,	/* Hardlimit reached */
+
+	HARDBELOW,	/* Usage got below inode hardlimit */
+	SOFTBELOW,	/* Usage got below inode softlimit */
+};
+
+/*
+ * Maps (enum quota_msg, quota counter) to the QUOTA_NL_* code that is passed
+ * to quota_send_warning(): the B* codes for Q_SPC, the I* codes for Q_INO.
+ */
+static int quota_nl[][Q_COUNTERS] = {
+	[HARDWARN][Q_SPC]	= QUOTA_NL_BHARDWARN,
+	[SOFTLONGWARN][Q_SPC]	= QUOTA_NL_BSOFTLONGWARN,
+	[SOFTWARN][Q_SPC]	= QUOTA_NL_BSOFTWARN,
+	[HARDBELOW][Q_SPC]	= QUOTA_NL_BHARDBELOW,
+	[SOFTBELOW][Q_SPC]	= QUOTA_NL_BSOFTBELOW,
+
+	[HARDWARN][Q_INO]	= QUOTA_NL_IHARDWARN,
+	[SOFTLONGWARN][Q_INO]	= QUOTA_NL_ISOFTLONGWARN,
+	[SOFTWARN][Q_INO]	= QUOTA_NL_ISOFTWARN,
+	[HARDBELOW][Q_INO]	= QUOTA_NL_IHARDBELOW,
+	[SOFTBELOW][Q_INO]	= QUOTA_NL_ISOFTBELOW,
+};
+
+/*
+ * Quota messages collected while the quota locks are held, to be sent by
+ * flush_warnings() afterwards. Filled in by prepare_msg().
+ */
+struct quota_msgs {
+	/* number of valid entries in m[] */
+	u8		nr;
+	struct {
+		/* quota type the message is for */
+		u8	qtype;
+		/* QUOTA_NL_* code, from quota_nl[] */
+		u8	msg;
+	}		m[QTYP_NR * Q_COUNTERS];
+};
+
+/*
+ * Append a message of type @msg_type for (@qtype, @counter) to @msgs.
+ * BUG()s if @msgs is already full.
+ */
+static void prepare_msg(unsigned qtype,
+			enum quota_counters counter,
+			struct quota_msgs *msgs,
+			enum quota_msg msg_type)
+{
+	unsigned slot = msgs->nr;
+
+	BUG_ON(slot >= ARRAY_SIZE(msgs->m));
+
+	msgs->m[slot].qtype	= qtype;
+	msgs->m[slot].msg	= quota_nl[msg_type][counter];
+	msgs->nr		= slot + 1;
+}
+
+/*
+ * Queue a warning of type @msg_type for (@qtype, @counter), unless that
+ * warning has already been issued for @qc.
+ *
+ * The msg_type bit is recorded in qc->warning_issued when the warning is
+ * queued: this is what suppresses repeated warnings, and what lets
+ * bch2_quota_check_limit() send the matching HARDBELOW/SOFTBELOW message once
+ * usage drops below the limit again. (Previously the bit was tested but never
+ * set, so neither could ever happen.)
+ */
+static void prepare_warning(struct memquota_counter *qc,
+			    unsigned qtype,
+			    enum quota_counters counter,
+			    struct quota_msgs *msgs,
+			    enum quota_msg msg_type)
+{
+	if (qc->warning_issued & (1 << msg_type))
+		return;
+
+	qc->warning_issued |= 1 << msg_type;
+
+	prepare_msg(qtype, counter, msgs, msg_type);
+}
+
+/*
+ * Send every message collected in @msgs via quota_send_warning().
+ *
+ * @qid holds one id per quota type, so the id for a message has to be looked
+ * up by the message's quota type - not by the message's position in @msgs,
+ * which can reach QTYP_NR * Q_COUNTERS and would both pick the wrong id and
+ * read past the end of qid.q[].
+ */
+static void flush_warnings(struct bch_qid qid,
+			   struct super_block *sb,
+			   struct quota_msgs *msgs)
+{
+	unsigned i;
+
+	for (i = 0; i < msgs->nr; i++) {
+		unsigned qtype = msgs->m[i].qtype;
+
+		quota_send_warning(make_kqid(&init_user_ns, qtype,
+					     qid.q[qtype]),
+				   sb->s_dev, msgs->m[i].msg);
+	}
+}
+
+/*
+ * Check whether changing counter @counter of @mq by @v is allowed, and queue
+ * any resulting quota messages in @msgs.
+ *
+ * @v is the change to the counter, not the new total: the new total @n is
+ * computed here as the counter's current value plus @v, and must not be
+ * negative. The counter itself is not modified; the caller applies @v.
+ *
+ * With @mode == KEY_TYPE_QUOTA_NOCHECK nothing is checked. Decreases
+ * (@v <= 0) are always allowed. For increases that exceed a limit, -EDQUOT is
+ * returned if @mode is KEY_TYPE_QUOTA_PREALLOC; otherwise a warning is queued
+ * and the change is allowed.
+ *
+ * Returns 0 or -EDQUOT.
+ */
+static int bch2_quota_check_limit(struct bch_fs *c,
+				  unsigned qtype,
+				  struct bch_memquota *mq,
+				  struct quota_msgs *msgs,
+				  enum quota_counters counter,
+				  s64 v,
+				  enum quota_acct_mode mode)
+{
+	struct bch_memquota_type *q = &c->quotas[qtype];
+	struct memquota_counter *qc = &mq->c[counter];
+	u64 n = qc->v + v;
+
+	BUG_ON((s64) n < 0);
+
+	if (mode == KEY_TYPE_QUOTA_NOCHECK)
+		return 0;
+
+	if (v <= 0) {
+		/* Back under a limit we had warned about: send the *BELOW msg */
+		if (n < qc->hardlimit &&
+		    (qc->warning_issued & (1 << HARDWARN))) {
+			qc->warning_issued &= ~(1 << HARDWARN);
+			prepare_msg(qtype, counter, msgs, HARDBELOW);
+		}
+
+		if (n < qc->softlimit &&
+		    (qc->warning_issued & (1 << SOFTWARN))) {
+			qc->warning_issued &= ~(1 << SOFTWARN);
+			prepare_msg(qtype, counter, msgs, SOFTBELOW);
+		}
+
+		/*
+		 * NOTE(review): this clears every warning bit on any decrease,
+		 * including ones for limits that are still exceeded - confirm
+		 * that this is intended.
+		 */
+		qc->warning_issued = 0;
+		return 0;
+	}
+
+	/* Hard limit set and exceeded (a limit of 0 means no limit): */
+	if (qc->hardlimit &&
+	    qc->hardlimit < n &&
+	    !ignore_hardlimit(q)) {
+		if (mode == KEY_TYPE_QUOTA_PREALLOC)
+			return -EDQUOT;
+
+		prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+	}
+
+	/* Soft limit exceeded and the timer set below has expired: */
+	if (qc->softlimit &&
+	    qc->softlimit < n &&
+	    qc->timer &&
+	    ktime_get_real_seconds() >= qc->timer &&
+	    !ignore_hardlimit(q)) {
+		if (mode == KEY_TYPE_QUOTA_PREALLOC)
+			return -EDQUOT;
+
+		prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
+	}
+
+	/* Soft limit exceeded with no timer running yet: warn, start timer */
+	if (qc->softlimit &&
+	    qc->softlimit < n &&
+	    qc->timer == 0) {
+		if (mode == KEY_TYPE_QUOTA_PREALLOC)
+			return -EDQUOT;
+
+		prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+
+		/* XXX is this the right one? */
+		qc->timer = ktime_get_real_seconds() +
+			q->limits[counter].warnlimit;
+	}
+
+	return 0;
+}
+
+/*
+ * Account a change of @v to counter @counter for the ids in @qid, for every
+ * enabled quota type.
+ *
+ * The change is all or nothing: the limits of all enabled quota types are
+ * checked first (see bch2_quota_check_limit() for the meaning of @mode), and
+ * the counters are only updated if every check passed.
+ *
+ * Returns 0 on success, -ENOMEM if a quota entry could not be allocated, or
+ * the error from the limit check.
+ */
+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+		    enum quota_counters counter, s64 v,
+		    enum quota_acct_mode mode)
+{
+	unsigned qtypes = enabled_qtypes(c);
+	struct bch_memquota_type *q;
+	struct bch_memquota *mq[QTYP_NR];
+	struct quota_msgs msgs;
+	unsigned i;
+	int ret = 0;
+
+	memset(&msgs, 0, sizeof(msgs));
+
+	/* Take the locks of all enabled types, in increasing type order: */
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_lock_nested(&q->lock, i);
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
+		if (!mq[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
+		if (ret)
+			goto err;
+	}
+
+	/* All checks passed: apply the change */
+	for_each_set_qtype(c, i, q, qtypes)
+		mq[i]->c[counter].v += v;
+err:
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_unlock(&q->lock);
+
+	/* Messages are sent after the locks are dropped, also on error: */
+	flush_warnings(qid, c->vfs_sb, &msgs);
+
+	return ret;
+}
+
+/*
+ * Move @v units of counter @counter from @src_q to @dst_q.
+ *
+ * BUG()s if @src_q holds fewer than @v units, or if adding @v to @dst_q
+ * would wrap around.
+ */
+static void __bch2_quota_transfer(struct bch_memquota *src_q,
+				  struct bch_memquota *dst_q,
+				  enum quota_counters counter, s64 v)
+{
+	struct memquota_counter *from	= &src_q->c[counter];
+	struct memquota_counter *to	= &dst_q->c[counter];
+
+	BUG_ON(v > from->v);
+	BUG_ON(v + to->v < v);
+
+	from->v	-= v;
+	to->v	+= v;
+}
+
+/*
+ * Transfer @space units of Q_SPC and one unit of Q_INO from the ids in @src
+ * to the ids in @dst, for the quota types in @qtypes that are enabled.
+ *
+ * The transfer is all or nothing: the limits of the destination are checked
+ * for every quota type first (see bch2_quota_check_limit() for the meaning of
+ * @mode), and counters are only moved if every check passed.
+ *
+ * Returns 0 on success, -ENOMEM if a quota entry could not be allocated, or
+ * the error from the limit check.
+ */
+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+			struct bch_qid dst,
+			struct bch_qid src, u64 space,
+			enum quota_acct_mode mode)
+{
+	struct bch_memquota_type *q;
+	struct bch_memquota *src_q[QTYP_NR], *dst_q[QTYP_NR];
+	struct quota_msgs msgs;
+	unsigned i;
+	int ret = 0;
+
+	qtypes &= enabled_qtypes(c);
+
+	memset(&msgs, 0, sizeof(msgs));
+
+	/* Take the locks of all affected types, in increasing type order: */
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_lock_nested(&q->lock, i);
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
+		dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
+
+		if (!src_q[i] || !dst_q[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		/*
+		 * bch2_quota_check_limit() takes the change to the counter and
+		 * adds the current usage itself, so pass only the amount being
+		 * transferred - passing the new total would count the
+		 * destination's current usage twice.
+		 */
+		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
+					     space, mode);
+		if (ret)
+			goto err;
+
+		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
+					     1, mode);
+		if (ret)
+			goto err;
+	}
+
+	/* All checks passed: move the counters */
+	for_each_set_qtype(c, i, q, qtypes) {
+		__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
+		__bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
+	}
+
+err:
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_unlock(&q->lock);
+
+	/* Messages are sent after the locks are dropped, also on error: */
+	flush_warnings(dst, c->vfs_sb, &msgs);
+
+	return ret;
+}
+
+/*
+ * Load the limits stored in quota key @k into the in-memory quota table.
+ *
+ * The inode field of the key's position selects the quota type (BUG()s if it
+ * is not a valid type) and the offset field is the id. Keys of any type other
+ * than KEY_TYPE_quota are ignored.
+ *
+ * Returns 0 on success, -ENOMEM if the table entry could not be allocated.
+ */
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_quota dq;
+	struct bch_memquota_type *q;
+	struct bch_memquota *mq;
+	unsigned counter;
+	int ret = 0;
+
+	BUG_ON(k.k->p.inode >= QTYP_NR);
+
+	if (k.k->type != KEY_TYPE_quota)
+		return 0;
+
+	dq = bkey_s_c_to_quota(k);
+	q = &c->quotas[k.k->p.inode];
+
+	mutex_lock(&q->lock);
+
+	mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
+	if (mq) {
+		for (counter = 0; counter < Q_COUNTERS; counter++) {
+			mq->c[counter].hardlimit =
+				le64_to_cpu(dq.v->c[counter].hardlimit);
+			mq->c[counter].softlimit =
+				le64_to_cpu(dq.v->c[counter].softlimit);
+		}
+	} else {
+		ret = -ENOMEM;
+	}
+
+	mutex_unlock(&q->lock);
+
+	return ret;
+}
+
+/*
+ * Load the limits of every key of quota type @type from the quotas btree into
+ * memory, see __bch2_quota_set().
+ *
+ * Returns 0 on success; otherwise the error from bch2_trans_exit() if there
+ * is one, else the error that ended the walk.
+ */
+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0),
+			   BTREE_ITER_PREFETCH, k, ret) {
+		/* Keys of one type are contiguous; stop at the next type */
+		if (k.k->p.inode != type)
+			break;
+
+		ret = __bch2_quota_set(c, k);
+		if (ret)
+			break;
+	}
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
+
+/* Free the in-memory quota table of every quota type */
+void bch2_fs_quota_exit(struct bch_fs *c)
+{
+	struct bch_memquota_type *q;
+
+	for (q = c->quotas;
+	     q < c->quotas + ARRAY_SIZE(c->quotas);
+	     q++)
+		genradix_free(&q->table);
+}
+
+/* Initialize the lock of every quota type */
+void bch2_fs_quota_init(struct bch_fs *c)
+{
+	struct bch_memquota_type *q;
+
+	for (q = c->quotas;
+	     q < c->quotas + ARRAY_SIZE(c->quotas);
+	     q++)
+		mutex_init(&q->lock);
+}
+
+/*
+ * Copy the time and warning limits of every quota type and counter from the
+ * superblock quota field into c->quotas[]. Does nothing if the superblock has
+ * no quota field.
+ */
+static void bch2_sb_quota_read(struct bch_fs *c)
+{
+	struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+	unsigned type, counter;
+
+	if (!sb_quota)
+		return;
+
+	for (type = 0; type < QTYP_NR; type++)
+		for (counter = 0; counter < Q_COUNTERS; counter++) {
+			c->quotas[type].limits[counter].timelimit =
+				le32_to_cpu(sb_quota->q[type].c[counter].timelimit);
+			c->quotas[type].limits[counter].warnlimit =
+				le32_to_cpu(sb_quota->q[type].c[counter].warnlimit);
+		}
+}
+
+/*
+ * Initialize the in-memory quota state of @c:
+ *
+ *  - read the time/warning limits from the superblock
+ *  - load the limits of every enabled quota type from the quotas btree
+ *  - walk the inodes btree and account every inode's sectors and the inode
+ *    itself, without checking limits
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_fs_quota_read(struct bch_fs *c)
+{
+	unsigned i, qtypes = enabled_qtypes(c);
+	struct bch_memquota_type *q;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bch_inode_unpacked u;
+	struct bkey_s_c k;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	bch2_sb_quota_read(c);
+	mutex_unlock(&c->sb_lock);
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		ret = bch2_quota_init_type(c, i);
+		if (ret)
+			return ret;
+	}
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		if (k.k->type == KEY_TYPE_inode) {
+			ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+			/*
+			 * Leave the loop rather than returning, so that the
+			 * transaction is still released below:
+			 */
+			if (ret)
+				break;
+
+			bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+					KEY_TYPE_QUOTA_NOCHECK);
+			bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+					KEY_TYPE_QUOTA_NOCHECK);
+		}
+	}
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
+
+/* Enable/disable/delete quotas for an entire filesystem: */
+
+/*
+ * quotactl ->quota_enable: turn on quota enforcement for the types named in
+ * @uflags by setting the matching superblock flags and writing the superblock.
+ *
+ * Returns -EROFS on a read-only mount, -EINVAL if @uflags asks for accounting
+ * to be enabled or for enforcement of a type whose accounting option is off,
+ * and 0 otherwise.
+ */
+static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
+{
+	const unsigned acct_flags =
+		FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT;
+	struct bch_fs *c = sb->s_fs_info;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	/* Accounting must be enabled at mount time: */
+	if (uflags & acct_flags)
+		return -EINVAL;
+
+	/* Can't enable enforcement without accounting: */
+	if (((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) ||
+	    ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) ||
+	    ((uflags & FS_QUOTA_PDQ_ENFD) && !c->opts.prjquota))
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	if (uflags & FS_QUOTA_UDQ_ENFD)
+		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
+	if (uflags & FS_QUOTA_GDQ_ENFD)
+		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
+	if (uflags & FS_QUOTA_PDQ_ENFD)
+		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
+
+	bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+	return 0;
+}
+
+/*
+ * quotactl ->quota_disable: clear the superblock quota flag of every type
+ * whose *_ENFD bit is set in @uflags, then write the superblock.
+ *
+ * Returns -EROFS on a read-only mount, 0 otherwise.
+ */
+static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
+{
+	struct bch_fs *c;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	c = sb->s_fs_info;
+
+	mutex_lock(&c->sb_lock);
+
+	if (uflags & FS_QUOTA_UDQ_ENFD)
+		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
+	if (uflags & FS_QUOTA_GDQ_ENFD)
+		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
+	if (uflags & FS_QUOTA_PDQ_ENFD)
+		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
+
+	bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+	return 0;
+}
+
+/*
+ * Delete all quota btree keys of quota type @type.
+ *
+ * Refuses with -EINVAL while the type is still @enabled; otherwise returns the
+ * result of bch2_btree_delete_range().
+ */
+static int bch2_quota_remove_type(struct bch_fs *c, bool enabled, unsigned type)
+{
+	if (enabled)
+		return -EINVAL;
+
+	return bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+				       POS(type, 0),
+				       POS(type + 1, 0),
+				       NULL);
+}
+
+/*
+ * quotactl ->rm_xquota: delete the stored quotas of the types selected in
+ * @uflags (user, then group, then project), stopping at the first error.
+ *
+ * Returns -EROFS on a read-only mount, -EINVAL if a selected type is still
+ * enabled in c->opts, a btree error, or 0.
+ */
+static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	int err;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	if (uflags & FS_USER_QUOTA) {
+		err = bch2_quota_remove_type(c, c->opts.usrquota, QTYP_USR);
+		if (err)
+			return err;
+	}
+
+	if (uflags & FS_GROUP_QUOTA) {
+		err = bch2_quota_remove_type(c, c->opts.grpquota, QTYP_GRP);
+		if (err)
+			return err;
+	}
+
+	if (uflags & FS_PROJ_QUOTA) {
+		err = bch2_quota_remove_type(c, c->opts.prjquota, QTYP_PRJ);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * quotactl ->get_state: report per-type quota status.
+ *
+ * @state is zeroed, every type is flagged QCI_SYSFILE, and each type present
+ * in enabled_qtypes() is additionally flagged QCI_ACCT_ENABLED and given its
+ * space/inode time and warning limits from c->quotas[]. Always returns 0.
+ */
+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	unsigned enabled = enabled_qtypes(c);
+	unsigned type;
+
+	memset(state, 0, sizeof(*state));
+
+	for (type = 0; type < QTYP_NR; type++) {
+		const struct quota_limit *lim = c->quotas[type].limits;
+
+		state->s_state[type].flags |= QCI_SYSFILE;
+
+		if (enabled & (1 << type)) {
+			state->s_state[type].flags |= QCI_ACCT_ENABLED;
+
+			state->s_state[type].spc_timelimit = lim[Q_SPC].timelimit;
+			state->s_state[type].spc_warnlimit = lim[Q_SPC].warnlimit;
+
+			state->s_state[type].ino_timelimit = lim[Q_INO].timelimit;
+			state->s_state[type].ino_warnlimit = lim[Q_INO].warnlimit;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * quotactl ->set_info: adjust quota timers & warnings for quota type @type.
+ *
+ * The fields selected by info->i_fieldmask are stored in the superblock quota
+ * field (which is created if missing), the in-memory limits are refreshed
+ * from the superblock, and the superblock is written.
+ *
+ * Returns:
+ *   -EROFS  on a read-only mount
+ *   -EINVAL for an out of range @type or unsupported bits in i_fieldmask
+ *   -ESRCH  if @type is not in enabled_qtypes()
+ *   -ENOSPC if the superblock quota field could not be allocated
+ *   0       on success
+ */
+static int bch2_quota_set_info(struct super_block *sb, int type,
+			       struct qc_info *info)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_sb_field_quota *sb_quota;
+	int ret = 0;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	/* @type is signed: reject negatives before it is used as a shift: */
+	if (type < 0 || type >= QTYP_NR)
+		return -EINVAL;
+
+	if (!((1 << type) & enabled_qtypes(c)))
+		return -ESRCH;
+
+	if (info->i_fieldmask &
+	    ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+	sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+	if (!sb_quota) {
+		sb_quota = bch2_sb_resize_quota(&c->disk_sb,
+					sizeof(*sb_quota) / sizeof(u64));
+		if (!sb_quota) {
+			/* must not return with sb_lock still held */
+			ret = -ENOSPC;
+			goto unlock;
+		}
+	}
+
+	if (info->i_fieldmask & QC_SPC_TIMER)
+		sb_quota->q[type].c[Q_SPC].timelimit =
+			cpu_to_le32(info->i_spc_timelimit);
+
+	if (info->i_fieldmask & QC_SPC_WARNS)
+		sb_quota->q[type].c[Q_SPC].warnlimit =
+			cpu_to_le32(info->i_spc_warnlimit);
+
+	if (info->i_fieldmask & QC_INO_TIMER)
+		sb_quota->q[type].c[Q_INO].timelimit =
+			cpu_to_le32(info->i_ino_timelimit);
+
+	if (info->i_fieldmask & QC_INO_WARNS)
+		sb_quota->q[type].c[Q_INO].warnlimit =
+			cpu_to_le32(info->i_ino_warnlimit);
+
+	/* Refresh c->quotas[].limits from the values just stored: */
+	bch2_sb_quota_read(c);
+
+	bch2_write_super(c);
+unlock:
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/* Get/set individual quotas: */
+
+/*
+ * Fill the usage, limit, timer and warning fields of @dst from the in-memory
+ * quota entry @src. Space usage and space limits are shifted left by 9 on the
+ * way out (the space counter is fed with sector counts elsewhere in this
+ * file); inode values are copied unchanged.
+ */
+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
+{
+	const struct memquota_counter *spc = &src->c[Q_SPC];
+	const struct memquota_counter *ino = &src->c[Q_INO];
+
+	dst->d_space		= spc->v << 9;
+	dst->d_spc_hardlimit	= spc->hardlimit << 9;
+	dst->d_spc_softlimit	= spc->softlimit << 9;
+	dst->d_spc_timer	= spc->timer;
+	dst->d_spc_warns	= spc->warns;
+
+	dst->d_ino_count	= ino->v;
+	dst->d_ino_hardlimit	= ino->hardlimit;
+	dst->d_ino_softlimit	= ino->softlimit;
+	dst->d_ino_timer	= ino->timer;
+	dst->d_ino_warns	= ino->warns;
+}
+
+/*
+ * quotactl ->get_dqblk: report the in-memory quota entry for @kqid.
+ *
+ * @qdq is zeroed first, so an ID without an entry in the table reads back as
+ * all zeroes. Always returns 0.
+ */
+static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
+			  struct qc_dqblk *qdq)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_memquota_type *q = &c->quotas[kqid.type];
+	struct bch_memquota *entry;
+
+	memset(qdq, 0, sizeof(*qdq));
+
+	mutex_lock(&q->lock);
+
+	entry = genradix_ptr(&q->table, from_kqid(&init_user_ns, kqid));
+	if (entry)
+		__bch2_quota_get(qdq, entry);
+
+	mutex_unlock(&q->lock);
+
+	return 0;
+}
+
+/*
+ * quotactl ->get_nextdqblk: scan the in-memory table of kqid->type, starting
+ * at the ID in *kqid, for the first entry that is not all zeroes. On success
+ * the entry is copied to @qdq and *kqid is updated to the ID that was found.
+ *
+ * Returns 0 if an entry was found, -ENOENT otherwise.
+ */
+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
+			       struct qc_dqblk *qdq)
+{
+	struct bch_fs *c		= sb->s_fs_info;
+	struct bch_memquota_type *q	= &c->quotas[kqid->type];
+	qid_t qid			= from_kqid(&init_user_ns, *kqid);
+	struct genradix_iter iter;
+	struct bch_memquota *mq;
+	int ret = 0;
+
+	mutex_lock(&q->lock);
+
+	/* An entry counts as "in use" if any byte of it is nonzero: */
+	genradix_for_each_from(&q->table, iter, mq, qid)
+		if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
+			__bch2_quota_get(qdq, mq);
+			/*
+			 * NOTE(review): the ID is decoded with init_user_ns
+			 * above but re-encoded with current_user_ns() here -
+			 * confirm this asymmetry is intended.
+			 */
+			*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
+			goto found;
+		}
+
+	ret = -ENOENT;
+found:
+	mutex_unlock(&q->lock);
+	return ret;
+}
+
+/*
+ * quotactl ->set_dqblk: update the soft/hard limits of quota ID @qid.
+ *
+ * The existing quota key (if any) is read, the limits selected by
+ * qdq->d_fieldmask are replaced (space limits are stored shifted right by 9),
+ * the key is committed to the quotas btree and finally applied to the
+ * in-memory state with __bch2_quota_set().
+ *
+ * Returns -EROFS on a read-only mount, a btree error, or the result of
+ * __bch2_quota_set().
+ */
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+			  struct qc_dqblk *qdq)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_i_quota new_quota;
+	int ret;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	bkey_quota_init(&new_quota.k_i);
+	new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_slot(iter);
+
+	ret = bkey_err(k);
+	/* The transaction is live from here on: exit it on every path */
+	if (unlikely(ret))
+		goto err;
+
+	/* Start from the existing limits so unselected fields are kept: */
+	if (k.k->type == KEY_TYPE_quota)
+		new_quota.v = *bkey_s_c_to_quota(k).v;
+
+	if (qdq->d_fieldmask & QC_SPC_SOFT)
+		new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+	if (qdq->d_fieldmask & QC_SPC_HARD)
+		new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+
+	if (qdq->d_fieldmask & QC_INO_SOFT)
+		new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+	if (qdq->d_fieldmask & QC_INO_HARD)
+		new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+	bch2_trans_update(&trans, iter, &new_quota.k_i);
+
+	ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+err:
+	bch2_trans_exit(&trans);
+
+	if (ret)
+		return ret;
+
+	return __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+}
+
+/* quotactl entry points implemented by the functions above. */
+const struct quotactl_ops bch2_quotactl_operations = {
+	/* Filesystem-wide enable/disable/delete: */
+	.quota_enable		= bch2_quota_enable,
+	.quota_disable		= bch2_quota_disable,
+	.rm_xquota		= bch2_quota_remove,
+
+	/* Per-type state, timers and warning limits: */
+	.get_state		= bch2_quota_get_state,
+	.set_info		= bch2_quota_set_info,
+
+	/* Individual quota entries: */
+	.get_dqblk		= bch2_get_quota,
+	.get_nextdqblk		= bch2_get_next_quota,
+	.set_dqblk		= bch2_set_quota,
+};
+
+#endif /* CONFIG_BCACHEFS_QUOTA */
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
new file mode 100644
index 000000000000..51e4f9713ef0
--- /dev/null
+++ b/fs/bcachefs/quota.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_H
+#define _BCACHEFS_QUOTA_H
+
+#include "inode.h"
+#include "quota_types.h"
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
+
+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+/* struct bkey_ops for quota keys: validation and text-formatting callbacks. */
+#define bch2_bkey_ops_quota (struct bkey_ops) {		\
+	.key_invalid	= bch2_quota_invalid,		\
+	.val_to_text	= bch2_quota_to_text,		\
+}
+
+/*
+ * Build the set of quota IDs for an unpacked inode: user ID, group ID and
+ * project ID. bi_project is stored with a +1 bias, with 0 mapping to
+ * project 0.
+ */
+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
+{
+	struct bch_qid qid = { { 0 } };
+
+	qid.q[QTYP_USR] = u->bi_uid;
+	qid.q[QTYP_GRP] = u->bi_gid;
+	if (u->bi_project)
+		qid.q[QTYP_PRJ] = u->bi_project - 1;
+
+	return qid;
+}
+
+/*
+ * Bitmask of quota types enabled by the usrquota/grpquota/prjquota options:
+ * bit QTYP_* is set when the matching option is set.
+ */
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+	unsigned mask = 0;
+
+	mask |= c->opts.usrquota << QTYP_USR;
+	mask |= c->opts.grpquota << QTYP_GRP;
+	mask |= c->opts.prjquota << QTYP_PRJ;
+
+	return mask;
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
+		    s64, enum quota_acct_mode);
+
+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
+			struct bch_qid, u64, enum quota_acct_mode);
+
+void bch2_fs_quota_exit(struct bch_fs *);
+void bch2_fs_quota_init(struct bch_fs *);
+int bch2_fs_quota_read(struct bch_fs *);
+
+extern const struct quotactl_ops bch2_quotactl_operations;
+
+#else
+
+/*
+ * Stubs used when CONFIG_BCACHEFS_QUOTA is not set: accounting, transfer and
+ * read report success without doing anything, init/exit are empty.
+ */
+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+				  enum quota_counters counter, s64 v,
+				  enum quota_acct_mode mode)
+{
+	return 0;
+}
+
+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+				      struct bch_qid dst,
+				      struct bch_qid src, u64 space,
+				      enum quota_acct_mode mode)
+{
+	return 0;
+}
+
+static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
+static inline void bch2_fs_quota_init(struct bch_fs *c) {}
+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
+
+#endif
+
+#endif /* _BCACHEFS_QUOTA_H */
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
new file mode 100644
index 000000000000..6a136083d389
--- /dev/null
+++ b/fs/bcachefs/quota_types.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_TYPES_H
+#define _BCACHEFS_QUOTA_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+/* One quota ID per quota type, indexed by QTYP_*. */
+struct bch_qid {
+	u32		q[QTYP_NR];
+};
+
+/*
+ * Mode argument of bch2_quota_acct()/bch2_quota_transfer().
+ * NOTE(review): the names suggest how limit violations are handled (checked
+ * up front / warn only / not checked) - confirm against bch2_quota_acct().
+ */
+enum quota_acct_mode {
+	KEY_TYPE_QUOTA_PREALLOC,
+	KEY_TYPE_QUOTA_WARN,
+	KEY_TYPE_QUOTA_NOCHECK,
+};
+
+/* In-memory state of one quota counter (space or inodes) for one ID. */
+struct memquota_counter {
+	u64				v;		/* current usage */
+	u64				hardlimit;
+	u64				softlimit;
+	s64				timer;
+	int				warns;
+	int				warning_issued;
+};
+
+/* In-memory quota entry for one ID: one counter per Q_* counter type. */
+struct bch_memquota {
+	struct memquota_counter		c[Q_COUNTERS];
+};
+
+/* Generic radix tree of struct bch_memquota; quota.c indexes it by quota ID. */
+typedef GENRADIX(struct bch_memquota)	bch_memquota_table;
+
+/* Time and warning limit of one counter, as read from the superblock. */
+struct quota_limit {
+	u32				timelimit;
+	u32				warnlimit;
+};
+
+/* In-memory quota state for one quota type (one entry of c->quotas[]). */
+struct bch_memquota_type {
+	struct quota_limit		limits[Q_COUNTERS];
+	bch_memquota_table		table;
+	/* taken around lookups in @table by the quotactl get handlers */
+	struct mutex			lock;
+};
+
+#endif /* _BCACHEFS_QUOTA_TYPES_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644
index 000000000000..84b3fb6eb101
--- /dev/null
+++ b/fs/bcachefs/rebalance.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "extents.h"
+#include "io.h"
+#include "move.h"
+#include "rebalance.h"
+#include "super-io.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+#include <trace/events/bcachefs.h>
+
+/*
+ * Does pointer @p need rebalancing under @io_opts?
+ *
+ * True if a background target is set and @p is a non-cached pointer to a
+ * device outside that target, or if background compression is set and @p's
+ * compression type differs from the one that option maps to.
+ */
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+				      struct extent_ptr_decoded p,
+				      struct bch_io_opts *io_opts)
+{
+	if (io_opts->background_target &&
+	    !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) &&
+	    !p.ptr.cached)
+		return true;
+
+	return io_opts->background_compression &&
+		p.crc.compression_type !=
+		bch2_compression_opt_to_type[io_opts->background_compression];
+}
+
+/*
+ * Account rebalance work for key @k: for every pointer flagged by
+ * rebalance_ptr_pred(), add the pointer's compressed size to the
+ * rebalance_work counter of the device it points to. The rebalance thread is
+ * woken when the counter's new value equals the amount just added.
+ */
+void bch2_rebalance_add_key(struct bch_fs *c,
+			    struct bkey_s_c k,
+			    struct bch_io_opts *io_opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	/* Nothing can match when neither background option is set: */
+	if (!(io_opts->background_target ||
+	      io_opts->background_compression))
+		return;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if (rebalance_ptr_pred(c, p, io_opts)) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+			s64 total = atomic64_add_return(p.crc.compressed_size,
+							&ca->rebalance_work);
+
+			if (total == p.crc.compressed_size)
+				rebalance_wakeup(c);
+		}
+	}
+}
+
+/*
+ * Add @sectors of rebalance work that is not attributed to a device, waking
+ * the rebalance thread when the counter's new value equals @sectors.
+ */
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+	s64 total = atomic64_add_return(sectors,
+					&c->rebalance.work_unknown_dev);
+
+	if (total == sectors)
+		rebalance_wakeup(c);
+}
+
+/*
+ * Predicate passed to bch2_move_data() by the rebalance thread.
+ *
+ * Returns DATA_ADD_REPLICAS (targeting io_opts->background_target) if any
+ * pointer of @k is flagged by rebalance_ptr_pred(), or if @k has fewer
+ * non-cached pointers than io_opts->data_replicas; DATA_SKIP otherwise.
+ * @arg is unused.
+ */
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+				    struct bkey_s_c k,
+				    struct bch_io_opts *io_opts,
+				    struct data_opts *data_opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned nr_replicas = 0;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		/* only non-cached pointers count as replicas */
+		nr_replicas += !p.ptr.cached;
+
+		if (rebalance_ptr_pred(c, p, io_opts))
+			goto found;
+	}
+
+	if (nr_replicas < io_opts->data_replicas)
+		goto found;
+
+	return DATA_SKIP;
+found:
+	data_opts->target		= io_opts->background_target;
+	data_opts->btree_insert_flags	= 0;
+	return DATA_ADD_REPLICAS;
+}
+
+/*
+ * Snapshot of pending rebalance work, filled in by rebalance_work(). The
+ * dev_most_full_* fields describe the entry with the highest work/capacity
+ * percentage; the index is -1 for the filesystem-wide entry.
+ */
+struct rebalance_work {
+	int		dev_most_full_idx;
+	unsigned	dev_most_full_percent;
+	u64		dev_most_full_work;
+	u64		dev_most_full_capacity;
+	u64		total_work;	/* sum of per-entry work, overflow-checked */
+};
+
+/*
+ * Fold one device (or the filesystem-wide entry, @idx == -1) into @w.
+ *
+ * The entry's work is @dev_work + @unknown_dev, saturated at U64_MAX on
+ * overflow and clamped to @capacity. If its percentage of @capacity is at
+ * least the current maximum it becomes the "most full" entry (so later
+ * entries win ties). Only @dev_work is added to w->total_work, and only if
+ * the addition does not wrap.
+ *
+ * NOTE(review): @capacity is used as a divisor; callers are presumably
+ * expected to pass a nonzero value - confirm.
+ */
+static void rebalance_work_accumulate(struct rebalance_work *w,
+		u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+{
+	unsigned percent_full;
+	u64 work = dev_work + unknown_dev;
+
+	/* unsigned wraparound check: saturate instead of wrapping */
+	if (work < dev_work || work < unknown_dev)
+		work = U64_MAX;
+	work = min(work, capacity);
+
+	percent_full = div64_u64(work * 100, capacity);
+
+	if (percent_full >= w->dev_most_full_percent) {
+		w->dev_most_full_idx		= idx;
+		w->dev_most_full_percent	= percent_full;
+		w->dev_most_full_work		= work;
+		w->dev_most_full_capacity	= capacity;
+	}
+
+	/* skip the addition entirely if it would wrap total_work */
+	if (w->total_work + dev_work >= w->total_work &&
+	    w->total_work + dev_work >= dev_work)
+		w->total_work += dev_work;
+}
+
+/*
+ * Take a snapshot of pending rebalance work: one accumulate step per online
+ * member device (its own counter plus the unattributed work, against the
+ * sector count of its buckets from first_bucket to nbuckets), followed by one
+ * step for the unattributed work against the filesystem capacity (index -1).
+ */
+static struct rebalance_work rebalance_work(struct bch_fs *c)
+{
+	struct rebalance_work ret = { .dev_most_full_idx = -1 };
+	u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
+	struct bch_dev *ca;
+	unsigned i;
+
+	for_each_online_member(ca, c, i) {
+		u64 dev_work = atomic64_read(&ca->rebalance_work);
+		u64 dev_capacity = bucket_to_sector(ca, ca->mi.nbuckets -
+						    ca->mi.first_bucket);
+
+		rebalance_work_accumulate(&ret, dev_work, unknown_dev,
+					  dev_capacity, i);
+	}
+
+	rebalance_work_accumulate(&ret, unknown_dev, 0, c->capacity, -1);
+
+	return ret;
+}
+
+/*
+ * Zero the rebalance work counter of every online member device, then the
+ * counter of work not attributed to a device.
+ */
+static void rebalance_work_reset(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_idx;
+
+	for_each_online_member(ca, c, dev_idx) {
+		atomic64_set(&ca->rebalance_work, 0);
+	}
+
+	atomic64_set(&c->rebalance.work_unknown_dev, 0);
+}
+
+/* Total (user + system) CPU time consumed by the current task, in jiffies. */
+static unsigned long curr_cputime(void)
+{
+	u64 user_ns, sys_ns;
+
+	task_cputime_adjusted(current, &user_ns, &sys_ns);
+
+	return nsecs_to_jiffies(user_ns + sys_ns);
+}
+
+/*
+ * Main function of the rebalance kthread (@arg is the struct bch_fs).
+ *
+ * Each iteration, once kthread_wait_freezable(r->enabled) returns zero:
+ *  - take a snapshot of pending work;
+ *  - if there is none, enter REBALANCE_WAITING until some appears;
+ *  - if the fullest device is under 20% and the computed throttle is
+ *    positive, enter REBALANCE_THROTTLED and wait on the write io clock;
+ *  - otherwise scale the pd controller rate, reset the work counters and run
+ *    bch2_move_data() over the whole key space with rebalance_pred().
+ *
+ * Returns 0 when the loop exits.
+ */
+static int bch2_rebalance_thread(void *arg)
+{
+	struct bch_fs *c = arg;
+	struct bch_fs_rebalance *r = &c->rebalance;
+	struct io_clock *clock = &c->io_clock[WRITE];
+	struct rebalance_work w, p;
+	unsigned long start, prev_start;
+	unsigned long prev_run_time, prev_run_cputime;
+	unsigned long cputime, prev_cputime;
+	unsigned long io_start;
+	long throttle;
+
+	set_freezable();
+
+	/* Baselines for the first iteration: */
+	io_start	= atomic_long_read(&clock->now);
+	p		= rebalance_work(c);
+	prev_start	= jiffies;
+	prev_cputime	= curr_cputime();
+
+	while (!kthread_wait_freezable(r->enabled)) {
+		start			= jiffies;
+		cputime			= curr_cputime();
+
+		/* wall clock and cpu time (in jiffies) since the last run started */
+		prev_run_time		= start - prev_start;
+		prev_run_cputime	= cputime - prev_cputime;
+
+		w			= rebalance_work(c);
+		BUG_ON(!w.dev_most_full_capacity);
+
+		if (!w.total_work) {
+			r->state = REBALANCE_WAITING;
+			kthread_wait_freezable(rebalance_work(c).total_work);
+			continue;
+		}
+
+		/*
+		 * If there isn't much work to do, throttle cpu usage:
+		 */
+		throttle = prev_run_cputime * 100 /
+			max(1U, w.dev_most_full_percent) -
+			prev_run_time;
+
+		if (w.dev_most_full_percent < 20 && throttle > 0) {
+			r->state = REBALANCE_THROTTLED;
+			/*
+			 * The io-clock deadline grows with how far below 20%
+			 * the fullest device is:
+			 */
+			r->throttled_until_iotime = io_start +
+				div_u64(w.dev_most_full_capacity *
+					(20 - w.dev_most_full_percent),
+					50);
+			r->throttled_until_cputime = start + throttle;
+
+			bch2_kthread_io_clock_wait(clock,
+				r->throttled_until_iotime,
+				throttle);
+			continue;
+		}
+
+		/* minimum 1 mb/sec: */
+		r->pd.rate.rate =
+			max_t(u64, 1 << 11,
+			      r->pd.rate.rate *
+			      max(p.dev_most_full_percent, 1U) /
+			      max(w.dev_most_full_percent, 1U));
+
+		/* This run becomes the baseline for the next iteration: */
+		io_start	= atomic_long_read(&clock->now);
+		p		= w;
+		prev_start	= start;
+		prev_cputime	= cputime;
+
+		r->state = REBALANCE_RUNNING;
+		memset(&r->move_stats, 0, sizeof(r->move_stats));
+		rebalance_work_reset(c);
+
+		bch2_move_data(c,
+			       /* ratelimiting disabled for now */
+			       NULL, /*  &r->pd.rate, */
+			       writepoint_ptr(&c->rebalance_write_point),
+			       POS_MIN, POS_MAX,
+			       rebalance_pred, NULL,
+			       &r->move_stats);
+	}
+
+	return 0;
+}
+
+/*
+ * Format a human readable rebalance status report into @buf (at most
+ * PAGE_SIZE bytes): the fullest device and its work/capacity, total work vs.
+ * filesystem capacity, the current rate, and the thread state with
+ * state-specific details.
+ *
+ * Returns the number of bytes written.
+ */
+ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	struct bch_fs_rebalance *r = &c->rebalance;
+	struct rebalance_work w = rebalance_work(c);
+	char h1[21], h2[21];
+
+	/* work counters are shifted left by 9 before printing */
+	bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9);
+	bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9);
+	pr_buf(&out, "fullest_dev (%i):\t%s/%s\n",
+	       w.dev_most_full_idx, h1, h2);
+
+	bch2_hprint(&PBUF(h1), w.total_work << 9);
+	bch2_hprint(&PBUF(h2), c->capacity << 9);
+	pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2);
+
+	pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate);
+
+	switch (r->state) {
+	case REBALANCE_WAITING:
+		pr_buf(&out, "waiting\n");
+		break;
+	case REBALANCE_THROTTLED:
+		/* remaining io until the io-clock deadline set by the thread */
+		bch2_hprint(&PBUF(h1),
+			    (r->throttled_until_iotime -
+			     atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+		pr_buf(&out, "throttled for %lu sec or %s io\n",
+		       (r->throttled_until_cputime - jiffies) / HZ,
+		       h1);
+		break;
+	case REBALANCE_RUNNING:
+		pr_buf(&out, "running\n");
+		pr_buf(&out, "pos %llu:%llu\n",
+		       r->move_stats.pos.inode,
+		       r->move_stats.pos.offset);
+		break;
+	}
+
+	return out.pos - buf;
+}
+
+/*
+ * Stop the rebalance thread, if one is running: the rate limit is set to
+ * UINT_MAX and reset, the thread pointer is cleared, and after an RCU grace
+ * period the thread is stopped and the reference taken in
+ * bch2_rebalance_start() is dropped.
+ */
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+	struct task_struct *p;
+
+	c->rebalance.pd.rate.rate = UINT_MAX;
+	bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+	p = rcu_dereference_protected(c->rebalance.thread, 1);
+	c->rebalance.thread = NULL;
+
+	if (p) {
+		/* for synchronizing with rebalance_wakeup() */
+		synchronize_rcu();
+
+		kthread_stop(p);
+		put_task_struct(p);
+	}
+}
+
+/*
+ * Create and start the rebalance kthread; a no-op when the nochanges option
+ * is set. A task reference is taken before the pointer is published (it is
+ * dropped again in bch2_rebalance_stop()).
+ *
+ * Returns 0 on success or the error from kthread_create().
+ */
+int bch2_rebalance_start(struct bch_fs *c)
+{
+	struct task_struct *thread;
+
+	if (c->opts.nochanges)
+		return 0;
+
+	thread = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+	if (IS_ERR(thread))
+		return PTR_ERR(thread);
+
+	get_task_struct(thread);
+	rcu_assign_pointer(c->rebalance.thread, thread);
+
+	wake_up_process(thread);
+
+	return 0;
+}
+
+/*
+ * Initialize rebalance state: the pd controller, and the unattributed work
+ * counter, which starts at S64_MAX.
+ * NOTE(review): presumably the huge initial value forces a full first pass
+ * once the thread starts - confirm.
+ */
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+	bch2_pd_controller_init(&c->rebalance.pd);
+
+	atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
+}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
new file mode 100644
index 000000000000..99e2a1fb6084
--- /dev/null
+++ b/fs/bcachefs/rebalance.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_H
+#define _BCACHEFS_REBALANCE_H
+
+#include "rebalance_types.h"
+
+/*
+ * Wake the rebalance thread if there is one. The thread pointer is read under
+ * rcu_read_lock(); bch2_rebalance_stop() waits for an RCU grace period after
+ * clearing it before stopping the thread.
+ */
+static inline void rebalance_wakeup(struct bch_fs *c)
+{
+	struct task_struct *thread;
+
+	rcu_read_lock();
+
+	thread = rcu_dereference(c->rebalance.thread);
+	if (thread)
+		wake_up_process(thread);
+
+	rcu_read_unlock();
+}
+
+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
+			    struct bch_io_opts *);
+void bch2_rebalance_add_work(struct bch_fs *, u64);
+
+ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
+
+void bch2_rebalance_stop(struct bch_fs *);
+int bch2_rebalance_start(struct bch_fs *);
+void bch2_fs_rebalance_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
new file mode 100644
index 000000000000..192c6be20ced
--- /dev/null
+++ b/fs/bcachefs/rebalance_types.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
+#define _BCACHEFS_REBALANCE_TYPES_H
+
+#include "move_types.h"
+
+/* State of the rebalance thread, as set by bch2_rebalance_thread(). */
+enum rebalance_state {
+	REBALANCE_WAITING,	/* no work pending */
+	REBALANCE_THROTTLED,	/* little work pending: waiting on the io clock */
+	REBALANCE_RUNNING,	/* moving data */
+};
+
+/* Per-filesystem rebalance state. */
+struct bch_fs_rebalance {
+	/* the rebalance kthread; read under RCU by rebalance_wakeup() */
+	struct task_struct __rcu *thread;
+	struct bch_pd_controller pd;
+
+	/* pending work (in sectors) not attributed to a particular device */
+	atomic64_t		work_unknown_dev;
+
+	enum rebalance_state	state;
+	/* deadlines recorded when entering REBALANCE_THROTTLED */
+	unsigned long		throttled_until_iotime;
+	unsigned long		throttled_until_cputime;
+	struct bch_move_stats	move_stats;
+
+	/* the thread waits until this is nonzero before doing anything */
+	unsigned		enabled:1;
+};
+
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
new file mode 100644
index 000000000000..2c441a278044
--- /dev/null
+++ b/fs/bcachefs/recovery.c
@@ -0,0 +1,1047 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "ec.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "quota.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+#include <linux/stat.h>
+
+/* Initializer for a qstr-style name: sets .len to strlen(n) and .name to n. */
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+/* iterate over keys read from the journal: */
+
+/*
+ * Return an iterator over the keys in @keys that belong to btree @id,
+ * positioned at the first element of the key array.
+ */
+struct journal_iter bch2_journal_iter_init(struct journal_keys *keys,
+					   enum btree_id id)
+{
+	struct journal_iter iter = {
+		.keys		= keys,
+		.k		= keys->d,
+		.btree_id	= id,
+	};
+
+	return iter;
+}
+
+/*
+ * Advance @iter past keys of other btrees and return the key it then points
+ * at, or bkey_s_c_null once the end of the key array is reached.
+ */
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+	for (; iter->k != iter->keys->d + iter->keys->nr; iter->k++)
+		if (iter->k->btree_id == iter->btree_id)
+			return bkey_i_to_s_c(iter->k->k);
+
+	return bkey_s_c_null;
+}
+
+/*
+ * Step @iter forward by one key and return the next key of its btree, or
+ * bkey_s_c_null if the iterator is already at the end of the key array.
+ */
+struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+{
+	if (iter->k != iter->keys->d + iter->keys->nr) {
+		iter->k++;
+		return bch2_journal_iter_peek(iter);
+	}
+
+	return bkey_s_c_null;
+}
+
+/* sort and dedup all keys in the journal: */
+
+/*
+ * Unlink and free every struct journal_replay on @list; the size passed to
+ * kvpfree() is the header up to ->j plus the size of the jset itself.
+ */
+static void journal_entries_free(struct list_head *list)
+{
+	struct journal_replay *i, *next;
+
+	list_for_each_entry_safe(i, next, list, list) {
+		list_del(&i->list);
+		kvpfree(i, offsetof(struct journal_replay, j) +
+			vstruct_bytes(&i->j));
+	}
+}
+
+/*
+ * sort() comparator: order journal keys by btree ID, then position, then
+ * journal sequence number, then offset within the journal entry - so for keys
+ * at the same position, older versions sort first.
+ */
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+	const struct journal_key *l = _l;
+	const struct journal_key *r = _r;
+
+	return cmp_int(l->btree_id, r->btree_id) ?:
+		bkey_cmp(l->pos, r->pos) ?:
+		cmp_int(l->journal_seq, r->journal_seq) ?:
+		cmp_int(l->journal_offset, r->journal_offset);
+}
+
+/*
+ * sort() comparator: order journal keys by journal sequence number first,
+ * then btree ID, then position. Used to put keys in replay order.
+ */
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
+{
+	const struct journal_key *l = _l;
+	const struct journal_key *r = _r;
+
+	return cmp_int(l->journal_seq, r->journal_seq) ?:
+		cmp_int(l->btree_id, r->btree_id) ?:
+		bkey_cmp(l->pos, r->pos);
+}
+
+/*
+ * Restore sort order after the key at @i was modified: swap it with its right
+ * neighbour for as long as it compares greater (per journal_sort_key_cmp()).
+ */
+static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
+{
+	struct journal_key *end = keys->d + keys->nr;
+
+	for (; i + 1 < end && journal_sort_key_cmp(i, i + 1) > 0; i++)
+		swap(i[0], i[1]);
+}
+
+/*
+ * Free a journal_keys array: keys flagged ->allocated own their bkey and have
+ * it kfree()d, then the array itself is released and @keys reset to empty.
+ */
+static void journal_keys_free(struct journal_keys *keys)
+{
+	struct journal_key *key;
+
+	for_each_journal_key(*keys, key) {
+		if (i_owns_bkey(key))
+			kfree(key->k);
+	}
+
+	kvfree(keys->d);
+	keys->nr = 0;
+	keys->d = NULL;
+}
+
+/*
+ * Collect every key from the journal entries on @journal_entries into one
+ * array, sorted by (btree, position), with overlaps between keys resolved so
+ * that newer keys (higher journal seq/offset) win over older ones.
+ *
+ * Keys are trimmed in place inside the journal buffers; when an older key
+ * extends past both ends of a newer one, its front part is copied into a
+ * newly allocated key (flagged ->allocated).
+ *
+ * Returns the deduplicated array, or an empty struct journal_keys (d == NULL)
+ * on allocation failure. The caller owns the result.
+ *
+ * NOTE(review): journal_seq_base is read from the first list entry, so this
+ * assumes a non-empty list - confirm callers guarantee that.
+ */
+static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
+{
+	struct journal_replay *p;
+	struct jset_entry *entry;
+	struct bkey_i *k, *_n;
+	struct journal_keys keys = { NULL }, keys_deduped = { NULL };
+	struct journal_key *i;
+	size_t nr_keys = 0;
+
+	/* First pass: count keys so the arrays can be sized up front */
+	list_for_each_entry(p, journal_entries, list)
+		for_each_jset_key(k, _n, entry, &p->j)
+			nr_keys++;
+
+	/* journal_seq in each key is stored relative to this base */
+	keys.journal_seq_base = keys_deduped.journal_seq_base =
+		le64_to_cpu(list_first_entry(journal_entries,
+					     struct journal_replay,
+					     list)->j.seq);
+
+	keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
+	if (!keys.d)
+		goto err;
+
+	/* twice the size: presumably headroom for keys created by splitting */
+	keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
+	if (!keys_deduped.d)
+		goto err;
+
+	/* Second pass: record each key, positioned at its start */
+	list_for_each_entry(p, journal_entries, list)
+		for_each_jset_key(k, _n, entry, &p->j)
+			keys.d[keys.nr++] = (struct journal_key) {
+				.btree_id	= entry->btree_id,
+				.pos		= bkey_start_pos(&k->k),
+				.k		= k,
+				.journal_seq	= le64_to_cpu(p->j.seq) -
+					keys.journal_seq_base,
+				.journal_offset	= k->_data - p->j._data,
+			};
+
+	sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
+
+	i = keys.d;
+	while (i < keys.d + keys.nr) {
+		/*
+		 * Same btree and same start position: the sort puts the older
+		 * key first, so i[1] is newer than i[0].
+		 */
+		if (i + 1 < keys.d + keys.nr &&
+		    i[0].btree_id == i[1].btree_id &&
+		    !bkey_cmp(i[0].pos, i[1].pos)) {
+			if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
+				/* i[0] ends no later than i[1]: drop it */
+				i++;
+			} else {
+				/* keep only the part of i[0] past i[1]'s end */
+				bch2_cut_front(i[1].k->k.p, i[0].k);
+				i[0].pos = i[1].k->k.p;
+				journal_keys_sift(&keys, i);
+			}
+			continue;
+		}
+
+		/* Same btree and i[0] runs past the start of i[1]: overlap */
+		if (i + 1 < keys.d + keys.nr &&
+		    i[0].btree_id == i[1].btree_id &&
+		    bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
+			if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
+			     cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
+				/* i[0] is older than i[1] */
+				if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
+					/* trim i[0]'s tail, then emit it below */
+					bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k);
+				} else {
+					/*
+					 * i[0] sticks out on both sides of
+					 * i[1]: emit a copy of its front part
+					 * and keep its tail in the array.
+					 */
+					struct bkey_i *split =
+						kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
+
+					if (!split)
+						goto err;
+
+					bkey_copy(split, i[0].k);
+					bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k);
+					keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
+						.btree_id	= i[0].btree_id,
+						.allocated	= true,
+						.pos		= bkey_start_pos(&split->k),
+						.k		= split,
+						.journal_seq	= i[0].journal_seq,
+						.journal_offset	= i[0].journal_offset,
+					};
+
+					bch2_cut_front(i[1].k->k.p, i[0].k);
+					i[0].pos = i[1].k->k.p;
+					journal_keys_sift(&keys, i);
+					continue;
+				}
+			} else {
+				/* i[0] is newer than i[1] */
+				if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
+					/* i[1] is fully covered: replace it with i[0] */
+					i[1] = i[0];
+					i++;
+					continue;
+				} else {
+					/* trim the covered front of i[1] */
+					bch2_cut_front(i[0].k->k.p, i[1].k);
+					i[1].pos = i[0].k->k.p;
+					journal_keys_sift(&keys, i + 1);
+					continue;
+				}
+			}
+		}
+
+		/* No remaining conflict with the next key: emit i[0] */
+		keys_deduped.d[keys_deduped.nr++] = *i++;
+	}
+
+	kvfree(keys.d);
+	return keys_deduped;
+err:
+	journal_keys_free(&keys_deduped);
+	kvfree(keys.d);
+	return (struct journal_keys) { NULL };
+}
+
+/* journal replay: */
+
+/*
+ * Advance the replay position to @seq, dropping the journal pin of every
+ * sequence number passed on the way. @seq must lie between the current replay
+ * position and replay_journal_seq_end (inclusive).
+ */
+static void replay_now_at(struct journal *j, u64 seq)
+{
+	BUG_ON(seq < j->replay_journal_seq);
+	BUG_ON(seq > j->replay_journal_seq_end);
+
+	/* replay_journal_seq is already incremented when the put is called */
+	while (j->replay_journal_seq < seq)
+		bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
+
+/*
+ * Replay one extent key @k into btree @btree_id.
+ *
+ * Within a single transaction the key is cut into pieces that each end at or
+ * before the end of the leaf node the iterator points at, one update is
+ * queued per piece, and everything is committed at once. If a compressed or
+ * reflink_p key has to be split, a disk reservation is taken and the key is
+ * re-marked as part of the commit instead of being inserted with
+ * BTREE_INSERT_NOMARK.
+ *
+ * The whole sequence is restarted when it fails with -EINTR.
+ *
+ * Returns the error from bch2_trans_exit() if any, else the commit result.
+ */
+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
+				  struct bkey_i *k)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter, *split_iter;
+	/*
+	 * We might cause compressed extents to be split, so we need to pass in
+	 * a disk_reservation:
+	 */
+	struct disk_reservation disk_res =
+		bch2_disk_reservation_init(c, 0);
+	struct bkey_i *split;
+	struct bpos atomic_end;
+	/*
+	 * Some extents aren't equivalent - w.r.t. what the triggers do
+	 * - if they're split:
+	 */
+	bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) ||
+		k->k.type == KEY_TYPE_reflink_p;
+	bool remark = false;
+	int ret;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	iter = bch2_trans_get_iter(&trans, btree_id,
+				   bkey_start_pos(&k->k),
+				   BTREE_ITER_INTENT);
+
+	do {
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			goto err;
+
+		/* this piece ends at the key's end or the leaf's end, whichever is first */
+		atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
+
+		split_iter = bch2_trans_copy_iter(&trans, iter);
+		ret = PTR_ERR_OR_ZERO(split_iter);
+		if (ret)
+			goto err;
+
+		split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
+		ret = PTR_ERR_OR_ZERO(split);
+		if (ret)
+			goto err;
+
+		/* Reserve space once, the first time a split turns out to be needed: */
+		if (!remark &&
+		    remark_if_split &&
+		    bkey_cmp(atomic_end, k->k.p) < 0) {
+			ret = bch2_disk_reservation_add(c, &disk_res,
+					k->k.size *
+					bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
+					BCH_DISK_RESERVATION_NOFAIL);
+			BUG_ON(ret);
+
+			remark = true;
+		}
+
+		/* Trim a copy of @k down to [split_iter->pos, atomic_end): */
+		bkey_copy(split, k);
+		bch2_cut_front(split_iter->pos, split);
+		bch2_cut_back(atomic_end, &split->k);
+
+		bch2_trans_update(&trans, split_iter, split);
+		bch2_btree_iter_set_pos(iter, split->k.p);
+	} while (bkey_cmp(iter->pos, k->k.p) < 0);
+
+	if (remark) {
+		/* mark the original key as overwritten, then commit with marking on */
+		ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
+					  0, -((s64) k->k.size),
+					  BCH_BUCKET_MARK_OVERWRITE) ?:
+		      bch2_trans_commit(&trans, &disk_res, NULL,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_LAZY_RW|
+					BTREE_INSERT_NOMARK_OVERWRITES|
+					BTREE_INSERT_NO_CLEAR_REPLICAS);
+	} else {
+		ret = bch2_trans_commit(&trans, &disk_res, NULL,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_LAZY_RW|
+					BTREE_INSERT_JOURNAL_REPLAY|
+					BTREE_INSERT_NOMARK);
+	}
+
+	if (ret)
+		goto err;
+err:
+	if (ret == -EINTR)
+		goto retry;
+
+	bch2_disk_reservation_put(c, &disk_res);
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
+
+/*
+ * Replay all keys in @keys into the btrees.
+ *
+ * The array is re-sorted in place into journal sequence order. For each key
+ * the replay position is advanced to the key's sequence number and the key is
+ * replayed by the alloc-specific, extent-specific or generic insert path
+ * depending on its btree. Once all keys are done, the remaining pins are
+ * dropped, replay is marked done and all journal pins are flushed.
+ *
+ * Returns the first replay error, or bch2_journal_error() after the flush.
+ */
+static int bch2_journal_replay(struct bch_fs *c,
+			       struct journal_keys keys)
+{
+	struct journal *j = &c->journal;
+	struct journal_key *i;
+	int ret;
+
+	sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
+
+	for_each_journal_key(keys, i) {
+		/* journal_seq is stored relative to journal_seq_base */
+		replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+
+		if (i->btree_id == BTREE_ID_ALLOC)
+			ret = bch2_alloc_replay_key(c, i->k);
+		else if (btree_node_type_is_extents(i->btree_id))
+			ret = bch2_extent_replay_key(c, i->btree_id, i->k);
+		else
+			ret = bch2_btree_insert(c, i->btree_id, i->k,
+						NULL, NULL,
+						BTREE_INSERT_NOFAIL|
+						BTREE_INSERT_LAZY_RW|
+						BTREE_INSERT_JOURNAL_REPLAY|
+						BTREE_INSERT_NOMARK);
+
+		if (ret) {
+			bch_err(c, "journal replay: error %d while replaying key",
+				ret);
+			return ret;
+		}
+
+		cond_resched();
+	}
+
+	/* Drop the pins of any sequence numbers after the last key: */
+	replay_now_at(j, j->replay_journal_seq_end);
+	j->replay_journal_seq = 0;
+
+	bch2_journal_set_replay_done(j);
+	bch2_journal_flush_all_pins(j);
+	return bch2_journal_error(j);
+}
+
+/*
+ * True if there is nothing to replay: the list of journal entries is empty,
+ * or its last entry is empty according to journal_entry_empty().
+ */
+static bool journal_empty(struct list_head *journal)
+{
+	struct journal_replay *last;
+
+	if (list_empty(journal))
+		return true;
+
+	last = list_last_entry(journal, struct journal_replay, list);
+
+	return journal_entry_empty(&last->j);
+}
+
+/*
+ * Check the list of journal entries about to be replayed for gaps and for
+ * blacklisted entries.
+ *
+ * The expected sequence number starts at the last entry's last_seq. For each
+ * entry, a mismatch with the expected number is reported as missing entries
+ * and a blacklisted sequence number is reported as such; the expected number
+ * then moves on to the next sequence number that is not blacklisted.
+ *
+ * Returns 0, or whatever the fsck_err machinery stores in ret.
+ */
+static int
+verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
+						  struct list_head *journal)
+{
+	struct journal_replay *i =
+		list_last_entry(journal, struct journal_replay, list);
+	u64 start_seq	= le64_to_cpu(i->j.last_seq);
+	u64 end_seq	= le64_to_cpu(i->j.seq);
+	u64 seq		= start_seq;
+	int ret = 0;
+
+	list_for_each_entry(i, journal, list) {
+		fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
+			"journal entries %llu-%llu missing! (replaying %llu-%llu)",
+			seq, le64_to_cpu(i->j.seq) - 1,
+			start_seq, end_seq);
+
+		/* resynchronize on the entry actually found */
+		seq = le64_to_cpu(i->j.seq);
+
+		fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
+			    "found blacklisted journal entry %llu", seq);
+
+		/* blacklisted sequence numbers are not expected to be present */
+		do {
+			seq++;
+		} while (bch2_journal_seq_is_blacklisted(c, seq, false));
+	}
+fsck_err:
+	return ret;
+}
+
+/* journal replay early: */
+
+/*
+ * Apply a single journal (or superblock clean section) entry that has to be
+ * processed before the rest of recovery: btree roots, filesystem usage
+ * counters, per-replicas usage and journal sequence blacklist ranges.
+ *
+ * Entry types not listed in the switch are ignored here.
+ *
+ * Returns 0 on success, -EINVAL for a btree root with an unknown btree id, or
+ * the error returned by bch2_replicas_set_usage() /
+ * bch2_journal_seq_blacklist_add().
+ */
+static int journal_replay_entry_early(struct bch_fs *c,
+				      struct jset_entry *entry)
+{
+	int ret = 0;
+
+	switch (entry->type) {
+	case BCH_JSET_ENTRY_btree_root: {
+		struct btree_root *r;
+
+		if (entry->btree_id >= BTREE_ID_NR) {
+			bch_err(c, "filesystem has unknown btree type %u",
+				entry->btree_id);
+			return -EINVAL;
+		}
+
+		r = &c->btree_roots[entry->btree_id];
+
+		if (entry->u64s) {
+			r->level = entry->level;
+			bkey_copy(&r->key, &entry->start[0]);
+			r->error = 0;
+		} else {
+			/* root entry present but carries no key */
+			r->error = -EIO;
+		}
+		/* marked alive either way; r->error records whether it's usable */
+		r->alive = true;
+		break;
+	}
+	case BCH_JSET_ENTRY_usage: {
+		struct jset_entry_usage *u =
+			container_of(entry, struct jset_entry_usage, entry);
+
+		/* for usage entries, btree_id selects which counter this is */
+		switch (entry->btree_id) {
+		case FS_USAGE_RESERVED:
+			/* level indexes persistent_reserved[]; out of range is dropped */
+			if (entry->level < BCH_REPLICAS_MAX)
+				c->usage_base->persistent_reserved[entry->level] =
+					le64_to_cpu(u->v);
+			break;
+		case FS_USAGE_INODES:
+			c->usage_base->nr_inodes = le64_to_cpu(u->v);
+			break;
+		case FS_USAGE_KEY_VERSION:
+			atomic64_set(&c->key_version,
+				     le64_to_cpu(u->v));
+			break;
+		}
+
+		break;
+	}
+	case BCH_JSET_ENTRY_data_usage: {
+		struct jset_entry_data_usage *u =
+			container_of(entry, struct jset_entry_data_usage, entry);
+		ret = bch2_replicas_set_usage(c, &u->r,
+					      le64_to_cpu(u->v));
+		break;
+	}
+	case BCH_JSET_ENTRY_blacklist: {
+		struct jset_entry_blacklist *bl_entry =
+			container_of(entry, struct jset_entry_blacklist, entry);
+
+		/* v1 entries blacklist a single sequence number: [seq, seq + 1) */
+		ret = bch2_journal_seq_blacklist_add(c,
+				le64_to_cpu(bl_entry->seq),
+				le64_to_cpu(bl_entry->seq) + 1);
+		break;
+	}
+	case BCH_JSET_ENTRY_blacklist_v2: {
+		struct jset_entry_blacklist_v2 *bl_entry =
+			container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+		/* v2 entries carry a range; end + 1 is passed as the upper bound */
+		ret = bch2_journal_seq_blacklist_add(c,
+				le64_to_cpu(bl_entry->start),
+				le64_to_cpu(bl_entry->end) + 1);
+		break;
+	}
+	}
+
+	return ret;
+}
+
+/*
+ * Run journal_replay_entry_early() over every entry that recovery needs to
+ * see up front, and load the bucket read/write clock hands.
+ *
+ * If @clean is non-NULL the entries and clocks come from the superblock clean
+ * section; otherwise they come from the journal entries on @journal (clocks
+ * from the last entry on the list, which must therefore be non-empty).
+ *
+ * Returns 0 on success or the first error from journal_replay_entry_early().
+ */
+static int journal_replay_early(struct bch_fs *c,
+				struct bch_sb_field_clean *clean,
+				struct list_head *journal)
+{
+	struct jset_entry *entry;
+	int ret;
+
+	if (clean) {
+		struct jset_entry *end = vstruct_end(&clean->field);
+
+		c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
+		c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
+
+		/*
+		 * Terminate on '<' rather than '!=', matching
+		 * btree_root_find(): if an entry's length overshoots the end
+		 * of the section we stop instead of walking off the buffer.
+		 */
+		for (entry = clean->start;
+		     entry < end;
+		     entry = vstruct_next(entry)) {
+			ret = journal_replay_entry_early(c, entry);
+			if (ret)
+				return ret;
+		}
+	} else {
+		struct journal_replay *i =
+			list_last_entry(journal, struct journal_replay, list);
+
+		c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+		c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+
+		list_for_each_entry(i, journal, list)
+			vstruct_for_each(&i->j, entry) {
+				ret = journal_replay_entry_early(c, entry);
+				if (ret)
+					return ret;
+			}
+	}
+
+	bch2_fs_usage_initialize(c);
+
+	return 0;
+}
+
+/* sb clean section: */
+
+/*
+ * Look up the btree root entry for btree @id, either in the superblock clean
+ * section (@clean non-NULL) or in journal entry @j.
+ *
+ * Returns NULL if there is no root entry for @id, ERR_PTR(-EINVAL) if the
+ * entry exists but holds no key, and otherwise a pointer to the root key with
+ * *@level set to the entry's level.
+ */
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+				      struct bch_sb_field_clean *clean,
+				      struct jset *j,
+				      enum btree_id id, unsigned *level)
+{
+	struct jset_entry *cur, *first, *last;
+
+	if (clean) {
+		first	= clean->start;
+		last	= vstruct_end(&clean->field);
+	} else {
+		first	= j->start;
+		last	= vstruct_last(j);
+	}
+
+	for (cur = first; cur < last; cur = vstruct_next(cur)) {
+		if (cur->type != BCH_JSET_ENTRY_btree_root ||
+		    cur->btree_id != id)
+			continue;
+
+		/* first matching entry wins */
+		if (!cur->u64s)
+			return ERR_PTR(-EINVAL);
+
+		*level = cur->level;
+		return cur->start;
+	}
+
+	return NULL;
+}
+
+/*
+ * Cross-check the superblock clean section against the last journal entry
+ * @j after a clean shutdown.
+ *
+ * Does nothing if the filesystem isn't marked clean or @j is NULL.  If the
+ * journal sequence numbers disagree and the error is to be fixed, the clean
+ * section is freed and *@cleanp is set to NULL so the caller falls back to the
+ * journal.  Otherwise the bucket clocks and every btree root are compared and
+ * mismatches are reported through mustfix_fsck_err_on().
+ *
+ * Returns 0, or the value the fsck_err machinery stored in ret before jumping
+ * to the fsck_err label.
+ */
+static int verify_superblock_clean(struct bch_fs *c,
+				   struct bch_sb_field_clean **cleanp,
+				   struct jset *j)
+{
+	unsigned i;
+	struct bch_sb_field_clean *clean = *cleanp;
+	int ret = 0;
+
+	if (!c->sb.clean || !j)
+		return 0;
+
+	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+			le64_to_cpu(clean->journal_seq),
+			le64_to_cpu(j->seq))) {
+		kfree(clean);
+		*cleanp = NULL;
+		return 0;
+	}
+
+	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+			"superblock read clock doesn't match journal after clean shutdown");
+	/* this check is on the write clock; the message used to say "read" */
+	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+			"superblock write clock doesn't match journal after clean shutdown");
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct bkey_i *k1, *k2;
+		unsigned l1 = 0, l2 = 0;
+
+		k1 = btree_root_find(c, clean, NULL, i, &l1);
+		k2 = btree_root_find(c, NULL, j, i, &l2);
+
+		/* root absent from both: nothing to compare */
+		if (!k1 && !k2)
+			continue;
+
+		/*
+		 * The NULL/IS_ERR tests come first so the short-circuit
+		 * evaluation never dereferences an invalid pointer:
+		 */
+		mustfix_fsck_err_on(!k1 || !k2 ||
+				    IS_ERR(k1) ||
+				    IS_ERR(k2) ||
+				    k1->k.u64s != k2->k.u64s ||
+				    memcmp(k1, k2, bkey_bytes(k1)) ||
+				    l1 != l2, c,
+			"superblock btree root doesn't match journal after clean shutdown");
+	}
+fsck_err:
+	return ret;
+}
+
+/*
+ * Return a kmemdup()'d copy of the superblock clean section, taken under
+ * sb_lock.
+ *
+ * Returns NULL (after clearing the clean flag both on disk_sb and in c->sb)
+ * if the section is missing and the fsck error is to be fixed,
+ * ERR_PTR(-ENOMEM) if the copy can't be allocated, or ERR_PTR(ret) with the
+ * value set by the fsck_err machinery.  The caller owns the returned buffer.
+ */
+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *clean, *sb_clean;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+
+	if (fsck_err_on(!sb_clean, c,
+			"superblock marked clean but clean section not present")) {
+		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+		c->sb.clean = false;
+		mutex_unlock(&c->sb_lock);
+		return NULL;
+	}
+
+	/* work on a private copy so sb_lock need not be held by the caller */
+	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+			GFP_KERNEL);
+	if (!clean) {
+		mutex_unlock(&c->sb_lock);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* superblocks older than the bkey renumbering need converting on read */
+	if (le16_to_cpu(c->disk_sb.sb->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		bch2_sb_clean_renumber(clean, READ);
+
+	mutex_unlock(&c->sb_lock);
+
+	return clean;
+fsck_err:
+	/* reached only via the fsck_err_on() above, with sb_lock still held */
+	mutex_unlock(&c->sb_lock);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Read in every btree root that was marked alive during early journal
+ * replay, then allocate an empty root for any btree that still has none.
+ *
+ * Problems with the alloc btree root are reported with FSCK_CAN_IGNORE and
+ * clear BCH_COMPAT_FEAT_ALLOC_INFO in c->sb.compat; problems with any other
+ * root are reported with no flags.
+ *
+ * NOTE(review): ret holds the result of the most recent
+ * bch2_btree_root_read() call and is overwritten on each iteration, so the
+ * value returned reflects only the last alive root that was read (unless the
+ * fsck_err machinery jumped out early) - confirm this is intended.
+ */
+static int read_btree_roots(struct bch_fs *c)
+{
+	unsigned i;
+	int ret = 0;
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct btree_root *r = &c->btree_roots[i];
+
+		if (!r->alive)
+			continue;
+
+		/* reconstruct_alloc: skip the existing alloc btree entirely */
+		if (i == BTREE_ID_ALLOC &&
+		    c->opts.reconstruct_alloc) {
+			c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+			continue;
+		}
+
+
+		if (r->error) {
+			__fsck_err(c, i == BTREE_ID_ALLOC
+				   ? FSCK_CAN_IGNORE : 0,
+				   "invalid btree root %s",
+				   bch2_btree_ids[i]);
+			if (i == BTREE_ID_ALLOC)
+				c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+		}
+
+		/*
+		 * NOTE(review): this is still reached when r->error was set
+		 * above and __fsck_err() did not bail out - verify r->key is
+		 * meaningful in that case.
+		 */
+		ret = bch2_btree_root_read(c, i, &r->key, r->level);
+		if (ret) {
+			__fsck_err(c, i == BTREE_ID_ALLOC
+				   ? FSCK_CAN_IGNORE : 0,
+				   "error reading btree root %s",
+				   bch2_btree_ids[i]);
+			if (i == BTREE_ID_ALLOC)
+				c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+		}
+	}
+
+	/* any btree without a root at this point gets a fresh one */
+	for (i = 0; i < BTREE_ID_NR; i++)
+		if (!c->btree_roots[i].b)
+			bch2_btree_root_alloc(c, i);
+fsck_err:
+	return ret;
+}
+
+/*
+ * bch2_fs_recovery - bring up an existing filesystem
+ *
+ * In order: read the superblock clean section (if marked clean), read and
+ * sort the journal (if unclean or running fsck), process the entries needed
+ * early (btree roots, usage, blacklists), start the journal, read btree
+ * roots, alloc info and stripes, run gc where required, replay the journal,
+ * write alloc info back, run the inode checks and/or full fsck, read quotas,
+ * and finally update the superblock.
+ *
+ * Returns 0 on success (including the norecovery early exit) or a nonzero
+ * error; on error a message built from the local err string is logged.
+ */
+int bch2_fs_recovery(struct bch_fs *c)
+{
+	const char *err = "cannot allocate memory";
+	struct bch_sb_field_clean *clean = NULL;
+	u64 journal_seq;
+	LIST_HEAD(journal_entries);
+	struct journal_keys journal_keys = { NULL };
+	bool wrote = false, write_sb = false;
+	int ret;
+
+	if (c->sb.clean)
+		clean = read_superblock_clean(c);
+	ret = PTR_ERR_OR_ZERO(clean);
+	if (ret) {
+		/* the exit path kfree()s clean; never hand it an ERR_PTR */
+		clean = NULL;
+		goto err;
+	}
+
+	/* read_superblock_clean() may have cleared c->sb.clean */
+	if (c->sb.clean)
+		bch_info(c, "recovering from clean shutdown, journal seq %llu",
+			 le64_to_cpu(clean->journal_seq));
+
+	if (!c->replicas.entries) {
+		bch_info(c, "building replicas info");
+		set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+	}
+
+	if (!c->sb.clean || c->opts.fsck) {
+		struct jset *j;
+
+		ret = bch2_journal_read(c, &journal_entries);
+		if (ret)
+			goto err;
+
+		if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c,
+				"filesystem marked clean but journal not empty")) {
+			c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+			SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+			c->sb.clean = false;
+		}
+
+		if (!c->sb.clean && list_empty(&journal_entries)) {
+			bch_err(c, "no journal entries found");
+			ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
+			goto err;
+		}
+
+		journal_keys = journal_keys_sort(&journal_entries);
+		if (!journal_keys.d) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		j = &list_last_entry(&journal_entries,
+				     struct journal_replay, list)->j;
+
+		/* may free clean and set it to NULL on a seq mismatch */
+		ret = verify_superblock_clean(c, &clean, j);
+		if (ret)
+			goto err;
+
+		journal_seq = le64_to_cpu(j->seq) + 1;
+	} else {
+		journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+	}
+
+	ret = journal_replay_early(c, clean, &journal_entries);
+	if (ret)
+		goto err;
+
+	if (!c->sb.clean) {
+		/* after an unclean shutdown, blacklist the next 4 seqs and skip them */
+		ret = bch2_journal_seq_blacklist_add(c,
+						     journal_seq,
+						     journal_seq + 4);
+		if (ret) {
+			bch_err(c, "error creating new journal seq blacklist entry");
+			goto err;
+		}
+
+		journal_seq += 4;
+	}
+
+	/* previously this result was silently overwritten below */
+	ret = bch2_blacklist_table_initialize(c);
+	if (ret) {
+		bch_err(c, "error initializing blacklist table");
+		goto err;
+	}
+
+	if (!list_empty(&journal_entries)) {
+		ret = verify_journal_entries_not_blacklisted_or_missing(c,
+							&journal_entries);
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_fs_journal_start(&c->journal, journal_seq,
+				    &journal_entries);
+	if (ret)
+		goto err;
+
+	ret = read_btree_roots(c);
+	if (ret)
+		goto err;
+
+	bch_verbose(c, "starting alloc read");
+	err = "error reading allocation information";
+	ret = bch2_alloc_read(c, &journal_keys);
+	if (ret)
+		goto err;
+	bch_verbose(c, "alloc read done");
+
+	bch_verbose(c, "starting stripes_read");
+	err = "error reading stripes";
+	ret = bch2_stripes_read(c, &journal_keys);
+	if (ret)
+		goto err;
+	bch_verbose(c, "stripes_read done");
+
+	set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+
+	if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
+	    !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
+		/*
+		 * interior btree node updates aren't consistent with the
+		 * journal; after an unclean shutdown we have to walk all
+		 * pointers to metadata:
+		 */
+		bch_info(c, "starting metadata mark and sweep");
+		err = "error in mark and sweep";
+		ret = bch2_gc(c, NULL, true, true);
+		if (ret)
+			goto err;
+		bch_verbose(c, "mark and sweep done");
+	}
+
+	if (c->opts.fsck ||
+	    !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
+	    test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
+		bch_info(c, "starting mark and sweep");
+		err = "error in mark and sweep";
+		ret = bch2_gc(c, &journal_keys, true, false);
+		if (ret)
+			goto err;
+		bch_verbose(c, "mark and sweep done");
+	}
+
+	clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+	set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
+	/*
+	 * Skip past versions that might have possibly been used (as nonces),
+	 * but hadn't had their pointers written:
+	 */
+	if (c->sb.encryption_type && !c->sb.clean)
+		atomic64_add(1 << 16, &c->key_version);
+
+	if (c->opts.norecovery)
+		goto out;
+
+	bch_verbose(c, "starting journal replay");
+	err = "journal replay failed";
+	ret = bch2_journal_replay(c, journal_keys);
+	if (ret)
+		goto err;
+	bch_verbose(c, "journal replay done");
+
+	if (!c->opts.nochanges) {
+		/*
+		 * note that even when filesystem was clean there might be work
+		 * to do here, if we ran gc (because of fsck) which recalculated
+		 * oldest_gen:
+		 */
+		bch_verbose(c, "writing allocation info");
+		err = "error writing out alloc info";
+		ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?:
+			bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote);
+		if (ret) {
+			bch_err(c, "error writing alloc info");
+			goto err;
+		}
+		bch_verbose(c, "alloc write done");
+
+		set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags);
+	}
+
+	if (!c->sb.clean) {
+		if (!(c->sb.features & (1ULL << BCH_FEATURE_ATOMIC_NLINK))) {
+			bch_info(c, "checking inode link counts");
+			err = "error in recovery";
+			ret = bch2_fsck_inode_nlink(c);
+			if (ret)
+				goto err;
+			bch_verbose(c, "check inodes done");
+
+		} else {
+			bch_verbose(c, "checking for deleted inodes");
+			err = "error in recovery";
+			ret = bch2_fsck_walk_inodes_only(c);
+			if (ret)
+				goto err;
+			bch_verbose(c, "check inodes done");
+		}
+	}
+
+	if (c->opts.fsck) {
+		bch_info(c, "starting fsck");
+		err = "error in fsck";
+		ret = bch2_fsck_full(c);
+		if (ret)
+			goto err;
+		bch_verbose(c, "fsck done");
+	}
+
+	if (enabled_qtypes(c)) {
+		bch_verbose(c, "reading quotas");
+		err = "error reading quotas";
+		ret = bch2_fs_quota_read(c);
+		if (ret)
+			goto err;
+		bch_verbose(c, "quotas done");
+	}
+
+	/*
+	 * Fields of c->disk_sb.sb are in on-disk (little endian) byte order,
+	 * so CPU-order values are converted with cpu_to_le*() before storing:
+	 */
+	mutex_lock(&c->sb_lock);
+	if (c->opts.version_upgrade) {
+		if (c->sb.version < bcachefs_metadata_version_new_versioning)
+			c->disk_sb.sb->version_min =
+				cpu_to_le16(bcachefs_metadata_version_min);
+		c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+		write_sb = true;
+	}
+
+	if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+		c->disk_sb.sb->compat[0] |=
+			cpu_to_le64(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+		write_sb = true;
+	}
+
+	if (c->opts.fsck &&
+	    !test_bit(BCH_FS_ERROR, &c->flags)) {
+		c->disk_sb.sb->features[0] |=
+			cpu_to_le64(1ULL << BCH_FEATURE_ATOMIC_NLINK);
+		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+		write_sb = true;
+	}
+
+	if (write_sb)
+		bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	if (c->journal_seq_blacklist_table &&
+	    c->journal_seq_blacklist_table->nr > 128)
+		queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
+out:
+	ret = 0;
+err:
+fsck_err:
+	set_bit(BCH_FS_FSCK_DONE, &c->flags);
+	bch2_flush_fsck_errs(c);
+
+	journal_keys_free(&journal_keys);
+	journal_entries_free(&journal_entries);
+	kfree(clean);
+	if (ret)
+		bch_err(c, "Error in recovery: %s (%i)", err, ret);
+	else
+		bch_verbose(c, "ret %i", ret);
+	return ret;
+}
+
+/*
+ * bch2_fs_initialize - set up a brand new filesystem
+ *
+ * Marks the device superblocks, allocates empty btree roots and journal
+ * buckets, starts the journal, goes read-write, creates the root directory
+ * and lost+found, reads quotas if enabled, writes the first journal entry and
+ * finally marks the superblock as initialized (and not clean).
+ *
+ * Returns 0 on success or a nonzero error; on error a message built from the
+ * local err string is logged.
+ */
+int bch2_fs_initialize(struct bch_fs *c)
+{
+	struct bch_inode_unpacked root_inode, lostfound_inode;
+	struct bkey_inode_buf packed_inode;
+	struct qstr lostfound = QSTR("lost+found");
+	const char *err = "cannot allocate memory";
+	struct bch_dev *ca;
+	LIST_HEAD(journal);
+	unsigned i;
+	int ret;
+
+	bch_notice(c, "initializing new filesystem");
+
+	mutex_lock(&c->sb_lock);
+	for_each_online_member(ca, c, i)
+		bch2_mark_dev_superblock(c, ca, 0);
+	mutex_unlock(&c->sb_lock);
+
+	/* nothing to read or gc on a new filesystem */
+	set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+	set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
+	for (i = 0; i < BTREE_ID_NR; i++)
+		bch2_btree_root_alloc(c, i);
+
+	err = "unable to allocate journal buckets";
+	for_each_online_member(ca, c, i) {
+		ret = bch2_dev_journal_alloc(ca);
+		if (ret) {
+			/* drop the ref held by the iterator before bailing out */
+			percpu_ref_put(&ca->io_ref);
+			goto err;
+		}
+	}
+
+	/*
+	 * journal_res_get() will crash if called before this has
+	 * set up the journal.pin FIFO and journal.cur pointer:
+	 */
+	err = "error starting journal";
+	ret = bch2_fs_journal_start(&c->journal, 1, &journal);
+	if (ret)
+		goto err;
+	bch2_journal_set_replay_done(&c->journal);
+
+	err = "error going read write";
+	ret = __bch2_fs_read_write(c, true);
+	if (ret)
+		goto err;
+
+	bch2_inode_init(c, &root_inode, 0, 0,
+			S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
+	root_inode.bi_inum = BCACHEFS_ROOT_INO;
+	bch2_inode_pack(&packed_inode, &root_inode);
+
+	err = "error creating root directory";
+	ret = bch2_btree_insert(c, BTREE_ID_INODES,
+				&packed_inode.inode.k_i,
+				NULL, NULL, 0);
+	if (ret)
+		goto err;
+
+	bch2_inode_init_early(c, &lostfound_inode);
+
+	err = "error creating lost+found";
+	ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+		bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
+				  &root_inode, &lostfound_inode,
+				  &lostfound,
+				  0, 0, S_IFDIR|0755, 0,
+				  NULL, NULL));
+	if (ret)
+		goto err;
+
+	if (enabled_qtypes(c)) {
+		err = "error reading quotas";
+		ret = bch2_fs_quota_read(c);
+		if (ret)
+			goto err;
+	}
+
+	err = "error writing first journal entry";
+	ret = bch2_journal_meta(&c->journal);
+	if (ret)
+		goto err;
+
+	/*
+	 * Fields of c->disk_sb.sb are in on-disk (little endian) byte order,
+	 * so CPU-order values are converted with cpu_to_le*() before storing:
+	 */
+	mutex_lock(&c->sb_lock);
+	c->disk_sb.sb->version = c->disk_sb.sb->version_min =
+		cpu_to_le16(bcachefs_metadata_version_current);
+	c->disk_sb.sb->features[0] |=
+		cpu_to_le64(1ULL << BCH_FEATURE_ATOMIC_NLINK);
+
+	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+err:
+	pr_err("Error initializing new filesystem: %s (%i)", err, ret);
+	return ret;
+}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
new file mode 100644
index 000000000000..479ea46f8dcb
--- /dev/null
+++ b/fs/bcachefs/recovery.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_H
+#define _BCACHEFS_RECOVERY_H
+
+/*
+ * Flat array of the keys found in the journal, as used by recovery.
+ */
+struct journal_keys {
+	struct journal_key {
+		/* btree this key belongs to */
+		enum btree_id	btree_id:8;
+		/* NOTE(review): presumably set when k was allocated separately and must be freed - confirm */
+		unsigned	allocated:1;
+		struct bpos	pos;
+		struct bkey_i	*k;
+		/* journal sequence number, relative to journal_seq_base below */
+		u32		journal_seq;
+		u32		journal_offset;
+	}			*d;
+	/* number of elements in d[] */
+	size_t			nr;
+	/* added to journal_key.journal_seq to get the absolute sequence number */
+	u64			journal_seq_base;
+};
+
+/*
+ * Iterate @i (a struct journal_key pointer) over every element of @keys (a
+ * struct journal_keys, passed by value/lvalue, not by pointer).
+ *
+ * Both macro arguments are fully parenthesized so that arbitrary expressions
+ * can be passed safely.
+ */
+#define for_each_journal_key(keys, i)				\
+	for ((i) = (keys).d; (i) < (keys).d + (keys).nr; (i)++)
+
+/*
+ * Cursor over a struct journal_keys; created by bch2_journal_iter_init() and
+ * advanced with bch2_journal_iter_peek()/bch2_journal_iter_next().
+ */
+struct journal_iter {
+	/* the key array being iterated */
+	struct journal_keys	*keys;
+	/* NOTE(review): presumably the current position within keys->d - confirm */
+	struct journal_key	*k;
+	/* NOTE(review): presumably the btree whose keys are being iterated - confirm */
+	enum btree_id		btree_id;
+};
+
+struct journal_iter bch2_journal_iter_init(struct journal_keys *,
+					   enum btree_id);
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *);
+struct bkey_s_c bch2_journal_iter_next(struct journal_iter *);
+
+int bch2_fs_recovery(struct bch_fs *);
+int bch2_fs_initialize(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
new file mode 100644
index 000000000000..6d45ae24479d
--- /dev/null
+++ b/fs/bcachefs/reflink.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "inode.h"
+#include "io.h"
+#include "reflink.h"
+
+#include <linux/sched/signal.h>
+
+/* reflink pointers */
+
+/*
+ * Validate a reflink pointer key: the value must be exactly the size of the
+ * reflink_p value struct.  Returns NULL if valid, else a description of the
+ * problem.
+ */
+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+	return bkey_val_bytes(p.k) == sizeof(*p.v)
+		? NULL
+		: "incorrect value size";
+}
+
+/* Print a reflink pointer's value (its idx field) to @out. */
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+	u64 idx = le64_to_cpu(p.v->idx);
+
+	pr_buf(out, "idx %llu", idx);
+}
+
+/*
+ * Try to merge reflink pointer @_r into @_l.
+ *
+ * The two can only be merged if @_r's idx starts exactly where @_l's range
+ * (idx + size) ends.  If the combined size would exceed KEY_SIZE_MAX, @_l is
+ * grown to KEY_SIZE_MAX, the front of @_r is cut back to @_l's new end and
+ * BCH_MERGE_PARTIAL is returned.
+ */
+enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
+				       struct bkey_s _l, struct bkey_s _r)
+{
+	struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+	struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r);
+	u64 l_idx_end	= le64_to_cpu(l.v->idx) + l.k->size;
+	u64 merged_size	= (u64) l.k->size + r.k->size;
+
+	/* not contiguous in the indirect extent space */
+	if (l_idx_end != le64_to_cpu(r.v->idx))
+		return BCH_MERGE_NOMERGE;
+
+	if (merged_size <= KEY_SIZE_MAX) {
+		bch2_key_resize(l.k, l.k->size + r.k->size);
+		return BCH_MERGE_MERGE;
+	}
+
+	bch2_key_resize(l.k, KEY_SIZE_MAX);
+	__bch2_cut_front(l.k->p, _r);
+
+	return BCH_MERGE_PARTIAL;
+}
+
+/* indirect extents */
+
+/*
+ * Validate an indirect extent key: the value must be at least as large as the
+ * reflink_v header, and the pointers it carries must pass
+ * bch2_bkey_ptrs_invalid().  Returns NULL if valid, else a description of the
+ * problem.
+ */
+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+	if (bkey_val_bytes(r.k) >= sizeof(*r.v))
+		return bch2_bkey_ptrs_invalid(c, k);
+
+	return "incorrect value size";
+}
+
+/* Print an indirect extent: its refcount followed by its pointers. */
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+	u64 refcount = le64_to_cpu(r.v->refcount);
+
+	pr_buf(out, "refcount: %llu ", refcount);
+	bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+/*
+ * Convert extent @e into an indirect extent plus a reflink pointer to it.
+ *
+ * A hole at least as large as @e is searched for in the reflink btree,
+ * starting from c->reflink_hint; a new reflink_v key holding a copy of @e's
+ * value (with refcount 0) is queued for insertion there, and @e itself is
+ * rewritten in place into a reflink_p pointing at it and queued as an update
+ * through @extent_iter.  Nothing is committed here; the updates are only
+ * added to @trans.
+ *
+ * Returns 0 on success or a negative error code.  On every path the reflink
+ * iterator obtained here is released again.
+ */
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+				     struct btree_iter *extent_iter,
+				     struct bkey_i_extent *e)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *reflink_iter;
+	struct bkey_s_c k;
+	struct bkey_i_reflink_v *r_v;
+	struct bkey_i_reflink_p *r_p;
+	int ret;
+
+	for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
+			   POS(0, c->reflink_hint),
+			   BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
+		/* ran off the end of inode 0: wrap around to the start */
+		if (reflink_iter->pos.inode) {
+			bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+			continue;
+		}
+
+		/* an empty slot big enough to hold the extent */
+		if (bkey_deleted(k.k) && e->k.size <= k.k->size)
+			break;
+	}
+
+	if (ret)
+		goto err;
+
+	/* rewind iter to start of hole, if necessary: */
+	bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
+
+	r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k));
+	ret = PTR_ERR_OR_ZERO(r_v);
+	if (ret)
+		goto err;
+
+	bkey_reflink_v_init(&r_v->k_i);
+	r_v->k.p	= reflink_iter->pos;
+	bch2_key_resize(&r_v->k, e->k.size);
+	r_v->k.version	= e->k.version;
+
+	/* value = reflink_v header followed by a copy of the extent's value */
+	set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) +
+			  bkey_val_u64s(&e->k));
+	r_v->v.refcount	= 0;
+	memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
+
+	bch2_trans_update(trans, reflink_iter, &r_v->k_i);
+
+	/*
+	 * On failure go through the common exit path: returning directly
+	 * here would skip bch2_trans_iter_put() and leak reflink_iter.
+	 *
+	 * NOTE(review): the buffer allocated here is not used - r_p is
+	 * repointed at e just below.  Kept as-is to preserve behaviour.
+	 */
+	r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
+	ret = PTR_ERR_OR_ZERO(r_p);
+	if (ret)
+		goto err;
+
+	/* turn the original extent into a reflink pointer, in place */
+	e->k.type = KEY_TYPE_reflink_p;
+	r_p = bkey_i_to_reflink_p(&e->k_i);
+	set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+	r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+
+	bch2_trans_update(trans, extent_iter, &r_p->k_i);
+err:
+	if (!IS_ERR(reflink_iter)) {
+		/* remember where we got to, for the next search */
+		c->reflink_hint = reflink_iter->pos.offset;
+		bch2_trans_iter_put(trans, reflink_iter);
+	}
+
+	return ret;
+}
+
+/*
+ * Advance @iter to the next key that is an extent or a reflink pointer.
+ *
+ * Returns bkey_s_c_null once the iterator position reaches or passes @end;
+ * otherwise returns k as left by the iteration (the matching key, or
+ * whatever value terminated the loop - callers check it with bkey_err()).
+ */
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+	struct bkey_s_c k = bch2_btree_iter_peek(iter);
+	int ret;
+
+	/* ret is set by the loop macro but not otherwise used here */
+	for_each_btree_key_continue(iter, 0, k, ret) {
+		if (bkey_cmp(iter->pos, end) >= 0)
+			return bkey_s_c_null;
+
+		if (k.k->type == KEY_TYPE_extent ||
+		    k.k->type == KEY_TYPE_reflink_p)
+			break;
+	}
+
+	return k;
+}
+
+/*
+ * bch2_remap_range - make a destination range share data with a source range
+ *
+ * For @remap_sectors starting at @src_start, each source extent is converted
+ * to an indirect extent (if it isn't one already) and a reflink pointer to
+ * the same data is written at the corresponding position starting at
+ * @dst_start.  Parts of the destination range with no corresponding source
+ * data are punched out.  Afterwards the destination inode's size is raised
+ * towards @new_i_size if it is smaller.
+ *
+ * Takes a ref on c->writes for the duration, and sets the reflink feature
+ * bit in the superblock on first use.
+ *
+ * Returns the distance the destination iterator advanced from @dst_start if
+ * that is nonzero, otherwise the first error encountered (-EROFS if the
+ * write ref couldn't be taken).
+ */
+s64 bch2_remap_range(struct bch_fs *c,
+		     struct bpos dst_start, struct bpos src_start,
+		     u64 remap_sectors, u64 *journal_seq,
+		     u64 new_i_size, s64 *i_sectors_delta)
+{
+	struct btree_trans trans;
+	struct btree_iter *dst_iter, *src_iter;
+	struct bkey_s_c src_k;
+	BKEY_PADDED(k) new_dst, new_src;
+	struct bpos dst_end = dst_start, src_end = src_start;
+	struct bpos dst_want, src_want;
+	u64 src_done, dst_done;
+	int ret = 0, ret2 = 0;
+
+	if (!percpu_ref_tryget(&c->writes))
+		return -EROFS;
+
+	/* check, lock, re-check: only write the superblock if the bit is still unset */
+	if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+		mutex_lock(&c->sb_lock);
+		if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+			c->disk_sb.sb->features[0] |=
+				cpu_to_le64(1ULL << BCH_FEATURE_REFLINK);
+
+			bch2_write_super(c);
+		}
+		mutex_unlock(&c->sb_lock);
+	}
+
+	dst_end.offset += remap_sectors;
+	src_end.offset += remap_sectors;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+
+	src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+				       BTREE_ITER_INTENT);
+	dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+				       BTREE_ITER_INTENT);
+
+	while (1) {
+		/* start each pass with a clean set of updates and scratch memory */
+		bch2_trans_begin_updates(&trans);
+		trans.mem_top = 0;
+
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			goto err;
+		}
+
+		src_k = get_next_src(src_iter, src_end);
+		ret = bkey_err(src_k);
+		if (ret)
+			goto btree_err;
+
+		/* how far into the range the source iterator is, clamped to the end */
+		src_done = bpos_min(src_iter->pos, src_end).offset -
+			src_start.offset;
+		dst_want = POS(dst_start.inode, dst_start.offset + src_done);
+
+		/* destination is behind the source: punch out the gap first */
+		if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
+			ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
+					     journal_seq, i_sectors_delta);
+			if (ret)
+				goto btree_err;
+			continue;
+		}
+
+		BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
+
+		if (!bkey_cmp(dst_iter->pos, dst_end))
+			break;
+
+		if (src_k.k->type == KEY_TYPE_extent) {
+			/* work on a private copy, trimmed to the range being remapped */
+			bkey_reassemble(&new_src.k, src_k);
+			src_k = bkey_i_to_s_c(&new_src.k);
+
+			bch2_cut_front(src_iter->pos,	&new_src.k);
+			bch2_cut_back(src_end,		&new_src.k.k);
+
+			ret = bch2_make_extent_indirect(&trans, src_iter,
+						bkey_i_to_extent(&new_src.k));
+			if (ret)
+				goto btree_err;
+
+			/* the helper rewrote new_src into a reflink pointer in place */
+			BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+		}
+
+		if (src_k.k->type == KEY_TYPE_reflink_p) {
+			struct bkey_s_c_reflink_p src_p =
+				bkey_s_c_to_reflink_p(src_k);
+			struct bkey_i_reflink_p *dst_p =
+				bkey_reflink_p_init(&new_dst.k);
+
+			/* skip the part of the source key before the iterator position */
+			u64 offset = le64_to_cpu(src_p.v->idx) +
+				(src_iter->pos.offset -
+				 bkey_start_offset(src_k.k));
+
+			dst_p->v.idx = cpu_to_le64(offset);
+		} else {
+			BUG();
+		}
+
+		/* size: whichever is smaller, what's left of the source key or of the dest range */
+		new_dst.k.k.p = dst_iter->pos;
+		bch2_key_resize(&new_dst.k.k,
+				min(src_k.k->p.offset - src_iter->pos.offset,
+				    dst_end.offset - dst_iter->pos.offset));
+
+		ret = bch2_extent_update(&trans, dst_iter, &new_dst.k,
+					 NULL, journal_seq,
+					 new_i_size, i_sectors_delta);
+		if (ret)
+			goto btree_err;
+
+		/* move the source iterator to match the destination's progress */
+		dst_done = dst_iter->pos.offset - dst_start.offset;
+		src_want = POS(src_start.inode, src_start.offset + dst_done);
+		bch2_btree_iter_set_pos(src_iter, src_want);
+btree_err:
+		/* -EINTR just means go around the loop again */
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			goto err;
+	}
+
+	BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
+err:
+	BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+
+	dst_done = dst_iter->pos.offset - dst_start.offset;
+	/* NOTE(review): << 9 presumably converts 512-byte sectors to bytes - confirm */
+	new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+
+	bch2_trans_begin(&trans);
+
+	do {
+		struct bch_inode_unpacked inode_u;
+		struct btree_iter *inode_iter;
+
+		inode_iter = bch2_inode_peek(&trans, &inode_u,
+				dst_start.inode, BTREE_ITER_INTENT);
+		ret2 = PTR_ERR_OR_ZERO(inode_iter);
+
+		/*
+		 * NOTE(review): inode_u is written back unmodified here;
+		 * bi_size is compared against new_i_size but never assigned
+		 * in this block - confirm whether that is intended.
+		 */
+		if (!ret2 &&
+		    inode_u.bi_size < new_i_size)
+			ret2  = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
+				bch2_trans_commit(&trans, NULL, journal_seq,
+						  BTREE_INSERT_ATOMIC);
+	} while (ret2 == -EINTR);
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	percpu_ref_put(&c->writes);
+
+	/* progress takes precedence over errors */
+	return dst_done ?: ret ?: ret2;
+}
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
new file mode 100644
index 000000000000..ac23b855858c
--- /dev/null
+++ b/fs/bcachefs/reflink.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+enum merge_result bch2_reflink_p_merge(struct bch_fs *,
+				       struct bkey_s, struct bkey_s);
+
+/* bkey operations for reflink pointer keys: validate, print, merge */
+#define bch2_bkey_ops_reflink_p (struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_p_invalid,		\
+	.val_to_text	= bch2_reflink_p_to_text,		\
+	.key_merge	= bch2_reflink_p_merge,		\
+}
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+
+
+/* bkey operations for indirect extent keys: validate and print (no merge) */
+#define bch2_bkey_ops_reflink_v (struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_v_invalid,		\
+	.val_to_text	= bch2_reflink_v_to_text,		\
+}
+
+s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
+		     u64, u64 *, u64, s64 *);
+
+#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
new file mode 100644
index 000000000000..cb5ebb87c701
--- /dev/null
+++ b/fs/bcachefs/replicas.c
@@ -0,0 +1,1076 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+					    struct bch_replicas_cpu *);
+
+/* Replicas tracking - in memory: */
+
+/*
+ * Compare two u8 values with cmp_int(); used as the comparator when sorting a
+ * replicas entry's devs[] array.
+ */
+static inline int u8_cmp(u8 l, u8 r)
+{
+	return cmp_int(l, r);
+}
+
+/*
+ * Debug-only sanity checks on a replicas entry (compiled out unless
+ * CONFIG_BCACHEFS_DEBUG is set):
+ *  - the data type is in range,
+ *  - there is at least one device,
+ *  - nr_required, when greater than 1, is strictly less than nr_devs,
+ *  - devs[] is strictly increasing (sorted, no duplicates).
+ */
+static void verify_replicas_entry(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned idx;
+
+	BUG_ON(e->data_type >= BCH_DATA_NR);
+	BUG_ON(!e->nr_devs);
+	BUG_ON(e->nr_required > 1 &&
+	       e->nr_required >= e->nr_devs);
+
+	/* compare each device with its predecessor */
+	for (idx = 1; idx < e->nr_devs; idx++)
+		BUG_ON(e->devs[idx - 1] >= e->devs[idx]);
+#endif
+}
+
+/* Sort the device list of @e in place, using u8_cmp as the comparator. */
+static void replicas_entry_sort(struct bch_replicas_entry *e)
+{
+	bubble_sort(e->devs, e->nr_devs, u8_cmp);
+}
+
+/*
+ * Sort the fixed-size entries of an in-memory replicas table with
+ * eytzinger0_sort(), comparing whole entries with memcmp. Lookups in
+ * __replicas_entry_idx() use eytzinger0_find() on the same array.
+ */
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
+/*
+ * Print one replicas entry to @out as
+ *   "<data type name>: <nr_required>/<nr_devs> [<dev> <dev> ...]"
+ */
+void bch2_replicas_entry_to_text(struct printbuf *out,
+				 struct bch_replicas_entry *e)
+{
+	unsigned dev;
+
+	pr_buf(out, "%s: %u/%u [",
+	       bch2_data_types[e->data_type],
+	       e->nr_required,
+	       e->nr_devs);
+
+	for (dev = 0; dev < e->nr_devs; dev++) {
+		/* space-separate every device after the first */
+		if (dev)
+			pr_buf(out, " %u", e->devs[dev]);
+		else
+			pr_buf(out, "%u", e->devs[dev]);
+	}
+
+	pr_buf(out, "]");
+}
+
+/*
+ * Print every entry of the in-memory replicas table @r to @out, separated by
+ * single spaces.
+ */
+void bch2_cpu_replicas_to_text(struct printbuf *out,
+			      struct bch_replicas_cpu *r)
+{
+	struct bch_replicas_entry *e;
+
+	for_each_cpu_replicas_entry(r, e) {
+		/* the iterator starts at r->entries: no separator before it */
+		if (e != r->entries)
+			pr_buf(out, " ");
+
+		bch2_replicas_entry_to_text(out, e);
+	}
+}
+
+/*
+ * Append the devices of every non-cached pointer in key @k to @r->devs.
+ *
+ * @r->nr_devs is only incremented here, never reset; the caller
+ * (bch2_bkey_to_replicas()) zeroes it first. nr_required starts at 1 and is
+ * dropped to 0 as soon as any non-cached pointer is erasure coded
+ * (p.has_ec).
+ */
+static void extent_to_replicas(struct bkey_s_c k,
+			       struct bch_replicas_entry *r)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	r->nr_required	= 1;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		/* cached pointers are tracked by separate BCH_DATA_CACHED entries */
+		if (p.ptr.cached)
+			continue;
+
+		if (p.has_ec)
+			r->nr_required = 0;
+
+		r->devs[r->nr_devs++] = p.ptr.dev;
+	}
+}
+
+/*
+ * Append the device of every block pointer in stripe key @k to @r->devs and
+ * set nr_required to the number of non-redundant blocks
+ * (nr_blocks - nr_redundant). @r->nr_devs is not reset here.
+ */
+static void stripe_to_replicas(struct bkey_s_c k,
+			       struct bch_replicas_entry *r)
+{
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	unsigned blk;
+
+	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
+
+	for (blk = 0; blk < s.v->nr_blocks; blk++)
+		r->devs[r->nr_devs++] = s.v->ptrs[blk].dev;
+}
+
+/*
+ * Build the replicas entry describing where key @k's data lives.
+ *
+ * btree pointers produce a BCH_DATA_BTREE entry; extents, reflink_v keys and
+ * stripes produce BCH_DATA_USER entries. For any other key type the entry is
+ * left with nr_devs == 0 and data_type/nr_required untouched (there is no
+ * default case). The device list is sorted before returning.
+ */
+void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+			   struct bkey_s_c k)
+{
+	e->nr_devs = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+		e->data_type = BCH_DATA_BTREE;
+		extent_to_replicas(k, e);
+		break;
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		e->data_type = BCH_DATA_USER;
+		extent_to_replicas(k, e);
+		break;
+	case KEY_TYPE_stripe:
+		e->data_type = BCH_DATA_USER;
+		stripe_to_replicas(k, e);
+		break;
+	}
+
+	replicas_entry_sort(e);
+}
+
+/*
+ * Fill @e from an explicit device list: the given data type, nr_required of
+ * 1, and a sorted copy of @devs.
+ *
+ * BUGs if @data_type is zero, BCH_DATA_SB, or out of range.
+ */
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+			      enum bch_data_type data_type,
+			      struct bch_devs_list devs)
+{
+	unsigned src;
+
+	BUG_ON(!data_type);
+	BUG_ON(data_type == BCH_DATA_SB);
+	BUG_ON(data_type >= BCH_DATA_NR);
+
+	e->nr_required	= 1;
+	e->nr_devs	= 0;
+	e->data_type	= data_type;
+
+	for (src = 0; src < devs.nr; src++) {
+		e->devs[e->nr_devs] = devs.devs[src];
+		e->nr_devs++;
+	}
+
+	replicas_entry_sort(e);
+}
+
+/*
+ * Build a new replicas table holding every entry of @old plus @new_entry.
+ *
+ * The new table's entry_size is the larger of @old's entry_size and the size
+ * of @new_entry; the array comes from kcalloc(), so bytes past the end of a
+ * shorter entry are zero. The result is sorted and returned by value; @old
+ * is neither modified nor freed.
+ *
+ * On allocation failure the returned table has .entries == NULL; callers
+ * must check for that.
+ */
+static struct bch_replicas_cpu
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+		       struct bch_replicas_entry *new_entry)
+{
+	unsigned i;
+	struct bch_replicas_cpu new = {
+		.nr		= old->nr + 1,
+		.entry_size	= max_t(unsigned, old->entry_size,
+					replicas_entry_bytes(new_entry)),
+	};
+
+	BUG_ON(!new_entry->data_type);
+	verify_replicas_entry(new_entry);
+
+	new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
+	if (!new.entries)
+		return new;
+
+	/* entry strides may differ between old and new, so copy one by one */
+	for (i = 0; i < old->nr; i++)
+		memcpy(cpu_replicas_entry(&new, i),
+		       cpu_replicas_entry(old, i),
+		       old->entry_size);
+
+	memcpy(cpu_replicas_entry(&new, old->nr),
+	       new_entry,
+	       replicas_entry_bytes(new_entry));
+
+	bch2_cpu_replicas_sort(&new);
+	return new;
+}
+
+/*
+ * Look up @search in table @r.
+ *
+ * Returns the index of the matching entry, or -1 if there is none. An entry
+ * larger than the table's entry_size cannot be present, so that case returns
+ * -1 without searching.
+ */
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+				       struct bch_replicas_entry *search)
+{
+	int idx, entry_size = replicas_entry_bytes(search);
+
+	if (unlikely(entry_size > r->entry_size))
+		return -1;
+
+	verify_replicas_entry(search);
+
+	/*
+	 * The comparator ignores the size it is handed and compares only the
+	 * bytes that @search actually occupies (the local entry_size), not the
+	 * table's full stride.
+	 */
+#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
+	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+			      entry_cmp, search);
+#undef entry_cmp
+
+	return idx < r->nr ? idx : -1;
+}
+
+/*
+ * Find @search in the filesystem's in-memory replicas table (c->replicas).
+ *
+ * Note that @search is modified: its device list is sorted in place before
+ * the lookup. Returns the entry's index, or -1 if not found.
+ */
+int bch2_replicas_entry_idx(struct bch_fs *c,
+			    struct bch_replicas_entry *search)
+{
+	replicas_entry_sort(search);
+
+	return __replicas_entry_idx(&c->replicas, search);
+}
+
+/* True if @search is present in table @r (see __replicas_entry_idx()). */
+static bool __replicas_has_entry(struct bch_replicas_cpu *r,
+				 struct bch_replicas_entry *search)
+{
+	return __replicas_entry_idx(r, search) >= 0;
+}
+
+/*
+ * Check whether @search is already recorded in c->replicas.
+ *
+ * An entry with no devices is trivially considered marked. When
+ * @check_gc_replicas is set and a GC table currently exists
+ * (c->replicas_gc.entries != NULL), the entry must be present in that table
+ * as well.
+ *
+ * This variant takes no locks itself; bch2_replicas_marked() wraps it in
+ * percpu_down_read(&c->mark_lock).
+ */
+static bool bch2_replicas_marked_locked(struct bch_fs *c,
+			  struct bch_replicas_entry *search,
+			  bool check_gc_replicas)
+{
+	if (!search->nr_devs)
+		return true;
+
+	verify_replicas_entry(search);
+
+	return __replicas_has_entry(&c->replicas, search) &&
+		(!check_gc_replicas ||
+		 likely((!c->replicas_gc.entries)) ||
+		 __replicas_has_entry(&c->replicas_gc, search));
+}
+
+/*
+ * Locked wrapper around bch2_replicas_marked_locked(): performs the check
+ * with c->mark_lock held for read.
+ */
+bool bch2_replicas_marked(struct bch_fs *c,
+			  struct bch_replicas_entry *search,
+			  bool check_gc_replicas)
+{
+	bool marked;
+
+	percpu_down_read(&c->mark_lock);
+	marked = bch2_replicas_marked_locked(c, search, check_gc_replicas);
+	percpu_up_read(&c->mark_lock);
+
+	return marked;
+}
+
+/*
+ * Copy usage counters from @src (indexed by table @src_r) into @dst (indexed
+ * by table @dst_r).
+ *
+ * The struct assignment copies the fixed part of struct bch_fs_usage; each
+ * non-zero per-entry counter is then moved to the slot its entry occupies in
+ * @dst_r. BUGs if an entry with a non-zero counter is missing from @dst_r.
+ */
+static void __replicas_table_update(struct bch_fs_usage *dst,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage *src,
+				    struct bch_replicas_cpu *src_r)
+{
+	int src_idx, dst_idx;
+
+	*dst = *src;
+
+	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+		/* zero counters need no remapping */
+		if (!src->replicas[src_idx])
+			continue;
+
+		dst_idx = __replicas_entry_idx(dst_r,
+				cpu_replicas_entry(src_r, src_idx));
+		BUG_ON(dst_idx < 0);
+
+		dst->replicas[dst_idx] = src->replicas[src_idx];
+	}
+}
+
+/*
+ * Per-cpu variant of __replicas_table_update().
+ *
+ * The source is first passed through bch2_acc_percpu_u64s() over src_nr u64s
+ * (the fixed struct plus one counter per entry of @src_r); presumably that
+ * folds all per-cpu copies into a single one -- TODO confirm against its
+ * definition. The result is then remapped into one cpu's copy of @dst_p.
+ *
+ * NOTE(review): the destination pointer is taken with preemption disabled
+ * but written after preemption is re-enabled; this looks safe only if the
+ * caller guarantees exclusive access to @dst_p -- confirm.
+ */
+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage __percpu *src_p,
+				    struct bch_replicas_cpu *src_r)
+{
+	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+	struct bch_fs_usage *dst, *src = (void *)
+		bch2_acc_percpu_u64s((void *) src_p, src_nr);
+
+	preempt_disable();
+	dst = this_cpu_ptr(dst_p);
+	preempt_enable();
+
+	__replicas_table_update(dst, dst_r, src, src_r);
+}
+
+/*
+ * Resize filesystem accounting:
+ *
+ * Switch the filesystem over to replicas table @new_r. Every usage structure
+ * that carries one counter per replicas entry (usage_base, usage[0..1],
+ * usage_scratch and, if present, usage_gc) is reallocated for @new_r->nr
+ * entries and the existing counters are carried over to their new indices.
+ *
+ * On success returns 0 and the old table is left in *@new_r (the tables are
+ * swapped), so the caller frees new_r->entries in either case. On allocation
+ * failure returns -ENOMEM and nothing in @c has been changed.
+ *
+ * Most callers in this file hold c->mark_lock for write around this call.
+ */
+static int replicas_table_update(struct bch_fs *c,
+				 struct bch_replicas_cpu *new_r)
+{
+	struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
+	struct bch_fs_usage *new_scratch = NULL;
+	struct bch_fs_usage __percpu *new_gc = NULL;
+	struct bch_fs_usage *new_base = NULL;
+	unsigned bytes = sizeof(struct bch_fs_usage) +
+		sizeof(u64) * new_r->nr;
+	int ret = -ENOMEM;
+
+	/* allocate everything up front so failure leaves @c untouched */
+	if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
+	    !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
+						GFP_NOIO)) ||
+	    !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
+						GFP_NOIO)) ||
+	    !(new_scratch  = kmalloc(bytes, GFP_NOIO)) ||
+	    (c->usage_gc &&
+	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
+		goto err;
+
+	if (c->usage_base)
+		__replicas_table_update(new_base,		new_r,
+					c->usage_base,		&c->replicas);
+	if (c->usage[0])
+		__replicas_table_update_pcpu(new_usage[0],	new_r,
+					     c->usage[0],	&c->replicas);
+	if (c->usage[1])
+		__replicas_table_update_pcpu(new_usage[1],	new_r,
+					     c->usage[1],	&c->replicas);
+	if (c->usage_gc)
+		__replicas_table_update_pcpu(new_gc,		new_r,
+					     c->usage_gc,	&c->replicas);
+
+	/* after the swaps the new_* locals hold the old buffers, freed below */
+	swap(c->usage_base,	new_base);
+	swap(c->usage[0],	new_usage[0]);
+	swap(c->usage[1],	new_usage[1]);
+	swap(c->usage_scratch,	new_scratch);
+	swap(c->usage_gc,	new_gc);
+	swap(c->replicas,	*new_r);
+	ret = 0;
+err:
+	free_percpu(new_gc);
+	kfree(new_scratch);
+	free_percpu(new_usage[1]);
+	free_percpu(new_usage[0]);
+	kfree(new_base);
+	return ret;
+}
+
+/*
+ * Compute how many u64s of journal space must be reserved to write out the
+ * usage entries for replicas table @r:
+ *  - one jset_entry_usage each for nr_inodes and key_version,
+ *  - BCH_REPLICAS_MAX jset_entry_usage entries for persistent_reserved,
+ *  - one jset_entry_data_usage (plus its device list) per replicas entry.
+ *
+ * @c is currently unused.
+ */
+static unsigned reserve_journal_replicas(struct bch_fs *c,
+				     struct bch_replicas_cpu *r)
+{
+	const unsigned usage_u64s =
+		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
+	struct bch_replicas_entry *e;
+	unsigned total;
+
+	/* nr_inodes + key_version + persistent_reserved[BCH_REPLICAS_MAX]: */
+	total = usage_u64s * (2 + BCH_REPLICAS_MAX);
+
+	for_each_cpu_replicas_entry(r, e) {
+		total += DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
+				      e->nr_devs, sizeof(u64));
+	}
+
+	return total;
+}
+
+/*
+ * Slow path of bch2_mark_replicas(): @new_entry is missing from c->replicas
+ * and/or from the in-progress GC table c->replicas_gc, so add it.
+ *
+ * Under c->sb_lock, new tables are built first; if c->replicas changes, the
+ * superblock copy is updated and written out *before* the in-memory table is
+ * switched over (under c->mark_lock), so in-memory state never gets ahead of
+ * what is persistent.
+ *
+ * Returns 0 on success (including when there turned out to be nothing to
+ * add), -ENOMEM if a table could not be allocated, or the error returned by
+ * bch2_cpu_replicas_to_sb_replicas() / replicas_table_update().
+ */
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+				struct bch_replicas_entry *new_entry)
+{
+	struct bch_replicas_cpu new_r, new_gc;
+	int ret = 0;
+
+	verify_replicas_entry(new_entry);
+
+	memset(&new_r, 0, sizeof(new_r));
+	memset(&new_gc, 0, sizeof(new_gc));
+
+	mutex_lock(&c->sb_lock);
+
+	if (c->replicas_gc.entries &&
+	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
+		new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
+		if (!new_gc.entries) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	if (!__replicas_has_entry(&c->replicas, new_entry)) {
+		new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
+		if (!new_r.entries) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
+		if (ret)
+			goto out;
+
+		bch2_journal_entry_res_resize(&c->journal,
+				&c->replicas_journal_res,
+				reserve_journal_replicas(c, &new_r));
+	}
+
+	/* someone else added the entry before we took sb_lock: ret == 0 */
+	if (!new_r.entries &&
+	    !new_gc.entries)
+		goto out;
+
+	/* allocations done, now commit: */
+
+	if (new_r.entries)
+		bch2_write_super(c);
+
+	/* don't update in memory replicas until changes are persistent */
+	percpu_down_write(&c->mark_lock);
+	if (new_r.entries)
+		ret = replicas_table_update(c, &new_r);
+	if (new_gc.entries)
+		swap(new_gc, c->replicas_gc);
+	percpu_up_write(&c->mark_lock);
+
+	/*
+	 * ret is deliberately not reset here: if replicas_table_update()
+	 * failed, the entry is not in c->replicas and the caller must see
+	 * the error.
+	 */
+out:
+	mutex_unlock(&c->sb_lock);
+
+	/* after a successful swap these hold the old tables */
+	kfree(new_r.entries);
+	kfree(new_gc.entries);
+
+	return ret;
+}
+
+/*
+ * Ensure replicas entry @r is recorded, in both c->replicas and (if a GC is
+ * in progress) c->replicas_gc. The common case, where it already is, only
+ * takes c->mark_lock for read; otherwise fall back to the slow path.
+ *
+ * Returns 0 on success or a negative error code from the slow path.
+ */
+int bch2_mark_replicas(struct bch_fs *c,
+		       struct bch_replicas_entry *r)
+{
+	if (likely(bch2_replicas_marked(c, r, true)))
+		return 0;
+
+	return bch2_mark_replicas_slowpath(c, r);
+}
+
+/*
+ * Check that every replicas entry implied by key @k is already marked:
+ * one BCH_DATA_CACHED entry per device holding a cached pointer, plus the
+ * entry built by bch2_bkey_to_replicas() for the remaining pointers.
+ *
+ * Takes no locks itself; bch2_bkey_replicas_marked() wraps it in
+ * c->mark_lock held for read.
+ */
+bool bch2_bkey_replicas_marked_locked(struct bch_fs *c,
+				      struct bkey_s_c k,
+				      bool check_gc_replicas)
+{
+	struct bch_replicas_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
+
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
+
+		if (!bch2_replicas_marked_locked(c, &search.e,
+						 check_gc_replicas))
+			return false;
+	}
+
+	bch2_bkey_to_replicas(&search.e, k);
+
+	return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas);
+}
+
+/*
+ * Locked wrapper around bch2_bkey_replicas_marked_locked(): performs the
+ * check with c->mark_lock held for read.
+ */
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+			       struct bkey_s_c k,
+			       bool check_gc_replicas)
+{
+	bool marked;
+
+	percpu_down_read(&c->mark_lock);
+	marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas);
+	percpu_up_read(&c->mark_lock);
+
+	return marked;
+}
+
+/*
+ * Mark every replicas entry implied by key @k: first one BCH_DATA_CACHED
+ * entry per device holding a cached pointer, then the entry for the
+ * remaining pointers. Stops at and returns the first error from
+ * bch2_mark_replicas(); returns 0 if everything was marked.
+ */
+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bch_replicas_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
+
+		ret = bch2_mark_replicas(c, &search.e);
+		if (ret)
+			return ret;
+	}
+
+	bch2_bkey_to_replicas(&search.e, k);
+
+	return bch2_mark_replicas(c, &search.e);
+}
+
+/*
+ * Finish a replicas GC pass started with bch2_replicas_gc_start().
+ *
+ * @ret is the result of the caller's pass. If it is non-zero the pass failed
+ * and the GC table may be incomplete, so it is discarded without touching
+ * c->replicas or the superblock, and @ret is returned unchanged.
+ *
+ * Otherwise any entry of c->replicas that still has non-zero usage is added
+ * to the GC table, and the GC table then replaces c->replicas both in the
+ * superblock and in memory.
+ *
+ * In every case c->replicas_gc is torn down (entries freed and set to NULL)
+ * before returning. Caller must hold c->replicas_gc_lock.
+ *
+ * Returns 0 on success, @ret if it was non-zero on entry, -ENOSPC if the GC
+ * table could not be extended or did not fit in the superblock, or the error
+ * from replicas_table_update().
+ */
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+{
+	unsigned i;
+
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	mutex_lock(&c->sb_lock);
+	percpu_down_write(&c->mark_lock);
+
+	/* the pass failed: don't commit a partial table, just clean up */
+	if (ret)
+		goto err;
+
+	/*
+	 * this is kind of crappy; the replicas gc mechanism needs to be ripped
+	 * out
+	 */
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct bch_replicas_cpu n;
+
+		/* never drop an entry that still accounts for live data */
+		if (!__replicas_has_entry(&c->replicas_gc, e) &&
+		    (c->usage_base->replicas[i] ||
+		     percpu_u64_get(&c->usage[0]->replicas[i]) ||
+		     percpu_u64_get(&c->usage[1]->replicas[i]))) {
+			n = cpu_replicas_add_entry(&c->replicas_gc, e);
+			if (!n.entries) {
+				/*
+				 * NOTE(review): this is an allocation failure;
+				 * -ENOMEM would describe it better, but the
+				 * existing code is kept for callers.
+				 */
+				ret = -ENOSPC;
+				goto err;
+			}
+
+			swap(n, c->replicas_gc);
+			kfree(n.entries);
+		}
+	}
+
+	if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
+		ret = -ENOSPC;
+		goto err;
+	}
+
+	/* on success this swaps the old c->replicas into c->replicas_gc */
+	ret = replicas_table_update(c, &c->replicas_gc);
+err:
+	kfree(c->replicas_gc.entries);
+	c->replicas_gc.entries = NULL;
+
+	percpu_up_write(&c->mark_lock);
+
+	if (!ret)
+		bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/*
+ * Begin a replicas GC pass.
+ *
+ * Builds c->replicas_gc as a sorted copy of those entries of c->replicas
+ * whose data type bit is NOT set in @typemask; entries of the masked types
+ * are left out and only reappear if they are marked again while the GC table
+ * exists (see bch2_mark_replicas_slowpath()) or re-added by
+ * bch2_replicas_gc_end().
+ *
+ * Caller must hold c->replicas_gc_lock; BUGs if a GC table already exists.
+ * Returns 0 on success or -ENOMEM.
+ */
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+	struct bch_replicas_entry *e;
+	unsigned i = 0;
+
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	mutex_lock(&c->sb_lock);
+	BUG_ON(c->replicas_gc.entries);
+
+	c->replicas_gc.nr		= 0;
+	c->replicas_gc.entry_size	= 0;
+
+	/* first pass: count surviving entries and find the largest one */
+	for_each_cpu_replicas_entry(&c->replicas, e)
+		if (!((1 << e->data_type) & typemask)) {
+			c->replicas_gc.nr++;
+			c->replicas_gc.entry_size =
+				max_t(unsigned, c->replicas_gc.entry_size,
+				      replicas_entry_bytes(e));
+		}
+
+	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
+					 c->replicas_gc.entry_size,
+					 GFP_NOIO);
+	if (!c->replicas_gc.entries) {
+		mutex_unlock(&c->sb_lock);
+		return -ENOMEM;
+	}
+
+	/* second pass: copy them, using the (possibly smaller) gc stride */
+	for_each_cpu_replicas_entry(&c->replicas, e)
+		if (!((1 << e->data_type) & typemask))
+			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
+			       e, c->replicas_gc.entry_size);
+
+	bch2_cpu_replicas_sort(&c->replicas_gc);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+/*
+ * Usage-based replicas GC: rebuild c->replicas keeping only entries that are
+ * BCH_DATA_JOURNAL or still have a non-zero usage counter, then store the
+ * result in the superblock and switch the in-memory table over.
+ *
+ * The replacement table is allocated before the locks are taken, sized from
+ * an unlocked snapshot of c->replicas; if the table changed size in the
+ * meantime the allocation is thrown away and retried.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -ENOSPC if the new
+ * table did not fit in the superblock, or the error from
+ * replicas_table_update().
+ */
+int bch2_replicas_gc2(struct bch_fs *c)
+{
+	struct bch_replicas_cpu new = { 0 };
+	unsigned i, nr;
+	int ret = 0;
+
+	bch2_journal_meta(&c->journal);
+retry:
+	nr		= READ_ONCE(c->replicas.nr);
+	new.entry_size	= READ_ONCE(c->replicas.entry_size);
+	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
+	if (!new.entries)
+		return -ENOMEM;
+
+	mutex_lock(&c->sb_lock);
+	percpu_down_write(&c->mark_lock);
+
+	/* table was resized while we were allocating: start over */
+	if (nr			!= c->replicas.nr ||
+	    new.entry_size	!= c->replicas.entry_size) {
+		percpu_up_write(&c->mark_lock);
+		mutex_unlock(&c->sb_lock);
+		kfree(new.entries);
+		goto retry;
+	}
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		if (e->data_type == BCH_DATA_JOURNAL ||
+		    c->usage_base->replicas[i] ||
+		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
+		    percpu_u64_get(&c->usage[1]->replicas[i]))
+			memcpy(cpu_replicas_entry(&new, new.nr++),
+			       e, new.entry_size);
+	}
+
+	bch2_cpu_replicas_sort(&new);
+
+	if (bch2_cpu_replicas_to_sb_replicas(c, &new)) {
+		ret = -ENOSPC;
+		goto err;
+	}
+
+	ret = replicas_table_update(c, &new);
+err:
+	/* after a successful update this is the old table */
+	kfree(new.entries);
+
+	percpu_up_write(&c->mark_lock);
+
+	if (!ret)
+		bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/*
+ * Set the base usage counter for replicas entry @r to @sectors, adding the
+ * entry to c->replicas first if it is not there yet.
+ *
+ * @r is modified: bch2_replicas_entry_idx() sorts its device list in place.
+ * This function takes no locks itself.
+ *
+ * Returns 0 on success, -ENOMEM if the enlarged table could not be
+ * allocated, or the error from replicas_table_update().
+ */
+int bch2_replicas_set_usage(struct bch_fs *c,
+			    struct bch_replicas_entry *r,
+			    u64 sectors)
+{
+	int ret, idx = bch2_replicas_entry_idx(c, r);
+
+	if (idx < 0) {
+		struct bch_replicas_cpu n;
+
+		n = cpu_replicas_add_entry(&c->replicas, r);
+		if (!n.entries)
+			return -ENOMEM;
+
+		ret = replicas_table_update(c, &n);
+
+		/*
+		 * On success n now holds the old table (swapped out), on
+		 * failure it still holds the unused new one; either way we
+		 * own it and must free it.
+		 */
+		kfree(n.entries);
+
+		if (ret)
+			return ret;
+
+		/* the entry was just added, so the lookup must succeed */
+		idx = bch2_replicas_entry_idx(c, r);
+		BUG_ON(idx < 0);
+	}
+
+	c->usage_base->replicas[idx] = sectors;
+
+	return 0;
+}
+
+/* Replicas tracking - superblock: */
+
+/*
+ * Convert a superblock replicas field (variable-size, packed entries) into
+ * an in-memory table of fixed-size entries.
+ *
+ * The stride is the size of the largest entry; the array is zero-allocated
+ * so shorter entries are zero-padded. Each entry's device list is sorted,
+ * but the table itself is NOT sorted here -- callers do that.
+ *
+ * On success @cpu_r->entries is a new allocation owned by the caller.
+ * Returns 0 or -ENOMEM.
+ */
+static int
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
+				   struct bch_replicas_cpu *cpu_r)
+{
+	struct bch_replicas_entry *e, *dst;
+	unsigned nr = 0, entry_size = 0, idx = 0;
+
+	/* first pass: count entries and find the largest */
+	for_each_replicas_entry(sb_r, e) {
+		entry_size = max_t(unsigned, entry_size,
+				   replicas_entry_bytes(e));
+		nr++;
+	}
+
+	cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+	if (!cpu_r->entries)
+		return -ENOMEM;
+
+	cpu_r->nr		= nr;
+	cpu_r->entry_size	= entry_size;
+
+	for_each_replicas_entry(sb_r, e) {
+		dst = cpu_replicas_entry(cpu_r, idx++);
+		memcpy(dst, e, replicas_entry_bytes(e));
+		replicas_entry_sort(dst);
+	}
+
+	return 0;
+}
+
+/*
+ * Convert a v0 superblock replicas field into an in-memory table.
+ *
+ * v0 entries carry no nr_required, so every converted entry gets
+ * nr_required = 1, and the stride is the largest v0 entry size plus the size
+ * difference between the current and the v0 entry struct.
+ *
+ * Each entry's device list is sorted, but the table itself is NOT sorted
+ * here -- callers do that. On success @cpu_r->entries is a new allocation
+ * owned by the caller. Returns 0 or -ENOMEM.
+ */
+static int
+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
+				      struct bch_replicas_cpu *cpu_r)
+{
+	struct bch_replicas_entry_v0 *e;
+	unsigned nr = 0, entry_size = 0, idx = 0;
+
+	/* first pass: count entries and find the largest */
+	for_each_replicas_entry_v0(sb_r, e) {
+		entry_size = max_t(unsigned, entry_size,
+				   replicas_entry_bytes(e));
+		nr++;
+	}
+
+	/* in-memory entries use the current layout, which is larger */
+	entry_size += sizeof(struct bch_replicas_entry) -
+		sizeof(struct bch_replicas_entry_v0);
+
+	cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+	if (!cpu_r->entries)
+		return -ENOMEM;
+
+	cpu_r->nr		= nr;
+	cpu_r->entry_size	= entry_size;
+
+	for_each_replicas_entry_v0(sb_r, e) {
+		struct bch_replicas_entry *dst =
+			cpu_replicas_entry(cpu_r, idx++);
+
+		dst->data_type	= e->data_type;
+		dst->nr_devs	= e->nr_devs;
+		dst->nr_required = 1;
+		memcpy(dst->devs, e->devs, e->nr_devs);
+		replicas_entry_sort(dst);
+	}
+
+	return 0;
+}
+
+/*
+ * Load the replicas table from the superblock into c->replicas.
+ *
+ * The current-format field is used if present, otherwise the v0 field; if
+ * neither exists the table ends up empty. The usage accounting structures
+ * are resized to match via replicas_table_update().
+ *
+ * Returns 0 on success, -ENOMEM if the table could not be allocated, or the
+ * error from replicas_table_update() -- in which case c->replicas is left
+ * unchanged.
+ */
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+	struct bch_sb_field_replicas *sb_v1;
+	struct bch_sb_field_replicas_v0 *sb_v0;
+	struct bch_replicas_cpu new_r = { 0, 0, NULL };
+	int ret = 0;
+
+	if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb)))
+		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
+	else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
+		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
+
+	if (ret)
+		return -ENOMEM;
+
+	bch2_cpu_replicas_sort(&new_r);
+
+	percpu_down_write(&c->mark_lock);
+
+	ret = replicas_table_update(c, &new_r);
+	percpu_up_write(&c->mark_lock);
+
+	/* old table on success, the unused new one on failure */
+	kfree(new_r.entries);
+
+	/* propagate a failed update instead of claiming success */
+	return ret;
+}
+
+/*
+ * Store in-memory replicas table @r in the superblock using the v0 format,
+ * and delete any current-format replicas field.
+ *
+ * Only data_type, nr_devs and devs[] are written; nr_required is dropped.
+ * bch2_cpu_replicas_to_sb_replicas() only takes this path when every entry
+ * has nr_required == 1.
+ *
+ * Returns 0 on success or -ENOSPC if the superblock field could not be
+ * resized.
+ */
+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
+					       struct bch_replicas_cpu *r)
+{
+	struct bch_sb_field_replicas_v0 *sb_r;
+	struct bch_replicas_entry_v0 *dst;
+	struct bch_replicas_entry *src;
+	size_t bytes;
+
+	/* size the field from the type that is actually being written */
+	bytes = sizeof(struct bch_sb_field_replicas_v0);
+
+	/* a v0 entry is one byte shorter: it has no nr_required */
+	for_each_cpu_replicas_entry(r, src)
+		bytes += replicas_entry_bytes(src) - 1;
+
+	sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
+			DIV_ROUND_UP(bytes, sizeof(u64)));
+	if (!sb_r)
+		return -ENOSPC;
+
+	/* deleting the other field can move ours, so look it up again */
+	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
+	sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);
+
+	/* zero everything first; readers stop at the first zero data_type */
+	memset(&sb_r->entries, 0,
+	       vstruct_end(&sb_r->field) -
+	       (void *) &sb_r->entries);
+
+	dst = sb_r->entries;
+	for_each_cpu_replicas_entry(r, src) {
+		dst->data_type	= src->data_type;
+		dst->nr_devs	= src->nr_devs;
+		memcpy(dst->devs, src->devs, src->nr_devs);
+
+		dst = replicas_entry_next(dst);
+
+		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+	}
+
+	return 0;
+}
+
+/*
+ * Store in-memory replicas table @r in the superblock.
+ *
+ * If every entry has nr_required == 1 the v0 format is used (see
+ * bch2_cpu_replicas_to_sb_replicas_v0()); otherwise the current format is
+ * written and any v0 field is deleted.
+ *
+ * This only updates the superblock buffer; the callers in this file call
+ * bch2_write_super() themselves.
+ *
+ * Returns 0 on success or -ENOSPC if the superblock field could not be
+ * resized.
+ */
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+					    struct bch_replicas_cpu *r)
+{
+	struct bch_sb_field_replicas *sb_r;
+	struct bch_replicas_entry *dst, *src;
+	bool need_v1 = false;
+	size_t bytes;
+
+	bytes = sizeof(struct bch_sb_field_replicas);
+
+	for_each_cpu_replicas_entry(r, src) {
+		bytes += replicas_entry_bytes(src);
+		if (src->nr_required != 1)
+			need_v1 = true;
+	}
+
+	if (!need_v1)
+		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
+
+	sb_r = bch2_sb_resize_replicas(&c->disk_sb,
+			DIV_ROUND_UP(bytes, sizeof(u64)));
+	if (!sb_r)
+		return -ENOSPC;
+
+	/* deleting the other field can move ours, so look it up again */
+	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
+	sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
+
+	/* zero everything first; readers stop at the first zero data_type */
+	memset(&sb_r->entries, 0,
+	       vstruct_end(&sb_r->field) -
+	       (void *) &sb_r->entries);
+
+	dst = sb_r->entries;
+	for_each_cpu_replicas_entry(r, src) {
+		memcpy(dst, src, replicas_entry_bytes(src));
+
+		dst = replicas_entry_next(dst);
+
+		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+	}
+
+	return 0;
+}
+
+/*
+ * Sort table @cpu_r (in place, by memcmp over whole entries) and check that
+ * no two adjacent entries are identical.
+ *
+ * Returns NULL if all entries are distinct, or a static error string
+ * otherwise.
+ */
+static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
+{
+	unsigned i;
+
+	sort_cmp_size(cpu_r->entries,
+		      cpu_r->nr,
+		      cpu_r->entry_size,
+		      memcmp, NULL);
+
+	for (i = 0; i + 1 < cpu_r->nr; i++) {
+		struct bch_replicas_entry *l =
+			cpu_replicas_entry(cpu_r, i);
+		struct bch_replicas_entry *r =
+			cpu_replicas_entry(cpu_r, i + 1);
+
+		/* sanity check that the sort above really ordered the array */
+		BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+
+		if (!memcmp(l, r, cpu_r->entry_size))
+			return "duplicate replicas entry";
+	}
+
+	return NULL;
+}
+
+/*
+ * Superblock validation hook for the current-format replicas field.
+ *
+ * Checks every entry for a valid data type, at least one device, a sane
+ * nr_required and device indices that exist in the members field, then
+ * converts the field to an in-memory table to check for duplicates.
+ *
+ * Returns NULL if the field is valid, otherwise a static string describing
+ * the first problem found. `err` is set before each check so that a bare
+ * `goto err` reports the right message.
+ */
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+	struct bch_replicas_cpu cpu_r = { .entries = NULL };
+	struct bch_replicas_entry *e;
+	const char *err;
+	unsigned i;
+
+	for_each_replicas_entry(sb_r, e) {
+		err = "invalid replicas entry: invalid data type";
+		if (e->data_type >= BCH_DATA_NR)
+			goto err;
+
+		err = "invalid replicas entry: no devices";
+		if (!e->nr_devs)
+			goto err;
+
+		err = "invalid replicas entry: bad nr_required";
+		if (e->nr_required > 1 &&
+		    e->nr_required >= e->nr_devs)
+			goto err;
+
+		err = "invalid replicas entry: invalid device";
+		for (i = 0; i < e->nr_devs; i++)
+			if (!bch2_dev_exists(sb, mi, e->devs[i]))
+				goto err;
+	}
+
+	err = "cannot allocate memory";
+	if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
+		goto err;
+
+	err = check_dup_replicas_entries(&cpu_r);
+err:
+	/* cpu_r.entries is still NULL if we bailed out before the conversion */
+	kfree(cpu_r.entries);
+	return err;
+}
+
+/*
+ * Superblock to-text hook for the replicas field: print every entry,
+ * separated by single spaces. @sb is unused.
+ */
+static void bch2_sb_replicas_to_text(struct printbuf *out,
+				     struct bch_sb *sb,
+				     struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
+	struct bch_replicas_entry *e;
+
+	for_each_replicas_entry(r, e) {
+		/* the iterator starts at r->entries: no separator before it */
+		if (e != r->entries)
+			pr_buf(out, " ");
+
+		bch2_replicas_entry_to_text(out, e);
+	}
+}
+
+/* Superblock field operations for the current-format replicas field. */
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+	.validate	= bch2_sb_validate_replicas,
+	.to_text	= bch2_sb_replicas_to_text,
+};
+
+/*
+ * Superblock validation hook for the v0 replicas field.
+ *
+ * Same checks as bch2_sb_validate_replicas(), minus the nr_required check
+ * (v0 entries have no such field). Returns NULL if the field is valid,
+ * otherwise a static string describing the first problem found.
+ */
+static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+	struct bch_replicas_cpu cpu_r = { .entries = NULL };
+	struct bch_replicas_entry_v0 *e;
+	const char *err;
+	unsigned i;
+
+	for_each_replicas_entry_v0(sb_r, e) {
+		err = "invalid replicas entry: invalid data type";
+		if (e->data_type >= BCH_DATA_NR)
+			goto err;
+
+		err = "invalid replicas entry: no devices";
+		if (!e->nr_devs)
+			goto err;
+
+		err = "invalid replicas entry: invalid device";
+		for (i = 0; i < e->nr_devs; i++)
+			if (!bch2_dev_exists(sb, mi, e->devs[i]))
+				goto err;
+	}
+
+	err = "cannot allocate memory";
+	if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
+		goto err;
+
+	err = check_dup_replicas_entries(&cpu_r);
+err:
+	/* cpu_r.entries is still NULL if we bailed out before the conversion */
+	kfree(cpu_r.entries);
+	return err;
+}
+
+/* Superblock field operations for the v0 replicas field (no to_text hook). */
+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
+	.validate	= bch2_sb_validate_replicas_v0,
+};
+
+/* Query replicas: */
+
+/*
+ * Summarise, per data type, how well the replicas entries in c->replicas are
+ * covered by the devices in @online_devs.
+ *
+ * For each data type the result holds:
+ *  - redundancy: the minimum over its entries of
+ *    (online devices - nr_required); 0 if the type has no entries,
+ *  - nr_offline: the maximum number of offline devices in any one entry.
+ *
+ * c->replicas is walked with c->mark_lock held for read.
+ */
+struct replicas_status __bch2_replicas_status(struct bch_fs *c,
+					      struct bch_devs_mask online_devs)
+{
+	struct bch_sb_field_members *mi;
+	struct bch_replicas_entry *e;
+	unsigned i, nr_online, nr_offline;
+	struct replicas_status ret;
+
+	memset(&ret, 0, sizeof(ret));
+
+	/* INT_MAX is a "no entries seen yet" sentinel, reset to 0 below */
+	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
+		ret.replicas[i].redundancy = INT_MAX;
+
+	mi = bch2_sb_get_members(c->disk_sb.sb);
+
+	percpu_down_read(&c->mark_lock);
+
+	for_each_cpu_replicas_entry(&c->replicas, e) {
+		if (e->data_type >= ARRAY_SIZE(ret.replicas))
+			panic("e %p data_type %u\n", e, e->data_type);
+
+		nr_online = nr_offline = 0;
+
+		for (i = 0; i < e->nr_devs; i++) {
+			BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
+						e->devs[i]));
+
+			if (test_bit(e->devs[i], online_devs.d))
+				nr_online++;
+			else
+				nr_offline++;
+		}
+
+		ret.replicas[e->data_type].redundancy =
+			min(ret.replicas[e->data_type].redundancy,
+			    (int) nr_online - (int) e->nr_required);
+
+		ret.replicas[e->data_type].nr_offline =
+			max(ret.replicas[e->data_type].nr_offline,
+			    nr_offline);
+	}
+
+	percpu_up_read(&c->mark_lock);
+
+	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
+		if (ret.replicas[i].redundancy == INT_MAX)
+			ret.replicas[i].redundancy = 0;
+
+	return ret;
+}
+
+/*
+ * Replicas status computed against the device mask returned by
+ * bch2_online_devs().
+ */
+struct replicas_status bch2_replicas_status(struct bch_fs *c)
+{
+	return __bch2_replicas_status(c, bch2_online_devs(c));
+}
+
+/*
+ * Decide whether data of @type is sufficiently available according to @s:
+ *  - some devices being offline is acceptable only with @force_if_degraded,
+ *  - negative redundancy (fewer online devices than required for some
+ *    entry) is acceptable only with @force_if_lost.
+ */
+static bool have_enough_devs(struct replicas_status s,
+			     enum bch_data_type type,
+			     bool force_if_degraded,
+			     bool force_if_lost)
+{
+	return (!s.replicas[type].nr_offline || force_if_degraded) &&
+		(s.replicas[type].redundancy >= 0 || force_if_lost);
+}
+
+/*
+ * True if journal, btree and user data are all sufficiently available per
+ * @s. Journal and btree are governed by the BCH_FORCE_IF_METADATA_* bits of
+ * @flags, user data by the BCH_FORCE_IF_DATA_* bits.
+ */
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+	return (have_enough_devs(s, BCH_DATA_JOURNAL,
+				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
+				 flags & BCH_FORCE_IF_METADATA_LOST) &&
+		have_enough_devs(s, BCH_DATA_BTREE,
+				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
+				 flags & BCH_FORCE_IF_METADATA_LOST) &&
+		have_enough_devs(s, BCH_DATA_USER,
+				 flags & BCH_FORCE_IF_DATA_DEGRADED,
+				 flags & BCH_FORCE_IF_DATA_LOST));
+}
+
+/*
+ * Return the current redundancy plus one: for metadata (@meta true) based on
+ * the smaller of the journal and btree redundancy, otherwise on the user
+ * data redundancy.
+ */
+int bch2_replicas_online(struct bch_fs *c, bool meta)
+{
+	struct replicas_status s = bch2_replicas_status(c);
+	int redundancy;
+
+	if (meta)
+		redundancy = min(s.replicas[BCH_DATA_JOURNAL].redundancy,
+				 s.replicas[BCH_DATA_BTREE].redundancy);
+	else
+		redundancy = s.replicas[BCH_DATA_USER].redundancy;
+
+	return redundancy + 1;
+}
+
+/*
+ * Return a bitmask of the data types for which some replicas entry lists
+ * device @ca: bit (1 << data_type) is set for each such type.
+ * c->replicas is walked with c->mark_lock held for read.
+ */
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_replicas_entry *e;
+	unsigned i, ret = 0;
+
+	percpu_down_read(&c->mark_lock);
+
+	for_each_cpu_replicas_entry(&c->replicas, e)
+		for (i = 0; i < e->nr_devs; i++)
+			if (e->devs[i] == ca->dev_idx)
+				ret |= 1 << e->data_type;
+
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
+
+/*
+ * Filesystem init hook for replicas tracking: account the journal space
+ * needed for usage entries in journal.entry_u64s_reserved and allocate the
+ * usage structures for the current c->replicas table.
+ *
+ * replicas_table_update() is handed &c->replicas itself, so its final
+ * table swap is a self-swap; only the usage buffers change.
+ *
+ * Returns 0 or -ENOMEM.
+ */
+int bch2_fs_replicas_init(struct bch_fs *c)
+{
+	c->journal.entry_u64s_reserved +=
+		reserve_journal_replicas(c, &c->replicas);
+
+	return replicas_table_update(c, &c->replicas);
+}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
new file mode 100644
index 000000000000..0d6e19126021
--- /dev/null
+++ b/fs/bcachefs/replicas.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_H
+#define _BCACHEFS_REPLICAS_H
+
+#include "eytzinger.h"
+#include "replicas_types.h"
+
+void bch2_replicas_entry_to_text(struct printbuf *,
+				 struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+/*
+ * Return a pointer to entry @i of in-memory table @r. Entries are stored
+ * back to back with a fixed stride of r->entry_size bytes; no bounds check
+ * is done.
+ */
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+	return (void *) r->entries + r->entry_size * i;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+			    struct bch_replicas_entry *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+			      enum bch_data_type,
+			      struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *,
+			  struct bch_replicas_entry *, bool);
+int bch2_mark_replicas(struct bch_fs *,
+		       struct bch_replicas_entry *);
+
+bool bch2_bkey_replicas_marked_locked(struct bch_fs *,
+				      struct bkey_s_c, bool);
+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+bool bch2_bkey_replicas_marked(struct bch_fs *,
+			       struct bkey_s_c, bool);
+int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
+
+/*
+ * Initialise @e as the BCH_DATA_CACHED replicas entry for a single device
+ * @dev (one device, nr_required of 1).
+ */
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+					      unsigned dev)
+{
+	e->data_type	= BCH_DATA_CACHED;
+	e->nr_devs	= 1;
+	e->nr_required	= 1;
+	e->devs[0]	= dev;
+}
+
+/*
+ * Per-data-type availability summary, as computed by
+ * __bch2_replicas_status().
+ */
+struct replicas_status {
+	struct {
+		/* min over entries of (online devs - nr_required) */
+		int		redundancy;
+		/* max number of offline devs in any single entry */
+		unsigned	nr_offline;
+	}			replicas[BCH_DATA_NR];
+};
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *,
+					      struct bch_devs_mask);
+struct replicas_status bch2_replicas_status(struct bch_fs *);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
+
+int bch2_replicas_online(struct bch_fs *, bool);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+int bch2_replicas_gc2(struct bch_fs *);
+
+int bch2_replicas_set_usage(struct bch_fs *,
+			    struct bch_replicas_entry *,
+			    u64);
+
+/*
+ * Iterate @_i over every entry of in-memory replicas table @_r, stepping by
+ * the table's fixed entry_size stride.
+ */
+#define for_each_cpu_replicas_entry(_r, _i)				\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+	     _i = (void *) (_i) + (_r)->entry_size)
+
+/* iterate over superblock replicas - used by userspace tools: */
+
+/* Size in bytes of variable-length entry @_i: header plus nr_devs devices. */
+#define replicas_entry_bytes(_i)					\
+	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
+/* Pointer to the entry packed immediately after @_i. */
+#define replicas_entry_next(_i)						\
+	((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
+
+/*
+ * Iterate @_i over the packed entries of superblock replicas field @_r,
+ * stopping at the end of the field or at the first entry whose data_type is
+ * zero (the unused tail of the field is zeroed by the writers).
+ */
+#define for_each_replicas_entry(_r, _i)					\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+	     (_i) = replicas_entry_next(_i))
+
+/*
+ * Same iteration for a v0 replicas field. The expansion is identical to
+ * for_each_replicas_entry(); the separate name documents which format is
+ * being walked.
+ */
+#define for_each_replicas_entry_v0(_r, _i)				\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+	     (_i) = replicas_entry_next(_i))
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
+
+int bch2_fs_replicas_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
new file mode 100644
index 000000000000..0535b1d3760e
--- /dev/null
+++ b/fs/bcachefs/replicas_types.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_TYPES_H
+#define _BCACHEFS_REPLICAS_TYPES_H
+
+/*
+ * In-memory replicas table: an array of @nr entries stored back to back,
+ * each occupying @entry_size bytes (see cpu_replicas_entry()).
+ */
+struct bch_replicas_cpu {
+	/* number of entries in the table */
+	unsigned		nr;
+	/* stride of one entry in bytes */
+	unsigned		entry_size;
+	/* the entry array itself; NULL when no table is allocated */
+	struct bch_replicas_entry *entries;
+};
+
+#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
new file mode 100644
index 000000000000..c062edb3fbc2
--- /dev/null
+++ b/fs/bcachefs/siphash.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*	$OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound.  Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+/*
+ * Run @rounds iterations of the SipRound permutation over the four state
+ * words of @ctx.
+ */
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+	u64 v0 = ctx->v[0];
+	u64 v1 = ctx->v[1];
+	u64 v2 = ctx->v[2];
+	u64 v3 = ctx->v[3];
+
+	while (rounds--) {
+		/* first half round: the (v0, v1) and (v2, v3) pairs */
+		v0 += v1;
+		v1 = rol64(v1, 13) ^ v0;
+		v0 = rol64(v0, 32);
+
+		v2 += v3;
+		v3 = rol64(v3, 16) ^ v2;
+
+		/* second half round: the (v2, v1) and (v0, v3) pairs */
+		v2 += v1;
+		v1 = rol64(v1, 17) ^ v2;
+		v2 = rol64(v2, 32);
+
+		v0 += v3;
+		v3 = rol64(v3, 21) ^ v0;
+	}
+
+	ctx->v[0] = v0;
+	ctx->v[1] = v1;
+	ctx->v[2] = v2;
+	ctx->v[3] = v3;
+}
+
+/*
+ * Compress one 8-byte message block into the state: the block at @ptr is
+ * read as an unaligned little-endian word, xored into v[3], mixed with
+ * @rounds SipRounds and then xored into v[0].
+ */
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+	const u64 word = get_unaligned_le64(ptr);
+
+	ctx->v[3] ^= word;
+	SipHash_Rounds(ctx, rounds);
+	ctx->v[0] ^= word;
+}
+
+/*
+ * Start a new hash computation: clear the block buffer and byte count and
+ * seed the four state words from the two halves of @key xored with the
+ * SipHash initialisation constants.
+ */
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+	const u64 key_lo = le64_to_cpu(key->k0);
+	const u64 key_hi = le64_to_cpu(key->k1);
+
+	ctx->bytes = 0;
+	memset(ctx->buf, 0, sizeof(ctx->buf));
+
+	ctx->v[0] = key_lo ^ 0x736f6d6570736575ULL;
+	ctx->v[1] = key_hi ^ 0x646f72616e646f6dULL;
+	ctx->v[2] = key_lo ^ 0x6c7967656e657261ULL;
+	ctx->v[3] = key_hi ^ 0x7465646279746573ULL;
+}
+
+/*
+ * Absorb @len bytes from @src into the hash state.
+ *
+ * Input is consumed in 8-byte blocks, each compressed with @rc rounds.  A
+ * trailing partial block is kept in ctx->buf until a later call completes it
+ * or SipHash_End() pads it.  @rf is not used here.
+ */
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+		    const void *src, size_t len)
+{
+	const u8 *ptr = src;
+	size_t left, used;
+
+	if (len == 0)
+		return;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	ctx->bytes += len;
+
+	if (used > 0) {
+		left = sizeof(ctx->buf) - used;
+
+		if (len < left) {
+			/* still short of a full block: just buffer it */
+			memcpy(&ctx->buf[used], ptr, len);
+			return;
+		}
+
+		/* complete the buffered block and compress it */
+		memcpy(&ctx->buf[used], ptr, left);
+		SipHash_CRounds(ctx, ctx->buf, rc);
+		len -= left;
+		ptr += left;
+	}
+
+	/* whole blocks are compressed straight from the caller's buffer */
+	while (len >= sizeof(ctx->buf)) {
+		SipHash_CRounds(ctx, ptr, rc);
+		len -= sizeof(ctx->buf);
+		ptr += sizeof(ctx->buf);
+	}
+
+	/*
+	 * Anything that was buffered on entry has been consumed by now, so the
+	 * leftover bytes start a fresh block at buf[0].  Indexing with the
+	 * stale 'used' here would misplace the tail and could write past the
+	 * end of buf.
+	 */
+	if (len > 0)
+		memcpy(ctx->buf, ptr, len);
+}
+
+/*
+ * Finish the hash (see SipHash_End()) and store the 64-bit result at @dst in
+ * little-endian byte order.  @dst is a plain byte buffer and need not be
+ * 8-byte aligned, hence the unaligned store helper.
+ */
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+	put_unaligned_le64(SipHash_End(ctx, rc, rf), dst);
+}
+
+/*
+ * Finish the hash and return the 64-bit result.
+ *
+ * The buffered partial block is zero padded, its last byte is set to the
+ * (truncated) total byte count, and the block is compressed with @rc rounds;
+ * then v[2] is xored with 0xff and @rf finalization rounds are run.  The
+ * context is wiped before returning.
+ */
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+	const size_t tail = ctx->bytes % sizeof(ctx->buf);
+	u64 digest;
+
+	/* zero everything after the buffered bytes, except the length byte */
+	memset(&ctx->buf[tail], 0, sizeof(ctx->buf) - tail - 1);
+	ctx->buf[7] = ctx->bytes;
+
+	SipHash_CRounds(ctx, ctx->buf, rc);
+
+	ctx->v[2] ^= 0xff;
+	SipHash_Rounds(ctx, rf);
+
+	digest = ctx->v[0] ^ ctx->v[1] ^ ctx->v[2] ^ ctx->v[3];
+
+	memset(ctx, 0, sizeof(*ctx));
+	return digest;
+}
+
+/*
+ * One-shot helper: hash the @len bytes at @src under @key with @rc
+ * compression and @rf finalization rounds, using a context on the stack.
+ */
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+	SIPHASH_CTX state;
+	u64 digest;
+
+	SipHash_Init(&state, key);
+	SipHash_Update(&state, rc, rf, src, len);
+	digest = SipHash_End(&state, rc, rf);
+
+	return digest;
+}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
new file mode 100644
index 000000000000..3dfaf34a43b2
--- /dev/null
+++ b/fs/bcachefs/siphash.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages returning a 64bit hash/digest value.
+ *
+ * The number of rounds is selected by the macro family that is used:
+ *  SipHash24_*() for the fast and reasonably strong version
+ *  SipHash48_*() for the strong version (half as fast)
+ *
+ * SIPHASH_CTX ctx;
+ * SIPHASH_KEY key;	(16 byte key: two little-endian 64-bit words)
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH	 8
+#define SIPHASH_KEY_LENGTH	16
+#define SIPHASH_DIGEST_LENGTH	 8
+/* Incremental SipHash state; initialised by SipHash_Init(). */
+typedef struct _SIPHASH_CTX {
+	u64		v[4];				/* the four internal state words */
+	u8		buf[SIPHASH_BLOCK_LENGTH];	/* input bytes not yet forming a full block */
+	u32		bytes;				/* total number of bytes absorbed so far */
+} SIPHASH_CTX;
+/*
+ * 128-bit key as two little-endian 64-bit words; SipHash_Init() converts
+ * them with le64_to_cpu().
+ */
+typedef struct {
+	__le64		k0;
+	__le64		k1;
+} SIPHASH_KEY;
+
+void	SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void	SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64	SipHash_End(SIPHASH_CTX *, int, int);
+void	SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64	SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+/*
+ * SipHash-2-4 wrappers: pass rc = 2 (rounds per compressed block) and
+ * rf = 4 (finalization rounds) to the generic functions.
+ */
+#define SipHash24_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l)	SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d)		SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c)		SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l)		SipHash((_k), 2, 4, (_p), (_l))
+/*
+ * SipHash-4-8 wrappers: pass rc = 4 (rounds per compressed block) and
+ * rf = 8 (finalization rounds) to the generic functions.
+ */
+#define SipHash48_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l)	SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d)		SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c)		SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l)		SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
new file mode 100644
index 000000000000..7be4a8e50eaa
--- /dev/null
+++ b/fs/bcachefs/str_hash.h
@@ -0,0 +1,331 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_STR_HASH_H
+#define _BCACHEFS_STR_HASH_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "checksum.h"
+#include "error.h"
+#include "inode.h"
+#include "siphash.h"
+#include "super.h"
+
+#include <linux/crc32c.h>
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+
+/*
+ * Translate the string hash option @opt into a hash type.  The siphash
+ * option maps to the new or the old siphash type depending on whether
+ * BCH_FEATURE_NEW_SIPHASH is set in the filesystem's feature bits.
+ * Any other option value is a BUG().
+ */
+static inline enum bch_str_hash_type
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
+{
+	if (opt == BCH_STR_HASH_OPT_CRC32C)
+		return BCH_STR_HASH_CRC32C;
+
+	if (opt == BCH_STR_HASH_OPT_CRC64)
+		return BCH_STR_HASH_CRC64;
+
+	if (opt == BCH_STR_HASH_OPT_SIPHASH) {
+		if (c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH))
+			return BCH_STR_HASH_SIPHASH;
+
+		return BCH_STR_HASH_SIPHASH_OLD;
+	}
+
+	BUG();
+}
+/*
+ * Hash parameters, filled in from an inode by bch2_hash_info_init(): the
+ * hash type plus the key material for it.
+ */
+struct bch_hash_info {
+	u8			type;		/* compared against the BCH_STR_HASH_* values */
+	union {
+		__le64		crc_key;	/* seed hashed first by the crc32c/crc64 types */
+		SIPHASH_KEY	siphash_key;	/* key handed to SipHash24_Init() */
+	};
+};
+/*
+ * Build the hash parameters for inode @bi.
+ *
+ * The hash type is the INODE_STR_HASH_BITS wide field at
+ * INODE_STR_HASH_OFFSET in bi_flags, and crc_key is set to the inode's hash
+ * seed.  For BCH_STR_HASH_SIPHASH_OLD the siphash key is instead the leading
+ * bytes of the SHA-256 digest of the hash seed.
+ *
+ * NOTE(review): the return value of crypto_shash_digest() is not checked.
+ */
+static inline struct bch_hash_info
+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
+{
+	/* XXX ick */
+	struct bch_hash_info info = {
+		.type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
+			~(~0U << INODE_STR_HASH_BITS),
+		.crc_key = bi->bi_hash_seed,
+	};
+
+	if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) {
+		SHASH_DESC_ON_STACK(desc, c->sha256);
+		u8 digest[SHA256_DIGEST_SIZE];
+
+		desc->tfm = c->sha256;
+
+		crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
+				    sizeof(bi->bi_hash_seed), digest);
+		memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
+	}
+
+	return info;
+}
+/*
+ * Running state of a string hash; which member is live is decided by the
+ * type in the struct bch_hash_info passed to the bch2_str_hash_*() helpers.
+ */
+struct bch_str_hash_ctx {
+	union {
+		u32		crc32c;
+		u64		crc64;
+		SIPHASH_CTX	siphash;
+	};
+};
+
+/*
+ * Begin a string hash of the type selected by @info: the crc variants start
+ * from ~0 and hash the crc key first, the siphash variants are keyed with
+ * the siphash key.  An unknown type is a BUG().
+ */
+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
+				     const struct bch_hash_info *info)
+{
+	const void *seed = &info->crc_key;
+	const size_t seed_len = sizeof(info->crc_key);
+
+	switch (info->type) {
+	case BCH_STR_HASH_SIPHASH:
+	case BCH_STR_HASH_SIPHASH_OLD:
+		SipHash24_Init(&ctx->siphash, &info->siphash_key);
+		return;
+	case BCH_STR_HASH_CRC64:
+		ctx->crc64 = crc64_be(~0, seed, seed_len);
+		return;
+	case BCH_STR_HASH_CRC32C:
+		ctx->crc32c = crc32c(~0, seed, seed_len);
+		return;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Feed @len bytes at @data into the running hash in @ctx, using the
+ * algorithm selected by @info.  An unknown type is a BUG().
+ */
+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
+				       const struct bch_hash_info *info,
+				       const void *data, size_t len)
+{
+	switch (info->type) {
+	case BCH_STR_HASH_SIPHASH:
+	case BCH_STR_HASH_SIPHASH_OLD:
+		SipHash24_Update(&ctx->siphash, data, len);
+		return;
+	case BCH_STR_HASH_CRC64:
+		ctx->crc64 = crc64_be(ctx->crc64, data, len);
+		return;
+	case BCH_STR_HASH_CRC32C:
+		ctx->crc32c = crc32c(ctx->crc32c, data, len);
+		return;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Finish the hash and return its value.  The crc32c value is returned as
+ * is; the crc64 and siphash results are shifted right by one bit.  An
+ * unknown type is a BUG().
+ */
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+				   const struct bch_hash_info *info)
+{
+	u64 hash;
+
+	switch (info->type) {
+	case BCH_STR_HASH_SIPHASH:
+	case BCH_STR_HASH_SIPHASH_OLD:
+		hash = SipHash24_End(&ctx->siphash) >> 1;
+		break;
+	case BCH_STR_HASH_CRC64:
+		hash = ctx->crc64 >> 1;
+		break;
+	case BCH_STR_HASH_CRC32C:
+		hash = ctx->crc32c;
+		break;
+	default:
+		BUG();
+	}
+
+	return hash;
+}
+/*
+ * Description of one hash table kept in a btree: which btree and key type
+ * hold the entries, and the callbacks used by the bch2_hash_*() helpers.
+ */
+struct bch_hash_desc {
+	enum btree_id	btree_id;
+	u8		key_type;	/* key type of live entries */
+
+	u64		(*hash_key)(const struct bch_hash_info *, const void *);	/* hash of a search key; used as the offset to start probing at */
+	u64		(*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);	/* hash of an existing btree key */
+	bool		(*cmp_key)(struct bkey_s_c, const void *);	/* false when the btree key matches the search key */
+	bool		(*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);	/* false when the two btree keys match */
+};
+/*
+ * Look up @key in the hash table of @inode.
+ *
+ * Probing starts at slot POS(inode, hash of key) and walks forward: keys of
+ * desc.key_type are compared with desc.cmp_key() (a false result is a
+ * match), whiteouts are stepped over, and any other key type - or leaving
+ * @inode - ends the search.
+ *
+ * Returns the iterator positioned on the match, or ERR_PTR() of the
+ * iteration error if there was one, else ERR_PTR(-ENOENT).
+ */
+static __always_inline struct btree_iter *
+bch2_hash_lookup(struct btree_trans *trans,
+		 const struct bch_hash_desc desc,
+		 const struct bch_hash_info *info,
+		 u64 inode, const void *key,
+		 unsigned flags)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	for_each_btree_key(trans, iter, desc.btree_id,
+			   POS(inode, desc.hash_key(info, key)),
+			   BTREE_ITER_SLOTS|flags, k, ret) {
+		if (iter->pos.inode != inode)
+			break;
+
+		if (k.k->type == desc.key_type) {
+			if (!desc.cmp_key(k, key))
+				return iter;
+		} else if (k.k->type == KEY_TYPE_whiteout) {
+			;
+		} else {
+			/* hole, not found */
+			break;
+		}
+	}
+
+	return ERR_PTR(ret ?: -ENOENT);
+}
+/*
+ * Find the first slot at or after the hash position of @key, within @inode,
+ * that does not hold a key of desc.key_type.
+ *
+ * Returns the iterator (opened with BTREE_ITER_INTENT) positioned on that
+ * slot, or ERR_PTR() of the iteration error if there was one, else
+ * ERR_PTR(-ENOSPC).
+ */
+static __always_inline struct btree_iter *
+bch2_hash_hole(struct btree_trans *trans,
+	       const struct bch_hash_desc desc,
+	       const struct bch_hash_info *info,
+	       u64 inode, const void *key)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	for_each_btree_key(trans, iter, desc.btree_id,
+			   POS(inode, desc.hash_key(info, key)),
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		if (iter->pos.inode != inode)
+			break;
+
+		if (k.k->type != desc.key_type)
+			return iter;
+	}
+
+	return ERR_PTR(ret ?: -ENOSPC);
+}
+/*
+ * Decide whether deleting the key at @start has to leave a whiteout behind.
+ *
+ * Works on a copy of @start and scans the slots after it, stopping at the
+ * first slot that is neither a desc.key_type key nor a whiteout.  If a
+ * desc.key_type key is found whose hash is <= start->pos.offset, the copied
+ * iterator is flagged BTREE_ITER_KEEP_UNTIL_COMMIT and 1 is returned.
+ *
+ * Returns 1 if a whiteout is needed, a negative error if copying the
+ * iterator failed, otherwise the value the iteration macro left in ret.
+ */
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+			     const struct bch_hash_desc desc,
+			     const struct bch_hash_info *info,
+			     struct btree_iter *start)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	iter = bch2_trans_copy_iter(trans, start);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	bch2_btree_iter_next_slot(iter);
+
+	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
+		if (k.k->type != desc.key_type &&
+		    k.k->type != KEY_TYPE_whiteout)
+			break;
+
+		if (k.k->type == desc.key_type &&
+		    desc.hash_bkey(info, k) <= start->pos.offset) {
+			iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+			ret = 1;
+			break;
+		}
+	}
+
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+/*
+ * Insert @insert into the hash table of @inode, or replace the matching
+ * entry.
+ *
+ * Probing starts at the slot given by the hash of @insert and walks forward
+ * within @inode:
+ *  - a desc.key_type key for which desc.cmp_bkey() returns false is the
+ *    existing entry to overwrite; other desc.key_type keys are collisions
+ *    and are skipped;
+ *  - the first slot that is not a desc.key_type key is remembered in 'slot'
+ *    (unless BCH_HASH_SET_MUST_REPLACE is set);
+ *  - probing stops at the first slot that is neither a desc.key_type key
+ *    nor a whiteout.
+ *
+ * With BCH_HASH_SET_MUST_REPLACE a missing entry yields -ENOENT; with
+ * BCH_HASH_SET_MUST_CREATE an existing entry yields -EEXIST.  Otherwise
+ * insert->k.p is set to the chosen position and the update is queued with
+ * bch2_trans_update().
+ *
+ * Returns -ENOSPC if the probe ran off the end without a usable slot, the
+ * errors above, or otherwise the value the iteration macro left in ret.
+ * NOTE(review): that value is expected to be 0 on the success path - confirm
+ * against for_each_btree_key().
+ */
+static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+		  const struct bch_hash_desc desc,
+		  const struct bch_hash_info *info,
+		  u64 inode, struct bkey_i *insert, int flags)
+{
+	struct btree_iter *iter, *slot = NULL;
+	struct bkey_s_c k;
+	bool found = false;
+	int ret;
+
+	for_each_btree_key(trans, iter, desc.btree_id,
+			   POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		if (iter->pos.inode != inode)
+			break;
+
+		if (k.k->type == desc.key_type) {
+			if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
+				goto found;
+
+			/* hash collision: */
+			continue;
+		}
+
+		if (!slot &&
+		    !(flags & BCH_HASH_SET_MUST_REPLACE)) {
+			slot = bch2_trans_copy_iter(trans, iter);
+			if (IS_ERR(slot))
+				return PTR_ERR(slot);
+		}
+
+		if (k.k->type != KEY_TYPE_whiteout)
+			goto not_found;
+	}
+
+	if (!ret)
+		ret = -ENOSPC;
+out:
+	if (!IS_ERR_OR_NULL(slot))
+		bch2_trans_iter_put(trans, slot);
+	if (!IS_ERR_OR_NULL(iter))
+		bch2_trans_iter_put(trans, iter);
+
+	return ret;
+found:
+	found = true;
+not_found:
+
+	if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+		ret = -ENOENT;
+	} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+		ret = -EEXIST;
+	} else {
+		if (!found && slot)
+			swap(iter, slot);
+
+		insert->k.p = iter->pos;
+		bch2_trans_update(trans, iter, insert);
+	}
+
+	goto out;
+}
+/*
+ * Queue deletion of the hash table entry @iter points at.
+ *
+ * The replacement key is allocated with bch2_trans_kmalloc() and placed at
+ * iter->pos; it is a KEY_TYPE_whiteout if bch2_hash_needs_whiteout() says one
+ * is needed, a KEY_TYPE_deleted key otherwise.
+ *
+ * Returns 0 once the update is queued, or a negative error from the
+ * whiteout check or the allocation.
+ */
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+			const struct bch_hash_desc desc,
+			const struct bch_hash_info *info,
+			struct btree_iter *iter)
+{
+	struct bkey_i *delete;
+	int ret;
+
+	ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
+	if (ret < 0)
+		return ret;
+
+	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+	if (IS_ERR(delete))
+		return PTR_ERR(delete);
+
+	bkey_init(&delete->k);
+	delete->k.p = iter->pos;
+	delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted;
+
+	bch2_trans_update(trans, iter, delete);
+	return 0;
+}
+
+/*
+ * Look up @key in the hash table of @inode (with an intent iterator) and
+ * queue its deletion.  Returns the lookup error if the key is not found,
+ * otherwise the result of bch2_hash_delete_at().
+ */
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+		     const struct bch_hash_desc desc,
+		     const struct bch_hash_info *info,
+		     u64 inode, const void *key)
+{
+	struct btree_iter *found =
+		bch2_hash_lookup(trans, desc, info, inode, key,
+				 BTREE_ITER_INTENT);
+
+	return IS_ERR(found)
+		? PTR_ERR(found)
+		: bch2_hash_delete_at(trans, desc, info, found);
+}
+
+#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
new file mode 100644
index 000000000000..7e9c1f9c850c
--- /dev/null
+++ b/fs/bcachefs/super-io.c
@@ -0,0 +1,1154 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io.h"
+#include "journal.h"
+#include "journal_seq_blacklist.h"
+#include "replicas.h"
+#include "quota.h"
+#include "super-io.h"
+#include "super.h"
+#include "vstructs.h"
+
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+/*
+ * Names of the superblock field types: one string per x(name, nr) entry of
+ * BCH_SB_FIELDS(), followed by a NULL terminator.
+ */
+const char * const bch2_sb_fields[] = {
+#define x(name, nr)	#name,
+	BCH_SB_FIELDS()
+#undef x
+	NULL
+};
+
+static const char *bch2_sb_field_validate(struct bch_sb *,
+					  struct bch_sb_field *);
+/*
+ * Return the first optional field of type @type in superblock @sb, or NULL
+ * if there is none.
+ */
+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
+				      enum bch_sb_field_type type)
+{
+	struct bch_sb_field *f;
+
+	/* XXX: need locking around superblock to access optional fields */
+
+	vstruct_for_each(sb, f)
+		if (le32_to_cpu(f->type) == type)
+			return f;
+	return NULL;
+}
+/*
+ * Resize, append or remove an optional field inside the superblock buffer.
+ *
+ *  - @f == NULL: a new field with u64s = @u64s and type 0 is appended at
+ *    vstruct_last(), after zeroing @u64s u64s there.
+ *  - @f != NULL: everything following @f is moved so that it starts at the
+ *    new end of @f; when growing, the gap that opens up is zeroed.  With
+ *    @u64s == 0 the following data is moved on top of @f, removing it.
+ *
+ * The superblock's own u64s count is adjusted by the size difference.  The
+ * buffer is not reallocated here: it is a BUG() if the new size does not fit
+ * in the pages already allocated for @sb.
+ *
+ * Returns the field, or NULL when @u64s is 0.
+ */
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+						   struct bch_sb_field *f,
+						   unsigned u64s)
+{
+	unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
+
+	BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) >
+	       sb->page_order);
+
+	if (!f) {
+		f = vstruct_last(sb->sb);
+		memset(f, 0, sizeof(u64) * u64s);
+		f->u64s = cpu_to_le32(u64s);
+		f->type = 0;
+	} else {
+		void *src, *dst;
+
+		src = vstruct_end(f);
+
+		if (u64s) {
+			f->u64s = cpu_to_le32(u64s);
+			dst = vstruct_end(f);
+		} else {
+			dst = f;
+		}
+
+		memmove(dst, src, vstruct_end(sb->sb) - src);
+
+		if (dst > src)
+			memset(src, 0, dst - src);
+	}
+
+	sb->sb->u64s = cpu_to_le32(sb_u64s);
+
+	return u64s ? f : NULL;
+}
+
+/*
+ * Remove the optional field of type @type from the superblock in @sb, if it
+ * has one.
+ */
+void bch2_sb_field_delete(struct bch_sb_handle *sb,
+			  enum bch_sb_field_type type)
+{
+	struct bch_sb_field *victim = bch2_sb_field_get(sb->sb, type);
+
+	if (!victim)
+		return;
+
+	__bch2_sb_field_resize(sb, victim, 0);
+}
+
+/* Superblock realloc/free: */
+/*
+ * Release everything held by superblock handle @sb: the bio (if any), the
+ * block device (if one was opened; closed with the mode it was opened with)
+ * and the superblock pages.  The handle is zeroed afterwards.
+ */
+void bch2_free_super(struct bch_sb_handle *sb)
+{
+	if (sb->bio)
+		bio_put(sb->bio);
+	if (!IS_ERR_OR_NULL(sb->bdev))
+		blkdev_put(sb->bdev, sb->mode);
+
+	free_pages((unsigned long) sb->sb, sb->page_order);
+	memset(sb, 0, sizeof(*sb));
+}
+
+/*
+ * Make sure the buffer behind @sb (and its bio, when sb->have_bio is set)
+ * can hold a superblock with @u64s u64s of variable-length data.
+ *
+ * Nothing is done if the current buffer already has enough pages.  Once the
+ * on-disk layout is known (sb->have_layout), a request that does not fit in
+ * the space the layout reserves per superblock is refused.  The old contents
+ * are copied into the new, zeroed buffer.
+ *
+ * Returns 0 on success, -ENOSPC if the layout is too small, -ENOMEM on
+ * allocation failure or injected fault.
+ */
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
+{
+	size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+	unsigned order = get_order(new_bytes);
+	struct bch_sb *new_sb;
+	struct bio *bio;
+
+	if (sb->sb && sb->page_order >= order)
+		return 0;
+
+	if (sb->have_layout) {
+		unsigned bits = sb->sb->layout.sb_max_size_bits;
+		/*
+		 * 512 << bits, computed in 64 bits: as a plain int shift it
+		 * overflows from bits == 22.  From bits == 55 the limit no
+		 * longer fits in a u64, so it cannot be exceeded.
+		 */
+		u64 max_bytes = bits < 64 - 9 ? 512ULL << bits : ~0ULL;
+
+		if (new_bytes > max_bytes) {
+			char buf[BDEVNAME_SIZE];
+
+			pr_err("%s: superblock too big: want %zu but have %llu",
+			       bdevname(sb->bdev, buf), new_bytes, max_bytes);
+			return -ENOSPC;
+		}
+	}
+
+	if (dynamic_fault("bcachefs:add:super_realloc"))
+		return -ENOMEM;
+
+	if (sb->have_bio) {
+		bio = bio_kmalloc(GFP_KERNEL, 1 << order);
+		if (!bio)
+			return -ENOMEM;
+
+		/* the larger bio replaces the old one before the pages do */
+		if (sb->bio)
+			bio_put(sb->bio);
+		sb->bio = bio;
+	}
+
+	new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order);
+	if (!new_sb)
+		return -ENOMEM;
+
+	if (sb->sb)
+		memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
+
+	free_pages((unsigned long) sb->sb, sb->page_order);
+	sb->sb = new_sb;
+
+	sb->page_order = order;
+
+	return 0;
+}
+/*
+ * Resize (or create) the optional field of type @type in @sb so that it is
+ * @u64s u64s long, growing the superblock buffer first if necessary.
+ *
+ * If @sb is the filesystem's superblock (sb->fs_sb), the caller must hold
+ * c->sb_lock, and the buffers of all online member devices are grown by the
+ * same amount so the change can later be copied to them.
+ *
+ * Returns the resized field with its type set, or NULL if a reallocation
+ * failed or @u64s is 0.
+ *
+ * NOTE(review): the error path inside the member loop drops ca->ref;
+ * confirm that this is the reference for_each_online_member() holds on the
+ * current device.  The loop-local 'sb' also shadows the parameter.
+ */
+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
+					  enum bch_sb_field_type type,
+					  unsigned u64s)
+{
+	struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	ssize_t d = -old_u64s + u64s;
+
+	if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+		return NULL;
+
+	if (sb->fs_sb) {
+		struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+		struct bch_dev *ca;
+		unsigned i;
+
+		lockdep_assert_held(&c->sb_lock);
+
+		/* XXX: we're not checking that offline device have enough space */
+
+		for_each_online_member(ca, c, i) {
+			struct bch_sb_handle *sb = &ca->disk_sb;
+
+			if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+				percpu_ref_put(&ca->ref);
+				return NULL;
+			}
+		}
+	}
+
+	f = bch2_sb_field_get(sb->sb, type);	/* looked up again after the realloc */
+	f = __bch2_sb_field_resize(sb, f, u64s);
+	if (f)
+		f->type = cpu_to_le32(type);
+	return f;
+}
+
+/* Superblock validate: */
+/*
+ * Never called; exists only to host the build-time check that
+ * struct bch_sb_layout is exactly 512 bytes.
+ */
+static inline void __bch2_sb_layout_size_assert(void)
+{
+	BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+}
+
+/*
+ * Sanity check a superblock layout read from disk: magic, layout type,
+ * number of superblocks, and that consecutive superblock offsets are at
+ * least the maximum superblock size apart.
+ *
+ * Returns NULL if the layout is acceptable, otherwise a static string
+ * describing the first problem found.
+ */
+static const char *validate_sb_layout(struct bch_sb_layout *layout)
+{
+	u64 offset, prev_offset, max_sectors;
+	unsigned i;
+
+	if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
+		return "Not a bcachefs superblock layout";
+
+	if (layout->layout_type != 0)
+		return "Invalid superblock layout type";
+
+	if (!layout->nr_superblocks)
+		return "Invalid superblock layout: no superblocks";
+
+	if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
+		return "Invalid superblock layout: too many superblocks";
+
+	/*
+	 * sb_max_size_bits comes straight from disk; shifting by 64 or more
+	 * is undefined, so reject such values instead of shifting.
+	 */
+	if (layout->sb_max_size_bits >= 64)
+		return "Invalid superblock layout: max size bits too large";
+
+	/* 64 bit shift: a plain int 1 << n overflows from n == 31 */
+	max_sectors = 1ULL << layout->sb_max_size_bits;
+
+	prev_offset = le64_to_cpu(layout->sb_offset[0]);
+
+	for (i = 1; i < layout->nr_superblocks; i++) {
+		offset = le64_to_cpu(layout->sb_offset[i]);
+
+		if (offset < prev_offset + max_sectors)
+			return "Invalid superblock layout: superblocks overlap";
+		prev_offset = offset;
+	}
+
+	return NULL;
+}
+
+/*
+ * Validate the superblock held in @disk_sb.
+ *
+ * Checks, in order: version range, feature bits, block size, UUIDs, device
+ * count and index, replica counts, checksum and compression options, btree
+ * node size, gc reserve, time precision, the embedded layout, the framing of
+ * the optional fields, and finally each optional field - the members field
+ * first, since it must be validated before the others.
+ *
+ * Returns NULL if the superblock is valid, otherwise a static string
+ * describing the first problem found.
+ */
+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
+{
+	struct bch_sb *sb = disk_sb->sb;
+	struct bch_sb_field *f;
+	struct bch_sb_field_members *mi;
+	const char *err;
+	u32 version, version_min;
+	u16 block_size;
+
+	/* version_min is only read once the new versioning scheme applies */
+	version		= le16_to_cpu(sb->version);
+	version_min	= version >= bcachefs_metadata_version_new_versioning
+		? le16_to_cpu(sb->version_min)
+		: version;
+
+	if (version    >= bcachefs_metadata_version_max ||
+	    version_min < bcachefs_metadata_version_min)
+		return "Unsupported superblock version";
+
+	if (version_min > version)
+		return "Bad minimum version";
+
+	if (sb->features[1] ||
+	    (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR)))
+		return "Filesystem has incompatible features";
+
+	block_size = le16_to_cpu(sb->block_size);
+
+	if (!is_power_of_2(block_size) ||
+	    block_size > PAGE_SECTORS)
+		return "Bad block size";
+
+	if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
+		return "Bad user UUID";
+
+	if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
+		return "Bad internal UUID";
+
+	if (!sb->nr_devices ||
+	    sb->nr_devices <= sb->dev_idx ||
+	    sb->nr_devices > BCH_SB_MEMBERS_MAX)
+		return "Bad number of member devices";
+
+	if (!BCH_SB_META_REPLICAS_WANT(sb) ||
+	    BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of metadata replicas";
+
+	if (!BCH_SB_META_REPLICAS_REQ(sb) ||
+	    BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of metadata replicas";
+
+	if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
+	    BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of data replicas";
+
+	if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
+	    BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of data replicas";
+
+	if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+		return "Invalid metadata checksum type";
+
+	/* this message used to repeat the metadata one */
+	if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+		return "Invalid data checksum type";
+
+	if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
+		return "Invalid compression type";
+
+	if (!BCH_SB_BTREE_NODE_SIZE(sb))
+		return "Btree node size not set";
+
+	if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
+		return "Btree node size not a power of two";
+
+	if (BCH_SB_GC_RESERVE(sb) < 5)
+		return "gc reserve percentage too small";
+
+	if (!sb->time_precision ||
+	    le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
+		return "invalid time precision";
+
+	/* validate layout */
+	err = validate_sb_layout(&sb->layout);
+	if (err)
+		return err;
+
+	/* every optional field must be non-empty and end inside the superblock */
+	vstruct_for_each(sb, f) {
+		if (!f->u64s)
+			return "Invalid superblock: invalid optional field";
+
+		if (vstruct_next(f) > vstruct_last(sb))
+			return "Invalid superblock: invalid optional field";
+	}
+
+	/* members must be validated first: */
+	mi = bch2_sb_get_members(sb);
+	if (!mi)
+		return "Invalid superblock: member info area missing";
+
+	err = bch2_sb_field_validate(sb, &mi->field);
+	if (err)
+		return err;
+
+	vstruct_for_each(sb, f) {
+		if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
+			continue;
+
+		err = bch2_sb_field_validate(sb, f);
+		if (err)
+			return err;
+	}
+
+	return NULL;
+}
+
+/* device open: */
+/*
+ * Refresh the cached copies in c->sb of the fields of the filesystem
+ * superblock (c->disk_sb.sb), converted to CPU byte order, and reload each
+ * member device's ca->mi from the members field.  The caller must hold
+ * c->sb_lock.
+ */
+static void bch2_sb_update(struct bch_fs *c)
+{
+	struct bch_sb *src = c->disk_sb.sb;
+	struct bch_sb_field_members *mi = bch2_sb_get_members(src);
+	struct bch_dev *ca;
+	unsigned i;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	c->sb.uuid		= src->uuid;
+	c->sb.user_uuid		= src->user_uuid;
+	c->sb.version		= le16_to_cpu(src->version);
+	c->sb.nr_devices	= src->nr_devices;
+	c->sb.clean		= BCH_SB_CLEAN(src);
+	c->sb.encryption_type	= BCH_SB_ENCRYPTION_TYPE(src);
+	c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
+	c->sb.time_base_lo	= le64_to_cpu(src->time_base_lo);
+	c->sb.time_base_hi	= le32_to_cpu(src->time_base_hi);
+	c->sb.time_precision	= le32_to_cpu(src->time_precision);
+	c->sb.features		= le64_to_cpu(src->features[0]);
+	c->sb.compat		= le64_to_cpu(src->compat[0]);
+
+	for_each_member_device(ca, c, i)
+		ca->mi = bch2_mi_to_cpu(mi->members + i);
+}
+
+/*
+ * Copy superblock @src into the superblock of @dst_handle: the fixed header
+ * fields listed below, then every optional field type except the journal
+ * field.  Each destination field is resized to match the source (a field
+ * absent from @src is removed) before its contents are copied.
+ *
+ * The destination buffer must already be large enough;
+ * __bch2_sb_field_resize() does not reallocate.
+ *
+ * NOTE(review): this comment used to read "doesn't copy member info", but
+ * the loop below skips only BCH_SB_FIELD_journal - confirm which is intended.
+ */
+static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
+{
+	struct bch_sb_field *src_f, *dst_f;
+	struct bch_sb *dst = dst_handle->sb;
+	unsigned i;
+
+	dst->version		= src->version;
+	dst->version_min	= src->version_min;
+	dst->seq		= src->seq;
+	dst->uuid		= src->uuid;
+	dst->user_uuid		= src->user_uuid;
+	memcpy(dst->label,	src->label, sizeof(dst->label));
+
+	dst->block_size		= src->block_size;
+	dst->nr_devices		= src->nr_devices;
+
+	dst->time_base_lo	= src->time_base_lo;
+	dst->time_base_hi	= src->time_base_hi;
+	dst->time_precision	= src->time_precision;
+
+	memcpy(dst->flags,	src->flags,	sizeof(dst->flags));
+	memcpy(dst->features,	src->features,	sizeof(dst->features));
+	memcpy(dst->compat,	src->compat,	sizeof(dst->compat));
+
+	for (i = 0; i < BCH_SB_FIELD_NR; i++) {
+		if (i == BCH_SB_FIELD_journal)
+			continue;
+
+		src_f = bch2_sb_field_get(src, i);
+		dst_f = bch2_sb_field_get(dst, i);
+		dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+				src_f ? le32_to_cpu(src_f->u64s) : 0);
+
+		if (src_f)
+			memcpy(dst_f, src_f, vstruct_bytes(src_f));
+	}
+}
+/*
+ * Load superblock @src into the filesystem's superblock (c->disk_sb) and
+ * rebuild the in-memory state derived from it: replicas, disk groups and the
+ * cached fields in c->sb.
+ *
+ * The filesystem copy is sized without @src's journal field, which
+ * __copy_super() does not copy.  The caller must hold c->sb_lock.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
+{
+	struct bch_sb_field_journal *journal_buckets =
+		bch2_sb_get_journal(src);
+	unsigned journal_u64s = journal_buckets
+		? le32_to_cpu(journal_buckets->field.u64s)
+		: 0;
+	int ret;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	ret = bch2_sb_realloc(&c->disk_sb,
+			      le32_to_cpu(src->u64s) - journal_u64s);
+	if (ret)
+		return ret;
+
+	__copy_super(&c->disk_sb, src);
+
+	ret = bch2_sb_replicas_to_cpu_replicas(c);
+	if (ret)
+		return ret;
+
+	ret = bch2_sb_disk_groups_to_cpu(c);
+	if (ret)
+		return ret;
+
+	bch2_sb_update(c);
+	return 0;
+}
+/*
+ * Copy the filesystem's superblock (c->disk_sb) into the superblock of
+ * member device @ca.
+ *
+ * The device buffer is sized for the filesystem superblock plus the device's
+ * own journal field, which __copy_super() leaves in place.
+ *
+ * Returns 0 on success or the error from bch2_sb_realloc().
+ */
+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
+	struct bch_sb_field_journal *journal_buckets =
+		bch2_sb_get_journal(dst);
+	unsigned journal_u64s = journal_buckets
+		? le32_to_cpu(journal_buckets->field.u64s)
+		: 0;
+	unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
+	int ret;
+
+	ret = bch2_sb_realloc(&ca->disk_sb, u64s);
+	if (ret)
+		return ret;
+
+	__copy_super(&ca->disk_sb, src);
+	return 0;
+}
+
+/* read superblock: */
+
+/*
+ * Read the superblock at sector @offset of sb->bdev into sb->sb and check it.
+ *
+ * Verifies the magic, the version range, that the superblock fits within the
+ * size its layout allows and within the buffer (growing the buffer and
+ * rereading if it does not), the checksum type and the checksum itself.
+ *
+ * On success sb->seq is set from the superblock and NULL is returned;
+ * otherwise a static string describing the problem.
+ */
+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
+{
+	struct bch_csum csum;
+	unsigned max_size_bits;
+	size_t bytes;
+reread:
+	bio_reset(sb->bio);
+	bio_set_dev(sb->bio, sb->bdev);
+	sb->bio->bi_iter.bi_sector = offset;
+	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+	bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order);
+
+	if (submit_bio_wait(sb->bio))
+		return "IO error";
+
+	if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
+		return "Not a bcachefs superblock";
+
+	if (le16_to_cpu(sb->sb->version) <  bcachefs_metadata_version_min ||
+	    le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max)
+		return "Unsupported superblock version";
+
+	bytes = vstruct_bytes(sb->sb);
+	max_size_bits = sb->sb->layout.sb_max_size_bits;
+
+	/*
+	 * The limit is 512 << max_size_bits, computed in 64 bits: as a plain
+	 * int shift it overflows from 22 bits, and the layout has not been
+	 * validated yet.  From 55 bits the limit exceeds anything 'bytes' can
+	 * hold, so there is nothing to compare against.
+	 */
+	if (max_size_bits < 64 - 9 &&
+	    bytes > 512ULL << max_size_bits)
+		return "Bad superblock: too big";
+
+	/* buffer too small for the whole superblock: grow it and read again */
+	if (get_order(bytes) > sb->page_order) {
+		if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
+			return "cannot allocate memory";
+		goto reread;
+	}
+
+	if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
+		return "unknown csum type";
+
+	/* XXX: verify MACs */
+	csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
+			    null_nonce(), sb->sb);
+
+	if (bch2_crc_cmp(csum, sb->sb->csum))
+		return "bad checksum reading superblock";
+
+	sb->seq = le64_to_cpu(sb->sb->seq);
+
+	return NULL;
+}
+/*
+ * Open the block device at @path and read its superblock into @sb.
+ *
+ * The device is opened for reading, exclusively unless the noexcl option is
+ * set, and for writing unless nochanges is set.  If that fails with -EACCES
+ * and read_only is set, the open is retried without write access and, on
+ * success, nochanges is turned on in @opts.
+ *
+ * The superblock is first read from the sector given by the sb option.  If
+ * that fails and the option was not explicitly specified, the layout is read
+ * from BCH_SB_LAYOUT_SECTOR and each superblock it lists is tried in turn,
+ * skipping the location already attempted.
+ *
+ * A superblock whose block size is smaller than the device's logical block
+ * size is rejected.
+ *
+ * Returns 0 with @sb filled in and sb->have_layout set, or a negative error.
+ * Failures after the device was opened release @sb with bch2_free_super().
+ */
+int bch2_read_super(const char *path, struct bch_opts *opts,
+		    struct bch_sb_handle *sb)
+{
+	u64 offset = opt_get(*opts, sb);
+	struct bch_sb_layout layout;
+	const char *err;
+	__le64 *i;
+	int ret;
+
+	pr_verbose_init(*opts, "");
+
+	memset(sb, 0, sizeof(*sb));
+	sb->mode	= FMODE_READ;
+	sb->have_bio	= true;
+
+	if (!opt_get(*opts, noexcl))
+		sb->mode |= FMODE_EXCL;
+
+	if (!opt_get(*opts, nochanges))
+		sb->mode |= FMODE_WRITE;
+
+	sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+	if (IS_ERR(sb->bdev) &&
+	    PTR_ERR(sb->bdev) == -EACCES &&
+	    opt_get(*opts, read_only)) {
+		sb->mode &= ~FMODE_WRITE;
+
+		sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+		if (!IS_ERR(sb->bdev))
+			opt_set(*opts, nochanges, true);
+	}
+
+	if (IS_ERR(sb->bdev)) {
+		ret = PTR_ERR(sb->bdev);
+		goto out;
+	}
+
+	err = "cannot allocate memory";
+	ret = bch2_sb_realloc(sb, 0);
+	if (ret)
+		goto err;
+
+	ret = -EFAULT;
+	err = "dynamic fault";
+	if (bch2_fs_init_fault("read_super"))
+		goto err;
+
+	ret = -EINVAL;
+	err = read_one_super(sb, offset);
+	if (!err)
+		goto got_super;
+
+	if (opt_defined(*opts, sb))
+		goto err;
+
+	pr_err("error reading default superblock: %s", err);
+
+	/*
+	 * Error reading primary superblock - read location of backup
+	 * superblocks:
+	 */
+	bio_reset(sb->bio);
+	bio_set_dev(sb->bio, sb->bdev);
+	sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+	/*
+	 * use sb buffer to read layout, since sb buffer is page aligned but
+	 * layout won't be:
+	 */
+	bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
+
+	err = "IO error";
+	if (submit_bio_wait(sb->bio))
+		goto err;
+
+	memcpy(&layout, sb->sb, sizeof(layout));
+	err = validate_sb_layout(&layout);
+	if (err)
+		goto err;
+
+	for (i = layout.sb_offset;
+	     i < layout.sb_offset + layout.nr_superblocks; i++) {
+		offset = le64_to_cpu(*i);
+
+		if (offset == opt_get(*opts, sb))
+			continue;
+
+		err = read_one_super(sb, offset);
+		if (!err)
+			goto got_super;
+	}
+
+	ret = -EINVAL;
+	goto err;
+
+got_super:
+	err = "Superblock block size smaller than device block size";
+	ret = -EINVAL;
+	if (le16_to_cpu(sb->sb->block_size) << 9 <
+	    bdev_logical_block_size(sb->bdev))
+		goto err;
+
+	ret = 0;
+	sb->have_layout = true;
+out:
+	pr_verbose_init(*opts, "ret %i", ret);
+	return ret;
+err:
+	bch2_free_super(sb);
+	pr_err("error reading superblock: %s", err);
+	goto out;
+}
+
+/* write superblock: */
+
+/*
+ * Completion handler shared by superblock reads and writes: flags the device
+ * on IO error, then drops the sb_write closure ref and the device io_ref.
+ */
+static void write_super_endio(struct bio *bio)
+{
+	struct bch_dev *dev = bio->bi_private;
+
+	/* XXX: return errors directly */
+
+	if (bch2_dev_io_err_on(bio->bi_status, dev, "superblock write")) {
+		dev->sb_write_error = 1;
+	}
+
+	closure_put(&dev->fs->sb_write);
+	percpu_ref_put(&dev->io_ref);
+}
+
+/*
+ * Kick off a read of this device's first superblock (layout.sb_offset[0])
+ * into ca->sb_read_scratch. The bio completes through write_super_endio();
+ * bch2_write_super() waits on c->sb_write and then compares the seq that was
+ * read back against ca->disk_sb.seq.
+ */
+static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_sb *sb = ca->disk_sb.sb;
+	struct bio *bio = ca->disk_sb.bio;
+
+	bio_reset(bio);
+	bio_set_dev(bio, ca->disk_sb.bdev);
+	bio->bi_iter.bi_sector	= le64_to_cpu(sb->layout.sb_offset[0]);
+	bio->bi_end_io		= write_super_endio;
+	bio->bi_private		= ca;
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+	bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
+
+	/* Account the read as superblock IO on this device: */
+	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB],
+		     bio_sectors(bio));
+
+	/* Dropped again in write_super_endio(): */
+	percpu_ref_get(&ca->io_ref);
+	closure_bio_submit(bio, &c->sb_write);
+}
+
+/*
+ * Submit a write of this device's in-memory superblock to the idx'th offset
+ * in its layout. sb->offset and the checksum are updated in the in-memory
+ * copy before submission; completion goes through write_super_endio().
+ */
+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
+{
+	struct bch_sb *sb = ca->disk_sb.sb;
+	struct bio *bio = ca->disk_sb.bio;
+
+	sb->offset = sb->layout.sb_offset[idx];
+
+	/* Checksum is computed after sb->offset is set: */
+	SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+	sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
+				null_nonce(), sb);
+
+	bio_reset(bio);
+	bio_set_dev(bio, ca->disk_sb.bdev);
+	bio->bi_iter.bi_sector	= le64_to_cpu(sb->offset);
+	bio->bi_end_io		= write_super_endio;
+	bio->bi_private		= ca;
+	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
+	/* IO size is the superblock size rounded up to the logical block size: */
+	bch2_bio_map(bio, sb,
+		     roundup((size_t) vstruct_bytes(sb),
+			     bdev_logical_block_size(ca->disk_sb.bdev)));
+
+	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
+		     bio_sectors(bio));
+
+	/* Dropped again in write_super_endio(): */
+	percpu_ref_get(&ca->io_ref);
+	closure_bio_submit(bio, &c->sb_write);
+}
+
+/*
+ * bch2_write_super - write the filesystem superblock to all online members
+ *
+ * Must be called with c->sb_lock held. Bumps the superblock sequence number,
+ * copies the filesystem superblock to each online device, validates it, then:
+ *
+ *  - reads back the first superblock of every device and checks that the
+ *    on-disk seq still matches what we last wrote,
+ *  - writes every superblock slot in each device's layout, one slot index at a
+ *    time across all devices,
+ *  - checks that the set of devices successfully written is sufficient.
+ *
+ * With the nochanges option set, nothing is read or written.
+ *
+ * Returns 0 on success, -EROFS if the on-disk superblock has a different seq
+ * than expected, -1 if validation fails or too few devices were written.
+ * bch2_sb_update() is called on every exit path.
+ */
+int bch2_write_super(struct bch_fs *c)
+{
+	struct closure *cl = &c->sb_write;
+	struct bch_dev *ca;
+	unsigned i, sb = 0, nr_wrote;
+	const char *err;
+	struct bch_devs_mask sb_written;
+	bool wrote, can_mount_without_written, can_mount_with_written;
+	int ret = 0;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	closure_init_stack(cl);
+	memset(&sb_written, 0, sizeof(sb_written));
+
+	le64_add_cpu(&c->disk_sb.sb->seq, 1);
+
+	if (test_bit(BCH_FS_ERROR, &c->flags))
+		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+
+	for_each_online_member(ca, c, i)
+		bch2_sb_from_fs(c, ca);
+
+	for_each_online_member(ca, c, i) {
+		err = bch2_sb_validate(&ca->disk_sb);
+		if (err) {
+			bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
+			/*
+			 * Leaving the iterator early: drop the io_ref held on
+			 * the current device, as the seq mismatch path below
+			 * does.
+			 */
+			percpu_ref_put(&ca->io_ref);
+			ret = -1;
+			goto out;
+		}
+	}
+
+	if (c->opts.nochanges)
+		goto out;
+
+	/* Start by assuming every online device will be written: */
+	for_each_online_member(ca, c, i) {
+		__set_bit(ca->dev_idx, sb_written.d);
+		ca->sb_write_error = 0;
+	}
+
+	for_each_online_member(ca, c, i)
+		read_back_super(c, ca);
+	closure_sync(cl);
+
+	for_each_online_member(ca, c, i) {
+		if (!ca->sb_write_error &&
+		    ca->disk_sb.seq !=
+		    le64_to_cpu(ca->sb_read_scratch->seq)) {
+			bch2_fs_fatal_error(c,
+				"Superblock modified by another process");
+			percpu_ref_put(&ca->io_ref);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	/*
+	 * Write slot 0 on every device, wait, then slot 1, and so on, until
+	 * no device has a slot at the current index:
+	 */
+	do {
+		wrote = false;
+		for_each_online_member(ca, c, i)
+			if (!ca->sb_write_error &&
+			    sb < ca->disk_sb.sb->layout.nr_superblocks) {
+				write_one_super(c, ca, sb);
+				wrote = true;
+			}
+		closure_sync(cl);
+		sb++;
+	} while (wrote);
+
+	for_each_online_member(ca, c, i) {
+		if (ca->sb_write_error)
+			__clear_bit(ca->dev_idx, sb_written.d);
+		else
+			ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
+	}
+
+	nr_wrote = dev_mask_nr(&sb_written);
+
+	can_mount_with_written =
+		bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+				      BCH_FORCE_IF_DEGRADED);
+
+	/* Invert the mask: now the devices we did _not_ write to */
+	for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+		sb_written.d[i] = ~sb_written.d[i];
+
+	can_mount_without_written =
+		bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+				      BCH_FORCE_IF_DEGRADED);
+
+	/*
+	 * If we would be able to mount _without_ the devices we successfully
+	 * wrote superblocks to, we weren't able to write to enough devices:
+	 *
+	 * Exception: if we can mount without the successes because we haven't
+	 * written anything (new filesystem), we continue if we'd be able to
+	 * mount with the devices we did successfully write to:
+	 */
+	if (bch2_fs_fatal_err_on(!nr_wrote ||
+				 (can_mount_without_written &&
+				  !can_mount_with_written), c,
+		"Unable to write superblock to sufficient devices"))
+		ret = -1;
+out:
+	/* Make new options visible after they're persistent: */
+	bch2_sb_update(c);
+	return ret;
+}
+
+/* BCH_SB_FIELD_journal: */
+
+/* sort() comparison callback: orders two u64 values ascending */
+static int u64_cmp(const void *_l, const void *_r)
+{
+	const u64 *lhs = _l;
+	const u64 *rhs = _r;
+
+	if (*lhs < *rhs)
+		return -1;
+	if (*lhs > *rhs)
+		return 1;
+	return 0;
+}
+
+/*
+ * Validate the journal field @f of superblock @sb.
+ *
+ * Checks that no journal bucket is 0, that no bucket appears twice, and - when
+ * the members field is present - that all buckets lie within
+ * [first_bucket, nbuckets) of this device's member entry.
+ *
+ * Returns NULL if the field is valid (or has no buckets), otherwise a string
+ * describing the problem.
+ */
+static const char *bch2_sb_validate_journal(struct bch_sb *sb,
+					    struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal *journal = field_to_type(f, journal);
+	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+	/* Only form the member pointer if the members field actually exists: */
+	struct bch_member *m = mi ? mi->members + sb->dev_idx : NULL;
+	const char *err;
+	unsigned nr;
+	unsigned i;
+	u64 *b;
+
+	if (!journal)
+		return NULL;
+
+	nr = bch2_nr_journal_buckets(journal);
+	if (!nr)
+		return NULL;
+
+	/* Scratch copy in cpu byte order, so it can be sorted: */
+	b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+	if (!b)
+		return "cannot allocate memory";
+
+	for (i = 0; i < nr; i++)
+		b[i] = le64_to_cpu(journal->buckets[i]);
+
+	sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+	/* After sorting, b[0] is the lowest bucket and b[nr - 1] the highest */
+	err = "journal bucket at sector 0";
+	if (!b[0])
+		goto err;
+
+	err = "journal bucket before first bucket";
+	if (m && b[0] < le16_to_cpu(m->first_bucket))
+		goto err;
+
+	err = "journal bucket past end of device";
+	if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
+		goto err;
+
+	/* Duplicates are adjacent once sorted: */
+	err = "duplicate journal buckets";
+	for (i = 0; i + 1 < nr; i++)
+		if (b[i] == b[i + 1])
+			goto err;
+
+	err = NULL;
+err:
+	kfree(b);
+	return err;
+}
+
+/* Field ops for the journal field: validation only, no to_text method */
+static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+	.validate	= bch2_sb_validate_journal,
+};
+
+/* BCH_SB_FIELD_members: */
+
+/*
+ * Validate the members field @f of superblock @sb.
+ *
+ * Checks that the field is large enough to hold sb->nr_devices entries, then
+ * for every existing member checks the bucket count and bucket size against
+ * the limits below.
+ *
+ * Returns NULL if valid, otherwise a string describing the problem.
+ */
+static const char *bch2_sb_validate_members(struct bch_sb *sb,
+					    struct bch_sb_field *f)
+{
+	struct bch_sb_field_members *mi = field_to_type(f, members);
+	struct bch_member *m;
+
+	if ((void *) (mi->members + sb->nr_devices) >
+	    vstruct_end(&mi->field))
+		return "Invalid superblock: bad member info";
+
+	for (m = mi->members;
+	     m < mi->members + sb->nr_devices;
+	     m++) {
+		u64 nbuckets, first_bucket;
+
+		/* Empty slots are skipped: */
+		if (!bch2_member_exists(m))
+			continue;
+
+		nbuckets	= le64_to_cpu(m->nbuckets);
+		first_bucket	= le16_to_cpu(m->first_bucket);
+
+		if (nbuckets > LONG_MAX)
+			return "Too many buckets";
+
+		/*
+		 * The subtraction is unsigned: without the first comparison,
+		 * first_bucket > nbuckets would wrap to a huge value and pass.
+		 */
+		if (nbuckets < first_bucket ||
+		    nbuckets - first_bucket < BCH_MIN_NR_NBUCKETS)
+			return "Not enough buckets";
+
+		if (le16_to_cpu(m->bucket_size) <
+		    le16_to_cpu(sb->block_size))
+			return "bucket size smaller than block size";
+
+		if (le16_to_cpu(m->bucket_size) <
+		    BCH_SB_BTREE_NODE_SIZE(sb))
+			return "bucket size smaller than btree node size";
+	}
+
+	return NULL;
+}
+
+/* Field ops for the members field: validation only, no to_text method */
+static const struct bch_sb_field_ops bch_sb_field_ops_members = {
+	.validate	= bch2_sb_validate_members,
+};
+
+/* BCH_SB_FIELD_crypt: */
+
+/*
+ * Validate the crypt field: it must be exactly sizeof(struct
+ * bch_sb_field_crypt) bytes, and the KDF type must be zero.
+ *
+ * Returns NULL if valid, otherwise a string describing the problem.
+ */
+static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+	const char *problem = NULL;
+
+	if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) {
+		problem = "invalid field crypt: wrong size";
+	} else if (BCH_CRYPT_KDF_TYPE(crypt)) {
+		problem = "invalid field crypt: bad kdf type";
+	}
+
+	return problem;
+}
+
+/* Field ops for the crypt field: validation only, no to_text method */
+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+	.validate	= bch2_sb_validate_crypt,
+};
+
+/* BCH_SB_FIELD_clean: */
+
+/*
+ * Walk every jset_entry stored in the clean field and call
+ * bch2_bkey_renumber() on the key at the start of each one; @write is passed
+ * through unchanged.
+ */
+void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write)
+{
+	struct jset_entry *cur = clean->start;
+
+	while (cur < (struct jset_entry *) vstruct_end(&clean->field)) {
+		bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(cur->start), write);
+		cur = vstruct_next(cur);
+	}
+}
+
+/*
+ * Clear the clean flag and the ALLOC_METADATA compat bit in the superblock and
+ * write it out, under c->sb_lock.
+ *
+ * Returns the result of bch2_write_super().
+ */
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+	int ret;
+
+	/*
+	 * Unconditionally write superblock, to verify it hasn't changed before
+	 * we go rw:
+	 */
+
+	mutex_lock(&c->sb_lock);
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+	c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
+	ret = bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/*
+ * Zero the first @u64s 64-bit words of @entry and set its u64s field.
+ * @u64s is the total size including the entry's first word, so the stored
+ * value is one less.
+ */
+static void
+entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+{
+	memset(entry, 0, u64s * sizeof(u64));
+
+	/*
+	 * The u64s field counts from the start of data, ignoring the shared
+	 * fields.
+	 */
+	entry->u64s = u64s - 1;
+}
+
+/*
+ * Like entry_init_u64s(), but takes the total size in bytes, which is rounded
+ * up to a whole number of u64s.
+ */
+static void
+entry_init_size(struct jset_entry *entry, size_t size)
+{
+	unsigned nr_words = DIV_ROUND_UP(size, sizeof(u64));
+
+	entry_init_u64s(entry, nr_words);
+}
+
+/*
+ * Append the btree root and filesystem usage entries, starting at @entry.
+ *
+ * Emits, in order: one btree_root entry per live btree root, a usage entry
+ * for the inode count, one for the key version, one per persistent_reserved
+ * slot, and one data_usage entry per replicas entry.
+ *
+ * @journal_seq selects which usage accumulator(s) are folded into the base
+ * counters first: both when it is 0, otherwise only (journal_seq & 1).
+ *
+ * No bounds checking is done here: the caller must have reserved enough room
+ * at @entry. Returns a pointer just past the last entry written.
+ */
+struct jset_entry *
+bch2_journal_super_entries_add_common(struct bch_fs *c,
+				      struct jset_entry *entry,
+				      u64 journal_seq)
+{
+	struct btree_root *r;
+	unsigned i;
+
+	mutex_lock(&c->btree_root_lock);
+
+	for (r = c->btree_roots;
+	     r < c->btree_roots + BTREE_ID_NR;
+	     r++)
+		if (r->alive) {
+			/* + 1 for the entry's own first word: */
+			entry_init_u64s(entry, r->key.u64s + 1);
+			/* btree id is the root's index in c->btree_roots: */
+			entry->btree_id	= r - c->btree_roots;
+			entry->level	= r->level;
+			entry->type	= BCH_JSET_ENTRY_btree_root;
+			bkey_copy(&entry->start[0], &r->key);
+
+			entry = vstruct_next(entry);
+		}
+	c->btree_roots_dirty = false;
+
+	mutex_unlock(&c->btree_root_lock);
+
+	percpu_down_write(&c->mark_lock);
+
+	if (!journal_seq) {
+		bch2_fs_usage_acc_to_base(c, 0);
+		bch2_fs_usage_acc_to_base(c, 1);
+	} else {
+		bch2_fs_usage_acc_to_base(c, journal_seq & 1);
+	}
+
+	/* Number of inodes: */
+	{
+		struct jset_entry_usage *u =
+			container_of(entry, struct jset_entry_usage, entry);
+
+		entry_init_size(entry, sizeof(*u));
+		u->entry.type	= BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = FS_USAGE_INODES;
+		u->v		= cpu_to_le64(c->usage_base->nr_inodes);
+
+		entry = vstruct_next(entry);
+	}
+
+	/* Current key version: */
+	{
+		struct jset_entry_usage *u =
+			container_of(entry, struct jset_entry_usage, entry);
+
+		entry_init_size(entry, sizeof(*u));
+		u->entry.type	= BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = FS_USAGE_KEY_VERSION;
+		u->v		= cpu_to_le64(atomic64_read(&c->key_version));
+
+		entry = vstruct_next(entry);
+	}
+
+	/* Persistent reservations; the slot index is stored in entry.level: */
+	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+		struct jset_entry_usage *u =
+			container_of(entry, struct jset_entry_usage, entry);
+
+		entry_init_size(entry, sizeof(*u));
+		u->entry.type	= BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = FS_USAGE_RESERVED;
+		u->entry.level	= i;
+		u->v		= cpu_to_le64(c->usage_base->persistent_reserved[i]);
+
+		entry = vstruct_next(entry);
+	}
+
+	/* Per replicas entry usage; entry size grows with e->nr_devs: */
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct jset_entry_data_usage *u =
+			container_of(entry, struct jset_entry_data_usage, entry);
+
+		entry_init_size(entry, sizeof(*u) + e->nr_devs);
+		u->entry.type	= BCH_JSET_ENTRY_data_usage;
+		u->v		= cpu_to_le64(c->usage_base->replicas[i]);
+		memcpy(&u->r, e, replicas_entry_bytes(e));
+
+		entry = vstruct_next(entry);
+	}
+
+	percpu_up_write(&c->mark_lock);
+
+	return entry;
+}
+
+/*
+ * Mark the superblock clean and write it out, under c->sb_lock.
+ *
+ * Does nothing if the superblock is already marked clean. Otherwise sets the
+ * clean flag and the ALLOC_INFO/ALLOC_METADATA compat bits, resizes the clean
+ * field, fills it with the bucket clocks, the last journal sequence number and
+ * the entries from bch2_journal_super_entries_add_common(), and writes the
+ * superblock. The result of bch2_write_super() is not checked.
+ */
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *sb_clean;
+	struct jset_entry *entry;
+	unsigned u64s;
+
+	mutex_lock(&c->sb_lock);
+	if (BCH_SB_CLEAN(c->disk_sb.sb))
+		goto out;
+
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
+
+	/* Fixed header plus the space the journal reserves for entries: */
+	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+	sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+	if (!sb_clean) {
+		bch_err(c, "error resizing superblock while setting filesystem clean");
+		goto out;
+	}
+
+	sb_clean->flags		= 0;
+	sb_clean->read_clock	= cpu_to_le16(c->bucket_clock[READ].hand);
+	sb_clean->write_clock	= cpu_to_le16(c->bucket_clock[WRITE].hand);
+	sb_clean->journal_seq	= cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+
+	/* Trying to catch outstanding bug: */
+	BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+	entry = sb_clean->start;
+	entry = bch2_journal_super_entries_add_common(c, entry, 0);
+	BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+	/* Zero whatever is left of the field after the last entry: */
+	memset(entry, 0,
+	       vstruct_end(&sb_clean->field) - (void *) entry);
+
+	if (le16_to_cpu(c->disk_sb.sb->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		bch2_sb_clean_renumber(sb_clean, WRITE);
+
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+}
+
+/*
+ * Validate the clean field: it must be at least as large as
+ * struct bch_sb_field_clean.
+ *
+ * Returns NULL if valid, otherwise a string describing the problem.
+ */
+static const char *bch2_sb_validate_clean(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+	if (vstruct_bytes(&clean->field) < sizeof(*clean))
+		return "invalid field clean: wrong size";
+
+	return NULL;
+}
+
+/* Field ops for the clean field: validation only, no to_text method */
+static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+	.validate	= bch2_sb_validate_clean,
+};
+
+/*
+ * Table of per-field ops, indexed by BCH_SB_FIELD_* type. Generated from
+ * BCH_SB_FIELDS(): each field name f maps to &bch_sb_field_ops_f.
+ */
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr)					\
+	[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+	BCH_SB_FIELDS()
+#undef x
+};
+
+/*
+ * Dispatch to the validate method for @f's type. Fields whose type is not
+ * below BCH_SB_FIELD_NR are accepted without validation.
+ */
+static const char *bch2_sb_field_validate(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	unsigned field_type = le32_to_cpu(f->type);
+
+	if (field_type >= BCH_SB_FIELD_NR)
+		return NULL;
+
+	return bch2_sb_field_ops[field_type]->validate(sb, f);
+}
+
+/*
+ * Print a description of superblock field @f to @out: the field name (or
+ * "(unknown field N)"), its size in bytes, and - if the field type provides
+ * a to_text method - its contents.
+ */
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+			   struct bch_sb_field *f)
+{
+	unsigned field_type = le32_to_cpu(f->type);
+	const struct bch_sb_field_ops *field_ops = NULL;
+
+	if (field_type < BCH_SB_FIELD_NR)
+		field_ops = bch2_sb_field_ops[field_type];
+
+	if (!field_ops) {
+		pr_buf(out, "(unknown field %u)", field_type);
+	} else {
+		pr_buf(out, "%s", bch2_sb_fields[field_type]);
+	}
+
+	pr_buf(out, " (size %llu):", vstruct_bytes(f));
+
+	if (field_ops && field_ops->to_text)
+		field_ops->to_text(out, sb, f);
+}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
new file mode 100644
index 000000000000..f5450e596c62
--- /dev/null
+++ b/fs/bcachefs/super-io.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_IO_H
+#define _BCACHEFS_SUPER_IO_H
+
+#include "extents.h"
+#include "eytzinger.h"
+#include "super_types.h"
+#include "super.h"
+
+#include <asm/byteorder.h>
+
+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
+					  enum bch_sb_field_type, unsigned);
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
+
+/*
+ * Convert a generic struct bch_sb_field pointer to the containing
+ * struct bch_sb_field_<_name>, via container_of_or_null().
+ */
+#define field_to_type(_f, _name)					\
+	container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+/*
+ * For every field listed in BCH_SB_FIELDS(), generate two typed helpers:
+ *
+ *   bch2_sb_get_<name>(sb)		- wraps bch2_sb_field_get()
+ *   bch2_sb_resize_<name>(sb, u64s)	- wraps bch2_sb_field_resize()
+ *
+ * Both convert the result with field_to_type().
+ */
+#define x(_name, _nr)							\
+static inline struct bch_sb_field_##_name *				\
+bch2_sb_get_##_name(struct bch_sb *sb)					\
+{									\
+	return field_to_type(bch2_sb_field_get(sb,			\
+				BCH_SB_FIELD_##_name), _name);		\
+}									\
+									\
+static inline struct bch_sb_field_##_name *				\
+bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s)	\
+{									\
+	return field_to_type(bch2_sb_field_resize(sb,			\
+				BCH_SB_FIELD_##_name, u64s), _name);	\
+}
+
+BCH_SB_FIELDS()
+#undef x
+
+extern const char * const bch2_sb_fields[];
+
+/*
+ * Per field type methods:
+ *
+ * @validate:	returns NULL if the field is valid, else an error string
+ * @to_text:	optional (callers check for NULL); prints the field's contents
+ */
+struct bch_sb_field_ops {
+	const char *	(*validate)(struct bch_sb *, struct bch_sb_field *);
+	void		(*to_text)(struct printbuf *, struct bch_sb *,
+				   struct bch_sb_field *);
+};
+
+/* Returns true if feature bit @f is set in the superblock's features words */
+static inline bool bch2_sb_test_feature(struct bch_sb *sb,
+					enum bch_sb_features f)
+{
+	unsigned word_idx = f / 64;
+	unsigned bit_idx = f % 64;
+	u64 mask = 1ULL << bit_idx;
+
+	return le64_to_cpu(sb->features[word_idx]) & mask;
+}
+
+/*
+ * Set feature bit @f in the superblock. The bit is set by adding its value,
+ * which is only done when the bit is currently clear.
+ */
+static inline void bch2_sb_set_feature(struct bch_sb *sb,
+				       enum bch_sb_features f)
+{
+	unsigned word_idx, bit_idx;
+
+	if (bch2_sb_test_feature(sb, f))
+		return;
+
+	word_idx = f / 64;
+	bit_idx = f % 64;
+
+	le64_add_cpu(&sb->features[word_idx], 1ULL << bit_idx);
+}
+
+/* Returns the first 8 bytes of the filesystem UUID (c->sb.uuid) as a __le64 */
+static inline __le64 bch2_sb_magic(struct bch_fs *c)
+{
+	__le64 magic;
+
+	memcpy(&magic, &c->sb.uuid, sizeof(magic));
+
+	return magic;
+}
+
+/* Per-filesystem journal set magic: bch2_sb_magic() xor JSET_MAGIC */
+static inline __u64 jset_magic(struct bch_fs *c)
+{
+	return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
+}
+
+/* Per-filesystem bset magic: bch2_sb_magic() xor BSET_MAGIC */
+static inline __u64 bset_magic(struct bch_fs *c)
+{
+	return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
+}
+
+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
+
+void bch2_free_super(struct bch_sb_handle *);
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+
+const char *bch2_sb_validate(struct bch_sb_handle *);
+
+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_write_super(struct bch_fs *);
+
+/* BCH_SB_FIELD_journal: */
+
+/*
+ * Number of bucket entries in journal field @j, computed from the end of the
+ * field; 0 if @j is NULL.
+ */
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+	if (!j)
+		return 0;
+
+	return (__le64 *) vstruct_end(&j->field) - j->buckets;
+}
+
+/* BCH_SB_FIELD_members: */
+
+/* A member slot is in use if its UUID is not all zeroes */
+static inline bool bch2_member_exists(struct bch_member *m)
+{
+	return !bch2_is_zero(m->uuid.b, sizeof(uuid_le));
+}
+
+/*
+ * Returns true if device index @dev is below sb->nr_devices and its slot in
+ * @mi is in use.
+ */
+static inline bool bch2_dev_exists(struct bch_sb *sb,
+				   struct bch_sb_field_members *mi,
+				   unsigned dev)
+{
+	if (dev >= sb->nr_devices)
+		return false;
+
+	return bch2_member_exists(&mi->members[dev]);
+}
+
+/*
+ * Unpack an on-disk struct bch_member into its in-memory form, converting
+ * little endian fields and bitfields to native values.
+ */
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+	return (struct bch_member_cpu) {
+		.nbuckets	= le64_to_cpu(mi->nbuckets),
+		.first_bucket	= le16_to_cpu(mi->first_bucket),
+		.bucket_size	= le16_to_cpu(mi->bucket_size),
+		.group		= BCH_MEMBER_GROUP(mi),
+		.state		= BCH_MEMBER_STATE(mi),
+		.replacement	= BCH_MEMBER_REPLACEMENT(mi),
+		.discard	= BCH_MEMBER_DISCARD(mi),
+		.data_allowed	= BCH_MEMBER_DATA_ALLOWED(mi),
+		/* A stored 0 yields 1; any other stored value yields value - 1 */
+		.durability	= BCH_MEMBER_DURABILITY(mi)
+			? BCH_MEMBER_DURABILITY(mi) - 1
+			: 1,
+		/* Same test as bch2_member_exists(): */
+		.valid		= !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
+	};
+}
+
+/* BCH_SB_FIELD_clean: */
+
+struct jset_entry *
+bch2_journal_super_entries_add_common(struct bch_fs *,
+				      struct jset_entry *, u64);
+
+void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
+
+int bch2_fs_mark_dirty(struct bch_fs *);
+void bch2_fs_mark_clean(struct bch_fs *);
+
+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+			   struct bch_sb_field *);
+
+#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
new file mode 100644
index 000000000000..f0af26bd328f
--- /dev/null
+++ b/fs/bcachefs/super.c
@@ -0,0 +1,1953 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "chardev.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "move.h"
+#include "migrate.h"
+#include "movinggc.h"
+#include "quota.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "sysfs.h"
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/sysfs.h>
+#include <crypto/hash.h>
+
+#include <trace/events/bcachefs.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+/*
+ * Define a struct kobj_type named <type>_ktype, wired up to <type>_release,
+ * <type>_sysfs_ops and <type>_files.
+ */
+#define KTYPE(type)							\
+struct kobj_type type ## _ktype = {					\
+	.release	= type ## _release,				\
+	.sysfs_ops	= &type ## _sysfs_ops,				\
+	.default_attrs	= type ## _files				\
+}
+
+static void bch2_fs_release(struct kobject *);
+static void bch2_dev_release(struct kobject *);
+
+/* kobject release for the bch2_fs_internal ktype: intentionally empty */
+static void bch2_fs_internal_release(struct kobject *k)
+{
+}
+
+/* kobject release for the bch2_fs_opts_dir ktype: intentionally empty */
+static void bch2_fs_opts_dir_release(struct kobject *k)
+{
+}
+
+/* kobject release for the bch2_fs_time_stats ktype: intentionally empty */
+static void bch2_fs_time_stats_release(struct kobject *k)
+{
+}
+
+static KTYPE(bch2_fs);
+static KTYPE(bch2_fs_internal);
+static KTYPE(bch2_fs_opts_dir);
+static KTYPE(bch2_fs_time_stats);
+static KTYPE(bch2_dev);
+
+static struct kset *bcachefs_kset;
+static LIST_HEAD(bch_fs_list);
+static DEFINE_MUTEX(bch_fs_list_lock);
+
+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
+
+static void bch2_dev_free(struct bch_dev *);
+static int bch2_dev_alloc(struct bch_fs *, unsigned);
+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
+
+/*
+ * bch2_dev_to_fs - find the filesystem that has block device @dev as a member
+ *
+ * Searches every filesystem on bch_fs_list under bch_fs_list_lock. Member
+ * devices without an open block device are skipped.
+ *
+ * Returns the filesystem with a ref taken on c->cl (via closure_get()), or
+ * NULL if no filesystem has @dev as a member.
+ */
+struct bch_fs *bch2_dev_to_fs(dev_t dev)
+{
+	struct bch_fs *c;
+	struct bch_dev *ca;
+	unsigned i;
+
+	mutex_lock(&bch_fs_list_lock);
+	rcu_read_lock();
+
+	list_for_each_entry(c, &bch_fs_list, list)
+		for_each_member_device_rcu(ca, c, i, NULL)
+			/* bdev may be NULL for a member that isn't open: */
+			if (ca->disk_sb.bdev &&
+			    ca->disk_sb.bdev->bd_dev == dev) {
+				closure_get(&c->cl);
+				goto found;
+			}
+	c = NULL;
+found:
+	rcu_read_unlock();
+	mutex_unlock(&bch_fs_list_lock);
+
+	return c;
+}
+
+/*
+ * Look up a filesystem on bch_fs_list by the uuid in its superblock.
+ * Caller must hold bch_fs_list_lock; no reference is taken.
+ *
+ * Returns the filesystem, or NULL if none matches.
+ */
+static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
+{
+	struct bch_fs *c;
+
+	lockdep_assert_held(&bch_fs_list_lock);
+
+	list_for_each_entry(c, &bch_fs_list, list)
+		if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le)))
+			return c;
+
+	return NULL;
+}
+
+/*
+ * Locked wrapper around __bch2_uuid_to_fs(): takes bch_fs_list_lock for the
+ * lookup and, on a match, takes a ref on the filesystem's closure before
+ * dropping the lock. Returns NULL if nothing matches.
+ */
+struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
+{
+	struct bch_fs *fs;
+
+	mutex_lock(&bch_fs_list_lock);
+
+	fs = __bch2_uuid_to_fs(uuid);
+	if (fs) {
+		closure_get(&fs->cl);
+	}
+
+	mutex_unlock(&bch_fs_list_lock);
+
+	return fs;
+}
+
+/* Filesystem RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and rebalance (to free up space)
+ *
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
+ *   don't because they either reserve ahead of time or don't block if
+ *   allocations fail, but allocations can require mark and sweep gc to run
+ *   because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
+
+/*
+ * Stop everything that writes, in dependency order: rebalance, copygc, gc
+ * thread, then (if the allocator is running) write out stripes and alloc info
+ * until two consecutive passes wrote nothing, then the allocator threads and
+ * finally the journal.
+ *
+ * Also used as the error path of __bch2_fs_read_write().
+ */
+static void __bch2_fs_read_only(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	bool wrote;
+	unsigned i, clean_passes = 0;
+	int ret;
+
+	bch2_rebalance_stop(c);
+
+	for_each_member_device(ca, c, i)
+		bch2_copygc_stop(ca);
+
+	bch2_gc_thread_stop(c);
+
+	/*
+	 * Flush journal before stopping allocators, because flushing journal
+	 * blacklist entries involves allocating new btree nodes:
+	 */
+	bch2_journal_flush_all_pins(&c->journal);
+
+	if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
+		goto allocator_not_running;
+
+	do {
+		wrote = false;
+
+		ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
+			bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
+
+		if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+			bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+
+		if (ret)
+			break;
+
+		for_each_member_device(ca, c, i)
+			bch2_dev_allocator_quiesce(c, ca);
+
+		bch2_journal_flush_all_pins(&c->journal);
+
+		/*
+		 * We need to explicitly wait on btree interior updates to complete
+		 * before stopping the journal, flushing all journal pins isn't
+		 * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
+		 * interior updates have to drop their journal pin before they're
+		 * fully complete:
+		 */
+		closure_wait_event(&c->btree_interior_update_wait,
+				   !bch2_btree_interior_updates_nr_pending(c));
+
+		/* Any write resets the count of consecutive clean passes: */
+		clean_passes = wrote ? 0 : clean_passes + 1;
+	} while (clean_passes < 2);
+allocator_not_running:
+	for_each_member_device(ca, c, i)
+		bch2_dev_allocator_stop(ca);
+
+	clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+
+	bch2_fs_journal_stop(&c->journal);
+
+	/* XXX: mark super that alloc info is persistent */
+
+	/*
+	 * the journal kicks off btree writes via reclaim - wait for in flight
+	 * writes after stopping journal:
+	 */
+	if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+		bch2_btree_flush_all_writes(c);
+	else
+		bch2_btree_verify_flushed(c);
+
+	/*
+	 * After stopping journal:
+	 */
+	for_each_member_device(ca, c, i)
+		bch2_dev_allocator_remove(c, ca);
+}
+
+/*
+ * percpu_ref callback for c->writes: sets BCH_FS_WRITE_DISABLE_COMPLETE and
+ * wakes waiters on bch_read_only_wait (see bch2_fs_read_only()).
+ */
+static void bch2_writes_disabled(struct percpu_ref *writes)
+{
+	struct bch_fs *c = container_of(writes, struct bch_fs, writes);
+
+	set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+	wake_up(&bch_read_only_wait);
+}
+
+/*
+ * bch2_fs_read_only - take a filesystem from read-write to read-only
+ *
+ * If the filesystem is not RW, only pending journal reclaim work is
+ * cancelled. Otherwise new writes are blocked by killing c->writes, all
+ * write paths are shut down via __bch2_fs_read_only(), and - if nothing went
+ * wrong - the superblock is marked clean.
+ *
+ * Callers in this file hold c->state_lock around this call.
+ */
+void bch2_fs_read_only(struct bch_fs *c)
+{
+	if (!test_bit(BCH_FS_RW, &c->flags)) {
+		cancel_delayed_work_sync(&c->journal.reclaim_work);
+		return;
+	}
+
+	BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+	/*
+	 * Block new foreground-end write operations from starting - any new
+	 * writes will return -EROFS:
+	 *
+	 * (This is really blocking new _allocations_, writes to previously
+	 * allocated space can still happen until stopping the allocator in
+	 * bch2_dev_allocator_stop()).
+	 */
+	percpu_ref_kill(&c->writes);
+
+	cancel_work_sync(&c->ec_stripe_delete_work);
+	cancel_delayed_work(&c->pd_controllers_update);
+
+	/*
+	 * If we're not doing an emergency shutdown, we want to wait on
+	 * outstanding writes to complete so they don't see spurious errors due
+	 * to shutting down the allocator:
+	 *
+	 * If we are doing an emergency shutdown outstanding writes may
+	 * hang until we shutdown the allocator so we don't want to wait
+	 * on outstanding writes before shutting everything down - but
+	 * we do need to wait on them before returning and signalling
+	 * that going RO is complete:
+	 */
+	wait_event(bch_read_only_wait,
+		   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+		   test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+
+	__bch2_fs_read_only(c);
+
+	/* Second wait: unconditional, even for an emergency shutdown */
+	wait_event(bch_read_only_wait,
+		   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+	clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+
+	/* Only mark clean after an error free, fully started session: */
+	if (!bch2_journal_error(&c->journal) &&
+	    !test_bit(BCH_FS_ERROR, &c->flags) &&
+	    !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
+	    test_bit(BCH_FS_STARTED, &c->flags) &&
+	    !c->opts.norecovery)
+		bch2_fs_mark_clean(c);
+
+	clear_bit(BCH_FS_RW, &c->flags);
+}
+
+/*
+ * Work item behind bch2_fs_read_only_async(): goes read-only with
+ * c->state_lock held.
+ */
+static void bch2_fs_read_only_work(struct work_struct *work)
+{
+	struct bch_fs *fs;
+
+	fs = container_of(work, struct bch_fs, read_only_work);
+
+	mutex_lock(&fs->state_lock);
+	bch2_fs_read_only(fs);
+	mutex_unlock(&fs->state_lock);
+}
+
+/* Queue c->read_only_work on system_long_wq instead of going RO directly */
+static void bch2_fs_read_only_async(struct bch_fs *c)
+{
+	queue_work(system_long_wq, &c->read_only_work);
+}
+
+/*
+ * Flag the filesystem as emergency read-only, queue the asynchronous switch
+ * to read-only, halt the journal and wake anyone waiting on
+ * bch_read_only_wait.
+ *
+ * Returns true if this call was the one that set BCH_FS_EMERGENCY_RO, false
+ * if it was already set.
+ */
+bool bch2_fs_emergency_read_only(struct bch_fs *c)
+{
+	bool was_already_set = test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+
+	bch2_fs_read_only_async(c);
+	bch2_journal_halt(&c->journal);
+
+	wake_up(&bch_read_only_wait);
+
+	return !was_already_set;
+}
+
+/*
+ * Second stage of going read-write, skipped when __bch2_fs_read_write() is
+ * called with early set: starts the gc thread, the per-device copygc threads
+ * and the rebalance thread, then schedules the pd controller update and
+ * stripe delete work.
+ *
+ * Returns 0 on success or the error from whichever thread failed to start.
+ */
+static int bch2_fs_read_write_late(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	int ret;
+
+	ret = bch2_gc_thread_start(c);
+	if (ret) {
+		bch_err(c, "error starting gc thread");
+		return ret;
+	}
+
+	for_each_rw_member(ca, c, i) {
+		ret = bch2_copygc_start(c, ca);
+		if (ret) {
+			bch_err(c, "error starting copygc threads");
+			/* Leaving the iterator early: drop its io_ref */
+			percpu_ref_put(&ca->io_ref);
+			return ret;
+		}
+	}
+
+	ret = bch2_rebalance_start(c);
+	if (ret) {
+		bch_err(c, "error starting rebalance thread");
+		return ret;
+	}
+
+	schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
+
+	schedule_work(&c->ec_stripe_delete_work);
+
+	return 0;
+}
+
+/*
+ * __bch2_fs_read_write - take a filesystem read-write
+ *
+ * @early:	if true, skip bch2_fs_read_write_late() (background threads)
+ *
+ * Returns 0 on success or if already RW, -EROFS if the options forbid going
+ * RW, otherwise the error from the step that failed. On failure after the
+ * options check, __bch2_fs_read_only() is called to undo partial startup.
+ */
+int __bch2_fs_read_write(struct bch_fs *c, bool early)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	int ret;
+
+	if (test_bit(BCH_FS_RW, &c->flags))
+		return 0;
+
+	/*
+	 * nochanges is used for fsck -n mode - we have to allow going rw
+	 * during recovery for that to work:
+	 */
+	if (c->opts.norecovery ||
+	    (c->opts.nochanges &&
+	     (!early || c->opts.read_only)))
+		return -EROFS;
+
+	ret = bch2_fs_mark_dirty(c);
+	if (ret)
+		goto err;
+
+	for_each_rw_member(ca, c, i)
+		bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
+	/* One time allocator initialization, tracked by ALLOCATOR_STARTED: */
+	if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) {
+		ret = bch2_fs_allocator_start(c);
+		if (ret) {
+			bch_err(c, "error initializing allocator");
+			goto err;
+		}
+
+		set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
+	}
+
+	for_each_rw_member(ca, c, i) {
+		ret = bch2_dev_allocator_start(ca);
+		if (ret) {
+			bch_err(c, "error starting allocator threads");
+			/* Leaving the iterator early: drop its io_ref */
+			percpu_ref_put(&ca->io_ref);
+			goto err;
+		}
+	}
+
+	set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+
+	if (!early) {
+		ret = bch2_fs_read_write_late(c);
+		if (ret)
+			goto err;
+	}
+
+	/* Re-arm the writes ref that bch2_fs_read_only() kills: */
+	percpu_ref_reinit(&c->writes);
+	set_bit(BCH_FS_RW, &c->flags);
+
+	queue_delayed_work(c->journal_reclaim_wq,
+			   &c->journal.reclaim_work, 0);
+	return 0;
+err:
+	__bch2_fs_read_only(c);
+	return ret;
+}
+
+/* Go read-write, including starting the background threads (early = false) */
+int bch2_fs_read_write(struct bch_fs *c)
+{
+	return __bch2_fs_read_write(c, false);
+}
+
+/*
+ * Go read-write without starting the background threads (early = true).
+ * Caller must hold c->state_lock.
+ */
+int bch2_fs_read_write_early(struct bch_fs *c)
+{
+	lockdep_assert_held(&c->state_lock);
+
+	return __bch2_fs_read_write(c, true);
+}
+
+/* Filesystem startup/shutdown: */
+
+/*
+ * Free everything owned by @c and then @c itself: subsystem state, percpu
+ * data, mempools, workqueues and the superblock buffer. Finally drops a
+ * module reference. Called from bch2_fs_release().
+ */
+static void bch2_fs_free(struct bch_fs *c)
+{
+	unsigned i;
+
+	for (i = 0; i < BCH_TIME_STAT_NR; i++)
+		bch2_time_stats_exit(&c->times[i]);
+
+	bch2_fs_quota_exit(c);
+	bch2_fs_fsio_exit(c);
+	bch2_fs_ec_exit(c);
+	bch2_fs_encryption_exit(c);
+	bch2_fs_io_exit(c);
+	bch2_fs_btree_iter_exit(c);
+	bch2_fs_btree_cache_exit(c);
+	bch2_fs_journal_exit(&c->journal);
+	bch2_io_clock_exit(&c->io_clock[WRITE]);
+	bch2_io_clock_exit(&c->io_clock[READ]);
+	bch2_fs_compress_exit(c);
+	percpu_free_rwsem(&c->mark_lock);
+	kfree(c->usage_scratch);
+	free_percpu(c->usage[1]);
+	free_percpu(c->usage[0]);
+	kfree(c->usage_base);
+	free_percpu(c->pcpu);
+	mempool_exit(&c->btree_bounce_pool);
+	bioset_exit(&c->btree_bio);
+	mempool_exit(&c->btree_interior_update_pool);
+	mempool_exit(&c->btree_reserve_pool);
+	mempool_exit(&c->fill_iter);
+	percpu_ref_exit(&c->writes);
+	kfree(c->replicas.entries);
+	kfree(c->replicas_gc.entries);
+	kfree(rcu_dereference_protected(c->disk_groups, 1));
+	kfree(c->journal_seq_blacklist_table);
+
+	/* Workqueues may not have been allocated: */
+	if (c->journal_reclaim_wq)
+		destroy_workqueue(c->journal_reclaim_wq);
+	if (c->copygc_wq)
+		destroy_workqueue(c->copygc_wq);
+	if (c->wq)
+		destroy_workqueue(c->wq);
+
+	free_pages((unsigned long) c->disk_sb.sb,
+		   c->disk_sb.page_order);
+	kvpfree(c, sizeof(*c));
+	module_put(THIS_MODULE);
+}
+
+/* kobject release for the bch2_fs ktype: frees the whole filesystem object */
+static void bch2_fs_release(struct kobject *kobj)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+	bch2_fs_free(c);
+}
+
+/*
+ * bch2_fs_stop - shut down a filesystem
+ *
+ * Removes the filesystem from sysfs and from bch_fs_list, waits for
+ * outstanding refs on c->cl, goes read-only, cancels outstanding work, frees
+ * the member devices and drops the reference on c->kobj.
+ */
+void bch2_fs_stop(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	bch_verbose(c, "shutting down");
+
+	set_bit(BCH_FS_STOPPING, &c->flags);
+
+	cancel_work_sync(&c->journal_seq_blacklist_gc_work);
+
+	/* Remove the "bcachefs" link from each block device's kobject: */
+	for_each_member_device(ca, c, i)
+		if (ca->kobj.state_in_sysfs &&
+		    ca->disk_sb.bdev)
+			sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+	if (c->kobj.state_in_sysfs)
+		kobject_del(&c->kobj);
+
+	bch2_fs_debug_exit(c);
+	bch2_fs_chardev_exit(c);
+
+	kobject_put(&c->time_stats);
+	kobject_put(&c->opts_dir);
+	kobject_put(&c->internal);
+
+	mutex_lock(&bch_fs_list_lock);
+	list_del(&c->list);
+	mutex_unlock(&bch_fs_list_lock);
+
+	/* Wait for refs on c->cl (e.g. from bch2_uuid_to_fs()) to be dropped: */
+	closure_sync(&c->cl);
+	closure_debug_destroy(&c->cl);
+
+	mutex_lock(&c->state_lock);
+	bch2_fs_read_only(c);
+	mutex_unlock(&c->state_lock);
+
+	/* btree prefetch might have kicked off reads in the background: */
+	bch2_btree_flush_all_reads(c);
+
+	for_each_member_device(ca, c, i)
+		cancel_work_sync(&ca->io_error_work);
+
+	cancel_work_sync(&c->btree_write_error_work);
+	cancel_delayed_work_sync(&c->pd_controllers_update);
+	cancel_work_sync(&c->read_only_work);
+
+	for (i = 0; i < c->sb.nr_devices; i++)
+		if (c->devs[i])
+			bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
+
+	bch_verbose(c, "shutdown complete");
+
+	/* Drops a ref on the fs kobject; its release is bch2_fs_release() */
+	kobject_put(&c->kobj);
+}
+
+/*
+ * Make a filesystem visible: create its character device, debug entries and
+ * sysfs objects (including one per member device) and add it to bch_fs_list.
+ *
+ * Must be called with bch_fs_list_lock held. Returns NULL on success, or if
+ * @c is already on a list; otherwise returns a static string describing the
+ * failure.
+ */
+static const char *bch2_fs_online(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	const char *err = NULL;
+	unsigned i;
+	int ret;
+
+	lockdep_assert_held(&bch_fs_list_lock);
+
+	/* Already linked into a list: nothing to do */
+	if (!list_empty(&c->list))
+		return NULL;
+
+	/* Refuse a second filesystem with the same UUID */
+	if (__bch2_uuid_to_fs(c->sb.uuid))
+		return "filesystem UUID already open";
+
+	ret = bch2_fs_chardev_init(c);
+	if (ret)
+		return "error creating character device";
+
+	bch2_fs_debug_init(c);
+
+	/* Top-level directory named after the user UUID, plus its subdirectories */
+	if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
+	    kobject_add(&c->internal, &c->kobj, "internal") ||
+	    kobject_add(&c->opts_dir, &c->kobj, "options") ||
+	    kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
+	    bch2_opts_create_sysfs_files(&c->opts_dir))
+		return "error creating sysfs objects";
+
+	mutex_lock(&c->state_lock);
+
+	err = "error creating sysfs objects";
+	__for_each_member_device(ca, c, i, NULL)
+		if (bch2_dev_sysfs_online(c, ca))
+			goto err;
+
+	/* Only publish on the global list once everything above succeeded */
+	list_add(&c->list, &bch_fs_list);
+	err = NULL;
+err:
+	mutex_unlock(&c->state_lock);
+	return err;
+}
+
+/*
+ * Allocate and initialise a struct bch_fs from superblock @sb, with @opts
+ * applied after the defaults and the options stored in the superblock.
+ *
+ * Allocates a bch_dev for every member that exists in the superblock and then
+ * brings the filesystem online via bch2_fs_online() (taking bch_fs_list_lock
+ * around that call).
+ *
+ * Returns the new filesystem, or NULL on any failure; once @c itself has been
+ * allocated, failures are cleaned up with bch2_fs_free().
+ */
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
+{
+	struct bch_sb_field_members *mi;
+	struct bch_fs *c;
+	unsigned i, iter_size;
+	const char *err;
+
+	pr_verbose_init(opts, "");
+
+	c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+	if (!c)
+		goto out;
+
+	/* Paired with the module_put() at the end of bch2_fs_free() */
+	__module_get(THIS_MODULE);
+
+	c->minor		= -1;
+	c->disk_sb.fs_sb	= true;
+
+	mutex_init(&c->state_lock);
+	mutex_init(&c->sb_lock);
+	mutex_init(&c->replicas_gc_lock);
+	mutex_init(&c->btree_root_lock);
+	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+
+	init_rwsem(&c->gc_lock);
+
+	for (i = 0; i < BCH_TIME_STAT_NR; i++)
+		bch2_time_stats_init(&c->times[i]);
+
+	bch2_fs_allocator_background_init(c);
+	bch2_fs_allocator_foreground_init(c);
+	bch2_fs_rebalance_init(c);
+	bch2_fs_quota_init(c);
+
+	/* An empty c->list is how bch2_fs_online() tells "not yet online" */
+	INIT_LIST_HEAD(&c->list);
+
+	INIT_LIST_HEAD(&c->btree_interior_update_list);
+	mutex_init(&c->btree_reserve_cache_lock);
+	mutex_init(&c->btree_interior_update_lock);
+
+	mutex_init(&c->usage_scratch_lock);
+
+	mutex_init(&c->bio_bounce_pages_lock);
+
+	bio_list_init(&c->btree_write_error_list);
+	spin_lock_init(&c->btree_write_error_lock);
+	INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
+
+	INIT_WORK(&c->journal_seq_blacklist_gc_work,
+		  bch2_blacklist_entries_gc);
+
+	INIT_LIST_HEAD(&c->fsck_errors);
+	mutex_init(&c->fsck_error_lock);
+
+	INIT_LIST_HEAD(&c->ec_new_stripe_list);
+	mutex_init(&c->ec_new_stripe_lock);
+	mutex_init(&c->ec_stripe_create_lock);
+	spin_lock_init(&c->ec_stripes_heap_lock);
+
+	seqcount_init(&c->gc_pos_lock);
+
+	seqcount_init(&c->usage_lock);
+
+	c->copy_gc_enabled		= 1;
+	c->rebalance.enabled		= 1;
+	c->promote_whole_extents	= true;
+
+	/* Point the journal's timing hooks at this filesystem's time stats */
+	c->journal.write_time	= &c->times[BCH_TIME_journal_write];
+	c->journal.delay_time	= &c->times[BCH_TIME_journal_delay];
+	c->journal.blocked_time	= &c->times[BCH_TIME_blocked_journal];
+	c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
+
+	bch2_fs_btree_cache_init_early(&c->btree_cache);
+
+	if (percpu_init_rwsem(&c->mark_lock))
+		goto err;
+
+	mutex_lock(&c->sb_lock);
+
+	/* Copy the superblock we were handed into the filesystem */
+	if (bch2_sb_to_fs(c, sb)) {
+		mutex_unlock(&c->sb_lock);
+		goto err;
+	}
+
+	mutex_unlock(&c->sb_lock);
+
+	scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
+
+	/* Options are applied in order: defaults, superblock, then caller */
+	c->opts = bch2_opts_default;
+	bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
+	bch2_opts_apply(&c->opts, opts);
+
+	c->block_bits		= ilog2(c->opts.block_size);
+	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
+
+	if (bch2_fs_init_fault("fs_alloc"))
+		goto err;
+
+	/* Element size of the c->fill_iter mempool created below */
+	iter_size = sizeof(struct btree_node_iter_large) +
+		(btree_blocks(c) + 1) * 2 *
+		sizeof(struct btree_node_iter_set);
+
+	/* Any single failure in this chain aborts the whole allocation */
+	if (!(c->wq = alloc_workqueue("bcachefs",
+				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+	    !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+	    !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
+				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+	    percpu_ref_init(&c->writes, bch2_writes_disabled,
+			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+	    mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
+				      sizeof(struct btree_reserve)) ||
+	    mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+				      sizeof(struct btree_update)) ||
+	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+	    bioset_init(&c->btree_bio, 1,
+			max(offsetof(struct btree_read_bio, bio),
+			    offsetof(struct btree_write_bio, wbio.bio)),
+			BIOSET_NEED_BVECS) ||
+	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
+					btree_bytes(c)) ||
+	    bch2_io_clock_init(&c->io_clock[READ]) ||
+	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
+	    bch2_fs_journal_init(&c->journal) ||
+	    bch2_fs_replicas_init(c) ||
+	    bch2_fs_btree_cache_init(c) ||
+	    bch2_fs_btree_iter_init(c) ||
+	    bch2_fs_io_init(c) ||
+	    bch2_fs_encryption_init(c) ||
+	    bch2_fs_compress_init(c) ||
+	    bch2_fs_ec_init(c) ||
+	    bch2_fs_fsio_init(c))
+		goto err;
+
+	/* One bch_dev per member slot that is in use in the superblock */
+	mi = bch2_sb_get_members(c->disk_sb.sb);
+	for (i = 0; i < c->sb.nr_devices; i++)
+		if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
+		    bch2_dev_alloc(c, i))
+			goto err;
+
+	/*
+	 * Now that all allocations have succeeded, init various refcounty
+	 * things that let us shutdown:
+	 */
+	closure_init(&c->cl, NULL);
+
+	c->kobj.kset = bcachefs_kset;
+	kobject_init(&c->kobj, &bch2_fs_ktype);
+	kobject_init(&c->internal, &bch2_fs_internal_ktype);
+	kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
+	kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+
+	mutex_lock(&bch_fs_list_lock);
+	err = bch2_fs_online(c);
+	mutex_unlock(&bch_fs_list_lock);
+	if (err) {
+		bch_err(c, "bch2_fs_online() error: %s", err);
+		goto err;
+	}
+out:
+	/* Every failure is reported as -ENOMEM in the verbose trace */
+	pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
+	return c;
+err:
+	bch2_fs_free(c);
+	c = NULL;
+	goto out;
+}
+
+/*
+ * Log the mount options in effect: "ro" when mounted read-only, followed by
+ * every OPT_MOUNT option whose value differs from the default, separated by
+ * commas. Kept out of line because of the 512 byte stack buffer.
+ */
+noinline_for_stack
+static void print_mount_opts(struct bch_fs *c)
+{
+	char buf[512];
+	struct printbuf out = PBUF(buf);
+	bool need_comma = false;
+	enum bch_opt_id id;
+
+	/* Placeholder that stays in the buffer if nothing is printed below */
+	strcpy(buf, "(null)");
+
+	if (c->opts.read_only) {
+		pr_buf(&out, "ro");
+		need_comma = true;
+	}
+
+	for (id = 0; id < bch2_opts_nr; id++) {
+		const struct bch_option *opt = &bch2_opt_table[id];
+		u64 v = bch2_opt_get_by_id(&c->opts, id);
+
+		/* Skip non-mount options and options left at their default */
+		if (!(opt->mode & OPT_MOUNT) ||
+		    v == bch2_opt_get_by_id(&bch2_opts_default, id))
+			continue;
+
+		if (need_comma)
+			pr_buf(&out, ",");
+		need_comma = true;
+
+		bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_MOUNT_STYLE);
+	}
+
+	bch_info(c, "mounted with opts: %s", buf);
+}
+
+/*
+ * Start an allocated filesystem.
+ *
+ * Under c->state_lock: refreshes every online member's superblock from the
+ * filesystem, stamps their last_mount time, adds rw members to the allocator,
+ * then runs recovery (or first-time initialization if the superblock is not
+ * yet marked initialized) and finally goes read-only or read-write depending
+ * on the mount options.
+ *
+ * Returns 0 on success. On failure a message describing the failing step is
+ * logged and a negative errno is returned; non-negative internal codes (the
+ * BCH_FSCK_* results) are reported to the caller as -EIO.
+ *
+ * Must only be called once per filesystem: BUG()s if BCH_FS_STARTED is
+ * already set.
+ */
+int bch2_fs_start(struct bch_fs *c)
+{
+	const char *err;
+	struct bch_sb_field_members *mi;
+	struct bch_dev *ca;
+	time64_t now = ktime_get_real_seconds();
+	unsigned i;
+	int ret;
+
+	mutex_lock(&c->state_lock);
+
+	BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
+
+	mutex_lock(&c->sb_lock);
+
+	for_each_online_member(ca, c, i)
+		bch2_sb_from_fs(c, ca);
+
+	mi = bch2_sb_get_members(c->disk_sb.sb);
+	for_each_online_member(ca, c, i)
+		mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+
+	mutex_unlock(&c->sb_lock);
+
+	for_each_rw_member(ca, c, i)
+		bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
+	/*
+	 * err names the step in progress so that the message logged on
+	 * failure is accurate even for error codes the switch below does not
+	 * know about:
+	 */
+	if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
+		err = "error in recovery";
+		ret = bch2_fs_recovery(c);
+	} else {
+		err = "error initializing filesystem";
+		ret = bch2_fs_initialize(c);
+	}
+	if (ret)
+		goto err;
+
+	err = "error checking options";
+	ret = bch2_opts_check_may_set(c);
+	if (ret)
+		goto err;
+
+	err = "dynamic fault";
+	ret = -EINVAL;
+	if (bch2_fs_init_fault("fs_start"))
+		goto err;
+
+	if (c->opts.read_only || c->opts.nochanges) {
+		bch2_fs_read_only(c);
+	} else {
+		err = "error going read write";
+		ret = !test_bit(BCH_FS_RW, &c->flags)
+			? bch2_fs_read_write(c)
+			: bch2_fs_read_write_late(c);
+		if (ret)
+			goto err;
+	}
+
+	set_bit(BCH_FS_STARTED, &c->flags);
+	print_mount_opts(c);
+	ret = 0;
+out:
+	mutex_unlock(&c->state_lock);
+	return ret;
+err:
+	/* Refine the message for the error codes we recognise */
+	switch (ret) {
+	case BCH_FSCK_ERRORS_NOT_FIXED:
+		bch_err(c, "filesystem contains errors: please report this to the developers");
+		pr_cont("mount with -o fix_errors to repair\n");
+		err = "fsck error";
+		break;
+	case BCH_FSCK_REPAIR_UNIMPLEMENTED:
+		bch_err(c, "filesystem contains errors: please report this to the developers");
+		pr_cont("repair unimplemented: inform the developers so that it can be added\n");
+		err = "fsck error";
+		break;
+	case BCH_FSCK_REPAIR_IMPOSSIBLE:
+		bch_err(c, "filesystem contains errors, but repair impossible");
+		err = "fsck error";
+		break;
+	case BCH_FSCK_UNKNOWN_VERSION:
+		err = "unknown metadata version";
+		break;
+	case -ENOMEM:
+		err = "cannot allocate memory";
+		break;
+	case -EIO:
+		err = "IO error";
+		break;
+	}
+
+	/* Previously the message was computed but never reported */
+	bch_err(c, "error starting filesystem: %s (%i)", err, ret);
+
+	/* Callers expect a negative errno, not a BCH_FSCK_* code */
+	if (ret >= 0)
+		ret = -EIO;
+	goto out;
+}
+
+/*
+ * Check whether the device described by superblock @sb could be added to
+ * filesystem @c: it needs a members section, the same block size as @c, and
+ * a bucket size no smaller than the filesystem's btree node size.
+ *
+ * Returns NULL if it may be added, otherwise a static error string.
+ */
+static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
+{
+	struct bch_sb_field_members *members = bch2_sb_get_members(sb);
+	struct bch_member *self;
+
+	if (!members)
+		return "Invalid superblock: member info area missing";
+
+	if (le16_to_cpu(sb->block_size) != c->opts.block_size)
+		return "mismatched block size";
+
+	self = &members->members[sb->dev_idx];
+
+	if (le16_to_cpu(self->bucket_size) <
+	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
+		return "new cache bucket size is too small";
+
+	return NULL;
+}
+
+/*
+ * Check that the device with superblock @sb belongs to the filesystem with
+ * superblock @fs: same UUID, still present in the member list of whichever
+ * of the two superblocks has the higher sequence number (@sb wins a tie),
+ * and same block size.
+ *
+ * Returns NULL if it belongs, otherwise a static error string.
+ */
+static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+{
+	struct bch_sb_field_members *members;
+	struct bch_sb *latest = sb;
+
+	if (le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq))
+		latest = fs;
+
+	members = bch2_sb_get_members(latest);
+
+	if (uuid_le_cmp(fs->uuid, sb->uuid))
+		return "device not a member of filesystem";
+
+	if (!bch2_dev_exists(latest, members, sb->dev_idx))
+		return "device has been removed";
+
+	if (fs->block_size != sb->block_size)
+		return "mismatched block size";
+
+	return NULL;
+}
+
+/* Device startup/shutdown: */
+
+/*
+ * Free the bch_dev that embeds @kobj.
+ *
+ * NOTE(review): presumably the release hook of bch2_dev_ktype, reached from
+ * the final kobject_put() in bch2_dev_free() - confirm against the ktype.
+ */
+static void bch2_dev_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct bch_dev, kobj));
+}
+
+/*
+ * Tear down a member device: cancel its error work, remove its sysfs
+ * presence, release its superblock handle and journal state, free its
+ * per-device allocations and drop a reference on its kobject.
+ *
+ * Also used on the failure path of __bch2_dev_alloc(), i.e. on a device that
+ * may be only partially constructed.
+ */
+static void bch2_dev_free(struct bch_dev *ca)
+{
+	cancel_work_sync(&ca->io_error_work);
+
+	/* The link in the block device's directory only exists with a bdev */
+	if (ca->kobj.state_in_sysfs &&
+	    ca->disk_sb.bdev)
+		sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+	if (ca->kobj.state_in_sysfs)
+		kobject_del(&ca->kobj);
+
+	bch2_free_super(&ca->disk_sb);
+	bch2_dev_journal_exit(ca);
+
+	/* Undo the allocations made in __bch2_dev_alloc() */
+	free_percpu(ca->io_done);
+	bioset_exit(&ca->replica_set);
+	bch2_dev_buckets_free(ca);
+	free_page((unsigned long) ca->sb_read_scratch);
+
+	bch2_time_stats_exit(&ca->io_latency[WRITE]);
+	bch2_time_stats_exit(&ca->io_latency[READ]);
+
+	percpu_ref_exit(&ca->io_ref);
+	percpu_ref_exit(&ca->ref);
+	kobject_put(&ca->kobj);
+}
+
+/*
+ * Take a member device offline while keeping it a member of the filesystem.
+ * Caller must hold c->state_lock. Does nothing if ca->io_ref is already zero
+ * (the device is not online).
+ */
+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
+{
+
+	lockdep_assert_held(&c->state_lock);
+
+	if (percpu_ref_is_zero(&ca->io_ref))
+		return;
+
+	__bch2_dev_read_only(c, ca);
+
+	/*
+	 * Kill the io ref and wait for bch2_dev_io_ref_complete() to signal
+	 * that the last holder has dropped it:
+	 */
+	reinit_completion(&ca->io_ref_completion);
+	percpu_ref_kill(&ca->io_ref);
+	wait_for_completion(&ca->io_ref_completion);
+
+	/* Remove both links created by bch2_dev_sysfs_online() */
+	if (ca->kobj.state_in_sysfs) {
+		sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+		sysfs_remove_link(&ca->kobj, "block");
+	}
+
+	bch2_free_super(&ca->disk_sb);
+	bch2_dev_journal_exit(ca);
+}
+
+/*
+ * Callback registered for ca->ref in __bch2_dev_alloc(): signals
+ * ca->ref_completion, which bch2_dev_remove() waits on after killing the ref.
+ */
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
+{
+	complete(&container_of(ref, struct bch_dev, ref)->ref_completion);
+}
+
+/*
+ * Callback registered for ca->io_ref in __bch2_dev_alloc(): signals
+ * ca->io_ref_completion, which __bch2_dev_offline() waits on after killing
+ * the ref.
+ */
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+{
+	complete(&container_of(ref, struct bch_dev, io_ref)->io_ref_completion);
+}
+
+/*
+ * Create the sysfs entries for member device @ca under filesystem @c: the
+ * "dev-N" directory if it does not exist yet and, when a block device is
+ * attached, a "bcachefs" link in the block device's directory plus a "block"
+ * link back to it.
+ *
+ * A no-op returning 0 while the filesystem itself is not in sysfs. Returns 0
+ * on success or the error from the failing sysfs call.
+ */
+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct kobject *bdev_dir;
+	int err;
+
+	if (!c->kobj.state_in_sysfs)
+		return 0;
+
+	if (!ca->kobj.state_in_sysfs) {
+		err = kobject_add(&ca->kobj, &c->kobj,
+				  "dev-%u", ca->dev_idx);
+		if (err)
+			return err;
+	}
+
+	/* No block device attached: nothing to link */
+	if (!ca->disk_sb.bdev)
+		return 0;
+
+	bdev_dir = bdev_kobj(ca->disk_sb.bdev);
+
+	err = sysfs_create_link(bdev_dir, &ca->kobj, "bcachefs");
+	if (err)
+		return err;
+
+	return sysfs_create_link(&ca->kobj, bdev_dir, "block");
+}
+
+/*
+ * Allocate and initialise a bch_dev from superblock member entry @member.
+ * The device is not attached to @c yet (see bch2_dev_attach()); @c is used
+ * for its options and for the bucket allocation.
+ *
+ * Returns the new device, or NULL on failure. After the initial kzalloc() any
+ * failure tears the partially built device down via bch2_dev_free().
+ */
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
+					struct bch_member *member)
+{
+	struct bch_dev *ca;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+	if (!ca)
+		return NULL;
+
+	kobject_init(&ca->kobj, &bch2_dev_ktype);
+	init_completion(&ca->ref_completion);
+	init_completion(&ca->io_ref_completion);
+
+	init_rwsem(&ca->bucket_lock);
+
+	writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
+
+	spin_lock_init(&ca->freelist_lock);
+	bch2_dev_copygc_init(ca);
+
+	INIT_WORK(&ca->io_error_work, bch2_io_error_work);
+
+	bch2_time_stats_init(&ca->io_latency[READ]);
+	bch2_time_stats_init(&ca->io_latency[WRITE]);
+
+	/* Cache a CPU-order copy of the member info and the device UUID */
+	ca->mi = bch2_mi_to_cpu(member);
+	ca->uuid = member->uuid;
+
+	/* An explicitly given discard option overrides the member's setting */
+	if (opt_defined(c->opts, discard))
+		ca->mi.discard = opt_get(c->opts, discard);
+
+	/*
+	 * ca->ref starts live; ca->io_ref starts dead and is revived by
+	 * __bch2_dev_attach_bdev() once a block device is attached:
+	 */
+	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
+			    0, GFP_KERNEL) ||
+	    percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+	    !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
+	    bch2_dev_buckets_alloc(c, ca) ||
+	    bioset_init(&ca->replica_set, 4,
+			offsetof(struct bch_write_bio, bio), 0) ||
+	    !(ca->io_done	= alloc_percpu(*ca->io_done)))
+		goto err;
+
+	return ca;
+err:
+	bch2_dev_free(ca);
+	return NULL;
+}
+
+/*
+ * Install @ca as member @dev_idx of filesystem @c: records the index and
+ * "dev-N" name, points ca->fs at @c and publishes the device in c->devs[].
+ * Failure to create the sysfs objects is only warned about.
+ */
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
+			    unsigned dev_idx)
+{
+	ca->dev_idx = dev_idx;
+	/* ca->self is a device mask with only this device's bit set */
+	__set_bit(ca->dev_idx, ca->self.d);
+	scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
+	ca->fs = c;
+	rcu_assign_pointer(c->devs[ca->dev_idx], ca);
+
+	if (bch2_dev_sysfs_online(c, ca))
+		pr_warn("error creating sysfs objects");
+}
+
+/*
+ * Allocate the bch_dev for member slot @dev_idx of @c's superblock and attach
+ * it to the filesystem.
+ *
+ * Returns 0 on success, -ENOMEM if allocation fails or the "dev_alloc" fault
+ * injection point fires.
+ */
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+{
+	struct bch_member *member =
+		bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
+	struct bch_dev *ca;
+	int ret = -ENOMEM;
+
+	pr_verbose_init(c->opts, "");
+
+	if (!bch2_fs_init_fault("dev_alloc")) {
+		ca = __bch2_dev_alloc(c, member);
+		if (ca) {
+			bch2_dev_attach(c, ca, dev_idx);
+			ret = 0;
+		}
+	}
+
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
+}
+
+/*
+ * Attach the block device behind superblock handle @sb to member device @ca.
+ *
+ * Fails with -EINVAL if @ca already has a device online or if the block
+ * device is smaller than the member's bucket_size * nbuckets, and passes on
+ * any error from bch2_dev_journal_init().
+ *
+ * On success the handle is moved into ca->disk_sb and @sb is zeroed, so the
+ * caller no longer owns it, and ca->io_ref is brought back to life. On
+ * failure @sb is untouched and still belongs to the caller.
+ */
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
+{
+	/* Signed: carries the negative errno from bch2_dev_journal_init() */
+	int ret;
+
+	if (bch2_dev_is_online(ca)) {
+		bch_err(ca, "already have device online in slot %u",
+			sb->sb->dev_idx);
+		return -EINVAL;
+	}
+
+	if (get_capacity(sb->bdev->bd_disk) <
+	    ca->mi.bucket_size * ca->mi.nbuckets) {
+		bch_err(ca, "cannot online: device too small");
+		return -EINVAL;
+	}
+
+	BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
+	ret = bch2_dev_journal_init(ca, sb->sb);
+	if (ret)
+		return ret;
+
+	/* Commit: */
+	ca->disk_sb = *sb;
+	if (sb->mode & FMODE_EXCL)
+		ca->disk_sb.bdev->bd_holder = ca;
+	memset(sb, 0, sizeof(*sb));
+
+	percpu_ref_reinit(&ca->io_ref);
+
+	return 0;
+}
+
+/*
+ * Attach the block device behind @sb to the member of @c named by the
+ * superblock's dev_idx. Caller must hold c->state_lock; c->sb_lock is taken
+ * internally, so it must not be held by the caller.
+ *
+ * The member slot must already be populated (BUG otherwise). Returns 0 on
+ * success or the error from __bch2_dev_attach_bdev(), in which case @sb still
+ * belongs to the caller.
+ */
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	lockdep_assert_held(&c->state_lock);
+
+	/* Adopt this device's superblock if it is newer than the one we hold */
+	if (le64_to_cpu(sb->sb->seq) >
+	    le64_to_cpu(c->disk_sb.sb->seq))
+		bch2_sb_to_fs(c, sb->sb);
+
+	BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+	       !c->devs[sb->sb->dev_idx]);
+
+	ca = bch_dev_locked(c, sb->sb->dev_idx);
+
+	ret = __bch2_dev_attach_bdev(ca, sb);
+	if (ret)
+		return ret;
+
+	/*
+	 * If alloc info has already been read and this device has no
+	 * superblock buckets accounted yet, mark them now:
+	 */
+	if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
+	    !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) {
+		mutex_lock(&c->sb_lock);
+		bch2_mark_dev_superblock(ca->fs, ca, 0);
+		mutex_unlock(&c->sb_lock);
+	}
+
+	/* Return value ignored: sysfs failure does not fail the attach */
+	bch2_dev_sysfs_online(c, ca);
+
+	/* A single-device filesystem is named after its block device */
+	if (c->sb.nr_devices == 1)
+		bdevname(ca->disk_sb.bdev, c->name);
+	bdevname(ca->disk_sb.bdev, ca->name);
+
+	rebalance_wakeup(c);
+	return 0;
+}
+
+/* Device management: */
+
+/*
+ * Decide whether member device @ca may be moved to @new_state. Caller must
+ * hold c->state_lock. @flags are BCH_FORCE_IF_* bits that relax the checks.
+ *
+ * Note: this function is also used by the error paths - when a particular
+ * device sees an error, we call it to determine whether we can just set the
+ * device RO, or - if this function returns false - we'll set the whole
+ * filesystem RO:
+ *
+ * XXX: maybe we should be more explicit about whether we're changing state
+ * because we got an error or what have you?
+ */
+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
+			    enum bch_member_state new_state, int flags)
+{
+	struct bch_devs_mask new_online_devs;
+	struct replicas_status s;
+	struct bch_dev *ca2;
+	int i, nr_rw = 0, required;
+
+	lockdep_assert_held(&c->state_lock);
+
+	switch (new_state) {
+	case BCH_MEMBER_STATE_RW:
+		/* Going read-write is always allowed */
+		return true;
+	case BCH_MEMBER_STATE_RO:
+		/* Only leaving RW can reduce the number of writable devices */
+		if (ca->mi.state != BCH_MEMBER_STATE_RW)
+			return true;
+
+		/* do we have enough devices to write to?  */
+		for_each_member_device(ca2, c, i)
+			if (ca2 != ca)
+				nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+
+		/*
+		 * The larger of the metadata and data replica counts; with the
+		 * matching FORCE_IF_*_DEGRADED flag only the "required"
+		 * minimum has to be met:
+		 */
+		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+			       ? c->opts.metadata_replicas
+			       : c->opts.metadata_replicas_required,
+			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+			       ? c->opts.data_replicas
+			       : c->opts.data_replicas_required);
+
+		return nr_rw >= required;
+	case BCH_MEMBER_STATE_FAILED:
+	case BCH_MEMBER_STATE_SPARE:
+		/* Device is already neither RW nor RO: nothing is lost */
+		if (ca->mi.state != BCH_MEMBER_STATE_RW &&
+		    ca->mi.state != BCH_MEMBER_STATE_RO)
+			return true;
+
+		/* do we have enough devices to read from?  */
+		new_online_devs = bch2_online_devs(c);
+		__clear_bit(ca->dev_idx, new_online_devs.d);
+
+		s = __bch2_replicas_status(c, new_online_devs);
+
+		return bch2_have_enough_devs(s, flags);
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Decide whether enough member devices are present to start the filesystem.
+ *
+ * Without the "degraded" option every member that exists in the superblock
+ * and is in state RW or RO must be online. In either case the replicas
+ * status must report enough devices (degraded operation being forced when
+ * the option is set).
+ */
+static bool bch2_fs_may_start(struct bch_fs *c)
+{
+	struct bch_sb_field_members *mi;
+	struct bch_dev *ca;
+	unsigned idx, flags = 0;
+	bool missing = false;
+
+	if (c->opts.degraded) {
+		flags = BCH_FORCE_IF_DEGRADED;
+	} else {
+		mutex_lock(&c->sb_lock);
+		mi = bch2_sb_get_members(c->disk_sb.sb);
+
+		for (idx = 0; idx < c->disk_sb.sb->nr_devices; idx++) {
+			if (!bch2_dev_exists(c->disk_sb.sb, mi, idx))
+				continue;
+
+			ca = bch_dev_locked(c, idx);
+
+			if (bch2_dev_is_online(ca))
+				continue;
+
+			/* An offline device that should be usable blocks startup */
+			if (ca->mi.state == BCH_MEMBER_STATE_RW ||
+			    ca->mi.state == BCH_MEMBER_STATE_RO) {
+				missing = true;
+				break;
+			}
+		}
+		mutex_unlock(&c->sb_lock);
+
+		if (missing)
+			return false;
+	}
+
+	return bch2_have_enough_devs(bch2_replicas_status(c), flags);
+}
+
+/*
+ * Stop everything that writes to @ca: copygc, the allocator thread (and the
+ * device's membership in the allocator) and journalling to the device.
+ */
+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
+{
+	bch2_copygc_stop(ca);
+
+	/*
+	 * The allocator thread itself allocates btree nodes, so stop it first:
+	 */
+	bch2_dev_allocator_stop(ca);
+	bch2_dev_allocator_remove(c, ca);
+	bch2_dev_journal_stop(&c->journal, ca);
+}
+
+/*
+ * Make @ca writable: add it to the allocator, recalculate capacity and start
+ * its allocator and copygc threads. Caller must hold c->state_lock and the
+ * device's member state must already be RW (BUG otherwise).
+ *
+ * Returns NULL on success or a static error string. Nothing is rolled back
+ * here when a thread fails to start.
+ */
+static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+{
+	lockdep_assert_held(&c->state_lock);
+
+	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
+
+	bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
+	if (bch2_dev_allocator_start(ca))
+		return "error starting allocator thread";
+
+	if (bch2_copygc_start(c, ca))
+		return "error starting copygc thread";
+
+	return NULL;
+}
+
+/*
+ * Change the member state of @ca to @new_state and persist it in the
+ * superblock.
+ *
+ * Returns 0 if the device is already in that state or the change succeeded,
+ * -EINVAL if bch2_dev_state_allowed() refuses it, and -ENOMEM if the device
+ * could not be brought read-write (the new state has already been written to
+ * the superblock at that point).
+ *
+ * NOTE(review): bch2_dev_state_allowed() asserts c->state_lock, so callers
+ * are expected to hold it - see bch2_dev_set_state().
+ */
+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+			 enum bch_member_state new_state, int flags)
+{
+	struct bch_sb_field_members *mi;
+	int ret = 0;
+
+	if (ca->mi.state == new_state)
+		return 0;
+
+	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
+		return -EINVAL;
+
+	/* Stop writers before recording any state other than RW */
+	if (new_state != BCH_MEMBER_STATE_RW)
+		__bch2_dev_read_only(c, ca);
+
+	bch_notice(ca, "%s", bch2_dev_state[new_state]);
+
+	mutex_lock(&c->sb_lock);
+	mi = bch2_sb_get_members(c->disk_sb.sb);
+	SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	/* Any failure string from __bch2_dev_read_write() maps to -ENOMEM */
+	if (new_state == BCH_MEMBER_STATE_RW &&
+	    __bch2_dev_read_write(c, ca))
+		ret = -ENOMEM;
+
+	rebalance_wakeup(c);
+
+	return ret;
+}
+
+/*
+ * Locked wrapper around __bch2_dev_set_state(): performs the state change
+ * with c->state_lock held and returns its result.
+ */
+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+		       enum bch_member_state new_state, int flags)
+{
+	int err;
+
+	mutex_lock(&c->state_lock);
+	err = __bch2_dev_set_state(c, ca, new_state, flags);
+	mutex_unlock(&c->state_lock);
+	return err;
+}
+
+/* Device add/removal: */
+
+/*
+ * Permanently remove member device @ca from filesystem @c.
+ *
+ * Drops one reference on ca->ref on entry.
+ * NOTE(review): presumably the reference the caller obtained from
+ * bch2_dev_lookup() - confirm against the callers.
+ *
+ * The device is made read-only, its data is dropped, journal pins for it are
+ * flushed and its alloc btree keys deleted; then it is taken offline, freed
+ * and its slot in the superblock member array is cleared.
+ *
+ * Returns 0 on success (@ca has been freed by then) or a negative errno. On
+ * failure the device is put back read-write if its state is RW and it is
+ * still online.
+ */
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+	struct bch_sb_field_members *mi;
+	/* dev_idx is saved now because ca is freed before its last use below */
+	unsigned dev_idx = ca->dev_idx, data;
+	int ret = -EINVAL;
+
+	mutex_lock(&c->state_lock);
+
+	percpu_ref_put(&ca->ref); /* XXX */
+
+	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+		bch_err(ca, "Cannot remove without losing data");
+		goto err;
+	}
+
+	__bch2_dev_read_only(c, ca);
+
+	/*
+	 * XXX: verify that dev_idx is really not in use anymore, anywhere
+	 *
+	 * flag_data_bad() does not check btree pointers
+	 */
+	ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
+	if (ret) {
+		bch_err(ca, "Remove failed: error %i dropping data", ret);
+		goto err;
+	}
+
+	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+	if (ret) {
+		bch_err(ca, "Remove failed: error %i flushing journal", ret);
+		goto err;
+	}
+
+	/* Refuse to continue while any data type is still on the device */
+	data = bch2_dev_has_data(c, ca);
+	if (data) {
+		char data_has_str[100];
+
+		bch2_flags_to_text(&PBUF(data_has_str),
+				   bch2_data_types, data);
+		bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Delete the alloc btree keys belonging to this device */
+	ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+				      POS(ca->dev_idx, 0),
+				      POS(ca->dev_idx + 1, 0),
+				      NULL);
+	if (ret) {
+		bch_err(ca, "Remove failed, error deleting alloc info");
+		goto err;
+	}
+
+	/*
+	 * must flush all existing journal entries, they might have
+	 * (overwritten) keys that point to the device we're removing:
+	 */
+	bch2_journal_flush_all_pins(&c->journal);
+	ret = bch2_journal_error(&c->journal);
+	if (ret) {
+		bch_err(ca, "Remove failed, journal error");
+		goto err;
+	}
+
+	__bch2_dev_offline(c, ca);
+
+	/* Unpublish the device so no new lookups can find it */
+	mutex_lock(&c->sb_lock);
+	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+	mutex_unlock(&c->sb_lock);
+
+	/* Wait for bch2_dev_ref_complete() before freeing */
+	percpu_ref_kill(&ca->ref);
+	wait_for_completion(&ca->ref_completion);
+
+	bch2_dev_free(ca);
+
+	/*
+	 * Free this device's slot in the bch_member array - all pointers to
+	 * this device must be gone:
+	 */
+	mutex_lock(&c->sb_lock);
+	mi = bch2_sb_get_members(c->disk_sb.sb);
+	memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+
+	bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+	mutex_unlock(&c->state_lock);
+	return 0;
+err:
+	/* Undo the __bch2_dev_read_only() above if the device is still usable */
+	if (ca->mi.state == BCH_MEMBER_STATE_RW &&
+	    !percpu_ref_is_zero(&ca->io_ref))
+		__bch2_dev_read_write(c, ca);
+	mutex_unlock(&c->state_lock);
+	return ret;
+}
+
+/*
+ * Reset the accounting of @ca: zero its percpu usage counters (usage[0]) and
+ * every entry of its bucket array. Used by bch2_dev_add() to discard the
+ * marks made while allocating the journal on a not-yet-attached device.
+ */
+static void dev_usage_clear(struct bch_dev *ca)
+{
+	struct bucket_array *buckets;
+
+	percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
+
+	/* The bucket array is accessed with ca->bucket_lock held for read */
+	down_read(&ca->bucket_lock);
+	buckets = bucket_array(ca);
+
+	memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
+	up_read(&ca->bucket_lock);
+}
+
+/* Add new device to running filesystem: */
+/*
+ * Add the device at @path to running filesystem @c as a new member.
+ *
+ * Reads and validates the device's superblock, allocates a bch_dev for it,
+ * allocates its journal, picks a free slot in the filesystem's member array,
+ * attaches the device and writes the superblock. If the new member's state is
+ * RW it is also brought read-write.
+ *
+ * Returns 0 on success or a negative errno. Every failure after the
+ * superblock has been read releases the superblock handle and the
+ * half-constructed device and logs the reason. A failure to go read-write
+ * after the device has been attached returns -EINVAL and leaves the device a
+ * member of the filesystem.
+ */
+int bch2_dev_add(struct bch_fs *c, const char *path)
+{
+	struct bch_opts opts = bch2_opts_empty();
+	struct bch_sb_handle sb;
+	const char *err;
+	struct bch_dev *ca = NULL;
+	struct bch_sb_field_members *mi;
+	struct bch_member dev_mi;
+	unsigned dev_idx, nr_devices, u64s;
+	int ret;
+
+	ret = bch2_read_super(path, &opts, &sb);
+	if (ret)
+		return ret;
+
+	/*
+	 * From here on @sb must be released with bch2_free_super() on every
+	 * failure, so all error exits go through the err label:
+	 */
+	err = bch2_sb_validate(&sb);
+	if (err) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+
+	err = bch2_dev_may_add(sb.sb, c);
+	if (err) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	err = "cannot allocate memory";
+	ret = -ENOMEM;
+	ca = __bch2_dev_alloc(c, &dev_mi);
+	if (!ca)
+		goto err;
+
+	/* On success @sb is moved into ca->disk_sb and zeroed */
+	err = "error attaching block device";
+	ret = __bch2_dev_attach_bdev(ca, &sb);
+	if (ret)
+		goto err;
+
+	/*
+	 * We want to allocate journal on the new device before adding the new
+	 * device to the filesystem because allocating after we attach requires
+	 * spinning up the allocator thread, and the allocator thread requires
+	 * doing btree writes, which if the existing devices are RO isn't going
+	 * to work
+	 *
+	 * So we have to mark where the superblocks are, but marking allocated
+	 * data normally updates the filesystem usage too, so we have to mark,
+	 * allocate the journal, reset all the marks, then remark after we
+	 * attach...
+	 */
+	bch2_mark_dev_superblock(ca->fs, ca, 0);
+
+	err = "journal alloc failed";
+	ret = bch2_dev_journal_alloc(ca);
+	if (ret)
+		goto err;
+
+	dev_usage_clear(ca);
+
+	mutex_lock(&c->state_lock);
+	mutex_lock(&c->sb_lock);
+
+	err = "insufficient space in new superblock";
+	ret = bch2_sb_from_fs(c, ca);
+	if (ret)
+		goto err_unlock;
+
+	mi = bch2_sb_get_members(ca->disk_sb.sb);
+
+	/* Make room for one more member entry in the new device's superblock */
+	if (!bch2_sb_resize_members(&ca->disk_sb,
+				le32_to_cpu(mi->field.u64s) +
+				sizeof(dev_mi) / sizeof(u64))) {
+		ret = -ENOSPC;
+		goto err_unlock;
+	}
+
+	if (dynamic_fault("bcachefs:add:no_slot"))
+		goto no_slot;
+
+	/* Find the first unused slot in the filesystem's member array */
+	mi = bch2_sb_get_members(c->disk_sb.sb);
+	for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
+		if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
+			goto have_slot;
+no_slot:
+	err = "no slots available in superblock";
+	ret = -ENOSPC;
+	goto err_unlock;
+
+have_slot:
+	nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+	u64s = (sizeof(struct bch_sb_field_members) +
+		sizeof(struct bch_member) * nr_devices) / sizeof(u64);
+
+	err = "no space in superblock for member info";
+	ret = -ENOSPC;
+
+	mi = bch2_sb_resize_members(&c->disk_sb, u64s);
+	if (!mi)
+		goto err_unlock;
+
+	/* success: */
+
+	mi->members[dev_idx] = dev_mi;
+	mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds());
+	c->disk_sb.sb->nr_devices	= nr_devices;
+
+	ca->disk_sb.sb->dev_idx	= dev_idx;
+	bch2_dev_attach(c, ca, dev_idx);
+
+	bch2_mark_dev_superblock(c, ca, 0);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+		err = __bch2_dev_read_write(c, ca);
+		if (err)
+			goto err_late;
+	}
+
+	mutex_unlock(&c->state_lock);
+	return 0;
+
+err_unlock:
+	mutex_unlock(&c->sb_lock);
+	mutex_unlock(&c->state_lock);
+err:
+	if (ca)
+		bch2_dev_free(ca);
+	bch2_free_super(&sb);
+	bch_err(c, "Unable to add device: %s", err);
+	return ret;
+err_late:
+	/*
+	 * The device is attached and owned by the filesystem now, so it is
+	 * not freed here - but state_lock is still held and must be dropped:
+	 */
+	mutex_unlock(&c->state_lock);
+	bch_err(c, "Error going rw after adding device: %s", err);
+	return -EINVAL;
+}
+
+/* Hot add existing device to running filesystem: */
+/*
+ * Bring an existing member device at @path back online in running
+ * filesystem @c.
+ *
+ * Returns 0 on success, the error from bch2_read_super() if the superblock
+ * cannot be read, and -EINVAL (after logging the reason) for any later
+ * failure.
+ */
+int bch2_dev_online(struct bch_fs *c, const char *path)
+{
+	struct bch_opts opts = bch2_opts_empty();
+	struct bch_sb_handle sb = { NULL };
+	struct bch_sb_field_members *mi;
+	struct bch_dev *ca;
+	unsigned dev_idx;
+	const char *err;
+	int ret;
+
+	mutex_lock(&c->state_lock);
+
+	ret = bch2_read_super(path, &opts, &sb);
+	if (ret) {
+		mutex_unlock(&c->state_lock);
+		return ret;
+	}
+
+	/* Remember the index now: attaching zeroes the handle on success */
+	dev_idx = sb.sb->dev_idx;
+
+	err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+	if (err)
+		goto err;
+
+	if (bch2_dev_attach_bdev(c, &sb)) {
+		err = "bch2_dev_attach_bdev() error";
+		goto err;
+	}
+
+	ca = bch_dev_locked(c, dev_idx);
+	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+		err = __bch2_dev_read_write(c, ca);
+		if (err)
+			goto err;
+	}
+
+	/* Record the time the device came online and persist it */
+	mutex_lock(&c->sb_lock);
+	mi = bch2_sb_get_members(c->disk_sb.sb);
+
+	mi->members[ca->dev_idx].last_mount =
+		cpu_to_le64(ktime_get_real_seconds());
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	mutex_unlock(&c->state_lock);
+	return 0;
+err:
+	mutex_unlock(&c->state_lock);
+	bch2_free_super(&sb);
+	bch_err(c, "error bringing %s online: %s", path, err);
+	return -EINVAL;
+}
+
+/*
+ * Take member device @ca offline under c->state_lock.
+ *
+ * Returns 0 if the device was taken offline or was already offline (which is
+ * logged), -EINVAL if bch2_dev_state_allowed() says the filesystem cannot do
+ * without it given @flags.
+ */
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+	int ret = 0;
+
+	mutex_lock(&c->state_lock);
+
+	if (!bch2_dev_is_online(ca)) {
+		bch_err(ca, "Already offline");
+	} else if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+		bch_err(ca, "Cannot offline required disk");
+		ret = -EINVAL;
+	} else {
+		__bch2_dev_offline(c, ca);
+	}
+
+	mutex_unlock(&c->state_lock);
+	return ret;
+}
+
+/*
+ * Grow member device @ca to @nbuckets buckets and persist the new size in
+ * the superblock. Shrinking is not supported.
+ *
+ * Returns 0 on success, -EINVAL if @nbuckets is smaller than the current size
+ * or (for an online device) larger than the block device can hold, or the
+ * error from bch2_dev_buckets_resize().
+ */
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+	struct bch_member *mi;
+	int ret = 0;
+
+	mutex_lock(&c->state_lock);
+
+	if (nbuckets < ca->mi.nbuckets) {
+		bch_err(ca, "Cannot shrink yet");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* The capacity check is only possible while a bdev is attached */
+	if (bch2_dev_is_online(ca) &&
+	    get_capacity(ca->disk_sb.bdev->bd_disk) <
+	    ca->mi.bucket_size * nbuckets) {
+		bch_err(ca, "New size larger than device");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
+	if (ret) {
+		bch_err(ca, "Resize error: %i", ret);
+		goto err;
+	}
+
+	/* Record the new bucket count in the member entry and write it out */
+	mutex_lock(&c->sb_lock);
+	mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+	mi->nbuckets = cpu_to_le64(nbuckets);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	bch2_recalc_capacity(c);
+err:
+	mutex_unlock(&c->state_lock);
+	return ret;
+}
+
+/*
+ * Find the member device of @c whose block device is the one named by @path.
+ *
+ * Returns the device with a ref on ca->ref, or an ERR_PTR(): the error from
+ * lookup_bdev() if @path cannot be resolved, -ENOENT if no online member
+ * uses that block device.
+ */
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+{
+
+	struct bch_dev *ca;
+	dev_t dev;
+	unsigned i;
+	int ret;
+
+	ret = lookup_bdev(path, &dev);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/*
+	 * Members without an attached block device (offline devices) have
+	 * nothing to compare against and must be skipped rather than
+	 * dereferenced - the other walkers in this file check
+	 * ca->disk_sb.bdev the same way:
+	 */
+	for_each_member_device(ca, c, i)
+		if (ca->disk_sb.bdev &&
+		    ca->disk_sb.bdev->bd_dev == dev)
+			goto found;
+
+	ca = ERR_PTR(-ENOENT);
+found:
+	return ca;
+}
+
+/* Filesystem open: */
+
+/*
+ * Open a filesystem from the complete list of its member devices.
+ *
+ * Reads and validates the superblock of every path in @devices, uses the one
+ * with the highest sequence number to allocate the filesystem, attaches every
+ * device and - unless the nostart option is set - starts it.
+ *
+ * Returns the filesystem, or an ERR_PTR() on failure. A module reference is
+ * held for the duration of the call only.
+ */
+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
+			    struct bch_opts opts)
+{
+	struct bch_sb_handle *sb = NULL;
+	struct bch_fs *c = NULL;
+	unsigned i, best_sb = 0;
+	const char *err;
+	int ret = -ENOMEM;
+
+	pr_verbose_init(opts, "");
+
+	if (!nr_devices) {
+		c = ERR_PTR(-EINVAL);
+		goto out2;
+	}
+
+	if (!try_module_get(THIS_MODULE)) {
+		c = ERR_PTR(-ENODEV);
+		goto out2;
+	}
+
+	sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
+	if (!sb)
+		goto err;
+
+	for (i = 0; i < nr_devices; i++) {
+		ret = bch2_read_super(devices[i], &opts, &sb[i]);
+		if (ret)
+			goto err;
+
+		err = bch2_sb_validate(&sb[i]);
+		if (err)
+			goto err_print;
+	}
+
+	/* The superblock with the highest seq is the authoritative one */
+	for (i = 1; i < nr_devices; i++)
+		if (le64_to_cpu(sb[i].sb->seq) >
+		    le64_to_cpu(sb[best_sb].sb->seq))
+			best_sb = i;
+
+	for (i = 0; i < nr_devices; i++) {
+		err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+		if (err)
+			goto err_print;
+	}
+
+	ret = -ENOMEM;
+	c = bch2_fs_alloc(sb[best_sb].sb, opts);
+	if (!c)
+		goto err;
+
+	err = "bch2_dev_online() error";
+	mutex_lock(&c->state_lock);
+	for (i = 0; i < nr_devices; i++)
+		if (bch2_dev_attach_bdev(c, &sb[i])) {
+			mutex_unlock(&c->state_lock);
+			goto err_print;
+		}
+	mutex_unlock(&c->state_lock);
+
+	err = "insufficient devices";
+	if (!bch2_fs_may_start(c))
+		goto err_print;
+
+	if (!c->opts.nostart) {
+		ret = bch2_fs_start(c);
+		if (ret)
+			goto err;
+	}
+out:
+	kfree(sb);
+	module_put(THIS_MODULE);
+out2:
+	pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
+	return c;
+err_print:
+	pr_err("bch_fs_open err opening %s: %s",
+	       devices[0], err);
+	ret = -EINVAL;
+err:
+	if (c)
+		bch2_fs_stop(c);
+	/*
+	 * sb is NULL when the kcalloc() above failed; without this check the
+	 * loop would hand &sb[0], i.e. NULL, to bch2_free_super():
+	 */
+	if (sb)
+		for (i = 0; i < nr_devices; i++)
+			bch2_free_super(&sb[i]);
+	c = ERR_PTR(ret);
+	goto out;
+}
+
+/*
+ * Add one device to a filesystem that is being assembled a device at a time.
+ *
+ * If a filesystem with the superblock's UUID is already known the device is
+ * attached to it, otherwise a new filesystem is allocated from @sb first.
+ * Once bch2_fs_may_start() is satisfied (and nostart is not set) the
+ * filesystem is started.
+ *
+ * Returns NULL on success or a static error string. A filesystem allocated
+ * by this call is stopped again on failure.
+ */
+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
+					      struct bch_opts opts)
+{
+	const char *err;
+	struct bch_fs *c;
+	bool allocated_fs = false;
+
+	err = bch2_sb_validate(sb);
+	if (err)
+		return err;
+
+	mutex_lock(&bch_fs_list_lock);
+	c = __bch2_uuid_to_fs(sb->sb->uuid);
+	if (c) {
+		/* Pin the existing filesystem while we work on it */
+		closure_get(&c->cl);
+
+		err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
+		if (err)
+			goto err;
+	} else {
+		/*
+		 * bch2_fs_alloc() takes bch_fs_list_lock itself around
+		 * bch2_fs_online(), so it must be called with the lock
+		 * dropped. If someone else opens the same UUID in the
+		 * meantime bch2_fs_online() fails and we report an error.
+		 */
+		mutex_unlock(&bch_fs_list_lock);
+		c = bch2_fs_alloc(sb->sb, opts);
+		mutex_lock(&bch_fs_list_lock);
+
+		err = "cannot allocate memory";
+		if (!c)
+			goto err;
+
+		allocated_fs = true;
+	}
+
+	err = "bch2_dev_online() error";
+
+	/*
+	 * bch2_dev_attach_bdev() requires state_lock (it asserts it) and
+	 * takes sb_lock internally, so state_lock is the lock to hold here:
+	 */
+	mutex_lock(&c->state_lock);
+	if (bch2_dev_attach_bdev(c, sb)) {
+		mutex_unlock(&c->state_lock);
+		goto err;
+	}
+	mutex_unlock(&c->state_lock);
+
+	if (!c->opts.nostart && bch2_fs_may_start(c)) {
+		err = "error starting filesystem";
+		if (bch2_fs_start(c))
+			goto err;
+	}
+
+	/* Only drop the reference if we took one above */
+	if (!allocated_fs)
+		closure_put(&c->cl);
+	mutex_unlock(&bch_fs_list_lock);
+
+	return NULL;
+err:
+	mutex_unlock(&bch_fs_list_lock);
+
+	if (allocated_fs)
+		bch2_fs_stop(c);
+	else if (c)
+		closure_put(&c->cl);
+
+	return err;
+}
+
+/*
+ * Read the superblock of the device at @path and hand it to
+ * __bch2_fs_open_incremental(). The superblock handle is released afterwards
+ * in either case.
+ *
+ * Returns NULL on success or a static error string.
+ */
+const char *bch2_fs_open_incremental(const char *path)
+{
+	struct bch_opts opts = bch2_opts_empty();
+	struct bch_sb_handle sb;
+	const char *result;
+
+	if (bch2_read_super(path, &opts, &sb))
+		return "error reading superblock";
+
+	result = __bch2_fs_open_incremental(&sb, opts);
+
+	bch2_free_super(&sb);
+	return result;
+}
+
+/* Global interfaces/init */
+
+/*
+ * Module teardown: undo bcachefs_init() in reverse order. Also called from
+ * bcachefs_init() when initialisation fails part-way, which is why the kset
+ * is checked before it is unregistered.
+ */
+static void bcachefs_exit(void)
+{
+	bch2_debug_exit();
+	bch2_vfs_exit();
+	bch2_chardev_exit();
+	if (bcachefs_kset)
+		kset_unregister(bcachefs_kset);
+}
+
+/*
+ * Module initialisation: run the bkey and inode packing self tests, then
+ * create the "bcachefs" kset under fs_kobj and bring up the chardev, VFS and
+ * debug layers in that order.
+ *
+ * Returns 0 on success. If any step fails, everything is torn down again via
+ * bcachefs_exit() and -ENOMEM is returned.
+ */
+static int __init bcachefs_init(void)
+{
+	bch2_bkey_pack_test();
+	bch2_inode_pack_test();
+
+	bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj);
+	if (!bcachefs_kset)
+		goto fail;
+
+	if (bch2_chardev_init())
+		goto fail;
+
+	if (bch2_vfs_init())
+		goto fail;
+
+	if (bch2_debug_init())
+		goto fail;
+
+	return 0;
+fail:
+	bcachefs_exit();
+	return -ENOMEM;
+}
+
+/*
+ * Define one bool module parameter per debug knob: each use of
+ * BCH_DEBUG_PARAM() emits a bch2_<name> variable, exposes it as module
+ * parameter <name> with mode 0644 and attaches its description.
+ *
+ * NOTE(review): BCH_DEBUG_PARAMS() is defined elsewhere and presumably
+ * invokes BCH_DEBUG_PARAM() once per knob - confirm at its definition.
+ */
+#define BCH_DEBUG_PARAM(name, description)			\
+	bool bch2_##name;					\
+	module_param_named(name, bch2_##name, bool, 0644);	\
+	MODULE_PARM_DESC(name, description);
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+/* Register the module's exit and init functions */
+module_exit(bcachefs_exit);
+module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
new file mode 100644
index 000000000000..9204e8fdabdd
--- /dev/null
+++ b/fs/bcachefs/super.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_H
+#define _BCACHEFS_SUPER_H
+
+#include "extents.h"
+
+#include "bcachefs_ioctl.h"
+
+#include <linux/math64.h>
+
+/* Index of the bucket on @ca that contains sector @s (s / bucket_size). */
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+	return div_u64(s, ca->mi.bucket_size);
+}
+
+/* First sector of bucket @b on @ca (b * bucket_size, computed as sector_t). */
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+	return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+/* Offset of sector @s within its bucket on @ca (s % bucket_size). */
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+	u32 remainder;
+
+	div_u64_rem(s, ca->mi.bucket_size, &remainder);
+	return remainder;
+}
+
+/* A device counts as online while its io_ref percpu refcount is non-zero. */
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+	return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+/* True when @ca is online and its member state is not FAILED. */
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+	if (!bch2_dev_is_online(ca))
+		return false;
+
+	return ca->mi.state != BCH_MEMBER_STATE_FAILED;
+}
+
+/*
+ * Try to take a reference on @ca->io_ref for an I/O in direction @rw.
+ *
+ * The reference is kept (and true returned) only if the device is in the RW
+ * state, or in the RO state and @rw is READ.  In every other case no
+ * reference is held on return and false is returned.
+ */
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+	bool usable;
+
+	if (!percpu_ref_tryget(&ca->io_ref))
+		return false;
+
+	usable = ca->mi.state == BCH_MEMBER_STATE_RW ||
+		(ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ);
+
+	/* State does not permit this I/O: give the reference back. */
+	if (!usable)
+		percpu_ref_put(&ca->io_ref);
+
+	return usable;
+}
+
+/* Number of bits set in @devs, counted over BCH_SB_MEMBERS_MAX bits. */
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+	return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+/* True if device index @dev appears among the first devs.nr entries of @devs. */
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+					 unsigned dev)
+{
+	unsigned idx = 0;
+
+	while (idx < devs.nr) {
+		if (devs.devs[idx] == dev)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
+/*
+ * Remove the first entry equal to @dev from @devs, if there is one; the list
+ * is left untouched otherwise.
+ */
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+					  unsigned dev)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < devs->nr; idx++) {
+		if (devs->devs[idx] != dev)
+			continue;
+
+		array_remove_item(devs->devs, devs->nr, idx);
+		return;
+	}
+}
+
+/*
+ * Append device index @dev to @devs.
+ *
+ * BUGs if @dev is already in the list or if the list already holds
+ * BCH_REPLICAS_MAX entries.
+ */
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+					 unsigned dev)
+{
+	BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+	BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+	devs->devs[devs->nr++] = dev;
+}
+
+/* Build a device list containing only @dev. */
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+	return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
+/*
+ * Advance *@iter to the next index below c->sb.nr_devices that has a non-NULL
+ * entry in c->devs[] and return that device; returns NULL once *@iter
+ * reaches c->sb.nr_devices.  When @mask is non-NULL, only indices whose bit
+ * is set in it are visited.
+ *
+ * c->devs[] is read with rcu_dereference_check(), with holding c->state_lock
+ * given as the alternative to being in an RCU read-side section.
+ */
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
+					      const struct bch_devs_mask *mask)
+{
+	struct bch_dev *ca = NULL;
+
+	/* With a mask, first jump to the next set bit at or after *iter. */
+	while ((*iter = mask
+		? find_next_bit(mask->d, c->sb.nr_devices, *iter)
+		: *iter) < c->sb.nr_devices &&
+	       !(ca = rcu_dereference_check(c->devs[*iter],
+					    lockdep_is_held(&c->state_lock))))
+		(*iter)++;
+
+	return ca;
+}
+
+/*
+ * Iterate over the member devices of @c (restricted to @mask when it is
+ * non-NULL) using __bch2_next_dev().  No reference is taken on @ca here.
+ */
+#define __for_each_member_device(ca, c, iter, mask)			\
+	for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+
+/* Same loop as __for_each_member_device() under a separate name. */
+#define for_each_member_device_rcu(ca, c, iter, mask)			\
+	__for_each_member_device(ca, c, iter, mask)
+
+/*
+ * Find the next member device at or after *@iter and take a reference on its
+ * ->ref before leaving the RCU read-side section.  Returns NULL (with no
+ * reference taken) when there are no more devices.
+ */
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+{
+	struct bch_dev *ca;
+
+	rcu_read_lock();
+	ca = __bch2_next_dev(c, iter, NULL);
+	if (ca)
+		percpu_ref_get(&ca->ref);
+	rcu_read_unlock();
+
+	return ca;
+}
+
+/*
+ * Iterate over every member device of @c.  A reference on ca->ref is held
+ * while the loop body runs and is dropped when the loop advances.
+ *
+ * If you break early, you must drop your ref on the current device
+ */
+#define for_each_member_device(ca, c, iter)				\
+	for ((iter) = 0;						\
+	     ((ca) = bch2_get_next_dev((c), &(iter)));			\
+	     percpu_ref_put(&(ca)->ref), (iter)++)
+
+/*
+ * Find the next member device at or after *@iter whose state bit
+ * (1 << ca->mi.state) is set in @state_mask and on which an io_ref could be
+ * taken with percpu_ref_tryget().  The returned device has that io_ref held;
+ * returns NULL when no further device qualifies.
+ */
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+						      unsigned *iter,
+						      int state_mask)
+{
+	struct bch_dev *ca;
+
+	rcu_read_lock();
+	/* Skip devices in the wrong state or whose io_ref can't be taken. */
+	while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+	       (!((1 << ca->mi.state) & state_mask) ||
+		!percpu_ref_tryget(&ca->io_ref)))
+		(*iter)++;
+	rcu_read_unlock();
+
+	return ca;
+}
+
+/*
+ * Iterate over the member devices of @c whose state is selected by
+ * @state_mask and whose io_ref could be taken.  The io_ref is held while the
+ * loop body runs and is dropped when the loop advances, so leaving the loop
+ * early leaves the caller holding the io_ref on the current device.
+ */
+#define __for_each_online_member(ca, c, iter, state_mask)		\
+	for ((iter) = 0;						\
+	     ((ca) = bch2_get_next_online_dev((c), &(iter), (state_mask)));\
+	     percpu_ref_put(&(ca)->io_ref), (iter)++)
+
+/* Every device with an io_ref available, whatever its member state. */
+#define for_each_online_member(ca, c, iter)				\
+	__for_each_online_member(ca, c, iter, ~0)
+
+/* Only devices in the RW member state. */
+#define for_each_rw_member(ca, c, iter)					\
+	__for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW)
+
+/* Devices in the RW or RO member state. */
+#define for_each_readable_member(ca, c, iter)				\
+	__for_each_online_member(ca, c, iter,				\
+		(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ *
+ * EBUG_ON() fires if @idx is out of range or the devs[] slot is empty.
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+	EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+	/* Condition of 1: the RCU lockdep check is deliberately bypassed. */
+	return rcu_dereference_check(c->devs[idx], 1);
+}
+
+/*
+ * Look up device @idx for callers that hold c->sb_lock or c->state_lock
+ * (the lockdep condition passed to rcu_dereference_protected()).
+ * EBUG_ON() fires if @idx is out of range or the devs[] slot is empty.
+ */
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+	EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+	return rcu_dereference_protected(c->devs[idx],
+					 lockdep_is_held(&c->sb_lock) ||
+					 lockdep_is_held(&c->state_lock));
+}
+
+/* XXX kill, move to struct bch_fs */
+/*
+ * Build and return (by value) a mask with the dev_idx bit set for every
+ * device visited by for_each_online_member().
+ */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+	struct bch_devs_mask devs;
+	struct bch_dev *ca;
+	unsigned i;
+
+	memset(&devs, 0, sizeof(devs));
+	for_each_online_member(ca, c, i)
+		__set_bit(ca->dev_idx, devs.d);
+	return devs;
+}
+
+struct bch_fs *bch2_dev_to_fs(dev_t);
+struct bch_fs *bch2_uuid_to_fs(uuid_le);
+
+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
+			   enum bch_member_state, int);
+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+			enum bch_member_state, int);
+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+		      enum bch_member_state, int);
+
+int bch2_dev_fail(struct bch_dev *, int);
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_add(struct bch_fs *, const char *);
+int bch2_dev_online(struct bch_fs *, const char *);
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
+
+bool bch2_fs_emergency_read_only(struct bch_fs *);
+void bch2_fs_read_only(struct bch_fs *);
+
+int __bch2_fs_read_write(struct bch_fs *, bool);
+int bch2_fs_read_write(struct bch_fs *);
+int bch2_fs_read_write_early(struct bch_fs *);
+
+void bch2_fs_stop(struct bch_fs *);
+
+int bch2_fs_start(struct bch_fs *);
+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+const char *bch2_fs_open_incremental(const char *path);
+
+#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
new file mode 100644
index 000000000000..20406ebd6f5b
--- /dev/null
+++ b/fs/bcachefs/super_types.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_TYPES_H
+#define _BCACHEFS_SUPER_TYPES_H
+
+/*
+ * Handle for a superblock: the superblock buffer pointer together with the
+ * block device, bio and bookkeeping fields that go with it.
+ * NOTE(review): per-field semantics are not visible in this header; the
+ * field comments below are assumptions to confirm against super-io.c.
+ */
+struct bch_sb_handle {
+	struct bch_sb		*sb;
+	struct block_device	*bdev;
+	struct bio		*bio;
+	unsigned		page_order;	/* presumably allocation order of *sb */
+	fmode_t			mode;
+	unsigned		have_layout:1;
+	unsigned		have_bio:1;
+	unsigned		fs_sb:1;
+	u64			seq;
+};
+
+/* Bitmap with room for BCH_SB_MEMBERS_MAX bits, indexed by device index. */
+struct bch_devs_mask {
+	unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+};
+
+/* Small fixed-capacity list of device indices; the first @nr entries are valid. */
+struct bch_devs_list {
+	u8			nr;
+	u8			devs[BCH_REPLICAS_MAX + 1];
+};
+
+/*
+ * Per-member-device settings in native form.
+ * NOTE(review): presumably unpacked from the on-disk superblock member
+ * entry - confirm against the code that fills this in.
+ */
+struct bch_member_cpu {
+	u64			nbuckets;	/* device size */
+	u16			first_bucket;   /* index of first bucket used */
+	u16			bucket_size;	/* sectors */
+	u16			group;
+	u8			state;		/* compared against BCH_MEMBER_STATE_* */
+	u8			replacement;
+	u8			discard;
+	u8			data_allowed;
+	u8			durability;
+	u8			valid;
+};
+
+/*
+ * One disk group entry: a deleted flag, a parent index and a device mask.
+ * NOTE(review): presumably @devs holds the group's member devices and
+ * @parent indexes another entry - confirm in disk_groups.c.
+ */
+struct bch_disk_group_cpu {
+	bool				deleted;
+	u16				parent;
+	struct bch_devs_mask		devs;
+};
+
+/*
+ * Table of disk group entries: @nr followed by a flexible array of entries.
+ * Carries an rcu_head, so instances are presumably freed via RCU - confirm.
+ */
+struct bch_disk_groups_cpu {
+	struct rcu_head			rcu;
+	unsigned			nr;
+	struct bch_disk_group_cpu	entries[];
+};
+
+#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
new file mode 100644
index 000000000000..27646c435e30
--- /dev/null
+++ b/fs/bcachefs/sysfs.c
@@ -0,0 +1,1068 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#ifndef NO_BCACHEFS_SYSFS
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "sysfs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "opts.h"
+#include "rebalance.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "tests.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/sched/clock.h>
+
+#include "util.h"
+
+/* Define <type>_sysfs_ops wired to <type>_show() and <type>_store(). */
+#define SYSFS_OPS(type)							\
+struct sysfs_ops type ## _sysfs_ops = {					\
+	.show	= type ## _show,					\
+	.store	= type ## _store					\
+}
+
+/* Open the definition of <fn>_show(kobj, attr, buf); the body follows. */
+#define SHOW(fn)							\
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+			   char *buf)					\
+
+/* Open the definition of <fn>_store(kobj, attr, buf, size); the body follows. */
+#define STORE(fn)							\
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+			    const char *buf, size_t size)		\
+
+/* Define a static struct attribute sysfs_<_name> with file name #_name. */
+#define __sysfs_attribute(_name, _mode)					\
+	static struct attribute sysfs_##_name =				\
+		{ .name = #_name, .mode = _mode }
+
+/* Shorthands for write-only, read-only and read-write attributes. */
+#define write_attribute(n)	__sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n)	__sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n)		__sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+/*
+ * The helpers below are meant for use inside a SHOW() body: they reference
+ * the local names attr and buf, and return from the enclosing function when
+ * attr matches sysfs_<file>.
+ */
+
+/* Format into buf; a trailing "\n" is appended to @fmt automatically. */
+#define sysfs_printf(file, fmt, ...)					\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
+} while (0)
+
+/* Print @var via snprint(). */
+#define sysfs_print(file, var)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return snprint(buf, PAGE_SIZE, var);			\
+} while (0)
+
+/* Print @val via bch2_hprint(), followed by a newline. */
+#define sysfs_hprint(file, val)						\
+do {									\
+	if (attr == &sysfs_ ## file) {					\
+		struct printbuf out = _PBUF(buf, PAGE_SIZE);		\
+		bch2_hprint(&out, val);					\
+		pr_buf(&out, "\n");					\
+		return out.pos - buf;					\
+	}								\
+} while (0)
+
+/* Variants that obtain the value through a caller-provided var() macro. */
+#define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var)		sysfs_print(_var, var(_var))
+#define var_hprint(_var)	sysfs_hprint(_var, var(_var))
+
+/*
+ * The helpers below are meant for use inside a STORE() body: they reference
+ * the local names attr, buf and size.
+ */
+
+/* If attr matches, parse buf into @var and return @size or the parse error. */
+#define sysfs_strtoul(file, var)					\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
+} while (0)
+
+/* As sysfs_strtoul(), parsing with strtoul_safe_clamp() and [min, max]. */
+#define sysfs_strtoul_clamp(file, var, min, max)			\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe_clamp(buf, var, min, max)		\
+			?: (ssize_t) size;				\
+} while (0)
+
+/*
+ * Evaluate to @cp parsed as a base-10 unsigned long; on a parse error,
+ * return that error from the enclosing function.
+ */
+#define strtoul_or_return(cp)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (_r)								\
+		return _r;						\
+	_v;								\
+})
+
+/* As strtoul_or_return(), parsing with strtoul_safe_restrict(). */
+#define strtoul_restrict_or_return(cp, min, max)			\
+({									\
+	unsigned long __v = 0;						\
+	int _r = strtoul_safe_restrict(cp, __v, min, max);		\
+	if (_r)								\
+		return _r;						\
+	__v;								\
+})
+
+/* As strtoul_or_return(), parsing a u64 with strtoi_h(). */
+#define strtoi_h_or_return(cp)						\
+({									\
+	u64 _v;								\
+	int _r = strtoi_h(cp, &_v);					\
+	if (_r)								\
+		return _r;						\
+	_v;								\
+})
+
+/* If attr matches, parse buf into @var with strtoi_h(). */
+#define sysfs_hatoi(file, var)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoi_h(buf, &var) ?: (ssize_t) size;		\
+} while (0)
+
+/*
+ * Attribute objects.  Each line defines a static struct attribute named
+ * sysfs_<name>; the SHOW()/STORE() handlers below match on its address, and
+ * the *_files[] tables decide which directory each attribute appears in.
+ */
+
+/* Write-only triggers */
+write_attribute(trigger_journal_flush);
+write_attribute(trigger_btree_coalesce);
+write_attribute(trigger_gc);
+write_attribute(trigger_alloc_write);
+write_attribute(prune_cache);
+rw_attribute(btree_gc_periodic);
+
+read_attribute(uuid);
+read_attribute(minor);
+read_attribute(bucket_size);
+read_attribute(block_size);
+read_attribute(btree_node_size);
+read_attribute(first_bucket);
+read_attribute(nbuckets);
+read_attribute(durability);
+read_attribute(iodone);
+
+read_attribute(io_latency_read);
+read_attribute(io_latency_write);
+read_attribute(io_latency_stats_read);
+read_attribute(io_latency_stats_write);
+read_attribute(congested);
+
+read_attribute(bucket_quantiles_last_read);
+read_attribute(bucket_quantiles_last_write);
+read_attribute(bucket_quantiles_fragmentation);
+read_attribute(bucket_quantiles_oldest_gen);
+
+read_attribute(reserve_stats);
+read_attribute(btree_cache_size);
+read_attribute(compression_stats);
+read_attribute(journal_debug);
+read_attribute(journal_pins);
+read_attribute(btree_updates);
+read_attribute(dirty_btree_nodes);
+
+read_attribute(internal_uuid);
+
+read_attribute(has_data);
+read_attribute(alloc_debug);
+write_attribute(wake_allocator);
+
+read_attribute(read_realloc_races);
+read_attribute(extent_migrate_done);
+read_attribute(extent_migrate_raced);
+
+rw_attribute(journal_write_delay_ms);
+rw_attribute(journal_reclaim_delay_ms);
+
+rw_attribute(discard);
+rw_attribute(cache_replacement_policy);
+rw_attribute(label);
+
+rw_attribute(copy_gc_enabled);
+sysfs_pd_controller_attribute(copy_gc);
+
+rw_attribute(rebalance_enabled);
+sysfs_pd_controller_attribute(rebalance);
+read_attribute(rebalance_work);
+rw_attribute(promote_whole_extents);
+
+read_attribute(new_stripes);
+
+rw_attribute(pd_controllers_update_seconds);
+
+read_attribute(meta_replicas_have);
+read_attribute(data_replicas_have);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+write_attribute(perf_test);
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+/* One read-write attribute per BCH_DEBUG_PARAMS() entry. */
+#define BCH_DEBUG_PARAM(name, description)				\
+	rw_attribute(name);
+
+	BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+/*
+ * One read-only attribute per BCH_TIME_STATS() entry, held in a variable
+ * named sysfs_time_stat_<name> while the file name is just <name>.
+ */
+#define x(_name)						\
+	static struct attribute sysfs_time_stat_##_name =		\
+		{ .name = #_name, .mode = S_IRUGO };
+	BCH_TIME_STATS()
+#undef x
+
+/*
+ * Defined by hand rather than via read_attribute() because the variable name
+ * (sysfs_state_rw) differs from the file name ("state").  Despite the _rw
+ * suffix the mode is read-only.
+ */
+static struct attribute sysfs_state_rw = {
+	.name = "state",
+	.mode = S_IRUGO
+};
+
+/*
+ * Sum btree_bytes(c) once for every node on c->btree_cache.live, walking the
+ * list under c->btree_cache.lock.
+ */
+static size_t bch2_btree_cache_size(struct bch_fs *c)
+{
+	size_t ret = 0;
+	struct btree *b;
+
+	mutex_lock(&c->btree_cache.lock);
+	list_for_each_entry(b, &c->btree_cache.live, list)
+		ret += btree_bytes(c);
+
+	mutex_unlock(&c->btree_cache.lock);
+	return ret;
+}
+
+/*
+ * Format the filesystem usage returned by bch2_fs_usage_read() into @buf.
+ * Returns the number of bytes written, or -ENOMEM if no usage was returned.
+ */
+static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
+
+	if (!fs_usage)
+		return -ENOMEM;
+
+	bch2_fs_usage_to_text(&out, c, fs_usage);
+
+	/*
+	 * NOTE(review): there is no matching percpu_down_read() in this
+	 * function, so presumably bch2_fs_usage_read() returns with
+	 * c->mark_lock read-held - confirm, and check what the failure path
+	 * above does with the lock.
+	 */
+	percpu_up_read(&c->mark_lock);
+
+	kfree(fs_usage);
+
+	return out.pos - buf;
+}
+
+/*
+ * Walk every key in the extents btree and print totals for compressed and
+ * uncompressed extents into @buf.
+ *
+ * Each KEY_TYPE_extent key is classified by the compression type of its
+ * first decoded pointer only.  Sizes are accumulated in sectors and shifted
+ * left by 9 to report bytes.
+ *
+ * Returns the number of bytes written, -EPERM if the filesystem has not been
+ * started, or the error from the btree walk / bch2_trans_exit().
+ */
+static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+	    nr_compressed_extents = 0,
+	    compressed_sectors_compressed = 0,
+	    compressed_sectors_uncompressed = 0;
+	int ret;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EPERM;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret)
+		if (k.k->type == KEY_TYPE_extent) {
+			struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+			const union bch_extent_entry *entry;
+			struct extent_ptr_decoded p;
+
+			extent_for_each_ptr_decode(e, p, entry) {
+				if (p.crc.compression_type == BCH_COMPRESSION_NONE) {
+					nr_uncompressed_extents++;
+					uncompressed_sectors += e.k->size;
+				} else {
+					nr_compressed_extents++;
+					compressed_sectors_compressed +=
+						p.crc.compressed_size;
+					compressed_sectors_uncompressed +=
+						p.crc.uncompressed_size;
+				}
+
+				/* only looking at the first ptr */
+				break;
+			}
+		}
+
+	/* An error from bch2_trans_exit() takes precedence over the walk's. */
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret)
+		return ret;
+
+	return scnprintf(buf, PAGE_SIZE,
+			"uncompressed data:\n"
+			"	nr extents:			%llu\n"
+			"	size (bytes):			%llu\n"
+			"compressed data:\n"
+			"	nr extents:			%llu\n"
+			"	compressed size (bytes):	%llu\n"
+			"	uncompressed size (bytes):	%llu\n",
+			nr_uncompressed_extents,
+			uncompressed_sectors << 9,
+			nr_compressed_extents,
+			compressed_sectors_compressed << 9,
+			compressed_sectors_uncompressed << 9);
+}
+
+/*
+ * Print one section per entry on c->ec_new_stripe_list: the head's target,
+ * algo and redundancy, its pending stripe (h->s) if any, and every stripe on
+ * h->stripes.  The outer list is walked under c->ec_new_stripe_lock and each
+ * head's stripe list under h->lock.
+ *
+ * Returns the number of bytes written to @buf (bounded by PAGE_SIZE).
+ */
+static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
+{
+	char *out = buf, *end = buf + PAGE_SIZE;
+	struct ec_stripe_head *h;
+	struct ec_stripe_new *s;
+
+	mutex_lock(&c->ec_new_stripe_lock);
+	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
+		out += scnprintf(out, end - out,
+				 "target %u algo %u redundancy %u:\n",
+				 h->target, h->algo, h->redundancy);
+
+		/* NOTE(review): h->s is read before h->lock is taken - confirm that is safe. */
+		if (h->s)
+			out += scnprintf(out, end - out,
+					 "\tpending: blocks %u allocated %u\n",
+					 h->s->blocks.nr,
+					 bitmap_weight(h->s->blocks_allocated,
+						       h->s->blocks.nr));
+
+		mutex_lock(&h->lock);
+		list_for_each_entry(s, &h->stripes, list)
+			out += scnprintf(out, end - out,
+					 "\tin flight: blocks %u allocated %u pin %u\n",
+					 s->blocks.nr,
+					 bitmap_weight(s->blocks_allocated,
+						       s->blocks.nr),
+					 atomic_read(&s->pin));
+		mutex_unlock(&h->lock);
+
+	}
+	mutex_unlock(&c->ec_new_stripe_lock);
+
+	return out - buf;
+}
+
+/*
+ * Show handler for filesystem-level attributes.  @kobj is the kobject
+ * embedded in struct bch_fs; bch2_fs_internal_show() forwards here as well.
+ *
+ * Each sysfs_print*() line returns from this function when @attr matches, so
+ * the body is effectively a chain of "if this attribute, print and return".
+ * Returns 0 (an empty read) for an attribute none of the branches recognize.
+ */
+SHOW(bch2_fs)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+	sysfs_print(minor,			c->minor);
+	sysfs_printf(internal_uuid, "%pU",	c->sb.uuid.b);
+
+	sysfs_print(journal_write_delay_ms,	c->journal.write_delay_ms);
+	sysfs_print(journal_reclaim_delay_ms,	c->journal.reclaim_delay_ms);
+
+	sysfs_print(block_size,			block_bytes(c));
+	sysfs_print(btree_node_size,		btree_bytes(c));
+	sysfs_hprint(btree_cache_size,		bch2_btree_cache_size(c));
+
+	sysfs_print(read_realloc_races,
+		    atomic_long_read(&c->read_realloc_races));
+	sysfs_print(extent_migrate_done,
+		    atomic_long_read(&c->extent_migrate_done));
+	sysfs_print(extent_migrate_raced,
+		    atomic_long_read(&c->extent_migrate_raced));
+
+	sysfs_printf(btree_gc_periodic, "%u",	(int) c->btree_gc_periodic);
+
+	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+
+	sysfs_print(pd_controllers_update_seconds,
+		    c->pd_controllers_update_seconds);
+
+	sysfs_printf(rebalance_enabled,		"%i", c->rebalance.enabled);
+	sysfs_pd_controller_show(rebalance,	&c->rebalance.pd); /* XXX */
+
+	if (attr == &sysfs_rebalance_work)
+		return bch2_rebalance_work_show(c, buf);
+
+	sysfs_print(promote_whole_extents,	c->promote_whole_extents);
+
+	sysfs_printf(meta_replicas_have, "%i",	bch2_replicas_online(c, true));
+	sysfs_printf(data_replicas_have, "%i",	bch2_replicas_online(c, false));
+
+	/* Debugging: */
+
+	if (attr == &sysfs_alloc_debug)
+		return show_fs_alloc_debug(c, buf);
+
+	if (attr == &sysfs_journal_debug)
+		return bch2_journal_print_debug(&c->journal, buf);
+
+	if (attr == &sysfs_journal_pins)
+		return bch2_journal_print_pins(&c->journal, buf);
+
+	if (attr == &sysfs_btree_updates)
+		return bch2_btree_updates_print(c, buf);
+
+	if (attr == &sysfs_dirty_btree_nodes)
+		return bch2_dirty_btree_nodes_print(c, buf);
+
+	if (attr == &sysfs_compression_stats)
+		return bch2_compression_stats(c, buf);
+
+	if (attr == &sysfs_new_stripes)
+		return bch2_new_stripes(c, buf);
+
+	/* One branch per debug parameter, printing the bch_fs field of that name. */
+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
+	BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+	return 0;
+}
+
+/*
+ * Store handler body for the filesystem-level attributes; STORE(bch2_fs)
+ * calls this with c->state_lock held.
+ *
+ * The tunables at the top may be written at any time.  The trigger_*,
+ * prune_cache and perf_test attributes further down are only accepted once
+ * BCH_FS_STARTED is set; before that -EPERM is returned.
+ *
+ * Returns @size on success or a negative errno.
+ */
+STORE(__bch2_fs)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+	sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
+	sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+
+	if (attr == &sysfs_btree_gc_periodic) {
+		ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
+			?: (ssize_t) size;
+
+		wake_up_process(c->gc_thread);
+		return ret;
+	}
+
+	if (attr == &sysfs_copy_gc_enabled) {
+		struct bch_dev *ca;
+		unsigned i;
+		ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
+			?: (ssize_t) size;
+
+		/* Kick every running copygc thread so it sees the new setting. */
+		for_each_member_device(ca, c, i)
+			if (ca->copygc_thread)
+				wake_up_process(ca->copygc_thread);
+		return ret;
+	}
+
+	if (attr == &sysfs_rebalance_enabled) {
+		ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
+			?: (ssize_t) size;
+
+		rebalance_wakeup(c);
+		return ret;
+	}
+
+	sysfs_strtoul(pd_controllers_update_seconds,
+		      c->pd_controllers_update_seconds);
+	sysfs_pd_controller_store(rebalance,	&c->rebalance.pd);
+
+	sysfs_strtoul(promote_whole_extents,	c->promote_whole_extents);
+
+	/* Debugging: */
+
+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
+	BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+	/* Everything below acts on a running filesystem. */
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EPERM;
+
+	/* Debugging: */
+
+	if (attr == &sysfs_trigger_journal_flush)
+		bch2_journal_meta_async(&c->journal, NULL);
+
+	if (attr == &sysfs_trigger_btree_coalesce)
+		bch2_coalesce(c);
+
+	if (attr == &sysfs_trigger_gc)
+		bch2_gc(c, NULL, false, false);
+
+	if (attr == &sysfs_trigger_alloc_write) {
+		bool wrote;
+
+		bch2_alloc_write(c, 0, &wrote);
+	}
+
+	if (attr == &sysfs_prune_cache) {
+		/*
+		 * Zero every field we don't set explicitly so the shrinker
+		 * callback never reads uninitialized stack memory.
+		 */
+		struct shrink_control sc = { .gfp_mask = GFP_KERNEL };
+
+		sc.nr_to_scan = strtoul_or_return(buf);
+		c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+	}
+#ifdef CONFIG_BCACHEFS_TESTS
+	if (attr == &sysfs_perf_test) {
+		/* Input format: "<test> <nr> <threads>", whitespace separated. */
+		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
+		char *test, *nr_str, *threads_str;
+		unsigned threads;
+		u64 nr;
+		int ret = -EINVAL;
+
+		/* Report allocation failure as such instead of as -EINVAL. */
+		if (!tmp)
+			return -ENOMEM;
+
+		test		= strsep(&p, " \t\n");
+		nr_str		= strsep(&p, " \t\n");
+		threads_str	= strsep(&p, " \t\n");
+
+		if (threads_str &&
+		    !(ret = kstrtouint(threads_str, 10, &threads)) &&
+		    !(ret = bch2_strtoull_h(nr_str, &nr)))
+			bch2_btree_perf_test(c, test, nr, threads);
+		else
+			size = ret;
+		kfree(tmp);
+	}
+#endif
+	return size;
+}
+
+/*
+ * Store entry point for filesystem-level attributes: runs __bch2_fs_store()
+ * with c->state_lock held and returns its result.
+ */
+STORE(bch2_fs)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+	mutex_lock(&c->state_lock);
+	/* A negative errno round-trips through the size_t parameter here. */
+	size = __bch2_fs_store(kobj, attr, buf, size);
+	mutex_unlock(&c->state_lock);
+
+	return size;
+}
+SYSFS_OPS(bch2_fs);
+
+/* NULL-terminated attribute table served by bch2_fs_sysfs_ops. */
+struct attribute *bch2_fs_files[] = {
+	&sysfs_minor,
+	&sysfs_block_size,
+	&sysfs_btree_node_size,
+	&sysfs_btree_cache_size,
+
+	&sysfs_meta_replicas_have,
+	&sysfs_data_replicas_have,
+
+	&sysfs_journal_write_delay_ms,
+	&sysfs_journal_reclaim_delay_ms,
+
+	&sysfs_promote_whole_extents,
+
+	&sysfs_compression_stats,
+
+#ifdef CONFIG_BCACHEFS_TESTS
+	&sysfs_perf_test,
+#endif
+	NULL
+};
+
+/* internal dir - just a wrapper */
+
+/* Map the "internal" kobject back to its bch_fs and reuse bch2_fs_show(). */
+SHOW(bch2_fs_internal)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+	return bch2_fs_show(&c->kobj, attr, buf);
+}
+
+/* Map the "internal" kobject back to its bch_fs and reuse bch2_fs_store(). */
+STORE(bch2_fs_internal)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+	return bch2_fs_store(&c->kobj, attr, buf, size);
+}
+SYSFS_OPS(bch2_fs_internal);
+
+/*
+ * NULL-terminated attribute table served by bch2_fs_internal_sysfs_ops:
+ * debug dumps, counters, triggers and one entry per BCH_DEBUG_PARAMS() item.
+ */
+struct attribute *bch2_fs_internal_files[] = {
+	&sysfs_alloc_debug,
+	&sysfs_journal_debug,
+	&sysfs_journal_pins,
+	&sysfs_btree_updates,
+	&sysfs_dirty_btree_nodes,
+
+	&sysfs_read_realloc_races,
+	&sysfs_extent_migrate_done,
+	&sysfs_extent_migrate_raced,
+
+	&sysfs_trigger_journal_flush,
+	&sysfs_trigger_btree_coalesce,
+	&sysfs_trigger_gc,
+	&sysfs_trigger_alloc_write,
+	&sysfs_prune_cache,
+
+	&sysfs_copy_gc_enabled,
+
+	&sysfs_rebalance_enabled,
+	&sysfs_rebalance_work,
+	sysfs_pd_controller_files(rebalance),
+
+	&sysfs_new_stripes,
+
+	&sysfs_internal_uuid,
+
+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
+	BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+	NULL
+};
+
+/* options */
+
+/*
+ * Show one filesystem option.  @attr is the attribute embedded in a
+ * bch2_opt_table[] entry, so the option id is the entry's index in that
+ * table.  Prints the current value from c->opts followed by a newline.
+ */
+SHOW(bch2_fs_opts_dir)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+	const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+	int id = opt - bch2_opt_table;
+	u64 v = bch2_opt_get_by_id(&c->opts, id);
+
+	bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST);
+	pr_buf(&out, "\n");
+
+	return out.pos - buf;
+}
+
+/*
+ * Set one filesystem option from a sysfs write.
+ *
+ * The input is copied and whitespace-trimmed, parsed with bch2_opt_parse()
+ * and checked with bch2_opt_check_may_set().  If the option has a superblock
+ * setter, the superblock is updated and written under c->sb_lock; the
+ * in-memory c->opts value is updated afterwards.
+ *
+ * Returns @size on success, -ENOMEM if the copy fails, or the negative error
+ * from parsing/validation.
+ */
+STORE(bch2_fs_opts_dir)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+	const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+	int ret, id = opt - bch2_opt_table;
+	char *tmp;
+	u64 v;
+
+	tmp = kstrdup(buf, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	ret = bch2_opt_parse(c, opt, strim(tmp), &v);
+	kfree(tmp);
+
+	if (ret < 0)
+		return ret;
+
+	ret = bch2_opt_check_may_set(c, id, v);
+	if (ret < 0)
+		return ret;
+
+	if (opt->set_sb != SET_NO_SB_OPT) {
+		mutex_lock(&c->sb_lock);
+		opt->set_sb(c->disk_sb.sb, v);
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+	}
+
+	bch2_opt_set_by_id(&c->opts, id, v);
+
+	/*
+	 * A non-zero background target or background compression setting
+	 * queues rebalance work and wakes the rebalance thread.
+	 */
+	if ((id == Opt_background_target ||
+	     id == Opt_background_compression) && v) {
+		bch2_rebalance_add_work(c, S64_MAX);
+		rebalance_wakeup(c);
+	}
+
+	return size;
+}
+SYSFS_OPS(bch2_fs_opts_dir);
+
+/* Empty on purpose: option files are added by bch2_opts_create_sysfs_files(). */
+struct attribute *bch2_fs_opts_dir_files[] = { NULL };
+
+/*
+ * Create a sysfs file under @kobj for every bch2_opt_table[] entry whose
+ * mode includes OPT_FORMAT, OPT_MOUNT or OPT_RUNTIME.
+ *
+ * Returns 0 on success, or the first error from sysfs_create_file(); files
+ * created before the failure are left in place.
+ */
+int bch2_opts_create_sysfs_files(struct kobject *kobj)
+{
+	const struct bch_option *opt;
+
+	for (opt = bch2_opt_table;
+	     opt < bch2_opt_table + bch2_opts_nr;
+	     opt++) {
+		int err;
+
+		if (opt->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME)) {
+			err = sysfs_create_file(kobj, &opt->attr);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* time stats */
+
+/*
+ * Show handler for the time_stats directory: for the BCH_TIME_STATS() entry
+ * matching @attr, print c->times[BCH_TIME_<name>].  Returns 0 if nothing
+ * matches.
+ */
+SHOW(bch2_fs_time_stats)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name)						\
+	if (attr == &sysfs_time_stat_##name)				\
+		return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
+					     buf, PAGE_SIZE);
+	BCH_TIME_STATS()
+#undef x
+
+	return 0;
+}
+
+/* Writes are accepted and ignored; present so SYSFS_OPS() has a store hook. */
+STORE(bch2_fs_time_stats)
+{
+	return size;
+}
+SYSFS_OPS(bch2_fs_time_stats);
+
+/* NULL-terminated table with one attribute per BCH_TIME_STATS() entry. */
+struct attribute *bch2_fs_time_stats_files[] = {
+#define x(name)						\
+	&sysfs_time_stat_##name,
+	BCH_TIME_STATS()
+#undef x
+	NULL
+};
+
+/*
+ * Callback used by show_quantiles(): maps bucket index (third argument) on a
+ * device to an unsigned sample; the last argument is opaque caller data.
+ */
+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
+				 size_t, void *);
+
+/*
+ * bucket_map_fn: bucket_last_io() of bucket @b.  @private selects the
+ * direction passed on: NULL gives rw = 0, anything else rw = 1.
+ */
+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
+				  size_t b, void *private)
+{
+	int rw = (private ? 1 : 0);
+
+	return bucket_last_io(c, bucket(ca, b), rw);
+}
+
+/* bucket_map_fn: bucket_sectors_used() of bucket @b's mark; @private is unused. */
+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
+				       size_t b, void *private)
+{
+	struct bucket *g = bucket(ca, b);
+	return bucket_sectors_used(g->mark);
+}
+
+/* bucket_map_fn: bucket_gc_gen() of bucket @b; @c and @private are unused. */
+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
+				     size_t b, void *private)
+{
+	return bucket_gc_gen(ca, b);
+}
+
+/* sort() comparison callback for arrays of unsigned, via cmp_int(). */
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+	const unsigned *l = _l;
+	const unsigned *r = _r;
+
+	return cmp_int(*l, *r);
+}
+
+/*
+ * Print 31 quantiles of a per-bucket metric for device @ca.
+ *
+ * @fn is evaluated for every bucket from ca->mi.first_bucket up to
+ * ca->mi.nbuckets while ca->bucket_lock is read-held; the samples are sorted
+ * and 31 evenly spaced entries are printed space separated on one line.
+ *
+ * Returns the number of bytes written to @buf, or -ENOMEM if the sample
+ * array cannot be allocated.
+ */
+static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
+			      char *buf, bucket_map_fn *fn, void *private)
+{
+	size_t i, n;
+	/* Compute 31 quantiles */
+	unsigned q[31], *p;
+	ssize_t ret = 0;
+
+	down_read(&ca->bucket_lock);
+	n = ca->mi.nbuckets;
+
+	p = vzalloc(n * sizeof(unsigned));
+	if (!p) {
+		up_read(&ca->bucket_lock);
+		return -ENOMEM;
+	}
+
+	/* Slots below first_bucket are never written and stay zero. */
+	for (i = ca->mi.first_bucket; i < n; i++)
+		p[i] = fn(c, ca, i, private);
+
+	sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
+	up_read(&ca->bucket_lock);
+
+	/*
+	 * Drop trailing zero samples.
+	 * NOTE(review): if unsigned_cmp() orders ascending, zeros sort to the
+	 * front and this only trims when every sample is zero - confirm the
+	 * intended sort direction.
+	 */
+	while (n &&
+	       !p[n - 1])
+		--n;
+
+	for (i = 0; i < ARRAY_SIZE(q); i++)
+		q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
+
+	vfree(p);
+
+	for (i = 0; i < ARRAY_SIZE(q); i++)
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+				 "%u ", q[i]);
+	/* Turn the final separator space into the terminating newline. */
+	buf[ret - 1] = '\n';
+
+	return ret;
+}
+
+/*
+ * Print used/size for ca->free_inc and for each ca->free[] reserve fifo,
+ * sampled under ca->freelist_lock.  Returns the number of bytes written.
+ */
+static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	enum alloc_reserve i;
+
+	spin_lock(&ca->freelist_lock);
+
+	pr_buf(&out, "free_inc:\t%zu\t%zu\n",
+	       fifo_used(&ca->free_inc),
+	       ca->free_inc.size);
+
+	for (i = 0; i < RESERVE_NR; i++)
+		pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i,
+		       fifo_used(&ca->free[i]),
+		       ca->free[i].size);
+
+	spin_unlock(&ca->freelist_lock);
+
+	return out.pos - buf;
+}
+
+/*
+ * Print allocator debugging information for device @ca: freelist fill
+ * levels, bucket and sector counts from bch2_dev_usage_read(), and
+ * filesystem-wide open bucket / wait list state.
+ *
+ * The open bucket counts in nr[] are taken over the whole c->open_buckets[]
+ * array, not only buckets belonging to @ca.
+ *
+ * NOTE(review): unlike show_reserve_stats(), the fifos are read here without
+ * ca->freelist_lock - presumably acceptable for a debug dump; confirm.
+ */
+static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
+{
+	struct bch_fs *c = ca->fs;
+	struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+	unsigned i, nr[BCH_DATA_NR];
+
+	memset(nr, 0, sizeof(nr));
+
+	for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+		nr[c->open_buckets[i].type]++;
+
+	return scnprintf(buf, PAGE_SIZE,
+		"free_inc:               %zu/%zu\n"
+		"free[RESERVE_BTREE]:    %zu/%zu\n"
+		"free[RESERVE_MOVINGGC]: %zu/%zu\n"
+		"free[RESERVE_NONE]:     %zu/%zu\n"
+		"buckets:\n"
+		"    capacity:           %llu\n"
+		"    alloc:              %llu\n"
+		"    sb:                 %llu\n"
+		"    journal:            %llu\n"
+		"    meta:               %llu\n"
+		"    user:               %llu\n"
+		"    cached:             %llu\n"
+		"    erasure coded:      %llu\n"
+		"    available:          %lli\n"
+		"sectors:\n"
+		"    sb:                 %llu\n"
+		"    journal:            %llu\n"
+		"    meta:               %llu\n"
+		"    user:               %llu\n"
+		"    cached:             %llu\n"
+		"    fragmented:         %llu\n"
+		"    copygc threshold:   %llu\n"
+		"freelist_wait:          %s\n"
+		"open buckets:           %u/%u (reserved %u)\n"
+		"open_buckets_wait:      %s\n"
+		"open_buckets_btree:     %u\n"
+		"open_buckets_user:      %u\n"
+		"btree reserve cache:    %u\n",
+		fifo_used(&ca->free_inc),		ca->free_inc.size,
+		fifo_used(&ca->free[RESERVE_BTREE]),	ca->free[RESERVE_BTREE].size,
+		fifo_used(&ca->free[RESERVE_MOVINGGC]),	ca->free[RESERVE_MOVINGGC].size,
+		fifo_used(&ca->free[RESERVE_NONE]),	ca->free[RESERVE_NONE].size,
+		ca->mi.nbuckets - ca->mi.first_bucket,
+		stats.buckets_alloc,
+		stats.buckets[BCH_DATA_SB],
+		stats.buckets[BCH_DATA_JOURNAL],
+		stats.buckets[BCH_DATA_BTREE],
+		stats.buckets[BCH_DATA_USER],
+		stats.buckets[BCH_DATA_CACHED],
+		stats.buckets_ec,
+		ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
+		stats.sectors[BCH_DATA_SB],
+		stats.sectors[BCH_DATA_JOURNAL],
+		stats.sectors[BCH_DATA_BTREE],
+		stats.sectors[BCH_DATA_USER],
+		stats.sectors[BCH_DATA_CACHED],
+		stats.sectors_fragmented,
+		ca->copygc_threshold,
+		c->freelist_wait.list.first		? "waiting" : "empty",
+		c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+		BTREE_NODE_OPEN_BUCKET_RESERVE,
+		c->open_buckets_wait.list.first		? "waiting" : "empty",
+		nr[BCH_DATA_BTREE],
+		nr[BCH_DATA_USER],
+		c->btree_reserve_cache_nr);
+}
+
+/* Section headings for show_dev_iodone(), indexed by its rw loop variable (0, 1). */
+static const char * const bch2_rw[] = {
+	"read",
+	"write",
+	NULL
+};
+
+/*
+ * Print, for reads and then writes, one line per data type (index 0 is
+ * skipped) with the amount of I/O done on @ca.  The per-cpu counters are in
+ * sectors and are shifted left by 9 for output.
+ */
+static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
+{
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+	int rw, i;
+
+	for (rw = 0; rw < 2; rw++) {
+		pr_buf(&out, "%s:\n", bch2_rw[rw]);
+
+		for (i = 1; i < BCH_DATA_NR; i++)
+			pr_buf(&out, "%-12s:%12llu\n",
+			       bch2_data_types[i],
+			       percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
+	}
+
+	return out.pos - buf;
+}
+
+/*
+ * Show handler for per-device attributes.  @kobj is the kobject embedded in
+ * struct bch_dev.
+ *
+ * As in the other SHOW() handlers, each sysfs_print*() line returns when
+ * @attr matches.  Returns 0 (an empty read) for an attribute none of the
+ * branches recognize.
+ */
+SHOW(bch2_dev)
+{
+	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+	struct bch_fs *c = ca->fs;
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
+
+	/* sysfs_printf() appends the newline itself; don't add a second one. */
+	sysfs_printf(uuid,		"%pU", ca->uuid.b);
+
+	sysfs_print(bucket_size,	bucket_bytes(ca));
+	sysfs_print(block_size,		block_bytes(c));
+	sysfs_print(first_bucket,	ca->mi.first_bucket);
+	sysfs_print(nbuckets,		ca->mi.nbuckets);
+	sysfs_print(durability,		ca->mi.durability);
+	sysfs_print(discard,		ca->mi.discard);
+
+	if (attr == &sysfs_label) {
+		if (ca->mi.group) {
+			/* mi.group is stored off by one; 0 means "no group". */
+			mutex_lock(&c->sb_lock);
+			bch2_disk_path_to_text(&out, &c->disk_sb,
+					       ca->mi.group - 1);
+			mutex_unlock(&c->sb_lock);
+		} else {
+			pr_buf(&out, "none");
+		}
+
+		pr_buf(&out, "\n");
+		return out.pos - buf;
+	}
+
+	if (attr == &sysfs_has_data) {
+		bch2_flags_to_text(&out, bch2_data_types,
+				   bch2_dev_has_data(c, ca));
+		pr_buf(&out, "\n");
+		return out.pos - buf;
+	}
+
+	sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);
+
+	if (attr == &sysfs_cache_replacement_policy) {
+		bch2_string_opt_to_text(&out,
+					bch2_cache_replacement_policies,
+					ca->mi.replacement);
+		pr_buf(&out, "\n");
+		return out.pos - buf;
+	}
+
+	if (attr == &sysfs_state_rw) {
+		bch2_string_opt_to_text(&out, bch2_dev_state,
+					ca->mi.state);
+		pr_buf(&out, "\n");
+		return out.pos - buf;
+	}
+
+	if (attr == &sysfs_iodone)
+		return show_dev_iodone(ca, buf);
+
+	sysfs_print(io_latency_read,		atomic64_read(&ca->cur_latency[READ]));
+	sysfs_print(io_latency_write,		atomic64_read(&ca->cur_latency[WRITE]));
+
+	if (attr == &sysfs_io_latency_stats_read)
+		return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
+	if (attr == &sysfs_io_latency_stats_write)
+		return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
+
+	/* Congestion as a percentage of CONGESTED_MAX, clamped to [0, 100]. */
+	sysfs_printf(congested,			"%u%%",
+		     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
+		     * 100 / CONGESTED_MAX);
+
+	if (attr == &sysfs_bucket_quantiles_last_read)
+		return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
+	if (attr == &sysfs_bucket_quantiles_last_write)
+		return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
+	if (attr == &sysfs_bucket_quantiles_fragmentation)
+		return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
+	if (attr == &sysfs_bucket_quantiles_oldest_gen)
+		return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
+
+	if (attr == &sysfs_reserve_stats)
+		return show_reserve_stats(ca, buf);
+	if (attr == &sysfs_alloc_debug)
+		return show_dev_alloc_debug(ca, buf);
+
+	return 0;
+}
+
+/*
+ * Store handler for per-device attributes.
+ *
+ * discard and cache_replacement_policy update this device's member entry in
+ * the superblock under c->sb_lock and write the superblock only when the
+ * value actually changes.  label sets the device's disk group from the
+ * trimmed input, and wake_allocator wakes the device's allocator.
+ *
+ * Returns @size on success or a negative errno.
+ */
+STORE(bch2_dev)
+{
+	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+	struct bch_fs *c = ca->fs;
+	struct bch_member *mi;
+
+	sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);
+
+	if (attr == &sysfs_discard) {
+		/* Any non-zero number enables discard. */
+		bool v = strtoul_or_return(buf);
+
+		mutex_lock(&c->sb_lock);
+		mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+
+		if (v != BCH_MEMBER_DISCARD(mi)) {
+			SET_BCH_MEMBER_DISCARD(mi, v);
+			bch2_write_super(c);
+		}
+		mutex_unlock(&c->sb_lock);
+	}
+
+	if (attr == &sysfs_cache_replacement_policy) {
+		ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
+
+		if (v < 0)
+			return v;
+
+		mutex_lock(&c->sb_lock);
+		mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+
+		if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
+			SET_BCH_MEMBER_REPLACEMENT(mi, v);
+			bch2_write_super(c);
+		}
+		mutex_unlock(&c->sb_lock);
+	}
+
+	if (attr == &sysfs_label) {
+		char *tmp;
+		int ret;
+
+		tmp = kstrdup(buf, GFP_KERNEL);
+		if (!tmp)
+			return -ENOMEM;
+
+		ret = bch2_dev_group_set(c, ca, strim(tmp));
+		kfree(tmp);
+		if (ret)
+			return ret;
+	}
+
+	if (attr == &sysfs_wake_allocator)
+		bch2_wake_allocator(ca);
+
+	return size;
+}
+SYSFS_OPS(bch2_dev);
+
+/* NULL-terminated attribute table served by bch2_dev_sysfs_ops. */
+struct attribute *bch2_dev_files[] = {
+	&sysfs_uuid,
+	&sysfs_bucket_size,
+	&sysfs_block_size,
+	&sysfs_first_bucket,
+	&sysfs_nbuckets,
+	&sysfs_durability,
+
+	/* settings: */
+	&sysfs_discard,
+	&sysfs_cache_replacement_policy,
+	&sysfs_state_rw,
+	&sysfs_label,
+
+	&sysfs_has_data,
+	&sysfs_iodone,
+
+	&sysfs_io_latency_read,
+	&sysfs_io_latency_write,
+	&sysfs_io_latency_stats_read,
+	&sysfs_io_latency_stats_write,
+	&sysfs_congested,
+
+	/* alloc info - other stats: */
+	&sysfs_bucket_quantiles_last_read,
+	&sysfs_bucket_quantiles_last_write,
+	&sysfs_bucket_quantiles_fragmentation,
+	&sysfs_bucket_quantiles_oldest_gen,
+
+	&sysfs_reserve_stats,
+
+	/* debug: */
+	&sysfs_alloc_debug,
+	&sysfs_wake_allocator,
+
+	sysfs_pd_controller_files(copy_gc),
+	NULL
+};
+
+#endif  /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
new file mode 100644
index 000000000000..525fd05d91f7
--- /dev/null
+++ b/fs/bcachefs/sysfs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SYSFS_H_
+#define _BCACHEFS_SYSFS_H_
+
+#include <linux/sysfs.h>
+
+#ifndef NO_BCACHEFS_SYSFS
+
+struct attribute;
+struct sysfs_ops;
+
+extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_internal_files[];
+extern struct attribute *bch2_fs_opts_dir_files[];
+extern struct attribute *bch2_fs_time_stats_files[];
+extern struct attribute *bch2_dev_files[];
+
+extern struct sysfs_ops bch2_fs_sysfs_ops;
+extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern struct sysfs_ops bch2_dev_sysfs_ops;
+
+int bch2_opts_create_sysfs_files(struct kobject *);
+
+#else
+
+static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_internal_files[] = {};
+static struct attribute *bch2_fs_opts_dir_files[] = {};
+static struct attribute *bch2_fs_time_stats_files[] = {};
+static struct attribute *bch2_dev_files[] = {};
+
+static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+static const struct sysfs_ops bch2_dev_sysfs_ops;
+
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+
+#endif /* NO_BCACHEFS_SYSFS */
+
+#endif  /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
new file mode 100644
index 000000000000..724f41e6590c
--- /dev/null
+++ b/fs/bcachefs/tests.c
@@ -0,0 +1,678 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_TESTS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "journal_reclaim.h"
+#include "tests.h"
+
+#include "linux/kthread.h"
+#include "linux/random.h"
+
+static void delete_test_keys(struct bch_fs *c)
+{	/* Clear POS(0, 0)..POS(0, U64_MAX) in the EXTENTS and DIRENTS btrees. */
+	int ret;
+
+	ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+				      POS(0, 0), POS(0, U64_MAX),
+				      NULL);
+	BUG_ON(ret);	/* test helper: any failure is fatal */
+
+	ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+				      POS(0, 0), POS(0, U64_MAX),
+				      NULL);
+	BUG_ON(ret);
+}
+
+/* unit tests */
+
+static void test_delete(struct bch_fs *c, u64 nr)
+{	/* Insert one cookie key, then delete at the same iterator twice. @nr is unused. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_i_cookie k;
+	int ret;
+
+	bkey_cookie_init(&k.k_i);
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p,
+				   BTREE_ITER_INTENT);
+
+	ret = bch2_btree_iter_traverse(iter);
+	BUG_ON(ret);
+
+	bch2_trans_update(&trans, iter, &k.k_i);	/* queue the insert... */
+	ret = bch2_trans_commit(&trans, NULL, NULL, 0);	/* ...and commit it */
+	BUG_ON(ret);
+
+	pr_info("deleting once");
+	ret = bch2_btree_delete_at(&trans, iter, 0);
+	BUG_ON(ret);
+
+	pr_info("deleting twice");
+	ret = bch2_btree_delete_at(&trans, iter, 0);	/* second delete must also return 0 */
+	BUG_ON(ret);
+
+	bch2_trans_exit(&trans);
+}
+
+static void test_delete_written(struct bch_fs *c, u64 nr)
+{	/* Like test_delete(), but flushes journal pins between insert and delete. @nr is unused. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_i_cookie k;
+	int ret;
+
+	bkey_cookie_init(&k.k_i);
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p,
+				   BTREE_ITER_INTENT);
+
+	ret = bch2_btree_iter_traverse(iter);
+	BUG_ON(ret);
+
+	bch2_trans_update(&trans, iter, &k.k_i);
+	ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+	BUG_ON(ret);
+
+	bch2_journal_flush_all_pins(&c->journal);	/* presumably forces the key to be written out first -- TODO confirm */
+
+	ret = bch2_btree_delete_at(&trans, iter, 0);
+	BUG_ON(ret);
+
+	bch2_trans_exit(&trans);
+}
+
+static void test_iterate(struct bch_fs *c, u64 nr)
+{	/* Insert @nr keys at offsets 0..nr-1, then walk them forwards and backwards. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	delete_test_keys(c);	/* start from an empty key range */
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i++) {
+		struct bkey_i_cookie k;	/* shadows the outer k on purpose-built scope */
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i;
+
+		ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+					NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+			   POS_MIN, 0, k, ret)
+		BUG_ON(k.k->p.offset != i++);	/* keys must come back in offset order */
+
+	BUG_ON(i != nr);	/* every inserted key was seen; ret from the loop is not checked */
+
+	pr_info("iterating backwards");
+
+	while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k))
+		BUG_ON(k.k->p.offset != --i);	/* same keys, descending */
+
+	BUG_ON(i);	/* walked all the way back to offset 0 */
+
+	bch2_trans_exit(&trans);
+}
+
+static void test_iterate_extents(struct bch_fs *c, u64 nr)
+{	/* Insert back-to-back size-8 extents covering [0, nr), then walk them both ways. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	delete_test_keys(c);
+
+	pr_info("inserting test extents");
+
+	for (i = 0; i < nr; i += 8) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i + 8;	/* key position is the END of the extent */
+		k.k.size = 8;
+
+		ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+					NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+			   POS_MIN, 0, k, ret) {
+		BUG_ON(bkey_start_offset(k.k) != i);	/* each extent starts where the last ended */
+		i = k.k->p.offset;
+	}
+
+	BUG_ON(i != nr);	/* only holds when nr is a multiple of 8 */
+
+	pr_info("iterating backwards");
+
+	while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) {
+		BUG_ON(k.k->p.offset != i);
+		i = bkey_start_offset(k.k);
+	}
+
+	BUG_ON(i);
+
+	bch2_trans_exit(&trans);
+}
+
+static void test_iterate_slots(struct bch_fs *c, u64 nr)
+{	/* Insert keys at even offsets, then compare plain iteration with BTREE_ITER_SLOTS. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i++) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i * 2;	/* leave every odd offset empty */
+
+		ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+					NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+			   0, k, ret) {
+		BUG_ON(k.k->p.offset != i);	/* only the even offsets are returned */
+		i += 2;
+	}
+	bch2_trans_iter_free(&trans, iter);
+
+	BUG_ON(i != nr * 2);
+
+	pr_info("iterating forwards by slots");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+			   BTREE_ITER_SLOTS, k, ret) {
+		BUG_ON(bkey_deleted(k.k) != (i & 1));	/* odd slots must read as deleted */
+		BUG_ON(k.k->p.offset != i++);	/* and no offset may be skipped */
+
+		if (i == nr * 2)	/* stop once past the inserted range */
+			break;
+	}
+
+	bch2_trans_exit(&trans);
+}
+
+static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+{	/* Insert size-8 extents with size-8 gaps, then check plain and slot iteration. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i += 16) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i + 16;	/* extent covers [i + 8, i + 16) */
+		k.k.size = 8;
+
+		ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+					NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+			   0, k, ret) {
+		BUG_ON(bkey_start_offset(k.k) != i + 8);	/* gaps are skipped */
+		BUG_ON(k.k->size != 8);
+		i += 16;
+	}
+	bch2_trans_iter_free(&trans, iter);
+
+	BUG_ON(i != nr);	/* only holds when nr is a multiple of 16 */
+
+	pr_info("iterating forwards by slots");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+			   BTREE_ITER_SLOTS, k, ret) {
+		BUG_ON(bkey_deleted(k.k) != !(i % 16));	/* the gap at each multiple of 16 reads as deleted */
+
+		BUG_ON(bkey_start_offset(k.k) != i);	/* gaps and extents tile the range */
+		BUG_ON(k.k->size != 8);
+		i = k.k->p.offset;
+
+		if (i == nr)
+			break;
+	}
+
+	bch2_trans_exit(&trans);
+}
+
+/*
+ * XXX: we really want to make sure we've got a btree with depth > 0 for these
+ * tests
+ */
+static void test_peek_end(struct bch_fs *c, u64 nr)
+{	/* Peek twice from POS_MIN and require no key both times. @nr is unused. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0);
+
+	k = bch2_btree_iter_peek(iter);
+	BUG_ON(k.k);	/* assumes the DIRENTS btree is empty here -- TODO confirm with callers */
+
+	k = bch2_btree_iter_peek(iter);	/* peeking again must still find nothing */
+	BUG_ON(k.k);
+
+	bch2_trans_exit(&trans);
+}
+
+static void test_peek_end_extents(struct bch_fs *c, u64 nr)
+{	/* Same as test_peek_end(), against the EXTENTS btree. @nr is unused. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0);
+
+	k = bch2_btree_iter_peek(iter);
+	BUG_ON(k.k);	/* assumes the EXTENTS btree is empty here -- TODO confirm with callers */
+
+	k = bch2_btree_iter_peek(iter);	/* peeking again must still find nothing */
+	BUG_ON(k.k);
+
+	bch2_trans_exit(&trans);
+}
+
+/* extent unit tests */
+
+static u64 test_version;	/* version stamped on each test extent; file-local */
+
+static void insert_test_extent(struct bch_fs *c,
+			       u64 start, u64 end)
+{	/* Insert a cookie extent covering [start, end) with the next test_version. */
+	struct bkey_i_cookie k;
+	int ret;
+
+	//pr_info("inserting %llu-%llu v %llu", start, end, test_version);
+
+	bkey_cookie_init(&k.k_i);
+	k.k_i.k.p.offset = end;	/* key position is the end of the extent */
+	k.k_i.k.size = end - start;
+	k.k_i.k.version.lo = test_version++;	/* unsynchronised counter */
+
+	ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+				NULL, NULL, 0);
+	BUG_ON(ret);
+}
+
+static void __test_extent_overwrite(struct bch_fs *c,
+				    u64 e1_start, u64 e1_end,
+				    u64 e2_start, u64 e2_end)
+{	/* Insert extent e1, then e2 over it, then clean up. No result is checked beyond insert success. */
+	insert_test_extent(c, e1_start, e1_end);
+	insert_test_extent(c, e2_start, e2_end);
+
+	delete_test_keys(c);
+}
+
+static void test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+{	/* Second extent overlaps the front of the first. @nr is unused. */
+	__test_extent_overwrite(c, 0, 64, 0, 32);	/* same start */
+	__test_extent_overwrite(c, 8, 64, 0, 32);	/* starts before the first extent */
+}
+
+static void test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+{	/* Second extent overlaps the back of the first. @nr is unused. */
+	__test_extent_overwrite(c, 0, 64, 32, 64);	/* same end */
+	__test_extent_overwrite(c, 0, 64, 32, 72);	/* runs past the first extent */
+}
+
+static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+{	/* Second extent lies strictly inside the first. @nr is unused. */
+	__test_extent_overwrite(c, 0, 64, 32, 40);
+}
+
+static void test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+{	/* Second extent covers the first entirely. @nr is unused. */
+	__test_extent_overwrite(c, 32, 64,  0,  64);	/* starts earlier, same end */
+	__test_extent_overwrite(c, 32, 64,  0, 128);	/* larger on both sides */
+	__test_extent_overwrite(c, 32, 64, 32,  64);	/* identical range */
+	__test_extent_overwrite(c, 32, 64, 32, 128);	/* same start, ends later */
+}
+
+/* perf tests */
+
+static u64 test_rand(void)
+{	/* Return 64 pseudo-random bits. */
+	u64 v;
+#if 0
+	v = prandom_u32();	/* disabled alternative: 32 bits only */
+#else
+	prandom_bytes(&v, sizeof(v));	/* fills all eight bytes of v */
+#endif
+	return v;
+}
+
+static void rand_insert(struct bch_fs *c, u64 nr)
+{	/* Insert @nr cookie keys at random offsets in the DIRENTS btree. */
+	struct bkey_i_cookie k;
+	int ret;
+	u64 i;
+
+	for (i = 0; i < nr; i++) {
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = test_rand();	/* random offsets may repeat */
+
+		ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+					NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+}
+
+static void rand_lookup(struct bch_fs *c, u64 nr)
+{	/* Perform @nr peeks at random positions in the DIRENTS btree. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 i;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for (i = 0; i < nr; i++) {
+		iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+					   POS(0, test_rand()), 0);
+
+		k = bch2_btree_iter_peek(iter);	/* result is discarded; only the lookup matters */
+		bch2_trans_iter_free(&trans, iter);
+	}
+
+	bch2_trans_exit(&trans);
+}
+
+static void rand_mixed(struct bch_fs *c, u64 nr)
+{	/* @nr random lookups; every fourth one that finds a key also rewrites it. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+	u64 i;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for (i = 0; i < nr; i++) {
+		iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+					   POS(0, test_rand()), 0);
+
+		k = bch2_btree_iter_peek(iter);
+
+		if (!(i & 3) && k.k) {	/* i divisible by 4 and a key was found */
+			struct bkey_i_cookie k;	/* shadows the lookup result */
+
+			bkey_cookie_init(&k.k_i);
+			k.k.p = iter->pos;	/* write at the iterator's current position */
+
+			bch2_trans_update(&trans, iter, &k.k_i);
+			ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+			BUG_ON(ret);
+		}
+
+		bch2_trans_iter_free(&trans, iter);
+	}
+
+	bch2_trans_exit(&trans);
+}
+
+static void rand_delete(struct bch_fs *c, u64 nr)
+{	/* Insert @nr bkey_init()'d keys at random offsets in the DIRENTS btree. */
+	struct bkey_i k;
+	int ret;
+	u64 i;
+
+	for (i = 0; i < nr; i++) {
+		bkey_init(&k.k);	/* presumably a deleted/whiteout key, per the function name -- TODO confirm */
+		k.k.p.offset = test_rand();
+
+		ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
+					NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+}
+
+static void seq_insert(struct bch_fs *c, u64 nr)
+{	/* Walk DIRENTS slot by slot from POS_MIN, writing a cookie into each of the first @nr slots. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_i_cookie insert;
+	int ret;
+	u64 i = 0;
+
+	bkey_cookie_init(&insert.k_i);
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		insert.k.p = iter->pos;	/* reuse one key, repositioned each iteration */
+
+		bch2_trans_update(&trans, iter, &insert.k_i);
+		ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+		BUG_ON(ret);
+
+		if (++i == nr)	/* nr == 0 never matches, so the loop then runs until the iterator ends */
+			break;
+	}
+	bch2_trans_exit(&trans);
+}
+
+static void seq_lookup(struct bch_fs *c, u64 nr)
+{	/* Iterate over every key in the DIRENTS btree, doing nothing with them. @nr is unused. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret)
+		;	/* empty body: only the traversal itself is exercised */
+	bch2_trans_exit(&trans);
+}
+
+static void seq_overwrite(struct bch_fs *c, u64 nr)
+{	/* Rewrite every key in the DIRENTS btree with a copy of itself. @nr is unused. */
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+			   BTREE_ITER_INTENT, k, ret) {
+		struct bkey_i_cookie u;
+
+		bkey_reassemble(&u.k_i, k);	/* assumes k fits in a bkey_i_cookie -- TODO confirm */
+
+		bch2_trans_update(&trans, iter, &u.k_i);
+		ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+	bch2_trans_exit(&trans);
+}
+
+static void seq_delete(struct bch_fs *c, u64 nr)
+{	/* Delete POS(0, 0)..POS(0, U64_MAX) from the DIRENTS btree in one call. @nr is unused. */
+	int ret;
+
+	ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+				      POS(0, 0), POS(0, U64_MAX),
+				      NULL);
+	BUG_ON(ret);
+}
+
+typedef void (*perf_test_fn)(struct bch_fs *, u64);
+
+struct test_job {	/* shared state for one bch2_btree_perf_test() run */
+	struct bch_fs			*c;
+	u64				nr;		/* total iterations, split across threads */
+	unsigned			nr_threads;
+	perf_test_fn			fn;		/* test body each thread runs */
+
+	atomic_t			ready;		/* threads that have not yet reached the start barrier */
+	wait_queue_head_t		ready_wait;
+
+	atomic_t			done;		/* threads that have not yet finished fn */
+	struct completion		done_completion;
+
+	u64				start;		/* sched_clock() when the last thread became ready */
+	u64				finish;		/* sched_clock() when the last thread finished */
+};
+
+static int btree_perf_test_thread(void *data)
+{	/* Thread body: wait at the start barrier, run the test, signal when the last thread is done. */
+	struct test_job *j = data;
+
+	if (atomic_dec_and_test(&j->ready)) {	/* last thread to arrive releases the others */
+		wake_up(&j->ready_wait);
+		j->start = sched_clock();	/* timing starts once everyone is ready */
+	} else {
+		wait_event(j->ready_wait, !atomic_read(&j->ready));
+	}
+
+	j->fn(j->c, j->nr / j->nr_threads);	/* each thread gets an equal share; remainder is dropped */
+
+	if (atomic_dec_and_test(&j->done)) {	/* last thread to finish stops the clock */
+		j->finish = sched_clock();
+		complete(&j->done_completion);
+	}
+
+	return 0;
+}
+
+void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+			  u64 nr, unsigned nr_threads)
+{	/* Look up @testname, run it with @nr iterations over @nr_threads threads, print the timing. */
+	struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };	/* j.fn starts NULL */
+	char name_buf[20], nr_buf[20], per_sec_buf[20];
+	unsigned i;
+	u64 time;
+
+	atomic_set(&j.ready, nr_threads);
+	init_waitqueue_head(&j.ready_wait);
+
+	atomic_set(&j.done, nr_threads);
+	init_completion(&j.done_completion);
+
+#define perf_test(_test)				\
+	if (!strcmp(testname, #_test)) j.fn = _test
+
+	perf_test(rand_insert);
+	perf_test(rand_lookup);
+	perf_test(rand_mixed);
+	perf_test(rand_delete);
+
+	perf_test(seq_insert);
+	perf_test(seq_lookup);
+	perf_test(seq_overwrite);
+	perf_test(seq_delete);
+
+	/* unit tests, not perf tests: */
+	perf_test(test_delete);
+	perf_test(test_delete_written);
+	perf_test(test_iterate);
+	perf_test(test_iterate_extents);
+	perf_test(test_iterate_slots);
+	perf_test(test_iterate_slots_extents);
+	perf_test(test_peek_end);
+	perf_test(test_peek_end_extents);
+
+	perf_test(test_extent_overwrite_front);
+	perf_test(test_extent_overwrite_back);
+	perf_test(test_extent_overwrite_middle);
+	perf_test(test_extent_overwrite_all);
+
+	if (!j.fn) {	/* no name matched */
+		pr_err("unknown test %s", testname);
+		return;
+	}
+
+	//pr_info("running test %s:", testname);
+
+	if (nr_threads == 1)	/* single-threaded: run inline in the caller */
+		btree_perf_test_thread(&j);
+	else	/* NOTE(review): nr_threads == 0 starts nothing, so the wait below never completes */
+		for (i = 0; i < nr_threads; i++)
+			kthread_run(btree_perf_test_thread, &j,	/* NOTE(review): return value unchecked; a failed start leaves j.done above zero */
+				    "bcachefs perf test[%u]", i);
+
+	while (wait_for_completion_interruptible(&j.done_completion))
+		;	/* retry on interruption: j lives on this stack and must outlive the threads */
+
+	time = j.finish - j.start;
+
+	scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+	bch2_hprint(&PBUF(nr_buf), nr);
+	bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time);	/* NOTE(review): divides by zero if time == 0 */
+	printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+		name_buf, nr_buf, nr_threads,
+		time / NSEC_PER_SEC,
+		time * nr_threads / nr,	/* NOTE(review): divides by zero if nr == 0 */
+		per_sec_buf);
+}
+
+#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
new file mode 100644
index 000000000000..551d0764225e
--- /dev/null
+++ b/fs/bcachefs/tests.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TEST_H
+#define _BCACHEFS_TEST_H
+
+struct bch_fs;
+
+#ifdef CONFIG_BCACHEFS_TESTS
+
+void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+
+#else
+
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
new file mode 100644
index 000000000000..59e8dfa3d245
--- /dev/null
+++ b/fs/bcachefs/trace.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "btree_types.h"
+#include "keylist.h"
+
+#include <linux/blktrace_api.h>
+#include "keylist.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcachefs.h>
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
new file mode 100644
index 000000000000..2cc433ec0e3a
--- /dev/null
+++ b/fs/bcachefs/util.c
@@ -0,0 +1,910 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/log2.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/sched/clock.h>
+
+#include "eytzinger.h"
+#include "util.h"
+
+static const char si_units[] = "?kMGTPEZY";
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+			 u64 t_max, bool t_signed)
+{	/* Parse a decimal integer with optional sign and k/M/G/... suffix; 0, -EINVAL or -ERANGE. */
+	bool positive = *cp != '-';
+	unsigned u;
+	u64 v = 0;
+
+	if (*cp == '+' || *cp == '-')
+		cp++;
+
+	if (!isdigit(*cp))	/* at least one digit is required */
+		return -EINVAL;
+
+	do {
+		if (v > U64_MAX / 10)	/* v * 10 would overflow */
+			return -ERANGE;
+		v *= 10;
+		if (v > U64_MAX - (*cp - '0'))	/* adding the digit would overflow */
+			return -ERANGE;
+		v += *cp - '0';
+		cp++;
+	} while (isdigit(*cp));
+
+	for (u = 1; u < strlen(si_units); u++)	/* index 0 ('?') is a placeholder, not a suffix */
+		if (*cp == si_units[u]) {
+			cp++;
+			goto got_unit;
+		}
+	u = 0;	/* no suffix: scale by 1 */
+got_unit:
+	if (*cp == '\n')	/* tolerate one trailing newline */
+		cp++;
+	if (*cp)	/* anything else after the number is an error */
+		return -EINVAL;
+
+	if (fls64(v) + u * 10 > 64)	/* the shift below would push bits past bit 63 */
+		return -ERANGE;
+
+	v <<= u * 10;	/* each suffix step multiplies by 1024 */
+
+	if (positive) {
+		if (v > t_max)
+			return -ERANGE;
+	} else {
+		if (v && !t_signed)	/* "-0" is accepted for unsigned types */
+			return -ERANGE;
+
+		if (v > t_max + 1)	/* most negative value has magnitude t_max + 1 */
+			return -ERANGE;
+		v = -v;
+	}
+
+	*res = v;	/* only written on success */
+	return 0;
+}
+
+#define STRTO_H(name, type)					\
+int bch2_ ## name ## _h(const char *cp, type *res)		\
+{								\
+	u64 v = 0; /* stays 0 if parsing fails */		\
+	int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type),	\
+			ANYSINT_MAX(type) != ((type) ~0ULL));	\
+	*res = v;						\
+	return ret;						\
+}
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+STRTO_H(strtou64, u64)
+
+void bch2_hprint(struct printbuf *buf, s64 v)
+{	/* Print v scaled down by powers of 1024, with one fractional digit and a unit suffix. */
+	int u, t = 0;
+
+	for (u = 0; v >= 1024 || v <= -1024; u++) {
+		t = v & ~(~0U << 10);	/* low 10 bits about to be shifted out */
+		v >>= 10;
+	}
+
+	pr_buf(buf, "%lli", v);
+
+	/*
+	 * 103 is magic: t holds 10 masked bits, i.e. [0, 1023], and we
+	 * want to turn it into a single decimal digit [0, 9]
+	 */
+	if (u && v < 100 && v > -100)
+		pr_buf(buf, ".%i", t / 103);
+	if (u)
+		pr_buf(buf, "%c", si_units[u]);
+}
+
+void bch2_string_opt_to_text(struct printbuf *out,
+			     const char * const list[],
+			     size_t selected)
+{	/* Print every entry of the NULL-terminated @list, bracketing entry @selected. */
+	size_t i;
+
+	for (i = 0; list[i]; i++)
+		pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]);	/* each entry is followed by a space */
+}
+
+void bch2_flags_to_text(struct printbuf *out,
+			const char * const list[], u64 flags)
+{	/* Print the names of the bits set in @flags, comma separated, from NULL-terminated @list. */
+	unsigned bit, nr = 0;
+	bool first = true;
+
+	if (out->pos != out->end)	/* leave a valid empty string when no flag is printed */
+		*out->pos = '\0';
+
+	while (list[nr])	/* count the names available */
+		nr++;
+
+	while (flags && (bit = __ffs(flags)) < nr) {	/* stop at the first bit without a name */
+		if (!first)
+			pr_buf(out, ",");
+		first = false;
+		pr_buf(out, "%s", list[bit]);
+		flags ^= 1ULL << bit;	/* 64-bit constant: flags is u64, bit may be >= 31 */
+	}
+}
+
+u64 bch2_read_flag_list(char *opt, const char * const list[])
+{	/* Parse a comma-separated list of names from @list into a bitmask; (u64) -1 on an unknown name. */
+	u64 ret = 0;
+	char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL);	/* work on a copy: strsep() modifies it */
+
+	if (!d)
+		return -ENOMEM;	/* note: converted to u64, so callers see a large bitmask */
+
+	s = strim(d);
+
+	while ((p = strsep(&s, ","))) {
+		int flag = match_string(list, -1, p);
+		if (flag < 0) {
+			ret = -1;
+			break;
+		}
+
+		ret |= 1ULL << flag;	/* 64-bit constant: ret is u64, flag may be >= 31 */
+	}
+
+	kfree(d);
+
+	return ret;
+}
+
+bool bch2_is_zero(const void *_p, size_t n)
+{	/* Return true when all @n bytes at @_p are zero (vacuously true for n == 0). */
+	const char *p = _p;
+	size_t i;
+
+	for (i = 0; i < n; i++)
+		if (p[i])	/* first non-zero byte settles it */
+			return false;
+	return true;
+}
+
+static void bch2_quantiles_update(struct quantiles *q, u64 v)
+{	/* Nudge the quantile estimates in q->entries towards sample @v, walking an eytzinger-ordered tree. */
+	unsigned i = 0;
+
+	while (i < ARRAY_SIZE(q->entries)) {
+		struct quantile_entry *e = q->entries + i;
+
+		if (unlikely(!e->step)) {	/* first sample for this entry: seed it */
+			e->m = v;
+			e->step = max_t(unsigned, v / 2, 1024);
+		} else if (e->m > v) {	/* estimate too high: step down, clamped at 0 */
+			e->m = e->m >= e->step
+				? e->m - e->step
+				: 0;
+		} else if (e->m < v) {	/* estimate too low: step up, saturating on wraparound */
+			e->m = e->m + e->step > e->m
+				? e->m + e->step
+				: U32_MAX;
+		}
+
+		if ((e->m > v ? e->m - v : v - e->m) < e->step)	/* within one step: halve the step, floor 1 */
+			e->step = max_t(unsigned, e->step / 2, 1);
+
+		if (v >= e->m)
+			break;
+
+		i = eytzinger0_child(i, v > e->m);	/* NOTE(review): v < e->m here, so the selector is always false -- confirm intent */
+	}
+}
+
+/* time stats: */
+
+static void bch2_time_stats_update_one(struct time_stats *stats,
+				       u64 start, u64 end)
+{	/* Fold one [start, end] event into @stats. */
+	u64 duration, freq;
+
+	duration	= time_after64(end, start)	/* clamp to 0 if the timestamps are out of order */
+		? end - start : 0;
+	freq		= time_after64(end, stats->last_event)	/* time since the previous event */
+		? end - stats->last_event : 0;
+
+	stats->count++;
+
+	stats->average_duration = stats->average_duration	/* a zero average is seeded with the sample */
+		? ewma_add(stats->average_duration, duration, 6)
+		: duration;
+
+	stats->average_frequency = stats->average_frequency
+		? ewma_add(stats->average_frequency, freq, 6)
+		: freq;
+
+	stats->max_duration = max(stats->max_duration, duration);
+
+	stats->last_event = end;
+
+	bch2_quantiles_update(&stats->quantiles, duration);
+}
+
+void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
+{	/* Record one event: directly under the lock, or via a per-cpu buffer once one exists. */
+	unsigned long flags;
+
+	if (!stats->buffer) {	/* no per-cpu buffer yet: update in place */
+		spin_lock_irqsave(&stats->lock, flags);
+		bch2_time_stats_update_one(stats, start, end);
+
+		if (stats->average_frequency < 32 &&	/* average gap between events is small... */
+		    stats->count > 1024)	/* ...and enough events have been seen */
+			stats->buffer =
+				alloc_percpu_gfp(struct time_stat_buffer,
+						 GFP_ATOMIC);	/* on failure buffer stays NULL and this path is used again */
+		spin_unlock_irqrestore(&stats->lock, flags);
+	} else {
+		struct time_stat_buffer_entry *i;
+		struct time_stat_buffer *b;
+
+		preempt_disable();	/* stay on this CPU while touching its buffer */
+		b = this_cpu_ptr(stats->buffer);
+
+		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+		b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+			.start = start,
+			.end = end
+		};
+
+		if (b->nr == ARRAY_SIZE(b->entries)) {	/* buffer full: flush every entry under the lock */
+			spin_lock_irqsave(&stats->lock, flags);
+			for (i = b->entries;
+			     i < b->entries + ARRAY_SIZE(b->entries);
+			     i++)
+				bch2_time_stats_update_one(stats, i->start, i->end);
+			spin_unlock_irqrestore(&stats->lock, flags);
+
+			b->nr = 0;
+		}
+
+		preempt_enable();
+	}
+}
+
+static const struct time_unit {	/* display units, smallest first */
+	const char	*name;
+	u32		nsecs;	/* nanoseconds per unit */
+} time_units[] = {
+	{ "ns",		1		},
+	{ "us",		NSEC_PER_USEC	},
+	{ "ms",		NSEC_PER_MSEC	},
+	{ "sec",	NSEC_PER_SEC	},
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{	/* Return the largest unit for which @ns is at least two of that unit (never below "ns"). */
+	const struct time_unit *u;
+
+	for (u = time_units;
+	     u + 1 < time_units + ARRAY_SIZE(time_units) &&	/* a larger unit exists... */
+	     ns >= u[1].nsecs << 1;	/* ...and ns is at least twice that unit */
+	     u++)
+		;
+
+	return u;
+}
+
+static void pr_time_units(struct printbuf *out, u64 ns)
+{	/* Print @ns as an integer count of the unit chosen by pick_time_units(), followed by its name. */
+	const struct time_unit *u = pick_time_units(ns);
+
+	pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);	/* integer division */
+}
+
+size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
+{	/* Format @stats as text into @buf (at most @len bytes); returns out.pos - buf. */
+	struct printbuf out = _PBUF(buf, len);
+	const struct time_unit *u;
+	u64 freq = READ_ONCE(stats->average_frequency);	/* single read: used for both rate and frequency */
+	u64 q, last_q = 0;
+	int i;
+
+	pr_buf(&out, "count:\t\t%llu\n",
+			 stats->count);
+	pr_buf(&out, "rate:\t\t%llu/sec\n",
+	       freq ?  div64_u64(NSEC_PER_SEC, freq) : 0);	/* avoid dividing by zero */
+
+	pr_buf(&out, "frequency:\t");
+	pr_time_units(&out, freq);
+
+	pr_buf(&out, "\navg duration:\t");
+	pr_time_units(&out, stats->average_duration);
+
+	pr_buf(&out, "\nmax duration:\t");
+	pr_time_units(&out, stats->max_duration);
+
+	i = eytzinger0_first(NR_QUANTILES);
+	u = pick_time_units(stats->quantiles.entries[i].m);	/* one unit, chosen from the first entry, for all quantiles */
+
+	pr_buf(&out, "\nquantiles (%s):\t", u->name);
+	eytzinger0_for_each(i, NR_QUANTILES) {
+		bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+		q = max(stats->quantiles.entries[i].m, last_q);	/* force the printed sequence to be non-decreasing */
+		pr_buf(&out, "%llu%s",
+		       div_u64(q, u->nsecs),
+		       is_last ? "\n" : " ");
+		last_q = q;
+	}
+
+	return out.pos - buf;
+}
+
+void bch2_time_stats_exit(struct time_stats *stats)
+{	/* Release the per-cpu buffer, if one was allocated. */
+	free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct time_stats *stats)
+{	/* Zero @stats (so buffer is NULL) and initialise its lock. */
+	memset(stats, 0, sizeof(*stats));
+	spin_lock_init(&stats->lock);
+}
+
+/* ratelimit: */
+
+/**
+ * bch2_ratelimit_delay() - return how long to delay until the next time to do
+ * some work
+ *
+ * @d - the struct bch_ratelimit to update
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
+{
+	u64 now = local_clock();
+
+	return time_after64(d->next, now)	/* next is still in the future */
+		? nsecs_to_jiffies(d->next - now)
+		: 0;	/* already due: no delay */
+}
+
+/**
+ * bch2_ratelimit_increment() - increment @d by the amount of work done
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ */
+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+	u64 now = local_clock();
+
+	d->next += div_u64(done * NSEC_PER_SEC, d->rate);	/* assumes d->rate != 0 -- TODO confirm callers guarantee it */
+
+	if (time_before64(now + NSEC_PER_SEC, d->next))	/* never schedule more than 1s ahead */
+		d->next = now + NSEC_PER_SEC;
+
+	if (time_after64(now - NSEC_PER_SEC * 2, d->next))	/* never fall more than 2s behind */
+		d->next = now - NSEC_PER_SEC * 2;
+}
+
+/* pd controller: */
+
+/*
+ * Updates pd_controller. Attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ */
+void bch2_pd_controller_update(struct bch_pd_controller *pd,
+			      s64 target, s64 actual, int sign)
+{
+	s64 proportional, derivative, change;
+
+	unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;	/* whole seconds only */
+
+	if (seconds_since_update == 0)	/* at most one update per second */
+		return;
+
+	pd->last_update = jiffies;
+
+	proportional = actual - target;
+	proportional *= seconds_since_update;
+	proportional = div_s64(proportional, pd->p_term_inverse);
+
+	derivative = actual - pd->last_actual;
+	derivative = div_s64(derivative, seconds_since_update);
+	derivative = ewma_add(pd->smoothed_derivative, derivative,	/* ewma_add is not visible here; presumably also updates smoothed_derivative */
+			      (pd->d_term / seconds_since_update) ?: 1);	/* weight comes from d_term; d_smooth is not read here */
+	derivative = derivative * pd->d_term;
+	derivative = div_s64(derivative, pd->p_term_inverse);
+
+	change = proportional + derivative;
+
+	/* Don't increase rate if not keeping up */
+	if (change > 0 &&
+	    pd->backpressure &&
+	    time_after64(local_clock(),
+			 pd->rate.next + NSEC_PER_MSEC))	/* more than 1ms past the scheduled time */
+		change = 0;
+
+	change *= (sign * -1);
+
+	pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
+				1, UINT_MAX);	/* rate never drops to zero */
+
+	pd->last_actual		= actual;	/* the last_* fields are read by bch2_pd_controller_print_debug() */
+	pd->last_derivative	= derivative;
+	pd->last_proportional	= proportional;
+	pd->last_change		= change;
+	pd->last_target		= target;
+}
+
+void bch2_pd_controller_init(struct bch_pd_controller *pd)
+{	/* Set the controller's default tuning; other fields are left as the caller provided them. */
+	pd->rate.rate		= 1024;
+	pd->last_update		= jiffies;
+	pd->p_term_inverse	= 6000;
+	pd->d_term		= 30;
+	pd->d_smooth		= pd->d_term;
+	pd->backpressure	= 1;	/* enables the "not keeping up" check in bch2_pd_controller_update() */
+}
+
+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+{	/* Format the controller's last update into @buf; returns sprintf()'s byte count. */
+	/* 2^64 - 1 is 20 digits, plus null byte */
+	char rate[21];
+	char actual[21];
+	char target[21];
+	char proportional[21];
+	char derivative[21];
+	char change[21];
+	s64 next_io;
+
+	bch2_hprint(&PBUF(rate),	pd->rate.rate);
+	bch2_hprint(&PBUF(actual),	pd->last_actual);
+	bch2_hprint(&PBUF(target),	pd->last_target);
+	bch2_hprint(&PBUF(proportional), pd->last_proportional);
+	bch2_hprint(&PBUF(derivative),	pd->last_derivative);
+	bch2_hprint(&PBUF(change),	pd->last_change);
+
+	next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);	/* negative when the next io is overdue */
+
+	return sprintf(buf,	/* unbounded: @buf's size is not passed in -- presumably a sysfs page, TODO confirm */
+		       "rate:\t\t%s/sec\n"
+		       "target:\t\t%s\n"
+		       "actual:\t\t%s\n"
+		       "proportional:\t%s\n"
+		       "derivative:\t%s\n"
+		       "change:\t\t%s/sec\n"
+		       "next io:\t%llims\n",
+		       rate, target, actual, proportional,
+		       derivative, change, next_io);
+}
+
+/* misc: */
+
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
+{	/* Add the @size bytes at @base to @bio, one page-bounded piece at a time. */
+	while (size) {
+		struct page *page = is_vmalloc_addr(base)	/* vmalloc memory needs its own page lookup */
+				? vmalloc_to_page(base)
+				: virt_to_page(base);
+		unsigned offset = offset_in_page(base);
+		unsigned len = min_t(size_t, PAGE_SIZE - offset, size);	/* never cross a page boundary */
+
+		BUG_ON(!bio_add_page(bio, page, len, offset));	/* the bio must have room for every piece */
+		size -= len;
+		base += len;
+	}
+}
+
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
+{	/* Allocate pages covering @size bytes and add them to @bio; 0 or -ENOMEM. */
+	while (size) {
+		struct page *page = alloc_page(gfp_mask);
+		unsigned len = min(PAGE_SIZE, size);	/* last page may be partial */
+
+		if (!page)	/* pages added on earlier iterations stay in the bio; nothing is freed here */
+			return -ENOMEM;
+
+		BUG_ON(!bio_add_page(bio, page, len, 0));
+		size -= len;
+	}
+
+	return 0;
+}
+
+size_t bch2_rand_range(size_t max)
+{	/* Return a random value in [0, max), or 0 when max is 0. */
+	size_t rand;
+
+	if (!max)
+		return 0;
+
+	do {
+		rand = get_random_long();
+		rand &= roundup_pow_of_two(max) - 1;	/* mask to the next power of two... */
+	} while (rand >= max);	/* ...and retry values that fall outside the range */
+
+	return rand;
+}
+
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+{	/* Copy from the flat buffer @src into the segments of @dst, starting at @dst_iter. */
+	struct bio_vec bv;
+	struct bvec_iter iter;
+
+	__bio_for_each_segment(bv, dst, iter, dst_iter) {
+		void *dstp = kmap_atomic(bv.bv_page);	/* map the segment's page for the copy */
+		memcpy(dstp + bv.bv_offset, src, bv.bv_len);
+		kunmap_atomic(dstp);
+
+		src += bv.bv_len;	/* @src must hold at least the total segment length */
+	}
+}
+
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{	/* Copy the segments of @src, starting at @src_iter, into the flat buffer @dst. */
+	struct bio_vec bv;
+	struct bvec_iter iter;
+
+	__bio_for_each_segment(bv, src, iter, src_iter) {
+		void *srcp = kmap_atomic(bv.bv_page);	/* map the segment's page for the copy */
+		memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
+		kunmap_atomic(srcp);
+
+		dst += bv.bv_len;	/* @dst must hold at least the total segment length */
+	}
+}
+
+void bch_scnmemcpy(struct printbuf *out,
+		   const char *src, size_t len)
+{	/* Append up to @len bytes of @src to @out, truncating to fit, and NUL-terminate. */
+	size_t n = printbuf_remaining(out);
+
+	if (n) {	/* a full buffer is left untouched */
+		n = min(n - 1, len);	/* reserve one byte for the terminator */
+		memcpy(out->pos, src, n);
+		out->pos += n;
+		*out->pos = '\0';	/* pos is not advanced past the terminator */
+	}
+}
+
+#include "eytzinger.h"
+
+static int alignment_ok(const void *base, size_t align)
+{	/* True if @base is @align-aligned, or if the arch handles unaligned access efficiently. */
+	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+		((unsigned long)base & (align - 1)) == 0;	/* @align must be a power of two */
+}
+
+static void u32_swap(void *a, void *b, size_t size)
+{	/* Swap two u32 values; @size is ignored. */
+	u32 t = *(u32 *)a;
+	*(u32 *)a = *(u32 *)b;
+	*(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, size_t size)
+{	/* Swap two u64 values; @size is ignored. */
+	u64 t = *(u64 *)a;
+	*(u64 *)a = *(u64 *)b;
+	*(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, size_t size)
+{	/* Swap @size bytes between @a and @b, one byte at a time. */
+	char t;
+
+	do {
+		t = *(char *)a;
+		*(char *)a++ = *(char *)b;
+		*(char *)b++ = t;
+	} while (--size > 0);	/* do-while: @size must be at least 1 */
+}
+
+static inline int do_cmp(void *base, size_t n, size_t size,
+			 int (*cmp_func)(const void *, const void *, size_t),
+			 size_t l, size_t r)
+{	/* Compare the elements at in-order indices @l and @r of an @n-element eytzinger0 array. */
+	return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
+			base + inorder_to_eytzinger0(r, n) * size,
+			size);
+}
+
+static inline void do_swap(void *base, size_t n, size_t size,
+			   void (*swap_func)(void *, void *, size_t),
+			   size_t l, size_t r)
+{	/* Swap the elements at in-order indices @l and @r of an @n-element eytzinger0 array. */
+	swap_func(base + inorder_to_eytzinger0(l, n) * size,
+		  base + inorder_to_eytzinger0(r, n) * size,
+		  size);
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+		     int (*cmp_func)(const void *, const void *, size_t),
+		     void (*swap_func)(void *, void *, size_t))
+{	/* Heapsort over in-order indices, mapped to eytzinger0 positions by do_cmp()/do_swap(). */
+	int i, c, r;	/* int indices: limits @n to what fits in an int */
+
+	if (!swap_func) {	/* pick a swap routine from the element size and alignment */
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	/* heapify */
+	for (i = n / 2 - 1; i >= 0; --i) {
+		for (r = i; r * 2 + 1 < n; r = c) {	/* sift r down while it has a child */
+			c = r * 2 + 1;
+
+			if (c + 1 < n &&
+			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+				c++;	/* pick the larger child */
+
+			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, r, c);
+		}
+	}
+
+	/* sort */
+	for (i = n - 1; i > 0; --i) {
+		do_swap(base, n, size, swap_func, 0, i);	/* move the current maximum to its final slot */
+
+		for (r = 0; r * 2 + 1 < i; r = c) {	/* restore the heap over the first i elements */
+			c = r * 2 + 1;
+
+			if (c + 1 < i &&
+			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+				c++;
+
+			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, r, c);
+		}
+	}
+}
+
+/*
+ * Heapsort @num elements of @size bytes each, starting at @base.
+ *
+ * @cmp_func is handed two element pointers plus @size; a negative result is
+ * treated as "first sorts before second".
+ * @swap_func exchanges two elements; when NULL, one is picked from @size and
+ * the alignment of @base.
+ *
+ * NOTE(review): the loop counters are ints holding byte offsets, so this
+ * presumably relies on num * size fitting in an int - confirm against callers.
+ */
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t size))
+{
+	/* pre-scale counters for performance */
+	int i = (num/2 - 1) * size, n = num * size, c, r;
+
+	if (!swap_func) {
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	/* heapify */
+	for ( ; i >= 0; i -= size) {
+		/* r and c are byte offsets of the parent and the chosen child */
+		for (r = i; r * 2 + size < n; r  = c) {
+			c = r * 2 + size;
+			/* continue with whichever child compares greater */
+			if (c < n - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+
+	/* sort */
+	for (i = n - size; i > 0; i -= size) {
+		/* move the root behind the heap, which now ends at offset i */
+		swap_func(base, base + i, size);
+		for (r = 0; r * 2 + size < i; r = c) {
+			c = r * 2 + size;
+			if (c < i - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+}
+
+/* mempool free callback: @pool_data carries the element size as an integer. */
+static void mempool_free_vp(void *element, void *pool_data)
+{
+	vpfree(element, (size_t) pool_data);
+}
+
+/* mempool alloc callback: @pool_data carries the element size as an integer. */
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+	return vpmalloc((size_t) pool_data, gfp_mask);
+}
+
+/*
+ * Initialize @pool with @min_nr elements of @size bytes: a kmalloc pool for
+ * elements smaller than a page, otherwise a pool backed by
+ * vpmalloc()/vpfree() with @size passed through as the pool data.
+ */
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+	if (size < PAGE_SIZE)
+		return mempool_init_kmalloc_pool(pool, min_nr, size);
+
+	return mempool_init(pool, min_nr, mempool_alloc_vp,
+			    mempool_free_vp, (void *) size);
+}
+
+#if 0
+/*
+ * Self test for the 1-based eytzinger helpers (compiled out by the enclosing
+ * #if 0): for every tree size from 2 to 65535, checks prev/next at both ends
+ * and that the inorder <-> eytzinger conversions agree along the whole walk.
+ */
+void eytzinger1_test(void)
+{
+	unsigned inorder, eytz, size;
+
+	pr_info("1 based eytzinger test:");
+
+	for (size = 2;
+	     size < 65536;
+	     size++) {
+		unsigned extra = eytzinger1_extra(size);
+
+		/* progress message every 4096 sizes */
+		if (!(size % 4096))
+			pr_info("tree size %u", size);
+
+		/* index 0 acts as the "no element" value on both ends */
+		BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
+		BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
+
+		BUG_ON(eytzinger1_prev(eytzinger1_first(size), size)	!= 0);
+		BUG_ON(eytzinger1_next(eytzinger1_last(size), size)	!= 0);
+
+		inorder = 1;
+		eytzinger1_for_each(eytz, size) {
+			BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
+			BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
+			/* prev() must undo next() everywhere except at the last node */
+			BUG_ON(eytz != eytzinger1_last(size) &&
+			       eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
+
+			inorder++;
+		}
+	}
+}
+
+/*
+ * Self test for the 0-based eytzinger helpers (compiled out by the enclosing
+ * #if 0): for every tree size from 1 to 65535, checks prev/next at both ends
+ * and that the inorder <-> eytzinger conversions agree along the whole walk.
+ */
+void eytzinger0_test(void)
+{
+
+	unsigned inorder, eytz, size;
+
+	pr_info("0 based eytzinger test:");
+
+	for (size = 1;
+	     size < 65536;
+	     size++) {
+		unsigned extra = eytzinger0_extra(size);
+
+		/* progress message every 4096 sizes */
+		if (!(size % 4096))
+			pr_info("tree size %u", size);
+
+		/* -1 acts as the "no element" value on both ends */
+		BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
+		BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
+
+		BUG_ON(eytzinger0_prev(eytzinger0_first(size), size)	!= -1);
+		BUG_ON(eytzinger0_next(eytzinger0_last(size), size)	!= -1);
+
+		inorder = 0;
+		eytzinger0_for_each(eytz, size) {
+			BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
+			BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
+			/* prev() must undo next() everywhere except at the last node */
+			BUG_ON(eytz != eytzinger0_last(size) &&
+			       eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
+
+			inorder++;
+		}
+	}
+}
+
+/*
+ * Three-way compare of the u16s at @_l and @_r: returns -1, 0 or 1 when *_l
+ * is below, equal to or above *_r. @size is not used.
+ */
+static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+{
+	const u16 *l = _l, *r = _r;
+
+	/*
+	 * The second term must be the "less than" flag; subtracting the
+	 * difference (*r - *l) instead gives the wrong sign and magnitude.
+	 */
+	return (*l > *r) - (*l < *r);
+}
+
+/*
+ * Look @search up in the eytzinger-ordered @test_array of @nr entries with
+ * eytzinger0_find_le() and cross-check the answer against a linear scan for
+ * the largest entry <= @search. On a mismatch the whole array and both
+ * answers are logged.
+ */
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+	unsigned i;
+	int c1 = -1, c2 = -1;	/* -1: nothing <= @search was found */
+	ssize_t r;
+
+	r = eytzinger0_find_le(test_array, nr,
+			       sizeof(test_array[0]),
+			       cmp_u16, &search);
+	if (r >= 0)
+		c1 = test_array[r];
+
+	/* reference answer: brute force over every entry */
+	for (i = 0; i < nr; i++)
+		if (test_array[i] <= search && test_array[i] > c2)
+			c2 = test_array[i];
+
+	if (c1 != c2) {
+		eytzinger0_for_each(i, nr)
+			pr_info("[%3u] = %12u", i, test_array[i]);
+		/* report the value searched for, not the stale loop index */
+		pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
+			search, r, c1, c2);
+	}
+}
+
+/*
+ * Self test for eytzinger0_sort() and eytzinger0_find_le() (compiled out by
+ * the enclosing #if 0): for every array length below 4096, fill the array
+ * with random u16s, sort it, verify the ordering and then probe a range of
+ * search values through eytzinger0_find_test_val().
+ */
+void eytzinger0_find_test(void)
+{
+	unsigned i, nr, allocated = 1 << 12;
+	u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
+
+	/* nothing can be tested without the scratch array */
+	if (!test_array)
+		return;
+
+	for (nr = 1; nr < allocated; nr++) {
+		pr_info("testing %u elems", nr);
+
+		get_random_bytes(test_array, nr * sizeof(test_array[0]));
+		eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
+
+		/* verify array is sorted correctly: */
+		eytzinger0_for_each(i, nr)
+			BUG_ON(i != eytzinger0_last(nr) &&
+			       test_array[i] > test_array[eytzinger0_next(i, nr)]);
+
+		/* evenly spaced probes across the u16 range */
+		for (i = 0; i < U16_MAX; i += 1 << 12)
+			eytzinger0_find_test_val(test_array, nr, i);
+
+		/* probes just below, at and just above every stored value */
+		for (i = 0; i < nr; i++) {
+			eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
+			eytzinger0_find_test_val(test_array, nr, test_array[i]);
+			eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
+		}
+	}
+
+	kfree(test_array);
+}
+#endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy and zero all the other
+ * copies. Only valid when the caller guards against concurrent access to any
+ * of the percpu counters: nothing here synchronizes with other CPUs.
+ *
+ * Returns a pointer to the copy that now holds the @nr summed u64s.
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+	u64 *acc, *cur;
+	int cpu;
+
+	preempt_disable();
+	acc = this_cpu_ptr(p);
+	preempt_enable();
+
+	for_each_possible_cpu(cpu) {
+		cur = per_cpu_ptr(p, cpu);
+
+		/* the accumulator itself already holds its own contribution */
+		if (cur == acc)
+			continue;
+
+		acc_u64s(acc, cur, nr);
+		memset(cur, 0, nr * sizeof(u64));
+	}
+
+	return acc;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
new file mode 100644
index 000000000000..dc0c9967a5f6
--- /dev/null
+++ b/fs/bcachefs/util.h
@@ -0,0 +1,761 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_UTIL_H
+#define _BCACHEFS_UTIL_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/closure.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/sched/clock.h>
+#include <linux/llist.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+/* log2 of, and number of, 512-byte (1 << 9) sectors in one page */
+#define PAGE_SECTOR_SHIFT	(PAGE_SHIFT - 9)
+#define PAGE_SECTORS		(1UL << PAGE_SECTOR_SHIFT)
+
+struct closure;
+
+/*
+ * Debug-only assertions. With CONFIG_BCACHEFS_DEBUG, EBUG_ON() is BUG_ON() and
+ * the *_bug() helpers BUG when the updated counter fails the check shown;
+ * without it EBUG_ON() expands to nothing (its condition is not evaluated)
+ * and the helpers reduce to the plain atomic operation.
+ */
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+#define EBUG_ON(cond)		BUG_ON(cond)
+#define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= (i))
+#define atomic_sub_bug(i, v)	BUG_ON(atomic_sub_return(i, v) < 0)
+#define atomic_add_bug(i, v)	BUG_ON(atomic_add_return(i, v) < 0)
+#define atomic_long_dec_bug(v)		BUG_ON(atomic_long_dec_return(v) < 0)
+#define atomic_long_sub_bug(i, v)	BUG_ON(atomic_long_sub_return(i, v) < 0)
+#define atomic64_dec_bug(v)	BUG_ON(atomic64_dec_return(v) < 0)
+#define atomic64_inc_bug(v, i)	BUG_ON(atomic64_inc_return(v) <= (i))
+#define atomic64_sub_bug(i, v)	BUG_ON(atomic64_sub_return(i, v) < 0)
+#define atomic64_add_bug(i, v)	BUG_ON(atomic64_add_return(i, v) < 0)
+
+/*
+ * Debug wrapper around memcpy() that BUGs when source and destination
+ * overlap. The memcpy inside the body is not expanded again, so it refers to
+ * the real function.
+ */
+#define memcpy(dst, src, len)						\
+({									\
+	void *_dst = (dst);						\
+	const void *_src = (src);					\
+	size_t _len = (len);						\
+									\
+	BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) ||		\
+		 (void *) (_dst) + (_len) <= (void *) (_src)));		\
+	memcpy(_dst, _src, _len);					\
+})
+
+#else /* DEBUG */
+
+#define EBUG_ON(cond)
+#define atomic_dec_bug(v)	atomic_dec(v)
+#define atomic_inc_bug(v, i)	atomic_inc(v)
+#define atomic_sub_bug(i, v)	atomic_sub(i, v)
+#define atomic_add_bug(i, v)	atomic_add(i, v)
+#define atomic_long_dec_bug(v)		atomic_long_dec(v)
+#define atomic_long_sub_bug(i, v)	atomic_long_sub(i, v)
+#define atomic64_dec_bug(v)	atomic64_dec(v)
+#define atomic64_inc_bug(v, i)	atomic64_inc(v)
+#define atomic64_sub_bug(i, v)	atomic64_sub(i, v)
+#define atomic64_add_bug(i, v)	atomic64_add(i, v)
+
+#endif
+
+/*
+ * CPU_BIG_ENDIAN: 1 on big endian targets, 0 on little endian ones, taken
+ * from the compiler's __BYTE_ORDER__. Left undefined for any other order.
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define CPU_BIG_ENDIAN		0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define CPU_BIG_ENDIAN		1
+#endif
+
+/* type hackery */
+
+/* True at compile time when @_val has exactly the type @_type. */
+#define type_is_exact(_val, _type)					\
+	__builtin_types_compatible_p(typeof(_val), _type)
+
+/* Like type_is_exact(), but also accepts the const-qualified @_type. */
+#define type_is(_val, _type)						\
+	(__builtin_types_compatible_p(typeof(_val), _type) ||		\
+	 __builtin_types_compatible_p(typeof(_val), const _type))
+
+/*
+ * Number of pages touched by the @len bytes starting at @p. The offset of @p
+ * within its page is counted too, because userspace doesn't align
+ * allocations as nicely as the kernel allocators.
+ */
+static inline size_t buf_pages(void *p, size_t len)
+{
+	unsigned long offset = (unsigned long) p & (PAGE_SIZE - 1);
+
+	return DIV_ROUND_UP(len + offset, PAGE_SIZE);
+}
+
+/*
+ * Free memory obtained from vpmalloc(): vfree() for vmalloc addresses,
+ * otherwise free_pages() with the order computed from @size.
+ */
+static inline void vpfree(void *p, size_t size)
+{
+	if (is_vmalloc_addr(p)) {
+		vfree(p);
+		return;
+	}
+
+	free_pages((unsigned long) p, get_order(size));
+}
+
+/*
+ * Allocate @size bytes: try the page allocator first (with allocation
+ * failure warnings suppressed) and fall back to __vmalloc() when that fails.
+ * Release with vpfree().
+ */
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+	void *p = (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+					    get_order(size));
+
+	if (!p)
+		p = __vmalloc(size, gfp_mask);
+
+	return p;
+}
+
+/*
+ * Free memory obtained from kvpmalloc(). @size must be the size that was
+ * allocated, since it selects between kfree() and vpfree().
+ */
+static inline void kvpfree(void *p, size_t size)
+{
+	if (size >= PAGE_SIZE) {
+		vpfree(p, size);
+		return;
+	}
+
+	kfree(p);
+}
+
+/*
+ * Allocate @size bytes: kmalloc() below one page, vpmalloc() from one page
+ * up. Release with kvpfree() and the same @size.
+ */
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+	if (size < PAGE_SIZE)
+		return kmalloc(size, gfp_mask);
+
+	return vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
+
+/*
+ * Generic array-backed binary heap of @type: @size is the capacity, @used
+ * the number of live entries, @data the backing array.
+ */
+#define HEAP(type)							\
+struct {								\
+	size_t size, used;						\
+	type *data;							\
+}
+
+#define DECLARE_HEAP(type, name) HEAP(type) name
+
+/*
+ * Allocate room for @_size entries with kvpmalloc(). Evaluates to the data
+ * pointer, so the result is NULL when the allocation failed.
+ */
+#define init_heap(heap, _size, gfp)					\
+({									\
+	(heap)->used = 0;						\
+	(heap)->size = (_size);						\
+	(heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+				 (gfp));				\
+})
+
+/* Release the backing array allocated by init_heap(). */
+#define free_heap(heap)							\
+do {									\
+	kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));	\
+	(heap)->data = NULL;						\
+} while (0)
+
+/* Call the optional @_fn(h, i) hook after the entry at index @i was written. */
+#define heap_set_backpointer(h, i, _fn)					\
+do {									\
+	void (*fn)(typeof(h), size_t) = _fn;				\
+	if (fn)								\
+		fn(h, i);						\
+} while (0)
+
+/* Swap entries @i and @j, then run the backpointer hook on both slots. */
+#define heap_swap(h, i, j, set_backpointer)				\
+do {									\
+	swap((h)->data[i], (h)->data[j]);				\
+	heap_set_backpointer(h, i, set_backpointer);			\
+	heap_set_backpointer(h, j, set_backpointer);			\
+} while (0)
+
+/* The root entry; the heap must not be empty. */
+#define heap_peek(h)							\
+({									\
+	EBUG_ON(!(h)->used);						\
+	(h)->data[0];							\
+})
+
+#define heap_full(h)	((h)->used == (h)->size)
+
+/*
+ * Move the entry at index @i down until neither child compares below it
+ * under @cmp(h, a, b). Of two children, the one comparing lower is chosen
+ * (the second one on a tie).
+ */
+#define heap_sift_down(h, i, cmp, set_backpointer)			\
+do {									\
+	size_t _c, _j = i;						\
+									\
+	for (; _j * 2 + 1 < (h)->used; _j = _c) {			\
+		_c = _j * 2 + 1;					\
+		if (_c + 1 < (h)->used &&				\
+		    cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0)	\
+			_c++;						\
+									\
+		if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0)		\
+			break;						\
+		heap_swap(h, _c, _j, set_backpointer);			\
+	}								\
+} while (0)
+
+/*
+ * Move the entry at index @i up while it compares below its parent. @i must
+ * be a modifiable lvalue: it is updated to the entry's final index.
+ */
+#define heap_sift_up(h, i, cmp, set_backpointer)			\
+do {									\
+	while (i) {							\
+		size_t p = (i - 1) / 2;					\
+		if (cmp(h, (h)->data[i], (h)->data[p]) >= 0)		\
+			break;						\
+		heap_swap(h, i, p, set_backpointer);			\
+		i = p;							\
+	}								\
+} while (0)
+
+/*
+ * Append @d and sift it up; evaluates to the index it ends up at. Does not
+ * check for space - see heap_add().
+ */
+#define __heap_add(h, d, cmp, set_backpointer)				\
+({									\
+	size_t _i = (h)->used++;					\
+	(h)->data[_i] = d;						\
+	heap_set_backpointer(h, _i, set_backpointer);			\
+									\
+	heap_sift_up(h, _i, cmp, set_backpointer);			\
+	_i;								\
+})
+
+/* Add @d if there is room; evaluates to true when it was added. */
+#define heap_add(h, d, cmp, set_backpointer)				\
+({									\
+	bool _r = !heap_full(h);					\
+	if (_r)								\
+		__heap_add(h, d, cmp, set_backpointer);			\
+	_r;								\
+})
+
+/*
+ * Add @new; if the heap is full, replace the root with @new provided @new
+ * does not compare below the root.
+ */
+#define heap_add_or_replace(h, new, cmp, set_backpointer)		\
+do {									\
+	if (!heap_add(h, new, cmp, set_backpointer) &&			\
+	    cmp(h, new, heap_peek(h)) >= 0) {				\
+		(h)->data[0] = new;					\
+		heap_set_backpointer(h, 0, set_backpointer);		\
+		heap_sift_down(h, 0, cmp, set_backpointer);		\
+	}								\
+} while (0)
+
+/*
+ * Remove the entry at index @i: swap it with the last live entry, shrink the
+ * heap, then restore heap order around the entry that was moved into @i.
+ */
+#define heap_del(h, i, cmp, set_backpointer)				\
+do {									\
+	size_t _i = (i);						\
+									\
+	BUG_ON(_i >= (h)->used);					\
+	(h)->used--;							\
+	heap_swap(h, _i, (h)->used, set_backpointer);			\
+	heap_sift_up(h, _i, cmp, set_backpointer);			\
+	heap_sift_down(h, _i, cmp, set_backpointer);			\
+} while (0)
+
+/*
+ * Copy the root into @d and delete it; evaluates to false (leaving @d
+ * untouched) when the heap is empty.
+ */
+#define heap_pop(h, d, cmp, set_backpointer)				\
+({									\
+	bool _r = (h)->used;						\
+	if (_r) {							\
+		(d) = (h)->data[0];					\
+		heap_del(h, 0, cmp, set_backpointer);			\
+	}								\
+	_r;								\
+})
+
+/* Rebuild heap order from scratch by sifting down every non-leaf entry. */
+#define heap_resort(heap, cmp, set_backpointer)				\
+do {									\
+	ssize_t _i;							\
+	for (_i = (ssize_t) (heap)->used / 2 -  1; _i >= 0; --_i)	\
+		heap_sift_down(heap, _i, cmp, set_backpointer);		\
+} while (0)
+
+/*
+ * Largest value of the signed integer type @t, built as
+ * (2^(bits - 2) - 1) * 2 + 1 so that no intermediate step overflows.
+ */
+#define ANYSINT_MAX(t)							\
+	((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+/* Output cursor into a caller supplied character buffer. */
+struct printbuf {
+	char		*pos;	/* next byte to write */
+	char		*end;	/* one past the last byte of the buffer */
+};
+
+/* Number of bytes between @buf's write position and the end of its buffer. */
+static inline size_t printbuf_remaining(struct printbuf *buf)
+{
+	const char *cur = buf->pos;
+
+	return buf->end - cur;
+}
+
+/* A struct printbuf covering the @_len bytes starting at @_buf. */
+#define _PBUF(_buf, _len)						\
+	((struct printbuf) {						\
+		.pos	= _buf,						\
+		.end	= _buf + _len,					\
+	})
+
+/* As _PBUF(), sized with sizeof - so @_buf must be an array, not a pointer. */
+#define PBUF(_buf) _PBUF(_buf, sizeof(_buf))
+
+/* Format into @_out with scnprintf() and advance its write position. */
+#define pr_buf(_out, ...)						\
+do {									\
+	(_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out),	\
+				 __VA_ARGS__);				\
+} while (0)
+
+void bch_scnmemcpy(struct printbuf *, const char *, size_t);
+
+int bch2_strtoint_h(const char *, int *);
+int bch2_strtouint_h(const char *, unsigned int *);
+int bch2_strtoll_h(const char *, long long *);
+int bch2_strtoull_h(const char *, unsigned long long *);
+int bch2_strtou64_h(const char *, u64 *);
+
+/*
+ * Parse @cp into the long at @res by forwarding to the fixed-width parser
+ * that matches BITS_PER_LONG; returns that parser's result.
+ */
+static inline int bch2_strtol_h(const char *cp, long *res)
+{
+	int ret;
+
+#if BITS_PER_LONG == 32
+	ret = bch2_strtoint_h(cp, (int *) res);
+#else
+	ret = bch2_strtoll_h(cp, (long long *) res);
+#endif
+	return ret;
+}
+
+/*
+ * Unsigned counterpart of bch2_strtol_h(): forwards to the unsigned parser
+ * that matches BITS_PER_LONG and returns its result.
+ *
+ * NOTE(review): @res is declared as long * although the value stored is
+ * unsigned; strtoi_h() passes it through void *, so this presumably never
+ * mattered - confirm before tightening the prototype.
+ */
+static inline int bch2_strtoul_h(const char *cp, long *res)
+{
+	int ret;
+
+#if BITS_PER_LONG == 32
+	ret = bch2_strtouint_h(cp, (unsigned int *) res);
+#else
+	ret = bch2_strtoull_h(cp, (unsigned long long *) res);
+#endif
+	return ret;
+}
+
+/*
+ * Parse @cp into *@res, picking the bch2_strto*_h() helper from the type
+ * that @res points to. Evaluates to the helper's return value, or -EINVAL
+ * when the pointee type is none of the types listed.
+ */
+#define strtoi_h(cp, res)						\
+	( type_is(*res, int)		? bch2_strtoint_h(cp, (void *) res)\
+	: type_is(*res, long)		? bch2_strtol_h(cp, (void *) res)\
+	: type_is(*res, long long)	? bch2_strtoll_h(cp, (void *) res)\
+	: type_is(*res, unsigned)	? bch2_strtouint_h(cp, (void *) res)\
+	: type_is(*res, unsigned long)	? bch2_strtoul_h(cp, (void *) res)\
+	: type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
+	: -EINVAL)
+
+/*
+ * Parse @cp as a base 10 unsigned long with kstrtoul(); @var is assigned only
+ * on success. Evaluates to the kstrtoul() return value.
+ */
+#define strtoul_safe(cp, var)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r)							\
+		var = _v;						\
+	_r;								\
+})
+
+/* As strtoul_safe(), but the parsed value is clamped into [@min, @max]. */
+#define strtoul_safe_clamp(cp, var, min, max)				\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r)							\
+		var = clamp_t(typeof(var), _v, min, max);		\
+	_r;								\
+})
+
+/*
+ * As strtoul_safe(), but values outside [@min, @max] are rejected. Every
+ * failure, including a kstrtoul() error, is reported as -EINVAL.
+ */
+#define strtoul_safe_restrict(cp, var, min, max)			\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r && _v >= min && _v <= max)				\
+		var = _v;						\
+	else								\
+		_r = -EINVAL;						\
+	_r;								\
+})
+
+/*
+ * snprintf() @var followed by a newline into @buf, choosing the conversion
+ * from the type of @var; types not listed fall back to "%i\n".
+ */
+#define snprint(buf, size, var)						\
+	snprintf(buf, size,						\
+		   type_is(var, int)		? "%i\n"		\
+		 : type_is(var, unsigned)	? "%u\n"		\
+		 : type_is(var, long)		? "%li\n"		\
+		 : type_is(var, unsigned long)	? "%lu\n"		\
+		 : type_is(var, s64)		? "%lli\n"		\
+		 : type_is(var, u64)		? "%llu\n"		\
+		 : type_is(var, char *)		? "%s\n"		\
+		 : "%i\n", var)
+
+void bch2_hprint(struct printbuf *, s64);
+
+bool bch2_is_zero(const void *, size_t);
+
+void bch2_string_opt_to_text(struct printbuf *,
+			     const char * const [], size_t);
+
+void bch2_flags_to_text(struct printbuf *, const char * const[], u64);
+u64 bch2_read_flag_list(char *, const char * const[]);
+
+/*
+ * Quantile entries are addressed in eytzinger order: QUANTILE_IDX() maps an
+ * in-order position to the array slot, QUANTILE_FIRST/QUANTILE_LAST are the
+ * slots of the first and last in-order positions.
+ */
+#define NR_QUANTILES	15
+#define QUANTILE_IDX(i)	inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST	eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST	eytzinger0_last(NR_QUANTILES)
+
+/* NR_QUANTILES quantile estimates; see QUANTILE_IDX() for slot addressing. */
+struct quantiles {
+	struct quantile_entry {
+		u64	m;
+		u64	step;
+	}		entries[NR_QUANTILES];
+};
+
+/*
+ * Fixed-size batch of (start, end) timestamp pairs; @nr is presumably the
+ * number of entries currently in use - TODO confirm against util.c.
+ */
+struct time_stat_buffer {
+	unsigned	nr;
+	struct time_stat_buffer_entry {
+		u64	start;
+		u64	end;
+	}		entries[32];
+};
+
+/* Duration/frequency statistics for one kind of event, guarded by @lock. */
+struct time_stats {
+	spinlock_t	lock;
+	u64		count;
+	/* all fields are in nanoseconds */
+	u64		average_duration;
+	u64		average_frequency;
+	u64		max_duration;
+	u64		last_event;
+	struct quantiles quantiles;
+
+	/* per-cpu staging buffer of samples */
+	struct time_stat_buffer __percpu *buffer;
+};
+
+void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
+
+/* Record an event that began at @start and ends now, per local_clock(). */
+static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
+{
+	u64 now = local_clock();
+
+	__bch2_time_stats_update(stats, start, now);
+}
+
+size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
+
+void bch2_time_stats_exit(struct time_stats *);
+void bch2_time_stats_init(struct time_stats *);
+
+/*
+ * Exponentially weighted moving average step: evaluates to
+ * (ewma * (2^weight - 1) + val) >> weight. @ewma itself is not modified.
+ */
+#define ewma_add(ewma, val, weight)					\
+({									\
+	typeof(ewma) _ewma = (ewma);					\
+	typeof(weight) _weight = (weight);				\
+									\
+	(((_ewma << _weight) - _ewma) + (val)) >> _weight;		\
+})
+
+/* State for pacing work: when to run next and how fast work should go. */
+struct bch_ratelimit {
+	/* Next time we want to do some work, in nanoseconds */
+	u64			next;
+
+	/*
+	 * Rate at which we want to do work, in units per nanosecond
+	 * The units here correspond to the units passed to
+	 * bch2_ratelimit_increment()
+	 */
+	unsigned		rate;
+};
+
+/* Restart pacing from the current time: sets @d->next to local_clock(). */
+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
+{
+	d->next = local_clock();
+}
+
+u64 bch2_ratelimit_delay(struct bch_ratelimit *);
+void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
+
+/*
+ * State of a controller that drives a bch_ratelimit from a proportional and
+ * a derivative term (going by the field names - see
+ * bch2_pd_controller_update() for the actual computation).
+ */
+struct bch_pd_controller {
+	struct bch_ratelimit	rate;
+	unsigned long		last_update;
+
+	s64			last_actual;
+	s64			smoothed_derivative;
+
+	unsigned		p_term_inverse;
+	unsigned		d_smooth;
+	unsigned		d_term;
+
+	/* for exporting to sysfs (no effect on behavior) */
+	s64			last_derivative;
+	s64			last_proportional;
+	s64			last_change;
+	s64			last_target;
+
+	/* If true, the rate will not increase if bch2_ratelimit_delay()
+	 * is not being called often enough. */
+	bool			backpressure;
+};
+
+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
+void bch2_pd_controller_init(struct bch_pd_controller *);
+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
+
+/* Declares the five sysfs attributes of the pd controller called @name. */
+#define sysfs_pd_controller_attribute(name)				\
+	rw_attribute(name##_rate);					\
+	rw_attribute(name##_rate_bytes);				\
+	rw_attribute(name##_rate_d_term);				\
+	rw_attribute(name##_rate_p_term_inverse);			\
+	read_attribute(name##_rate_debug)
+
+/* The same five attributes as a comma separated list of pointers. */
+#define sysfs_pd_controller_files(name)					\
+	&sysfs_##name##_rate,						\
+	&sysfs_##name##_rate_bytes,					\
+	&sysfs_##name##_rate_d_term,					\
+	&sysfs_##name##_rate_p_term_inverse,				\
+	&sysfs_##name##_rate_debug
+
+/*
+ * Show handler fragment for the controller @var. Expects `attr` and `buf` to
+ * be in scope at the expansion site and may return from the enclosing
+ * function.
+ */
+#define sysfs_pd_controller_show(name, var)				\
+do {									\
+	sysfs_hprint(name##_rate,		(var)->rate.rate);	\
+	sysfs_print(name##_rate_bytes,		(var)->rate.rate);	\
+	sysfs_print(name##_rate_d_term,		(var)->d_term);		\
+	sysfs_print(name##_rate_p_term_inverse,	(var)->p_term_inverse);	\
+									\
+	if (attr == &sysfs_##name##_rate_debug)				\
+		return bch2_pd_controller_print_debug(var, buf);		\
+} while (0)
+
+/*
+ * Store handler fragment for the controller @var; both rate attributes
+ * write (var)->rate.rate, clamped to [1, UINT_MAX].
+ */
+#define sysfs_pd_controller_store(name, var)				\
+do {									\
+	sysfs_strtoul_clamp(name##_rate,				\
+			    (var)->rate.rate, 1, UINT_MAX);		\
+	sysfs_strtoul_clamp(name##_rate_bytes,				\
+			    (var)->rate.rate, 1, UINT_MAX);		\
+	sysfs_strtoul(name##_rate_d_term,	(var)->d_term);		\
+	sysfs_strtoul_clamp(name##_rate_p_term_inverse,			\
+			    (var)->p_term_inverse, 1, INT_MAX);		\
+} while (0)
+
+/* container_of() that maps a NULL @ptr to NULL; @ptr is evaluated once. */
+#define container_of_or_null(ptr, type, member)				\
+({									\
+	typeof(ptr) _ptr = ptr;						\
+	_ptr ? container_of(_ptr, type, member) : NULL;			\
+})
+
+/*
+ * Does linear interpolation between powers of two.
+ *
+ * @x is a fixed point exponent: the low @fract_bits bits are the fraction,
+ * the bits above them the integer part. Returns 2^integer plus the fraction
+ * of 2^integer selected by the low bits.
+ */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+	/*
+	 * Use unsigned constants for both shifts: left-shifting the negative
+	 * int ~0, or shifting 1 into the sign bit, is undefined behaviour.
+	 */
+	unsigned fract = x & ~(~0U << fract_bits);
+
+	x >>= fract_bits;
+	x   = 1U << x;
+	x  += (x * fract) >> fract_bits;
+
+	return x;
+}
+
+void bch2_bio_map(struct bio *bio, void *base, size_t);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
+
+/* Size of @bdev in 512-byte sectors, taken from its inode's i_size. */
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+	struct inode *inode = bdev->bd_inode;
+
+	return inode->i_size >> 9;
+}
+
+/* Take a reference on @cl with closure_get(), then submit @bio. */
+#define closure_bio_submit(bio, cl)					\
+do {									\
+	closure_get(cl);						\
+	submit_bio(bio);						\
+} while (0)
+
+/*
+ * Sleep interruptibly until @cond becomes true or the kthread is asked to
+ * stop, calling try_to_freeze() after every wakeup. Evaluates to 0 when
+ * @cond was met and -1 when kthread_should_stop() fired first.
+ */
+#define kthread_wait_freezable(cond)					\
+({									\
+	int _ret = 0;							\
+	while (1) {							\
+		set_current_state(TASK_INTERRUPTIBLE);			\
+		if (kthread_should_stop()) {				\
+			_ret = -1;					\
+			break;						\
+		}							\
+									\
+		if (cond)						\
+			break;						\
+									\
+		schedule();						\
+		try_to_freeze();					\
+	}								\
+	set_current_state(TASK_RUNNING);				\
+	_ret;								\
+})
+
+size_t bch2_rand_range(size_t);
+
+void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+
+/* Copy @u64s 64-bit words from @src to @dst with a plain ascending loop. */
+static inline void memcpy_u64s_small(void *dst, const void *src,
+				     unsigned u64s)
+{
+	u64 *d = dst;
+	const u64 *s = src;
+	unsigned i;
+
+	for (i = 0; i < u64s; i++)
+		d[i] = s[i];
+}
+
+/*
+ * Copy @u64s 64-bit words from @src to @dst in ascending address order,
+ * with no overlap check. Uses "rep movsq" on x86-64 and a word loop
+ * elsewhere.
+ */
+static inline void __memcpy_u64s(void *dst, const void *src,
+				 unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+	/* dummy outputs: the instruction modifies rcx, rdi and rsi */
+	long d0, d1, d2;
+	asm volatile("rep ; movsq"
+		     : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+		     : "0" (u64s), "1" (dst), "2" (src)
+		     : "memory");
+#else
+	u64 *d = dst;
+	const u64 *s = src;
+
+	while (u64s--)
+		*d++ = *s++;
+#endif
+}
+
+/*
+ * Copy @u64s 64-bit words from @src to @dst. The two ranges must not
+ * overlap; debug builds assert this.
+ */
+static inline void memcpy_u64s(void *dst, const void *src,
+			       unsigned u64s)
+{
+	/* overlap: dst starts before src ends and ends after src starts */
+	EBUG_ON(dst < src + u64s * sizeof(u64) &&
+		dst + u64s * sizeof(u64) > src);
+
+	__memcpy_u64s(dst, src, u64s);
+}
+
+/*
+ * Move @u64s words to a lower (or equal) address. The ascending copy in
+ * __memcpy_u64s() is safe for that direction even when the ranges overlap.
+ */
+static inline void __memmove_u64s_down(void *dst, const void *src,
+				       unsigned u64s)
+{
+	__memcpy_u64s(dst, src, u64s);
+}
+
+/* As __memmove_u64s_down(), with a debug assertion that @dst <= @src. */
+static inline void memmove_u64s_down(void *dst, const void *src,
+				     unsigned u64s)
+{
+	EBUG_ON(dst > src);
+
+	__memmove_u64s_down(dst, src, u64s);
+}
+
+/*
+ * Move @u64s words to a higher (or equal) address with a plain loop that
+ * copies from the last word down to the first, so overlapping ranges are
+ * handled.
+ */
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+					   unsigned u64s)
+{
+	u64 *dst = _dst;
+	const u64 *src = _src;
+
+	/* u64s is decremented before the body runs: indexes n-1 .. 0 */
+	while (u64s--)
+		dst[u64s] = src[u64s];
+}
+
+/* As __memmove_u64s_up_small(), with a debug assertion that @dst >= @src. */
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+					 unsigned u64s)
+{
+	EBUG_ON(dst < src);
+
+	__memmove_u64s_up_small(dst, src, u64s);
+}
+
+/*
+ * Move @u64s words to a higher (or equal) address, copying from the last
+ * word backwards so overlapping ranges are handled. No direction check.
+ */
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
+				     unsigned u64s)
+{
+	/* both cursors start on the last word of their range */
+	u64 *dst = (u64 *) _dst + u64s - 1;
+	u64 *src = (u64 *) _src + u64s - 1;
+
+#ifdef CONFIG_X86_64
+	long d0, d1, d2;
+	/* std makes movsq walk downwards; cld restores the direction flag */
+	asm volatile("std ;\n"
+		     "rep ; movsq\n"
+		     "cld ;\n"
+		     : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+		     : "0" (u64s), "1" (dst), "2" (src)
+		     : "memory");
+#else
+	while (u64s--)
+		*dst-- = *src--;
+#endif
+}
+
+/* As __memmove_u64s_up(), with a debug assertion that @dst >= @src. */
+static inline void memmove_u64s_up(void *dst, const void *src,
+				   unsigned u64s)
+{
+	EBUG_ON(dst < src);
+
+	__memmove_u64s_up(dst, src, u64s);
+}
+
+/*
+ * Move @u64s words between possibly overlapping ranges, picking the copy
+ * direction from the relative position of @dst and @src.
+ */
+static inline void memmove_u64s(void *dst, const void *src,
+				unsigned u64s)
+{
+	if (dst >= src)
+		__memmove_u64s_up(dst, src, u64s);
+	else
+		__memmove_u64s_down(dst, src, u64s);
+}
+
+/*
+ * Fill with @c from offset @bytes of the buffer @s up to the next u64
+ * boundary. Nothing is written when @bytes is already a multiple of 8.
+ */
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
+{
+	unsigned end = round_up(bytes, sizeof(u64));
+
+	memset(s + bytes, c, end - bytes);
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t));
+
+/* just the memmove, doesn't update @_nr */
+#define __array_insert_item(_array, _nr, _pos)				\
+	memmove(&(_array)[(_pos) + 1],					\
+		&(_array)[(_pos)],					\
+		sizeof((_array)[0]) * ((_nr) - (_pos)))
+
+/*
+ * Insert @_new_item at index @_pos of the @_nr-element @_array and bump
+ * @_nr. No capacity check is done here.
+ */
+#define array_insert_item(_array, _nr, _pos, _new_item)			\
+do {									\
+	__array_insert_item(_array, _nr, _pos);				\
+	(_nr)++;							\
+	(_array)[(_pos)] = (_new_item);					\
+} while (0)
+
+/*
+ * Remove @_nr_to_remove items starting at index @_pos; @_nr is reduced
+ * first, then the tail is moved down over the gap.
+ */
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
+do {									\
+	(_nr) -= (_nr_to_remove);					\
+	memmove(&(_array)[(_pos)],					\
+		&(_array)[(_pos) + (_nr_to_remove)],			\
+		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos)				\
+	array_remove_items(_array, _nr, _pos, 1)
+
+/*
+ * Bubble sort the @_nr elements of @_base in place. @_cmp(a, b) > 0 means a
+ * must come after b. Stops early after a pass that swapped nothing.
+ */
+#define bubble_sort(_base, _nr, _cmp)					\
+do {									\
+	ssize_t _i, _end;						\
+	bool _swapped = true;						\
+									\
+	for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+		_swapped = false;					\
+		for (_i = 0; _i < _end; _i++)				\
+			if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {	\
+				swap((_base)[_i], (_base)[_i + 1]);	\
+				_swapped = true;			\
+			}						\
+	}								\
+} while (0)
+
+/* Sum of the percpu counter @src over every possible cpu. */
+static inline u64 percpu_u64_get(u64 __percpu *src)
+{
+	u64 sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const u64 *v = per_cpu_ptr(src, cpu);
+
+		sum += *v;
+	}
+
+	return sum;
+}
+
+/*
+ * Make the percpu counter @dst sum to @src: every cpu's copy is zeroed,
+ * then the current cpu's copy is set to @src.
+ */
+static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		u64 *slot = per_cpu_ptr(dst, cpu);
+
+		*slot = 0;
+	}
+
+	preempt_disable();
+	*this_cpu_ptr(dst) = src;
+	preempt_enable();
+}
+
+/* Element-wise add the @nr u64s at @src into the @nr u64s at @acc. */
+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
+{
+	const u64 *end = src + nr;
+
+	while (src < end)
+		*acc++ += *src++;
+}
+
+/* Add every possible cpu's copy of the @nr percpu u64s at @src into @acc. */
+static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
+				   unsigned nr)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const u64 *copy = per_cpu_ptr(src, cpu);
+
+		acc_u64s(acc, copy, nr);
+	}
+}
+
+/* memset() @bytes bytes of every possible cpu's copy of @p to @c. */
+static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *copy = per_cpu_ptr(p, cpu);
+
+		memset(copy, c, bytes);
+	}
+}
+
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
+/*
+ * Three-way compare: -1, 0 or 1 when @l is below, equal to or above @r.
+ * Both arguments are evaluated twice, so they must not have side effects.
+ */
+#define cmp_int(l, r)		(((l) > (r)) - ((l) < (r)))
+
+#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
new file mode 100644
index 000000000000..c099cdc0605f
--- /dev/null
+++ b/fs/bcachefs/vstructs.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VSTRUCTS_H
+#define _VSTRUCTS_H
+
+#include "util.h"
+
+/*
+ * Read the u64s count of the variable length struct @_s as a cpu-endian
+ * value, choosing the conversion from the width of the u64s field.
+ *
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
+ * assumes u64 is little endian:
+ */
+#define __vstruct_u64s(_s)						\
+({									\
+	( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s)		\
+	: type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s)		\
+	: type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s)		\
+	: ((__force u8) ((_s)->u64s)));						\
+})
+
+/*
+ * Size in bytes of a @_type holding @_u64s words of payload: the offset of
+ * its _data member plus the payload. The build fails unless _data sits on a
+ * u64 boundary.
+ */
+#define __vstruct_bytes(_type, _u64s)					\
+({									\
+	BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64));		\
+									\
+	(offsetof(_type, _data) + (_u64s) * sizeof(u64));		\
+})
+
+/* Size in bytes of the struct @_s points to, using its own u64s count. */
+#define vstruct_bytes(_s)						\
+	__vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
+
+/*
+ * Size in blocks, rounded up, where one block is
+ * 512 << @_sector_block_bits bytes.
+ */
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s)		\
+	(round_up(__vstruct_bytes(_type, _u64s),			\
+		  512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
+
+#define vstruct_blocks(_s, _sector_block_bits)				\
+	__vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
+
+/* As vstruct_blocks(), for @_s grown by @_u64s additional words. */
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s)		\
+	__vstruct_blocks(typeof(*(_s)), _sector_block_bits,		\
+			 __vstruct_u64s(_s) + (_u64s))
+
+/* Size in 512-byte sectors, rounded up to a whole number of blocks. */
+#define vstruct_sectors(_s, _sector_block_bits)				\
+	(round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
+
+/*
+ * All three evaluate to the address just past @_s's payload
+ * (_data + u64s), typed as: the same struct type (the entry following @_s),
+ * a pointer to @_s's element type (end of its entry list), or void *.
+ */
+#define vstruct_next(_s)						\
+	((typeof(_s))			((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_last(_s)						\
+	((typeof(&(_s)->start[0]))	((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_end(_s)							\
+	((void *)			((_s)->_data + __vstruct_u64s(_s)))
+
+/* Walk the entries packed inside @_s, from start up to vstruct_last(). */
+#define vstruct_for_each(_s, _i)					\
+	for (_i = (_s)->start;						\
+	     _i < vstruct_last(_s);					\
+	     _i = vstruct_next(_i))
+
+/*
+ * As vstruct_for_each(), but the successor is stored in @_t before the body
+ * runs, so the body may modify the current entry.
+ */
+#define vstruct_for_each_safe(_s, _i, _t)				\
+	for (_i = (_s)->start;						\
+	     _i < vstruct_last(_s) && (_t = vstruct_next(_i), true);	\
+	     _i = _t)
+
+/* The entry located @_idx u64s into @_s's payload. */
+#define vstruct_idx(_s, _idx)						\
+	((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
+
+#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
new file mode 100644
index 000000000000..9b8f6f1f9a77
--- /dev/null
+++ b/fs/bcachefs/xattr.c
@@ -0,0 +1,582 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "rebalance.h"
+#include "str_hash.h"
+#include "xattr.h"
+
+#include <linux/dcache.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
+/*
+ * Hash an xattr search key: the type byte is fed to the string hash first,
+ * followed by the bytes of the name.
+ */
+static u64 bch2_xattr_hash(const struct bch_hash_info *info,
+			  const struct xattr_search_key *key)
+{
+	const struct qstr *name = &key->name;
+	struct bch_str_hash_ctx hash;
+
+	bch2_str_hash_init(&hash, info);
+
+	/* The order of the two updates is part of the hash value. */
+	bch2_str_hash_update(&hash, info, &key->type, sizeof(key->type));
+	bch2_str_hash_update(&hash, info, name->name, name->len);
+
+	return bch2_str_hash_end(&hash, info);
+}
+
+/* hash_key callback: @key points at a struct xattr_search_key. */
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
+{
+	const struct xattr_search_key *search = key;
+
+	return bch2_xattr_hash(info, search);
+}
+
+/*
+ * hash_bkey callback: hash an existing xattr key by building a search key
+ * from its type and name.
+ */
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+	const struct bch_xattr *v = bkey_s_c_to_xattr(k).v;
+	struct xattr_search_key search =
+		X_SEARCH(v->x_type, v->x_name, v->x_name_len);
+
+	return bch2_xattr_hash(info, &search);
+}
+
+/*
+ * cmp_key callback: returns true when the xattr key @_l does NOT match the
+ * search key @_r (different type, different name length or different name).
+ */
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+	const struct bch_xattr *l = bkey_s_c_to_xattr(_l).v;
+	const struct xattr_search_key *r = _r;
+
+	if (l->x_type != r->type)
+		return true;
+
+	if (l->x_name_len != r->name.len)
+		return true;
+
+	return memcmp(l->x_name, r->name.name, r->name.len) != 0;
+}
+
+/*
+ * cmp_bkey callback: returns true when the two xattr keys differ in type,
+ * name length or name.
+ */
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+	const struct bch_xattr *l = bkey_s_c_to_xattr(_l).v;
+	const struct bch_xattr *r = bkey_s_c_to_xattr(_r).v;
+
+	if (l->x_type != r->x_type)
+		return true;
+
+	if (l->x_name_len != r->x_name_len)
+		return true;
+
+	return memcmp(l->x_name, r->x_name, r->x_name_len) != 0;
+}
+
+/*
+ * Hash table description for xattrs: KEY_TYPE_xattr keys in the
+ * BTREE_ID_XATTRS btree, hashed with xattr_hash_key()/xattr_hash_bkey() and
+ * compared with xattr_cmp_key()/xattr_cmp_bkey().
+ */
+const struct bch_hash_desc bch2_xattr_hash_desc = {
+	.btree_id	= BTREE_ID_XATTRS,
+	.key_type	= KEY_TYPE_xattr,
+	.hash_key	= xattr_hash_key,
+	.hash_bkey	= xattr_hash_bkey,
+	.cmp_key	= xattr_cmp_key,
+	.cmp_bkey	= xattr_cmp_bkey,
+};
+
+/*
+ * Validate an xattr key.  Returns NULL if the key is acceptable, otherwise a
+ * static string describing the first problem found:
+ *
+ *  - the value is smaller than struct bch_xattr,
+ *  - the value is smaller than its name and value lengths require,
+ *  - the value is larger than those lengths plus 4 bytes would require,
+ *  - x_type has no handler,
+ *  - the name contains a NUL byte.
+ */
+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	const struct bch_xattr *v = bkey_s_c_to_xattr(k).v;
+	unsigned name_len, val_len;
+
+	/* Nothing in *v may be read until the fixed-size part is known to fit. */
+	if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
+		return "value too small";
+
+	name_len = v->x_name_len;
+	val_len	 = le16_to_cpu(v->x_val_len);
+
+	if (bkey_val_u64s(k.k) < xattr_val_u64s(name_len, val_len))
+		return "value too small";
+
+	if (bkey_val_u64s(k.k) > xattr_val_u64s(name_len, val_len + 4))
+		return "value too big";
+
+	if (!bch2_xattr_type_to_handler(v->x_type))
+		return "invalid type";
+
+	if (memchr(v->x_name, '\0', name_len))
+		return "xattr name has invalid characters";
+
+	return NULL;
+}
+
+/*
+ * Print an xattr key to @out as "<namespace><name>:<value>".  The namespace
+ * is the handler's prefix when there is one; otherwise the numeric x_type is
+ * printed, marked as unknown if no handler exists for it.
+ */
+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
+			struct bkey_s_c k)
+{
+	const struct bch_xattr *v = bkey_s_c_to_xattr(k).v;
+	const struct xattr_handler *handler =
+		bch2_xattr_type_to_handler(v->x_type);
+
+	if (!handler)
+		pr_buf(out, "(unknown type %u)", v->x_type);
+	else if (handler->prefix)
+		pr_buf(out, "%s", handler->prefix);
+	else
+		pr_buf(out, "(type %u)", v->x_type);
+
+	bch_scnmemcpy(out, v->x_name, v->x_name_len);
+	pr_buf(out, ":");
+	bch_scnmemcpy(out, xattr_val(v), le16_to_cpu(v->x_val_len));
+}
+
+/*
+ * Look up the xattr (@type, @name) of @inode.
+ *
+ * Returns the length of the value.  With a non-NULL @buffer the value is also
+ * copied there, or -ERANGE is returned if it is longer than @size.  A lookup
+ * failing with -ENOENT is reported as -ENODATA; other lookup errors are
+ * returned unchanged.
+ */
+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
+		   const char *name, void *buffer, size_t size, int type)
+{
+	struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
+	struct bkey_s_c_xattr found;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+				&inode->ei_str_hash, inode->v.i_ino,
+				&search, 0);
+	if (IS_ERR(iter)) {
+		ret = PTR_ERR(iter);
+
+		bch2_trans_exit(&trans);
+		BUG_ON(ret == -EINTR);
+
+		return ret == -ENOENT ? -ENODATA : ret;
+	}
+
+	found = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+	ret = le16_to_cpu(found.v->x_val_len);
+
+	/* A NULL buffer is a pure size query. */
+	if (buffer && ret > size)
+		ret = -ERANGE;
+	else if (buffer)
+		memcpy(buffer, xattr_val(found.v), ret);
+
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
+/*
+ * Create, replace or delete the xattr (@type, @name) of inode @inum as part
+ * of @trans.
+ *
+ * A non-NULL @value inserts a key through bch2_hash_set(); XATTR_CREATE and
+ * XATTR_REPLACE in @flags are translated to BCH_HASH_SET_MUST_CREATE and
+ * BCH_HASH_SET_MUST_REPLACE.  A NULL @value deletes the xattr through
+ * bch2_hash_delete().
+ *
+ * Returns 0 or a negative error code.  -ERANGE means the name or value is too
+ * long to be stored in a key.  -ENOENT from the hash code is reported as
+ * -ENODATA when XATTR_REPLACE was requested and as success otherwise.
+ */
+int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+		   const struct bch_hash_info *hash_info,
+		   const char *name, const void *value, size_t size,
+		   int type, int flags)
+{
+	int ret;
+
+	if (value) {
+		struct bkey_i_xattr *xattr;
+		size_t namelen = strlen(name);
+		unsigned u64s;
+
+		/*
+		 * Bound both lengths while they are still size_t.
+		 * xattr_val_u64s() takes unsigned arguments and x_val_len is
+		 * only 16 bits wide, so an oversized length could otherwise
+		 * be truncated into something that passes the u64s check
+		 * below while memcpy() still copies the full amount.  A key
+		 * is at most U8_MAX u64s, so a larger value can never fit.
+		 *
+		 * NOTE(review): the name is limited to U8_MAX on the
+		 * assumption that x_name_len is a single byte - confirm
+		 * against the definition of struct bch_xattr.
+		 */
+		if (namelen > U8_MAX ||
+		    size > U8_MAX * sizeof(u64))
+			return -ERANGE;
+
+		u64s = BKEY_U64s + xattr_val_u64s(namelen, size);
+		if (u64s > U8_MAX)
+			return -ERANGE;
+
+		xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+		if (IS_ERR(xattr))
+			return PTR_ERR(xattr);
+
+		bkey_xattr_init(&xattr->k_i);
+		xattr->k.u64s		= u64s;
+		xattr->v.x_type		= type;
+		xattr->v.x_name_len	= namelen;
+		xattr->v.x_val_len	= cpu_to_le16(size);
+		memcpy(xattr->v.x_name, name, namelen);
+		memcpy(xattr_val(&xattr->v), value, size);
+
+		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+			      inum, &xattr->k_i,
+			      (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+			      (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+	} else {
+		struct xattr_search_key search =
+			X_SEARCH(type, name, strlen(name));
+
+		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
+				       hash_info, inum, &search);
+	}
+
+	/* A missing xattr is only an error if the caller asked to replace it. */
+	if (ret == -ENOENT)
+		ret = flags & XATTR_REPLACE ? -ENODATA : 0;
+
+	return ret;
+}
+
+/*
+ * Output state for listing xattr names.  When buf is NULL nothing is copied
+ * and __bch2_xattr_emit() only accumulates the required size in used.
+ */
+struct xattr_buf {
+	char		*buf;	/* destination, or NULL for a size-only pass */
+	size_t		len;	/* capacity of buf in bytes */
+	size_t		used;	/* bytes emitted (or needed) so far */
+};
+
+/*
+ * Append "<prefix><name>" plus a terminating NUL to @buf.
+ *
+ * If @buf has no destination buffer only the size is accounted for.  Returns
+ * 0 on success, or -ERANGE (leaving @buf untouched) when the entry does not
+ * fit into the remaining space.
+ */
+static int __bch2_xattr_emit(const char *prefix,
+			     const char *name, size_t name_len,
+			     struct xattr_buf *buf)
+{
+	size_t prefix_len = strlen(prefix);
+	size_t entry_len = prefix_len + name_len + 1;	/* + NUL */
+	char *dst;
+
+	if (!buf->buf) {
+		buf->used += entry_len;
+		return 0;
+	}
+
+	if (buf->used + entry_len > buf->len)
+		return -ERANGE;
+
+	dst = buf->buf + buf->used;
+	memcpy(dst, prefix, prefix_len);
+	memcpy(dst + prefix_len, name, name_len);
+	dst[prefix_len + name_len] = '\0';
+
+	buf->used += entry_len;
+	return 0;
+}
+
+/*
+ * Emit the full name of @xattr into @buf.  The entry is silently skipped
+ * (returning 0) when x_type has no handler, or when the handler has a ->list
+ * callback and it returns false for @dentry.
+ */
+static int bch2_xattr_emit(struct dentry *dentry,
+			    const struct bch_xattr *xattr,
+			    struct xattr_buf *buf)
+{
+	const struct xattr_handler *handler =
+		bch2_xattr_type_to_handler(xattr->x_type);
+
+	if (!handler)
+		return 0;
+
+	if (handler->list && !handler->list(dentry))
+		return 0;
+
+	/* Handlers without a prefix are listed under their ->name instead. */
+	return __bch2_xattr_emit(handler->prefix ?: handler->name,
+				 xattr->x_name, xattr->x_name_len, buf);
+}
+
+/*
+ * Emit the inode options of @inode into @buf, as "bcachefs.<option>" names
+ * or, when @all is true, as "bcachefs_effective.<option>" names.
+ *
+ * Options whose stored value is 0 are skipped.  Unless @all is true, options
+ * whose bit is clear in bi_fields_set are skipped as well.
+ *
+ * Returns 0, or the first error returned by __bch2_xattr_emit().
+ */
+static int bch2_xattr_list_bcachefs(struct bch_fs *c,
+				    struct bch_inode_info *inode,
+				    struct xattr_buf *buf,
+				    bool all)
+{
+	const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
+	unsigned id;
+	int ret = 0;
+	u64 v;
+
+	for (id = 0; id < Inode_opt_nr; id++) {
+		v = bch2_inode_opt_get(&inode->ei_inode, id);
+		if (!v)
+			continue;
+
+		/*
+		 * Use the same unsigned mask that inode_opt_set_fn() uses to
+		 * maintain bi_fields_set; a signed 1 << id would shift into
+		 * the sign bit for id 31.
+		 */
+		if (!all &&
+		    !(inode->ei_inode.bi_fields_set & (1U << id)))
+			continue;
+
+		ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
+					strlen(bch2_inode_opts[id]), buf);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * List the xattr names of @dentry's inode into @buffer as a sequence of NUL
+ * terminated strings: first the xattrs found in the xattrs btree, then the
+ * "bcachefs." and "bcachefs_effective." option names.
+ *
+ * Returns the number of bytes used (or, with a NULL @buffer, needed), or a
+ * negative error code.
+ */
+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct bch_fs *c = dentry->d_sb->s_fs_info;
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
+	u64 inum = dentry->d_inode->i_ino;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
+			   POS(inum, 0), 0, k, ret) {
+		BUG_ON(k.k->p.inode < inum);
+
+		/* Stop at the first key belonging to a later inode. */
+		if (k.k->p.inode > inum)
+			break;
+
+		if (k.k->type == KEY_TYPE_xattr) {
+			ret = bch2_xattr_emit(dentry,
+					      bkey_s_c_to_xattr(k).v, &buf);
+			if (ret)
+				break;
+		}
+	}
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	if (!ret)
+		ret = bch2_xattr_list_bcachefs(c, inode, &buf, false);
+	if (!ret)
+		ret = bch2_xattr_list_bcachefs(c, inode, &buf, true);
+	if (ret)
+		return ret;
+
+	return buf.used;
+}
+
+/*
+ * ->get callback shared by the user, trusted and security handlers; the
+ * xattr type to look up is taken from handler->flags.
+ */
+static int bch2_xattr_get_handler(const struct xattr_handler *handler,
+				  struct dentry *dentry, struct inode *vinode,
+				  const char *name, void *buffer, size_t size)
+{
+	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct bch_fs *fs = ei->v.i_sb->s_fs_info;
+	int type = handler->flags;
+
+	return bch2_xattr_get(fs, ei, name, buffer, size, type);
+}
+
+/*
+ * ->set callback shared by the user, trusted and security handlers: runs
+ * bch2_xattr_set() via bch2_trans_do(), with the xattr type taken from
+ * handler->flags.
+ */
+static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+				  struct dentry *dentry, struct inode *vinode,
+				  const char *name, const void *value,
+				  size_t size, int flags)
+{
+	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct bch_fs *c = ei->v.i_sb->s_fs_info;
+	int ret;
+
+	/* &trans below names the transaction supplied by bch2_trans_do(). */
+	ret = bch2_trans_do(c, &ei->ei_journal_seq, BTREE_INSERT_ATOMIC,
+			bch2_xattr_set(&trans, ei->v.i_ino,
+				       &ei->ei_str_hash,
+				       name, value, size,
+				       handler->flags, flags));
+	return ret;
+}
+
+/*
+ * Handler for XATTR_USER_PREFIX names.  .flags holds the xattr type index
+ * that the get/set callbacks pass on to bch2_xattr_get()/bch2_xattr_set().
+ */
+static const struct xattr_handler bch_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= KEY_TYPE_XATTR_INDEX_USER,
+};
+
+/* ->list callback for trusted xattrs: true only with CAP_SYS_ADMIN. */
+static bool bch2_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+
+/*
+ * Handler for XATTR_TRUSTED_PREFIX names.  Unlike the other handlers it has
+ * a ->list callback, which bch2_xattr_emit() consults before listing a name.
+ */
+static const struct xattr_handler bch_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= bch2_xattr_trusted_list,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= KEY_TYPE_XATTR_INDEX_TRUSTED,
+};
+
+/* Handler for XATTR_SECURITY_PREFIX names; see bch_xattr_user_handler. */
+static const struct xattr_handler bch_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= KEY_TYPE_XATTR_INDEX_SECURITY,
+};
+
+#ifndef NO_BCACHEFS_FS
+
+/*
+ * Translate an option id (Opt_*) into the corresponding inode option id
+ * (Inode_opt_*).  Returns -1 for ids not listed in BCH_INODE_OPTS().
+ */
+static int opt_to_inode_opt(int id)
+{
+	int inode_opt = -1;
+
+	switch (id) {
+#define x(name, ...)				\
+	case Opt_##name:			\
+		inode_opt = Inode_opt_##name;	\
+		break;
+	BCH_INODE_OPTS()
+#undef  x
+	default:
+		break;
+	}
+
+	return inode_opt;
+}
+
+/*
+ * Common ->get implementation for the "bcachefs." and "bcachefs_effective."
+ * namespaces: format the inode option called @name as text.
+ *
+ * @all selects the effective view.  When it is false, options whose bit is
+ * clear in bi_fields_set are reported as -ENODATA.
+ *
+ * Returns the length of the formatted value (copied to @buffer when that is
+ * non-NULL), -EINVAL if @name is not an inode option, -ENODATA if the option
+ * is not defined for this inode, or -ERANGE if @buffer is too small.
+ */
+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+				struct dentry *dentry, struct inode *vinode,
+				const char *name, void *buffer, size_t size,
+				bool all)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_opts opts =
+		bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+	const struct bch_option *opt;
+	int id, inode_opt_id;
+	char buf[512];
+	struct printbuf out = PBUF(buf);
+	unsigned val_len;
+	u64 v;
+
+	id = bch2_opt_lookup(name);
+	if (id < 0 || !bch2_opt_is_inode_opt(id))
+		return -EINVAL;
+
+	inode_opt_id = opt_to_inode_opt(id);
+	if (inode_opt_id < 0)
+		return -EINVAL;
+
+	opt = bch2_opt_table + id;
+
+	if (!bch2_opt_defined_by_id(&opts, id))
+		return -ENODATA;
+
+	/*
+	 * Use the same unsigned mask that inode_opt_set_fn() uses to maintain
+	 * bi_fields_set, rather than a signed 1 << n.
+	 */
+	if (!all &&
+	    !(inode->ei_inode.bi_fields_set & (1U << inode_opt_id)))
+		return -ENODATA;
+
+	v = bch2_opt_get_by_id(&opts, id);
+	bch2_opt_to_text(&out, c, opt, v, 0);
+
+	val_len = out.pos - buf;
+
+	if (buffer && val_len > size)
+		return -ERANGE;
+
+	/* A NULL buffer is a pure size query. */
+	if (buffer)
+		memcpy(buffer, buf, val_len);
+	return val_len;
+}
+
+/* ->get for "bcachefs.": only options whose bi_fields_set bit is set. */
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+				   struct dentry *dentry, struct inode *vinode,
+				   const char *name, void *buffer, size_t size)
+{
+	const bool all = false;
+
+	return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+					 name, buffer, size, all);
+}
+
+/* Argument passed from bch2_xattr_bcachefs_set() to inode_opt_set_fn(). */
+struct inode_opt_set {
+	int			id;		/* inode option id (Inode_opt_*) */
+	u64			v;		/* value to store in the inode */
+	bool			defined;	/* set or clear the bi_fields_set bit */
+};
+
+/*
+ * bch2_write_inode() callback: record in bi_fields_set whether the option
+ * described by @p (a struct inode_opt_set) is defined, and store its value.
+ * Always returns 0.
+ */
+static int inode_opt_set_fn(struct bch_inode_info *inode,
+			    struct bch_inode_unpacked *bi,
+			    void *p)
+{
+	const struct inode_opt_set *set = p;
+	const unsigned mask = 1U << set->id;
+
+	if (!set->defined)
+		bi->bi_fields_set &= ~mask;
+	else
+		bi->bi_fields_set |= mask;
+
+	bch2_inode_opt_set(bi, set->id, set->v);
+
+	return 0;
+}
+
+/*
+ * ->set for the "bcachefs." and "bcachefs_effective." namespaces: set or
+ * clear the inode option called @name.
+ *
+ * With a @value, the text is parsed with bch2_opt_parse(), checked with
+ * bch2_opt_check_may_set() and stored as parsed value + 1 with the option
+ * marked as defined.  Without a @value the option is marked as not defined
+ * and its stored value is copied from the parent directory's inode (0 for
+ * the root dentry).
+ *
+ * Returns 0 or a negative error code (-EINVAL if @name is not an inode
+ * option, -ENOMEM, or whatever parsing/checking/writing returned).
+ */
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+				   struct dentry *dentry, struct inode *vinode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	const struct bch_option *opt;
+	struct inode_opt_set s;
+	int opt_id, inode_opt_id, ret;
+
+	opt_id = bch2_opt_lookup(name);
+	if (opt_id < 0)
+		return -EINVAL;
+
+	inode_opt_id = opt_to_inode_opt(opt_id);
+	if (inode_opt_id < 0)
+		return -EINVAL;
+
+	opt = &bch2_opt_table[opt_id];
+
+	s.id = inode_opt_id;
+	s.defined = value != NULL;
+
+	if (!value) {
+		/* Clearing: fall back to what the parent directory stores. */
+		if (IS_ROOT(dentry)) {
+			s.v = 0;
+		} else {
+			struct bch_inode_info *dir =
+				to_bch_ei(d_inode(dentry->d_parent));
+
+			s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
+		}
+	} else {
+		u64 parsed = 0;
+		char *str;
+
+		/* @value is not NUL terminated; parse a terminated copy. */
+		str = kmalloc(size + 1, GFP_KERNEL);
+		if (!str)
+			return -ENOMEM;
+		memcpy(str, value, size);
+		str[size] = '\0';
+
+		ret = bch2_opt_parse(c, opt, str, &parsed);
+		kfree(str);
+
+		if (ret < 0)
+			return ret;
+
+		ret = bch2_opt_check_may_set(c, opt_id, parsed);
+		if (ret < 0)
+			return ret;
+
+		/* Stored with a +1 bias; the listing code skips a stored 0. */
+		s.v = parsed + 1;
+	}
+
+	mutex_lock(&inode->ei_update_lock);
+	if (inode_opt_id == Inode_opt_project) {
+		/*
+		 * NOTE(review): s.v still carries the +1 bias here - confirm
+		 * that bch2_set_projid() expects the biased value.
+		 */
+		ret = bch2_set_projid(c, inode, s.v);
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
+err:
+	mutex_unlock(&inode->ei_update_lock);
+
+	/* Done regardless of ret, whenever one of these two options is set. */
+	if (value &&
+	    (opt_id == Opt_background_compression ||
+	     opt_id == Opt_background_target))
+		bch2_rebalance_add_work(c, inode->v.i_blocks);
+
+	return ret;
+}
+
+/*
+ * Handler for "bcachefs.<option>": reads report only options whose
+ * bi_fields_set bit is set (see bch2_xattr_bcachefs_get()).
+ */
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+	.prefix	= "bcachefs.",
+	.get	= bch2_xattr_bcachefs_get,
+	.set	= bch2_xattr_bcachefs_set,
+};
+
+/*
+ * ->get for "bcachefs_effective.": like bch2_xattr_bcachefs_get() but
+ * without the bi_fields_set filter.
+ */
+static int bch2_xattr_bcachefs_get_effective(
+				const struct xattr_handler *handler,
+				struct dentry *dentry, struct inode *vinode,
+				const char *name, void *buffer, size_t size)
+{
+	const bool all = true;
+
+	return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+					 name, buffer, size, all);
+}
+
+/*
+ * Handler for "bcachefs_effective.<option>".  Reads skip the bi_fields_set
+ * filter; writes go through the same ->set as "bcachefs.".
+ */
+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
+	.prefix	= "bcachefs_effective.",
+	.get	= bch2_xattr_bcachefs_get_effective,
+	.set	= bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
+/*
+ * NULL terminated list of all xattr handlers (declared in xattr.h).  The two
+ * bcachefs option handlers are left out when NO_BCACHEFS_FS is defined.
+ */
+const struct xattr_handler *bch2_xattr_handlers[] = {
+	&bch_xattr_user_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+	&bch_xattr_trusted_handler,
+	&bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+	&bch_xattr_bcachefs_handler,
+	&bch_xattr_bcachefs_effective_handler,
+#endif
+	NULL
+};
+
+/*
+ * Maps an xattr type index (KEY_TYPE_XATTR_INDEX_*, as stored in x_type) to
+ * its handler.  Indices without an initializer are NULL.
+ */
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+	[KEY_TYPE_XATTR_INDEX_USER]			= &bch_xattr_user_handler,
+	[KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS]	=
+		&posix_acl_access_xattr_handler,
+	[KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT]	=
+		&posix_acl_default_xattr_handler,
+	[KEY_TYPE_XATTR_INDEX_TRUSTED]		= &bch_xattr_trusted_handler,
+	[KEY_TYPE_XATTR_INDEX_SECURITY]		= &bch_xattr_security_handler,
+};
+
+/*
+ * Look up the handler for an xattr type index.  Returns NULL for indices
+ * beyond the end of bch_xattr_handler_map or without an entry in it.
+ */
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
+{
+	if (type >= ARRAY_SIZE(bch_xattr_handler_map))
+		return NULL;
+
+	return bch_xattr_handler_map[type];
+}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
new file mode 100644
index 000000000000..4151065ab853
--- /dev/null
+++ b/fs/bcachefs/xattr.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_H
+#define _BCACHEFS_XATTR_H
+
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
+
+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+/* struct bkey_ops initializer with the xattr validation and text callbacks. */
+#define bch2_bkey_ops_xattr (struct bkey_ops) {		\
+	.key_invalid	= bch2_xattr_invalid,		\
+	.val_to_text	= bch2_xattr_to_text,		\
+}
+
+/*
+ * Number of u64s needed for a struct bch_xattr holding a name of @name_len
+ * bytes followed by a value of @val_len bytes, rounded up.
+ */
+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+	/* Header up to x_name, then name and value back to back. */
+	size_t bytes = offsetof(struct bch_xattr, x_name);
+
+	bytes += name_len;
+	bytes += val_len;
+
+	return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+/* Start of the value: it directly follows the x_name_len bytes at x_name. */
+#define xattr_val(_xattr)					\
+	((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+
+/* Lookup key for the xattr hash table: type index plus name. */
+struct xattr_search_key {
+	u8		type;	/* compared against x_type */
+	struct qstr	name;	/* compared against x_name / x_name_len */
+};
+
+/* Build a struct xattr_search_key compound literal from type, name, length. */
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key)	\
+	{ .type = _type, .name = QSTR_INIT(_name, _len) })
+
+struct dentry;
+struct xattr_handler;
+struct bch_hash_info;
+struct bch_inode_info;
+
+int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
+		  const char *, void *, size_t, int);
+
+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+		   const char *, const void *, size_t, int, int);
+
+ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch2_xattr_handlers[];
+
+#endif /* _BCACHEFS_XATTR_H */
author	Kent Overstreet <kent.overstreet@gmail.com>	2017-03-16 22:18:50 -0800
committer	Kent Overstreet <kent.overstreet@gmail.com>	2021-04-27 12:17:53 -0400
commit	afb402e0dfe759cfb4bf1c594d4fbbcbe6a30c14 (patch)
tree	f4bae59d80d056eb6dfbb536678b211c9e383b6f /fs/bcachefs
parent	b7faa92b19192fbb7b9a4211bbebeacdd3134efe (diff)