author    Kent Overstreet <kent.overstreet@gmail.com>  2017-12-21 18:00:30 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>  2017-12-21 18:06:45 -0500
commit    1cf4d51dc4661f336f5318c176a3561ddf5bf04f (patch)
tree      8b390ccd48361ba1408be6799d46e62c6382cc39 /libbcachefs
parent    8acc54456e11ee0ec80ed0c6abb6d68abae60592 (diff)
Update bcachefs sources to 14ce2a2031 bcachefs: fixes for building in userspace
Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/acl.c                         3
-rw-r--r--  libbcachefs/alloc.c                      18
-rw-r--r--  libbcachefs/bcachefs.h                   15
-rw-r--r--  libbcachefs/bcachefs_format.h            37
-rw-r--r--  libbcachefs/bkey.c                       37
-rw-r--r--  libbcachefs/bkey.h                        4
-rw-r--r--  libbcachefs/bkey_methods.c              127
-rw-r--r--  libbcachefs/bkey_methods.h               16
-rw-r--r--  libbcachefs/btree_gc.c                   21
-rw-r--r--  libbcachefs/btree_io.c                  141
-rw-r--r--  libbcachefs/btree_iter.c                 23
-rw-r--r--  libbcachefs/btree_locking.h               3
-rw-r--r--  libbcachefs/btree_types.h                 2
-rw-r--r--  libbcachefs/btree_update.h                4
-rw-r--r--  libbcachefs/btree_update_interior.c     192
-rw-r--r--  libbcachefs/buckets.c                     6
-rw-r--r--  libbcachefs/buckets_types.h               6
-rw-r--r--  libbcachefs/chardev.c                     5
-rw-r--r--  libbcachefs/checksum.h                   13
-rw-r--r--  libbcachefs/error.c                      10
-rw-r--r--  libbcachefs/error.h                       9
-rw-r--r--  libbcachefs/extents.c                    73
-rw-r--r--  libbcachefs/extents.h                     9
-rw-r--r--  libbcachefs/fs-io.c                     623
-rw-r--r--  libbcachefs/fs-ioctl.c                    8
-rw-r--r--  libbcachefs/fs.c                         44
-rw-r--r--  libbcachefs/fs.h                         17
-rw-r--r--  libbcachefs/fsck.c                       16
-rw-r--r--  libbcachefs/inode.c                      29
-rw-r--r--  libbcachefs/inode.h                      44
-rw-r--r--  libbcachefs/io.c                        128
-rw-r--r--  libbcachefs/io.h                         50
-rw-r--r--  libbcachefs/io_types.h                    8
-rw-r--r--  libbcachefs/journal.c                   119
-rw-r--r--  libbcachefs/journal.h                     4
-rw-r--r--  libbcachefs/journal_types.h               1
-rw-r--r--  libbcachefs/migrate.c                   197
-rw-r--r--  libbcachefs/migrate.h                     5
-rw-r--r--  libbcachefs/move.c                       36
-rw-r--r--  libbcachefs/opts.c                       71
-rw-r--r--  libbcachefs/opts.h                       28
-rw-r--r--  libbcachefs/super-io.c                  716
-rw-r--r--  libbcachefs/super-io.h                   28
-rw-r--r--  libbcachefs/super.c                     193
-rw-r--r--  libbcachefs/super.h                      30
-rw-r--r--  libbcachefs/sysfs.c                       2
-rw-r--r--  libbcachefs/tier.c                        3
-rw-r--r--  libbcachefs/util.h                        6
-rw-r--r--  libbcachefs/vstructs.h                    8
-rw-r--r--  libbcachefs/xattr.c                     131
50 files changed, 1952 insertions, 1367 deletions
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 2632d21c..480941d6 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -193,8 +193,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
if (ret < 0)
return ret;
else {
- inode->v.i_ctime =
- current_fs_time(inode->v.i_sb);
+ inode->v.i_ctime = current_time(&inode->v);
mark_inode_dirty(&inode->v);
if (ret == 0)
acl = NULL;
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index d29d871a..29799df6 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -257,7 +257,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
return;
a = bkey_s_c_to_alloc(k);
- ca = c->devs[a.k->p.inode];
+ ca = bch_dev_bkey_exists(c, a.k->p.inode);
if (a.k->p.offset >= ca->mi.nbuckets)
return;
@@ -305,10 +305,12 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
+ mutex_lock(&c->bucket_lock);
for_each_member_device(ca, c, i) {
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_min_prio(c, ca, WRITE);
}
+ mutex_unlock(&c->bucket_lock);
return 0;
}
@@ -368,7 +370,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
return 0;
- ca = c->devs[pos.inode];
+ ca = bch_dev_bkey_exists(c, pos.inode);
if (pos.offset >= ca->mi.nbuckets)
return 0;
@@ -461,7 +463,7 @@ static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
/* Bucket heap / gen */
-void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket *g;
@@ -975,7 +977,7 @@ static int bch2_allocator_thread(void *arg)
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
@@ -1303,7 +1305,7 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
for (i = wp->nr_ptrs - 1; i >= 0; --i) {
struct open_bucket *ob = wp->ptrs[i];
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
BUG_ON(ca->open_buckets_partial_nr >=
@@ -1331,7 +1333,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
unsigned i;
writepoint_for_each_ptr(wp, ob, i) {
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ptr_stale(ca, &ob->ptr));
}
@@ -1537,7 +1539,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
struct bch_extent_ptr tmp = ob->ptr;
EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
@@ -1589,7 +1591,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
ra_pages += bdi->ra_pages;
}
- c->bdi.ra_pages = ra_pages;
+ bch2_set_ra_pages(c, ra_pages);
/* Find fastest, slowest tiers with devices: */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b679dd16..e25baf56 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -326,9 +326,9 @@ struct io_count {
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
+ struct completion ref_completion;
struct percpu_ref io_ref;
- struct completion stop_complete;
- struct completion offline_complete;
+ struct completion io_ref_completion;
struct bch_fs *fs;
@@ -515,12 +515,11 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
- struct backing_dev_info bdi;
-
/* BTREE CACHE */
struct bio_set btree_read_bio;
struct btree_root btree_roots[BTREE_ID_NR];
+ bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
@@ -710,6 +709,14 @@ struct bch_fs {
#undef BCH_TIME_STAT
};
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb)
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
static inline bool bch2_fs_running(struct bch_fs *c)
{
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 2dc9a7e0..6e0e0452 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -593,18 +593,24 @@ struct bch_inode_generation {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
-#define BCH_INODE_FIELDS() \
- BCH_INODE_FIELD(bi_atime, 64) \
- BCH_INODE_FIELD(bi_ctime, 64) \
- BCH_INODE_FIELD(bi_mtime, 64) \
- BCH_INODE_FIELD(bi_otime, 64) \
- BCH_INODE_FIELD(bi_size, 64) \
- BCH_INODE_FIELD(bi_sectors, 64) \
- BCH_INODE_FIELD(bi_uid, 32) \
- BCH_INODE_FIELD(bi_gid, 32) \
- BCH_INODE_FIELD(bi_nlink, 32) \
- BCH_INODE_FIELD(bi_generation, 32) \
- BCH_INODE_FIELD(bi_dev, 32)
+#define BCH_INODE_FIELDS() \
+ BCH_INODE_FIELD(bi_atime, 64) \
+ BCH_INODE_FIELD(bi_ctime, 64) \
+ BCH_INODE_FIELD(bi_mtime, 64) \
+ BCH_INODE_FIELD(bi_otime, 64) \
+ BCH_INODE_FIELD(bi_size, 64) \
+ BCH_INODE_FIELD(bi_sectors, 64) \
+ BCH_INODE_FIELD(bi_uid, 32) \
+ BCH_INODE_FIELD(bi_gid, 32) \
+ BCH_INODE_FIELD(bi_nlink, 32) \
+ BCH_INODE_FIELD(bi_generation, 32) \
+ BCH_INODE_FIELD(bi_dev, 32) \
+ BCH_INODE_FIELD(bi_data_checksum, 8) \
+ BCH_INODE_FIELD(bi_compression, 8)
+
+#define BCH_INODE_FIELDS_INHERIT() \
+ BCH_INODE_FIELD(bi_data_checksum) \
+ BCH_INODE_FIELD(bi_compression)
enum {
/*
@@ -794,7 +800,7 @@ struct bch_sb_layout {
__u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
__u8 nr_superblocks;
__u8 pad[5];
- __u64 sb_offset[61];
+ __le64 sb_offset[61];
} __attribute__((packed, aligned(8)));
#define BCH_SB_LAYOUT_SECTOR 7
@@ -1089,6 +1095,11 @@ struct jset_entry {
};
};
+struct jset_entry_blacklist {
+ struct jset_entry entry;
+ __le64 seq;
+};
+
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum {
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 73089a90..97015084 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1,6 +1,7 @@
#include "bcachefs.h"
#include "bkey.h"
+#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
@@ -80,37 +81,6 @@ static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
const struct bkey_format *format) {}
#endif
-int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
- char *out = buf, *end = buf + size;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
- k->u64s, k->type, k->p.inode, k->p.offset,
- k->p.snapshot, k->size, k->version.lo);
-
- BUG_ON(bkey_packed(k));
-
- switch (k->type) {
- case KEY_TYPE_DELETED:
- p(" deleted");
- break;
- case KEY_TYPE_DISCARD:
- p(" discard");
- break;
- case KEY_TYPE_ERROR:
- p(" error");
- break;
- case KEY_TYPE_COOKIE:
- p(" cookie");
- break;
- }
-#undef p
-
- return out - buf;
-}
-
struct pack_state {
const struct bkey_format *format;
unsigned bits; /* bits remaining in current word */
@@ -336,7 +306,8 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
* Extents - we have to guarantee that if an extent is packed, a trimmed
* version will also pack:
*/
- if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
+ if (bkey_start_offset(in) <
+ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
return false;
pack_state_finish(&state, out);
@@ -800,7 +771,7 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
bool *eax_zeroed)
{
unsigned bits = format->bits_per_field[field];
- u64 offset = format->field_offset[field];
+ u64 offset = le64_to_cpu(format->field_offset[field]);
unsigned i, byte, bit_offset, align, shl, shr;
if (!bits && !offset) {
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index dc0b88f7..89697956 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -8,7 +8,6 @@
#include "vstructs.h"
void bch2_to_binary(char *, const u64 *, unsigned);
-int bch2_bkey_to_text(char *, size_t, const struct bkey *);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
@@ -377,7 +376,8 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
return f->bits_per_field[nr] < 64
- ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+ ? (le64_to_cpu(f->field_offset[nr]) +
+ ~(~0ULL << f->bits_per_field[nr]))
: U64_MAX;
}
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 23894158..1736a483 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -18,28 +18,11 @@ const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};
-/* Returns string indicating reason for being invalid, or NULL if valid: */
-const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
+const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
- if (k.k->u64s < BKEY_U64s)
- return "u64s too small";
-
- if (!ops->is_extents) {
- if (k.k->size)
- return "nonzero size field";
- } else {
- if ((k.k->size == 0) != bkey_deleted(k.k))
- return "bad size field";
- }
-
- if (ops->is_extents &&
- !k.k->size &&
- !bkey_deleted(k.k))
- return "zero size field";
-
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
@@ -63,8 +46,41 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
}
}
-const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch2_bkey_ops[type];
+
+ if (k.k->u64s < BKEY_U64s)
+ return "u64s too small";
+
+ if (!ops->is_extents) {
+ if (k.k->size)
+ return "nonzero size field";
+ } else {
+ if ((k.k->size == 0) != bkey_deleted(k.k))
+ return "bad size field";
+ }
+
+ if (ops->is_extents &&
+ !k.k->size &&
+ !bkey_deleted(k.k))
+ return "zero size field";
+
+ if (k.k->p.snapshot)
+ return "nonzero snapshot";
+
+ return NULL;
+}
+
+const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ return __bch2_bkey_invalid(c, type, k) ?:
+ bch2_bkey_val_invalid(c, type, k);
+}
+
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
return "key before start of btree node";
@@ -72,10 +88,7 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
- if (k.k->p.snapshot)
- return "nonzero snapshot";
-
- return bch2_bkey_invalid(c, btree_node_type(b), k);
+ return NULL;
}
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
@@ -86,7 +99,8 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
BUG_ON(!k.k->u64s);
- invalid = bch2_btree_bkey_invalid(c, b, k);
+ invalid = bch2_bkey_invalid(c, type, k) ?:
+ bch2_bkey_in_btree_node(b, k);
if (invalid) {
char buf[160];
@@ -100,33 +114,62 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
ops->key_debugcheck(c, b, k);
}
-char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ char *out = buf, *end = buf + size;
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text)
- ops->val_to_text(c, buf, size, k);
+ p("u64s %u type %u ", k->u64s, k->type);
+
+ if (bkey_cmp(k->p, POS_MAX))
+ p("%llu:%llu", k->p.inode, k->p.offset);
+ else
+ p("POS_MAX");
- return buf;
+ p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+
+ return out - buf;
}
-char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
- out += bch2_bkey_to_text(out, end - out, k.k);
-
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text) {
- out += scnprintf(out, end - out, ": ");
- ops->val_to_text(c, out, end - out, k);
+ switch (k.k->type) {
+ case KEY_TYPE_DELETED:
+ p(" deleted");
+ break;
+ case KEY_TYPE_DISCARD:
+ p(" discard");
+ break;
+ case KEY_TYPE_ERROR:
+ p(" error");
+ break;
+ case KEY_TYPE_COOKIE:
+ p(" cookie");
+ break;
+ default:
+ if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
+ ops->val_to_text(c, buf, size, k);
+ break;
}
- return buf;
+ return out - buf;
+}
+
+int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+
+ out += bch2_bkey_to_text(out, end - out, k.k);
+ out += scnprintf(out, end - out, ": ");
+ out += bch2_val_to_text(c, type, out, end - out, k);
+
+ return out - buf;
}
void bch2_bkey_swab(enum bkey_type type,
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 29c1abd3..59db3037 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -64,15 +64,19 @@ struct bkey_ops {
bool is_extents;
};
+const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
+ struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
- struct bkey_s_c);
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
-char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
+
+int bch2_bkey_to_text(char *, size_t, const struct bkey *);
+int bch2_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
+int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 1198fe39..2294cc3a 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -96,7 +96,7 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
if (gen_after(ca->oldest_gens[b], ptr->gen))
@@ -159,14 +159,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
- "superblock not marked as containing replicas"))) {
+ "superblock not marked as containing replicas (type %u)",
+ data_type))) {
ret = bch2_check_mark_super(c, e, data_type);
if (ret)
return ret;
}
extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr);
if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@@ -315,14 +316,14 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
- if (layout->sb_offset[i] == BCH_SB_SECTOR)
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags);
- mark_metadata_sectors(c, ca,
- layout->sb_offset[i],
- layout->sb_offset[i] +
- (1 << layout->sb_max_size_bits),
+ mark_metadata_sectors(c, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
BUCKET_SB, flags);
}
@@ -414,7 +415,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
- ca = c->devs[ob->ptr.dev];
+ ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
@@ -424,7 +425,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
}
}
-void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket *g;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 38c373c6..87a8ddf9 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -556,7 +556,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
- u64 start_time;
+ u64 start_time, seq = 0;
unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
@@ -595,12 +595,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
bch2_time_stats_update(&c->btree_sort_time, start_time);
/* Make sure we preserve bset journal_seq: */
- for (t = b->set + start_idx + 1;
- t < b->set + end_idx;
- t++)
- start_bset->journal_seq =
- max(start_bset->journal_seq,
- bset(b, t)->journal_seq);
+ for (t = b->set + start_idx; t < b->set + end_idx; t++)
+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+ start_bset->journal_seq = cpu_to_le64(seq);
if (sorting_entire_node) {
unsigned u64s = le16_to_cpu(out->keys.u64s);
@@ -958,6 +955,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
+ enum bkey_type type = btree_node_type(b);
bool seen_non_whiteout = false;
const char *err;
int ret = 0;
@@ -1025,7 +1023,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
- whiteout_u64s = 0;
+ *whiteout_u64s = 0;
}
for (k = i->start;
@@ -1059,16 +1057,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(btree_node_type(b), &b->format, k);
+ bch2_bkey_swab(type, &b->format, k);
u = bkey_disassemble(b, k, &tmp);
- invalid = bch2_btree_bkey_invalid(c, b, u);
+ invalid = __bch2_bkey_invalid(c, type, u) ?:
+ bch2_bkey_in_btree_node(b, u) ?:
+ (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
if (invalid) {
char buf[160];
- bch2_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), u);
+ bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
@@ -1114,6 +1113,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct btree_node_entry *bne;
struct btree_node_iter *iter;
struct btree_node *sorted;
+ struct bkey_packed *k;
+ struct bset *i;
bool used_mempool;
unsigned u64s;
int ret, retry_read = 0, write = READ;
@@ -1137,7 +1138,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
- struct bset *i;
if (!b->written) {
i = &b->data->keys;
@@ -1238,6 +1238,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+ i = &b->data->keys;
+ for (k = i->start; k != vstruct_last(i);) {
+ enum bkey_type type = btree_node_type(b);
+ struct bkey tmp;
+ struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
+ const char *invalid = bch2_bkey_val_invalid(c, type, u);
+
+ if (invalid) {
+ char buf[160];
+
+ bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+ btree_err(BTREE_ERR_FIXABLE, c, b, i,
+ "invalid bkey %s: %s", buf, invalid);
+
+ btree_keys_account_key_drop(&b->nr, 0, k);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+
+ k = bkey_next(k);
+ }
+
bch2_bset_build_aux_tree(b, b->set, false);
set_needs_whiteout(btree_bset_first(b));
@@ -1278,13 +1303,13 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
- bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
+ bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
- if (!bio->bi_error &&
+ if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
@@ -1377,17 +1402,24 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
bch2_btree_node_read(c, b, true);
- six_unlock_write(&b->lock);
if (btree_node_read_error(b)) {
- six_unlock_intent(&b->lock);
- return -EIO;
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ ret = -EIO;
+ goto err;
}
bch2_btree_set_root_for_read(c, b);
+err:
+ six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
- return 0;
+ return ret;
}
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@@ -1412,35 +1444,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct btree_iter iter;
+ int ret;
- six_lock_read(&b->lock);
- bkey_copy(&tmp.k, &b->key);
- six_unlock_read(&b->lock);
+ __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH,
+ b->level, 0);
+retry:
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto err;
- if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
- /* Node has been freed: */
+ /* has node been freed? */
+ if (iter.nodes[b->level] != b) {
+ /* node has been freed: */
+ if (!btree_node_dying(b))
+ panic("foo4\n");
goto out;
}
- new_key = bkey_i_to_extent(&tmp.k);
+ if (!btree_node_hashed(b))
+ panic("foo5\n");
- while (wbio->replicas_failed) {
- unsigned idx = __fls(wbio->replicas_failed);
+ bkey_copy(&tmp.k, &b->key);
- bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
- wbio->replicas_failed ^= 1 << idx;
- }
+ new_key = bkey_i_to_extent(&tmp.k);
+ e = extent_i_to_s(new_key);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+ bch2_extent_drop_ptr(e, ptr);
- if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
- bch2_btree_node_update_key(c, b, new_key)) {
- set_btree_node_noevict(b);
- bch2_fatal_error(c);
- }
+ if (!bch2_extent_nr_ptrs(e.c))
+ goto err;
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR)
+ goto retry;
+ if (ret)
+ goto err;
out:
+ bch2_btree_iter_unlock(&iter);
bio_put(&wbio->bio);
btree_node_write_done(c, b);
if (cl)
closure_put(cl);
+ return;
+err:
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_error(c, "fatal error writing btree node");
+ goto out;
}
void bch2_btree_write_error_work(struct work_struct *work)
@@ -1470,12 +1524,17 @@ static void btree_node_write_endio(struct bio *bio)
struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
+ unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
- if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
- bch2_meta_write_fault("btree"))
- set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
+ if (bio->bi_status == BLK_STS_REMOVED ||
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+ bch2_meta_write_fault("btree")) {
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ }
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@@ -1491,12 +1550,11 @@ static void btree_node_write_endio(struct bio *bio)
wbio->used_mempool,
wbio->data);
- if (wbio->replicas_failed) {
- unsigned long flags;
-
+ if (wbio->failed.nr) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
queue_work(c->wq, &c->btree_write_error_work);
return;
}
@@ -1707,6 +1765,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
+ wbio->failed.nr = 0;
wbio->order = order;
wbio->used_mempool = used_mempool;
wbio->data = data;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index b0e64957..0b505a73 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -75,8 +75,8 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
struct btree_iter *linked;
struct btree *b = iter->nodes[level];
- enum btree_node_locked_type want = btree_lock_want(iter, level);
- enum btree_node_locked_type have = btree_node_locked_type(iter, level);
+ int want = btree_lock_want(iter, level);
+ int have = btree_node_locked_type(iter, level);
if (want == have)
return true;
@@ -108,6 +108,17 @@ success:
return true;
}
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+ unsigned l;
+
+ for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+ if (!bch2_btree_node_relock(iter, l))
+ return false;
+
+ return true;
+}
+
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
@@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want)
{
struct btree_iter *linked;
- unsigned l;
/* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want)
@@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter);
- for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
- if (!bch2_btree_node_relock(iter, l))
- goto fail;
+ if (bch2_btree_iter_relock(iter))
+ return true;
- return true;
-fail:
/*
* Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index c2711892..acfe5b59 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -75,7 +75,7 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
-static inline int btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
@@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
}
bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index f1e06a37..f0e6896a 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -196,6 +196,7 @@ enum btree_flags {
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
+ BTREE_NODE_dying,
};
BTREE_FLAG(read_in_flight);
@@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
static inline struct btree_write *btree_current_write(struct btree *b)
{
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index e11fcec9..c7c29306 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
- struct bkey_i_extent *);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+ struct btree *, struct bkey_i_extent *);
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 1fe8fff8..04854532 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -21,7 +21,7 @@
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
@@ -686,7 +686,7 @@ retry:
BUG_ON(c->btree_roots[b->btree_id].as != as);
c->btree_roots[b->btree_id].as = NULL;
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, WRITE);
/*
* We don't have to wait anything anything here (before
@@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree_write *w;
struct bset_tree *t;
+ set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b);
/*
@@ -925,7 +926,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* in with keys that aren't in the journal anymore:
*/
for_each_bset(b, t)
- as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+ as->journal_seq = max(as->journal_seq,
+ le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock);
@@ -1027,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
+ BUG_ON(btree_node_root(c, b) &&
+ (b->level < btree_node_root(c, b)->level ||
+ !btree_node_dying(btree_node_root(c, b))));
+
btree_node_root(c, b) = b;
mutex_unlock(&c->btree_root_lock);
@@ -1054,7 +1060,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
gc_pos_btree_root(b->btree_id));
}
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{
struct btree_root *r = &c->btree_roots[b->btree_id];
@@ -1064,6 +1070,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
bkey_copy(&r->key, &b->key);
r->level = b->level;
r->alive = true;
+ if (rw == WRITE)
+ c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
}
@@ -1787,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
return ret;
}
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
- struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+ struct btree_update *as,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i_extent *new_key)
{
- struct btree_update *as = NULL;
- struct btree *parent, *new_hash = NULL;
- struct btree_iter iter;
- struct closure cl;
+ struct btree *parent;
bool must_rewrite_parent = false;
int ret;
- __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
- BTREE_MAX_DEPTH,
- b->level, 0);
- closure_init_stack(&cl);
-
- ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
- if (ret)
- return ret;
-
-retry:
- down_read(&c->gc_lock);
- ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto err;
-
- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
- if (!new_hash &&
- PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
- /* bch2_btree_reserve_get will unlock */
- do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
- closure_sync(&cl);
- } while (ret == -EAGAIN);
-
- BUG_ON(ret);
-
- new_hash = bch2_btree_node_mem_alloc(c);
- }
-
- as = bch2_btree_update_start(c, iter.btree_id,
- btree_update_reserve_required(c, b),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN || ret == -EINTR) {
- bch2_btree_iter_unlock(&iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- goto retry;
- }
- goto err;
- }
-
- mutex_lock(&c->btree_interior_update_lock);
-
/*
* Two corner cases that need to be thought about here:
*
@@ -1869,22 +1829,12 @@ retry:
if (b->will_make_reachable)
must_rewrite_parent = true;
- /* other case: btree node being freed */
- if (iter.nodes[b->level] != b) {
- /* node has been freed: */
- BUG_ON(btree_node_hashed(b));
- mutex_unlock(&c->btree_interior_update_lock);
- goto err;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
if (must_rewrite_parent)
as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
btree_interior_update_add_node_reference(as, b);
- parent = iter.nodes[b->level + 1];
+ parent = iter->nodes[b->level + 1];
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
@@ -1893,8 +1843,8 @@ retry:
BUG_ON(ret);
}
- bch2_btree_insert_node(as, parent, &iter,
- &keylist_single(&new_key->k_i));
+ bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -1914,7 +1864,7 @@ retry:
BUG_ON(btree_node_root(c, b) != b);
- bch2_btree_node_lock_write(b, &iter);
+ bch2_btree_node_lock_write(b, iter);
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
@@ -1925,14 +1875,94 @@ retry:
&stats);
bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
- bkey_copy(&b->key, &new_key->k_i);
+
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new_key->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, &new_key->k_i);
+ }
btree_update_updated_root(as);
- bch2_btree_node_unlock_write(b, &iter);
+ bch2_btree_node_unlock_write(b, iter);
}
bch2_btree_update_done(as);
-out:
+}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+ struct btree *b, struct bkey_i_extent *new_key)
+{
+ struct btree_update *as = NULL;
+ struct btree *new_hash = NULL;
+ struct closure cl;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ if (!down_read_trylock(&c->gc_lock)) {
+ bch2_btree_iter_unlock(iter);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter)) {
+ ret = -EINTR;
+ goto err;
+ }
+ }
+
+ /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ /* bch2_btree_reserve_get will unlock */
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ if (ret) {
+ ret = -EINTR;
+
+ bch2_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter))
+ goto err;
+ }
+
+ new_hash = bch2_btree_node_mem_alloc(c);
+ }
+
+ as = bch2_btree_update_start(c, iter->btree_id,
+ btree_update_reserve_required(c, b),
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE,
+ &cl);
+ if (IS_ERR(as)) {
+ ret = PTR_ERR(as);
+ if (ret == -EAGAIN)
+ ret = -EINTR;
+
+ if (ret != -EINTR)
+ goto err;
+
+ bch2_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter))
+ goto err;
+ }
+
+ ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+ if (ret)
+ goto err_free_update;
+
+ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1941,14 +1971,12 @@ out:
six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock);
}
- bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
-err:
- if (as)
- bch2_btree_update_free(as);
- goto out;
+err_free_update:
+ bch2_btree_update_free(as);
+ goto err;
}
/* Init code: */
@@ -1962,7 +1990,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
BUG_ON(btree_node_root(c, b));
__bch2_btree_set_root_inmem(c, b);
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, READ);
}
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@@ -1998,7 +2026,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BUG_ON(btree_node_root(c, b));
bch2_btree_set_root_inmem(as, b);
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, WRITE);
bch2_btree_open_bucket_put(c, b);
six_unlock_intent(&b->lock);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index b73002de..f0a63232 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -174,9 +174,11 @@ do { \
#define bch2_usage_read_raw(_stats) \
({ \
- typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
+ typeof(*this_cpu_ptr(_stats)) _acc; \
int cpu; \
\
+ memset(&_acc, 0, sizeof(_acc)); \
+ \
for_each_possible_cpu(cpu) \
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
\
@@ -479,7 +481,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
{
struct bucket_mark old, new;
unsigned saturated;
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA;
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 0bd8d2d8..6f9b1226 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -68,16 +68,14 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
-
/* _uncompressed_ sectors: */
+ u64 online_reserved;
+ u64 available_cache;
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
-
- u64 online_reserved;
- u64 available_cache;
};
/*
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index d9a3212c..24af2ca1 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bcachefs_ioctl.h"
+#include "chardev.h"
#include "super.h"
#include "super-io.h"
@@ -25,7 +26,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ERR_PTR(-EINVAL);
rcu_read_lock();
- ca = c->devs[dev];
+ ca = rcu_dereference(c->devs[dev]);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@@ -80,7 +81,7 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
- if (copy_from_user(user_devs, arg.devs,
+ if (copy_from_user(user_devs, user_arg->devs,
sizeof(u64) * arg.nr_devs))
goto err;
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 1a089417..b0c8a50e 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -72,14 +72,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
}
}
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+ unsigned opt)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
- return bch2_csum_opt_to_type(c->opts.data_checksum, true);
+ return bch2_csum_opt_to_type(opt, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@@ -143,6 +144,14 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce;
}
+static inline struct nonce null_nonce(void)
+{
+ struct nonce ret;
+
+ memset(&ret, 0, sizeof(ret));
+ return ret;
+}
+
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 8357c8de..ca2a06e2 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -95,11 +95,17 @@ print:
vscnprintf(buf, sizeof(_buf), fmt, args);
va_end(args);
+ if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+ bch_err(c, "%s, exiting", buf);
+ mutex_unlock(&c->fsck_error_lock);
+ return FSCK_ERR_EXIT;
+ }
+
if (flags & FSCK_CAN_FIX) {
- if (c->opts.fix_errors == FSCK_ERR_ASK) {
+ if (c->opts.fix_errors == FSCK_OPT_ASK) {
printk(KERN_ERR "%s: fix?", buf);
fix = ask_yn();
- } else if (c->opts.fix_errors == FSCK_ERR_YES ||
+ } else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
if (print)
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 68635eee..28fe4fce 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -96,9 +96,10 @@ enum {
};
enum fsck_err_opts {
- FSCK_ERR_NO,
- FSCK_ERR_YES,
- FSCK_ERR_ASK,
+ FSCK_OPT_EXIT,
+ FSCK_OPT_YES,
+ FSCK_OPT_NO,
+ FSCK_OPT_ASK,
};
enum fsck_err_ret {
@@ -217,7 +218,7 @@ do { \
#define bcache_io_error(c, bio, fmt, ...) \
do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \
- (bio)->bi_error = -EIO; \
+ (bio)->bi_status = BLK_STS_IOERR; \
} while (0)
#endif /* _BCACHEFS_ERROR_H */
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 6e79f491..176978ca 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -18,6 +18,7 @@
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "super.h"
#include "super-io.h"
#include "util.h"
#include "xattr.h"
@@ -156,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr_ptrs = 0;
+
+ extent_for_each_ptr(e, ptr)
+ nr_ptrs += (!ptr->cached &&
+ bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
+ BCH_MEMBER_STATE_FAILED);
+
+ return nr_ptrs;
+}
+
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
@@ -362,7 +376,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr)
{
- return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+ return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
}
static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@@ -411,8 +425,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
break;
case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
- entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_ptr:
break;
@@ -432,10 +448,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
const struct bch_extent_ptr *ptr2;
struct bch_dev *ca;
- if (ptr->dev >= c->sb.nr_devices)
+ if (ptr->dev >= c->sb.nr_devices ||
+ !c->devs[ptr->dev])
return "pointer to invalid device";
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ca)
return "pointer to invalid device";
@@ -487,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = c->devs[ptr->dev];
+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@@ -528,7 +547,7 @@ static void extent_pick_read_device(struct bch_fs *c,
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
@@ -621,7 +640,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
bool bad;
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
@@ -1730,7 +1749,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
ptrs_per_tier[ca->mi.tier]++;
@@ -1844,7 +1863,7 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
static unsigned PTR_TIER(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
- return c->devs[ptr->dev]->mi.tier;
+ return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
@@ -1971,14 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
- unsigned tier = 0, nr_cached = 0, nr_good = 0;
+ unsigned tier = 0, nr_cached = 0;
+ unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached &&
- c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
- nr_good++;
-
if (nr_good <= c->opts.data_replicas)
return;
@@ -2103,7 +2118,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
- ca = c->devs[lp->dev];
+ ca = bch_dev_bkey_exists(c, lp->dev);
if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE;
@@ -2347,6 +2362,30 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
}
}
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+ struct btree_iter iter;
+ struct bpos end = pos;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ end.offset += size;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+ BTREE_ITER_WITH_HOLES, k) {
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+ break;
+
+ if (!bch2_extent_is_fully_allocated(k)) {
+ ret = -ENOSPC;
+ break;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 1ec2db5e..ab7993ab 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@@ -243,14 +244,14 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
- .csum.lo = crc->crc32.csum,
+ .csum.lo = (__force __le64) crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
- .csum.lo = crc->crc64.csum_lo,
- .csum.hi = crc->crc64.csum_hi,
+ .csum.lo = (__force __le64) crc->crc64.csum_lo,
+ .csum.hi = (__force __le64) crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
@@ -425,4 +426,6 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
+int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 298e3592..2c34a85c 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -28,8 +28,11 @@
struct i_sectors_hook {
struct extent_insert_hook hook;
- s64 sectors;
struct bch_inode_info *inode;
+ s64 sectors;
+ u64 new_i_size;
+ unsigned flags;
+ unsigned appending:1;
};
struct bchfs_write_op {
@@ -43,17 +46,6 @@ struct bchfs_write_op {
struct bch_write_op op;
};
-static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
- struct bch_inode_info *inode,
- bool is_dio)
-{
- op->inode = inode;
- op->sectors_added = 0;
- op->is_dio = is_dio;
- op->unalloc = false;
- op->new_i_size = U64_MAX;
-}
-
struct bch_writepage_io {
struct closure cl;
@@ -65,12 +57,8 @@ struct dio_write {
struct closure cl;
struct kiocb *req;
struct bch_fs *c;
- long written;
- long error;
loff_t offset;
- struct disk_reservation res;
-
struct iovec *iovec;
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
@@ -129,12 +117,6 @@ static int inode_set_size(struct bch_inode_info *inode,
lockdep_assert_held(&inode->ei_update_lock);
bi->bi_size = *new_i_size;
-
- if (atomic_long_read(&inode->ei_size_dirty_count))
- bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
- else
- bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
return 0;
}
@@ -145,16 +127,16 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
return __bch2_write_inode(c, inode, inode_set_size, &new_size);
}
-static inline void i_size_dirty_put(struct bch_inode_info *inode)
+static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
{
- atomic_long_dec_bug(&inode->ei_size_dirty_count);
+ inode->v.i_blocks += sectors;
}
-static inline void i_size_dirty_get(struct bch_inode_info *inode)
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
{
- lockdep_assert_held(&inode->v.i_rwsem);
-
- atomic_long_inc(&inode->ei_size_dirty_count);
+ mutex_lock(&inode->ei_update_lock);
+ __i_sectors_acct(c, inode, sectors);
+ mutex_unlock(&inode->ei_update_lock);
}
/* i_sectors accounting: */
@@ -172,90 +154,83 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
int sign = bkey_extent_is_allocation(&insert->k) -
(k.k && bkey_extent_is_allocation(k.k));
- EBUG_ON(!(h->inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY));
- EBUG_ON(!atomic_long_read(&h->inode->ei_sectors_dirty_count));
+ EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY));
h->sectors += sectors * sign;
return BTREE_INSERT_OK;
}
-static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- BUG_ON(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY);
-
- bi->bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
- return 0;
-}
-
-static int inode_clear_i_sectors_dirty(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
{
- BUG_ON(!(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY));
+ struct i_sectors_hook *h = p;
- bi->bi_sectors = atomic64_read(&inode->ei_sectors);
- bi->bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+ if (h->new_i_size != U64_MAX &&
+ (!h->appending ||
+ h->new_i_size > bi->bi_size))
+ bi->bi_size = h->new_i_size;
+ bi->bi_sectors += h->sectors;
+ bi->bi_flags &= ~h->flags;
return 0;
}
-static void i_sectors_dirty_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct i_sectors_hook *h)
+static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
{
- if (h->sectors) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += h->sectors;
- spin_unlock(&inode->v.i_lock);
+ int ret;
- atomic64_add(h->sectors, &inode->ei_sectors);
- EBUG_ON(atomic64_read(&inode->ei_sectors) < 0);
- }
+ mutex_lock(&h->inode->ei_update_lock);
+ if (h->new_i_size != U64_MAX)
+ i_size_write(&h->inode->v, h->new_i_size);
- EBUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count) <= 0);
+ __i_sectors_acct(c, h->inode, h->sectors);
- mutex_lock(&inode->ei_update_lock);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+ mutex_unlock(&h->inode->ei_update_lock);
- if (atomic_long_dec_and_test(&inode->ei_sectors_dirty_count)) {
- int ret = __bch2_write_inode(c, inode,
- inode_clear_i_sectors_dirty, NULL);
+ h->sectors = 0;
- ret = ret;
- }
-
- mutex_unlock(&inode->ei_update_lock);
+ return ret;
}
-static int __must_check i_sectors_dirty_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct i_sectors_hook *h)
+static int i_sectors_dirty_start_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi, void *p)
{
- int ret = 0;
+ struct i_sectors_hook *h = p;
- h->hook.fn = i_sectors_hook_fn;
- h->sectors = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- h->inode = inode;
-#endif
+ if (h->flags & BCH_INODE_I_SIZE_DIRTY)
+ bi->bi_size = h->new_i_size;
- if (atomic_long_inc_not_zero(&inode->ei_sectors_dirty_count))
- return 0;
-
- mutex_lock(&inode->ei_update_lock);
-
- if (!(inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY))
- ret = __bch2_write_inode(c, inode, inode_set_i_sectors_dirty,
- NULL);
+ bi->bi_flags |= h->flags;
+ return 0;
+}
- if (!ret)
- atomic_long_inc(&inode->ei_sectors_dirty_count);
+static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
+{
+ int ret;
- mutex_unlock(&inode->ei_update_lock);
+ mutex_lock(&h->inode->ei_update_lock);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h);
+ mutex_unlock(&h->inode->ei_update_lock);
return ret;
}
+static inline struct i_sectors_hook
+i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags)
+{
+ return (struct i_sectors_hook) {
+ .hook.fn = i_sectors_hook_fn,
+ .inode = inode,
+ .sectors = 0,
+ .new_i_size = U64_MAX,
+ .flags = flags|BCH_INODE_I_SECTORS_DIRTY,
+ };
+}
+
+/* normal i_size/i_sectors update machinery: */
+
struct bchfs_extent_trans_hook {
struct bchfs_write_op *op;
struct extent_insert_hook hook;
@@ -289,18 +264,18 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
/* XXX: inode->i_size locking */
- if (offset > inode->ei_size) {
- BUG_ON(inode->ei_flags & BCH_INODE_I_SIZE_DIRTY);
-
+ if (offset > inode->ei_inode.bi_size) {
if (!h->need_inode_update) {
h->need_inode_update = true;
return BTREE_INSERT_NEED_TRAVERSE;
}
+ BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY);
+
h->inode_u.bi_size = offset;
do_pack = true;
- inode->ei_size = offset;
+ inode->ei_inode.bi_size = offset;
if (h->op->is_dio)
i_size_write(&inode->v, offset);
@@ -315,15 +290,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
h->inode_u.bi_sectors += sectors;
do_pack = true;
- atomic64_add(sectors, &inode->ei_sectors);
-
h->op->sectors_added += sectors;
-
- if (h->op->is_dio) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += sectors;
- spin_unlock(&inode->v.i_lock);
- }
}
if (do_pack)
@@ -340,6 +307,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
struct btree_iter extent_iter, inode_iter;
struct bchfs_extent_trans_hook hook;
struct bkey_i *k = bch2_keylist_front(keys);
+ s64 orig_sectors_added = op->sectors_added;
int ret;
BUG_ON(k->k.p.inode != op->inode->v.i_ino);
@@ -362,7 +330,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
/* XXX: inode->i_size locking */
k = bch2_keylist_front(keys);
- if (min(k->k.p.offset << 9, op->new_i_size) > op->inode->ei_size)
+ if (min(k->k.p.offset << 9, op->new_i_size) >
+ op->inode->ei_inode.bi_size)
hook.need_inode_update = true;
if (hook.need_inode_update) {
@@ -430,9 +399,41 @@ err:
bch2_btree_iter_unlock(&extent_iter);
bch2_btree_iter_unlock(&inode_iter);
+ if (op->is_dio)
+ i_sectors_acct(wop->c, op->inode,
+ op->sectors_added - orig_sectors_added);
+
return ret;
}
+static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
+ struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_io_opts opts,
+ bool is_dio)
+{
+ op->inode = inode;
+ op->sectors_added = 0;
+ op->is_dio = is_dio;
+ op->unalloc = false;
+ op->new_i_size = U64_MAX;
+
+ bch2_write_op_init(&op->op, c);
+ op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+ op->op.devs = c->fastest_devs;
+ op->op.index_update_fn = bchfs_write_index_update;
+ op_journal_seq_set(&op->op, &inode->ei_journal_seq);
+}
+
+static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+
+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));
+ return opts;
+}
+
/* page state: */
/* stored in page->private: */
@@ -551,11 +552,8 @@ static void bch2_clear_page_bits(struct page *page)
s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
ClearPagePrivate(page);
- if (s.dirty_sectors) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks -= s.dirty_sectors;
- spin_unlock(&inode->v.i_lock);
- }
+ if (s.dirty_sectors)
+ i_sectors_acct(c, inode, -s.dirty_sectors);
if (s.reserved)
bch2_disk_reservation_put(c, &res);
@@ -563,19 +561,16 @@ static void bch2_clear_page_bits(struct page *page)
int bch2_set_page_dirty(struct page *page)
{
+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_page_state old, new;
old = page_state_cmpxchg(page_state(page), new,
new.dirty_sectors = PAGE_SECTORS - new.sectors;
);
- if (old.dirty_sectors != new.dirty_sectors) {
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
-
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += new.dirty_sectors - old.dirty_sectors;
- spin_unlock(&inode->v.i_lock);
- }
+ if (old.dirty_sectors != new.dirty_sectors)
+ i_sectors_acct(c, inode, new.dirty_sectors - old.dirty_sectors);
return __set_page_dirty_nobuffers(page);
}
@@ -624,7 +619,7 @@ static void bch2_readpages_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- if (!bio->bi_error) {
+ if (!bio->bi_status) {
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -846,6 +841,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts = io_opts(c, inode);
struct btree_iter iter;
struct page *page;
struct readpages_iter readpages_iter = {
@@ -868,7 +864,8 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT);
struct bch_read_bio *rbio =
- to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read));
+ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+ opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
bio_add_page_contig(&rbio->bio, page);
@@ -914,9 +911,10 @@ int bch2_readpage(struct file *file, struct page *page)
{
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts = io_opts(c, inode);
struct bch_read_bio *rbio;
- rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
@@ -925,8 +923,15 @@ int bch2_readpage(struct file *file, struct page *page)
struct bch_writepage_state {
struct bch_writepage_io *io;
+ struct bch_io_opts opts;
};
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ return (struct bch_writepage_state) { .opts = io_opts(c, inode) };
+}
+
static void bch2_writepage_io_free(struct closure *cl)
{
struct bch_writepage_io *io = container_of(cl,
@@ -982,13 +987,8 @@ static void bch2_writepage_io_done(struct closure *cl)
* PageWriteback is effectively our ref on the inode - fixup i_blocks
* before calling end_page_writeback:
*/
- if (io->op.sectors_added) {
- struct bch_inode_info *inode = io->op.inode;
-
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += io->op.sectors_added;
- spin_unlock(&inode->v.i_lock);
- }
+ if (io->op.sectors_added)
+ i_sectors_acct(c, io->op.inode, io->op.sectors_added);
bio_for_each_segment_all(bvec, bio, i)
end_page_writeback(bvec->bv_page);
@@ -1004,8 +1004,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
w->io = NULL;
atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages);
- io->op.op.pos.offset = bio->bi_iter.bi_sector;
-
closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
continue_at(&io->cl, bch2_writepage_io_done, NULL);
}
@@ -1017,46 +1015,26 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
static void bch2_writepage_io_alloc(struct bch_fs *c,
struct bch_writepage_state *w,
struct bch_inode_info *inode,
- struct page *page)
-{
- u64 inum = inode->v.i_ino;
- unsigned nr_replicas = page_state(page)->nr_replicas;
-
- EBUG_ON(!nr_replicas);
- /* XXX: disk_reservation->gen isn't plumbed through */
-
- if (!w->io) {
-alloc_io:
- w->io = container_of(bio_alloc_bioset(GFP_NOFS,
- BIO_MAX_PAGES,
- &c->writepage_bioset),
- struct bch_writepage_io, op.op.wbio.bio);
-
- closure_init(&w->io->cl, NULL);
- bch2_fswrite_op_init(&w->io->op, inode, false);
- bch2_write_op_init(&w->io->op.op, c,
- (struct disk_reservation) {
- .nr_replicas = c->opts.data_replicas,
- },
- c->fastest_devs,
- writepoint_hashed(inode->ei_last_dirtied),
- POS(inum, 0),
- &inode->ei_journal_seq,
- 0);
- w->io->op.op.index_update_fn = bchfs_write_index_update;
- }
+ struct page *page,
+ struct bch_page_state s)
+{
+ struct bch_write_op *op;
+ u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
- if (w->io->op.op.res.nr_replicas != nr_replicas ||
- bio_add_page_contig(&w->io->op.op.wbio.bio, page)) {
- bch2_writepage_do_io(w);
- goto alloc_io;
- }
+ w->io = container_of(bio_alloc_bioset(GFP_NOFS,
+ BIO_MAX_PAGES,
+ &c->writepage_bioset),
+ struct bch_writepage_io, op.op.wbio.bio);
+ op = &w->io->op.op;
- /*
- * We shouldn't ever be handed pages for multiple inodes in a single
- * pass - right?
- */
- BUG_ON(inode != w->io->op.inode);
+ closure_init(&w->io->cl, NULL);
+
+ bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false);
+ op->nr_replicas = s.nr_replicas;
+ op->res.nr_replicas = s.nr_replicas;
+ op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->pos = POS(inode->v.i_ino, offset);
+ op->wbio.bio.bi_iter.bi_sector = offset;
}
static int __bch2_writepage(struct bch_fs *c, struct page *page,
@@ -1091,32 +1069,39 @@ static int __bch2_writepage(struct bch_fs *c, struct page *page,
*/
zero_user_segment(page, offset, PAGE_SIZE);
do_io:
- bch2_writepage_io_alloc(c, w, inode, page);
-
- /* while page is locked: */
- w->io->op.new_i_size = i_size;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
-
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
EBUG_ON(!new.reserved &&
(new.sectors != PAGE_SECTORS ||
!new.allocated));
- if (new.allocated &&
- w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
+ if (new.allocated && w->opts.compression)
new.allocated = 0;
else if (!new.reserved)
- goto out;
+ break;
new.reserved = 0;
});
- w->io->op.op.res.sectors += PAGE_SECTORS *
- (old.reserved - new.reserved) *
- old.nr_replicas;
-out:
+ if (w->io &&
+ (w->io->op.op.res.nr_replicas != old.nr_replicas ||
+ !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
+ bch2_writepage_do_io(w);
+
+ if (!w->io)
+ bch2_writepage_io_alloc(c, w, inode, page, old);
+
+ BUG_ON(inode != w->io->op.inode);
+ BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+
+ if (old.reserved)
+ w->io->op.op.res.sectors += old.nr_replicas * PAGE_SECTORS;
+
+ /* while page is locked: */
+ w->io->op.new_i_size = i_size;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
BUG_ON(PageWriteback(page));
set_page_writeback(page);
unlock_page(page);
@@ -1127,7 +1112,8 @@ out:
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(mapping->host));
struct pagecache_iter iter;
struct page *page;
int ret = 0;
@@ -1275,7 +1261,8 @@ continue_unlock:
int bch2_writepage(struct page *page, struct writeback_control *wbc)
{
struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
int ret;
ret = __bch2_writepage(c, page, wbc, &w);
@@ -1306,7 +1293,7 @@ static int bch2_read_single_page(struct page *page,
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
wait_for_completion(&done);
- ret = rbio->bio.bi_error;
+ ret = blk_status_to_errno(rbio->bio.bi_status);
bio_put(&rbio->bio);
if (ret < 0)
@@ -1440,8 +1427,8 @@ static void bch2_direct_IO_read_endio(struct bio *bio)
{
struct dio_read *dio = bio->bi_private;
- if (bio->bi_error)
- dio->ret = bio->bi_error;
+ if (bio->bi_status)
+ dio->ret = blk_status_to_errno(bio->bi_status);
closure_put(&dio->cl);
}
@@ -1456,6 +1443,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
struct file *file, struct bch_inode_info *inode,
struct iov_iter *iter, loff_t offset)
{
+ struct bch_io_opts opts = io_opts(c, inode);
struct dio_read *dio;
struct bio *bio;
bool sync = is_sync_kiocb(req);
@@ -1512,7 +1500,7 @@ start:
ret = bio_iov_iter_get_pages(bio, iter);
if (ret < 0) {
/* XXX: fault inject this path */
- bio->bi_error = ret;
+ bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio);
break;
}
@@ -1523,7 +1511,7 @@ start:
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, to_rbio(bio), inode->v.i_ino);
+ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
}
if (sync) {
@@ -1542,9 +1530,9 @@ static long __bch2_dio_write_complete(struct dio_write *dio)
struct file *file = dio->req->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- long ret = dio->error ?: dio->written;
+ long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
- bch2_disk_reservation_put(dio->c, &dio->res);
+ bch2_disk_reservation_put(dio->c, &dio->iop.op.res);
__pagecache_block_put(&mapping->add_lock);
inode_dio_end(&inode->v);
@@ -1569,11 +1557,6 @@ static void bch2_dio_write_done(struct dio_write *dio)
struct bio_vec *bv;
int i;
- dio->written += dio->iop.op.written << 9;
-
- if (dio->iop.op.error)
- dio->error = dio->iop.op.error;
-
bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i)
put_page(bv->bv_page);
@@ -1586,38 +1569,15 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
struct file *file = dio->req->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
struct bio *bio = &dio->iop.op.wbio.bio;
- unsigned flags = 0;
int ret;
- if ((dio->req->ki_flags & IOCB_DSYNC) &&
- !dio->c->opts.journal_flush_disabled)
- flags |= BCH_WRITE_FLUSH;
-
ret = bio_iov_iter_get_pages(bio, &dio->iter);
if (ret < 0) {
- /*
- * these didn't get initialized, but bch2_dio_write_done() will
- * look at them:
- */
- dio->iop.op.error = 0;
- dio->iop.op.written = 0;
- dio->error = ret;
+ dio->iop.op.error = ret;
return;
}
- dio->iop.sectors_added = 0;
- bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
- dio->c->fastest_devs,
- writepoint_hashed((unsigned long) dio->task),
- POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
- &inode->ei_journal_seq,
- flags);
- dio->iop.op.index_update_fn = bchfs_write_index_update;
-
- if (!dio->iop.unalloc) {
- dio->res.sectors -= bio_sectors(bio);
- dio->iop.op.res.sectors = bio_sectors(bio);
- }
+ dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written);
task_io_account_write(bio->bi_iter.bi_size);
@@ -1632,7 +1592,7 @@ static void bch2_dio_write_loop_async(struct closure *cl)
bch2_dio_write_done(dio);
- if (dio->iter.count && !dio->error) {
+ if (dio->iter.count && !dio->iop.op.error) {
use_mm(dio->task->mm);
pagecache_block_get(&mapping->add_lock);
@@ -1652,31 +1612,6 @@ static void bch2_dio_write_loop_async(struct closure *cl)
}
}
-static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
- u64 size)
-{
- struct btree_iter iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- int ret = 0;
-
- end.offset += size;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
- BTREE_ITER_WITH_HOLES, k) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (!bch2_extent_is_fully_allocated(k)) {
- ret = -ENOSPC;
- break;
- }
- }
- bch2_btree_iter_unlock(&iter);
-
- return ret;
-}
-
static int bch2_direct_IO_write(struct bch_fs *c,
struct kiocb *req, struct file *file,
struct bch_inode_info *inode,
@@ -1703,13 +1638,17 @@ static int bch2_direct_IO_write(struct bch_fs *c,
closure_init(&dio->cl, NULL);
dio->req = req;
dio->c = c;
- dio->written = 0;
- dio->error = 0;
dio->offset = offset;
dio->iovec = NULL;
dio->iter = *iter;
dio->task = current;
- bch2_fswrite_op_init(&dio->iop, inode, true);
+ bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
+ dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task);
+ dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
+
+ if ((dio->req->ki_flags & IOCB_DSYNC) &&
+ !c->opts.journal_flush_disabled)
+ dio->iop.op.flags |= BCH_WRITE_FLUSH;
if (offset + iter->count > inode->v.i_size)
sync = true;
@@ -1722,7 +1661,7 @@ static int bch2_direct_IO_write(struct bch_fs *c,
* Have to then guard against racing with truncate (deleting data that
* we would have been overwriting)
*/
- ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
+ ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
@@ -1735,6 +1674,8 @@ static int bch2_direct_IO_write(struct bch_fs *c,
dio->iop.unalloc = true;
}
+ dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
+
inode_dio_begin(&inode->v);
__pagecache_block_get(&mapping->add_lock);
@@ -1744,20 +1685,20 @@ static int bch2_direct_IO_write(struct bch_fs *c,
closure_sync(&dio->cl);
bch2_dio_write_done(dio);
- } while (dio->iter.count && !dio->error);
+ } while (dio->iter.count && !dio->iop.op.error);
closure_debug_destroy(&dio->cl);
return __bch2_dio_write_complete(dio);
} else {
bch2_do_direct_IO_write(dio);
- if (dio->iter.count && !dio->error) {
+ if (dio->iter.count && !dio->iop.op.error) {
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
dio->iovec = kmalloc(dio->iter.nr_segs *
sizeof(struct iovec),
GFP_KERNEL);
if (!dio->iovec)
- dio->error = -ENOMEM;
+ dio->iop.op.error = -ENOMEM;
} else {
dio->iovec = dio->inline_vecs;
}
@@ -1965,11 +1906,11 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
}
-static int __bch2_truncate_page(struct address_space *mapping,
+static int __bch2_truncate_page(struct bch_inode_info *inode,
pgoff_t index, loff_t start, loff_t end)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
unsigned start_offset = start & (PAGE_SIZE - 1);
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
struct page *page;
@@ -2049,10 +1990,10 @@ out:
return ret;
}
-static int bch2_truncate_page(struct address_space *mapping, loff_t from)
+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
{
- return __bch2_truncate_page(mapping, from >> PAGE_SHIFT,
- from, from + PAGE_SIZE);
+ return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
+ from, from + PAGE_SIZE);
}
int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
@@ -2060,6 +2001,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
bool shrink = iattr->ia_size <= inode->v.i_size;
+ struct i_sectors_hook i_sectors_hook =
+ i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
int ret = 0;
inode_dio_wait(&inode->v);
@@ -2069,17 +2012,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
/* sync appends.. */
/* XXX what protects inode->i_size? */
- if (iattr->ia_size > inode->ei_size)
+ if (iattr->ia_size > inode->ei_inode.bi_size)
ret = filemap_write_and_wait_range(mapping,
- inode->ei_size, S64_MAX);
+ inode->ei_inode.bi_size, S64_MAX);
if (ret)
goto err_put_pagecache;
- mutex_lock(&inode->ei_update_lock);
- i_size_dirty_get(inode);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
+ i_sectors_hook.new_i_size = iattr->ia_size;
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
goto err;
@@ -2090,45 +2031,32 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
* here (new i_size < current i_size):
*/
if (shrink) {
- struct i_sectors_hook i_sectors_hook;
- int ret;
-
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = bch2_truncate_page(inode, iattr->ia_size);
if (unlikely(ret))
goto err;
- ret = bch2_truncate_page(inode->v.i_mapping, iattr->ia_size);
- if (unlikely(ret)) {
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
- goto err;
- }
-
ret = bch2_inode_truncate(c, inode->v.i_ino,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- &i_sectors_hook.hook,
- &inode->ei_journal_seq);
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
-
+ round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &inode->ei_journal_seq);
if (unlikely(ret))
goto err;
}
- mutex_lock(&inode->ei_update_lock);
setattr_copy(&inode->v, iattr);
- inode->v.i_mtime = inode->v.i_ctime = current_fs_time(inode->v.i_sb);
-out:
- /* clear I_SIZE_DIRTY: */
- i_size_dirty_put(inode);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
+ inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
+err:
+ /*
+ * On error - in particular, bch2_truncate_page() error - don't clear
+ * I_SIZE_DIRTY, as we've left data above i_size!:
+ */
+ if (ret)
+ i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err_put_pagecache:
pagecache_block_put(&mapping->add_lock);
return ret;
-err:
- mutex_lock(&inode->ei_update_lock);
- goto out;
}
static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
@@ -2144,33 +2072,41 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
offset, offset + len);
if (unlikely(ret))
- goto out;
+ goto err;
if (offset >> PAGE_SHIFT !=
(offset + len) >> PAGE_SHIFT) {
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
(offset + len) >> PAGE_SHIFT,
offset, offset + len);
if (unlikely(ret))
- goto out;
+ goto err;
}
truncate_pagecache_range(&inode->v, offset, offset + len - 1);
if (discard_start < discard_end) {
struct disk_reservation disk_res;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook =
+ i_sectors_hook_init(inode, 0);
int ret;
- BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
-
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
- goto out;
+ goto err;
+
+ /*
+ * We need to pass in a disk reservation here because we might
+ * be splitting a compressed extent into two. This isn't a
+ * problem with truncate because truncate will never split an
+ * extent, only truncate it...
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, 0, 0);
+ BUG_ON(ret);
ret = bch2_btree_delete_range(c,
BTREE_ID_EXTENTS,
@@ -2180,11 +2116,11 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
&disk_res,
&i_sectors_hook.hook,
&inode->ei_journal_seq);
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
bch2_disk_reservation_put(c, &disk_res);
+
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
}
-out:
+err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
@@ -2200,7 +2136,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
struct btree_iter dst;
BKEY_PADDED(k) copy;
struct bkey_s_c k;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
loff_t new_size;
int ret;
@@ -2237,7 +2173,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
if (ret)
goto err;
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (ret)
goto err;
@@ -2278,8 +2214,14 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
BTREE_INSERT_ENTRY(&dst, &copy.k));
bch2_disk_reservation_put(c, &disk_res);
btree_iter_err:
- if (ret < 0 && ret != -EINTR)
- goto err_unwind;
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ goto err_put_sectors_dirty;
+ /*
+ * XXX: if we error here we've left data with multiple
+ * pointers... which isn't a _super_ serious problem...
+ */
bch2_btree_iter_cond_resched(&src);
}
@@ -2292,30 +2234,18 @@ btree_iter_err:
&i_sectors_hook.hook,
&inode->ei_journal_seq);
if (ret)
- goto err_unwind;
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ goto err_put_sectors_dirty;
- mutex_lock(&inode->ei_update_lock);
i_size_write(&inode->v, new_size);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
-
+ i_sectors_hook.new_i_size = new_size;
+err_put_sectors_dirty:
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
- return ret;
-err_unwind:
- /*
- * XXX: we've left data with multiple pointers... which isn't a _super_
- * serious problem...
- */
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
-err:
bch2_btree_iter_unlock(&src);
bch2_btree_iter_unlock(&dst);
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(&inode->v);
return ret;
}
@@ -2324,11 +2254,11 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
{
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
struct btree_iter iter;
- struct bpos end;
+ struct bpos end_pos;
loff_t block_start, block_end;
- loff_t new_size = offset + len;
+ loff_t end = offset + len;
unsigned sectors;
unsigned replicas = READ_ONCE(c->opts.data_replicas);
int ret;
@@ -2340,45 +2270,43 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, new_size);
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+ ret = inode_newsize_ok(&inode->v, end);
if (ret)
goto err;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
- offset, offset + len);
+ offset, end);
if (!ret &&
- offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT)
- ret = __bch2_truncate_page(mapping,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_page(inode,
+ end >> PAGE_SHIFT,
+ offset, end);
if (unlikely(ret))
goto err;
- truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+ truncate_pagecache_range(&inode->v, offset, end - 1);
block_start = round_up(offset, PAGE_SIZE);
- block_end = round_down(offset + len, PAGE_SIZE);
+ block_end = round_down(end, PAGE_SIZE);
} else {
block_start = round_down(offset, PAGE_SIZE);
- block_end = round_up(offset + len, PAGE_SIZE);
+ block_end = round_up(end, PAGE_SIZE);
}
bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9));
- end = POS(inode->v.i_ino, block_end >> 9);
+ end_pos = POS(inode->v.i_ino, block_end >> 9);
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
goto err;
- while (bkey_cmp(iter.pos, end) < 0) {
+ while (bkey_cmp(iter.pos, end_pos) < 0) {
struct disk_reservation disk_res = { 0 };
struct bkey_i_reservation reservation;
struct bkey_s_c k;
@@ -2407,7 +2335,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
reservation.k.size = k.k->size;
bch2_cut_front(iter.pos, &reservation.k_i);
- bch2_cut_back(end, &reservation.k);
+ bch2_cut_back(end_pos, &reservation.k);
sectors = reservation.k.size;
reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
@@ -2435,11 +2363,11 @@ btree_iter_err:
}
bch2_btree_iter_unlock(&iter);
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->v.i_size) {
- i_size_write(&inode->v, new_size);
+ end > inode->v.i_size) {
+ i_size_write(&inode->v, end);
mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, inode->v.i_size);
@@ -2449,14 +2377,14 @@ btree_iter_err:
/* blech */
if ((mode & FALLOC_FL_KEEP_SIZE) &&
(mode & FALLOC_FL_ZERO_RANGE) &&
- inode->ei_size != inode->v.i_size) {
+ inode->ei_inode.bi_size != inode->v.i_size) {
/* sync appends.. */
ret = filemap_write_and_wait_range(mapping,
- inode->ei_size, S64_MAX);
+ inode->ei_inode.bi_size, S64_MAX);
if (ret)
goto err;
- if (inode->ei_size != inode->v.i_size) {
+ if (inode->ei_inode.bi_size != inode->v.i_size) {
mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, inode->v.i_size);
mutex_unlock(&inode->ei_update_lock);
@@ -2468,7 +2396,7 @@ btree_iter_err:
return 0;
err_put_sectors_dirty:
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err:
bch2_btree_iter_unlock(&iter);
pagecache_block_put(&mapping->add_lock);
@@ -2669,11 +2597,14 @@ void bch2_fs_fsio_exit(struct bch_fs *c)
int bch2_fs_fsio_init(struct bch_fs *c)
{
if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
+ 4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
+ BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio)) ||
+ 4, offsetof(struct dio_read, rbio.bio),
+ BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, iop.op.wbio.bio)))
+ 4, offsetof(struct dio_write, iop.op.wbio.bio),
+ BIOSET_NEED_BVECS))
return -ENOMEM;
return 0;
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index bd915fec..24228c8e 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -75,7 +75,7 @@ do { \
/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
- set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags);
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}
static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -99,13 +99,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
return -EINVAL;
bi->bi_flags = newflags;
- inode->v.i_ctime = current_fs_time(inode->v.i_sb);
+ inode->v.i_ctime = current_time(&inode->v);
return 0;
}
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags);
+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg);
}
@@ -153,7 +153,7 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
{
struct fsxattr fa = { 0 };
- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags);
+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
return copy_to_user(arg, &fa, sizeof(fa));
}
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 43688cd3..cb0397f1 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -12,6 +12,7 @@
#include "fs-ioctl.h"
#include "fsck.h"
#include "inode.h"
+#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "super.h"
@@ -130,10 +131,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
- if (!ret) {
- inode->ei_size = inode_u.bi_size;
- inode->ei_flags = inode_u.bi_flags;
- }
+ if (!ret)
+ inode->ei_inode = inode_u;
out:
bch2_btree_iter_unlock(&iter);
@@ -146,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
return __bch2_write_inode(c, inode, NULL, NULL);
}
-int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret;
@@ -158,7 +157,7 @@ int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
return ret;
}
-int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret = 0;
@@ -223,7 +222,9 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
bch2_inode_init(c, &inode_u,
i_uid_read(&inode->v),
i_gid_read(&inode->v),
- inode->v.i_mode, rdev);
+ inode->v.i_mode, rdev,
+ &dir->ei_inode);
+
ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
@@ -277,7 +278,7 @@ static int bch2_vfs_dirent_create(struct bch_fs *c,
if (unlikely(ret))
return ret;
- dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb);
+ dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
mark_inode_dirty_sync(&dir->v);
return 0;
}
@@ -344,7 +345,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
- inode->v.i_ctime = current_fs_time(dir->v.i_sb);
+ inode->v.i_ctime = current_time(&dir->v);
ret = bch2_inc_nlink(c, inode);
if (ret)
@@ -473,7 +474,7 @@ static int bch2_rename(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_fs_time(old_dir->v.i_sb);
+ struct timespec now = current_time(&old_dir->v);
int ret;
lockdep_assert_held(&old_dir->v.i_rwsem);
@@ -551,7 +552,7 @@ static int bch2_rename_exchange(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_fs_time(old_dir->v.i_sb);
+ struct timespec now = current_time(&old_dir->v);
int ret;
ret = bch2_dirent_rename(c,
@@ -909,10 +910,8 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0;
- inode->ei_size = bi->bi_size;
- inode->ei_flags = bi->bi_flags;
- atomic64_set(&inode->ei_sectors, bi->bi_sectors);
inode->ei_str_hash = bch2_hash_info_init(c, bi);
+ inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
@@ -949,8 +948,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
inode->ei_journal_seq = 0;
- atomic_long_set(&inode->ei_size_dirty_count, 0);
- atomic_long_set(&inode->ei_sectors_dirty_count, 0);
return &inode->v;
}
@@ -995,12 +992,6 @@ static void bch2_evict_inode(struct inode *vinode)
truncate_inode_pages_final(&inode->v.i_data);
- if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
- /* XXX - we want to check this stuff iff there weren't IO errors: */
- BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
- BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
- }
-
clear_inode(&inode->v);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
@@ -1272,9 +1263,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
- sb->s_bdi = &c->bdi;
strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+ ret = super_setup_bdi(sb);
+ if (ret)
+ goto err_put_super;
+
+ sb->s_bdi->congested_fn = bch2_congested;
+ sb->s_bdi->congested_data = c;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index d255ca7c..652105fb 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -1,6 +1,7 @@
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
+#include "opts.h"
#include "str_hash.h"
#include <linux/seqlock.h>
@@ -11,22 +12,12 @@ struct bch_inode_info {
struct mutex ei_update_lock;
u64 ei_journal_seq;
-
- atomic_long_t ei_size_dirty_count;
-
- /*
- * these are updated whenever we update the inode in the btree - for
- * e.g. fsync
- */
- u64 ei_size;
- u32 ei_flags;
-
- atomic_long_t ei_sectors_dirty_count;
- atomic64_t ei_sectors;
+ unsigned long ei_last_dirtied;
struct bch_hash_info ei_str_hash;
- unsigned long ei_last_dirtied;
+ /* copy of inode in btree: */
+ struct bch_inode_unpacked ei_inode;
};
#define to_bch_ei(_inode) \
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 4760b16e..696926fe 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -204,7 +204,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
"hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s",
k.k->p.offset, hashed, h->chain.pos.offset,
- bch2_bkey_val_to_text(c, desc.btree_id,
+ bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
if (ret) {
@@ -224,7 +224,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
if (fsck_err_on(k2.k->type == desc.key_type &&
!desc.cmp_bkey(k, k2), c,
"duplicate hash table keys:\n%s",
- bch2_bkey_val_to_text(c, desc.btree_id,
+ bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
if (ret)
@@ -397,9 +397,9 @@ static int check_dirents(struct bch_fs *c)
if (fsck_err_on(have_target &&
d.v->d_type !=
- mode_to_type(le16_to_cpu(target.bi_mode)), c,
+ mode_to_type(target.bi_mode), c,
"incorrect d_type: should be %u:\n%s",
- mode_to_type(le16_to_cpu(target.bi_mode)),
+ mode_to_type(target.bi_mode),
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
struct bkey_i_dirent *n;
@@ -411,7 +411,7 @@ static int check_dirents(struct bch_fs *c)
}
bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode));
+ n->v.d_type = mode_to_type(target.bi_mode);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
@@ -493,7 +493,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
fsck_err:
return ret;
create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+ 0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed, root_inode);
@@ -545,7 +546,8 @@ create_lostfound:
if (ret)
return ret;
- bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+ 0, root_inode);
ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 05f617ae..71a24cc6 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -198,6 +198,12 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
if (bch2_inode_unpack(inode, &unpacked))
return "invalid variable length fields";
+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+ return "invalid data checksum type";
+
+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+ return "invalid compression type";
+
return NULL;
}
case BCH_INODE_BLOCKDEV:
@@ -221,6 +227,7 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
+ char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
struct bch_inode_unpacked unpacked;
@@ -228,11 +235,14 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k);
if (bch2_inode_unpack(inode, &unpacked)) {
- scnprintf(buf, size, "(unpack error)");
+ out += scnprintf(out, end - out, "(unpack error)");
break;
}
- scnprintf(buf, size, "i_size %llu", unpacked.bi_size);
+#define BCH_INODE_FIELD(_name, _bits) \
+ out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
+ BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
break;
}
}
@@ -243,9 +253,12 @@ const struct bkey_ops bch2_bkey_inode_ops = {
};
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
{
- s64 now = timespec_to_bch2_time(c, CURRENT_TIME);
+ s64 now = timespec_to_bch2_time(c,
+ timespec_trunc(current_kernel_time(),
+ c->sb.time_precision));
memset(inode_u, 0, sizeof(*inode_u));
@@ -261,6 +274,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->bi_mtime = now;
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
+
+ if (parent) {
+#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
+ BCH_INODE_FIELDS_INHERIT()
+#undef BCH_INODE_FIELD
+ }
}
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@@ -416,7 +435,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
- bi_generation = cpu_to_le32(inode_u.bi_generation) + 1;
+ bi_generation = inode_u.bi_generation + 1;
break;
}
case BCH_INODE_GENERATION: {
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 53c70617..8ebb6fb6 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -1,6 +1,8 @@
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
+#include "opts.h"
+
#include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops;
@@ -28,7 +30,8 @@ void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *)
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
- uid_t, gid_t, umode_t, dev_t);
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64,
@@ -55,6 +58,45 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
return div_s64(ns, c->sb.time_precision);
}
+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
+{
+ struct bch_io_opts ret = { 0 };
+
+#define BCH_INODE_OPT(_name, _bits) \
+ if (inode->bi_##_name) \
+ opt_set(ret, _name, inode->bi_##_name - 1);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id, u64 v)
+{
+ switch (id) {
+#define BCH_INODE_OPT(_name, ...) \
+ case Opt_##_name: \
+ inode->bi_##_name = v; \
+ break;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ default:
+ BUG();
+ }
+}
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id, u64 v)
+{
+ return __bch2_inode_opt_set(inode, id, v + 1);
+}
+
+static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id)
+{
+ return __bch2_inode_opt_set(inode, id, 0);
+}
+
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void);
#else
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 0c41e411..3369a2ff 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -20,6 +20,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "super.h"
#include "super-io.h"
#include <linux/blkdev.h>
@@ -139,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
- unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
@@ -147,7 +147,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
!c->devs[ptr->dev]);
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr + 1 < &extent_entry_last(e)->ptr) {
n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@@ -168,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->ca = ca;
- n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
@@ -184,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
submit_bio(&n->bio);
} else {
n->have_io_ref = false;
- bcache_io_error(c, &n->bio, "device has been removed");
+ n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
}
@@ -201,9 +200,12 @@ static void bch2_write_done(struct closure *cl)
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
- bch2_disk_reservation_put(op->c, &op->res);
+ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+ bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
+ op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
+
closure_return(cl);
}
@@ -244,9 +246,37 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *src, *dst = keys->keys, *n;
+ int ret;
op->flags |= BCH_WRITE_LOOPED;
+ for (src = keys->keys; src != keys->top; src = n) {
+ n = bkey_next(src);
+ bkey_copy(dst, src);
+
+ e = bkey_i_to_s_extent(dst);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (test_bit(ptr->dev, op->failed.d))
+ bch2_extent_drop_ptr(e, ptr);
+
+ ret = bch2_extent_nr_ptrs(e.c)
+ ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
+ : -EIO;
+ if (ret) {
+ keys->top = keys->keys;
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ goto err;
+ }
+
+ dst = bkey_next(dst);
+ }
+
+ keys->top = dst;
+
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
@@ -260,7 +290,7 @@ static void bch2_write_index(struct closure *cl)
op->error = ret;
}
}
-
+err:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE))
@@ -276,43 +306,6 @@ static void bch2_write_index(struct closure *cl)
}
}
-static void bch2_write_io_error(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct keylist *keys = &op->insert_keys;
- struct bch_fs *c = op->c;
- struct bch_extent_ptr *ptr;
- struct bkey_i *k;
- int ret;
-
- for_each_keylist_key(keys, k) {
- struct bkey_i *n = bkey_next(k);
- struct bkey_s_extent e = bkey_i_to_s_extent(k);
-
- extent_for_each_ptr_backwards(e, ptr)
- if (test_bit(ptr->dev, op->failed.d))
- bch2_extent_drop_ptr(e, ptr);
-
- memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
- keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
-
- ret = bch2_extent_nr_ptrs(e.c)
- ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
- : -EIO;
- if (ret) {
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- break;
- }
- }
-
- memset(&op->failed, 0, sizeof(op->failed));
-
- bch2_write_index(cl);
- return;
-}
-
static void bch2_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
@@ -324,10 +317,8 @@ static void bch2_write_endio(struct bio *bio)
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
- if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
set_bit(ca->dev_idx, op->failed.d);
- set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
- }
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@@ -706,11 +697,6 @@ do_write:
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
- ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
- BCH_DATA_USER);
- if (ret)
- goto err;
-
dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@@ -870,7 +856,8 @@ void bch2_write(struct closure *cl)
!percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only");
op->error = -EROFS;
- bch2_disk_reservation_put(c, &op->res);
+ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+ bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
}
@@ -916,7 +903,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
- __bch2_write_op_init(&op->write.op, c);
+ bch2_write_op_init(&op->write.op, c);
+ op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
+ op->write.op.compression_type =
+ bch2_compression_opt_to_type(rbio->opts.compression);
op->write.move_dev = -1;
op->write.op.devs = c->fastest_devs;
@@ -1060,7 +1050,7 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->split)
rbio = bch2_rbio_free(rbio);
else
- rbio->bio.bi_error = 0;
+ rbio->bio.bi_status = 0;
if (!(flags & BCH_READ_NODECODE))
flags |= BCH_READ_MUST_CLONE;
@@ -1073,7 +1063,8 @@ static void bch2_rbio_retry(struct work_struct *work)
__bch2_read(c, rbio, iter, inode, &avoid, flags);
}
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+ blk_status_t error)
{
rbio->retry = retry;
@@ -1081,7 +1072,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
return;
if (retry == READ_ERR) {
- bch2_rbio_parent(rbio)->bio.bi_error = error;
+ bch2_rbio_parent(rbio)->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
@@ -1236,7 +1227,7 @@ csum_err:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, -EIO);
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
return;
}
@@ -1245,13 +1236,13 @@ csum_err:
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
__bcache_io_error(c, "decompression error, inode %llu offset %llu",
rbio->pos.inode,
(u64) rbio->bvec_iter.bi_sector);
- bch2_rbio_error(rbio, READ_ERR, -EIO);
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
@@ -1270,8 +1261,8 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
+ if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
@@ -1281,9 +1272,9 @@ static void bch2_read_endio(struct bio *bio)
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_error(rbio, READ_RETRY, -EINTR);
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
else
- bch2_rbio_error(rbio, READ_ERR, -EINTR);
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
return;
}
@@ -1360,7 +1351,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split));
+ &c->bio_read_split),
+ orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
split = true;
@@ -1374,7 +1366,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* lose the error)
*/
rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
- &c->bio_read_split));
+ &c->bio_read_split),
+ orig->opts);
rbio->bio.bi_iter = iter;
split = true;
} else {
@@ -1428,6 +1421,8 @@ noclone:
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
+ if (rbio->split)
+ rbio = bch2_rbio_free(rbio);
if (!ret)
bch2_rbio_done(rbio);
}
@@ -1503,7 +1498,7 @@ err:
* possibly bigger than the memory that was
* originally allocated)
*/
- rbio->bio.bi_error = -EINTR;
+ rbio->bio.bi_status = BLK_STS_AGAIN;
bio_endio(&rbio->bio);
return;
}
@@ -1561,6 +1556,7 @@ retry:
case READ_RETRY:
goto retry;
case READ_ERR:
+ rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
return;
};
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index bd0d7c43..0c145eb6 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
@@ -29,11 +31,12 @@ enum bch_write_flags {
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
+ BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
- BCH_WRITE_DONE = (1 << 8),
- BCH_WRITE_LOOPED = (1 << 9),
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
+ BCH_WRITE_DONE = (1 << 9),
+ BCH_WRITE_LOOPED = (1 << 10),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -42,6 +45,12 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
+{
+ op->journal_seq_p = journal_seq;
+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+}
+
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
@@ -51,14 +60,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_write_index_default(struct bch_write_op *);
-static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
- op->csum_type = bch2_data_checksum_type(c);
+ op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0;
@@ -75,27 +84,6 @@ static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *
op->index_update_fn = bch2_write_index_default;
}
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct disk_reservation res,
- struct bch_devs_mask *devs,
- struct write_point_specifier write_point,
- struct bpos pos,
- u64 *journal_seq, unsigned flags)
-{
- __bch2_write_op_init(op, c);
- op->flags = flags;
- op->nr_replicas = res.nr_replicas;
- op->pos = pos;
- op->res = res;
- op->devs = devs;
- op->write_point = write_point;
-
- if (journal_seq) {
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
- }
-}
-
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
@@ -134,25 +122,27 @@ static inline void bch2_read_extent(struct bch_fs *c,
struct extent_pick_ptr *pick,
unsigned flags)
{
- rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
- rbio->_state = 0;
+ BUG_ON(rbio->_state);
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
}
-static inline struct bch_read_bio *rbio_init(struct bio *bio)
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+ struct bch_io_opts opts)
{
struct bch_read_bio *rbio = to_rbio(bio);
- rbio->_state = 0;
+ rbio->_state = 0;
+ rbio->promote = NULL;
+ rbio->opts = opts;
return rbio;
}
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index ed9a4bbe..ff18fdc9 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -6,6 +6,7 @@
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
+#include "opts.h"
#include "super_types.h"
#include <linux/llist.h>
@@ -56,6 +57,8 @@ struct bch_read_bio {
struct promote_op *promote;
+ struct bch_io_opts opts;
+
struct work_struct work;
struct bio bio;
@@ -69,8 +72,7 @@ struct bch_write_bio {
struct closure *cl;
};
- u8 ptr_idx;
- u8 replicas_failed;
+ struct bch_devs_list failed;
u8 order;
unsigned split:1,
@@ -90,8 +92,8 @@ struct bch_write_op {
struct bch_fs *c;
struct workqueue_struct *io_wq;
+ unsigned written; /* sectors */
u16 flags;
- u16 written; /* sectors */
s8 error;
unsigned csum_type:4;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 5d9a298d..b4e149ac 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -338,8 +338,8 @@ struct journal_list {
* Given a journal entry we just read, add it to the list of journal entries to
* be replayed:
*/
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
- struct jset *j)
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_list *jlist, struct jset *j)
{
struct journal_replay *i, *pos;
struct list_head *where;
@@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
__le64 last_seq;
int ret;
- mutex_lock(&jlist->lock);
-
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
@@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
-
- ret = JOURNAL_ENTRY_ADD_OK;
- goto out;
+ goto found;
}
if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@@ -395,12 +391,16 @@ add:
goto out;
}
- memcpy(&i->j, j, bytes);
list_add(&i->list, where);
+ i->devs.nr = 0;
+ memcpy(&i->j, j, bytes);
+found:
+ if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
+ c, "duplicate journal entries on same device"))
+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
- mutex_unlock(&jlist->lock);
return ret;
}
@@ -496,8 +496,8 @@ fsck_err:
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
-static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
- int write)
+static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
+ int write)
{
struct jset_entry *entry;
int ret = 0;
@@ -508,7 +508,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(j), c,
"journal entry extends past end of jset")) {
- j->u64s = cpu_to_le64((u64 *) entry - j->_data);
+ j->u64s = cpu_to_le32((u64 *) entry - j->_data);
break;
}
@@ -614,7 +614,7 @@ static int journal_entry_validate(struct bch_fs *c,
"invalid journal entry: last_seq > seq"))
j->last_seq = j->seq;
- return __journal_entry_validate(c, j, write);
+ return 0;
fsck_err:
return ret;
}
@@ -722,7 +722,10 @@ reread: sectors_read = min_t(unsigned,
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- ret = journal_entry_add(c, jlist, j);
+ mutex_lock(&jlist->lock);
+ ret = journal_entry_add(c, ca, jlist, j);
+ mutex_unlock(&jlist->lock);
+
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
@@ -916,7 +919,9 @@ static int journal_seq_blacklist_read(struct journal *j,
for_each_jset_entry_type(entry, &i->j,
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- seq = le64_to_cpu(entry->_data[0]);
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+ seq = le64_to_cpu(bl_entry->seq);
bch_verbose(c, "blacklisting existing journal seq %llu", seq);
@@ -982,6 +987,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
+ list_for_each_entry(i, list, list) {
+ ret = journal_entry_validate_entries(c, &i->j, READ);
+ if (ret)
+ goto fsck_err;
+ }
+
i = list_last_entry(list, struct journal_replay, list);
unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
@@ -1002,6 +1013,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
+ p->devs.nr = 0;
}
mutex_lock(&j->blacklist_lock);
@@ -1010,6 +1022,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
+ p->devs = i->devs;
if (journal_seq_blacklist_read(j, i, p)) {
mutex_unlock(&j->blacklist_lock);
@@ -1090,7 +1103,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{
struct journal_buf *w = journal_prev_buf(j);
- atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
+ atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
@@ -1122,6 +1135,7 @@ static void __journal_entry_new(struct journal *j, int count)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
+ p->devs.nr = 0;
}
static void __bch2_journal_next_entry(struct journal *j)
@@ -1851,6 +1865,21 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
bch2_journal_error(j));
}
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool flush;
+
+ bch2_journal_flush_pins(j, U64_MAX);
+
+ spin_lock(&j->lock);
+ flush = last_seq(j) != j->last_seq_ondisk ||
+ c->btree_roots_dirty;
+ spin_unlock(&j->lock);
+
+ return flush ? bch2_journal_meta(j) : 0;
+}
+
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@@ -2002,7 +2031,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* i.e. whichever device was limiting the current journal entry size.
*/
extent_for_each_ptr_backwards(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors)
@@ -2197,7 +2226,7 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
- if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) {
@@ -2233,6 +2262,7 @@ static void journal_write(struct closure *cl)
if (r->alive)
bch2_journal_add_btree_root(w, i, &r->key, r->level);
}
+ c->btree_roots_dirty = false;
mutex_unlock(&c->btree_root_lock);
journal_write_compact(jset);
@@ -2246,7 +2276,7 @@ static void journal_write(struct closure *cl)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- __journal_entry_validate(c, jset, WRITE))
+ journal_entry_validate_entries(c, jset, WRITE))
goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@@ -2257,7 +2287,7 @@ static void journal_write(struct closure *cl)
journal_nonce(jset), jset);
if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- __journal_entry_validate(c, jset, WRITE))
+ journal_entry_validate_entries(c, jset, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
@@ -2277,6 +2307,9 @@ static void journal_write(struct closure *cl)
BCH_DATA_JOURNAL))
goto err;
+ journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
+ bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
+
/*
* XXX: we really should just disable the entire journal in nochanges
* mode
@@ -2285,7 +2318,7 @@ static void journal_write(struct closure *cl)
goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@@ -2693,6 +2726,46 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
+int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct bch_devs_list devs;
+ u64 seq = 0;
+ unsigned iter;
+ int ret = 0;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (bch2_dev_list_has_dev(p->devs, dev_idx))
+ seq = journal_pin_seq(j, p);
+ spin_unlock(&j->lock);
+
+ bch2_journal_flush_pins(j, seq);
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+ seq = 0;
+
+ spin_lock(&j->lock);
+ while (!ret && seq < atomic64_read(&j->seq)) {
+ seq = max(seq, last_seq(j));
+ devs = journal_seq_pin(j, seq)->devs;
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
+
ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -2862,9 +2935,7 @@ void bch2_fs_journal_stop(struct journal *j)
* journal entries, then force a brand new empty journal entry to be
* written:
*/
- bch2_journal_flush_pins(j, U64_MAX);
- bch2_journal_flush_async(j, NULL);
- bch2_journal_meta(j);
+ bch2_journal_flush_all_pins(j);
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);
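
journal_entry_add() above now records, per journal entry, which devices the
entry was found on. bch2_dev_list_has_dev() and bch2_dev_list_add_dev() are not
part of this diff; assuming bch_devs_list is the small fixed array of device
indices that the devs.nr resets and devs->devs[i] accesses in this patch imply,
they behave roughly like:

	/* sketch only -- the real helpers live elsewhere in the tree: */
	static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
						 unsigned dev)
	{
		unsigned i;

		for (i = 0; i < devs.nr; i++)
			if (devs.devs[i] == dev)
				return true;
		return false;
	}

	static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
						 unsigned dev)
	{
		devs->devs[devs->nr++] = dev;
	}

The same device lists are copied onto journal pins (p->devs above), which is
what lets bch2_journal_flush_device() find the entries still pinned to a given
device.
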
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 9d6c79c6..5f3ece08 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -118,6 +118,8 @@
*/
struct journal_replay {
struct list_head list;
+ struct bch_devs_list devs;
+ /* must be last: */
struct jset j;
};
@@ -164,6 +166,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_all_pins(struct journal *);
struct closure;
struct bch_fs;
@@ -356,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
+int bch2_journal_flush_device(struct journal *, unsigned);
void bch2_journal_halt(struct journal *);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 55b41c56..87f378a6 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -34,6 +34,7 @@ struct journal_entry_pin_list {
struct list_head list;
struct list_head flushed;
atomic_t count;
+ struct bch_devs_list devs;
};
struct journal;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 8d1c0ee0..e11ee953 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
#define MAX_DATA_OFF_ITER 10
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in the same device that already have a
- * copy (if there are enough devices).
- * This is _not_ currently implemented. The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch2_move_data_off_device(struct bch_dev *ca)
+static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
@@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
return ret;
}
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
@@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
* is written.
*/
-int bch2_move_metadata_off_device(struct bch_dev *ca)
+static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
unsigned i;
int ret = 0;
@@ -240,37 +222,31 @@ err:
return ret;
}
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
+int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+ return bch2_dev_usrdata_migrate(c, ca, flags) ?:
+ bch2_dev_metadata_migrate(c, ca, flags);
+}
-static int bch2_flag_key_bad(struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c_extent orig)
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+ unsigned dev_idx, int flags, bool metadata)
{
- BKEY_PADDED(key) tmp;
- struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct bch_fs *c = ca->fs;
-
- bkey_reassemble(&tmp.key, orig.s_c);
- e = bkey_i_to_s_extent(&tmp.key);
+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+ unsigned nr_good;
extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == ca->dev_idx)
+ if (ptr->dev == dev_idx)
bch2_extent_drop_ptr(e, ptr);
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, e.s);
+ nr_good = bch2_extent_nr_good_ptrs(c, e.c);
+ if ((!nr_good && !(flags & lost)) ||
+ (nr_good < replicas && !(flags & degraded)))
+ return -EINVAL;
- return bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &tmp.key));
+ return 0;
}
/*
@@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
-int bch2_flag_data_bad(struct bch_dev *ca)
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- struct bch_fs *c = ca->fs;
struct bkey_s_c k;
- struct bkey_s_c_extent e;
+ struct bkey_s_extent e;
+ BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
@@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (!bkey_extent_is_data(k.k))
goto advance;
- e = bkey_s_c_to_extent(k);
- if (!bch2_extent_has_device(e, ca->dev_idx))
+ if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
goto advance;
- ret = bch2_flag_key_bad(&iter, ca, e);
+ bkey_reassemble(&tmp.key, k);
+ e = bkey_i_to_s_extent(&tmp.key);
+
+ ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+ if (ret)
+ break;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, e.s);
+
+ if (bkey_extent_is_data(e.k) &&
+ (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+ break;
+
+ iter.pos = bkey_start_pos(&tmp.key.k);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
@@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (ret)
break;
- /*
- * If the replica we're dropping was dirty and there is an
- * additional cached replica, the cached replica will now be
- * considered dirty - upon inserting the new version of the key,
- * the bucket accounting will be updated to reflect the fact
- * that the cached data is now dirty and everything works out as
- * if by magic without us having to do anything.
- *
- * The one thing we need to be concerned with here is there's a
- * race between when we drop any stale pointers from the key
- * we're about to insert, and when the key actually gets
- * inserted and the cached data is marked as dirty - we could
- * end up trying to insert a key with a pointer that should be
- * dirty, but points to stale data.
- *
- * If that happens the insert code just bails out and doesn't do
- * the insert - however, it doesn't return an error. Hence we
- * need to always recheck the current key before advancing to
- * the next:
- */
continue;
advance:
if (bkey_extent_is_data(k.k)) {
@@ -357,3 +335,80 @@ advance:
return ret;
}
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_iter iter;
+ struct closure cl;
+ struct btree *b;
+ unsigned id;
+ int ret;
+
+ /* don't handle this yet: */
+ if (flags & BCH_FORCE_IF_METADATA_LOST)
+ return -EINVAL;
+
+ closure_init_stack(&cl);
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_i_extent *new_key;
+retry:
+ if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+ dev_idx)) {
+ bch2_btree_iter_set_locks_want(&iter, 0);
+
+ ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+ BCH_DATA_BTREE);
+ if (ret)
+ goto err;
+ } else {
+ bkey_copy(&tmp.k, &b->key);
+ new_key = bkey_i_to_extent(&tmp.k);
+
+ ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+ dev_idx, flags, true);
+ if (ret)
+ goto err;
+
+ if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+ if (ret)
+ goto err;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ /* btree root */
+ mutex_lock(&c->btree_root_lock);
+ mutex_unlock(&c->btree_root_lock);
+ }
+
+ ret = 0;
+out:
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+err:
+ bch2_btree_iter_unlock(&iter);
+ goto out;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, dev_idx, flags);
+}
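
To make the force-flag logic in drop_dev_ptrs() concrete: with data_replicas = 2,
dropping the departing device's pointer from an extent that had two dirty copies
leaves nr_good = 1, which is refused unless BCH_FORCE_IF_DATA_DEGRADED was
passed; if that was the only copy, nr_good = 0 and BCH_FORCE_IF_DATA_LOST is
required. An illustrative restatement of the check, not part of the patch:

	static int may_drop_ptr(unsigned nr_good, unsigned replicas, int flags,
				unsigned lost, unsigned degraded)
	{
		if (!nr_good && !(flags & lost))
			return -EINVAL;		/* would lose the last copy */
		if (nr_good < replicas && !(flags & degraded))
			return -EINVAL;		/* would leave data degraded */
		return 0;
	}
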
diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h
index 9bdaa792..6db7b911 100644
--- a/libbcachefs/migrate.h
+++ b/libbcachefs/migrate.h
@@ -1,8 +1,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_move_data_off_device(struct bch_dev *);
-int bch2_move_metadata_off_device(struct bch_dev *);
-int bch2_flag_data_bad(struct bch_dev *);
+int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 5eaf0cf8..8ce63d66 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -3,6 +3,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
+#include "inode.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
@@ -206,7 +207,7 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- if (likely(!io->rbio.bio.bi_error)) {
+ if (likely(!io->rbio.bio.bi_status)) {
bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
@@ -240,6 +241,7 @@ static int bch2_move_extent(struct bch_fs *c,
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
+ struct bch_io_opts opts,
struct bkey_s_c k)
{
struct extent_pick_ptr pick;
@@ -276,6 +278,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
}
+ io->rbio.opts = opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -284,9 +287,13 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
- __bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device;
+
+ bch2_write_op_init(&io->write.op, c);
+ io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ io->write.op.compression_type =
+ bch2_compression_opt_to_type(opts.compression);
io->write.op.devs = devs;
io->write.op.write_point = wp;
@@ -371,9 +378,11 @@ int bch2_move_data(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt;
+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
+ u64 cur_inum = U64_MAX;
int ret = 0;
bch2_move_ctxt_init(&ctxt);
@@ -396,7 +405,7 @@ int bch2_move_data(struct bch_fs *c,
(bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
-
+peek:
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
@@ -404,8 +413,23 @@ int bch2_move_data(struct bch_fs *c,
if (ret)
break;
- if (!bkey_extent_is_data(k.k) ||
- !pred(arg, bkey_s_c_to_extent(k)))
+ if (!bkey_extent_is_data(k.k))
+ goto next;
+
+ if (cur_inum != k.k->p.inode) {
+ struct bch_inode_unpacked inode;
+
+ /* don't hold btree locks while looking up inode: */
+ bch2_btree_iter_unlock(&iter);
+
+ opts = bch2_opts_to_inode_opts(c->opts);
+ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+ cur_inum = k.k->p.inode;
+ goto peek;
+ }
+
+ if (!pred(arg, bkey_s_c_to_extent(k)))
goto next;
/* unlock before doing IO: */
@@ -415,7 +439,7 @@ int bch2_move_data(struct bch_fs *c,
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
- move_device, k)) {
+ move_device, opts, k)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index c9482151..28e40e41 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -76,16 +76,27 @@ void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
#undef BCH_OPT
}
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
- return opts->_name; \
-
+ return opt_defined(*opts, _name);
BCH_OPTS()
#undef BCH_OPT
+ default:
+ BUG();
+ }
+}
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+ switch (id) {
+#define BCH_OPT(_name, ...) \
+ case Opt_##_name: \
+ return opts->_name;
+ BCH_OPTS()
+#undef BCH_OPT
default:
BUG();
}
@@ -98,10 +109,8 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
-
BCH_OPTS()
#undef BCH_OPT
-
default:
BUG();
}
@@ -118,7 +127,6 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
if (_sb_opt != NO_SB_OPT) \
opt_set(opts, _name, _sb_opt(sb));
-
BCH_OPTS()
#undef BCH_OPT
@@ -145,7 +153,7 @@ const struct bch_option bch2_opt_table[] = {
#undef BCH_OPT
};
-static int bch2_opt_lookup(const char *name)
+int bch2_opt_lookup(const char *name)
{
const struct bch_option *i;
@@ -247,3 +255,52 @@ no_val:
pr_err("Mount option %s requires a value", name);
return -1;
}
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+ struct bch_io_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(ret, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+{
+ struct bch_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(ret, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+{
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(*dst, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+}
+
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+ static const enum bch_opt_id inode_opt_list[] = {
+#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ };
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+ if (inode_opt_list[i] == id)
+ return true;
+
+ return false;
+}
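
The BCH_INODE_OPT x-macro keeps these helpers in sync with the option list in
opts.h; with the two entries currently defined (data_checksum and compression),
bch2_opts_to_inode_opts() expands to the equivalent of:

	struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
	{
		struct bch_io_opts ret = { 0 };

		if (opt_defined(src, data_checksum))
			opt_set(ret, data_checksum, src.data_checksum);
		if (opt_defined(src, compression))
			opt_set(ret, compression, src.compression);
		return ret;
	}
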
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 33e3a2c8..126056e6 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -181,10 +181,7 @@ do { \
static inline struct bch_opts bch2_opts_empty(void)
{
- struct bch_opts opts;
-
- memset(&opts, 0, sizeof(opts));
- return opts;
+ return (struct bch_opts) { 0 };
}
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
@@ -215,12 +212,35 @@ struct bch_option {
extern const struct bch_option bch2_opt_table[];
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+int bch2_opt_lookup(const char *);
int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
+/* inode opts: */
+
+#define BCH_INODE_OPTS() \
+ BCH_INODE_OPT(data_checksum, 8) \
+ BCH_INODE_OPT(compression, 8)
+
+struct bch_io_opts {
+#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+
+#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
#endif /* _BCACHEFS_OPTS_H */
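
For reference, with the two BCH_INODE_OPTS entries above, struct bch_io_opts
expands to the equivalent of:

	struct bch_io_opts {
		unsigned	data_checksum_defined:1;
		unsigned	compression_defined:1;

		u8		data_checksum;
		u8		compression;
	};
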
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index a3ecfb92..3f55c244 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -12,6 +12,8 @@
#include <linux/sort.h>
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+ struct bch_replicas_cpu *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);
static inline void __bch2_sb_layout_size_assert(void)
@@ -157,7 +159,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
return NULL;
f = __bch2_sb_field_resize(sb->sb, f, u64s);
- f->type = type;
+ f->type = cpu_to_le32(type);
return f;
}
@@ -188,7 +190,7 @@ struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
}
f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
- f->type = type;
+ f->type = cpu_to_le32(type);
return f;
}
@@ -354,7 +356,16 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
+ return "Invalid number of data replicas";
+
+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+ return "Invalid metadata checksum type";
+
+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+ return "Invalid metadata checksum type";
+
+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
+ return "Invalid compression type";
if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set";
@@ -507,7 +518,7 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
if (src_f->type == BCH_SB_FIELD_journal)
continue;
- dst_f = bch2_sb_field_get(dst, src_f->type);
+ dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
dst_f = __bch2_sb_field_resize(dst, dst_f,
le32_to_cpu(src_f->u64s));
@@ -601,7 +612,7 @@ reread:
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- (struct nonce) { 0 }, sb->sb);
+ null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum))
return "bad checksum reading superblock";
@@ -688,9 +699,9 @@ const char *bch2_read_super(const char *path,
got_super:
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
le64_to_cpu(ret->sb->version),
- le64_to_cpu(ret->sb->flags),
+ le64_to_cpu(ret->sb->flags[0]),
le64_to_cpu(ret->sb->seq),
- le16_to_cpu(ret->sb->u64s));
+ le32_to_cpu(ret->sb->u64s));
err = "Superblock block size smaller than device block size";
if (le16_to_cpu(ret->sb->block_size) << 9 <
@@ -711,7 +722,7 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
@@ -727,7 +738,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
- (struct nonce) { 0 }, sb);
+ null_nonce(), sb);
bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev;
@@ -830,7 +841,12 @@ out:
bch2_sb_update(c);
}
-/* replica information: */
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+ _i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
@@ -838,6 +854,11 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
return (void *) r->entries + r->entry_size * i;
}
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
@@ -856,6 +877,246 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
+static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+ enum bch_data_type data_type,
+ struct bch_replicas_cpu_entry *r,
+ unsigned *max_dev)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr = 0;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_SB ||
+ data_type >= BCH_DATA_NR);
+
+ memset(r, 0, sizeof(*r));
+ r->data_type = data_type;
+
+ *max_dev = 0;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached) {
+ *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+ replicas_set_dev(r, ptr->dev);
+ nr++;
+ }
+ return nr;
+}
+
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *new;
+ unsigned i, nr, entry_size;
+
+ entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+ DIV_ROUND_UP(max_dev + 1, 8);
+ entry_size = max(entry_size, old->entry_size);
+ nr = old->nr + 1;
+
+ new = kzalloc(sizeof(struct bch_replicas_cpu) +
+ nr * entry_size, GFP_NOIO);
+ if (!new)
+ return NULL;
+
+ new->nr = nr;
+ new->entry_size = entry_size;
+
+ for (i = 0; i < old->nr; i++)
+ memcpy(cpu_replicas_entry(new, i),
+ cpu_replicas_entry(old, i),
+ min(new->entry_size, old->entry_size));
+
+ memcpy(cpu_replicas_entry(new, old->nr),
+ &new_entry,
+ new->entry_size);
+
+ bch2_cpu_replicas_sort(new);
+ return new;
+}
+
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
+{
+ return max_dev < replicas_dev_slots(r) &&
+ eytzinger0_find(r->entries, r->nr,
+ r->entry_size,
+ memcmp, &search) < r->nr;
+}
+
+noinline
+static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
+ int ret = -ENOMEM;
+
+ mutex_lock(&c->sb_lock);
+
+ old_gc = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+ if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+ new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+ if (!new_gc)
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+ /* recheck, might have raced */
+ if (replicas_has_entry(old_r, new_entry, max_dev))
+ goto out;
+
+ new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+ if (!new_r)
+ goto err;
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+ if (ret)
+ goto err;
+
+ if (new_gc) {
+ rcu_assign_pointer(c->replicas_gc, new_gc);
+ kfree_rcu(old_gc, rcu);
+ }
+
+ rcu_assign_pointer(c->replicas, new_r);
+ kfree_rcu(old_r, rcu);
+
+ bch2_write_super(c);
+out:
+ ret = 0;
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+static inline int __bch2_check_mark_super(struct bch_fs *c,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *r, *gc_r;
+ bool marked;
+
+ rcu_read_lock();
+ r = rcu_dereference(c->replicas);
+ gc_r = rcu_dereference(c->replicas_gc);
+ marked = replicas_has_entry(r, search, max_dev) &&
+ (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+ rcu_read_unlock();
+
+ return likely(marked) ? 0
+ : bch2_check_mark_super_slowpath(c, search, max_dev);
+}
+
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+ return 0;
+
+ return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_check_mark_super_devlist(struct bch_fs *c,
+ struct bch_devs_list *devs,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search = { .data_type = data_type };
+ unsigned i, max_dev = 0;
+
+ if (!devs->nr)
+ return 0;
+
+ for (i = 0; i < devs->nr; i++) {
+ max_dev = max_t(unsigned, max_dev, devs->devs[i]);
+ replicas_set_dev(&search, devs->devs[i]);
+ }
+
+ return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+ struct bch_replicas_cpu *new_r, *old_r;
+ int ret = 0;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+
+ new_r = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+
+ if (err) {
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(new_r, rcu);
+ goto err;
+ }
+
+ if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ rcu_assign_pointer(c->replicas, new_r);
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(old_r, rcu);
+
+ bch2_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+ struct bch_replicas_cpu *dst, *src;
+ struct bch_replicas_cpu_entry *e;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ BUG_ON(c->replicas_gc);
+
+ src = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+ src->nr * src->entry_size, GFP_NOIO);
+ if (!dst) {
+ mutex_unlock(&c->sb_lock);
+ return -ENOMEM;
+ }
+
+ dst->nr = 0;
+ dst->entry_size = src->entry_size;
+
+ for_each_cpu_replicas_entry(src, e)
+ if (!((1 << e->data_type) & typemask))
+ memcpy(cpu_replicas_entry(dst, dst->nr++),
+ e, dst->entry_size);
+
+ bch2_cpu_replicas_sort(dst);
+
+ rcu_assign_pointer(c->replicas_gc, dst);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+/* Replicas tracking - superblock: */
+
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
@@ -914,10 +1175,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
}
}
- eytzinger0_sort(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
+ bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
@@ -926,14 +1184,12 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
- lockdep_assert_held(&c->sb_lock);
-
sb_r = bch2_sb_get_replicas(c->disk_sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
- old_r = c->replicas;
+ old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
@@ -941,192 +1197,133 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
return 0;
}
-static void bkey_to_replicas(struct bkey_s_c_extent e,
- enum bch_data_type data_type,
- struct bch_replicas_cpu_entry *r,
- unsigned *max_dev)
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
{
- const struct bch_extent_ptr *ptr;
-
- BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
- data_type >= BCH_DATA_NR);
-
- memset(r, 0, sizeof(*r));
- r->data_type = data_type;
-
- *max_dev = 0;
-
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- *max_dev = max_t(unsigned, *max_dev, ptr->dev);
- replicas_set_dev(r, ptr->dev);
- }
-}
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_entry *sb_e;
+ struct bch_replicas_cpu_entry *e;
+ size_t i, bytes;
-/*
- * for when gc of replica information is in progress:
- */
-static int bch2_update_gc_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *gc_r,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry new_e;
- struct bch_replicas_cpu *new;
- unsigned i, nr, entry_size, max_dev;
+ bytes = sizeof(struct bch_sb_field_replicas);
- bkey_to_replicas(e, data_type, &new_e, &max_dev);
+ for_each_cpu_replicas_entry(r, e) {
+ bytes += sizeof(struct bch_replicas_entry);
+ for (i = 0; i < r->entry_size - 1; i++)
+ bytes += hweight8(e->devs[i]);
+ }
- entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
- DIV_ROUND_UP(max_dev + 1, 8);
- entry_size = max(entry_size, gc_r->entry_size);
- nr = gc_r->nr + 1;
+ sb_r = bch2_fs_sb_resize_replicas(c,
+ DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+ if (!sb_r)
+ return -ENOSPC;
- new = kzalloc(sizeof(struct bch_replicas_cpu) +
- nr * entry_size, GFP_NOIO);
- if (!new)
- return -ENOMEM;
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
- new->nr = nr;
- new->entry_size = entry_size;
+ sb_e = sb_r->entries;
+ for_each_cpu_replicas_entry(r, e) {
+ sb_e->data_type = e->data_type;
- for (i = 0; i < gc_r->nr; i++)
- memcpy(cpu_replicas_entry(new, i),
- cpu_replicas_entry(gc_r, i),
- gc_r->entry_size);
+ for (i = 0; i < replicas_dev_slots(r); i++)
+ if (replicas_test_dev(e, i))
+ sb_e->devs[sb_e->nr++] = i;
- memcpy(cpu_replicas_entry(new, nr - 1),
- &new_e,
- new->entry_size);
+ sb_e = replicas_entry_next(sb_e);
- eytzinger0_sort(new->entries,
- new->nr,
- new->entry_size,
- memcmp, NULL);
+ BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+ }
- rcu_assign_pointer(c->replicas_gc, new);
- kfree_rcu(gc_r, rcu);
return 0;
}
-static bool replicas_has_extent(struct bch_replicas_cpu *r,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
-
- bkey_to_replicas(e, data_type, &search, &max_dev);
-
- return max_dev < replicas_dev_slots(r) &&
- eytzinger0_find(r->entries, r->nr,
- r->entry_size,
- memcmp, &search) < r->nr;
-}
-
-bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- bool ret;
-
- rcu_read_lock();
- ret = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type);
- rcu_read_unlock();
-
- return ret;
-}
-
-noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
- struct bch_replicas_cpu *gc_r;
- const struct bch_extent_ptr *ptr;
+ struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *new_entry;
- unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
- int ret = 0;
+ struct bch_replicas_cpu *cpu_r = NULL;
+ struct bch_replicas_entry *e;
+ const char *err;
+ unsigned i;
- mutex_lock(&c->sb_lock);
+ mi = bch2_sb_get_members(sb);
+ sb_r = bch2_sb_get_replicas(sb);
+ if (!sb_r)
+ return NULL;
- gc_r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
- if (gc_r &&
- !replicas_has_extent(gc_r, e, data_type)) {
- ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
- if (ret)
+ for_each_replicas_entry(sb_r, e) {
+ err = "invalid replicas entry: invalid data type";
+ if (e->data_type >= BCH_DATA_NR)
goto err;
- }
-
- /* recheck, might have raced */
- if (bch2_sb_has_replicas(c, e, data_type)) {
- mutex_unlock(&c->sb_lock);
- return 0;
- }
- new_entry_bytes = sizeof(struct bch_replicas_entry) +
- bch2_extent_nr_dirty_ptrs(e.s_c);
-
- sb_r = bch2_sb_get_replicas(c->disk_sb);
+ err = "invalid replicas entry: no devices";
+ if (!e->nr)
+ goto err;
- bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+ err = "invalid replicas entry: too many devices";
+ if (e->nr >= BCH_REPLICAS_MAX)
+ goto err;
- new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
+ err = "invalid replicas entry: invalid device";
+ for (i = 0; i < e->nr; i++)
+ if (!bch2_dev_exists(sb, mi, e->devs[i]))
+ goto err;
+ }
- sb_r = bch2_fs_sb_resize_replicas(c,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
- sizeof(u64)));
- if (!sb_r) {
- ret = -ENOSPC;
+ err = "cannot allocate memory";
+ cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+ if (!cpu_r)
goto err;
- }
- new_entry = (void *) sb_r + bytes;
- new_entry->data_type = data_type;
- new_entry->nr = 0;
+ sort_cmp_size(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+
+ for (i = 0; i + 1 < cpu_r->nr; i++) {
+ struct bch_replicas_cpu_entry *l =
+ cpu_replicas_entry(cpu_r, i);
+ struct bch_replicas_cpu_entry *r =
+ cpu_replicas_entry(cpu_r, i + 1);
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached)
- new_entry->devs[new_entry->nr++] = ptr->dev;
+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
- ret = bch2_sb_replicas_to_cpu_replicas(c);
- if (ret) {
- memset(new_entry, 0,
- vstruct_end(&sb_r->field) - (void *) new_entry);
- goto err;
+ err = "duplicate replicas entry";
+ if (!memcmp(l, r, cpu_r->entry_size))
+ goto err;
}
- bch2_write_super(c);
+ err = NULL;
err:
- mutex_unlock(&c->sb_lock);
- return ret;
+ kfree(cpu_r);
+ return err;
}
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+/* Query replicas: */
+
+bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
- struct bch_replicas_cpu *gc_r;
- bool marked;
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+ bool ret;
+
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+ return true;
rcu_read_lock();
- marked = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type) &&
- (!(gc_r = rcu_dereference(c->replicas_gc)) ||
- replicas_has_extent(gc_r, e, data_type));
+ ret = replicas_has_entry(rcu_dereference(c->replicas),
+ search, max_dev);
rcu_read_unlock();
- if (marked)
- return 0;
-
- return bch2_check_mark_super_slowpath(c, e, data_type);
+ return ret;
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
- struct bch_devs_mask online_devs)
+ struct bch_devs_mask online_devs)
{
+ struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
@@ -1137,14 +1334,15 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
+ mi = bch2_sb_get_members(c->disk_sb);
rcu_read_lock();
- r = rcu_dereference(c->replicas);
- dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
- for (i = 0; i < r->nr; i++) {
- e = cpu_replicas_entry(r, i);
+ r = rcu_dereference(c->replicas);
+ dev_slots = replicas_dev_slots(r);
- BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
+ for_each_cpu_replicas_entry(r, e) {
+ if (e->data_type >= ARRAY_SIZE(ret.replicas))
+ panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
@@ -1152,6 +1350,8 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
if (!replicas_test_dev(e, dev))
continue;
+ BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
+
if (test_bit(dev, online_devs.d))
nr_online++;
else
@@ -1216,7 +1416,7 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
- unsigned i, ret = 0;
+ unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
@@ -1224,191 +1424,13 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
- for (i = 0; i < r->nr; i++) {
- e = cpu_replicas_entry(r, i);
-
+ for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx)) {
ret |= 1 << e->data_type;
break;
}
- }
out:
rcu_read_unlock();
return ret;
}
-
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
-{
- struct bch_sb_field_members *mi;
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *cpu_r = NULL;
- struct bch_replicas_entry *e;
- const char *err;
- unsigned i;
-
- mi = bch2_sb_get_members(sb);
- sb_r = bch2_sb_get_replicas(sb);
- if (!sb_r)
- return NULL;
-
- for_each_replicas_entry(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
-
- err = "invalid replicas entry: too many devices";
- if (e->nr >= BCH_REPLICAS_MAX)
- goto err;
-
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
- }
-
- err = "cannot allocate memory";
- cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
- if (!cpu_r)
- goto err;
-
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
-
- for (i = 0; i + 1 < cpu_r->nr; i++) {
- struct bch_replicas_cpu_entry *l =
- cpu_replicas_entry(cpu_r, i);
- struct bch_replicas_cpu_entry *r =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
-
- err = "duplicate replicas entry";
- if (!memcmp(l, r, cpu_r->entry_size))
- goto err;
- }
-
- err = NULL;
-err:
- kfree(cpu_r);
- return err;
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *r, *old_r;
- struct bch_replicas_entry *dst_e;
- size_t i, j, bytes, dev_slots;
- int ret = 0;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
-
- r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
-
- if (err) {
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(r, rcu);
- goto err;
- }
-
- dev_slots = replicas_dev_slots(r);
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for (i = 0; i < r->nr; i++) {
- struct bch_replicas_cpu_entry *e =
- cpu_replicas_entry(r, i);
-
- bytes += sizeof(struct bch_replicas_entry);
- for (j = 0; j < r->entry_size - 1; j++)
- bytes += hweight8(e->devs[j]);
- }
-
- sb_r = bch2_fs_sb_resize_replicas(c,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
- if (!sb_r) {
- ret = -ENOSPC;
- goto err;
- }
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst_e = sb_r->entries;
- for (i = 0; i < r->nr; i++) {
- struct bch_replicas_cpu_entry *src_e =
- cpu_replicas_entry(r, i);
-
- dst_e->data_type = src_e->data_type;
-
- for (j = 0; j < dev_slots; j++)
- if (replicas_test_dev(src_e, j))
- dst_e->devs[dst_e->nr++] = j;
-
- dst_e = replicas_entry_next(dst_e);
- }
-
- old_r = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->replicas, r);
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(old_r, rcu);
-
- bch2_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
- struct bch_replicas_cpu *r, *src;
- unsigned i;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc);
-
- src = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
-
- r = kzalloc(sizeof(struct bch_replicas_cpu) +
- src->nr * src->entry_size, GFP_NOIO);
- if (!r) {
- mutex_unlock(&c->sb_lock);
- return -ENOMEM;
- }
-
- r->entry_size = src->entry_size;
- r->nr = 0;
-
- for (i = 0; i < src->nr; i++) {
- struct bch_replicas_cpu_entry *dst_e =
- cpu_replicas_entry(r, r->nr);
- struct bch_replicas_cpu_entry *src_e =
- cpu_replicas_entry(src, i);
-
- if (!(src_e->data_type & typemask)) {
- memcpy(dst_e, src_e, r->entry_size);
- r->nr++;
- }
- }
-
- eytzinger0_sort(r->entries,
- r->nr,
- r->entry_size,
- memcmp, NULL);
-
- rcu_assign_pointer(c->replicas_gc, r);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
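
bch2_replicas_gc_start()/_end() are meant to be used by walking everything that
still references a given data type and re-marking it; entries of that type that
are not re-marked are dropped when gc ends. Both new callers in this patch
(bch2_journal_flush_device() and bch2_dev_metadata_drop()) follow the same
idiom, sketched here with c, devs and ret assumed in scope:

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);	/* type(s) being rebuilt */

	/* re-mark every replicas entry of that type still in use: */
	ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);

	bch2_replicas_gc_end(c, ret);	/* on success, swap in the pruned table */
	mutex_unlock(&c->replicas_gc_lock);
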
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 8cafb301..4096efb2 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -125,23 +125,12 @@ void bch2_write_super(struct bch_fs *);
/* replicas: */
-/* iterate over bch_sb_field_replicas: */
-
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
- return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
-
-#define for_each_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
+int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
+ enum bch_data_type);
struct replicas_status {
struct {
@@ -161,4 +150,17 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+ return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
#endif /* _BCACHEFS_SUPER_IO_H */
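
Each on-disk replicas entry iterated by for_each_replicas_entry() is a data
type, a device count, and that many device indices (see
bch2_cpu_replicas_to_sb_replicas() in super-io.c). For example, user data
replicated across devices 1 and 3 would be stored as an entry equivalent to:

	/* illustration of one struct bch_replicas_entry: */
	.data_type	= BCH_DATA_USER,
	.nr		= 2,
	.devs		= { 1, 3 },
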
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 4e8b0a51..60a2d83e 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
-int bch2_congested(struct bch_fs *c, int bdi_bits)
+int bch2_congested(void *data, int bdi_bits)
{
+ struct bch_fs *c = data;
struct backing_dev_info *bdi;
struct bch_dev *ca;
unsigned i;
@@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
return ret;
}
-static int bch2_congested_fn(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
-
- return bch2_congested(c, bdi_bits);
-}
-
/* Filesystem RO/RW: */
/*
@@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
- bch2_journal_flush_pins(&c->journal, U64_MAX);
+ bch2_journal_flush_all_pins(&c->journal);
if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c);
@@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- if (c->bdi.bdi_list.next)
- bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
@@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes);
- kfree(c->replicas);
+ kfree(rcu_dereference_protected(c->replicas, 1));
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
@@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
- bch2_dev_free(c->devs[i]);
+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
closure_debug_destroy(&c->cl);
kobject_put(&c->kobj);
@@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1,
- offsetof(struct btree_read_bio, bio)) ||
- bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct btree_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
@@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
- bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
@@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
- c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- c->bdi.congested_fn = bch2_congested_fn;
- c->bdi.congested_data = c;
-
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
@@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
continue;
err = "error reading btree root";
- if (bch2_btree_root_read(c, i, k, level))
- goto err;
+ if (bch2_btree_root_read(c, i, k, level)) {
+ if (i != BTREE_ID_ALLOC)
+ goto err;
+
+ mustfix_fsck_err(c, "error reading btree root");
+ }
}
err = "error reading allocation information";
@@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
closure_sync(&cl);
bch2_inode_init(c, &inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
@@ -877,6 +872,7 @@ out:
bch2_journal_entries_free(&journal);
return err;
err:
+fsck_err:
closure_sync(&cl);
switch (ret) {
@@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj);
}
-static void bch2_dev_io_ref_release(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
- complete(&ca->offline_complete);
-}
-
static void __bch2_dev_offline(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock);
+ if (percpu_ref_is_zero(&ca->io_ref))
+ return;
+
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->offline_complete);
+ reinit_completion(&ca->io_ref_completion);
percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->offline_complete);
+ wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
struct kobject *block =
@@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
bch2_dev_journal_exit(ca);
}
-static void bch2_dev_ref_release(struct percpu_ref *ref)
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
- complete(&ca->stop_complete);
+ complete(&ca->ref_completion);
}
-static void bch2_dev_stop(struct bch_dev *ca)
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
- struct bch_fs *c = ca->fs;
-
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-
- synchronize_rcu();
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
- reinit_completion(&ca->stop_complete);
- percpu_ref_kill(&ca->ref);
- wait_for_completion(&ca->stop_complete);
+ complete(&ca->io_ref_completion);
}
static int bch2_dev_sysfs_online(struct bch_dev *ca)
@@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return -ENOMEM;
kobject_init(&ca->kobj, &bch2_dev_ktype);
- init_completion(&ca->stop_complete);
- init_completion(&ca->offline_complete);
+ init_completion(&ca->ref_completion);
+ init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
@@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) ||
@@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -1180,8 +1163,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
struct bch_dev *ca;
int ret;
- lockdep_assert_held(&c->sb_lock);
-
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
@@ -1189,13 +1170,15 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
- ca = c->devs[sb->sb->dev_idx];
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
if (ca->disk_sb.bdev) {
bch_err(c, "already have device online in slot %u",
sb->sb->dev_idx);
return -EINVAL;
}
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
@@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
- bch2_mark_dev_superblock(c, ca, 0);
+ bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
{
struct replicas_status s;
struct bch_sb_field_members *mi;
+ struct bch_dev *ca;
unsigned i, flags = c->opts.degraded
? BCH_FORCE_IF_DEGRADED
: 0;
@@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb, mi, i) &&
- !bch2_dev_is_online(c->devs[i]) &&
- (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
- c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+ for (i = 0; i < c->disk_sb->nr_devices; i++) {
+ if (!bch2_dev_exists(c->disk_sb, mi, i))
+ continue;
+
+ ca = bch_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_RW ||
+ ca->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock);
return false;
}
+ }
mutex_unlock(&c->sb_lock);
}
@@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*
* flag_data_bad() does not check btree pointers
*/
- ret = bch2_flag_data_bad(ca);
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
- bch_err(ca, "Remove failed");
+ bch_err(ca, "Remove failed: error %i dropping data", ret);
+ goto err;
+ }
+
+ ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+ if (ret) {
+ bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
- bch_err(ca, "Remove failed, still has data (%x)", data);
+ char data_has_str[100];
+ bch2_scnprint_flag_list(data_has_str,
+ sizeof(data_has_str),
+ bch2_data_types,
+ data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ ret = -EBUSY;
goto err;
}
- bch2_journal_meta(&c->journal);
+ ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx + 1, 0),
+ ZERO_VERSION,
+ NULL, NULL, NULL);
+ if (ret) {
+ bch_err(ca, "Remove failed, error deleting alloc info");
+ goto err;
+ }
+
+ /*
+ * must flush all existing journal entries; they might have
+ * (overwritten) keys that point to the device we're removing:
+ */
+ ret = bch2_journal_flush_all_pins(&c->journal);
+ if (ret) {
+ bch_err(ca, "Remove failed, journal error");
+ goto err;
+ }
__bch2_dev_offline(ca);
- bch2_dev_stop(ca);
+
+ mutex_lock(&c->sb_lock);
+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+ mutex_unlock(&c->sb_lock);
+
+ percpu_ref_kill(&ca->ref);
+ wait_for_completion(&ca->ref_completion);
+
bch2_dev_free(ca);
/*
@@ -1542,7 +1568,7 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = "journal alloc failed";
if (bch2_dev_journal_alloc(ca))
@@ -1568,7 +1594,7 @@ err:
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
- struct bch_sb_handle sb = { 0 };
+ struct bch_sb_handle sb = { NULL };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
@@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
@@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL;
}
- __bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca);
mutex_unlock(&c->state_lock);
@@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
- int ret;
+ int ret = 0;
mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
- mutex_unlock(&c->state_lock);
-
- ret = bch2_move_data_off_device(ca);
+ ret = bch2_dev_data_migrate(c, ca, 0);
if (ret) {
bch_err(ca, "Error migrating data: %i", ret);
- return ret;
- }
-
- ret = bch2_move_metadata_off_device(ca);
- if (ret) {
- bch_err(ca, "Error migrating metadata: %i", ret);
- return ret;
+ goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
-
- return 0;
+err:
+ mutex_unlock(&c->state_lock);
+ return ret;
}
/* Filesystem open: */
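The super.c changes above replace the old stop_complete/offline_complete pair with one completion per percpu_ref (ref_completion for ca->ref, io_ref_completion for ca->io_ref), so teardown can drain the I/O references and the long-lived device reference independently. A minimal sketch of that kill-and-wait pattern, with hypothetical names (my_dev, my_dev_ref_complete, my_dev_stop) rather than the bcachefs structures:

/*
 * Sketch of the percpu_ref + completion teardown pattern used above.
 * Only percpu_ref_*(), the completion API and container_of() are real
 * kernel interfaces; the my_dev names are invented for illustration.
 */
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/completion.h>
#include <linux/gfp.h>

struct my_dev {
        struct percpu_ref       ref;
        struct completion       ref_completion;
};

/* release callback: runs once the last reference is dropped after kill */
static void my_dev_ref_complete(struct percpu_ref *ref)
{
        struct my_dev *d = container_of(ref, struct my_dev, ref);

        complete(&d->ref_completion);
}

static int my_dev_init(struct my_dev *d)
{
        init_completion(&d->ref_completion);
        return percpu_ref_init(&d->ref, my_dev_ref_complete, 0, GFP_KERNEL);
}

static void my_dev_stop(struct my_dev *d)
{
        percpu_ref_kill(&d->ref);                /* no new references */
        wait_for_completion(&d->ref_completion); /* existing ones drain */
        percpu_ref_exit(&d->ref);
}

bch2_dev_remove() above follows the same ordering: clear the published c->devs[] pointer under sb_lock, then percpu_ref_kill(&ca->ref) and wait_for_completion(&ca->ref_completion) before bch2_dev_free().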
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index eb1d2f3d..7ebe5981 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
}
}
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+ BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+ devs->devs[devs->nr++] = dev;
+}
+
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{
@@ -131,6 +139,26 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_protected(c->devs[idx],
+ lockdep_is_held(&c->sb_lock) ||
+ lockdep_is_held(&c->state_lock));
+}
+
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
@@ -146,7 +174,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(struct bch_fs *, int);
+int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
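The new bch_dev_bkey_exists()/bch_dev_locked() helpers above encode the two legal ways of dereferencing c->devs[] without rcu_read_lock(): an existing bkey pins the device (so the lockdep check condition is simply 1), or the caller holds sb_lock or state_lock, which rcu_dereference_protected() lets lockdep verify. A generic sketch of the same accessor split for an RCU-managed pointer array, with hypothetical names (my_table, my_item):

/*
 * Sketch only: my_table/my_item are invented; rcu_dereference_check(),
 * rcu_dereference_protected() and lockdep_is_held() are the real APIs.
 */
#include <linux/rcupdate.h>
#include <linux/mutex.h>

struct my_item;

struct my_table {
        struct mutex lock;
        struct my_item __rcu *slots[16];
};

/* caller holds something (e.g. an on-disk reference) that pins the slot */
static inline struct my_item *my_table_get_pinned(struct my_table *t, unsigned i)
{
        return rcu_dereference_check(t->slots[i], 1);
}

/* caller holds the table lock, which lockdep can verify */
static inline struct my_item *my_table_get_locked(struct my_table *t, unsigned i)
{
        return rcu_dereference_protected(t->slots[i],
                                         lockdep_is_held(&t->lock));
}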
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 35f1e561..3197a2e4 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -739,7 +739,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
c->open_buckets_wait.list.first ? "waiting" : "empty");
}
-const char * const bch2_rw[] = {
+static const char * const bch2_rw[] = {
"read",
"write",
NULL
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index 2e29f741..f5007864 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -6,7 +6,6 @@
#include "clock.h"
#include "extents.h"
#include "io.h"
-#include "keylist.h"
#include "move.h"
#include "super-io.h"
#include "tier.h"
@@ -28,7 +27,7 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
return false;
extent_for_each_ptr(e, ptr)
- if (c->devs[ptr->dev]->mi.tier >= tier->idx)
+ if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
replicas++;
return replicas < c->opts.data_replicas;
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index a251bf9c..6e97e831 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -34,8 +34,12 @@ struct closure;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-#define memcpy(_dst, _src, _len) \
+#define memcpy(dst, src, len) \
({ \
+ void *_dst = (dst); \
+ const void *_src = (src); \
+ size_t _len = (len); \
+ \
BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
(void *) (_dst) + (_len) <= (void *) (_src))); \
memcpy(_dst, _src, _len); \
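The util.h hunk above makes the overlap-asserting memcpy() wrapper evaluate dst, src and len exactly once, instead of re-expanding each argument inside both the BUG_ON() and the real memcpy(). A small userspace illustration of the hazard this avoids, with assert() standing in for BUG_ON() and a hypothetical memcpy_checked() macro:

/* Userspace illustration of single-evaluation macro arguments. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define memcpy_checked(dst, src, len)                                   \
({                                                                      \
        void *_dst = (dst);                                             \
        const void *_src = (src);                                       \
        size_t _len = (len);                                            \
                                                                        \
        /* destination and source ranges must not overlap */           \
        assert((const char *) _dst >= (const char *) _src + _len ||     \
               (const char *) _dst + _len <= (const char *) _src);      \
        memcpy(_dst, _src, _len);                                       \
})

int main(void)
{
        char buf[2][8] = { "source" };
        int i = 0;

        /* buf[i++] is evaluated once, not once per mention in the macro */
        memcpy_checked(buf[1], buf[i++], sizeof(buf[0]));
        printf("%s %d\n", buf[1], i);   /* prints "source 1" */
        return 0;
}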
diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h
index ce2cece0..79566442 100644
--- a/libbcachefs/vstructs.h
+++ b/libbcachefs/vstructs.h
@@ -9,10 +9,10 @@
*/
#define __vstruct_u64s(_s) \
({ \
- ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \
- : ((_s)->u64s)); \
+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
+ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
+ : ((__force u8) ((_s)->u64s))); \
})
#define __vstruct_bytes(_type, _u64s) \
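__vstruct_u64s() above reads a structure's u64s field whatever its declared width (__le16, __le32, __le64 or plain u8) and applies the matching le*_to_cpu() conversion; the __force casts added here only silence sparse's bitwise-type warnings and change no behaviour. type_is() is presumably built on __builtin_types_compatible_p(); below is a portable userspace sketch of the same "dispatch on the field's declared type" idea, using C11 _Generic and glibc's le*toh() helpers (on_disk_hdr is an invented example type):

/* Userspace sketch of type-directed endian conversion, not bcachefs code. */
#define _DEFAULT_SOURCE         /* for le16toh() and friends on glibc */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define u64s_to_cpu(_v)                         \
        _Generic((_v),                          \
                 uint64_t: le64toh(_v),         \
                 uint32_t: le32toh(_v),         \
                 uint16_t: le16toh(_v))

struct on_disk_hdr {
        uint16_t u64s;  /* payload length in 64-bit words, little endian */
};

int main(void)
{
        struct on_disk_hdr h = { .u64s = htole16(5) };

        printf("payload bytes: %zu\n",
               (size_t) u64s_to_cpu(h.u64s) * sizeof(uint64_t));
        return 0;
}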
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 3a49d728..1d6cbe72 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "str_hash.h"
@@ -358,25 +359,139 @@ static const struct xattr_handler bch_xattr_security_handler = {
.flags = BCH_XATTR_INDEX_SECURITY,
};
-static const struct xattr_handler *bch_xattr_handler_map[] = {
- [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &posix_acl_access_xattr_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &posix_acl_default_xattr_handler,
- [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_opts opts =
+ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+ const struct bch_option *opt;
+ int ret, id;
+ u64 v;
+
+ id = bch2_opt_lookup(name);
+ if (id < 0 || !bch2_opt_is_inode_opt(id))
+ return -EINVAL;
+
+ opt = bch2_opt_table + id;
+
+ if (!bch2_opt_defined_by_id(&opts, id))
+ return -ENODATA;
+
+ v = bch2_opt_get_by_id(&opts, id);
+
+ if (opt->type == BCH_OPT_STR)
+ ret = snprintf(buffer, size, "%s", opt->choices[v]);
+ else
+ ret = snprintf(buffer, size, "%llu", v);
+
+ return ret <= size || !buffer ? ret : -ERANGE;
+}
+
+struct inode_opt_set {
+ int id;
+ u64 v;
+ bool defined;
};
+static int inode_opt_set_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct inode_opt_set *s = p;
+
+ if (s->defined)
+ bch2_inode_opt_set(bi, s->id, s->v);
+ else
+ bch2_inode_opt_clear(bi, s->id);
+ return 0;
+}
+
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ const struct bch_option *opt;
+ char *buf;
+ struct inode_opt_set s;
+ int ret;
+
+ s.id = bch2_opt_lookup(name);
+ if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
+ return -EINVAL;
+
+ opt = bch2_opt_table + s.id;
+
+ if (value) {
+ buf = kmalloc(size + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memcpy(buf, value, size);
+ buf[size] = '\0';
+
+ ret = bch2_opt_parse(opt, buf, &s.v);
+ kfree(buf);
+
+ if (ret < 0)
+ return ret;
+
+ if (s.id == Opt_compression) {
+ mutex_lock(&c->sb_lock);
+ ret = bch2_check_set_has_compressed_data(c, s.v);
+ mutex_unlock(&c->sb_lock);
+
+ if (ret)
+ return ret;
+ }
+
+ s.defined = true;
+ } else {
+ s.defined = false;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
+ mutex_unlock(&inode->ei_update_lock);
+
+ return ret;
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+ .prefix = "bcachefs.",
+ .get = bch2_xattr_bcachefs_get,
+ .set = bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+ &bch_xattr_bcachefs_handler,
+#endif
NULL
};
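The bch_xattr_bcachefs_handler added above exposes per-inode IO options (the bch2_opt_is_inode_opt() subset, e.g. Opt_compression) through extended attributes under a "bcachefs." prefix: the name after the prefix is resolved with bch2_opt_lookup(), the value is parsed with bch2_opt_parse(), and the result is written back via __bch2_write_inode(). A hedged userspace sketch of driving that interface with the standard xattr syscalls; the exact attribute name ("bcachefs.compression") and value ("lz4") are assumptions inferred from the prefix and option table, not taken from this diff:

/*
 * Set and read back a per-inode option through the "bcachefs." xattr
 * namespace.  Attribute name and value are assumed; see above.
 */
#include <sys/types.h>
#include <sys/xattr.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : ".";
        char buf[64];
        ssize_t len;

        if (setxattr(path, "bcachefs.compression", "lz4", strlen("lz4"), 0))
                perror("setxattr");

        len = getxattr(path, "bcachefs.compression", buf, sizeof(buf) - 1);
        if (len < 0) {
                perror("getxattr");
                return 1;
        }

        buf[len] = '\0';
        printf("compression = %s\n", buf);
        return 0;
}

Removing the attribute (removexattr(2)) reaches the handler's set() with a NULL value, which inode_opt_set_fn() above turns into bch2_inode_opt_clear(), so the inode falls back to the filesystem-wide default.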
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &posix_acl_access_xattr_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
+ [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
{
return type < ARRAY_SIZE(bch_xattr_handler_map)