39 files changed, 1602 insertions, 303 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index 4201504158a1..227bef531c16 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -800,20 +800,21 @@ void submit_bio_noacct(struct bio *bio) goto end_io; } + if (WARN_ON_ONCE((bio->bi_opf & REQ_PREFLUSH) && + bio_op(bio) != REQ_OP_WRITE && + bio_op(bio) != REQ_OP_ZONE_APPEND)) + goto end_io; + /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ - if (op_is_flush(bio->bi_opf)) { - if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && - bio_op(bio) != REQ_OP_ZONE_APPEND)) + if (op_is_flush(bio->bi_opf) && + !bdev_write_cache(bdev)) { + bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); + if (!bio_sectors(bio)) { + status = BLK_STS_OK; goto end_io; - if (!bdev_write_cache(bdev)) { - bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!bio_sectors(bio)) { - status = BLK_STS_OK; - goto end_io; - } } } diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 8b846c09350b..5455412b2b75 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -3,7 +3,6 @@ config BCACHEFS_FS tristate "bcachefs filesystem support (EXPERIMENTAL)" depends on BLOCK select EXPORTFS - select CLOSURES select CRC32 select CRC64 select FS_POSIX_ACL diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index a4258615dffa..1e87eee962ec 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -98,7 +98,8 @@ bcachefs-y := \ two_state_shared_lock.o \ util.o \ varint.o \ - xattr.o + xattr.o \ + vendor/closure.o obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 83d6ab9c1a91..3ccca855f05e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -196,7 +196,6 @@ #include <linux/backing-dev-defs.h> #include <linux/bug.h> #include <linux/bio.h> -#include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/math64.h> @@ -217,6 +216,7 @@ #include "bcachefs_format.h" #include "btree_journal_iter_types.h" +#include "closure.h" #include "disk_accounting_types.h" #include "errcode.h" #include "fast_list.h" diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d29bd684b137..090f11e122ad 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -707,7 +707,8 @@ struct bch_sb_field_ext { x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \ x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \ x(31bit_dirent_offset, BCH_VERSION(1, 30)) \ - x(btree_node_accounting, BCH_VERSION(1, 31)) + x(btree_node_accounting, BCH_VERSION(1, 31)) \ + x(rebalance_v2, BCH_VERSION(1, 32)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 59638d09e1fd..3b1d694dcb3a 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -15,6 +15,7 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#include <linux/seq_buf.h> #include <linux/swap.h> const char * const bch2_btree_node_flags[] = { @@ -565,6 +566,19 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, return btree_cache_can_free(list); } +static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + 
bch2_btree_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -659,6 +673,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[0].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 2; shrink->private_data = &bc->live[0]; shrinker_register(shrink); @@ -669,6 +684,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[1].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 8; shrink->private_data = &bc->live[1]; shrinker_register(shrink); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4890cbc88e7c..e3336ab27ccc 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -13,6 +13,7 @@ #include "trace.h" #include <linux/sched/mm.h> +#include <linux/seq_buf.h> static inline bool btree_uses_pcpu_readers(enum btree_id id) { @@ -808,6 +809,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { } +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct bch_fs *c = shrink->private_data; + struct btree_key_cache *bc = &c->btree_key_cache; + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); @@ -832,6 +845,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->shrink = shrink; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->to_text = bch2_btree_key_cache_shrinker_to_text; shrink->batch = 1 << 14; shrink->seeks = 0; shrink->private_data = c; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 10bfadcde80a..362846d5bb87 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, return bch2_csum_opt_to_type(opts.data_checksum, true); } +static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c, + struct bch_extent_rebalance opts) +{ + if (c->sb.encryption_type) + return c->opts.wide_macs + ? 
BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; + + return bch2_csum_opt_to_type(opts.data_checksum, true); +} + static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) diff --git a/fs/bcachefs/closure.h b/fs/bcachefs/closure.h new file mode 100644 index 000000000000..d8d4c7093ce0 --- /dev/null +++ b/fs/bcachefs/closure.h @@ -0,0 +1,5 @@ +#include "vendor/closure.h" + +#define closure_wait bch2_closure_wait +#define closure_return_sync bch2_closure_return_sync +#define __closure_wake_up __bch2_closure_wake_up diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 155c1ad42fc1..6333af6adbae 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -208,28 +208,6 @@ static void trace_data_update2(struct data_update *m, } noinline_for_stack -static void trace_io_move_created_rebalance2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - CLASS(printbuf, buf)(); - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_io_move_created_rebalance(c, buf.buf); - - count_event(c, io_move_created_rebalance); -} - -noinline_for_stack static int data_update_invalid_bkey(struct data_update *m, struct bkey_s_c old, struct bkey_s_c k, struct bkey_i *insert) @@ -438,7 +416,7 @@ restart_drop_extra_replicas: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?: - bch2_bkey_set_needs_rebalance(c, &opts, insert, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert, SET_NEEDS_REBALANCE_foreground, m->op.opts.change_cookie) ?: bch2_trans_update(trans, &iter, insert, @@ -449,10 +427,6 @@ restart_drop_extra_replicas: if (trace_data_update_enabled()) trace_data_update2(m, old, k, insert); - if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > - bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) - trace_io_move_created_rebalance2(m, old, k, insert); - ret = bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a99f821c6a1c..9da26e11446b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -282,6 +282,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_str(out, "btree="); bch2_btree_id_to_text(out, k->btree.id); break; + case BCH_DISK_ACCOUNTING_rebalance_work_v2: + bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type); + break; } } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 730a17ea4243..0b61d6100180 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -110,7 +110,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(snapshot, 5, 1) \ x(btree, 6, 3) \ x(rebalance_work, 7, 1) \ - x(inum, 8, 3) + x(inum, 8, 3) \ + x(rebalance_work_v2, 9, 1) \ enum disk_accounting_type { #define x(f, nr, ...) 
BCH_DISK_ACCOUNTING_##f = nr, @@ -210,6 +211,10 @@ struct bch_acct_inum { struct bch_acct_rebalance_work { }; +struct bch_acct_rebalance_work_v2 { + __u8 type; +}; + struct disk_accounting_pos { union { struct { @@ -224,6 +229,7 @@ struct disk_accounting_pos { struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; struct bch_acct_inum inum; + struct bch_acct_rebalance_work_v2 rebalance_work_v2; } __packed; } __packed; struct bpos _pad; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 89a95b6c4e51..103719a76c81 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1134,7 +1134,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, ret = bch2_extent_get_io_opts_one(trans, &opts, &iter, bkey_i_to_s_c(n), SET_NEEDS_REBALANCE_other) ?: - bch2_bkey_set_needs_rebalance(trans->c, &opts, n, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_other, 0) ?: bch2_trans_update(trans, &iter, n, 0); out: diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3274ba42c995..c534b009bf60 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1522,24 +1522,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, "redundant stripe entry"); have_ec = true; break; - case BCH_EXTENT_ENTRY_rebalance: { - /* - * this shouldn't be a fsck error, for forward - * compatibility; the rebalance code should just refetch - * the compression opt if it's unknown - */ -#if 0 - const struct bch_extent_rebalance *r = &entry->rebalance; - - if (!bch2_compression_opt_valid(r->compression)) { - union bch_compression_opt opt = { .value = r->compression }; - prt_printf(err, "invalid compression opt %u:%u", - opt.type, opt.level); - return bch_err_throw(c, invalid_bkey); - } -#endif + case BCH_EXTENT_ENTRY_rebalance: + ret = bch2_extent_rebalance_validate(c, k, from, &entry->rebalance); + if (ret) + return ret; break; - } case BCH_EXTENT_ENTRY_flags: bkey_fsck_err_on(entry != ptrs.start, c, extent_flags_not_at_start, diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index ca480b8f8dae..ac545f962ce9 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -42,12 +42,6 @@ module_param_named(read_corrupt_device, bch2_read_corrupt_device, int, 0644); MODULE_PARM_DESC(read_corrupt_device, ""); #endif -static bool bch2_poison_extents_on_checksum_error; -module_param_named(poison_extents_on_checksum_error, - bch2_poison_extents_on_checksum_error, bool, 0644); -MODULE_PARM_DESC(poison_extents_on_checksum_error, - "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now) @@ -551,9 +545,6 @@ static void get_rbio_extent(struct btree_trans *trans, static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, enum btree_id btree, struct bkey_s_c read_k) { - if (!bch2_poison_extents_on_checksum_error) - return 0; - struct bch_fs *c = trans->c; struct data_update *u = rbio_data_update(rbio); @@ -1291,6 +1282,10 @@ retry_pick: async_object_list_add(c, rbio, rbio, &rbio->list_idx); + /* XXX: also nvme read recovery level */ + if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev))) + rbio->bio.bi_opf |= REQ_FUA; + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 6a5da02ce266..ccbca802db0b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -365,7 
+365,7 @@ int bch2_extent_update(struct btree_trans *trans, min(k->k.p.offset << 9, new_i_size), i_sectors_delta, &inode) ?: (bch2_inode_opts_get_inode(c, &inode, &opts), - bch2_bkey_set_needs_rebalance(c, &opts, k, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k, SET_NEEDS_REBALANCE_foreground, change_cookie)) ?: bch2_trans_update(trans, iter, k, 0) ?: @@ -1271,7 +1271,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0, &inode) ?: (bch2_inode_opts_get_inode(c, &inode, &opts), - bch2_bkey_set_needs_rebalance(c, &opts, new, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new, SET_NEEDS_REBALANCE_foreground, op->opts.change_cookie)) ?: bch2_trans_update(trans, iter, new, diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 139a6587a64e..9b172af4f8c8 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -84,7 +84,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct bch_inode_opts opts; ret = bch2_extent_get_apply_io_opts_one(trans, &opts, iter, k, ctx) ?: - bch2_bkey_set_needs_rebalance(c, &opts, n, ctx, 0) ?: + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, ctx, 0) ?: drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false); if (ret) return ret; diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 58cfd540c6d6..71b17f18e90c 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -2,11 +2,10 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "closure.h" #include "nocow_locking.h" #include "util.h" -#include <linux/closure.h> - bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) { u64 dev_bucket = bucket_to_u64(bucket); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index bd5faafc9aa7..365cce4a6b49 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -103,6 +103,13 @@ static const char * const __bch2_fs_usage_types[] = { #undef x +static const char * const __bch2_rebalance_accounting_types[] = { +#define x(n) #n, + BCH_REBALANCE_ACCOUNTING() +#undef x + NULL +}; + static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], unsigned nr, const char *type, unsigned idx) { @@ -125,6 +132,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); +PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 6b9f18839345..de1ac235e929 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -34,6 +34,7 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); +void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type); static inline const char *bch2_d_type_str(unsigned d_type) { diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 67d6a90e86ef..0c29be7d662f 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ 
-25,8 +25,29 @@ #include <linux/kthread.h> #include <linux/sched/cputime.h> +#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) + /* bch_extent_rebalance: */ +int bch2_extent_rebalance_validate(struct bch_fs *c, + struct bkey_s_c k, + struct bkey_validate_context from, + const struct bch_extent_rebalance *r) +{ + int ret = 0; + + bkey_fsck_err_on(r->pending && !(r->need_rb & BIT(BCH_REBALANCE_background_target)), + c, extent_rebalance_bad_pending, + "pending incorrectly set"); + + bkey_fsck_err_on(r->hipri && !(r->need_rb & BIT(BCH_REBALANCE_data_replicas)), + c, extent_rebalance_bad_pending, + "hipri incorrectly set"); + +fsck_err: + return ret; +} + static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { const union bch_extent_entry *entry; @@ -38,15 +59,30 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b return NULL; } -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) { return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); } +static const char * const rebalance_opts[] = { +#define x(n) #n, + BCH_REBALANCE_OPTS() +#undef x + NULL +}; + void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_rebalance *r) { - prt_printf(out, "replicas=%u", r->data_replicas); + prt_str(out, "need_rb="); + prt_bitflags(out, rebalance_opts, r->need_rb); + + if (r->hipri) + prt_str(out, " hipri"); + if (r->pending) + prt_str(out, " pending"); + + prt_printf(out, " replicas=%u", r->data_replicas); if (r->data_replicas_from_inode) prt_str(out, " (inode)"); @@ -92,32 +128,54 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_trigger_extent_rebalance(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; +/* + * XXX: check in bkey_validate that if r->hipri or r->pending are set, + * r->data_replicas are also set + */ - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; +static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r) +{ + if (!r) + return 0; + unsigned ret = r->need_rb; - s = bch2_bkey_sectors_need_rebalance(c, new); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; + if (r->hipri) + ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority); + if (r->pending) { + ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending); + ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target); + } + return ret; +} - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { +int __bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned old_r, unsigned new_r, + enum btree_iter_update_trigger_flags flags) +{ + int delta = (int) !!new_r - (int) !!old_r; + if ((flags & BTREE_TRIGGER_transactional) && delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0); + new.k->p, delta > 0); if (ret) return ret; } - if (need_rebalance_sectors_delta[0]) { + delta = old.k->size == new.k->size + ? 
old_r ^ new_r + : old_r | new_r; + while (delta) { + unsigned c = __ffs(delta); + delta ^= BIT(c); + + s64 v[1] = { 0 }; + if (old_r & BIT(c)) + v[0] -= (s64) old.k->size; + if (new_r & BIT(c)) + v[0] += (s64) new.k->size; + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work); + v, rebalance_work_v2, c); if (ret) return ret; } @@ -125,39 +183,48 @@ int bch2_trigger_extent_rebalance(struct btree_trans *trans, return 0; } -static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, - struct bch_inode_opts *io_opts, - unsigned *move_ptrs, - unsigned *compress_ptrs, - u64 *sectors) +static struct bch_extent_rebalance +bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, + struct bch_inode_opts *opts, + unsigned *move_ptrs, + unsigned *compress_ptrs, + unsigned *csum_ptrs, + bool may_update_indirect) { *move_ptrs = 0; *compress_ptrs = 0; - *sectors = 0; + *csum_ptrs = 0; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!io_opts && !rb_opts) - return; + struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) }; if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return; + return r; - unsigned compression_type = - bch2_compression_opt_to_type(io_opts - ? io_opts->background_compression - : rb_opts->background_compression); - unsigned target = io_opts - ? io_opts->background_target - : rb_opts->background_target; - if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target)) - target = 0; + const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs); + if (old_r) { + r = *old_r; + r.need_rb = 0; + } + +#define x(_name) \ + if (k.k->type != KEY_TYPE_reflink_v || \ + may_update_indirect || \ + (!opts->_name##_from_inode && !r._name##_from_inode)) { \ + r._name = opts->_name; \ + r._name##_from_inode = opts->_name##_from_inode; \ + } + BCH_REBALANCE_OPTS() +#undef x + + unsigned compression_type = bch2_compression_opt_to_type(r.background_compression); + unsigned csum_type = bch2_data_checksum_type_rb(c, r); + + bool incompressible = false, unwritten = false, ec = false; + unsigned durability = 0, min_durability = INT_MAX; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - bool incompressible = false, unwritten = false; - unsigned ptr_idx = 1; guard(rcu)(); @@ -166,102 +233,222 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, unwritten |= p.ptr.unwritten; if (!p.ptr.cached) { - if (p.crc.compression_type != compression_type) + if (p.crc.compression_type != compression_type) { *compress_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_compression); + } - if (target && !bch2_dev_in_target(c, p.ptr.dev, target)) + if (p.crc.csum_type != csum_type) { + *csum_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_data_checksum); + } + + if (r.background_target && + !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) { *move_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_target); + } + + unsigned d = bch2_extent_ptr_durability(c, &p); + durability += d; + min_durability = min(min_durability, d); + + ec |= p.has_ec; } ptr_idx <<= 1; } - if (unwritten) + if (unwritten || incompressible) { *compress_ptrs = 0; - if (incompressible) - *compress_ptrs = 0; - - unsigned rb_ptrs = *move_ptrs | *compress_ptrs; - - if (!rb_ptrs) - return; + r.need_rb &= ~BIT(BCH_REBALANCE_background_compression); + } - 
ptr_idx = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (rb_ptrs & ptr_idx) - *sectors += p.crc.compressed_size; - ptr_idx <<= 1; + if (unwritten) { + *csum_ptrs = 0; + r.need_rb &= !BIT(BCH_REBALANCE_data_checksum); } + + return r; } -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool *v) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + if (v && *v) + return 1; + + /* + * If opts need to be propagated to the extent, a scan cookie should be + * present: + */ + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; - bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, §ors); - return sectors; + ret = k.k->type == KEY_TYPE_cookie; + if (v) + *v = ret; + return ret; } -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static int new_needs_rb_allowed(struct btree_trans *trans, + struct per_snapshot_io_opts *s, + struct bkey_s_c k, + enum set_needs_rebalance_ctx ctx, + unsigned opt_change_cookie, + const struct bch_extent_rebalance *old, + const struct bch_extent_rebalance *new, + unsigned new_need_rb) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + struct bch_fs *c = trans->c; + /* + * New need_rb - pointers that don't match the current io path options - + * are only allowed in certain situations: + * + * Propagating new options: from bch2_set_rebalance_needs_scan + * + * Foreground writes: background_compression and background_target are + * allowed + * + * Foreground writes: we may have raced with an option change: + * opt_change_cookie checks for this + * + * XXX: foreground writes should still match compression, + * foreground_target - figure out how to check for this + */ + if (ctx == SET_NEEDS_REBALANCE_opt_change || + ctx == SET_NEEDS_REBALANCE_opt_change_indirect) + return 0; + + if (ctx == SET_NEEDS_REBALANCE_foreground) { + new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)| + BIT(BCH_REBALANCE_background_target)); + if (!new_need_rb) + return 0; + + if (opt_change_cookie != atomic_read(&c->opt_change_cookie)) + return 0; + } + + /* + * Either the extent data or the extent io options (from + * bch_extent_rebalance) should match the io_opts from the + * inode/filesystem, unless + * + * - There's a scan pending to propagate new options + * - It's an indirect extent: it may be referenced by inodes + * with inconsistent options + * + * For efficiency (so that we can cache checking for scan + * cookies), only check option consistency when we're called + * with snapshot_io_opts - don't bother when we're called from + * move_data_phys() -> get_io_opts_one() + * + * Note that we can cache the existence of a cookie, but not the + * non-existence, to avoid spurious false positives. + */ + int ret = check_rebalance_scan_cookie(trans, 0, s ? &s->fs_scan_cookie : NULL) ?: + check_rebalance_scan_cookie(trans, k.k->p.inode, s ? 
&s->inum_scan_cookie : NULL); + if (ret < 0) + return ret; + if (ret) + return 0; + + CLASS(printbuf, buf)(); + + prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n"); + bch2_bkey_val_to_text(&buf, c, k); - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - return move_ptrs|compress_ptrs; + const struct bch_extent_rebalance _old = {}; + if (!old) + old = &_old; + +#define x(_name) \ + if (new_need_rb & BIT(BCH_REBALANCE_##_name)) \ + prt_printf(&buf, "\n" #_name " %u != %u", old->_name, new->_name); + BCH_REBALANCE_OPTS() +#undef x + + fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf); +fsck_err: + return ret; } -static inline bool bkey_should_have_rb_opts(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static inline bool bkey_should_have_rb_opts(struct bkey_s_c k, + struct bch_extent_rebalance new) { if (k.k->type == KEY_TYPE_reflink_v) { -#define x(n) if (opts->n##_from_inode) return true; +#define x(n) if (new.n##_from_inode) return true; BCH_REBALANCE_OPTS() #undef x } - return bch2_bkey_ptrs_need_rebalance(c, opts, k); + return new.need_rb; } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, +int bch2_bkey_set_needs_rebalance(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bch_inode_opts *opts, struct bkey_i *_k, enum set_needs_rebalance_ctx ctx, - u32 change_cookie) + u32 opt_change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; + struct bch_fs *c = trans->c; struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *old = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (bkey_should_have_rb_opts(c, opts, k.s_c)) { + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); + + bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new); + + if (should_have_rb == !!old && + (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old)) + return 0; + + unsigned new_need_rb = new.need_rb & ~(old ? 
old->need_rb : 0); + + if (unlikely(new_need_rb)) { + int ret = new_needs_rb_allowed(trans, snapshot_io_opts, + k.s_c, ctx, opt_change_cookie, + old, &new, new_need_rb); + if (ret) + return ret; + } + + if (should_have_rb) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } + *old = new; + } else if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); return 0; } static int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *io_opts, struct btree_iter *iter, struct bkey_s_c k, enum set_needs_rebalance_ctx ctx) { struct bch_fs *c = trans->c; + int ret = 0; BUG_ON(iter->flags & BTREE_ITER_is_extents); BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); @@ -269,36 +456,24 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, if (!bkey_extent_is_direct_data(k.k)) return 0; - bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect; + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k); - /* - * If it's an indirect extent, and we walked to it directly, we won't - * have the options from the inode that were directly applied: options - * from the extent take precedence - unless the io_opts option came from - * the inode and may_update_indirect is true (walked from a - * REFLINK_P_MAY_UPDATE_OPTIONS pointer). - */ - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - if (old && k.k->type == KEY_TYPE_reflink_v) { -#define x(_name) \ - if (old->_name##_from_inode && \ - !(may_update_indirect && io_opts->_name##_from_inode)) { \ - io_opts->_name = old->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts); + bool should_have_rb = bkey_should_have_rb_opts(k, new); - if (bkey_should_have_rb_opts(c, io_opts, k) - ? old && !memcmp(old, &new, sizeof(new)) - : !old) + if (should_have_rb == !!old && + (should_have_rb ? 
!memcmp(old, &new, sizeof(new)) : !old)) return 0; struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); + ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -306,7 +481,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, /* On successfull transaction commit, @k was invalidated: */ - return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?: + return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, io_opts, n, ctx, 0) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: bch_err_throw(c, transaction_restart_commit); @@ -349,7 +524,8 @@ static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans, darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_pos.inode; + io_opts->cur_inum = extent_pos.inode; + io_opts->inum_scan_cookie = false; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -372,11 +548,13 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans, enum set_needs_rebalance_ctx ctx) { struct bch_inode_opts *opts = - bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k); + bch2_extent_get_io_opts(trans, snapshot_io_opts, + extent_pos, extent_iter, extent_k); if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level) return opts; - int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx); + int ret = bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, + extent_iter, extent_k, ctx); return ret ? ERR_PTR(ret) : opts; } @@ -420,11 +598,9 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans, if (ret || btree_iter_path(trans, extent_iter)->level) return ret; - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx); + return bch2_get_update_rebalance_opts(trans, NULL, io_opts, extent_iter, extent_k, ctx); } -#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - static const char * const bch2_rebalance_state_strs[] = { #define x(t) #t, BCH_REBALANCE_STATES() @@ -535,23 +711,6 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, return &(&darray_pop(buf))->k_i; } -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; - - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - extent_entry_drop(bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, @@ -570,6 +729,10 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (bkey_err(k)) return k; + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + if (!r || !r->need_rb) /* Write buffer race? 
*/ + return bkey_s_c_null; + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, snapshot_io_opts, extent_iter->pos, extent_iter, k, @@ -580,22 +743,24 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, *opts_ret = opts; + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, false); + memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); + data_opts->rewrite_ptrs = move_ptrs|compress_ptrs|csum_ptrs; data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; - if (!data_opts->rewrite_ptrs) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); + if (!data_opts->rewrite_ptrs && + !data_opts->kill_ptrs && + !data_opts->kill_ec_ptrs && + !data_opts->extra_replicas) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s", buf.buf); return bkey_s_c_null; } @@ -605,12 +770,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; - - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - if (move_ptrs) { prt_str(&buf, "move="); bch2_target_to_text(&buf, c, opts->background_target); @@ -627,6 +786,14 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, prt_newline(&buf); } + if (csum_ptrs) { + prt_str(&buf, "csum="); + bch2_prt_csum_opt(&buf, opts->data_checksum); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, csum_ptrs); + prt_newline(&buf); + } + trace_rebalance_extent(c, buf.buf); } count_event(c, rebalance_extent); @@ -690,6 +857,7 @@ out: static int do_rebalance_scan_indirect(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *opts) { u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); @@ -702,7 +870,7 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans, BTREE_ITER_not_extents, k, ({ if (bpos_ge(bkey_start_pos(k.k), POS(0, end))) break; - bch2_get_update_rebalance_opts(trans, opts, &iter, k, + bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, k, SET_NEEDS_REBALANCE_opt_change_indirect); })); if (ret) @@ -750,7 +918,8 @@ static int do_rebalance_scan(struct moving_context *ctxt, (inum && k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) - ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts) + ? 
do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), + snapshot_io_opts, opts) : 0); })); if (ret) @@ -1049,6 +1218,7 @@ int bch2_fs_rebalance_init(struct bch_fs *c) static int check_rebalance_work_one(struct btree_trans *trans, struct btree_iter *extent_iter, struct btree_iter *rebalance_iter, + struct per_snapshot_io_opts *snapshot_io_opts, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -1089,8 +1259,7 @@ static int check_rebalance_work_one(struct btree_trans *trans, extent_k.k = &deleted; } - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool should_have_rebalance = bch2_bkey_needs_rb(extent_k); bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; if (should_have_rebalance != have_rebalance) { @@ -1119,6 +1288,21 @@ static int check_rebalance_work_one(struct btree_trans *trans, return ret; } + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, + snapshot_io_opts, extent_iter->pos, extent_iter, extent_k, + SET_NEEDS_REBALANCE_other); + ret = PTR_ERR_OR_ZERO(opts); + if (ret == -BCH_ERR_transaction_restart_commit) { + /* + * If get_apply_io_opts() did work, just advance and check the + * next key; it may have updated the rebalance_work btree so + * we'd need a write buffer flush to check what it just did. + */ + ret = 0; + } + if (ret) + return ret; + if (cmp <= 0) bch2_btree_iter_advance(extent_iter); if (cmp >= 0) @@ -1131,10 +1315,14 @@ int bch2_check_rebalance_work(struct bch_fs *c) { CLASS(btree_trans, trans)(c); CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_not_extents| BTREE_ITER_prefetch); CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, BTREE_ITER_prefetch); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -1148,12 +1336,15 @@ int bch2_check_rebalance_work(struct bch_fs *c) bch2_trans_begin(trans); - ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, + &snapshot_io_opts, &last_flushed) ?: + bch2_trans_commit(trans, NULL, NULL, 0); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_bkey_buf_exit(&last_flushed, c); return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 24bafa42f070..f40f670af046 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -7,10 +7,14 @@ #include "opts.h" #include "rebalance_types.h" +int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context, + const struct bch_extent_rebalance *); + static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, struct bch_inode_opts *opts) { - struct bch_extent_rebalance r = { + return (struct bch_extent_rebalance) { .type = BIT(BCH_EXTENT_ENTRY_rebalance), #define x(_name) \ ._name = opts->_name, \ @@ -18,22 +22,36 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f BCH_REBALANCE_OPTS() #undef x }; - - if (r.background_target && - !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) - r.background_target = 0; - - return r; }; void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, const struct bch_extent_rebalance *); -int bch2_trigger_extent_rebalance(struct btree_trans *, - struct bkey_s_c, struct bkey_s_c, - enum btree_iter_update_trigger_flags); +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); + +static inline int bch2_bkey_needs_rb(struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + return r ? r->need_rb : 0; +} + +int __bch2_trigger_extent_rebalance(struct btree_trans *, + struct bkey_s_c, struct bkey_s_c, + unsigned, unsigned, + enum btree_iter_update_trigger_flags); -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) +{ + unsigned old_r = bch2_bkey_needs_rb(old); + unsigned new_r = bch2_bkey_needs_rb(new); + + return old_r != new_r || + (old.k->size != new.k->size && (old_r|new_r)) + ? 
__bch2_trigger_extent_rebalance(trans, old, new, old_r, new_r, flags) + : 0; +} enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_opt_change, @@ -42,9 +60,6 @@ enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_other, }; -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *, - struct bkey_i *, enum set_needs_rebalance_ctx, u32); - /* Inodes in different snapshots may have different IO options: */ struct snapshot_io_opts_entry { u32 snapshot; @@ -53,6 +68,9 @@ struct snapshot_io_opts_entry { struct per_snapshot_io_opts { u64 cur_inum; + bool fs_scan_cookie; + bool inum_scan_cookie; + struct bch_inode_opts fs_io_opts; DARRAY(struct snapshot_io_opts_entry) d; }; @@ -68,6 +86,10 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt darray_exit(&io_opts->d); } +int bch2_bkey_set_needs_rebalance(struct btree_trans *, + struct per_snapshot_io_opts *, struct bch_inode_opts *, + struct bkey_i *, enum set_needs_rebalance_ctx, u32); + struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *, struct per_snapshot_io_opts *, struct bpos, struct btree_iter *, struct bkey_s_c, diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h index ff9a1342a22b..d7a5f899e789 100644 --- a/fs/bcachefs/rebalance_format.h +++ b/fs/bcachefs/rebalance_format.h @@ -5,49 +5,76 @@ struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, - unused:3, + unused:5, + hipri:1, + pending:1, + need_rb:5, - promote_target_from_inode:1, - erasure_code_from_inode:1, + data_replicas_from_inode:1, data_checksum_from_inode:1, + erasure_code_from_inode:1, background_compression_from_inode:1, - data_replicas_from_inode:1, background_target_from_inode:1, + promote_target_from_inode:1, - promote_target:16, - erasure_code:1, + data_replicas:3, data_checksum:4, - data_replicas:4, + erasure_code:1, background_compression:8, /* enum bch_compression_opt */ - background_target:16; + background_target:12, + promote_target:12; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, + __u64 promote_target:12, + background_target:12, background_compression:8, - data_replicas:4, - data_checksum:4, erasure_code:1, - promote_target:16, + data_checksum:4, + data_replicas:3, + promote_target_from_inode:1, background_target_from_inode:1, - data_replicas_from_inode:1, background_compression_from_inode:1, - data_checksum_from_inode:1, erasure_code_from_inode:1, - promote_target_from_inode:1, + data_checksum_from_inode:1, + data_replicas_from_inode:1, - unused:3, + need_rb:5, + pending:1, + hipri:1, + unused:5, type:6; #endif }; /* subset of BCH_INODE_OPTS */ #define BCH_REBALANCE_OPTS() \ + x(data_replicas) \ x(data_checksum) \ + x(erasure_code) \ x(background_compression) \ + x(background_target) \ + x(promote_target) + +enum bch_rebalance_opts { +#define x(n) BCH_REBALANCE_##n, + BCH_REBALANCE_OPTS() +#undef x +}; + +#define BCH_REBALANCE_ACCOUNTING() \ x(data_replicas) \ - x(promote_target) \ + x(data_checksum) \ + x(erasure_code) \ + x(background_compression) \ x(background_target) \ - x(erasure_code) + x(high_priority) \ + x(pending) \ + +enum bch_rebalance_accounting_type { +#define x(n) BCH_REBALANCE_ACCOUNTING_##n, + BCH_REBALANCE_ACCOUNTING() +#undef x +}; #endif /* _BCACHEFS_REBALANCE_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index bfd06fd5d506..66b7f19f0437 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -107,7 +107,10 @@ 
BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\ x(btree_node_accounting, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \ + BCH_FSCK_ERR_extent_io_opts_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 77e3fc92e39b..9ec2df6c8071 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -159,6 +159,8 @@ enum bch_fsck_flags { x(extent_ptrs_redundant_stripe, 139, 0) \ x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(extent_rebalance_bad_pending, 330, 0) \ + x(extent_rebalance_bad_hipri, 331, 0) \ x(ptr_to_invalid_device, 142, 0) \ x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_duplicate_device, 143, 0) \ @@ -339,7 +341,9 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 328, 0) + x(extent_io_opts_not_set, 328, FSCK_AUTOFIX) \ + x(extent_io_opts_unneeded, 329, FSCK_AUTOFIX) \ + x(MAX, 332, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 40adefe7170f..ef6312c50f88 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -45,6 +45,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> +#include <linux/string_choices.h> #include <linux/sched/clock.h> #include "util.h" @@ -157,6 +158,7 @@ write_attribute(trigger_recalc_capacity); write_attribute(trigger_delete_dead_snapshots); write_attribute(trigger_emergency_read_only); read_attribute(gc_gens_pos); +__sysfs_attribute(read_fua_test, 0400); read_attribute(uuid); read_attribute(minor); @@ -304,6 +306,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "reserved:\t\t%llu\n", b.reserved); } +static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bio *bio = NULL; + void *buf = NULL; + unsigned bs = c->opts.block_size, iters; + u64 end, test_duration = NSEC_PER_SEC * 2; + struct bch2_time_stats stats_nofua, stats_fua, stats_random; + int ret = 0; + + bch2_time_stats_init_no_pcpu(&stats_nofua); + bch2_time_stats_init_no_pcpu(&stats_fua); + bch2_time_stats_init_no_pcpu(&stats_random); + + if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) { + prt_str(out, "offline\n"); + return 0; + } + + struct block_device *bdev = ca->disk_sb.bdev; + + bio = bio_kmalloc(1, GFP_KERNEL); + if (!bio) { + ret = -ENOMEM; + goto err; + } + + buf = kmalloc(bs, GFP_KERNEL); + if (!buf) + goto err; + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_nofua, submit_time); + + if (ret) + goto err; + } + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_fua, submit_time); + + if (ret) + goto err; + } 
+ + u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_random, submit_time); + + if (ret) + goto err; + } + + u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats); + u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats); + u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats); + + u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats); + u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats); + u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats); + + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 12); + prt_printf(out, "This test must be run on an idle drive for accurate results\n"); + prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device)); + prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev))); + prt_newline(out); + prt_printf(out, "ns:\tlatency\rstddev\r\n"); + prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua); + prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua); + prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand); + + bool read_cache = ns_nofua * 2 < ns_rand; + bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2; + + if (!read_cache) + prt_str(out, "reads don't appear to be cached - safe\n"); + else if (!fua_cached) + prt_str(out, "fua reads don't appear to be cached - safe\n"); + else + prt_str(out, "fua reads appear to be cached - unsafe\n"); +err: + kfree(buf); + kfree(bio); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test); + bch_err_fn(c, ret); + return ret; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -847,6 +959,9 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + if (attr == &sysfs_read_fua_test) + return bch2_read_fua_test(out, ca); + int opt_id = bch2_opt_lookup(attr->name); if (opt_id >= 0) return sysfs_opt_show(c, ca, opt_id, out); @@ -911,6 +1026,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_congested, #endif + &sysfs_read_fua_test, + /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6c312fd9a447..c5d7be2eba03 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1339,11 +1339,6 @@ DEFINE_EVENT(fs_str, io_move_pred, TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, io_move_created_rebalance, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - DEFINE_EVENT(fs_str, io_move_evacuate_bucket, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 52ac8230be9f..555e0d8f3cf0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -4,7 +4,6 @@ #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/closure.h> #include <linux/errno.h> #include <linux/freezer.h> #include <linux/kernel.h> @@ -21,6 +20,7 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> +#include "closure.h" #include "mean_and_variance.h" #include "darray.h" diff --git a/fs/bcachefs/vendor/closure.c 
b/fs/bcachefs/vendor/closure.c new file mode 100644 index 000000000000..bdafd3a57386 --- /dev/null +++ b/fs/bcachefs/vendor/closure.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "closure.h" +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/sched/debug.h> + +static void closure_val_checks(struct closure *cl, unsigned new, int d) +{ + unsigned count = new & CLOSURE_REMAINING_MASK; + + if (WARN(new & CLOSURE_GUARD_MASK, + "closure %ps has guard bits set: %x (%u), delta %i", + cl->fn, + new, (unsigned) __fls(new & CLOSURE_GUARD_MASK), d)) + new &= ~CLOSURE_GUARD_MASK; + + WARN(!count && (new & ~(CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING)), + "closure %ps ref hit 0 with incorrect flags set: %x (%u)", + cl->fn, + new, (unsigned) __fls(new)); +} + +enum new_closure_state { + CLOSURE_normal_put, + CLOSURE_requeue, + CLOSURE_done, +}; + +/* For clearing flags with the same atomic op as a put */ +void bch2_closure_sub(struct closure *cl, int v) +{ + enum new_closure_state s; + struct task_struct *sleeper; + + /* rcu_read_lock, atomic_read_acquire() are both for cl->sleeper: */ + guard(rcu)(); + + int old = atomic_read_acquire(&cl->remaining), new; + do { + new = old - v; + + if (new & CLOSURE_REMAINING_MASK) { + s = CLOSURE_normal_put; + } else { + if ((cl->fn || (new & CLOSURE_SLEEPING)) && + !(new & CLOSURE_DESTRUCTOR)) { + s = CLOSURE_requeue; + new += CLOSURE_REMAINING_INITIALIZER; + } else + s = CLOSURE_done; + + sleeper = new & CLOSURE_SLEEPING ? cl->sleeper : NULL; + new &= ~CLOSURE_SLEEPING; + } + + closure_val_checks(cl, new, -v); + } while (!atomic_try_cmpxchg_release(&cl->remaining, &old, new)); + + if (s == CLOSURE_normal_put) + return; + + if (sleeper) { + smp_mb(); + wake_up_process(sleeper); + return; + } + + if (s == CLOSURE_requeue) { + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + closure_fn *destructor = cl->fn; + + closure_debug_destroy(cl); + + if (destructor) + destructor(&cl->work); + + if (parent) + closure_put(parent); + } +} + +/* + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ +void __bch2_closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl, *t; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + reverse = llist_reverse_order(list); + + /* Then do the wakeups */ + llist_for_each_entry_safe(cl, t, reverse, list) { + closure_set_waiting(cl, 0); + bch2_closure_sub(cl, CLOSURE_WAITING + 1); + } +} + +/** + * closure_wait - add a closure to a waitlist + * @waitlist: will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * @cl: closure pointer. 
+ * + */ +bool bch2_closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + closure_set_waiting(cl, _RET_IP_); + unsigned r = atomic_add_return(CLOSURE_WAITING + 1, &cl->remaining); + closure_val_checks(cl, r, CLOSURE_WAITING + 1); + + llist_add(&cl->list, &waitlist->list); + + return true; +} + +void __sched __bch2_closure_sync(struct closure *cl) +{ + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); +} + +/* + * closure_return_sync - finish running a closure, synchronously (i.e. waiting + * for outstanding get()s to finish) and returning once closure refcount is 0. + * + * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent + * closure_get_not_zero() calls will fail. + */ +void __sched bch2_closure_return_sync(struct closure *cl) +{ + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_DESTRUCTOR - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); + + if (cl->parent) + closure_put(cl->parent); +} + +int __sched __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout) +{ + int ret = 0; + + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + /* + * Carefully undo the continue_at() - but only if it + * hasn't completed, i.e. the final closure_put() hasn't + * happened yet: + */ + unsigned old = atomic_read(&cl->remaining), new; + if (!(old & CLOSURE_SLEEPING)) + goto success; + + if (!timeout) { + do { + if (!(old & CLOSURE_SLEEPING)) + goto success; + + new = old + CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING; + closure_val_checks(cl, new, CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING); + } while (!atomic_try_cmpxchg(&cl->remaining, &old, new)); + + ret = -ETIME; + break; + } + + timeout = schedule_timeout(timeout); + } +success: + __set_current_state(TASK_RUNNING); + return ret; +} diff --git a/fs/bcachefs/vendor/closure.h b/fs/bcachefs/vendor/closure.h new file mode 100644 index 000000000000..79112efe30a7 --- /dev/null +++ b/fs/bcachefs/vendor/closure.h @@ -0,0 +1,490 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include <linux/llist.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/workqueue.h> + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. 
To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. + * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + */ + +struct closure; +struct closure_syncer; +typedef void (closure_fn) (struct work_struct *); +extern struct dentry *bcache_debug; + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_state { + /* + * CLOSURE_WAITING: Set iff the closure is on a waitlist. 
Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs the next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + */ + + CLOSURE_BITS_START = (1U << 24), + CLOSURE_DESTRUCTOR = (1U << 24), + CLOSURE_SLEEPING = (1U << 26), + CLOSURE_WAITING = (1U << 28), + CLOSURE_RUNNING = (1U << 30), +}; + +#define CLOSURE_GUARD_MASK \ + (((CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)|(CLOSURE_BITS_START >> 1)) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct task_struct *sleeper; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + +#ifdef CONFIG_DEBUG_CLOSURES +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e +#define CLOSURE_MAGIC_STACK 0xc05451cc + + unsigned int magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +void bch2_closure_sub(struct closure *cl, int v); +void __bch2_closure_wake_up(struct closure_waitlist *list); +bool bch2_closure_wait(struct closure_waitlist *list, struct closure *cl); +void __bch2_closure_sync(struct closure *cl); + +/* + * closure_put - decrement a closure's refcount + */ +static inline void closure_put(struct closure *cl) +{ + bch2_closure_sub(cl, 1); +} + +static inline unsigned closure_nr_remaining(struct closure *cl) +{ + return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK; +} + +/** + * closure_sync - sleep until a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. + */ +static inline void closure_sync(struct closure *cl) +{ + if (closure_nr_remaining(cl) > 1) + __bch2_closure_sync(cl); +} + +int __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout); + +static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout) +{ + return closure_nr_remaining(cl) > 1 + ? 
__bch2_closure_sync_timeout(cl, timeout) + : 0; +} + +//#ifdef CONFIG_DEBUG_CLOSURES +#if 0 + +void bch2_closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void bch2_closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->waiting_on = f; +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; +} + +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); + + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(&cl->work); +} + +/** + * closure_get - increment a closure's refcount + */ +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +/** + * closure_get_not_zero + */ +static inline bool closure_get_not_zero(struct closure *cl) +{ + unsigned old = atomic_read(&cl->remaining); + do { + if (!(old & CLOSURE_REMAINING_MASK)) + return false; + + } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1)); + + return true; +} + +/** + * closure_init - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. 
+ */ +static inline void closure_init(struct closure *cl, struct closure *parent) +{ + cl->fn = NULL; + cl->parent = parent; + if (parent) + closure_get(parent); + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + + bch2_closure_debug_create(cl); + closure_set_ip(cl); +} + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif +} + +static inline void closure_init_stack_release(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif +} + +/** + * closure_wake_up - wake up all closures on a wait list, + * with memory barrier + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + /* Memory barrier for the wait list */ + smp_mb(); + __bch2_closure_wake_up(list); +} + +#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws) +#define closure_type(name, type, member) \ + struct closure *cl = container_of(ws, struct closure, work); \ + type *name = container_of(cl, type, member) + +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. + * + * Note you are expected to immediately return after using this macro. + */ +#define continue_at(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + bch2_closure_sub(_cl, CLOSURE_RUNNING + 1); \ +} while (0) + +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +void bch2_closure_return_sync(struct closure *cl); + +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_queue(_cl); \ +} while (0) + +/** + * closure_return_with_destructor - finish execution of a closure, + * with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. 
+ */ +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + bch2_closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +} while (0) + +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +#define __closure_wait_event(waitlist, _cond) \ +do { \ + struct closure cl; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + bch2_closure_wait(waitlist, &cl); \ + if (_cond) \ + break; \ + closure_sync(&cl); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ +} while (0) + +#define closure_wait_event(waitlist, _cond) \ +do { \ + if (!(_cond)) \ + __closure_wait_event(waitlist, _cond); \ +} while (0) + +#define __closure_wait_event_timeout(waitlist, _cond, _until) \ +({ \ + struct closure cl; \ + long _t; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + bch2_closure_wait(waitlist, &cl); \ + if (_cond) { \ + _t = max_t(long, 1L, _until - jiffies); \ + break; \ + } \ + _t = max_t(long, 0L, _until - jiffies); \ + if (!_t) \ + break; \ + closure_sync_timeout(&cl, _t); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ + _t; \ +}) + +/* + * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if + * condition became true + */ +#define closure_wait_event_timeout(waitlist, _cond, _timeout) \ +({ \ + unsigned long _until = jiffies + _timeout; \ + (_cond) \ + ? max_t(long, 1L, _until - jiffies) \ + : __closure_wait_event_timeout(waitlist, _cond, _until);\ +}) + +#endif /* _LINUX_CLOSURE_H */ diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 52791e070506..c055243ec206 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -173,4 +173,8 @@ int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); void seq_buf_do_printk(struct seq_buf *s, const char *lvl); +enum string_size_units; +void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, + const enum string_size_units units); + #endif /* _LINUX_SEQ_BUF_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 1a00be90d93a..106622ddac77 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -24,6 +24,8 @@ struct shrinker_info { struct shrinker_info_unit *unit[]; }; +struct seq_buf; + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. 
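Before moving on to the shrinker changes below, here is a brief, hypothetical usage sketch of the vendored closure interface above. It is illustrative only and not part of this series; batch_run(), batch_op_done(), flush_wait and the other names are invented. The pattern is the one the header documents: the submitter holds one ref per in-flight operation, each completion drops its ref with closure_put(), and closure_sync() or closure_wait_event() sleeps until everything has drained.

/*
 * Hypothetical caller of the vendored closure API, for illustration only;
 * none of these names appear in the patch itself.
 */
#include <linux/atomic.h>
#include "closure.h"	/* the vendored header above; adjust the include path as needed */

struct batch {
	struct closure	cl;	/* one ref per in-flight op, plus the submitter's */
	atomic_t	error;
};

/* Completion path for one async operation: record an error, drop our ref. */
static void batch_op_done(struct batch *b, int err)
{
	if (err)
		atomic_cmpxchg(&b->error, 0, err);
	closure_put(&b->cl);
}

/* Issue @nr operations and sleep until all of them have completed. */
static int batch_run(struct batch *b, unsigned nr)
{
	closure_init_stack(&b->cl);	/* refcount starts at 1, owned by us */
	atomic_set(&b->error, 0);

	for (unsigned i = 0; i < nr; i++) {
		closure_get(&b->cl);	/* taken before the op can complete */
		/* ... kick off async work that ends in batch_op_done() ... */
	}

	closure_sync(&b->cl);		/* sleep until only our ref remains */
	return atomic_read(&b->error);
}

/* The waitlist variant: sleep until another thread satisfies a condition. */
static struct closure_waitlist	flush_wait;
static atomic_t			flush_done;

static void wait_for_flush(void)
{
	closure_wait_event(&flush_wait, atomic_read(&flush_done));
}

static void flush_complete(void)
{
	atomic_set(&flush_done, 1);
	closure_wake_up(&flush_wait);
}

Note that closure_wake_up() already issues the smp_mb() needed to order the condition store against the waiter's check, so the producer side needs no explicit barrier.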
@@ -80,10 +82,12 @@ struct shrink_control { * @flags determine the shrinker abilities, like numa awareness */ struct shrinker { + const char *name; unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); + void (*to_text)(struct seq_buf *, struct shrinker *); long batch; /* reclaim batch size, 0 = default */ int seeks; /* seeks to recreate an obj */ @@ -110,11 +114,16 @@ struct shrinker { #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; - const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; + + atomic_long_t objects_requested_to_free; + atomic_long_t objects_freed; + unsigned long last_freed; /* timestamp, in jiffies */ + unsigned long last_scanned; /* timestamp, in jiffies */ + atomic64_t ns_run; }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ @@ -135,6 +144,8 @@ __printf(2, 3) struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); +void shrinker_to_text(struct seq_buf *, struct shrinker *); +void shrinkers_to_text(struct seq_buf *); static inline bool shrinker_try_get(struct shrinker *shrinker) { diff --git a/lib/seq_buf.c b/lib/seq_buf.c index f3f3436d60a9..3c41ca83a0c3 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -436,3 +436,13 @@ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, } return 0; } + +void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, const enum string_size_units units) +{ + char *buf; + size_t size = seq_buf_get_buf(s, &buf); + int wrote = string_get_size(v, 1, units, buf, size); + + seq_buf_commit(s, wrote); +} +EXPORT_SYMBOL(seq_buf_human_readable_u64); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..ad1ac9be0db4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,27 +169,6 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/* - * Check whether unreclaimable slab amount is greater than - * all user memory(LRU pages). - * dump_unreclaimable_slab() could help in the case that - * oom due to too much unreclaimable slab used by kernel. 
-*/ -static bool should_dump_unreclaim_slab(void) -{ - unsigned long nr_lru; - - nr_lru = global_node_page_state(NR_ACTIVE_ANON) + - global_node_page_state(NR_INACTIVE_ANON) + - global_node_page_state(NR_ACTIVE_FILE) + - global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_ISOLATED_ANON) + - global_node_page_state(NR_ISOLATED_FILE) + - global_node_page_state(NR_UNEVICTABLE); - - return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); -} - /** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate @@ -469,8 +448,6 @@ static void dump_header(struct oom_control *oc) mem_cgroup_print_oom_meminfo(oc->memcg); else { __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); - if (should_dump_unreclaim_slab()) - dump_unreclaimable_slab(); } if (sysctl_oom_dump_tasks) dump_tasks(oc); diff --git a/mm/show_mem.c b/mm/show_mem.c index 41999e94a56d..013ad4da618c 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -7,15 +7,18 @@ #include <linux/blkdev.h> #include <linux/cma.h> +#include <linux/console.h> #include <linux/cpuset.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/mm.h> #include <linux/mmzone.h> +#include <linux/seq_buf.h> #include <linux/swap.h> #include <linux/vmstat.h> #include "internal.h" +#include "slab.h" #include "swap.h" atomic_long_t _totalram_pages __read_mostly; @@ -392,10 +395,31 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z show_swap_cache_info(); } +static void print_string_as_lines(const char *prefix, const char *lines) +{ + if (!lines) { + printk("%s (null)\n", prefix); + return; + } + + bool locked = console_trylock(); + + while (1) { + const char *p = strchrnul(lines, '\n'); + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; + lines = p + 1; + } + if (locked) + console_unlock(); +} + void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long total = 0, reserved = 0, highmem = 0; struct zone *zone; + char *buf; printk("Mem-Info:\n"); show_free_areas(filter, nodemask, max_zone_idx); @@ -447,4 +471,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) } } #endif + + const unsigned buf_size = 8192; + buf = kmalloc(buf_size, GFP_ATOMIC); + if (buf) { + struct seq_buf s; + + printk("Unreclaimable slab info:\n"); + seq_buf_init(&s, buf, buf_size); + dump_unreclaimable_slab(&s); + print_string_as_lines(KERN_NOTICE, seq_buf_str(&s)); + + static unsigned long shrinkers_last_print; + + /* Ratelimit to at most once every 30 seconds */ + if (!shrinkers_last_print || + time_after(jiffies, shrinkers_last_print + HZ * 30)) { + shrinkers_last_print = jiffies; + + printk("Shrinkers:\n"); + seq_buf_init(&s, buf, buf_size); + shrinkers_to_text(&s); + print_string_as_lines(KERN_NOTICE, seq_buf_str(&s)); + } + + kfree(buf); + } } diff --git a/mm/shrinker.c b/mm/shrinker.c index 4a93fd433689..4a76364d2b7e 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/memcontrol.h> +#include <linux/rculist.h> #include <linux/rwsem.h> +#include <linux/seq_buf.h> #include <linux/shrinker.h> -#include <linux/rculist.h> #include <trace/events/vmscan.h> #include "internal.h" @@ -411,6 +412,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); + u64 start_time = 
ktime_get_ns(); /* * Normally, we should not scan less than batch_size objects in one @@ -461,6 +463,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); + unsigned long now = jiffies; + if (freed) { + atomic_long_add(freed, &shrinker->objects_freed); + shrinker->last_freed = now; + } + shrinker->last_scanned = now; + atomic_long_add(scanned, &shrinker->objects_requested_to_free); + + atomic64_add(ktime_get_ns() - start_time, &shrinker->ns_run); + + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } @@ -809,3 +822,95 @@ void shrinker_free(struct shrinker *shrinker) call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); + +void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker) +{ + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, +#ifdef CONFIG_MEMCG + .memcg = root_mem_cgroup, +#endif + }; + unsigned long nr_freed = atomic_long_read(&shrinker->objects_freed); + + seq_buf_printf(out, "%ps", shrinker->scan_objects); + if (shrinker->name) + seq_buf_printf(out, ": %s", shrinker->name); + seq_buf_putc(out, '\n'); + + seq_buf_printf(out, "objects: %lu\n", shrinker->count_objects(shrinker, &sc)); + seq_buf_printf(out, "requested to free: %lu\n", atomic_long_read(&shrinker->objects_requested_to_free)); + seq_buf_printf(out, "objects freed: %lu\n", nr_freed); + seq_buf_printf(out, "last scanned: %li sec ago\n", (jiffies - shrinker->last_scanned) / HZ); + seq_buf_printf(out, "last freed: %li sec ago\n", (jiffies - shrinker->last_freed) / HZ); + seq_buf_printf(out, "ns per object freed: %llu\n", nr_freed + ? div64_ul(atomic64_read(&shrinker->ns_run), nr_freed) + : 0); + + if (shrinker->to_text) { + shrinker->to_text(out, shrinker); + seq_buf_puts(out, "\n"); + } +} + +/** + * shrinkers_to_text - Report on shrinkers with highest usage + * + * This reports on the shrinkers with the highest object counts, in sorted order: + * intended to be used for OOM reporting. 
+ */ +void shrinkers_to_text(struct seq_buf *out) +{ + struct shrinker *shrinker; + struct shrinker_by_mem { + struct shrinker *shrinker; + unsigned long mem; + } shrinkers_by_mem[4]; + int i, nr = 0; + + if (!mutex_trylock(&shrinker_mutex)) { + seq_buf_puts(out, "(couldn't take shrinker lock)"); + return; + } + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, +#ifdef CONFIG_MEMCG + .memcg = root_mem_cgroup, +#endif + }; + unsigned long mem = shrinker->count_objects(shrinker, &sc); + + if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY) + continue; + + for (i = 0; i < nr; i++) + if (mem < shrinkers_by_mem[i].mem) + break; + + if (nr < ARRAY_SIZE(shrinkers_by_mem)) { + memmove(&shrinkers_by_mem[i + 1], + &shrinkers_by_mem[i], + sizeof(shrinkers_by_mem[0]) * (nr - i)); + nr++; + } else if (i) { + i--; + memmove(&shrinkers_by_mem[0], + &shrinkers_by_mem[1], + sizeof(shrinkers_by_mem[0]) * i); + } else { + continue; + } + + shrinkers_by_mem[i] = (struct shrinker_by_mem) { + .shrinker = shrinker, + .mem = mem, + }; + } + + for (i = nr - 1; i >= 0; --i) + shrinker_to_text(out, shrinkers_by_mem[i].shrinker); + + mutex_unlock(&shrinker_mutex); +} diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 20eaee3e97f7..a16c2848d332 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -2,6 +2,7 @@ #include <linux/idr.h> #include <linux/slab.h> #include <linux/debugfs.h> +#include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/shrinker.h> #include <linux/memcontrol.h> @@ -159,6 +160,21 @@ static const struct file_operations shrinker_debugfs_scan_fops = { .write = shrinker_debugfs_scan_write, }; +static int shrinker_debugfs_report_show(struct seq_file *m, void *v) +{ + struct shrinker *shrinker = m->private; + char *bufp; + size_t buflen = seq_get_buf(m, &bufp); + struct seq_buf out; + + seq_buf_init(&out, bufp, buflen); + shrinker_to_text(&out, shrinker); + seq_commit(m, seq_buf_used(&out)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_report); + int shrinker_debugfs_add(struct shrinker *shrinker) { struct dentry *entry; @@ -190,6 +206,8 @@ int shrinker_debugfs_add(struct shrinker *shrinker) &shrinker_debugfs_count_fops); debugfs_create_file("scan", 0220, entry, shrinker, &shrinker_debugfs_scan_fops); + debugfs_create_file("report", 0440, entry, shrinker, + &shrinker_debugfs_report_fops); return 0; } diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..09be83786b68 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -587,10 +587,12 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } +struct seq_buf; + #ifdef CONFIG_SLUB_DEBUG -void dump_unreclaimable_slab(void); +void dump_unreclaimable_slab(struct seq_buf *); #else -static inline void dump_unreclaimable_slab(void) +static inline void dump_unreclaimable_slab(struct seq_buf *out) { } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index bfe7c40eeee1..a365f146e669 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -27,6 +27,7 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> +#include <linux/seq_buf.h> #include <linux/stackdepot.h> #include <trace/events/rcu.h> @@ -1127,10 +1128,15 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -void dump_unreclaimable_slab(void) +void dump_unreclaimable_slab(struct seq_buf *out) { struct kmem_cache *s; struct slabinfo sinfo; + struct slab_by_mem { + struct kmem_cache *s; + size_t total, active; + } slabs_by_mem[10], n; + int i, nr = 
0; /* * Here acquiring slab_mutex is risky since we don't prefer to get @@ -1140,24 +1146,52 @@ void dump_unreclaimable_slab(void) * without acquiring the mutex. */ if (!mutex_trylock(&slab_mutex)) { - pr_warn("excessive unreclaimable slab but cannot dump stats\n"); + seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n"); return; } - pr_info("Unreclaimable slab info:\n"); - pr_info("Name Used Total\n"); - list_for_each_entry(s, &slab_caches, list) { if (s->flags & SLAB_RECLAIM_ACCOUNT) continue; get_slabinfo(s, &sinfo); - if (sinfo.num_objs > 0) - pr_info("%-17s %10luKB %10luKB\n", s->name, - (sinfo.active_objs * s->size) / 1024, - (sinfo.num_objs * s->size) / 1024); + if (!sinfo.num_objs) + continue; + + n.s = s; + n.total = sinfo.num_objs * s->size; + n.active = sinfo.active_objs * s->size; + + for (i = 0; i < nr; i++) + if (n.total < slabs_by_mem[i].total) + break; + + if (nr < ARRAY_SIZE(slabs_by_mem)) { + memmove(&slabs_by_mem[i + 1], + &slabs_by_mem[i], + sizeof(slabs_by_mem[0]) * (nr - i)); + nr++; + } else if (i) { + i--; + memmove(&slabs_by_mem[0], + &slabs_by_mem[1], + sizeof(slabs_by_mem[0]) * i); + } else { + continue; + } + + slabs_by_mem[i] = n; } + + for (i = nr - 1; i >= 0; --i) { + seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); + seq_buf_human_readable_u64(out, slabs_by_mem[i].total, STRING_UNITS_2); + seq_buf_printf(out, " active: "); + seq_buf_human_readable_u64(out, slabs_by_mem[i].active, STRING_UNITS_2); + seq_buf_putc(out, '\n'); + } + mutex_unlock(&slab_mutex); } |
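To round out the series, here is a hypothetical sketch of how a subsystem might plug into the new shrinker reporting hooks; my_cache, my_cache_count() and my_cache_scan() are invented for the example and are not part of this series. A shrinker fills in the new ->to_text callback, and shrinker_to_text() appends its output after the generic counters, so the extra lines show up both in the OOM-time "Shrinkers:" dump and in the per-shrinker debugfs report file added above.

/* Hypothetical shrinker wiring up the new ->to_text hook, for illustration. */
#include <linux/seq_buf.h>
#include <linux/shrinker.h>
#include <linux/string_helpers.h>

struct my_cache {
	struct shrinker	*shrink;
	atomic_long_t	nr_objects;
	atomic_long_t	bytes_used;
};

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	struct my_cache *c = shrink->private_data;

	return atomic_long_read(&c->nr_objects);
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	/* a real implementation would free up to sc->nr_to_scan objects here */
	return SHRINK_STOP;
}

/* Extra state appended after the generic counters in shrinker_to_text(). */
static void my_cache_to_text(struct seq_buf *out, struct shrinker *shrink)
{
	struct my_cache *c = shrink->private_data;

	seq_buf_printf(out, "cached objects:\t%ld\n",
		       atomic_long_read(&c->nr_objects));
	seq_buf_puts(out, "memory used:\t");
	seq_buf_human_readable_u64(out, atomic_long_read(&c->bytes_used),
				   STRING_UNITS_2);
	seq_buf_putc(out, '\n');
}

static int my_cache_register_shrinker(struct my_cache *c)
{
	c->shrink = shrinker_alloc(0, "my-cache");
	if (!c->shrink)
		return -ENOMEM;

	c->shrink->count_objects	= my_cache_count;
	c->shrink->scan_objects		= my_cache_scan;
	c->shrink->to_text		= my_cache_to_text;	/* new hook */
	c->shrink->private_data		= c;

	shrinker_register(c->shrink);
	return 0;
}

Using seq_buf throughout keeps the formatting independent of the sink: the same text can be returned from the debugfs report file, folded into shrinkers_to_text() for the OOM report, or pushed to the console via print_string_as_lines().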