-rw-r--r--  fs/bcachefs/disk_accounting.c | 203
-rw-r--r--  fs/bcachefs/opts.c            |   9
-rw-r--r--  fs/bcachefs/opts.h            |  23
-rw-r--r--  fs/bcachefs/rebalance.c       |  67
-rw-r--r--  fs/bcachefs/super.c           |   4
-rw-r--r--  fs/bcachefs/sysfs.c           |   4
6 files changed, 195 insertions(+), 115 deletions(-)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b20ea162bfa3..73f50e5489b4 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -789,102 +789,10 @@ static struct journal_key *accumulate_and_read_journal_accounting(struct btree_t
 	return ret ? ERR_PTR(ret) : next;
 }
 
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
+static int accounting_read_mem_fixups(struct btree_trans *trans)
 {
+	struct bch_fs *c = trans->c;
 	struct bch_accounting_mem *acc = &c->accounting;
-	CLASS(btree_trans, trans)(c);
-	CLASS(printbuf, buf)();
-
-	/*
-	 * We might run more than once if we rewind to start topology repair or
-	 * btree node scan - and those might cause us to get different results,
-	 * so we can't just skip if we've already run.
-	 *
-	 * Instead, zero out any accounting we have:
-	 */
-	scoped_guard(percpu_write, &c->mark_lock) {
-		darray_for_each(acc->k, e)
-			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
-		for_each_member_device(c, ca)
-			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
-		percpu_memset(c->usage, 0, sizeof(*c->usage));
-	}
-
-	struct journal_keys *keys = &c->journal_keys;
-	struct journal_key *jk = keys->data;
-
-	move_gap(keys, keys->nr);
-
-	while (jk < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
-		jk++;
-
-	struct journal_key *end = jk;
-	while (end < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
-		end++;
-
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
-			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-	iter.flags &= ~BTREE_ITER_with_journal;
-	int ret = for_each_btree_key_continue(trans, iter,
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
-		if (k.k->type != KEY_TYPE_accounting)
-			continue;
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
-		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
-			jk->overwritten = true;
-			jk++;
-		}
-
-		if (jk < end &&
-		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		struct disk_accounting_pos acc_k;
-		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
-		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-			break;
-
-		if (!bch2_accounting_is_mem(&acc_k)) {
-			struct disk_accounting_pos next_acc;
-			memset(&next_acc, 0, sizeof(next_acc));
-			next_acc.type = acc_k.type + 1;
-			struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
-			if (jk < end)
-				next = bpos_min(next, journal_key_k(c, jk)->k.p);
-
-			bch2_btree_iter_set_pos(&iter, next);
-			continue;
-		}
-
-		accounting_read_key(trans, k);
-	}));
-	bch2_trans_iter_exit(&iter);
-	if (ret)
-		return ret;
-
-	while (jk < end)
-		jk = accumulate_and_read_journal_accounting(trans, jk);
-
-	struct journal_key *dst = keys->data;
-	darray_for_each(*keys, i)
-		if (!i->overwritten)
-			*dst++ = *i;
-	keys->gap = keys->nr = dst - keys->data;
-
 	CLASS(printbuf, underflow_err)();
 
 	scoped_guard(percpu_write, &c->mark_lock) {
@@ -905,7 +813,7 @@ int bch2_accounting_read(struct bch_fs *c)
 		 * Remove it, so that if it's re-added it gets re-marked in the
 		 * superblock:
 		 */
-		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+		int ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
 			? -BCH_ERR_remove_disk_accounting_entry
 			: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
@@ -987,7 +895,7 @@ int bch2_accounting_read(struct bch_fs *c)
 	if (underflow_err.pos) {
 		bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
 		unsigned pos = underflow_err.pos;
-		ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+		int ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
 					BCH_RECOVERY_PASS_check_allocations, 0);
 		print |= underflow_err.pos != pos;
@@ -997,7 +905,108 @@ int bch2_accounting_read(struct bch_fs *c)
 		return ret;
 	}
 
-	return ret;
+	return 0;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+	struct bch_accounting_mem *acc = &c->accounting;
+	CLASS(btree_trans, trans)(c);
+	CLASS(printbuf, buf)();
+
+	/*
+	 * We might run more than once if we rewind to start topology repair or
+	 * btree node scan - and those might cause us to get different results,
+	 * so we can't just skip if we've already run.
+	 *
+	 * Instead, zero out any accounting we have:
+	 */
+	scoped_guard(percpu_write, &c->mark_lock) {
+		darray_for_each(acc->k, e)
+			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+		for_each_member_device(c, ca)
+			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+		percpu_memset(c->usage, 0, sizeof(*c->usage));
+	}
+
+	struct journal_keys *keys = &c->journal_keys;
+	struct journal_key *jk = keys->data;
+
+	move_gap(keys, keys->nr);
+
+	while (jk < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+		jk++;
+
+	struct journal_key *end = jk;
+	while (end < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+		end++;
+
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+	iter.flags &= ~BTREE_ITER_with_journal;
+	int ret = for_each_btree_key_continue(trans, iter,
+				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+		if (k.k->type != KEY_TYPE_accounting)
+			continue;
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+			jk->overwritten = true;
+			jk++;
+		}
+
+		if (jk < end &&
+		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		struct disk_accounting_pos acc_k;
+		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+			break;
+
+		if (!bch2_accounting_is_mem(&acc_k)) {
+			struct disk_accounting_pos next_acc;
+			memset(&next_acc, 0, sizeof(next_acc));
+			next_acc.type = acc_k.type + 1;
+			struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
+			if (jk < end)
+				next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+			bch2_btree_iter_set_pos(&iter, next);
+			continue;
+		}
+
+		accounting_read_key(trans, k);
+	}));
+	bch2_trans_iter_exit(&iter);
+	if (ret)
+		return ret;
+
+	while (jk < end)
+		jk = accumulate_and_read_journal_accounting(trans, jk);
+
+	bch2_trans_unlock(trans);
+
+	struct journal_key *dst = keys->data;
+	darray_for_each(*keys, i)
+		if (!i->overwritten)
+			*dst++ = *i;
+	keys->gap = keys->nr = dst - keys->data;
+
+	return accounting_read_mem_fixups(trans);
 }
 
 int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca)
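The disk_accounting.c change is mostly a mechanical split: the second half of bch2_accounting_read() becomes accounting_read_mem_fixups(), and the one behavioral addition is the bch2_trans_unlock(trans) before the fixups run, so the in-memory pass no longer holds btree locks. The journal-key compaction it performs on the way out is the classic two-cursor in-place filter; a minimal standalone sketch of that idiom (plain C with stand-in types, not bcachefs code):

	#include <stdbool.h>
	#include <stddef.h>

	struct key { bool overwritten; /* ... payload ... */ };

	/* Drop overwritten entries in place, preserving the order of survivors. */
	static size_t compact_keys(struct key *keys, size_t nr)
	{
		struct key *dst = keys;

		for (struct key *i = keys; i < keys + nr; i++)
			if (!i->overwritten)
				*dst++ = *i;	/* self-copy is harmless until the first drop */

		return dst - keys;	/* new length */
	}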
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 78175d659e0e..e01c808e7893 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -654,7 +654,8 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
 
 	val = bch2_opt_val_synonym_lookup(name, val);
 
-	if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+	if (!(bch2_opt_table[id].flags & OPT_MOUNT) &&
+	    !(bch2_opt_table[id].flags & OPT_MOUNT_OLD))
 		return -BCH_ERR_option_name;
 
 	if ((id == Opt_usrquota ||
@@ -677,6 +678,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
 	if (ret < 0)
 		return -BCH_ERR_option_value;
 
+	if (bch2_opt_table[id].flags & OPT_MOUNT_OLD) {
+		pr_err("option %s may no longer be specified at mount time; set via sysfs opts dir",
+		       bch2_opt_table[id].attr.name);
+		return 0;
+	}
+
 	if (opts)
 		bch2_opt_set_by_id(opts, id, v);
 
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index de1ac235e929..68982196b5dc 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -67,6 +67,7 @@ enum opt_flags {
 	OPT_SB_FIELD_ILOG2	= BIT(9),	/* Superblock field is ilog2 of actual value */
 	OPT_SB_FIELD_ONE_BIAS	= BIT(10),	/* 0 means default value */
 	OPT_HIDDEN		= BIT(11),
+	OPT_MOUNT_OLD		= BIT(12),	/* May not be specified at mount time, but don't fail the mount */
 };
 
 enum opt_type {
@@ -150,12 +151,12 @@ enum fsck_err_opts {
 	  BCH_SB_WRITE_ERROR_TIMEOUT,	30,				\
 	  NULL,		"Number of consecutive write errors allowed before kicking out a device")\
   x(metadata_replicas,		u8,					\
-	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,			\
 	  OPT_UINT(1, BCH_REPLICAS_MAX + 1),				\
 	  BCH_SB_META_REPLICAS_WANT,	1,				\
 	  "#",		"Number of metadata replicas")			\
   x(data_replicas,		u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_UINT(1, BCH_REPLICAS_MAX + 1),				\
 	  BCH_SB_DATA_REPLICAS_WANT,	1,				\
 	  "#",		"Number of data replicas")			\
@@ -176,12 +177,12 @@ enum fsck_err_opts {
 	  BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10,			\
 	  "size",	"Maximum size of checksummed/compressed extents")\
   x(metadata_checksum,		u8,					\
-	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,			\
 	  OPT_STR(__bch2_csum_opts),					\
 	  BCH_SB_META_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
 	  NULL,		NULL)						\
   x(data_checksum,		u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_STR(__bch2_csum_opts),					\
 	  BCH_SB_DATA_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
 	  NULL,		NULL)						\
@@ -191,12 +192,12 @@ enum fsck_err_opts {
 	  BCH_SB_CSUM_ERR_RETRY_NR,	3,				\
 	  NULL,		NULL)						\
   x(compression,		u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_compression),					\
 	  BCH_SB_COMPRESSION_TYPE,	BCH_COMPRESSION_OPT_none,	\
 	  NULL,		NULL)						\
   x(background_compression,	u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_compression),					\
 	  BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none,	\
 	  NULL,		NULL)						\
@@ -206,27 +207,27 @@ enum fsck_err_opts {
 	  BCH_SB_STR_HASH_TYPE,		BCH_STR_HASH_OPT_siphash,	\
 	  NULL,		"Hash function for directory entries and xattrs")\
   x(metadata_target,		u16,					\
-	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,			\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_METADATA_TARGET,	0,				\
 	  "(target)",	"Device or label for metadata writes")		\
   x(foreground_target,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_FOREGROUND_TARGET,	0,				\
 	  "(target)",	"Device or label for foreground writes")	\
   x(background_target,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_BACKGROUND_TARGET,	0,				\
 	  "(target)",	"Device or label to move data to in the background")\
   x(promote_target,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_PROMOTE_TARGET,	0,				\
 	  "(target)",	"Device or label to promote data to on read")	\
   x(erasure_code,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_BOOL(),							\
 	  BCH_SB_ERASURE_CODE,		false,				\
 	  NULL,		"Enable erasure coding (DO NOT USE YET)")	\
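Together, the opts.c and opts.h hunks retire a set of options from mount-time use without breaking existing fstabs: options tagged OPT_MOUNT_OLD still parse and validate in bch2_parse_one_mount_opt(), but are then logged and ignored (return 0) rather than applied or treated as a mount failure. A condensed standalone sketch of that decision flow (the flag values and function here are illustrative, not bcachefs's):

	#include <stdio.h>

	#define OPT_MOUNT	(1U << 0)	/* allowed at mount time */
	#define OPT_MOUNT_OLD	(1U << 1)	/* parsed, but ignored at mount time */

	/* returns 0 if accepted or deliberately ignored, -1 if unknown */
	static int check_mount_opt(unsigned flags, const char *name)
	{
		if (!(flags & (OPT_MOUNT|OPT_MOUNT_OLD)))
			return -1;		/* not a mount option at all */

		if (flags & OPT_MOUNT_OLD) {
			fprintf(stderr, "option %s may no longer be set at mount time\n", name);
			return 0;		/* warn, but don't fail the mount */
		}

		return 0;			/* normal mount option */
	}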
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 27282616bcb0..0e40e7bd3441 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -11,6 +11,7 @@
 #include "clock.h"
 #include "compress.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "errcode.h"
 #include "error.h"
 #include "inode.h"
@@ -272,6 +273,8 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
 	if (durability < r.data_replicas ||
 	    durability >= r.data_replicas + min_durability)
 		r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
+	if (!unwritten && r.erasure_code != ec)
+		r.need_rb |= BIT(BCH_REBALANCE_erasure_code);
 
 	return r;
 }
@@ -321,6 +324,17 @@ static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bke
 	return 0;
 }
 
+static bool bkey_has_ec(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+
+	bkey_extent_entry_for_each(ptrs, entry)
+		if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+			return true;
+	return false;
+}
+
 static int new_needs_rb_allowed(struct btree_trans *trans,
 				struct per_snapshot_io_opts *s,
 				struct bkey_s_c k,
@@ -350,6 +364,14 @@ static int new_needs_rb_allowed(struct btree_trans *trans,
 	    ctx == SET_NEEDS_REBALANCE_opt_change_indirect)
 		return 0;
 
+	if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) &&
+	    !bkey_has_ec(k)) {
+		/* Foreground writes are not initially erasure coded - and we
+		 * may crash before a stripe is created
+		 */
+		new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code);
+	}
+
 	if (ctx == SET_NEEDS_REBALANCE_foreground) {
 		new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)|
 				 BIT(BCH_REBALANCE_background_target));
@@ -759,6 +781,23 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
 	return &(&darray_pop(buf))->k_i;
 }
 
+static int extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs)
+{
+	struct bch_fs *c = trans->c;
+
+	guard(rcu)();
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+		if (!ca)
+			continue;
+
+		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+		if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket)))
+			return true;
+	}
+	return false;
+}
+
 static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 			struct per_snapshot_io_opts *snapshot_io_opts,
 			struct bpos work_pos,
@@ -801,9 +840,12 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 		data_opts->target	= opts->background_target;
 	data_opts->write_flags |= BCH_WRITE_only_specified_devs;
 
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
 	if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) {
 		unsigned durability = bch2_bkey_durability(c, k);
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 		unsigned ptr_bit = 1;
 
 		guard(rcu)();
@@ -817,9 +859,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 			data_opts->extra_replicas = opts->data_replicas - durability;
 		} else {
-			const union bch_extent_entry *entry;
-			struct extent_ptr_decoded p;
-
 			bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 				unsigned d = bch2_extent_ptr_durability(c, &p);
 
@@ -843,6 +882,26 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 		}
 	}
 
+	if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) {
+		if (opts->erasure_code) {
+			/* XXX: we'll need ratelimiting */
+			if (extent_ec_pending(trans, ptrs))
+				return bkey_s_c_null;
+
+			data_opts->extra_replicas = opts->data_replicas;
+		} else {
+			unsigned ptr_bit = 1;
+			bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+				if (p.has_ec) {
+					data_opts->kill_ec_ptrs |= ptr_bit;
+					data_opts->extra_replicas += p.ec.redundancy;
+				}
+
+				ptr_bit <<= 1;
+			}
+		}
+	}
+
 	if (!data_opts->rewrite_ptrs &&
 	    !data_opts->kill_ptrs &&
 	    !data_opts->kill_ec_ptrs &&
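The rebalance.c hunks make erasure coding a rebalance-managed property: bch2_bkey_needs_rebalance() now flags written extents whose stripe state disagrees with the erasure_code option; new_needs_rb_allowed() exempts keys that carry no stripe pointer yet (foreground writes are not initially erasure coded, and a crash may land before a stripe exists); and next_rebalance_extent() either defers extents whose buckets are queued for stripe creation (extent_ec_pending()) or marks stale stripe pointers for removal via kill_ec_ptrs. A condensed sketch of the flagging logic, folding the first two checks into one function (simplified stand-ins, not the real types):

	#include <stdbool.h>

	#define BIT(n)	(1U << (n))

	enum { REB_data_replicas, REB_erasure_code };

	static unsigned ec_needs_rebalance(bool opt_erasure_code, bool key_has_stripe,
					   bool unwritten, bool foreground_write)
	{
		unsigned need_rb = 0;

		/* option and on-disk stripe state disagree -> rewrite candidate */
		if (!unwritten && opt_erasure_code != key_has_stripe)
			need_rb |= BIT(REB_erasure_code);

		/* foreground writes get their stripe later; don't flag them yet */
		if (foreground_write && !key_has_stripe)
			need_rb &= ~BIT(REB_erasure_code);

		return need_rb;
	}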
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 3984f3cee929..534ba2c0f83a 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1974,6 +1974,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 
 	bch_notice(ca, "%s", bch2_member_states[new_state]);
 
+	if (new_state == BCH_MEMBER_STATE_failed)
+		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
 	scoped_guard(mutex, &c->sb_lock) {
 		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 		SET_BCH_MEMBER_STATE(m, new_state);
@@ -1983,7 +1986,6 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 	if (new_state == BCH_MEMBER_STATE_rw)
 		__bch2_dev_read_write(c, ca);
 
-	/* XXX: add a superblock bit to make this transactional */
 	if (new_state == BCH_MEMBER_STATE_failed)
 		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
 
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 1f60854ab3ef..85ccc084ecd9 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -793,6 +793,9 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
 	bool is_sb = opt->get_sb || opt->get_member;
 	bool changed = false;
 
+	if (id == Opt_durability)
+		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
 	if (is_sb) {
 		changed = bch2_opt_set_sb(c, ca, opt, v);
 	} else if (!ca) {
@@ -806,7 +809,6 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
 	if (!ca)
 		bch2_opt_set_by_id(&c->opts, id, v);
 
-	/* XXX: add a superblock bit to make this transactional */
 	if (id == Opt_durability)
 		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
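The super.c and sysfs.c hunks are the same fix applied in two places: the rebalance scan cookie is now set before the superblock write that records the new device state or durability, which is why the removed "XXX: add a superblock bit to make this transactional" comments are no longer needed. A crash between the two steps now leaves at worst a spurious rescan, never a changed option with no scan. The ordering pattern as a standalone sketch (function names illustrative, with stubs standing in for the real persistence calls):

	/* stubs: e.g. bch2_set_rebalance_needs_scan_device() / bch2_opt_set_sb() */
	static int persist_scan_cookie(void)   { return 0; }
	static int persist_option_change(void) { return 0; }

	static int change_option_durably(void)
	{
		int ret;

		ret = persist_scan_cookie();	/* 1: record that a rescan is owed */
		if (ret)
			return ret;

		ret = persist_option_change();	/* 2: the actual state change */
		if (ret)
			return ret;

		/* crash after 1, before 2: spurious but harmless rescan	*/
		/* crash after 2: cookie already on disk, rescan still runs	*/
		return 0;
	}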