author    Kent Overstreet <kent.overstreet@linux.dev>  2025-08-23 20:05:08 -0400
committer Kent Overstreet <kent.overstreet@linux.dev>  2025-09-17 11:37:16 -0400
commit    0e986243e41b7f5808b23f536852dbbbe6539e03
tree      0b82ebbd31607a42af1d05d54a4a5c6a304d9905
parent    9d8a38b10c56ccff4fa8010e3e8d4d1d3d60f8a5
bcachefs: bch2_set_rebalance_needs_scan_device()
Rebalance can now evacuate devices in response to state changes.

This obsoletes BCH_DATA_OP_migrate; setting a device to BCH_MEMBER_STATE_failed (perhaps we should rename this) will cause it to be evacuated, and the evacuate will resume if e.g. we crash or shut down and restart.

Additionally, we'll now be able to automatically evacuate failing devices. Currently we only set devices read-only in response to IO errors; we'll need to add configuration/policy/good heuristics (and clearly document them) for deciding when a device is failing and should be evacuated.

This works via rebalance scan cookies, which are currently used to respond to filesystem/inode option changes. Cookies in the range 1-4095 now refer to devices; when rebalance sees one of those it walks backpointers on that device and updates bch_extent_rebalance, which reacts to the new device state (or durability setting change).

Performance implications: with BCH_DATA_OP_migrate, we walk backpointers and do the data moves directly, meaning they happen in device LBA order. By walking backpointers to queue up rebalance work entries and then doing the work from the rebalance_work btree, we instead do the data moves in logical key order.

Pro: doing data moves in logical key order helps with fragmentation/data locality: extents from the same inode are moved at the same time, so we get a bit of defragmentation and do better at keeping related data together.

Con: reads from the device being evacuated are no longer sequential, which will hurt performance on spinning rust. Perhaps add a mode where we kick off data moves from do_rebalance_scan_bp()? That would be pretty easy.

XXX: slurp backpointers into a darray and sort before processing extents in do_rebalance_scan_device(): we recently saw a very slow evacuate that was mostly just dropping cached data, on a huge filesystem entirely on spinning rust with only 8GB of RAM in the server. The backpointers -> extents lookups are fairly random; batching + sorting would greatly improve performance.

XXX: add a superblock bit to make this transactional, in case we crash between the write_super for the member state/durability change and creating the device scan cookie.

XXX: new_needs_rb_allowed() should check for device scan cookies.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
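As a reference for the cookie convention described above, here is a minimal illustrative sketch (not code from the patch): cookie 0 requests a whole-filesystem scan, cookies 1-4095 select a device by index (encoded as dev + 1, matching bch2_set_rebalance_needs_scan_device()), and anything at or above BCACHEFS_ROOT_INO is an ordinary inode number. The helper name scan_cookie_to_device() is hypothetical, used only for illustration.

/*
 * Illustrative sketch only -- mirrors the dispatch this patch adds to
 * do_rebalance_scan(); assumes the usual bcachefs headers for u64 and
 * BCACHEFS_ROOT_INO (4096, so device cookies occupy 1-4095).
 */
static bool scan_cookie_to_device(u64 inum, unsigned *dev)
{
	if (!inum || inum >= BCACHEFS_ROOT_INO)
		return false;	/* whole-fs scan, or a real inode number */

	*dev = inum - 1;	/* device cookies are stored as dev + 1 */
	return true;
}

do_rebalance_scan() in the diff below performs this same dispatch inline: for a device cookie it sets scan_start/scan_end on BTREE_ID_backpointers and hands off to do_rebalance_scan_device().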
 fs/bcachefs/rebalance.c | 115
 fs/bcachefs/rebalance.h |   2
 fs/bcachefs/super.c     |   4
 fs/bcachefs/sysfs.c     |   4
 4 files changed, 122 insertions(+), 3 deletions(-)
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 0c29be7d662f..94de89d6a6cf 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
@@ -295,6 +296,29 @@ static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool
return ret;
}
+static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bkey_s_c k,
+ struct bch_devs_mask *v)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (v && test_bit(ptr->dev, v->d))
+ return 1;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ int ret = check_rebalance_scan_cookie(trans, ptr->dev + 1, NULL);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ if (v)
+ __set_bit(ptr->dev, v->d);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static int new_needs_rb_allowed(struct btree_trans *trans,
struct per_snapshot_io_opts *s,
struct bkey_s_c k,
@@ -358,6 +382,14 @@ static int new_needs_rb_allowed(struct btree_trans *trans,
if (ret)
return 0;
+ if (new_need_rb == BIT(BCH_REBALANCE_data_replicas)) {
+ ret = check_dev_rebalance_scan_cookie(trans, k, s ? &s->dev_cookie : NULL);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+
CLASS(printbuf, buf)();
prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n");
@@ -643,6 +675,11 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
return ret;
}
+int bch2_set_rebalance_needs_scan_device(struct bch_fs *c, unsigned dev)
+{
+ return bch2_set_rebalance_needs_scan(c, dev + 1);
+}
+
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
return bch2_set_rebalance_needs_scan(c, 0);
@@ -855,6 +892,72 @@ out:
return ret;
}
+static int do_rebalance_scan_bp(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct bkey_buf *last_flushed)
+{
+ if (bp.v->level) /* metadata not supported yet */
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
+ last_flushed);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!k.k)
+ return 0;
+
+ struct bch_inode_opts io_opts;
+ ret = bch2_extent_get_io_opts_one(trans, &io_opts, &iter, k,
+ SET_NEEDS_REBALANCE_opt_change);
+ bch2_trans_iter_exit(&iter);
+ return ret;
+}
+
+static int do_rebalance_scan_device(struct moving_context *ctxt,
+ unsigned dev, u64 cookie,
+ u64 *sectors_scanned)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ bch2_btree_write_buffer_flush_sync(trans);
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
+ POS(dev, 0), POS(dev, U64_MAX),
+ BTREE_ITER_prefetch, k, ({
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ do_rebalance_scan_bp(trans, bkey_s_c_to_backpointer(k), &last_flushed);
+ })) ?:
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, dev + 1, cookie));
+
+ *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
+ /*
+ * Ensure that the rebalance_work entries we created are seen by the
+ * next iteration of do_rebalance(), so we don't end up stuck in
+ * rebalance_wait():
+ */
+ *sectors_scanned += 1;
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ bch2_btree_write_buffer_flush_sync(trans);
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ return ret;
+}
+
static int do_rebalance_scan_indirect(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
struct per_snapshot_io_opts *snapshot_io_opts,
@@ -892,15 +995,21 @@ static int do_rebalance_scan(struct moving_context *ctxt,
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
+ r->state = BCH_REBALANCE_scanning;
+
if (!inum) {
r->scan_start = BBPOS_MIN;
r->scan_end = BBPOS_MAX;
- } else {
+ } else if (inum >= BCACHEFS_ROOT_INO) {
r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
- }
+ } else {
+ unsigned dev = inum - 1;
+ r->scan_start = BBPOS(BTREE_ID_backpointers, POS(dev, 0));
+ r->scan_end = BBPOS(BTREE_ID_backpointers, POS(dev, U64_MAX));
- r->state = BCH_REBALANCE_scanning;
+ return do_rebalance_scan_device(ctxt, inum - 1, cookie, sectors_scanned);
+ }
int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
r->scan_start.pos, r->scan_end.pos,
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index f40f670af046..e6dd9a7db26c 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -70,6 +70,7 @@ struct per_snapshot_io_opts {
u64 cur_inum;
bool fs_scan_cookie;
bool inum_scan_cookie;
+ struct bch_devs_mask dev_cookie;
struct bch_inode_opts fs_io_opts;
DARRAY(struct snapshot_io_opts_entry) d;
@@ -104,6 +105,7 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opt
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_rebalance_needs_scan_device(struct bch_fs *, unsigned);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c)
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 5cd308a68035..52c6823ae7a4 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1980,6 +1980,10 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
+ /* XXX: add a superblock bit to make this transactional */
+ if (new_state == BCH_MEMBER_STATE_failed)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
bch2_rebalance_wakeup(c);
return ret;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index ef6312c50f88..1f60854ab3ef 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -806,6 +806,10 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
if (!ca)
bch2_opt_set_by_id(&c->opts, id, v);
+ /* XXX: add a superblock bit to make this transactional */
+ if (id == Opt_durability)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
if (changed)
bch2_opt_hook_post_set(c, ca, 0, id, v);