Diffstat (limited to 'libbcachefs/btree_io.c')
-rw-r--r--  libbcachefs/btree_io.c  57
1 file changed, 47 insertions(+), 10 deletions(-)
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 08b22bdd..84e302af 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates for the
+ * interior update path (alloc, backpointers), and if those updates
+ * touch new nodes and generate more rewrites - well, you see the
+ * problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer for
+ * the backpointer updates - fixing that needs some real thought on
+ * locking.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we need another index (all
+ * extents/btree nodes, by replicas entry) in order to fix this
+ * properly (re-replicate degraded data at the earliest possible
+ * time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
}
- }
+ }
if (!ptr_written) {
set_btree_node_need_rewrite(b);
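A note on the new check, not part of the patch itself: c->recovery.passes_complete is a bitmask with one bit per completed recovery pass (BIT_ULL(n) is the kernel's (1ULL << n)), so the degraded-node rewrite is only flagged once journal replay has finished. A minimal standalone sketch of that pattern, with hypothetical names standing in for the bcachefs ones:

#include <stdint.h>
#include <stdbool.h>

/* hypothetical pass list - not the real bcachefs enum */
enum recovery_pass {
	RECOVERY_PASS_alloc,
	RECOVERY_PASS_journal_replay,
	RECOVERY_PASS_check_extents,
};

struct recovery_state {
	uint64_t	passes_complete;	/* bit n set once pass n has run */
};

#define PASS_BIT(p)	(1ULL << (p))		/* stand-in for BIT_ULL() */

static inline void mark_pass_complete(struct recovery_state *r,
				      enum recovery_pass p)
{
	r->passes_complete |= PASS_BIT(p);
}

static inline bool pass_complete(const struct recovery_state *r,
				 enum recovery_pass p)
{
	return r->passes_complete & PASS_BIT(p);
}

With this, "are we past journal replay?" is a single load and mask, cheap enough to sit in the node read path.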
@@ -1771,7 +1798,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
struct bio *bio;
int ret;
- trace_and_count(c, btree_node_read, trans, b);
+ trace_btree_node(c, b, btree_node_read);
if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
!btree_node_read_all_replicas(c, b, sync))
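Unrelated to this hunk's one-line change, but worth noting for readers: static_branch_unlikely() means bch2_verify_all_btree_replicas is a static key, so the all-replicas verification costs a patched-out branch, not a load and compare, when the option is off. A sketch of the usual kernel idiom with a hypothetical key (kernel build environment assumed):

#include <linux/jump_label.h>

/* hypothetical debug knob, disabled by default */
DEFINE_STATIC_KEY_FALSE(verify_all_replicas);

static void expensive_replica_check(void);

static void read_one_node(void)
{
	/* a nop in the instruction stream until the key is enabled */
	if (static_branch_unlikely(&verify_all_replicas))
		expensive_replica_check();
}

/* flipped at runtime, e.g. from an option handler: */
/* static_branch_enable(&verify_all_replicas); */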
@@ -2505,7 +2532,17 @@ do_write:
c->opts.nochanges)
goto err;
- trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
+ if (trace_btree_node_write_enabled()) {
+ CLASS(printbuf, buf)();
+ printbuf_indent_add(&buf, 2);
+ prt_printf(&buf, "offset %u sectors %u bytes %u\n",
+ b->written,
+ sectors_to_write,
+ bytes_to_write);
+ bch2_btree_pos_to_text(&buf, c, b);
+ trace_btree_node_write(c, buf.buf);
+ }
+ count_event(c, btree_node_write);
wbio = container_of(bio_alloc_bioset(NULL,
buf_pages(data, sectors_to_write << 9),
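On the tracepoint rework above: tracepoints defined with TRACE_EVENT() generate a trace_<name>_enabled() helper, itself a static-key test, so the printbuf construction runs only when the tracepoint is actually live; count_event() still bumps the event counter unconditionally. The generic shape of the idiom, with a hypothetical tracepoint my_write (assumed declared via TRACE_EVENT(my_write, ...)):

static void emit_my_write_event(unsigned offset, unsigned sectors)
{
	if (trace_my_write_enabled()) {
		char buf[64];

		/* expensive formatting only when somebody is listening */
		snprintf(buf, sizeof(buf), "offset %u sectors %u",
			 offset, sectors);
		trace_my_write(buf);
	}
}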