diff options
Diffstat (limited to 'libbcachefs/btree_io.c')
-rw-r--r-- | libbcachefs/btree_io.c | 57 |
1 files changed, 47 insertions, 10 deletions
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 08b22bdd..84e302af 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - scoped_guard(rcu) - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_degraded(b); + /* + * XXX: + * + * We deadlock if too many btree updates require node rewrites while + * we're still in journal replay. + * + * This is because btree node rewrites generate more updates for the + * interior updates (alloc, backpointers), and if those updates touch + * new nodes and generate more rewrites - well, you see the problem. + * + * The biggest cause is that we don't use the btree write buffer (for + * the backpointer updates) - this needs some real thought on locking in + * order to fix. + * + * The problem with this workaround (not doing the rewrite for degraded + * nodes in journal replay) is that those degraded nodes persist, and we + * don't want that (this is a real bug when a btree node write completes + * with fewer replicas than we wanted and leaves a degraded node due to + * device _removal_, i.e. the device went away mid write). + * + * It's less of a bug here, but still a problem because we don't yet + * have a way of tracking degraded data - we need another index (all + * extents/btree nodes, by replicas entry) in order to fix properly + * (re-replicate degraded data at the earliest possible time). 
+ */ + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) { + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { + set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } } - } + } if (!ptr_written) { set_btree_node_need_rewrite(b); @@ -1771,7 +1798,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, struct bio *bio; int ret; - trace_and_count(c, btree_node_read, trans, b); + trace_btree_node(c, b, btree_node_read); if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && !btree_node_read_all_replicas(c, b, sync)) @@ -2505,7 +2532,17 @@ do_write: c->opts.nochanges) goto err; - trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); + if (trace_btree_node_write_enabled()) { + CLASS(printbuf, buf)(); + printbuf_indent_add(&buf, 2); + prt_printf(&buf, "offset %u sectors %u bytes %u\n", + b->written, + sectors_to_write, + bytes_to_write); + bch2_btree_pos_to_text(&buf, c, b); + trace_btree_node_write(c, buf.buf); + } + count_event(c, btree_node_write); wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), |