diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-19 13:45:48 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-19 13:45:48 -0700 |
commit | 16dbfae867cdeb32f3d24cea81193793d5decc61 (patch) | |
tree | fee64885a8eb977b56ef913bb7a4f74a773c78b8 /fs/bcachefs/backpointers.c | |
parent | a90f1cd105c6c5c246f07ca371d873d35b78c7d9 (diff) | |
parent | 07f9a27f1969764d11374942961d51fee0ab628f (diff) |
Merge tag 'bcachefs-2024-05-19' of https://evilpiepirate.org/git/bcachefs
Pull bcachefs updates from Kent Overstreet:
- More safety fixes, primarily found by syzbot
- Run the upgrade/downgrade paths in nochnages mode. Nochanges mode is
primarily for testing fsck/recovery in dry run mode, so it shouldn't
change anything besides disabling writes and holding dirty metadata
in memory.
The idea here was to reduce the amount of activity if we can't write
anything out, so that bringing up a filesystem in "super ro" mode
would be more lilkely to work for data recovery - but norecovery is
the correct option for this.
- btree_trans->locked; we now track whether a btree_trans has any btree
nodes locked, and this is used for improved assertions related to
trans_unlock() and trans_relock(). We'll also be using it for
improving how we work with lockdep in the future: we don't want
lockdep to be tracking individual btree node locks because we take
too many for lockdep to track, and it's not necessary since we have a
cycle detector.
- Trigger improvements that are prep work for online fsck
- BTREE_TRIGGER_check_repair; this regularizes how we do some repair
work for extents that goes with running triggers in fsck, and fixes
some subtle issues with transaction restarts there.
- bch2_snapshot_equiv() has now been ripped out of fsck.c; snapshot
equivalence classes are for when snapshot deletion leaves behind
redundant snapshot nodes, but snapshot deletion now cleans this up
right away, so the abstraction doesn't need to leak.
- Improvements to how we resume writing to the journal in recovery. The
code for picking the new place to write when reading the journal is
greatly simplified and we also store the position in the superblock
for when we don't read the journal; this means that we preserve more
of the journal for list_journal debugging.
- Improvements to sysfs btree_cache and btree_node_cache, for debugging
memory reclaim.
- We now detect when we've blocked for 10 seconds on the allocator in
the write path and dump some useful info.
- Safety fixes for devices references: this is a big series that
changes almost all device lookups to properly check if the device
exists and take a reference to it.
Previously we assumed that if a bkey exists that references a device
then the device must exist, and this was enforced in .invalid
methods, but this was incorrect because it meant device removal
relied on accounting being correct to not leave keys pointing to
invalid devices, and that's not something we can assume.
Getting the "pointer to invalid device" checks out of our .invalid()
methods fixes some long standing device removal bugs; the only
outstanding bug with device removal now is a race between the discard
path and deleting alloc info, which should be easily fixed.
- The allocator now prefers not to expand the new
member_info.btree_allocated bitmap, meaning if repair ever requires
scanning for btree nodes (because of a corrupt interior nodes) we
won't have to scan the whole device(s).
- New coding style document, which among other things talks about the
correct usage of assertions
* tag 'bcachefs-2024-05-19' of https://evilpiepirate.org/git/bcachefs: (155 commits)
bcachefs: add no_invalid_checks flag
bcachefs: add counters for failed shrinker reclaim
bcachefs: Fix sb_field_downgrade validation
bcachefs: Plumb bch_validate_flags to sb_field_ops.validate()
bcachefs: s/bkey_invalid_flags/bch_validate_flags
bcachefs: fsync() should not return -EROFS
bcachefs: Invalid devices are now checked for by fsck, not .invalid methods
bcachefs: kill bch2_dev_bkey_exists() in bch2_check_fix_ptrs()
bcachefs: kill bch2_dev_bkey_exists() in bch2_read_endio()
bcachefs: bch2_dev_get_ioref() checks for device not present
bcachefs: bch2_dev_get_ioref2(); io_read.c
bcachefs: bch2_dev_get_ioref2(); debug.c
bcachefs: bch2_dev_get_ioref2(); journal_io.c
bcachefs: bch2_dev_get_ioref2(); io_write.c
bcachefs: bch2_dev_get_ioref2(); btree_io.c
bcachefs: bch2_dev_get_ioref2(); backpointers.c
bcachefs: bch2_dev_get_ioref2(); alloc_background.c
bcachefs: for_each_bset() declares loop iter
bcachefs: Move BCACHEFS_STATFS_MAGIC value to UAPI magic.h
bcachefs: Improve sysfs internal/btree_cache
...
Diffstat (limited to 'fs/bcachefs/backpointers.c')
-rw-r--r-- | fs/bcachefs/backpointers.c | 158 |
1 files changed, 88 insertions, 70 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index af7a71de1bdf..692b1c7d5018 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -23,6 +23,7 @@ static bool extent_matches_bp(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket2; struct bch_backpointer bp2; @@ -30,31 +31,43 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) + continue; + + bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) + !memcmp(&bp, &bp2, sizeof(bp))) { + rcu_read_unlock(); return true; + } } + rcu_read_unlock(); return false; } int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - /* these will be caught by fsck */ - if (!bch2_dev_exists2(c, bp.k->p.inode)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode); + if (!ca) { + /* these will be caught by fsck */ + rcu_read_unlock(); return 0; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); - struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); + struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); + rcu_read_unlock(); int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)), + !bpos_eq(bp.k->p, bp_pos), c, err, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); @@ -75,10 +88,16 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - if (bch2_dev_exists2(c, k.k->p.inode)) { + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode); + if (ca) { + struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + rcu_read_unlock(); prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + bch2_bpos_to_text(out, bucket); prt_str(out, " "); + } else { + rcu_read_unlock(); } bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); @@ -117,8 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch_err(c, "%s", buf.buf); } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - prt_printf(&buf, "backpointer not found when deleting"); - prt_newline(&buf); + prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); @@ -145,6 +163,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, @@ -161,7 +180,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -171,9 +190,9 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_slots| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) goto err; @@ -197,13 +216,13 @@ err: * Find the next backpointer >= *bp_offset: */ int bch2_get_next_backpointer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, int gen, struct bpos *bp_pos, struct bch_backpointer *bp, unsigned iter_flags) { - struct bch_fs *c = trans->c; - struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; struct bkey_s_c k; int ret = 0; @@ -213,7 +232,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, if (gen >= 0) { k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED|iter_flags); + bucket, BTREE_ITER_cached|iter_flags); ret = bkey_err(k); if (ret) goto out; @@ -223,7 +242,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, goto done; } - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, *bp_pos, iter_flags, k, ret) { @@ -249,7 +268,6 @@ static void backpointer_not_found(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); /* * If we're using the btree write buffer, the backpointer we were @@ -259,6 +277,10 @@ static void backpointer_not_found(struct btree_trans *trans, if (likely(!bch2_backpointers_no_use_write_buffer)) return; + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return; + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.level ? "btree node" : "extent"); prt_printf(&buf, "bucket: "); @@ -288,15 +310,17 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, { if (likely(!bp.level)) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct bkey_s_c k; + + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return bkey_s_c_err(-EIO); bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, 0, iter_flags); - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; @@ -325,18 +349,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bch_backpointer bp) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct btree *b; BUG_ON(!bp.level); + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return ERR_PTR(-EIO); + bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, bp.level - 1, 0); - b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -367,16 +393,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); + struct bpos bucket; + if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + if (fsck_err(c, backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bp_pos_to_bucket(c, k.k->p), 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); ret = bkey_err(alloc_k); if (ret) goto out; @@ -460,8 +486,8 @@ found: bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch_dev_bkey_exists(c, dev); - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + if (!ca) return false; data_buf = kvmalloc(bytes, GFP_KERNEL); @@ -511,25 +537,27 @@ static int check_bp_exists(struct btree_trans *trans, struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; struct bkey_buf tmp; - int ret; + int ret = 0; bch2_bkey_buf_init(&tmp); - if (!bch2_dev_bucket_exists(c, bucket)) { + struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); + if (!ca) { prt_str(&buf, "extent for nonexistent device:bucket "); bch2_bpos_to_text(&buf, bucket); prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - return -BCH_ERR_fsck_repair_unimplemented; + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; } if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) - return 0; + goto out; bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp.bucket_offset), + bucket_pos_to_bp(ca, bucket, bp.bucket_offset), 0); ret = bkey_err(bp_k); if (ret) @@ -562,6 +590,7 @@ fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&tmp, c); + bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -637,13 +666,13 @@ missing: struct bkey_i_backpointer n_bp_k; bkey_backpointer_init(&n_bp_k.k_i); - n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); n_bp_k.v = bp; prt_printf(&buf, "\n want: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); goto out; } @@ -667,7 +696,14 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (ca) + bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_unlock(); + + if (!ca) + continue; ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) @@ -760,7 +796,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, __for_each_btree_node(trans, iter, btree, btree == start.btree ? start.pos : POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ret) { mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = @@ -794,31 +830,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, while (level >= depth) { struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - level, - BTREE_ITER_PREFETCH); - while (1) { - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_btree_iter_peek(&iter); - if (!k.k) - break; - ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) - break; - if (bpos_eq(iter.pos, SPOS_MAX)) - break; - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, + BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + })); if (ret) return ret; @@ -936,7 +954,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bpos last_flushed_pos = SPOS_MAX; return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_PREFETCH, k, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), |