diff options
Diffstat (limited to 'fs/bcachefs')
37 files changed, 482 insertions, 291 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index c43aaab4c108..cb25cddb759b 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -532,10 +532,6 @@ static int check_bp_exists(struct btree_trans *trans, struct btree_iter other_extent_iter = {}; CLASS(printbuf, buf)(); - if (bpos_lt(bp->k.p, s->bp_start) || - bpos_gt(bp->k.p, s->bp_end)) - return 0; - CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, 0); struct bkey_s_c bp_k = bch2_btree_iter_peek_slot(&bp_iter); int ret = bkey_err(bp_k); @@ -690,6 +686,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + if (bpos_lt(bp.k.p, s->bp_start) || + bpos_gt(bp.k.p, s->bp_end)) + continue; + int ret = !empty ? check_bp_exists(trans, s, &bp, k) : bch2_bucket_backpointer_mod(trans, k, &bp, true); @@ -809,8 +809,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { - /* btree_type_has_ptrs should probably include BTREE_ID_stripes, - * definitely her... */ int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; ret = commit_do(trans, NULL, NULL, @@ -899,7 +897,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointer_bucket_gen && (bp.v->bucket_gen != a->gen || bp.v->pad)) { ret = bch2_backpointer_del(trans, bp_k.k->p); @@ -931,6 +929,14 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (sectors[ALLOC_dirty] != a->dirty_sectors || sectors[ALLOC_cached] != a->cached_sectors || sectors[ALLOC_stripe] != a->stripe_sectors) { + /* + * Post 1.14 upgrade, we assume that backpointers are mostly + * correct and a sector count mismatch is probably due to a + * write buffer race + * + * Pre upgrade, we expect all the buckets to be wrong, a write + * buffer flush is pointless: + */ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); if (ret) @@ -978,12 +984,22 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) goto next; struct bpos bucket = bp_pos_to_bucket(ca, pos); - u64 next = ca->mi.nbuckets; - - unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); - if (bitmap) - next = min_t(u64, next, - find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); + u64 next = min(bucket.offset, ca->mi.nbuckets); + + unsigned long *mismatch = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); + unsigned long *empty = READ_ONCE(ca->bucket_backpointer_empty.buckets); + /* + * Find the first bucket with mismatches - but + * not empty buckets; we don't need to pin those + * because we just recreate all backpointers in + * those buckets + */ + if (mismatch && empty) + next = find_next_andnot_bit(mismatch, empty, ca->mi.nbuckets, next); + else if (mismatch) + next = find_next_bit(mismatch, ca->mi.nbuckets, next); + else + next = ca->mi.nbuckets; bucket.offset = next; if (bucket.offset == ca->mi.nbuckets) @@ -1110,17 +1126,18 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) if (ret) goto err; - u64 nr_buckets = 0, nr_mismatches = 0; + u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; for_each_member_device(c, ca) { nr_buckets += ca->mi.nbuckets; nr_mismatches += ca->bucket_backpointer_mismatch.nr; + nr_empty += ca->bucket_backpointer_empty.nr; } if (!nr_mismatches) goto err; - bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches, nr_buckets); + bch_info(c, "scanning for missing backpointers in %llu/%llu buckets, %llu buckets with no backpointers", + nr_mismatches - nr_empty, nr_buckets, nr_empty); while (1) { ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 80a48548ddd5..b2de993d802b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -706,7 +706,8 @@ struct bch_sb_field_ext { x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ x(fast_device_removal, BCH_VERSION(1, 27)) \ x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \ - x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) + x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \ + x(31bit_dirent_offset, BCH_VERSION(1, 30)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ae7d260589d8..43f294284d57 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -356,7 +356,7 @@ again: bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); BUG_ON(bpos_gt(k.k->p, b->data->max_key)); @@ -470,7 +470,7 @@ again: bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); bch2_btree_and_journal_iter_advance(&iter); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 276cf088539e..2e3dd9bacac5 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -131,10 +131,10 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; - p = kvmalloc(size, GFP_NOWAIT); + p = kvmalloc(size, GFP_NOWAIT|__GFP_ACCOUNT|__GFP_RECLAIMABLE); if (!p) { *used_mempool = true; - p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS|__GFP_ACCOUNT|__GFP_RECLAIMABLE); } memalloc_nofs_restore(flags); return p; @@ -1014,6 +1014,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k = bkey_p_next(k); continue; drop_this_key: + ret = 0; next_good_key = k->u64s; if (!next_good_key || @@ -1470,7 +1471,7 @@ start: } prt_newline(&buf); - if (failed.nr) + if (ret || failed.nr) bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); async_object_list_del(c, btree_read_bio, rb->list_idx); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 546b559fe3ce..76f430f93dc1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -650,7 +650,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v; if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = + const struct bkey_i *j_k = bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); @@ -848,7 +848,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p break; bch2_btree_and_journal_iter_advance(jiter); - k = bch2_btree_and_journal_iter_peek(jiter); + k = bch2_btree_and_journal_iter_peek(c, jiter); if (!k.k) break; @@ -898,7 +898,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); - k = bch2_btree_and_journal_iter_peek(&jiter); + k = bch2_btree_and_journal_iter_peek(c, &jiter); if (!k.k) { CLASS(printbuf, buf)(); @@ -2120,10 +2120,10 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_ } } -static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_pos, - struct bpos end_pos) +static const struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos search_pos, + struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); @@ -2139,7 +2139,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); + const struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); if (k) { iter->k = k->k; @@ -2156,7 +2156,7 @@ void btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *next_journal = + const struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, search_key, k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { @@ -2165,10 +2165,10 @@ void btree_trans_peek_journal(struct btree_trans *trans, } } -static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_key, - struct bpos end_pos) +static const struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos search_key, + struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); @@ -2186,7 +2186,7 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *next_journal = + const struct bkey_i *next_journal = bch2_btree_journal_peek_prev(trans, iter, search_key, k->k ? k->k->p : path_l(path)->b->data->min_key); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 24f2fbe84ad7..65bb4a931cd8 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -46,21 +46,22 @@ static size_t __bch2_journal_key_search(struct journal_keys *keys, enum btree_id id, unsigned level, struct bpos pos) { + struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys); size_t l = 0, r = keys->nr, m; while (l < r) { m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + if (__journal_key_cmp(c, id, level, pos, idx_to_key(keys, m)) > 0) l = m + 1; else r = m; } BUG_ON(l < keys->nr && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l)) > 0); BUG_ON(l && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l - 1)) <= 0); return l; } @@ -72,10 +73,20 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); } +static inline struct journal_key_range_overwritten *__overwrite_range(struct journal_keys *keys, u32 idx) +{ + return idx ? keys->overwrites.data + idx : NULL; +} + +static inline struct journal_key_range_overwritten *overwrite_range(struct journal_keys *keys, u32 idx) +{ + return idx ? rcu_dereference(keys->overwrites.data) + idx : NULL; +} + /* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) +const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) { struct journal_keys *keys = &c->journal_keys; unsigned iters = 0; @@ -87,7 +98,7 @@ search: *idx = __bch2_journal_key_search(keys, btree_id, level, pos); while (*idx && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { --(*idx); iters++; if (iters == 10) { @@ -96,23 +107,23 @@ search: } } - struct bkey_i *ret = NULL; + const struct bkey_i *ret = NULL; rcu_read_lock(); /* for overwritten_ranges */ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + if (__journal_key_cmp(c, btree_id, level, end_pos, k) < 0) break; if (k->overwritten) { if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->end; + *idx = overwrite_range(keys, k->overwritten_range)->end; else *idx += 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { - ret = k->k; + if (__journal_key_cmp(c, btree_id, level, pos, k) <= 0) { + ret = journal_key_k(c, k); break; } @@ -129,9 +140,9 @@ search: return ret; } -struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) +const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) { struct journal_keys *keys = &c->journal_keys; unsigned iters = 0; @@ -146,7 +157,7 @@ search: *idx = __bch2_journal_key_search(keys, btree_id, level, pos); while (*idx < keys->nr && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { + __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { (*idx)++; iters++; if (iters == 10) { @@ -158,25 +169,25 @@ search: if (*idx == keys->nr) --(*idx); - struct bkey_i *ret = NULL; + const struct bkey_i *ret = NULL; rcu_read_lock(); /* for overwritten_ranges */ while (true) { k = idx_to_key(keys, *idx); - if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) + if (__journal_key_cmp(c, btree_id, level, end_pos, k) > 0) break; if (k->overwritten) { if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start; + *idx = overwrite_range(keys, k->overwritten_range)->start; if (!*idx) break; --(*idx); continue; } - if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { - ret = k->k; + if (__journal_key_cmp(c, btree_id, level, pos, k) >= 0) { + ret = journal_key_k(c, k); break; } @@ -194,8 +205,8 @@ search: return ret; } -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) +const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) { size_t idx = 0; @@ -264,13 +275,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, struct journal_key n = { .btree_id = id, .level = level, - .k = k, .allocated = true, - /* - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ - .journal_seq = U64_MAX, + .allocated_k = k, }; struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); @@ -278,8 +284,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && - journal_key_cmp(&n, &keys->data[idx]) == 0) { - struct bkey_i *o = keys->data[idx].k; + journal_key_cmp(c, &n, &keys->data[idx]) == 0) { + struct bkey_i *o = journal_key_k(c, &keys->data[idx]); if (k->k.type == KEY_TYPE_accounting && o->k.type == KEY_TYPE_accounting) { @@ -291,7 +297,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, } if (keys->data[idx].allocated) - kfree(keys->data[idx].k); + kfree(keys->data[idx].allocated_k); keys->data[idx] = n; return 0; } @@ -376,17 +382,20 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bpos pos) { - struct journal_keys *keys = &trans->c->journal_keys; + if (!trans->journal_replay_not_finished) + return false; + + struct bch_fs *c = trans->c; + struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, btree, level, pos); - if (!trans->journal_replay_not_finished) + if (idx >= keys->size || + keys->data[idx].btree_id != btree || + keys->data[idx].level != level) return false; - return (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - bkey_deleted(&keys->data[idx].k->k)); + struct bkey_i *k = journal_key_k(c, &keys->data[idx]); + return bpos_eq(k->k.p, pos) && bkey_deleted(&k->k); } static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) @@ -403,9 +412,9 @@ static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos bool next_overwritten = next && next->overwritten; struct journal_key_range_overwritten *prev_range = - prev_overwritten ? prev->overwritten_range : NULL; + prev_overwritten ? overwrite_range(keys, prev->overwritten_range) : NULL; struct journal_key_range_overwritten *next_range = - next_overwritten ? next->overwritten_range : NULL; + next_overwritten ? overwrite_range(keys, next->overwritten_range) : NULL; BUG_ON(prev_range && prev_range->end != idx); BUG_ON(next_range && next_range->start != idx + 1); @@ -413,37 +422,47 @@ static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos if (prev_range && next_range) { prev_range->end = next_range->end; - keys->data[pos].overwritten_range = prev_range; + keys->data[pos].overwritten_range = prev->overwritten_range; + + u32 old = next->overwritten_range; + for (size_t i = next_range->start; i < next_range->end; i++) { struct journal_key *ip = keys->data + idx_to_pos(keys, i); - BUG_ON(ip->overwritten_range != next_range); - ip->overwritten_range = prev_range; + BUG_ON(ip->overwritten_range != old); + ip->overwritten_range = prev->overwritten_range; } - - kfree_rcu_mightsleep(next_range); } else if (prev_range) { prev_range->end++; - k->overwritten_range = prev_range; + k->overwritten_range = prev->overwritten_range; if (next_overwritten) { prev_range->end++; - next->overwritten_range = prev_range; + next->overwritten_range = prev->overwritten_range; } } else if (next_range) { next_range->start--; - k->overwritten_range = next_range; + k->overwritten_range = next->overwritten_range; if (prev_overwritten) { next_range->start--; - prev->overwritten_range = next_range; + prev->overwritten_range = next->overwritten_range; } } else if (prev_overwritten || next_overwritten) { - struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); - if (!r) + /* 0 is a sentinel value */ + if (darray_resize_rcu(&keys->overwrites, max(keys->overwrites.nr + 1, 2))) return; - r->start = idx - (size_t) prev_overwritten; - r->end = idx + 1 + (size_t) next_overwritten; + if (!keys->overwrites.nr) + darray_push(&keys->overwrites, (struct journal_key_range_overwritten) {}); + + darray_push(&keys->overwrites, ((struct journal_key_range_overwritten) { + .start = idx - (size_t) prev_overwritten, + .end = idx + 1 + (size_t) next_overwritten, + })); + + smp_wmb(); + u32 r = keys->overwrites.nr - 1; + + k->overwritten_range = r; - rcu_assign_pointer(k->overwritten_range, r); if (prev_overwritten) prev->overwritten_range = r; if (next_overwritten) @@ -457,11 +476,15 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, btree, level, pos); - if (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - !keys->data[idx].overwritten) { + if (idx >= keys->size || + keys->data[idx].btree_id != btree || + keys->data[idx].level != level || + keys->data[idx].overwritten) + return; + + struct bkey_i *k = journal_key_k(c, &keys->data[idx]); + + if (bpos_eq(k->k.p, pos)) { guard(mutex)(&keys->overwrite_lock); __bch2_journal_key_overwritten(keys, idx); } @@ -476,7 +499,7 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) } } -static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +static struct bkey_s_c bch2_journal_iter_peek(struct bch_fs *c, struct journal_iter *iter) { journal_iter_verify(iter); @@ -490,10 +513,10 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) BUG_ON(cmp); if (!k->overwritten) - return bkey_i_to_s_c(k->k); + return bkey_i_to_s_c(journal_key_k(c, k)); if (k->overwritten_range) - iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); + iter->idx = idx_to_pos(iter->keys, overwrite_range(iter->keys, k->overwritten_range)->end); else bch2_journal_iter_advance(iter); } @@ -554,7 +577,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter while (nr--) { bch2_btree_and_journal_iter_advance(&iter); - struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(c, &iter); if (!k.k) break; @@ -565,7 +588,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter bch2_bkey_buf_exit(&tmp, c); } -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *c, struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; size_t iters = 0; @@ -586,7 +609,7 @@ again: bch2_journal_iter_advance_btree(iter); if (iter->trans->journal_replay_not_finished) - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + while ((journal_k = bch2_journal_iter_peek(c, &iter->journal)).k && bpos_lt(journal_k.k->p, iter->pos)) bch2_journal_iter_advance(&iter->journal); @@ -658,15 +681,22 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, /* * When keys compare equal, oldest compares first: */ -static int journal_sort_key_cmp(const void *_l, const void *_r) +static int journal_sort_key_cmp(const void *_l, const void *_r, const void *priv) { + struct bch_fs *c = (void *) priv; const struct journal_key *l = _l; const struct journal_key *r = _r; int rewind = l->rewind && r->rewind ? -1 : 1; - return journal_key_cmp(l, r) ?: - ((cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset)) * rewind); + int cmp = journal_key_cmp(c, l, r); + if (cmp) + return cmp; + + if (l->allocated || r->allocated) + return cmp_int(l->allocated, r->allocated); + + return ((cmp_int(l->journal_seq_offset, r->journal_seq_offset) ?: + cmp_int(l->journal_offset, r->journal_offset)) * rewind); } void bch2_journal_keys_put(struct bch_fs *c) @@ -680,20 +710,16 @@ void bch2_journal_keys_put(struct bch_fs *c) move_gap(keys, keys->nr); - darray_for_each(*keys, i) { - if (i->overwritten_range && - (i == &darray_last(*keys) || - i->overwritten_range != i[1].overwritten_range)) - kfree(i->overwritten_range); - + darray_for_each(*keys, i) if (i->allocated) - kfree(i->k); - } + kfree(i->allocated_k); kvfree(keys->data); keys->data = NULL; keys->nr = keys->gap = keys->size = 0; + darray_exit(&keys->overwrites); + struct journal_replay **i; struct genradix_iter iter; @@ -704,8 +730,10 @@ void bch2_journal_keys_put(struct bch_fs *c) static void __journal_keys_sort(struct journal_keys *keys) { - sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), - journal_sort_key_cmp, NULL); + struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys); + + sort_r_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), + journal_sort_key_cmp, NULL, c); cond_resched(); @@ -717,9 +745,10 @@ static void __journal_keys_sort(struct journal_keys *keys) * compare each individual accounting key against the version in * the btree during replay: */ - if (src->k->k.type != KEY_TYPE_accounting && + struct bkey_i *k = journal_key_k(c, src); + if (k->k.type != KEY_TYPE_accounting && src + 1 < &darray_top(*keys) && - !journal_key_cmp(src, src + 1)) + !journal_key_cmp(c, src, src + 1)) continue; *dst++ = *src; @@ -763,8 +792,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) .btree_id = entry->btree_id, .level = entry->level, .rewind = rewind, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), + .journal_seq_offset = journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)), .journal_offset = k->_data - i->j._data, }; @@ -801,13 +829,18 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, move_gap(keys, keys->nr); - darray_for_each(*keys, i) + darray_for_each(*keys, i) { + struct bkey_i *k = journal_key_k(c, i); + if (!(i->btree_id == btree && i->level >= level_min && i->level <= level_max && - bpos_ge(i->k->k.p, start) && - bpos_le(i->k->k.p, end))) + bpos_ge(k->k.p, start) && + bpos_le(k->k.p, end))) keys->data[dst++] = *i; + else if (i->allocated) + kfree(i->allocated_k); + } keys->nr = keys->gap = dst; } @@ -825,7 +858,7 @@ void bch2_journal_keys_dump(struct bch_fs *c) prt_printf(&buf, "btree="); bch2_btree_id_to_text(&buf, i->btree_id); prt_printf(&buf, " l=%u ", i->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(journal_key_k(c, i))); pr_err("%s", buf.buf); } } diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 2a3082919b8d..8dc8e778be6c 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -29,6 +29,22 @@ struct btree_and_journal_iter { bool fail_if_too_many_whiteouts; }; +static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) +{ + return (seq - c->journal_entries_base_seq) & (~0U >> 1); +} + +static inline struct bkey_i *journal_key_k(struct bch_fs *c, + const struct journal_key *k) +{ + if (k->allocated) + return k->allocated_k; + + struct journal_replay *i = *genradix_ptr(&c->journal_entries, k->journal_seq_offset); + + return (struct bkey_i *) (i->j._data + k->journal_offset); +} + static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, unsigned l_level, const struct journal_key *r) @@ -37,25 +53,28 @@ static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, cmp_int(l_btree_id, r->btree_id); } -static inline int __journal_key_cmp(enum btree_id l_btree_id, +static inline int __journal_key_cmp(struct bch_fs *c, + enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, const struct journal_key *r) { return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: - bpos_cmp(l_pos, r->k->k.p); + bpos_cmp(l_pos, journal_key_k(c, r)->k.p); } -static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +static inline int journal_key_cmp(struct bch_fs *c, + const struct journal_key *l, const struct journal_key *r) { - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); + return __journal_key_cmp(c, l->btree_id, l->level, + journal_key_k(c, l)->k.p, r); } -struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, +const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, +const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, +const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, @@ -71,7 +90,7 @@ bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *, struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h index 86aacb254fb2..4495fc92f848 100644 --- a/fs/bcachefs/btree_journal_iter_types.h +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -2,21 +2,47 @@ #ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H #define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H +struct journal_ptr { + bool csum_good; + struct bch_csum csum; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; +}; + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; + + bool csum_good; + bool ignore_blacklisted; + bool ignore_not_dirty; + /* must be last: */ + struct jset j; +}; + struct journal_key_range_overwritten { size_t start, end; }; struct journal_key { - u64 journal_seq; - u32 journal_offset; + union { + struct { + u32 journal_seq_offset; + u32 journal_offset; + }; + struct bkey_i *allocated_k; + }; enum btree_id btree_id:8; unsigned level:8; bool allocated:1; bool overwritten:1; bool rewind:1; - struct journal_key_range_overwritten __rcu * - overwritten_range; - struct bkey_i *k; + u32 overwritten_range; }; struct journal_keys { @@ -31,7 +57,9 @@ struct journal_keys { size_t gap; atomic_t ref; bool initial_ref_held; + struct mutex overwrite_lock; + DARRAY(struct journal_key_range_overwritten) overwrites; }; #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 4d58bdb233e9..5fa7f2f9f1e9 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -54,7 +54,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = + const struct bkey_i *j_k = bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); if (j_k) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 053a837cf241..b70eb095a37e 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -403,7 +403,7 @@ __btree_trans_update_by_path(struct btree_trans *trans, i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = + const struct bkey_i *j_k = bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); if (j_k) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 65ca54c5b0ff..a9877a47bfc6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -95,7 +95,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!b->c.level) goto out; - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) goto out; diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index e86d36d23e9e..c06c81e842e4 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -5,7 +5,8 @@ #include <linux/vmalloc.h> #include "darray.h" -int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp, + bool rcu) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); @@ -20,18 +21,25 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_ if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) return -ENOMEM; - void *data = likely(bytes < INT_MAX) + void *old = d->data; + void *new = likely(bytes < INT_MAX) ? kvmalloc_noprof(bytes, gfp) : vmalloc_noprof(bytes); - if (!data) + if (!new) return -ENOMEM; if (d->size) - memcpy(data, d->data, d->size * element_size); - if (d->data != d->preallocated) - kvfree(d->data); - d->data = data; + memcpy(new, old, d->size * element_size); + + rcu_assign_pointer(d->data, new); d->size = new_size; + + if (old != d->preallocated) { + if (!rcu) + kvfree(old); + else + kvfree_rcu_mightsleep(old); + } } return 0; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 4080ee99aadd..b4f284fe9652 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -34,17 +34,17 @@ typedef DARRAY(s16) darray_s16; typedef DARRAY(s32) darray_s32; typedef DARRAY(s64) darray_s64; -int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); +int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t, bool); #define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) -#define __darray_resize(_d, _element_size, _new_size, _gfp) \ +#define __darray_resize(_d, _element_size, _new_size, _gfp, _rcu) \ (unlikely((_new_size) > (_d)->size) \ - ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ + ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp), _rcu)\ : 0) #define darray_resize_gfp(_d, _new_size, _gfp) \ - __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) + __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp, false) #define darray_resize(_d, _new_size) \ darray_resize_gfp(_d, _new_size, GFP_KERNEL) @@ -55,6 +55,12 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_make_room(_d, _more) \ darray_make_room_gfp(_d, _more, GFP_KERNEL) +#define darray_resize_rcu(_d, _new_size) \ + __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), GFP_KERNEL, true) + +#define darray_make_room_rcu(_d, _more) \ + darray_resize_rcu((_d), (_d)->nr + (_more)) + #define darray_room(_d) ((_d).size - (_d).nr) #define darray_top(_d) ((_d).data[(_d).nr]) @@ -107,8 +113,11 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) +#define darray_for_each_from(_d, _i, _start) \ + for (typeof(&(_d).data[0]) _i = _start; _i < (_d).data + (_d).nr; _i++) + #define darray_for_each(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) + darray_for_each_from(_d, _i, (_d).data) #define darray_for_each_reverse(_d, _i) \ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index cb44b35e0f1d..fe6f3d874a47 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -95,7 +95,7 @@ static u64 bch2_dirent_hash(const struct bch_hash_info *info, bch2_str_hash_update(&ctx, info, name->name, name->len); /* [0,2) reserved for dots */ - return max_t(u64, bch2_str_hash_end(&ctx, info), 2); + return max_t(u64, bch2_str_hash_end(&ctx, info, true), 2); } static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 5944ad6d0f8d..5ec57b710501 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -765,75 +765,77 @@ int bch2_accounting_read(struct bch_fs *c) iter.flags &= ~BTREE_ITER_with_journal; int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - if (k.k->type != KEY_TYPE_accounting) - continue; + if (k.k->type != KEY_TYPE_accounting) + continue; - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); - continue; - } + if (!bch2_accounting_is_mem(&acc_k)) { + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + continue; + } - accounting_read_key(trans, k); - })); + accounting_read_key(trans, k); + })); bch2_trans_iter_exit(&iter); if (ret) return ret; struct journal_keys *keys = &c->journal_keys; - struct journal_key *dst = keys->data; move_gap(keys, keys->nr); darray_for_each(*keys, i) { - if (i->k->k.type == KEY_TYPE_accounting) { + if (i->overwritten) + continue; + + struct bkey_i *k = journal_key_k(c, i); + + if (k->k.type == KEY_TYPE_accounting) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + bpos_to_disk_accounting_pos(&acc_k, k->k.p); if (!bch2_accounting_is_mem(&acc_k)) continue; - struct bkey_s_c k = bkey_i_to_s_c(i->k); unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &k.k->p); + accounting_pos_cmp, &k->k.p); bool applied = idx < acc->k.nr && - bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; + bversion_cmp(acc->k.data[idx].bversion, k->k.bversion) >= 0; if (applied) continue; - if (i + 1 < &darray_top(*keys) && - i[1].k->k.type == KEY_TYPE_accounting && - !journal_key_cmp(i, i + 1)) { - WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); + darray_for_each_from(*keys, j, i + 1) { + if (journal_key_cmp(c, i, j)) + break; - i[1].journal_seq = i[0].journal_seq; + struct bkey_i *n = journal_key_k(c, j); + if (n->k.type == KEY_TYPE_accounting) { + WARN_ON(bversion_cmp(k->k.bversion, n->k.bversion) >= 0); - bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), - bkey_s_c_to_accounting(k)); - continue; + bch2_accounting_accumulate(bkey_i_to_accounting(k), + bkey_i_to_s_c_accounting(n)); + j->overwritten = true; + } } - ret = accounting_read_key(trans, k); + ret = accounting_read_key(trans, bkey_i_to_s_c(k)); if (ret) return ret; } - - *dst++ = *i; } - keys->gap = keys->nr = dst - keys->data; guard(percpu_write)(&c->mark_lock); diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 7ddb156c765c..73eb28090bc7 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -115,9 +115,15 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, copy.flags |= BTREE_ITER_nofilter_whiteouts; + /* + * We're doing our own whiteout filtering, but we still need to pass a + * max key to avoid popping an assert in bch2_snapshot_is_ancestor(): + */ struct bkey_s_c k; unsigned nr_iters = 0; - for_each_btree_key_continue_norestart(copy, 0, k, ret) { + for_each_btree_key_max_continue_norestart(copy, + POS(insert->k.p.inode, U64_MAX), + 0, k, ret) { unsigned offset = 0; if (bkey_gt(iter->pos, bkey_start_pos(k.k))) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 01c1c6372229..ccc44b1fc178 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -266,7 +266,8 @@ create_lostfound: root_inode.bi_nlink++; - ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); + ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu, + inode_opt_get(c, &root_inode, inodes_32bit)); if (ret) goto err; @@ -573,7 +574,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub new_inode.bi_subvol = subvolid; - int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu, false) ?: bch2_btree_iter_traverse(&inode_iter) ?: bch2_inode_write(trans, &inode_iter, &new_inode); bch2_trans_iter_exit(&inode_iter); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index d5e5190f0663..4aa130ff7cf6 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -944,11 +944,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, } static struct bkey_i_inode_alloc_cursor * -bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) +bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max, + bool is_32bit) { struct bch_fs *c = trans->c; - u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1; + u64 cursor_idx = is_32bit ? 0 : cpu + 1; cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); @@ -967,7 +968,7 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m if (IS_ERR(cursor)) return cursor; - if (c->opts.inodes_32bit) { + if (is_32bit) { *min = BLOCKDEV_INODE_MAX; *max = INT_MAX; } else { @@ -996,11 +997,11 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m int bch2_inode_create(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) + u32 snapshot, u64 cpu, bool is_32bit) { u64 min, max; struct bkey_i_inode_alloc_cursor *cursor = - bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); + bch2_inode_alloc_cursor_get(trans, cpu, &min, &max, is_32bit); int ret = PTR_ERR_OR_ZERO(cursor); if (ret) return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b8ec3e628d90..79092ea74844 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -172,7 +172,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, struct bch_inode_unpacked *); int bch2_inode_create(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, u32, u64); + struct bch_inode_unpacked *, u32, u64, bool); int bch2_inode_rm(struct bch_fs *, subvol_inum); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 1f00938b1bdc..e07fa6cc99bd 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -144,7 +144,8 @@ enum inode_opt_id { x(unlinked, 7) \ x(backptr_untrusted, 8) \ x(has_child_snapshot, 9) \ - x(has_case_insensitive, 10) + x(has_case_insensitive, 10) \ + x(31bit_dirent_offset, 11) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 07869436a964..93ac0faedf7d 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -120,6 +120,7 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) INIT_LIST_HEAD(&p->flushed[i]); atomic_set(&p->count, count); p->devs.nr = 0; + p->bytes = 0; } /* @@ -264,6 +265,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + struct journal_entry_pin_list *pin_list = + journal_seq_pin(j, journal_cur_seq(j)); + pin_list->bytes = roundup_pow_of_two(vstruct_bytes(buf->data)); + j->dirty_entry_bytes += pin_list->bytes; + if (trace_journal_entry_close_enabled() && trace) { CLASS(printbuf, err)(); guard(printbuf_atomic)(&err); @@ -737,9 +743,9 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; CLASS(printbuf, buf)(); + prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); bch2_journal_debug_to_text(&buf, j); bch2_print_str(c, KERN_ERR, buf.buf); - prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); closure_wait_event(&j->async_wait, !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 093e4acad085..6e8a89a0f244 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -3,6 +3,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "btree_io.h" +#include "btree_journal_iter.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "buckets.h" @@ -106,11 +107,6 @@ static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *cs return !bch2_crc_cmp(j->csum, *csum); } -static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) -{ - return (seq - c->journal_entries_base_seq) & (~0U >> 1); -} - static void __journal_replay_free(struct bch_fs *c, struct journal_replay *i) { diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index f53c5c81d137..f8754bf71264 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -7,29 +7,6 @@ void bch2_journal_pos_from_member_info_set(struct bch_fs *); void bch2_journal_pos_from_member_info_resume(struct bch_fs *); -struct journal_ptr { - bool csum_good; - struct bch_csum csum; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; -}; - -/* - * Only used for holding the journal entries we read in btree_journal_read() - * during cache_registration - */ -struct journal_replay { - DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; - - bool csum_good; - bool ignore_blacklisted; - bool ignore_not_dirty; - /* must be last: */ - struct jset j; -}; - static inline bool journal_replay_ignore(struct journal_replay *i) { return !i || i->ignore_blacklisted || i->ignore_not_dirty; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index f23e5ee9ad75..6400a63ed79b 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -148,6 +148,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + ssize_t mem_limit = max_t(ssize_t, 0, + (totalram_pages() * PAGE_SIZE) / 4 - j->dirty_entry_bytes); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr || !ca->mi.durability) @@ -180,6 +183,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne * @nr_devs_want largest devices: */ space = dev_space[nr_devs_want - 1]; + space.total = min(space.total, mem_limit >> 9); space.next_entry = min(space.next_entry, min_bucket_size); return space; } @@ -328,9 +332,17 @@ void bch2_journal_reclaim_fast(struct journal *j) * Unpin journal entries whose reference counts reached zero, meaning * all btree nodes got written out */ + struct journal_entry_pin_list *pin_list; while (!fifo_empty(&j->pin) && j->pin.front <= j->seq_ondisk && - !atomic_read(&fifo_peek_front(&j->pin).count)) { + !atomic_read(&(pin_list = &fifo_peek_front(&j->pin))->count)) { + + if (WARN_ON(j->dirty_entry_bytes < pin_list->bytes)) + pin_list->bytes = j->dirty_entry_bytes; + + j->dirty_entry_bytes -= pin_list->bytes; + pin_list->bytes = 0; + j->pin.front++; popped = true; } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 51104bbb99da..7c9273bd0e15 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -71,6 +71,7 @@ struct journal_entry_pin_list { struct list_head flushed[JOURNAL_PIN_TYPE_NR]; atomic_t count; struct bch_devs_list devs; + size_t bytes; }; struct journal; @@ -253,6 +254,7 @@ struct journal { u64 front, back, size, mask; struct journal_entry_pin_list *data; } pin; + size_t dirty_entry_bytes; struct journal_space space[journal_space_nr]; diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index d1019052f182..5c321a0d1f89 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -62,7 +62,8 @@ int bch2_create_trans(struct btree_trans *trans, if (flags & BCH_CREATE_TMPFILE) new_inode->bi_flags |= BCH_INODE_unlinked; - ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu, + inode_opt_get(c, dir_u, inodes_32bit)); if (ret) goto err; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 84ce69a7f131..31a3abcbd83e 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -242,7 +242,7 @@ enum fsck_err_opts { x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH_SB_INODE_32BIT, true, \ + BCH_SB_INODE_32BIT, false, \ NULL, "Constrain inode numbers to 32 bits") \ x(shard_inode_numbers_bits, u8, \ OPT_FS|OPT_FORMAT, \ @@ -321,6 +321,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't kick drives out when splitbrain detected")\ + x(no_version_check, u8, \ + OPT_HIDDEN, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't fail reading the superblock due to incompatible version")\ x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 29e81f96db0f..6319144a440c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -64,7 +64,6 @@ int bch2_btree_lost_data(struct bch_fs *c, * but in debug mode we want the next fsck run to be clean: */ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret; #endif write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); @@ -182,9 +181,12 @@ void bch2_reconstruct_alloc(struct bch_fs *c) */ static void zero_out_btree_mem_ptr(struct journal_keys *keys) { - darray_for_each(*keys, i) - if (i->k->k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; + struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys); + darray_for_each(*keys, i) { + struct bkey_i *k = journal_key_k(c, i); + if (k->k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(k)->v.mem_ptr = 0; + } } /* journal replay: */ @@ -202,8 +204,10 @@ static void replay_now_at(struct journal *j, u64 seq) static int bch2_journal_replay_accounting_key(struct btree_trans *trans, struct journal_key *k) { + struct bch_fs *c = trans->c; + struct bkey_i *bk = journal_key_k(c, k); struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p, BTREE_MAX_DEPTH, k->level, BTREE_ITER_intent); int ret = bch2_btree_iter_traverse(&iter); @@ -214,14 +218,14 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); /* Has this delta already been applied to the btree? */ - if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { + if (bversion_cmp(old.k->bversion, bk->k.bversion) >= 0) { ret = 0; goto out; } - struct bkey_i *new = k->k; + struct bkey_i *new = bk; if (old.k->type == KEY_TYPE_accounting) { - new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); + new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(bk)); ret = PTR_ERR_OR_ZERO(new); if (ret) goto out; @@ -230,7 +234,8 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, bkey_s_c_to_accounting(old)); } - trans->journal_res.seq = k->journal_seq; + if (!k->allocated) + trans->journal_res.seq = c->journal_entries_base_seq + k->journal_seq_offset; ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); out: @@ -241,6 +246,7 @@ out: static int bch2_journal_replay_key(struct btree_trans *trans, struct journal_key *k) { + struct bch_fs *c = trans->c; struct btree_iter iter; unsigned iter_flags = BTREE_ITER_intent| @@ -251,7 +257,8 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (k->overwritten) return 0; - trans->journal_res.seq = k->journal_seq; + if (!k->allocated) + trans->journal_res.seq = c->journal_entries_base_seq + k->journal_seq_offset; /* * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to @@ -266,7 +273,8 @@ static int bch2_journal_replay_key(struct btree_trans *trans, else update_flags |= BTREE_UPDATE_key_cache_reclaim; - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + struct bkey_i *bk = journal_key_k(c, k); + bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); ret = bch2_btree_iter_traverse(&iter); @@ -275,13 +283,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, &iter); if (unlikely(!btree_path_node(path, k->level))) { - struct bch_fs *c = trans->c; - CLASS(printbuf, buf)(); prt_str(&buf, "btree="); bch2_btree_id_to_text(&buf, k->btree_id); prt_printf(&buf, " level=%u ", k->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(bk)); if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { @@ -298,7 +304,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, } bch2_trans_iter_exit(&iter); - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p, BTREE_MAX_DEPTH, 0, iter_flags); ret = bch2_btree_iter_traverse(&iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: @@ -310,17 +316,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (k->overwritten) goto out; - if (k->k->k.type == KEY_TYPE_accounting) { - struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); + if (bk->k.type == KEY_TYPE_accounting) { + struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, bk->k.u64s); ret = PTR_ERR_OR_ZERO(n); if (ret) goto out; - bkey_copy(n, k->k); + bkey_copy(n, bk); goto out; } - ret = bch2_trans_update(trans, &iter, k->k, update_flags); + ret = bch2_trans_update(trans, &iter, bk, update_flags); out: bch2_trans_iter_exit(&iter); return ret; @@ -331,13 +337,9 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - /* - * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last - * - * journal_seq == 0 means that the key comes from early repair, and - * should be inserted last so as to avoid overflowing the journal - */ - return cmp_int(l->journal_seq - 1, r->journal_seq - 1); + return !l->allocated && !r->allocated + ? cmp_int(l->journal_seq_offset, r->journal_seq_offset) + : cmp_int(l->allocated, r->allocated); } DEFINE_DARRAY_NAMED(darray_journal_keys, struct journal_key *) @@ -369,7 +371,9 @@ int bch2_journal_replay(struct bch_fs *c) * flush accounting keys until we're done */ darray_for_each(*keys, k) { - if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) + struct bkey_i *bk = journal_key_k(trans->c, k); + + if (!(bk->k.type == KEY_TYPE_accounting && !k->allocated)) continue; cond_resched(); @@ -412,7 +416,6 @@ int bch2_journal_replay(struct bch_fs *c) BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), bch2_journal_replay_key(trans, k)); - BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); if (ret) { ret = darray_push(&keys_sorted, k); if (ret) @@ -434,8 +437,8 @@ int bch2_journal_replay(struct bch_fs *c) struct journal_key *k = *kp; - if (k->journal_seq) - replay_now_at(j, k->journal_seq); + if (!k->allocated) + replay_now_at(j, c->journal_entries_base_seq + k->journal_seq_offset); else replay_now_at(j, j->replay_journal_seq_end); diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h index b63c20558d3d..2696eee00345 100644 --- a/fs/bcachefs/recovery_passes_format.h +++ b/fs/bcachefs/recovery_passes_format.h @@ -37,7 +37,7 @@ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE) \ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 44bc12573a0c..96ad64920810 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -22,7 +22,7 @@ enum counters_flags { x(io_read_split, 33, TYPE_COUNTER) \ x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ - x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 95, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ @@ -124,4 +124,13 @@ struct bch_sb_field_counters { __le64 d[]; }; +static inline void __maybe_unused check_bch_counter_ids_unique(void) { + switch(0){ +#define x(t, n, ...) case (n): + BCH_PERSISTENT_COUNTERS() +#undef x + ; + } +} + #endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 84f987d3a02a..eab0c1e3ff56 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1673,7 +1673,8 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return ret; darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); + nr_deleted_ancestors += bch2_snapshots_same_tree(c, s->k.p.offset, i->id) && + bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); if (!nr_deleted_ancestors) return 0; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index fef32a0118c4..28d9a29a1fd0 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -51,6 +51,17 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) return s ? s->tree : 0; } +static inline bool bch2_snapshots_same_tree(struct bch_fs *c, u32 id1, u32 id2) +{ + if (id1 == id2) + return true; + + guard(rcu)(); + const struct snapshot_t *s1 = snapshot_t(c, id1); + const struct snapshot_t *s2 = snapshot_t(c, id2); + return s1 && s2 && s1->tree == s2->tree; +} + static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); @@ -157,6 +168,10 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { + EBUG_ON(!id); + EBUG_ON(!ancestor); + EBUG_ON(!bch2_snapshots_same_tree(c, id, ancestor)); + return id == ancestor ? true : __bch2_snapshot_is_ancestor(c, id, ancestor); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 8c0fb44929cc..2a61cc36ddbf 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -34,6 +34,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) struct bch_hash_info { u32 inum_snapshot; u8 type; + bool is_31bit; struct unicode_map *cf_encoding; /* * For crc32 or crc64 string hashes the first key value of @@ -48,6 +49,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) struct bch_hash_info info = { .inum_snapshot = bi->bi_snapshot, .type = INODE_STR_HASH(bi), + .is_31bit = bi->bi_flags & BCH_INODE_31bit_dirent_offset, .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, .siphash_key = { .k0 = bi->bi_hash_seed } }; @@ -112,8 +114,8 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, } } -static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) +static inline u64 __bch2_str_hash_end(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) { switch (info->type) { case BCH_STR_HASH_crc32c: @@ -128,6 +130,14 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, } } +static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info, + bool maybe_31bit) +{ + return __bch2_str_hash_end(ctx, info) & + (maybe_31bit && info->is_31bit ? INT_MAX : U64_MAX); +} + struct bch_hash_desc { enum btree_id btree_id; u8 key_type; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 369465a4de77..5897380c4c08 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -379,7 +379,7 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, +int bch2_sb_validate(struct bch_sb *sb, struct bch_opts *opts, u64 read_offset, enum bch_validate_flags flags, struct printbuf *out) { enum bch_opt_id opt_id; @@ -389,28 +389,30 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, if (ret) return ret; - u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); - unsigned incompat_bit = 0; - if (incompat) - incompat_bit = __ffs64(incompat); - else if (sb->features[1]) - incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); - - if (incompat_bit) { - prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", - incompat_bit, - bch2_sb_features[BCH_FEATURE_NR - 1], - BCH_FEATURE_NR - 1); - return -BCH_ERR_invalid_sb_features; - } + if (!opts->no_version_check) { + u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); + unsigned incompat_bit = 0; + if (incompat) + incompat_bit = __ffs64(incompat); + else if (sb->features[1]) + incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); + + if (incompat_bit) { + prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", + incompat_bit, + bch2_sb_features[BCH_FEATURE_NR - 1], + BCH_FEATURE_NR - 1); + return -BCH_ERR_invalid_sb_features; + } - if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || - BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { - prt_str(out, "Filesystem has incompatible version "); - bch2_version_to_text(out, le16_to_cpu(sb->version)); - prt_str(out, ", current version "); - bch2_version_to_text(out, bcachefs_metadata_version_current); - return -BCH_ERR_invalid_sb_features; + if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || + BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { + prt_str(out, "Filesystem has incompatible version "); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_str(out, ", current version "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + return -BCH_ERR_invalid_sb_features; + } } if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { @@ -915,7 +917,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb->sb, offset, 0, &err); + ret = bch2_sb_validate(sb->sb, opts, offset, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -1081,9 +1083,10 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_from_fs(c, (*ca)); darray_for_each(online_devices, ca) { + struct bch_opts opts = bch2_opts_empty(); printbuf_reset(&err); - ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); + ret = bch2_sb_validate((*ca)->disk_sb.sb, &opts, 0, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index a3b7a90f2533..82cb3a3ceeae 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -92,7 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); +int bch2_sb_validate(struct bch_sb *, struct bch_opts *, u64, + enum bch_validate_flags, struct printbuf *); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 09e7f8ae9922..ee3b30b1c2b5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1021,6 +1021,12 @@ static int bch2_fs_opt_version_init(struct bch_fs *c) prt_bitflags(&p, bch2_recovery_passes, sb_passes); } + u64 btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); + if (btrees_lost_data) { + prt_str(&p, "\nsuperblock indicates damage to following btrees:\n "); + prt_bitflags(&p, __bch2_btree_ids, btrees_lost_data); + } + if (bch2_check_version_downgrade(c)) { prt_str(&p, "\nVersion downgrade required:"); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 6094b568dd33..6d7303008b19 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -4,6 +4,7 @@ #include "acl.h" #include "bkey_methods.h" #include "btree_update.h" +#include "dirent.h" #include "extents.h" #include "fs.h" #include "rebalance.h" @@ -25,7 +26,7 @@ static u64 bch2_xattr_hash(const struct bch_hash_info *info, bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); - return bch2_str_hash_end(&ctx, info); + return bch2_str_hash_end(&ctx, info, false); } static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) @@ -484,6 +485,22 @@ static int inode_opt_set_fn(struct btree_trans *trans, return ret; } + if (s->id == Inode_opt_inodes_32bit && + !bch2_request_incompat_feature(trans->c, bcachefs_metadata_version_31bit_dirent_offset)) { + /* + * Make sure the dir is empty, as otherwise we'd need to + * rehash everything and update the dirent keys. + */ + int ret = bch2_empty_dir_trans(trans, inode_inum(inode)); + if (ret < 0) + return ret; + + if (s->defined) + bi->bi_flags |= BCH_INODE_31bit_dirent_offset; + else + bi->bi_flags &= ~BCH_INODE_31bit_dirent_offset; + } + if (s->defined) bi->bi_fields_set |= 1U << s->id; else |