summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/backpointers.c49
-rw-r--r--fs/bcachefs/btree_gc.c4
-rw-r--r--fs/bcachefs/btree_io.c7
-rw-r--r--fs/bcachefs/btree_iter.c28
-rw-r--r--fs/bcachefs/btree_journal_iter.c139
-rw-r--r--fs/bcachefs/btree_journal_iter.h36
-rw-r--r--fs/bcachefs/btree_journal_iter_types.h29
-rw-r--r--fs/bcachefs/btree_trans_commit.c2
-rw-r--r--fs/bcachefs/btree_update.c2
-rw-r--r--fs/bcachefs/btree_update_interior.c2
-rw-r--r--fs/bcachefs/darray.h5
-rw-r--r--fs/bcachefs/disk_accounting.c72
-rw-r--r--fs/bcachefs/extent_update.c8
-rw-r--r--fs/bcachefs/journal.c8
-rw-r--r--fs/bcachefs/journal_io.c6
-rw-r--r--fs/bcachefs/journal_io.h23
-rw-r--r--fs/bcachefs/journal_reclaim.c14
-rw-r--r--fs/bcachefs/journal_types.h2
-rw-r--r--fs/bcachefs/opts.h7
-rw-r--r--fs/bcachefs/recovery.c39
-rw-r--r--fs/bcachefs/recovery_passes_format.h2
-rw-r--r--fs/bcachefs/sb-counters_format.h11
-rw-r--r--fs/bcachefs/super-io.c51
-rw-r--r--fs/bcachefs/super-io.h3
-rw-r--r--fs/bcachefs/super.c6
25 files changed, 334 insertions, 221 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index c43aaab4c108..cb25cddb759b 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -532,10 +532,6 @@ static int check_bp_exists(struct btree_trans *trans,
struct btree_iter other_extent_iter = {};
CLASS(printbuf, buf)();
- if (bpos_lt(bp->k.p, s->bp_start) ||
- bpos_gt(bp->k.p, s->bp_end))
- return 0;
-
CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, 0);
struct bkey_s_c bp_k = bch2_btree_iter_peek_slot(&bp_iter);
int ret = bkey_err(bp_k);
@@ -690,6 +686,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+ if (bpos_lt(bp.k.p, s->bp_start) ||
+ bpos_gt(bp.k.p, s->bp_end))
+ continue;
+
int ret = !empty
? check_bp_exists(trans, s, &bp, k)
: bch2_bucket_backpointer_mod(trans, k, &bp, true);
@@ -809,8 +809,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
- /* btree_type_has_ptrs should probably include BTREE_ID_stripes,
- * definitely her... */
int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
ret = commit_do(trans, NULL, NULL,
@@ -899,7 +897,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointer_bucket_gen &&
(bp.v->bucket_gen != a->gen ||
bp.v->pad)) {
ret = bch2_backpointer_del(trans, bp_k.k->p);
@@ -931,6 +929,14 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
if (sectors[ALLOC_dirty] != a->dirty_sectors ||
sectors[ALLOC_cached] != a->cached_sectors ||
sectors[ALLOC_stripe] != a->stripe_sectors) {
+ /*
+ * Post 1.14 upgrade, we assume that backpointers are mostly
+ * correct and a sector count mismatch is probably due to a
+ * write buffer race.
+ *
+ * Pre-upgrade, we expect all the buckets to be wrong, so a
+ * write buffer flush is pointless:
+ */
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
if (ret)
@@ -978,12 +984,22 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
goto next;
struct bpos bucket = bp_pos_to_bucket(ca, pos);
- u64 next = ca->mi.nbuckets;
-
- unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
- if (bitmap)
- next = min_t(u64, next,
- find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
+ u64 next = min(bucket.offset, ca->mi.nbuckets);
+
+ unsigned long *mismatch = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
+ unsigned long *empty = READ_ONCE(ca->bucket_backpointer_empty.buckets);
+ /*
+ * Find the first bucket with mismatches - but
+ * not empty buckets; we don't need to pin those
+ * because we just recreate all backpointers in
+ * those buckets
+ */
+ if (mismatch && empty)
+ next = find_next_andnot_bit(mismatch, empty, ca->mi.nbuckets, next);
+ else if (mismatch)
+ next = find_next_bit(mismatch, ca->mi.nbuckets, next);
+ else
+ next = ca->mi.nbuckets;
bucket.offset = next;
if (bucket.offset == ca->mi.nbuckets)
@@ -1110,17 +1126,18 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
if (ret)
goto err;
- u64 nr_buckets = 0, nr_mismatches = 0;
+ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
for_each_member_device(c, ca) {
nr_buckets += ca->mi.nbuckets;
nr_mismatches += ca->bucket_backpointer_mismatch.nr;
+ nr_empty += ca->bucket_backpointer_empty.nr;
}
if (!nr_mismatches)
goto err;
- bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches, nr_buckets);
+ bch_info(c, "scanning for missing backpointers in %llu/%llu buckets, %llu buckets with no backpointers",
+ nr_mismatches - nr_empty, nr_buckets, nr_empty);
while (1) {
ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index ae7d260589d8..43f294284d57 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -356,7 +356,7 @@ again:
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
BUG_ON(bpos_gt(k.k->p, b->data->max_key));
@@ -470,7 +470,7 @@ again:
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 276cf088539e..2e3dd9bacac5 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -131,10 +131,10 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = kvmalloc(size, GFP_NOWAIT);
+ p = kvmalloc(size, GFP_NOWAIT|__GFP_ACCOUNT|__GFP_RECLAIMABLE);
if (!p) {
*used_mempool = true;
- p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS|__GFP_ACCOUNT|__GFP_RECLAIMABLE);
}
memalloc_nofs_restore(flags);
return p;
@@ -1014,6 +1014,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k = bkey_p_next(k);
continue;
drop_this_key:
+ ret = 0;
next_good_key = k->u64s;
if (!next_good_key ||
@@ -1470,7 +1471,7 @@ start:
}
prt_newline(&buf);
- if (failed.nr)
+ if (ret || failed.nr)
bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
async_object_list_del(c, btree_read_bio, rb->list_idx);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 546b559fe3ce..76f430f93dc1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -650,7 +650,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
i->k->k.p);
@@ -848,7 +848,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
break;
bch2_btree_and_journal_iter_advance(jiter);
- k = bch2_btree_and_journal_iter_peek(jiter);
+ k = bch2_btree_and_journal_iter_peek(c, jiter);
if (!k.k)
break;
@@ -898,7 +898,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
- k = bch2_btree_and_journal_iter_peek(&jiter);
+ k = bch2_btree_and_journal_iter_peek(c, &jiter);
if (!k.k) {
CLASS(printbuf, buf)();
@@ -2120,10 +2120,10 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_
}
}
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_pos,
- struct bpos end_pos)
+static const struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_pos,
+ struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2139,7 +2139,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
+ const struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
if (k) {
iter->k = k->k;
@@ -2156,7 +2156,7 @@ void btree_trans_peek_journal(struct btree_trans *trans,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
+ const struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
@@ -2165,10 +2165,10 @@ void btree_trans_peek_journal(struct btree_trans *trans,
}
}
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bpos end_pos)
+static const struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_key,
+ struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2186,7 +2186,7 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
+ const struct bkey_i *next_journal =
bch2_btree_journal_peek_prev(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->data->min_key);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 24f2fbe84ad7..f63c349e09da 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -46,21 +46,22 @@ static size_t __bch2_journal_key_search(struct journal_keys *keys,
enum btree_id id, unsigned level,
struct bpos pos)
{
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
size_t l = 0, r = keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+ if (__journal_key_cmp(c, id, level, pos, idx_to_key(keys, m)) > 0)
l = m + 1;
else
r = m;
}
BUG_ON(l < keys->nr &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+ __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l)) > 0);
BUG_ON(l &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+ __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l - 1)) <= 0);
return l;
}
@@ -73,9 +74,9 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
}
/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
+const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
{
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
@@ -87,7 +88,7 @@ search:
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
--(*idx);
iters++;
if (iters == 10) {
@@ -96,11 +97,11 @@ search:
}
}
- struct bkey_i *ret = NULL;
+ const struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+ if (__journal_key_cmp(c, btree_id, level, end_pos, k) < 0)
break;
if (k->overwritten) {
@@ -111,8 +112,8 @@ search:
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
- ret = k->k;
+ if (__journal_key_cmp(c, btree_id, level, pos, k) <= 0) {
+ ret = journal_key_k(c, k);
break;
}
@@ -129,9 +130,9 @@ search:
return ret;
}
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
+const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
{
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
@@ -146,7 +147,7 @@ search:
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx < keys->nr &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
+ __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
(*idx)++;
iters++;
if (iters == 10) {
@@ -158,12 +159,12 @@ search:
if (*idx == keys->nr)
--(*idx);
- struct bkey_i *ret = NULL;
+ const struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while (true) {
k = idx_to_key(keys, *idx);
- if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
+ if (__journal_key_cmp(c, btree_id, level, end_pos, k) > 0)
break;
if (k->overwritten) {
@@ -175,8 +176,8 @@ search:
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
- ret = k->k;
+ if (__journal_key_cmp(c, btree_id, level, pos, k) >= 0) {
+ ret = journal_key_k(c, k);
break;
}
@@ -194,8 +195,8 @@ search:
return ret;
}
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos)
+const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
{
size_t idx = 0;
@@ -264,13 +265,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
struct journal_key n = {
.btree_id = id,
.level = level,
- .k = k,
.allocated = true,
- /*
- * Ensure these keys are done last by journal replay, to unblock
- * journal reclaim:
- */
- .journal_seq = U64_MAX,
+ .allocated_k = k,
};
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
@@ -278,8 +274,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
- journal_key_cmp(&n, &keys->data[idx]) == 0) {
- struct bkey_i *o = keys->data[idx].k;
+ journal_key_cmp(c, &n, &keys->data[idx]) == 0) {
+ struct bkey_i *o = journal_key_k(c, &keys->data[idx]);
if (k->k.type == KEY_TYPE_accounting &&
o->k.type == KEY_TYPE_accounting) {
@@ -291,7 +287,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
}
if (keys->data[idx].allocated)
- kfree(keys->data[idx].k);
+ kfree(keys->data[idx].allocated_k);
keys->data[idx] = n;
return 0;
}
@@ -376,17 +372,20 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
unsigned level, struct bpos pos)
{
- struct journal_keys *keys = &trans->c->journal_keys;
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ struct bch_fs *c = trans->c;
+ struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (!trans->journal_replay_not_finished)
+ if (idx >= keys->size ||
+ keys->data[idx].btree_id != btree ||
+ keys->data[idx].level != level)
return false;
- return (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- bkey_deleted(&keys->data[idx].k->k));
+ struct bkey_i *k = journal_key_k(c, &keys->data[idx]);
+ return bpos_eq(k->k.p, pos) && bkey_deleted(&k->k);
}
static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
@@ -457,11 +456,15 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- !keys->data[idx].overwritten) {
+ if (idx >= keys->size ||
+ keys->data[idx].btree_id != btree ||
+ keys->data[idx].level != level ||
+ keys->data[idx].overwritten)
+ return;
+
+ struct bkey_i *k = journal_key_k(c, &keys->data[idx]);
+
+ if (bpos_eq(k->k.p, pos)) {
guard(mutex)(&keys->overwrite_lock);
__bch2_journal_key_overwritten(keys, idx);
}
@@ -476,7 +479,7 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
}
}
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static struct bkey_s_c bch2_journal_iter_peek(struct bch_fs *c, struct journal_iter *iter)
{
journal_iter_verify(iter);
@@ -490,7 +493,7 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
BUG_ON(cmp);
if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
+ return bkey_i_to_s_c(journal_key_k(c, k));
if (k->overwritten_range)
iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
@@ -554,7 +557,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
while (nr--) {
bch2_btree_and_journal_iter_advance(&iter);
- struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(c, &iter);
if (!k.k)
break;
@@ -565,7 +568,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
bch2_bkey_buf_exit(&tmp, c);
}
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *c, struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
size_t iters = 0;
@@ -586,7 +589,7 @@ again:
bch2_journal_iter_advance_btree(iter);
if (iter->trans->journal_replay_not_finished)
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ while ((journal_k = bch2_journal_iter_peek(c, &iter->journal)).k &&
bpos_lt(journal_k.k->p, iter->pos))
bch2_journal_iter_advance(&iter->journal);
@@ -658,15 +661,22 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
/*
* When keys compare equal, oldest compares first:
*/
-static int journal_sort_key_cmp(const void *_l, const void *_r)
+static int journal_sort_key_cmp(const void *_l, const void *_r, const void *priv)
{
+ struct bch_fs *c = (void *) priv;
const struct journal_key *l = _l;
const struct journal_key *r = _r;
int rewind = l->rewind && r->rewind ? -1 : 1;
- return journal_key_cmp(l, r) ?:
- ((cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset)) * rewind);
+ int cmp = journal_key_cmp(c, l, r);
+ if (cmp)
+ return cmp;
+
+ if (l->allocated || r->allocated)
+ return cmp_int(l->allocated, r->allocated);
+
+ return ((cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset)) * rewind);
}
void bch2_journal_keys_put(struct bch_fs *c)
@@ -687,7 +697,7 @@ void bch2_journal_keys_put(struct bch_fs *c)
kfree(i->overwritten_range);
if (i->allocated)
- kfree(i->k);
+ kfree(i->allocated_k);
}
kvfree(keys->data);
@@ -704,8 +714,10 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
- journal_sort_key_cmp, NULL);
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
+
+ sort_r_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
+ journal_sort_key_cmp, NULL, c);
cond_resched();
@@ -717,9 +729,10 @@ static void __journal_keys_sort(struct journal_keys *keys)
* compare each individual accounting key against the version in
* the btree during replay:
*/
- if (src->k->k.type != KEY_TYPE_accounting &&
+ struct bkey_i *k = journal_key_k(c, src);
+ if (k->k.type != KEY_TYPE_accounting &&
src + 1 < &darray_top(*keys) &&
- !journal_key_cmp(src, src + 1))
+ !journal_key_cmp(c, src, src + 1))
continue;
*dst++ = *src;
@@ -763,7 +776,6 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.btree_id = entry->btree_id,
.level = entry->level,
.rewind = rewind,
- .k = k,
.journal_seq = le64_to_cpu(i->j.seq),
.journal_offset = k->_data - i->j._data,
};
@@ -801,13 +813,18 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
move_gap(keys, keys->nr);
- darray_for_each(*keys, i)
+ darray_for_each(*keys, i) {
+ struct bkey_i *k = journal_key_k(c, i);
+
if (!(i->btree_id == btree &&
i->level >= level_min &&
i->level <= level_max &&
- bpos_ge(i->k->k.p, start) &&
- bpos_le(i->k->k.p, end)))
+ bpos_ge(k->k.p, start) &&
+ bpos_le(k->k.p, end)))
keys->data[dst++] = *i;
+ else if (i->allocated)
+ kfree(i->allocated_k);
+ }
keys->nr = keys->gap = dst;
}
@@ -825,7 +842,7 @@ void bch2_journal_keys_dump(struct bch_fs *c)
prt_printf(&buf, "btree=");
bch2_btree_id_to_text(&buf, i->btree_id);
prt_printf(&buf, " l=%u ", i->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(journal_key_k(c, i)));
pr_err("%s", buf.buf);
}
}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 2a3082919b8d..cfd2061bc966 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -29,6 +29,23 @@ struct btree_and_journal_iter {
bool fail_if_too_many_whiteouts;
};
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static inline struct bkey_i *journal_key_k(struct bch_fs *c,
+ const struct journal_key *k)
+{
+ if (k->allocated)
+ return k->allocated_k;
+
+ struct journal_replay *i =
+ *genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, k->journal_seq));
+
+ return (struct bkey_i *) (i->j._data + k->journal_offset);
+}
+
static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
unsigned l_level,
const struct journal_key *r)
@@ -37,25 +54,28 @@ static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
cmp_int(l_btree_id, r->btree_id);
}
-static inline int __journal_key_cmp(enum btree_id l_btree_id,
+static inline int __journal_key_cmp(struct bch_fs *c,
+ enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
const struct journal_key *r)
{
return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
- bpos_cmp(l_pos, r->k->k.p);
+ bpos_cmp(l_pos, journal_key_k(c, r)->k.p);
}
-static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+static inline int journal_key_cmp(struct bch_fs *c,
+ const struct journal_key *l, const struct journal_key *r)
{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+ return __journal_key_cmp(c, l->btree_id, l->level,
+ journal_key_k(c, l)->k.p, r);
}
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
@@ -71,7 +91,7 @@ bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned,
void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *, struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
index 86aacb254fb2..e9d8628edec6 100644
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ b/fs/bcachefs/btree_journal_iter_types.h
@@ -2,12 +2,38 @@
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+struct journal_ptr {
+ bool csum_good;
+ struct bch_csum csum;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+};
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache_registration
+ */
+struct journal_replay {
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
+
+ bool csum_good;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
+ /* must be last: */
+ struct jset j;
+};
+
struct journal_key_range_overwritten {
size_t start, end;
};
struct journal_key {
- u64 journal_seq;
+ union {
+ u64 journal_seq;
+ struct bkey_i *allocated_k;
+ };
u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
@@ -16,7 +42,6 @@ struct journal_key {
bool rewind:1;
struct journal_key_range_overwritten __rcu *
overwritten_range;
- struct bkey_i *k;
};
struct journal_keys {
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 4d58bdb233e9..5fa7f2f9f1e9 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -54,7 +54,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
if (j_k)
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 053a837cf241..b70eb095a37e 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -403,7 +403,7 @@ __btree_trans_update_by_path(struct btree_trans *trans,
i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
if (j_k) {
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 65ca54c5b0ff..a9877a47bfc6 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -95,7 +95,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!b->c.level)
goto out;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
goto out;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 4080ee99aadd..14c7fc7c8061 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -107,8 +107,11 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
+#define darray_for_each_from(_d, _i, _start) \
+ for (typeof(&(_d).data[0]) _i = _start; _i < (_d).data + (_d).nr; _i++)
+
#define darray_for_each(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+ darray_for_each_from(_d, _i, (_d).data)
#define darray_for_each_reverse(_d, _i) \
for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 5944ad6d0f8d..5ec57b710501 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -765,75 +765,77 @@ int bch2_accounting_read(struct bch_fs *c)
iter.flags &= ~BTREE_ITER_with_journal;
int ret = for_each_btree_key_continue(trans, iter,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
- if (k.k->type != KEY_TYPE_accounting)
- continue;
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next;
- memset(&next, 0, sizeof(next));
- next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
+ if (!bch2_accounting_is_mem(&acc_k)) {
+ struct disk_accounting_pos next;
+ memset(&next, 0, sizeof(next));
+ next.type = acc_k.type + 1;
+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ continue;
+ }
- accounting_read_key(trans, k);
- }));
+ accounting_read_key(trans, k);
+ }));
bch2_trans_iter_exit(&iter);
if (ret)
return ret;
struct journal_keys *keys = &c->journal_keys;
- struct journal_key *dst = keys->data;
move_gap(keys, keys->nr);
darray_for_each(*keys, i) {
- if (i->k->k.type == KEY_TYPE_accounting) {
+ if (i->overwritten)
+ continue;
+
+ struct bkey_i *k = journal_key_k(c, i);
+
+ if (k->k.type == KEY_TYPE_accounting) {
struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
+ bpos_to_disk_accounting_pos(&acc_k, k->k.p);
if (!bch2_accounting_is_mem(&acc_k))
continue;
- struct bkey_s_c k = bkey_i_to_s_c(i->k);
unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
sizeof(acc->k.data[0]),
- accounting_pos_cmp, &k.k->p);
+ accounting_pos_cmp, &k->k.p);
bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
+ bversion_cmp(acc->k.data[idx].bversion, k->k.bversion) >= 0;
if (applied)
continue;
- if (i + 1 < &darray_top(*keys) &&
- i[1].k->k.type == KEY_TYPE_accounting &&
- !journal_key_cmp(i, i + 1)) {
- WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
+ darray_for_each_from(*keys, j, i + 1) {
+ if (journal_key_cmp(c, i, j))
+ break;
- i[1].journal_seq = i[0].journal_seq;
+ struct bkey_i *n = journal_key_k(c, j);
+ if (n->k.type == KEY_TYPE_accounting) {
+ WARN_ON(bversion_cmp(k->k.bversion, n->k.bversion) >= 0);
- bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
- bkey_s_c_to_accounting(k));
- continue;
+ bch2_accounting_accumulate(bkey_i_to_accounting(k),
+ bkey_i_to_s_c_accounting(n));
+ j->overwritten = true;
+ }
}
- ret = accounting_read_key(trans, k);
+ ret = accounting_read_key(trans, bkey_i_to_s_c(k));
if (ret)
return ret;
}
-
- *dst++ = *i;
}
- keys->gap = keys->nr = dst - keys->data;
guard(percpu_write)(&c->mark_lock);
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 7ddb156c765c..73eb28090bc7 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -115,9 +115,15 @@ int bch2_extent_trim_atomic(struct btree_trans *trans,
copy.flags |= BTREE_ITER_nofilter_whiteouts;
+ /*
+ * We're doing our own whiteout filtering, but we still need to pass a
+ * max key to avoid popping an assert in bch2_snapshot_is_ancestor():
+ */
struct bkey_s_c k;
unsigned nr_iters = 0;
- for_each_btree_key_continue_norestart(copy, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(copy,
+ POS(insert->k.p.inode, U64_MAX),
+ 0, k, ret) {
unsigned offset = 0;
if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 07869436a964..93ac0faedf7d 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -120,6 +120,7 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
+ p->bytes = 0;
}
/*
@@ -264,6 +265,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ struct journal_entry_pin_list *pin_list =
+ journal_seq_pin(j, journal_cur_seq(j));
+ pin_list->bytes = roundup_pow_of_two(vstruct_bytes(buf->data));
+ j->dirty_entry_bytes += pin_list->bytes;
+
if (trace_journal_entry_close_enabled() && trace) {
CLASS(printbuf, err)();
guard(printbuf_atomic)(&err);
@@ -737,9 +743,9 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
CLASS(printbuf, buf)();
+ prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
bch2_journal_debug_to_text(&buf, j);
bch2_print_str(c, KERN_ERR, buf.buf);
- prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
closure_wait_event(&j->async_wait,
!bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 093e4acad085..6e8a89a0f244 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -3,6 +3,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
+#include "btree_journal_iter.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
@@ -106,11 +107,6 @@ static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *cs
return !bch2_crc_cmp(j->csum, *csum);
}
-static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
-{
- return (seq - c->journal_entries_base_seq) & (~0U >> 1);
-}
-
static void __journal_replay_free(struct bch_fs *c,
struct journal_replay *i)
{
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index f53c5c81d137..f8754bf71264 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -7,29 +7,6 @@
void bch2_journal_pos_from_member_info_set(struct bch_fs *);
void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
-struct journal_ptr {
- bool csum_good;
- struct bch_csum csum;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
-};
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
- DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
-
- bool csum_good;
- bool ignore_blacklisted;
- bool ignore_not_dirty;
- /* must be last: */
- struct jset j;
-};
-
static inline bool journal_replay_ignore(struct journal_replay *i)
{
return !i || i->ignore_blacklisted || i->ignore_not_dirty;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index f23e5ee9ad75..6400a63ed79b 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -148,6 +148,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+ ssize_t mem_limit = max_t(ssize_t, 0,
+ (totalram_pages() * PAGE_SIZE) / 4 - j->dirty_entry_bytes);
+
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr ||
!ca->mi.durability)
@@ -180,6 +183,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
* @nr_devs_want largest devices:
*/
space = dev_space[nr_devs_want - 1];
+ space.total = min(space.total, mem_limit >> 9);
space.next_entry = min(space.next_entry, min_bucket_size);
return space;
}
@@ -328,9 +332,17 @@ void bch2_journal_reclaim_fast(struct journal *j)
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
+ struct journal_entry_pin_list *pin_list;
while (!fifo_empty(&j->pin) &&
j->pin.front <= j->seq_ondisk &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ !atomic_read(&(pin_list = &fifo_peek_front(&j->pin))->count)) {
+
+ if (WARN_ON(j->dirty_entry_bytes < pin_list->bytes))
+ pin_list->bytes = j->dirty_entry_bytes;
+
+ j->dirty_entry_bytes -= pin_list->bytes;
+ pin_list->bytes = 0;
+
j->pin.front++;
popped = true;
}
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 51104bbb99da..7c9273bd0e15 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -71,6 +71,7 @@ struct journal_entry_pin_list {
struct list_head flushed[JOURNAL_PIN_TYPE_NR];
atomic_t count;
struct bch_devs_list devs;
+ size_t bytes;
};
struct journal;
@@ -253,6 +254,7 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
+ size_t dirty_entry_bytes;
struct journal_space space[journal_space_nr];
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 84ce69a7f131..31a3abcbd83e 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -242,7 +242,7 @@ enum fsck_err_opts {
x(inodes_32bit, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- BCH_SB_INODE_32BIT, true, \
+ BCH_SB_INODE_32BIT, false, \
NULL, "Constrain inode numbers to 32 bits") \
x(shard_inode_numbers_bits, u8, \
OPT_FS|OPT_FORMAT, \
@@ -321,6 +321,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
+ x(no_version_check, u8, \
+ OPT_HIDDEN, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't fail reading the superblock due to incompatible version")\
x(verbose, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 29e81f96db0f..0117405e51ef 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -64,7 +64,6 @@ int bch2_btree_lost_data(struct bch_fs *c,
* but in debug mode we want the next fsck run to be clean:
*/
ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret;
#endif
write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
@@ -182,9 +181,12 @@ void bch2_reconstruct_alloc(struct bch_fs *c)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
- darray_for_each(*keys, i)
- if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
+ darray_for_each(*keys, i) {
+ struct bkey_i *k = journal_key_k(c, i);
+ if (k->k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(k)->v.mem_ptr = 0;
+ }
}
/* journal replay: */
@@ -202,8 +204,9 @@ static void replay_now_at(struct journal *j, u64 seq)
static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct journal_key *k)
{
+ struct bkey_i *bk = journal_key_k(trans->c, k);
struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
int ret = bch2_btree_iter_traverse(&iter);
@@ -214,14 +217,14 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
/* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
+ if (bversion_cmp(old.k->bversion, bk->k.bversion) >= 0) {
ret = 0;
goto out;
}
- struct bkey_i *new = k->k;
+ struct bkey_i *new = bk;
if (old.k->type == KEY_TYPE_accounting) {
- new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
+ new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(bk));
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto out;
@@ -266,7 +269,8 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
else
update_flags |= BTREE_UPDATE_key_cache_reclaim;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ struct bkey_i *bk = journal_key_k(trans->c, k);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
ret = bch2_btree_iter_traverse(&iter);
@@ -281,7 +285,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
prt_str(&buf, "btree=");
bch2_btree_id_to_text(&buf, k->btree_id);
prt_printf(&buf, " level=%u ", k->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(bk));
if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
@@ -298,7 +302,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
}
bch2_trans_iter_exit(&iter);
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
@@ -310,17 +314,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
goto out;
- if (k->k->k.type == KEY_TYPE_accounting) {
- struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
+ if (bk->k.type == KEY_TYPE_accounting) {
+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, bk->k.u64s);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto out;
- bkey_copy(n, k->k);
+ bkey_copy(n, bk);
goto out;
}
- ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+ ret = bch2_trans_update(trans, &iter, bk, update_flags);
out:
bch2_trans_iter_exit(&iter);
return ret;
@@ -369,7 +373,9 @@ int bch2_journal_replay(struct bch_fs *c)
* flush accounting keys until we're done
*/
darray_for_each(*keys, k) {
- if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
+ struct bkey_i *bk = journal_key_k(trans->c, k);
+
+ if (!(bk->k.type == KEY_TYPE_accounting && !k->allocated))
continue;
cond_resched();
@@ -412,7 +418,6 @@ int bch2_journal_replay(struct bch_fs *c)
BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
index b63c20558d3d..2696eee00345 100644
--- a/fs/bcachefs/recovery_passes_format.h
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -37,7 +37,7 @@
x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE) \
x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 44bc12573a0c..96ad64920810 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -22,7 +22,7 @@ enum counters_flags {
x(io_read_split, 33, TYPE_COUNTER) \
x(io_read_reuse_race, 34, TYPE_COUNTER) \
x(io_read_retry, 32, TYPE_COUNTER) \
- x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
+ x(io_read_fail_and_poison, 95, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(io_move_read, 35, TYPE_SECTORS) \
@@ -124,4 +124,13 @@ struct bch_sb_field_counters {
__le64 d[];
};
+static inline void __maybe_unused check_bch_counter_ids_unique(void) {
+ switch(0){
+#define x(t, n, ...) case (n):
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ ;
+ }
+}
+
#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 369465a4de77..5897380c4c08 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -379,7 +379,7 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+int bch2_sb_validate(struct bch_sb *sb, struct bch_opts *opts, u64 read_offset,
enum bch_validate_flags flags, struct printbuf *out)
{
enum bch_opt_id opt_id;
@@ -389,28 +389,30 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
if (ret)
return ret;
- u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
- unsigned incompat_bit = 0;
- if (incompat)
- incompat_bit = __ffs64(incompat);
- else if (sb->features[1])
- incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
- if (incompat_bit) {
- prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
- incompat_bit,
- bch2_sb_features[BCH_FEATURE_NR - 1],
- BCH_FEATURE_NR - 1);
- return -BCH_ERR_invalid_sb_features;
- }
+ if (!opts->no_version_check) {
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+ unsigned incompat_bit = 0;
+ if (incompat)
+ incompat_bit = __ffs64(incompat);
+ else if (sb->features[1])
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+ if (incompat_bit) {
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+ incompat_bit,
+ bch2_sb_features[BCH_FEATURE_NR - 1],
+ BCH_FEATURE_NR - 1);
+ return -BCH_ERR_invalid_sb_features;
+ }
- if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
- BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_str(out, "Filesystem has incompatible version ");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_str(out, ", current version ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- return -BCH_ERR_invalid_sb_features;
+ if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
+ BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
+ prt_str(out, "Filesystem has incompatible version ");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_str(out, ", current version ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ return -BCH_ERR_invalid_sb_features;
+ }
}
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
@@ -915,7 +917,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb->sb, offset, 0, &err);
+ ret = bch2_sb_validate(sb->sb, opts, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -1081,9 +1083,10 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_from_fs(c, (*ca));
darray_for_each(online_devices, ca) {
+ struct bch_opts opts = bch2_opts_empty();
printbuf_reset(&err);
- ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, &opts, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index a3b7a90f2533..82cb3a3ceeae 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -92,7 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+int bch2_sb_validate(struct bch_sb *, struct bch_opts *, u64,
+ enum bch_validate_flags, struct printbuf *);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 09e7f8ae9922..ee3b30b1c2b5 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1021,6 +1021,12 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
prt_bitflags(&p, bch2_recovery_passes, sb_passes);
}
+ u64 btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ if (btrees_lost_data) {
+ prt_str(&p, "\nsuperblock indicates damage to following btrees:\n ");
+ prt_bitflags(&p, __bch2_btree_ids, btrees_lost_data);
+ }
+
if (bch2_check_version_downgrade(c)) {
prt_str(&p, "\nVersion downgrade required:");