diff options
-rw-r--r-- | .bcachefs_revision | 2 | ||||
-rw-r--r-- | libbcachefs/bcachefs_format.h | 11 | ||||
-rw-r--r-- | libbcachefs/bkey_methods.c | 6 | ||||
-rw-r--r-- | libbcachefs/bkey_types.h | 5 | ||||
-rw-r--r-- | libbcachefs/btree_iter.c | 8 | ||||
-rw-r--r-- | libbcachefs/btree_update.c | 67 | ||||
-rw-r--r-- | libbcachefs/fsck.c | 4 | ||||
-rw-r--r-- | libbcachefs/lru.h | 10 | ||||
-rw-r--r-- | libbcachefs/move.c | 8 | ||||
-rw-r--r-- | libbcachefs/movinggc.c | 188 | ||||
-rw-r--r-- | libbcachefs/recovery.c | 193 | ||||
-rw-r--r-- | libbcachefs/sb-counters_format.h | 2 | ||||
-rw-r--r-- | libbcachefs/super.c | 314 |
13 files changed, 488 insertions, 330 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision index 409a75d2..9ba5b364 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -dca6a42b74674732d1d7683c282f6002752b2bda +62ab4bbc52902916e1f22b642968a09deb9c1a23 diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index b4a04df5..a8f59522 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -423,7 +423,8 @@ enum bch_bkey_type_flags { x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ - x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) + x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) \ + x(extent_whiteout, 36, BKEY_TYPE_strict_btree_checks) enum bch_bkey_type { #define x(name, nr, ...) KEY_TYPE_##name = nr, @@ -440,6 +441,10 @@ struct bch_whiteout { struct bch_val v; }; +struct bch_extent_whiteout { + struct bch_val v; +}; + struct bch_error { struct bch_val v; }; @@ -700,7 +705,8 @@ struct bch_sb_field_ext { x(extent_flags, BCH_VERSION(1, 25)) \ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ x(fast_device_removal, BCH_VERSION(1, 27)) \ - x(inode_has_case_insensitive, BCH_VERSION(1, 28)) + x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \ + x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1340,6 +1346,7 @@ enum btree_id_flags { BTREE_IS_snapshots| \ BTREE_IS_data, \ BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_extent_whiteout)| \ BIT_ULL(KEY_TYPE_error)| \ BIT_ULL(KEY_TYPE_cookie)| \ BIT_ULL(KEY_TYPE_extent)| \ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index fcd8c82c..75d73677 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -41,6 +41,10 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, .key_validate = deleted_key_validate, \ }) +#define bch2_bkey_ops_extent_whiteout ((struct bkey_ops) { \ + .key_validate = deleted_key_validate, \ +}) + static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -203,7 +207,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, ? bch2_bkey_types[k.k->type] : "(unknown)"); - if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + if (btree_node_type_is_extents(type) && !bkey_extent_whiteout(k.k)) { bkey_fsck_err_on(k.k->size == 0, c, bkey_extent_size_zero, "size == 0"); diff --git a/libbcachefs/bkey_types.h b/libbcachefs/bkey_types.h index b4f328f9..88a48ce6 100644 --- a/libbcachefs/bkey_types.h +++ b/libbcachefs/bkey_types.h @@ -44,6 +44,11 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_whiteout(_k) \ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) +#define bkey_extent_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || \ + (_k)->type == KEY_TYPE_whiteout || \ + (_k)->type == KEY_TYPE_extent_whiteout) + /* bkey with split value, const */ struct bkey_s_c { const struct bkey *k; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index a67babf6..1f30326f 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2450,7 +2450,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en continue; } - if (bkey_whiteout(k.k) && + if (bkey_extent_whiteout(k.k) && !(iter->flags & BTREE_ITER_nofilter_whiteouts)) { search_key = bkey_successor(iter, k.k->p); continue; @@ -2711,7 +2711,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp saved_path = 0; } - if (!bkey_whiteout(k.k)) { + if (!bkey_extent_whiteout(k.k)) { saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_intent, _THIS_IP_); @@ -2724,7 +2724,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp continue; } - if (bkey_whiteout(k.k)) { + if (bkey_extent_whiteout(k.k)) { search_key = bkey_predecessor(iter, k.k->p); search_key.snapshot = U32_MAX; continue; @@ -2865,7 +2865,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) iter->k = *k.k; } - if (unlikely(k.k->type == KEY_TYPE_whiteout && + if (unlikely(bkey_extent_whiteout(k.k) && (iter->flags & BTREE_ITER_filter_snapshots) && !(iter->flags & BTREE_ITER_nofilter_whiteouts))) iter->k.type = KEY_TYPE_deleted; diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 6f3b5757..f59f018f 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -12,6 +12,7 @@ #include "extents.h" #include "keylist.h" #include "snapshot.h" +#include "super-io.h" #include "trace.h" #include <linux/string_helpers.h> @@ -158,6 +159,21 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, return ret; } +static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree, const struct bkey *k) +{ + /* + * KEY_TYPE_extent_whiteout indicates that there isn't a real extent + * present at that position: key start positions inclusive of + * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are + * monotonically increasing + */ + return btree_id_is_extents_snapshots(btree) && + bkey_deleted(k) && + !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts) + ? KEY_TYPE_extent_whiteout + : KEY_TYPE_whiteout; +} + int bch2_trans_update_extent_overwrite(struct btree_trans *trans, struct btree_iter *iter, enum btree_iter_update_trigger_flags flags, @@ -224,14 +240,14 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, update->k.p = old.k->p; update->k.p.snapshot = new.k->p.snapshot; - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (btree_type_has_snapshots(btree_id)) { + ret = new.k->p.snapshot != old.k->p.snapshot + ? 1 + : need_whiteout_for_snapshot(trans, btree_id, update->k.p); if (ret < 0) return ret; if (ret) - update->k.type = KEY_TYPE_whiteout; + update->k.type = extent_whiteout_type(trans->c, iter->btree_id, new.k); } ret = bch2_btree_insert_nonextent(trans, btree_id, update, @@ -265,7 +281,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans, CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k), BTREE_ITER_intent| BTREE_ITER_with_updates| - BTREE_ITER_not_extents); + BTREE_ITER_not_extents| + BTREE_ITER_nofilter_whiteouts); struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); int ret = bkey_err(k); if (ret) @@ -283,12 +300,40 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto next; } - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); + while (true) { + BUG_ON(bkey_le(k.k->p, bkey_start_pos(&insert->k))); - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - return ret; + /* + * When KEY_TYPE_whiteout is included, bkey_start_pos is not + * monotonically increasing + */ + if (k.k->type != KEY_TYPE_whiteout && bkey_le(insert->k.p, bkey_start_pos(k.k))) + break; + + bool done = k.k->type != KEY_TYPE_whiteout && bkey_lt(insert->k.p, k.k->p); + + if (bkey_extent_whiteout(k.k)) { + enum bch_bkey_type whiteout_type = extent_whiteout_type(trans->c, btree_id, &insert->k); + + if (bkey_le(k.k->p, insert->k.p) && + k.k->type != whiteout_type) { + struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + update->k.p.snapshot = iter.snapshot; + update->k.type = whiteout_type; + + ret = bch2_trans_update(trans, &iter, update, 0); + if (ret) + return ret; + } + } else { + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + return ret; + } if (done) goto out; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 6ccea092..01c1c637 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1444,7 +1444,7 @@ static int check_key_has_inode(struct btree_trans *trans, if (ret) return ret; - if (k.k->type == KEY_TYPE_whiteout) + if (bkey_extent_whiteout(k.k)) return 0; bool have_inode = i && !i->whiteout; @@ -1924,7 +1924,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, &inode->recalculate_sums); if (ret) goto err; + } + if (!bkey_extent_whiteout(k.k)) { /* * Check inodes in reverse order, from oldest snapshots to * newest, starting from the inode that matches this extent's diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index 8abd0aa2..6f1e0a7b 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -24,6 +24,16 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) return pos; } +static inline struct bpos lru_start(u16 lru_id) +{ + return lru_pos(lru_id, 0, 0); +} + +static inline struct bpos lru_end(u16 lru_id) +{ + return lru_pos(lru_id, U64_MAX, LRU_TIME_MAX); +} + static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index a38996f5..30fe269d 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -344,9 +344,13 @@ int bch2_move_extent(struct moving_context *ctxt, if (!data_opts.rewrite_ptrs && !data_opts.extra_replicas && !data_opts.scrub) { - if (data_opts.kill_ptrs) + if (data_opts.kill_ptrs) { + this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size); return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); - return 0; + } else { + this_cpu_add(c->counters[BCH_COUNTER_io_move_noop], k.k->size); + return 0; + } } struct moving_io *io = allocate_dropping_locks(trans, ret, diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index b0cbe3c1..f36d60b8 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -14,6 +14,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" +#include "ec.h" #include "errcode.h" #include "error.h" #include "lru.h" @@ -131,72 +132,153 @@ static bool bucket_in_flight(struct buckets_in_flight *list, return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); } +static int try_add_copygc_bucket(struct btree_trans *trans, + struct buckets_in_flight *buckets_in_flight, + struct bpos bucket, u64 lru_time) +{ + struct move_bucket b = { .k.bucket = bucket }; + + int ret = bch2_bucket_is_movable(trans, &b, lru_time); + if (ret <= 0) + return ret; + + if (bucket_in_flight(buckets_in_flight, b.k)) + return 0; + + struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); + if (!b_i) + return -ENOMEM; + + *b_i = b; + + ret = darray_push(&buckets_in_flight->to_evacuate, b_i); + if (ret) { + kfree(b_i); + return ret; + } + + ret = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, + bch_move_bucket_params); + BUG_ON(ret); + + size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); + return buckets_in_flight->to_evacuate.nr >= nr_to_get; +} + static int bch2_copygc_get_buckets(struct moving_context *ctxt, struct buckets_in_flight *buckets_in_flight) { struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); - size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; - int ret; - move_buckets_wait(ctxt, buckets_in_flight, false); + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_start(BCH_LRU_BUCKET_FRAGMENTATION), + lru_end(BCH_LRU_BUCKET_FRAGMENTATION), + 0, k, + try_add_copygc_bucket(trans, buckets_in_flight, + u64_to_bucket(k.k->p.offset), + lru_pos_time(k.k->p)) + ); - ret = bch2_btree_write_buffer_tryflush(trans); - if (bch2_err_matches(ret, EROFS)) - return ret; + return ret < 0 ? ret : 0; +} - if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) - return ret; +static int bch2_copygc_get_stripe_buckets(struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight) +{ + struct btree_trans *trans = ctxt->trans; - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), - 0, k, ({ - struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret2 = 0; + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_start(BCH_LRU_STRIPE_FRAGMENTATION), + lru_end(BCH_LRU_STRIPE_FRAGMENTATION), + 0, lru_k, ({ + CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0); + struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter); + int ret2 = bkey_err(s_k); + if (ret2) + goto err; - saw++; + if (s_k.k->type != KEY_TYPE_stripe) + continue; - ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); - if (ret2 < 0) - goto err; + const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v; - if (!ret2) - not_movable++; - else if (bucket_in_flight(buckets_in_flight, b.k)) - in_flight++; - else { - struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); - ret2 = b_i ? 0 : -ENOMEM; + /* write buffer race? */ + if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p)) + continue; + + unsigned nr_data = s->nr_blocks - s->nr_redundant; + for (unsigned i = 0; i < nr_data; i++) { + if (!stripe_blockcount_get(s, i)) + continue; + + const struct bch_extent_ptr *ptr = s->ptrs + i; + CLASS(bch2_dev_tryget, ca)(trans->c, ptr->dev); + if (unlikely(!ca)) + continue; + + ret2 = try_add_copygc_bucket(trans, buckets_in_flight, + PTR_BUCKET_POS(ca, ptr), U64_MAX); if (ret2) - goto err; + break; + } +err: + ret2; + })); - *b_i = b; + return ret < 0 ? ret : 0; +} + +static bool should_do_ec_copygc(struct btree_trans *trans) +{ + u64 stripe_frag_ratio = 0; + + for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_start(BCH_LRU_STRIPE_FRAGMENTATION), + lru_end(BCH_LRU_STRIPE_FRAGMENTATION), + 0, lru_k, ({ + CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0); + struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter); + int ret = bkey_err(s_k); + if (ret) + goto err; - ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); - if (ret2) { - kfree(b_i); - goto err; - } + if (s_k.k->type != KEY_TYPE_stripe) + continue; - ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, - bch_move_bucket_params); - BUG_ON(ret2); + const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v; - sectors += b.sectors; - } + /* write buffer race? */ + if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p)) + continue; - ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_nonempty = 0; + for (unsigned i = 0; i < nr_data; i++) + blocks_nonempty += !!stripe_blockcount_get(s, i); + + /* stripe is pending delete */ + if (!blocks_nonempty) + continue; + + /* This matches the calculation in alloc_lru_idx_fragmentation, so we can + * directly compare without actually looking up the bucket pointed to by the + * bucket fragmentation lru: + */ + stripe_frag_ratio = div_u64(blocks_nonempty * (1ULL << 31), nr_data); + break; err: - ret2; + ret; })); - pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", - buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); + CLASS(btree_iter, iter)(trans, BTREE_ID_lru, lru_start(BCH_LRU_BUCKET_FRAGMENTATION), 0); + struct bkey_s_c lru_k; - return ret < 0 ? ret : 0; + lockrestart_do(trans, bkey_err(lru_k = bch2_btree_iter_peek_max(&iter, + lru_end(BCH_LRU_BUCKET_FRAGMENTATION)))); + + u64 bucket_frag_ratio = lru_k.k && !bkey_err(lru_k) ? lru_pos_time(lru_k.k->p) : 0; + + /* Prefer normal bucket copygc */ + return stripe_frag_ratio && stripe_frag_ratio * 2 < bucket_frag_ratio; } noinline @@ -213,7 +295,18 @@ static int bch2_copygc(struct moving_context *ctxt, u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); + move_buckets_wait(ctxt, buckets_in_flight, false); + + ret = bch2_btree_write_buffer_tryflush(trans); + if (bch2_err_matches(ret, EROFS)) + goto err; + + if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) + goto err; + + ret = should_do_ec_copygc(trans) + ? bch2_copygc_get_stripe_buckets(ctxt, buckets_in_flight) + : bch2_copygc_get_buckets(ctxt, buckets_in_flight); if (ret) goto err; @@ -265,7 +358,8 @@ static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) for (unsigned i = 0; i < BCH_DATA_NR; i++) if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; + fragmented += usage_full.d[i].buckets * ca->mi.bucket_size - + usage_full.d[i].sectors; return max(0LL, fragmented_allowed - fragmented); } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index c57ff235..21aa2edb 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -626,93 +626,6 @@ fsck_err: return ret; } -static bool check_version_upgrade(struct bch_fs *c) -{ - unsigned latest_version = bcachefs_metadata_version_current; - unsigned latest_compatible = min(latest_version, - bch2_latest_compatible_version(c->sb.version)); - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = 0; - bool ret = false; - - if (old_version < bcachefs_metadata_required_upgrade_below) { - if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || - latest_compatible < bcachefs_metadata_required_upgrade_below) - new_version = latest_version; - else - new_version = latest_compatible; - } else { - switch (c->opts.version_upgrade) { - case BCH_VERSION_UPGRADE_compatible: - new_version = latest_compatible; - break; - case BCH_VERSION_UPGRADE_incompatible: - new_version = latest_version; - break; - case BCH_VERSION_UPGRADE_none: - new_version = min(old_version, latest_version); - break; - } - } - - if (new_version > old_version) { - CLASS(printbuf, buf)(); - - if (old_version < bcachefs_metadata_required_upgrade_below) - prt_str(&buf, "Version upgrade required:\n"); - - if (old_version != c->sb.version) { - prt_str(&buf, "Version upgrade from "); - bch2_version_to_text(&buf, c->sb.version_upgrade_complete); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, " incomplete\n"); - } - - prt_printf(&buf, "Doing %s version upgrade from ", - BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) - ? "incompatible" : "compatible"); - bch2_version_to_text(&buf, old_version); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, new_version); - prt_newline(&buf); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_upgrade(c, old_version, new_version); - passes = ext->recovery_passes_required[0] & ~passes; - - if (passes) { - prt_str(&buf, " running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_notice(c, "%s", buf.buf); - ret = true; - } - - if (new_version > c->sb.version_incompat_allowed && - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { - CLASS(printbuf, buf)(); - - prt_str(&buf, "Now allowing incompatible features up to "); - bch2_version_to_text(&buf, new_version); - prt_str(&buf, ", previously allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - - bch_notice(c, "%s", buf.buf); - ret = true; - } - - if (ret) - bch2_sb_upgrade(c, new_version, - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); - - return ret; -} - int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; @@ -732,108 +645,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from unclean shutdown"); } - if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { - bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); - ret = -EINVAL; - goto err; - } - - if (!c->sb.clean && - !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { - bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); - ret = -EINVAL; - goto err; - } - - if (c->opts.norecovery) { - c->opts.recovery_pass_last = c->opts.recovery_pass_last - ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) - : BCH_RECOVERY_PASS_snapshots_read; - c->opts.nochanges = true; - } - - if (c->opts.nochanges) - c->opts.read_only = true; - - if (c->opts.journal_rewind) { - bch_info(c, "rewinding journal, fsck required"); - c->opts.fsck = true; - } - - if (go_rw_in_recovery(c)) { - /* - * start workqueues/kworkers early - kthread creation checks for - * pending signals, which is _very_ annoying - */ - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - } - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; - - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } - - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - CLASS(printbuf, buf)(); - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - } - - if (bch2_check_version_downgrade(c)) { - CLASS(printbuf, buf)(); - - prt_str(&buf, "Version downgrade required:"); - - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_info(c, "%s", buf.buf); - write_sb = true; - } - - if (check_version_upgrade(c)) - write_sb = true; - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { - SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); - write_sb = true; - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (c->sb.clean) - set_bit(BCH_FS_clean_recovery, &c->flags); - if (c->opts.fsck) - set_bit(BCH_FS_in_fsck, &c->flags); - set_bit(BCH_FS_in_recovery, &c->flags); - - ret = bch2_blacklist_table_initialize(c); - if (ret) { - bch_err(c, "error initializing blacklist table"); - goto err; - } - bch2_journal_pos_from_member_info_resume(c); if (!c->sb.clean || c->opts.retain_recovery_info) { @@ -1053,8 +864,8 @@ use_clean: } mutex_lock(&c->sb_lock); - ext = bch2_sb_field_get(c->disk_sb.sb, ext); - write_sb = false; + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + bool write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h index 2e3a56bf..f3ea53a5 100644 --- a/libbcachefs/sb-counters_format.h +++ b/libbcachefs/sb-counters_format.h @@ -31,6 +31,8 @@ enum counters_flags { x(io_move_fail, 38, TYPE_COUNTER) \ x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ + x(io_move_drop_only, 91, TYPE_COUNTER) \ + x(io_move_noop, 92, TYPE_COUNTER) \ x(io_move_created_rebalance, 83, TYPE_COUNTER) \ x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index b0019488..ef15e614 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -55,6 +55,7 @@ #include "replicas.h" #include "sb-clean.h" #include "sb-counters.h" +#include "sb-downgrade.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" @@ -842,6 +843,233 @@ int bch2_fs_init_rw(struct bch_fs *c) return 0; } +static bool check_version_upgrade(struct bch_fs *c) +{ + unsigned latest_version = bcachefs_metadata_version_current; + unsigned latest_compatible = min(latest_version, + bch2_latest_compatible_version(c->sb.version)); + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; + bool ret = false; + + if (old_version < bcachefs_metadata_required_upgrade_below) { + if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || + latest_compatible < bcachefs_metadata_required_upgrade_below) + new_version = latest_version; + else + new_version = latest_compatible; + } else { + switch (c->opts.version_upgrade) { + case BCH_VERSION_UPGRADE_compatible: + new_version = latest_compatible; + break; + case BCH_VERSION_UPGRADE_incompatible: + new_version = latest_version; + break; + case BCH_VERSION_UPGRADE_none: + new_version = min(old_version, latest_version); + break; + } + } + + if (new_version > old_version) { + CLASS(printbuf, buf)(); + + if (old_version < bcachefs_metadata_required_upgrade_below) + prt_str(&buf, "Version upgrade required:\n"); + + if (old_version != c->sb.version) { + prt_str(&buf, "Version upgrade from "); + bch2_version_to_text(&buf, c->sb.version_upgrade_complete); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, " incomplete\n"); + } + + prt_printf(&buf, "Doing %s version upgrade from ", + BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) + ? "incompatible" : "compatible"); + bch2_version_to_text(&buf, old_version); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, new_version); + prt_newline(&buf); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_upgrade(c, old_version, new_version); + passes = ext->recovery_passes_required[0] & ~passes; + + if (passes) { + prt_str(&buf, " running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); + } + + bch_notice(c, "%s", buf.buf); + ret = true; + } + + if (new_version > c->sb.version_incompat_allowed && + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { + CLASS(printbuf, buf)(); + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, new_version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + + bch_notice(c, "%s", buf.buf); + ret = true; + } + + if (ret) + bch2_sb_upgrade(c, new_version, + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); + + return ret; +} + +noinline_for_stack +static int bch2_fs_opt_version_init(struct bch_fs *c) +{ + int ret = 0; + + if (c->opts.norecovery) { + c->opts.recovery_pass_last = c->opts.recovery_pass_last + ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) + : BCH_RECOVERY_PASS_snapshots_read; + c->opts.nochanges = true; + } + + if (c->opts.nochanges) + c->opts.read_only = true; + + if (c->opts.journal_rewind) + c->opts.fsck = true; + + CLASS(printbuf, p)(); + bch2_log_msg_start(c, &p); + + prt_str(&p, "starting version "); + bch2_version_to_text(&p, c->sb.version); + + bool first = true; + for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + prt_str(&p, first ? " opts=" : ","); + first = false; + bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); + } + + if (c->sb.version_incompat_allowed != c->sb.version) { + prt_printf(&p, "\nallowing incompatible features above "); + bch2_version_to_text(&p, c->sb.version_incompat_allowed); + } + + if (c->opts.verbose) { + prt_printf(&p, "\nfeatures: "); + prt_bitflags(&p, bch2_sb_features, c->sb.features); + } + + if (c->sb.multi_device) { + prt_printf(&p, "\nwith devices"); + for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { + prt_char(&p, ' '); + prt_str(&p, ca->name); + } + } + + if (c->cf_encoding) + prt_printf(&p, "\nUsing encoding defined by superblock: utf8-%u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + + if (c->opts.journal_rewind) + prt_printf(&p, "\nrewinding journal, fsck required"); + + scoped_guard(mutex, &c->sb_lock) { + struct bch_sb_field_ext *ext = bch2_sb_field_get_minsize(&c->disk_sb, ext, + sizeof(struct bch_sb_field_ext) / sizeof(u64)); + if (!ext) + return bch_err_throw(c, ENOSPC_sb); + + ret = bch2_sb_members_v2_init(c); + if (ret) + return ret; + + __le64 now = cpu_to_le64(ktime_get_real_seconds()); + for_each_online_member_rcu(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = now; + + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); + + u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (sb_passes) { + prt_str(&p, "\nsuperblock requires following recovery passes to be run:\n "); + prt_bitflags(&p, bch2_recovery_passes, sb_passes); + } + + if (bch2_check_version_downgrade(c)) { + prt_str(&p, "\nVersion downgrade required:"); + + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_downgrade(c, + BCH_VERSION_MINOR(bcachefs_metadata_version_current), + BCH_VERSION_MINOR(c->sb.version)); + passes = ext->recovery_passes_required[0] & ~passes; + if (passes) { + prt_str(&p, "\nrunning recovery passes: "); + prt_bitflags(&p, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); + } + } + + check_version_upgrade(c); + + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) + SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); + + /* Don't write the superblock, defer that until we go rw */ + } + + if (c->sb.clean) + set_bit(BCH_FS_clean_recovery, &c->flags); + if (c->opts.fsck) + set_bit(BCH_FS_in_fsck, &c->flags); + set_bit(BCH_FS_in_recovery, &c->flags); + + bch2_print_str(c, KERN_INFO, p.buf); + + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) { + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); + return -EINVAL; + } + + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + return -EINVAL; + } + } + + return 0; +} + static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch_sb_handles *sbs) { @@ -1013,6 +1241,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, ret = bch2_fs_async_obj_init(c) ?: + bch2_blacklist_table_initialize(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: @@ -1063,7 +1292,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, } #endif - for (i = 0; i < c->sb.nr_devices; i++) { + for (unsigned i = 0; i < c->sb.nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; ret = bch2_dev_alloc(c, i); @@ -1078,6 +1307,20 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, &c->clock_journal_res, (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + ret = bch2_fs_opt_version_init(c); + if (ret) + goto err; + + /* + * start workqueues/kworkers early - kthread creation checks for pending + * signals, which is _very_ annoying + */ + if (go_rw_in_recovery(c)) { + ret = bch2_fs_init_rw(c); + if (ret) + goto err; + } + scoped_guard(mutex, &bch_fs_list_lock) ret = bch2_fs_online(c); @@ -1093,53 +1336,6 @@ err: goto out; } -noinline_for_stack -static void print_mount_opts(struct bch_fs *c) -{ - enum bch_opt_id i; - CLASS(printbuf, p)(); - bch2_log_msg_start(c, &p); - - prt_str(&p, "starting version "); - bch2_version_to_text(&p, c->sb.version); - - bool first = true; - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); - - if (!(opt->flags & OPT_MOUNT)) - continue; - - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - prt_str(&p, first ? " opts=" : ","); - first = false; - bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); - } - - if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\nallowing incompatible features above "); - bch2_version_to_text(&p, c->sb.version_incompat_allowed); - } - - if (c->opts.verbose) { - prt_printf(&p, "\nfeatures: "); - prt_bitflags(&p, bch2_sb_features, c->sb.features); - } - - if (c->sb.multi_device) { - prt_printf(&p, "\nwith devices"); - for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { - prt_char(&p, ' '); - prt_str(&p, ca->name); - } - } - - bch2_print_str(c, KERN_INFO, p.buf); -} - static bool bch2_fs_may_start(struct bch_fs *c) { struct bch_dev *ca; @@ -1174,38 +1370,16 @@ static bool bch2_fs_may_start(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - time64_t now = ktime_get_real_seconds(); int ret = 0; BUG_ON(test_bit(BCH_FS_started, &c->flags)); - print_mount_opts(c); - - if (c->cf_encoding) - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - if (!bch2_fs_may_start(c)) return bch_err_throw(c, insufficient_devices_to_start); scoped_guard(rwsem_write, &c->state_lock) { - guard(mutex)(&c->sb_lock); - if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, - sizeof(struct bch_sb_field_ext) / sizeof(u64))) { - ret = bch_err_throw(c, ENOSPC_sb); - goto err; - } - - ret = bch2_sb_members_v2_init(c); - if (ret) - goto err; - scoped_guard(rcu) for_each_online_member_rcu(c, ca) { - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(now); if (ca->mi.state == BCH_MEMBER_STATE_rw) bch2_dev_allocator_add(c, ca); } |