-rw-r--r--  fs/bcachefs/bcachefs_format.h    11
-rw-r--r--  fs/bcachefs/bkey_methods.c        6
-rw-r--r--  fs/bcachefs/bkey_types.h          5
-rw-r--r--  fs/bcachefs/btree_iter.c         72
-rw-r--r--  fs/bcachefs/btree_update.c       67
-rw-r--r--  fs/bcachefs/fsck.c                4
-rw-r--r--  fs/bcachefs/lru.h                10
-rw-r--r--  fs/bcachefs/movinggc.c          188
8 files changed, 278 insertions, 85 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index b4a04df5ea95..a8f59522e258 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -423,7 +423,8 @@ enum bch_bkey_type_flags {
         x(logged_op_truncate,   32,     BKEY_TYPE_strict_btree_checks)  \
         x(logged_op_finsert,    33,     BKEY_TYPE_strict_btree_checks)  \
         x(accounting,           34,     BKEY_TYPE_strict_btree_checks)  \
-        x(inode_alloc_cursor,   35,     BKEY_TYPE_strict_btree_checks)
+        x(inode_alloc_cursor,   35,     BKEY_TYPE_strict_btree_checks)  \
+        x(extent_whiteout,      36,     BKEY_TYPE_strict_btree_checks)
 
 enum bch_bkey_type {
 #define x(name, nr, ...)        KEY_TYPE_##name = nr,
@@ -440,6 +441,10 @@ struct bch_whiteout {
         struct bch_val          v;
 };
 
+struct bch_extent_whiteout {
+        struct bch_val          v;
+};
+
 struct bch_error {
         struct bch_val          v;
 };
@@ -700,7 +705,8 @@ struct bch_sb_field_ext {
         x(extent_flags,                 BCH_VERSION(1, 25))             \
         x(snapshot_deletion_v2,         BCH_VERSION(1, 26))             \
         x(fast_device_removal,          BCH_VERSION(1, 27))             \
-        x(inode_has_case_insensitive,   BCH_VERSION(1, 28))
+        x(inode_has_case_insensitive,   BCH_VERSION(1, 28))             \
+        x(extent_snapshot_whiteouts,    BCH_VERSION(1, 29))
 
 enum bcachefs_metadata_version {
         bcachefs_metadata_version_min = 9,
@@ -1340,6 +1346,7 @@ enum btree_id_flags {
           BTREE_IS_snapshots|                                           \
           BTREE_IS_data,                                                \
           BIT_ULL(KEY_TYPE_whiteout)|                                   \
+          BIT_ULL(KEY_TYPE_extent_whiteout)|                            \
           BIT_ULL(KEY_TYPE_error)|                                      \
           BIT_ULL(KEY_TYPE_cookie)|                                     \
           BIT_ULL(KEY_TYPE_extent)|                                     \
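The format changes are purely additive: a new key type, a new metadata version, and a new allowed-keys bit for the extents btree. For readers not used to the x-macro tables above, here is a minimal standalone sketch of how such a table expands into the enum — illustration only, with EXAMPLE_BKEY_TYPES as a made-up stand-in for the kernel's key type table:

#include <stdio.h>

/* Toy version of the x-macro table; values 35 and 36 match the hunk above */
#define EXAMPLE_BKEY_TYPES()                    \
        x(inode_alloc_cursor,   35)             \
        x(extent_whiteout,      36)

enum example_bkey_type {
#define x(name, nr)     KEY_TYPE_##name = nr,
        EXAMPLE_BKEY_TYPES()
#undef x
};

int main(void)
{
        /* the new type takes the next free discriminant */
        printf("KEY_TYPE_extent_whiteout = %d\n", KEY_TYPE_extent_whiteout);
        return 0;
}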
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index fcd8c82cba4f..75d73677c4d8 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -41,6 +41,10 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
         .key_validate   = deleted_key_validate,         \
 })
 
+#define bch2_bkey_ops_extent_whiteout ((struct bkey_ops) {      \
+        .key_validate   = deleted_key_validate,         \
+})
+
 static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
                                   struct bkey_validate_context from)
 {
@@ -203,7 +207,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
                          ? bch2_bkey_types[k.k->type]
                          : "(unknown)");
 
-        if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+        if (btree_node_type_is_extents(type) && !bkey_extent_whiteout(k.k)) {
                 bkey_fsck_err_on(k.k->size == 0, c, bkey_extent_size_zero,
                                  "size == 0");
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
index b4f328f9853c..88a48ce63656 100644
--- a/fs/bcachefs/bkey_types.h
+++ b/fs/bcachefs/bkey_types.h
@@ -44,6 +44,11 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 #define bkey_whiteout(_k)                                               \
         ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
 
+#define bkey_extent_whiteout(_k)                                        \
+        ((_k)->type == KEY_TYPE_deleted ||                              \
+         (_k)->type == KEY_TYPE_whiteout ||                             \
+         (_k)->type == KEY_TYPE_extent_whiteout)
+
 /* bkey with split value, const */
 struct bkey_s_c {
         const struct bkey       *k;
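The new bkey_extent_whiteout() predicate is a strict superset of bkey_whiteout(): everything the old test matched still matches, plus the new type. A self-contained sketch of the relationship — the struct is a stand-in for the kernel's struct bkey, and the 0/1 values for deleted/whiteout are assumptions for the demo:

#include <assert.h>

enum { KEY_TYPE_deleted = 0, KEY_TYPE_whiteout = 1, KEY_TYPE_extent_whiteout = 36 };

struct bkey { unsigned type; };

#define bkey_whiteout(_k)                                               \
        ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)

#define bkey_extent_whiteout(_k)                                        \
        ((_k)->type == KEY_TYPE_deleted ||                              \
         (_k)->type == KEY_TYPE_whiteout ||                             \
         (_k)->type == KEY_TYPE_extent_whiteout)

int main(void)
{
        struct bkey k = { .type = KEY_TYPE_extent_whiteout };

        /* an extent whiteout is invisible to lookups, but is not a plain
         * snapshot whiteout: callers testing bkey_whiteout() keep their
         * old behaviour */
        assert(!bkey_whiteout(&k) && bkey_extent_whiteout(&k));
        return 0;
}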
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index a67babf69d39..8962c481e310 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2450,10 +2450,27 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
                                 continue;
                         }
 
-                        if (bkey_whiteout(k.k) &&
-                            !(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
-                                search_key = bkey_successor(iter, k.k->p);
-                                continue;
+                        if (!(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
+                                /*
+                                 * KEY_TYPE_extent_whiteout indicates that there
+                                 * are no extents that overlap with this
+                                 * whiteout - meaning bkey_start_pos() is
+                                 * monotonically increasing when including
+                                 * KEY_TYPE_extent_whiteout (not
+                                 * KEY_TYPE_whiteout).
+                                 *
+                                 * Without this @end wouldn't be able to
+                                 * terminate searches and we'd have to scan
+                                 * through tons of whiteouts:
+                                 */
+                                if (k.k->type == KEY_TYPE_extent_whiteout &&
+                                    bkey_ge(k.k->p, end))
+                                        goto end;
+
+                                if (bkey_extent_whiteout(k.k)) {
+                                        search_key = bkey_successor(iter, k.k->p);
+                                        continue;
+                                }
                         }
                 }
 
@@ -2711,7 +2728,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
                                 saved_path = 0;
                         }
 
-                        if (!bkey_whiteout(k.k)) {
+                        if (!bkey_extent_whiteout(k.k)) {
                                 saved_path = btree_path_clone(trans, iter->path,
                                                         iter->flags & BTREE_ITER_intent,
                                                         _THIS_IP_);
@@ -2724,7 +2741,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
                                 continue;
                         }
 
-                        if (bkey_whiteout(k.k)) {
+                        if (bkey_extent_whiteout(k.k)) {
                                 search_key = bkey_predecessor(iter, k.k->p);
                                 search_key.snapshot = U32_MAX;
                                 continue;
@@ -2865,7 +2882,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                         iter->k = *k.k;
                 }
 
-                if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+                if (unlikely(bkey_extent_whiteout(k.k) &&
                              (iter->flags & BTREE_ITER_filter_snapshots) &&
                              !(iter->flags & BTREE_ITER_nofilter_whiteouts)))
                         iter->k.type = KEY_TYPE_deleted;
@@ -2878,31 +2895,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
                 EBUG_ON(btree_iter_path(trans, iter)->level);
 
-                if (iter->flags & BTREE_ITER_intent) {
-                        struct btree_iter iter2;
+                struct btree_iter iter2;
 
-                        bch2_trans_copy_iter(&iter2, iter);
-                        k = bch2_btree_iter_peek_max(&iter2, end);
+                bch2_trans_copy_iter(&iter2, iter);
+                iter2.flags |= BTREE_ITER_nofilter_whiteouts;
 
-                        if (k.k && !bkey_err(k)) {
-                                swap(iter->key_cache_path, iter2.key_cache_path);
-                                iter->k = iter2.k;
-                                k.k = &iter->k;
+                while (1) {
+                        k = bch2_btree_iter_peek_max(&iter2, end);
+                        if ((iter2.flags & BTREE_ITER_is_extents) &&
+                            k.k &&
+                            !bkey_err(k) &&
+                            k.k->type == KEY_TYPE_whiteout) {
+                                bch2_btree_iter_set_pos(&iter2, k.k->p);
+                                continue;
                         }
-
-                        bch2_trans_iter_exit(&iter2);
-                } else {
-                        struct bpos pos = iter->pos;
-
-                        k = bch2_btree_iter_peek_max(iter, end);
-                        if (unlikely(bkey_err(k)))
-                                bch2_btree_iter_set_pos(iter, pos);
-                        else
-                                iter->pos = pos;
+                        break;
+                }
+
+                if (k.k && !bkey_err(k)) {
+                        swap(iter->key_cache_path, iter2.key_cache_path);
+                        iter->k = iter2.k;
+                        k.k = &iter->k;
                 }
+                bch2_trans_iter_exit(&iter2);
 
                 if (unlikely(bkey_err(k)))
                         goto out;
 
+                if (unlikely(k.k &&
+                             bkey_extent_whiteout(k.k) &&
+                             (iter->flags & BTREE_ITER_filter_snapshots) &&
+                             !(iter->flags & BTREE_ITER_nofilter_whiteouts)))
+                        iter->k.type = KEY_TYPE_deleted;
+
                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
 
                 if (bkey_lt(iter->pos, next)) {
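The comment added to bch2_btree_iter_peek_max() is the core invariant of the patch: an extent whiteout promises that no live extent overlaps it, so start positions stay monotonic and the iterator can stop as soon as it sees a whiteout at or past @end, instead of scanning every whiteout. A toy userspace model of that early exit — not the btree code, just the termination argument, with keys sorted by end position the way bkeys are:

#include <stdio.h>
#include <stdbool.h>

struct key { unsigned start, end; bool extent_whiteout; };

/* return the index of the first live key overlapping [0, end), or -1 */
static int peek_upto(const struct key *keys, int nr, unsigned end)
{
        for (int i = 0; i < nr; i++) {
                if (keys[i].extent_whiteout) {
                        /* nothing live can start before @end past this
                         * point: terminate instead of scanning on */
                        if (keys[i].end >= end)
                                return -1;
                        continue;       /* filtered out, keep going */
                }
                if (keys[i].start >= end)
                        return -1;
                return i;
        }
        return -1;
}

int main(void)
{
        const struct key keys[] = {
                {  0, 10, true  },      /* extent whiteout */
                { 10, 20, true  },      /* at/past @end: stops the scan */
                { 25, 30, false },      /* never visited for end <= 20 */
        };

        printf("%d\n", peek_upto(keys, 3, 15));  /* -1, after two steps */
        return 0;
}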
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 6f3b57573cba..f59f018fe0d8 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -12,6 +12,7 @@
 #include "extents.h"
 #include "keylist.h"
 #include "snapshot.h"
+#include "super-io.h"
 #include "trace.h"
 
 #include <linux/string_helpers.h>
@@ -158,6 +159,21 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
         return ret;
 }
 
+static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree, const struct bkey *k)
+{
+        /*
+         * KEY_TYPE_extent_whiteout indicates that there isn't a real extent
+         * present at that position: key start positions inclusive of
+         * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are
+         * monotonically increasing
+         */
+        return btree_id_is_extents_snapshots(btree) &&
+                bkey_deleted(k) &&
+                !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts)
+                ? KEY_TYPE_extent_whiteout
+                : KEY_TYPE_whiteout;
+}
+
 int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
                                        struct btree_iter *iter,
                                        enum btree_iter_update_trigger_flags flags,
@@ -224,14 +240,14 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
                 update->k.p = old.k->p;
                 update->k.p.snapshot = new.k->p.snapshot;
 
-                if (new.k->p.snapshot != old.k->p.snapshot) {
-                        update->k.type = KEY_TYPE_whiteout;
-                } else if (btree_type_has_snapshots(btree_id)) {
-                        ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+                if (btree_type_has_snapshots(btree_id)) {
+                        ret = new.k->p.snapshot != old.k->p.snapshot
+                                ? 1
+                                : need_whiteout_for_snapshot(trans, btree_id, update->k.p);
                         if (ret < 0)
                                 return ret;
                         if (ret)
-                                update->k.type = KEY_TYPE_whiteout;
+                                update->k.type = extent_whiteout_type(trans->c, iter->btree_id, new.k);
                 }
 
                 ret = bch2_btree_insert_nonextent(trans, btree_id, update,
@@ -265,7 +281,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
         CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k),
                                 BTREE_ITER_intent|
                                 BTREE_ITER_with_updates|
-                                BTREE_ITER_not_extents);
+                                BTREE_ITER_not_extents|
+                                BTREE_ITER_nofilter_whiteouts);
         struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
         int ret = bkey_err(k);
         if (ret)
@@ -283,12 +300,40 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
                 goto next;
         }
 
-        while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
-                bool done = bkey_lt(insert->k.p, k.k->p);
+        while (true) {
+                BUG_ON(bkey_le(k.k->p, bkey_start_pos(&insert->k)));
 
-                ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
-                if (ret)
-                        return ret;
+                /*
+                 * When KEY_TYPE_whiteout is included, bkey_start_pos is not
+                 * monotonically increasing
+                 */
+                if (k.k->type != KEY_TYPE_whiteout && bkey_le(insert->k.p, bkey_start_pos(k.k)))
+                        break;
+
+                bool done = k.k->type != KEY_TYPE_whiteout && bkey_lt(insert->k.p, k.k->p);
+
+                if (bkey_extent_whiteout(k.k)) {
+                        enum bch_bkey_type whiteout_type = extent_whiteout_type(trans->c, btree_id, &insert->k);
+
+                        if (bkey_le(k.k->p, insert->k.p) &&
+                            k.k->type != whiteout_type) {
+                                struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, k);
+                                ret = PTR_ERR_OR_ZERO(update);
+                                if (ret)
+                                        return ret;
+
+                                update->k.p.snapshot = iter.snapshot;
+                                update->k.type = whiteout_type;
+
+                                ret = bch2_trans_update(trans, &iter, update, 0);
+                                if (ret)
+                                        return ret;
+                        }
+                } else {
+                        ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+                        if (ret)
+                                return ret;
+                }
 
                 if (done)
                         goto out;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 6ccea09243ab..01c1c6372229 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1444,7 +1444,7 @@ static int check_key_has_inode(struct btree_trans *trans,
         if (ret)
                 return ret;
 
-        if (k.k->type == KEY_TYPE_whiteout)
+        if (bkey_extent_whiteout(k.k))
                 return 0;
 
         bool have_inode = i && !i->whiteout;
@@ -1924,7 +1924,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                                             &inode->recalculate_sums);
                 if (ret)
                         goto err;
+        }
 
+        if (!bkey_extent_whiteout(k.k)) {
                 /*
                  * Check inodes in reverse order, from oldest snapshots to
                  * newest, starting from the inode that matches this extent's
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 8abd0aa2083a..6f1e0a7b5db5 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -24,6 +24,16 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
         return pos;
 }
 
+static inline struct bpos lru_start(u16 lru_id)
+{
+        return lru_pos(lru_id, 0, 0);
+}
+
+static inline struct bpos lru_end(u16 lru_id)
+{
+        return lru_pos(lru_id, U64_MAX, LRU_TIME_MAX);
+}
+
 static inline enum bch_lru_type lru_type(struct bkey_s_c l)
 {
         u16 lru_id = l.k->p.inode >> 48;
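lru_start() and lru_end() just give names to the endpoints of one LRU's keyspace. A sketch of why those two calls bound it, assuming lru_pos() packs the 16-bit LRU id into the top bits of bpos.inode with the 48-bit time below it — the ">> 48" in lru_type() above suggests this layout, but the packing here is an assumption, not the kernel definition:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define LRU_TIME_BITS   48
#define LRU_TIME_MAX    ((1ULL << LRU_TIME_BITS) - 1)

struct bpos { uint64_t inode, offset; };

static struct bpos lru_pos(uint16_t lru_id, uint64_t dev_bucket, uint64_t time)
{
        /* assumed layout: [16-bit lru_id][48-bit time] in .inode */
        return (struct bpos) { ((uint64_t) lru_id << 48) | time, dev_bucket };
}

static struct bpos lru_start(uint16_t lru_id)
{
        return lru_pos(lru_id, 0, 0);
}

static struct bpos lru_end(uint16_t lru_id)
{
        return lru_pos(lru_id, UINT64_MAX, LRU_TIME_MAX);
}

int main(void)
{
        /* [start, end] covers exactly LRU id 7's keys and no others */
        printf("%016" PRIx64 "..%016" PRIx64 "\n",
               lru_start(7).inode, lru_end(7).inode);
        return 0;
}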
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index b0cbe3c1aab6..f36d60b8fb07 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -14,6 +14,7 @@
 #include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
+#include "ec.h"
 #include "errcode.h"
 #include "error.h"
 #include "lru.h"
@@ -131,72 +132,153 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
         return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
 }
 
+static int try_add_copygc_bucket(struct btree_trans *trans,
+                                 struct buckets_in_flight *buckets_in_flight,
+                                 struct bpos bucket, u64 lru_time)
+{
+        struct move_bucket b = { .k.bucket = bucket };
+
+        int ret = bch2_bucket_is_movable(trans, &b, lru_time);
+        if (ret <= 0)
+                return ret;
+
+        if (bucket_in_flight(buckets_in_flight, b.k))
+                return 0;
+
+        struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
+        if (!b_i)
+                return -ENOMEM;
+
+        *b_i = b;
+
+        ret = darray_push(&buckets_in_flight->to_evacuate, b_i);
+        if (ret) {
+                kfree(b_i);
+                return ret;
+        }
+
+        ret = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
+                                            bch_move_bucket_params);
+        BUG_ON(ret);
+
+        size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+        return buckets_in_flight->to_evacuate.nr >= nr_to_get;
+}
+
 static int bch2_copygc_get_buckets(struct moving_context *ctxt,
                         struct buckets_in_flight *buckets_in_flight)
 {
         struct btree_trans *trans = ctxt->trans;
-        struct bch_fs *c = trans->c;
-        size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
-        size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
-        int ret;
 
-        move_buckets_wait(ctxt, buckets_in_flight, false);
+        int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+                                lru_start(BCH_LRU_BUCKET_FRAGMENTATION),
+                                lru_end(BCH_LRU_BUCKET_FRAGMENTATION),
+                                0, k,
+                try_add_copygc_bucket(trans, buckets_in_flight,
+                                      u64_to_bucket(k.k->p.offset),
+                                      lru_pos_time(k.k->p))
+        );
 
-        ret = bch2_btree_write_buffer_tryflush(trans);
-        if (bch2_err_matches(ret, EROFS))
-                return ret;
+        return ret < 0 ? ret : 0;
+}
 
-        if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
-                return ret;
+static int bch2_copygc_get_stripe_buckets(struct moving_context *ctxt,
+                                          struct buckets_in_flight *buckets_in_flight)
+{
+        struct btree_trans *trans = ctxt->trans;
 
-        ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
-                                  lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
-                                  lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
-                                  0, k, ({
-                        struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
-                        int ret2 = 0;
+        int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+                                lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+                                lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+                                0, lru_k, ({
+                CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+                struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+                int ret2 = bkey_err(s_k);
+                if (ret2)
+                        goto err;
 
-                        saw++;
+                if (s_k.k->type != KEY_TYPE_stripe)
+                        continue;
 
-                        ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
-                        if (ret2 < 0)
-                                goto err;
+                const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
 
-                        if (!ret2)
-                                not_movable++;
-                        else if (bucket_in_flight(buckets_in_flight, b.k))
-                                in_flight++;
-                        else {
-                                struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
-                                ret2 = b_i ? 0 : -ENOMEM;
+                /* write buffer race? */
+                if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+                        continue;
+
+                unsigned nr_data = s->nr_blocks - s->nr_redundant;
+                for (unsigned i = 0; i < nr_data; i++) {
+                        if (!stripe_blockcount_get(s, i))
+                                continue;
+
+                        const struct bch_extent_ptr *ptr = s->ptrs + i;
+                        CLASS(bch2_dev_tryget, ca)(trans->c, ptr->dev);
+                        if (unlikely(!ca))
+                                continue;
+
+                        ret2 = try_add_copygc_bucket(trans, buckets_in_flight,
+                                                     PTR_BUCKET_POS(ca, ptr), U64_MAX);
                         if (ret2)
-                                goto err;
+                                break;
+                }
+err:
+                ret2;
+        }));
 
-                                *b_i = b;
+        return ret < 0 ? ret : 0;
+}
+
+static bool should_do_ec_copygc(struct btree_trans *trans)
+{
+        u64 stripe_frag_ratio = 0;
+
+        for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+                        lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+                        lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+                        0, lru_k, ({
+                CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+                struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+                int ret = bkey_err(s_k);
+                if (ret)
+                        goto err;
 
-                        ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
-                        if (ret2) {
-                                kfree(b_i);
-                                goto err;
-                        }
+                if (s_k.k->type != KEY_TYPE_stripe)
+                        continue;
 
-                        ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
-                                                             bch_move_bucket_params);
-                        BUG_ON(ret2);
+                const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
 
-                        sectors += b.sectors;
-                        }
+                /* write buffer race? */
+                if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+                        continue;
 
-                        ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
+                unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_nonempty = 0;
+                for (unsigned i = 0; i < nr_data; i++)
+                        blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+                /* stripe is pending delete */
+                if (!blocks_nonempty)
+                        continue;
+
+                /* This matches the calculation in alloc_lru_idx_fragmentation, so we can
+                 * directly compare without actually looking up the bucket pointed to by the
+                 * bucket fragmentation lru:
+                 */
+                stripe_frag_ratio = div_u64(blocks_nonempty * (1ULL << 31), nr_data);
+                break;
 err:
-                        ret2;
+                ret;
         }));
 
-        pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
-                 buckets_in_flight->nr, buckets_in_flight->sectors,
-                 saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
+        CLASS(btree_iter, iter)(trans, BTREE_ID_lru, lru_start(BCH_LRU_BUCKET_FRAGMENTATION), 0);
+        struct bkey_s_c lru_k;
 
-        return ret < 0 ? ret : 0;
+        lockrestart_do(trans, bkey_err(lru_k = bch2_btree_iter_peek_max(&iter,
+                                        lru_end(BCH_LRU_BUCKET_FRAGMENTATION))));
+
+        u64 bucket_frag_ratio = lru_k.k && !bkey_err(lru_k) ? lru_pos_time(lru_k.k->p) : 0;
+
+        /* Prefer normal bucket copygc */
+        return stripe_frag_ratio && stripe_frag_ratio * 2 < bucket_frag_ratio;
+}
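should_do_ec_copygc() compares two fixed-point fractions, each scaled by 2^31: the stripe's fraction of still-live data blocks against the live fraction recorded at the head of the bucket-fragmentation LRU. A worked standalone version of that arithmetic — helper names here are illustrative, but the scale factor and the final "prefer normal bucket copygc" comparison are taken from the hunk above:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>

/* live fraction of a stripe's data blocks, scaled to 2^31 */
static uint64_t stripe_frag_ratio(unsigned blocks_nonempty, unsigned nr_data)
{
        return (uint64_t) blocks_nonempty * (1ULL << 31) / nr_data;
}

static bool prefer_ec_copygc(uint64_t stripe_ratio, uint64_t bucket_ratio)
{
        /* Prefer normal bucket copygc: the stripe only wins when its live
         * fraction is less than half the bucket's */
        return stripe_ratio && stripe_ratio * 2 < bucket_ratio;
}

int main(void)
{
        uint64_t s = stripe_frag_ratio(3, 12);  /* 25% live = 0.25 * 2^31 */
        uint64_t b = stripe_frag_ratio(9, 12);  /* 75% live = 0.75 * 2^31 */

        printf("stripe %" PRIu64 ", bucket %" PRIu64 " -> ec copygc: %d\n",
               s, b, prefer_ec_copygc(s, b));   /* 536870912, 1610612736 -> 1 */
        return 0;
}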
 
 noinline
@@ -213,7 +295,18 @@ static int bch2_copygc(struct moving_context *ctxt,
         u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
         int ret = 0;
 
-        ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
+        move_buckets_wait(ctxt, buckets_in_flight, false);
+
+        ret = bch2_btree_write_buffer_tryflush(trans);
+        if (bch2_err_matches(ret, EROFS))
+                goto err;
+
+        if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
+                goto err;
+
+        ret = should_do_ec_copygc(trans)
+                ? bch2_copygc_get_stripe_buckets(ctxt, buckets_in_flight)
+                : bch2_copygc_get_buckets(ctxt, buckets_in_flight);
         if (ret)
                 goto err;
 
@@ -265,7 +358,8 @@ static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
 
         for (unsigned i = 0; i < BCH_DATA_NR; i++)
                 if (data_type_movable(i))
-                        fragmented += usage_full.d[i].fragmented;
+                        fragmented += usage_full.d[i].buckets * ca->mi.bucket_size -
+                                usage_full.d[i].sectors;
 
         return max(0LL, fragmented_allowed - fragmented);
 }
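The final hunk stops relying on the separately maintained per-data-type "fragmented" counter and recomputes fragmented space directly as total bucket capacity minus live sectors. A worked example of the new accounting — struct and numbers are illustrative:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

struct dev_usage { uint64_t buckets, sectors; };

int main(void)
{
        const uint64_t bucket_size = 1024;      /* sectors per bucket */
        struct dev_usage user = { .buckets = 100, .sectors = 90 * 1024 };

        /* 100 buckets provide 102400 sectors of space, of which 92160 hold
         * live data: the remainder is what copygc could reclaim */
        uint64_t fragmented = user.buckets * bucket_size - user.sectors;

        printf("fragmented: %" PRIu64 " sectors\n", fragmented);  /* 10240 */
        return 0;
}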