Diffstat (limited to 'fs/bcachefs')
-rw-r--r--  fs/bcachefs/bcachefs_format.h        |  11
-rw-r--r--  fs/bcachefs/bkey_methods.c           |   6
-rw-r--r--  fs/bcachefs/bkey_types.h             |   5
-rw-r--r--  fs/bcachefs/btree_cache.c            |   4
-rw-r--r--  fs/bcachefs/btree_io.c               |   2
-rw-r--r--  fs/bcachefs/btree_iter.c             |  72
-rw-r--r--  fs/bcachefs/btree_iter.h             |   6
-rw-r--r--  fs/bcachefs/btree_trans_commit.c     |   2
-rw-r--r--  fs/bcachefs/btree_update.c           |  67
-rw-r--r--  fs/bcachefs/btree_update_interior.c  |  14
-rw-r--r--  fs/bcachefs/fs.c                     |  10
-rw-r--r--  fs/bcachefs/fsck.c                   |   4
-rw-r--r--  fs/bcachefs/lru.h                    |  10
-rw-r--r--  fs/bcachefs/move.c                   |   2
-rw-r--r--  fs/bcachefs/movinggc.c               | 188
-rw-r--r--  fs/bcachefs/rebalance.c              |  93

16 files changed, 359 insertions(+), 137 deletions(-)
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b4a04df5ea95..a8f59522e258 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -423,7 +423,8 @@ enum bch_bkey_type_flags { x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ - x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) + x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) \ + x(extent_whiteout, 36, BKEY_TYPE_strict_btree_checks) enum bch_bkey_type { #define x(name, nr, ...) KEY_TYPE_##name = nr, @@ -440,6 +441,10 @@ struct bch_whiteout { struct bch_val v; }; +struct bch_extent_whiteout { + struct bch_val v; +}; + struct bch_error { struct bch_val v; }; @@ -700,7 +705,8 @@ struct bch_sb_field_ext { x(extent_flags, BCH_VERSION(1, 25)) \ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ x(fast_device_removal, BCH_VERSION(1, 27)) \ - x(inode_has_case_insensitive, BCH_VERSION(1, 28)) + x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \ + x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1340,6 +1346,7 @@ enum btree_id_flags { BTREE_IS_snapshots| \ BTREE_IS_data, \ BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_extent_whiteout)| \ BIT_ULL(KEY_TYPE_error)| \ BIT_ULL(KEY_TYPE_cookie)| \ BIT_ULL(KEY_TYPE_extent)| \ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index fcd8c82cba4f..75d73677c4d8 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -41,6 +41,10 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, .key_validate = deleted_key_validate, \ }) +#define bch2_bkey_ops_extent_whiteout ((struct bkey_ops) { \ + .key_validate = deleted_key_validate, \ +}) + static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -203,7 +207,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, ? bch2_bkey_types[k.k->type] : "(unknown)"); - if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + if (btree_node_type_is_extents(type) && !bkey_extent_whiteout(k.k)) { bkey_fsck_err_on(k.k->size == 0, c, bkey_extent_size_zero, "size == 0"); diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h index b4f328f9853c..88a48ce63656 100644 --- a/fs/bcachefs/bkey_types.h +++ b/fs/bcachefs/bkey_types.h @@ -44,6 +44,11 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_whiteout(_k) \ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) +#define bkey_extent_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || \ + (_k)->type == KEY_TYPE_whiteout || \ + (_k)->type == KEY_TYPE_extent_whiteout) + /* bkey with split value, const */ struct bkey_s_c { const struct bkey *k; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9261ad043564..3b1d694dcb3a 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -804,7 +804,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea goto got_node; } - b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); + b = __btree_node_mem_alloc(c, GFP_NOWAIT); if (b) { bch2_btree_lock_init(&b->c, pcpu_read_locks ? 
SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); } else { @@ -842,7 +842,7 @@ got_node: mutex_unlock(&bc->lock); - if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + if (btree_node_data_alloc(c, b, GFP_NOWAIT)) { bch2_trans_unlock(trans); if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) goto err; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 8a03cd75a64f..276cf088539e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -131,7 +131,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; - p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = kvmalloc(size, GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a67babf69d39..8962c481e310 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2450,10 +2450,27 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en continue; } - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_nofilter_whiteouts)) { - search_key = bkey_successor(iter, k.k->p); - continue; + if (!(iter->flags & BTREE_ITER_nofilter_whiteouts)) { + /* + * KEY_TYPE_extent_whiteout indicates that there + * are no extents that overlap with this + * whiteout - meaning bkey_start_pos() is + * monotonically increasing when including + * KEY_TYPE_extent_whiteout (not + * KEY_TYPE_whiteout). + * + * Without this @end wouldn't be able to + * terminate searches and we'd have to scan + * through tons of whiteouts: + */ + if (k.k->type == KEY_TYPE_extent_whiteout && + bkey_ge(k.k->p, end)) + goto end; + + if (bkey_extent_whiteout(k.k)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } } } @@ -2711,7 +2728,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp saved_path = 0; } - if (!bkey_whiteout(k.k)) { + if (!bkey_extent_whiteout(k.k)) { saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_intent, _THIS_IP_); @@ -2724,7 +2741,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp continue; } - if (bkey_whiteout(k.k)) { + if (bkey_extent_whiteout(k.k)) { search_key = bkey_predecessor(iter, k.k->p); search_key.snapshot = U32_MAX; continue; @@ -2865,7 +2882,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) iter->k = *k.k; } - if (unlikely(k.k->type == KEY_TYPE_whiteout && + if (unlikely(bkey_extent_whiteout(k.k) && (iter->flags & BTREE_ITER_filter_snapshots) && !(iter->flags & BTREE_ITER_nofilter_whiteouts))) iter->k.type = KEY_TYPE_deleted; @@ -2878,31 +2895,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) EBUG_ON(btree_iter_path(trans, iter)->level); - if (iter->flags & BTREE_ITER_intent) { - struct btree_iter iter2; + struct btree_iter iter2; - bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_max(&iter2, end); + bch2_trans_copy_iter(&iter2, iter); + iter2.flags |= BTREE_ITER_nofilter_whiteouts; - if (k.k && !bkey_err(k)) { - swap(iter->key_cache_path, iter2.key_cache_path); - iter->k = iter2.k; - k.k = &iter->k; + while (1) { + k = bch2_btree_iter_peek_max(&iter2, end); + if ((iter2.flags & BTREE_ITER_is_extents) && + k.k && + !bkey_err(k) && + k.k->type == KEY_TYPE_whiteout) { + bch2_btree_iter_set_pos(&iter2, k.k->p); + continue; } - bch2_trans_iter_exit(&iter2); - } else { - struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_max(iter, end); - if 
(unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(iter, pos); - else - iter->pos = pos; + break; + } + + if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); + iter->k = iter2.k; + k.k = &iter->k; } + bch2_trans_iter_exit(&iter2); if (unlikely(bkey_err(k))) goto out; + if (unlikely(k.k && + bkey_extent_whiteout(k.k) && + (iter->flags & BTREE_ITER_filter_snapshots) && + !(iter->flags & BTREE_ITER_nofilter_whiteouts))) + iter->k.type = KEY_TYPE_deleted; + next = k.k ? bkey_start_pos(k.k) : POS_MAX; if (bkey_lt(iter->pos, next)) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b117cb5d7f94..c8fc6ee01d96 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -954,7 +954,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); #define allocate_dropping_locks_errcode(_trans, _do) \ ({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + gfp_t _gfp = GFP_NOWAIT; \ int _ret = _do; \ \ if (bch2_err_matches(_ret, ENOMEM)) { \ @@ -966,7 +966,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); #define allocate_dropping_locks(_trans, _ret, _do) \ ({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + gfp_t _gfp = GFP_NOWAIT; \ typeof(_do) _p = _do; \ \ _ret = 0; \ @@ -979,7 +979,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); #define allocate_dropping_locks_norelock(_trans, _lock_dropped, _do) \ ({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + gfp_t _gfp = GFP_NOWAIT; \ typeof(_do) _p = _do; \ _lock_dropped = false; \ if (unlikely(!_p)) { \ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 8b94a8156fbf..4d58bdb233e9 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -449,7 +449,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); if (unlikely(!new_k)) return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 6f3b57573cba..f59f018fe0d8 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -12,6 +12,7 @@ #include "extents.h" #include "keylist.h" #include "snapshot.h" +#include "super-io.h" #include "trace.h" #include <linux/string_helpers.h> @@ -158,6 +159,21 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, return ret; } +static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree, const struct bkey *k) +{ + /* + * KEY_TYPE_extent_whiteout indicates that there isn't a real extent + * present at that position: key start positions inclusive of + * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are + * monotonically increasing + */ + return btree_id_is_extents_snapshots(btree) && + bkey_deleted(k) && + !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts) + ? 
KEY_TYPE_extent_whiteout + : KEY_TYPE_whiteout; +} + int bch2_trans_update_extent_overwrite(struct btree_trans *trans, struct btree_iter *iter, enum btree_iter_update_trigger_flags flags, @@ -224,14 +240,14 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, update->k.p = old.k->p; update->k.p.snapshot = new.k->p.snapshot; - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (btree_type_has_snapshots(btree_id)) { + ret = new.k->p.snapshot != old.k->p.snapshot + ? 1 + : need_whiteout_for_snapshot(trans, btree_id, update->k.p); if (ret < 0) return ret; if (ret) - update->k.type = KEY_TYPE_whiteout; + update->k.type = extent_whiteout_type(trans->c, iter->btree_id, new.k); } ret = bch2_btree_insert_nonextent(trans, btree_id, update, @@ -265,7 +281,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans, CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k), BTREE_ITER_intent| BTREE_ITER_with_updates| - BTREE_ITER_not_extents); + BTREE_ITER_not_extents| + BTREE_ITER_nofilter_whiteouts); struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); int ret = bkey_err(k); if (ret) @@ -283,12 +300,40 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto next; } - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); + while (true) { + BUG_ON(bkey_le(k.k->p, bkey_start_pos(&insert->k))); - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - return ret; + /* + * When KEY_TYPE_whiteout is included, bkey_start_pos is not + * monotonically increasing + */ + if (k.k->type != KEY_TYPE_whiteout && bkey_le(insert->k.p, bkey_start_pos(k.k))) + break; + + bool done = k.k->type != KEY_TYPE_whiteout && bkey_lt(insert->k.p, k.k->p); + + if (bkey_extent_whiteout(k.k)) { + enum bch_bkey_type whiteout_type = extent_whiteout_type(trans->c, btree_id, &insert->k); + + if (bkey_le(k.k->p, insert->k.p) && + k.k->type != whiteout_type) { + struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + update->k.p.snapshot = iter.snapshot; + update->k.type = whiteout_type; + + ret = bch2_trans_update(trans, &iter, update, 0); + if (ret) + return ret; + } + } else { + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + return ret; + } if (done) goto out; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 76897cf15946..65ca54c5b0ff 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -336,6 +336,20 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, BUG_ON(b->ob.nr); mutex_lock(&c->btree_reserve_cache_lock); + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + guard(spinlock)(&c->freelist_lock); + if (c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + + ret = cl + ? 
bch_err_throw(c, bucket_alloc_blocked) + : bch_err_throw(c, open_buckets_empty); + mutex_unlock(&c->btree_reserve_cache_lock); + goto err; + } + } + if (c->btree_reserve_cache_nr > nr_reserve) { for (struct btree_alloc *a = c->btree_reserve_cache; a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b5e3090f1cb8..52722a5e8526 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -268,7 +268,7 @@ restart: rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { if (inode->ei_inum.inum == inum) { ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, - GFP_NOWAIT|__GFP_NOWARN); + GFP_NOWAIT); if (ret) { rcu_read_unlock(); ret = darray_make_room(&subvols, 1); @@ -826,14 +826,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); - if (inode_u.bi_subvol) { - /* - * Subvolume deletion is asynchronous, but we still want to tell - * the VFS that it's been deleted here: - */ - set_nlink(&inode->v, 0); - } - if (IS_CASEFOLDED(vdir)) d_invalidate(dentry); err: diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6ccea09243ab..01c1c6372229 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1444,7 +1444,7 @@ static int check_key_has_inode(struct btree_trans *trans, if (ret) return ret; - if (k.k->type == KEY_TYPE_whiteout) + if (bkey_extent_whiteout(k.k)) return 0; bool have_inode = i && !i->whiteout; @@ -1924,7 +1924,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, &inode->recalculate_sums); if (ret) goto err; + } + if (!bkey_extent_whiteout(k.k)) { /* * Check inodes in reverse order, from oldest snapshots to * newest, starting from the inode that matches this extent's diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 8abd0aa2083a..6f1e0a7b5db5 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -24,6 +24,16 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) return pos; } +static inline struct bpos lru_start(u16 lru_id) +{ + return lru_pos(lru_id, 0, 0); +} + +static inline struct bpos lru_end(u16 lru_id) +{ + return lru_pos(lru_id, U64_MAX, LRU_TIME_MAX); +} + static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 30fe269d531d..8457dd6b35ac 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -542,7 +542,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt) if (ctxt->wait_on_copygc && c->copygc_running) { bch2_moving_ctxt_flush_all(ctxt); - wait_event_killable(c->copygc_running_wq, + wait_event_freezable(c->copygc_running_wq, !c->copygc_running || (is_kthread && kthread_should_stop())); } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b0cbe3c1aab6..f36d60b8fb07 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -14,6 +14,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" +#include "ec.h" #include "errcode.h" #include "error.h" #include "lru.h" @@ -131,72 +132,153 @@ static bool bucket_in_flight(struct buckets_in_flight *list, return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); } +static int try_add_copygc_bucket(struct btree_trans *trans, + struct buckets_in_flight *buckets_in_flight, + struct bpos bucket, u64 lru_time) +{ + struct move_bucket b = { .k.bucket = bucket }; + + int ret = bch2_bucket_is_movable(trans, &b, lru_time); + if (ret <= 0) + return ret; + + if 
(bucket_in_flight(buckets_in_flight, b.k)) + return 0; + + struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); + if (!b_i) + return -ENOMEM; + + *b_i = b; + + ret = darray_push(&buckets_in_flight->to_evacuate, b_i); + if (ret) { + kfree(b_i); + return ret; + } + + ret = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, + bch_move_bucket_params); + BUG_ON(ret); + + size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); + return buckets_in_flight->to_evacuate.nr >= nr_to_get; +} + static int bch2_copygc_get_buckets(struct moving_context *ctxt, struct buckets_in_flight *buckets_in_flight) { struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); - size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; - int ret; - move_buckets_wait(ctxt, buckets_in_flight, false); + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_start(BCH_LRU_BUCKET_FRAGMENTATION), + lru_end(BCH_LRU_BUCKET_FRAGMENTATION), + 0, k, + try_add_copygc_bucket(trans, buckets_in_flight, + u64_to_bucket(k.k->p.offset), + lru_pos_time(k.k->p)) + ); - ret = bch2_btree_write_buffer_tryflush(trans); - if (bch2_err_matches(ret, EROFS)) - return ret; + return ret < 0 ? ret : 0; +} - if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) - return ret; +static int bch2_copygc_get_stripe_buckets(struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight) +{ + struct btree_trans *trans = ctxt->trans; - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), - 0, k, ({ - struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret2 = 0; + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_start(BCH_LRU_STRIPE_FRAGMENTATION), + lru_end(BCH_LRU_STRIPE_FRAGMENTATION), + 0, lru_k, ({ + CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0); + struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter); + int ret2 = bkey_err(s_k); + if (ret2) + goto err; - saw++; + if (s_k.k->type != KEY_TYPE_stripe) + continue; - ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); - if (ret2 < 0) - goto err; + const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v; - if (!ret2) - not_movable++; - else if (bucket_in_flight(buckets_in_flight, b.k)) - in_flight++; - else { - struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); - ret2 = b_i ? 0 : -ENOMEM; + /* write buffer race? */ + if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p)) + continue; + + unsigned nr_data = s->nr_blocks - s->nr_redundant; + for (unsigned i = 0; i < nr_data; i++) { + if (!stripe_blockcount_get(s, i)) + continue; + + const struct bch_extent_ptr *ptr = s->ptrs + i; + CLASS(bch2_dev_tryget, ca)(trans->c, ptr->dev); + if (unlikely(!ca)) + continue; + + ret2 = try_add_copygc_bucket(trans, buckets_in_flight, + PTR_BUCKET_POS(ca, ptr), U64_MAX); if (ret2) - goto err; + break; + } +err: + ret2; + })); - *b_i = b; + return ret < 0 ? 
ret : 0; +} + +static bool should_do_ec_copygc(struct btree_trans *trans) +{ + u64 stripe_frag_ratio = 0; + + for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_start(BCH_LRU_STRIPE_FRAGMENTATION), + lru_end(BCH_LRU_STRIPE_FRAGMENTATION), + 0, lru_k, ({ + CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0); + struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter); + int ret = bkey_err(s_k); + if (ret) + goto err; - ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); - if (ret2) { - kfree(b_i); - goto err; - } + if (s_k.k->type != KEY_TYPE_stripe) + continue; - ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, - bch_move_bucket_params); - BUG_ON(ret2); + const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v; - sectors += b.sectors; - } + /* write buffer race? */ + if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p)) + continue; - ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_nonempty = 0; + for (unsigned i = 0; i < nr_data; i++) + blocks_nonempty += !!stripe_blockcount_get(s, i); + + /* stripe is pending delete */ + if (!blocks_nonempty) + continue; + + /* This matches the calculation in alloc_lru_idx_fragmentation, so we can + * directly compare without actually looking up the bucket pointed to by the + * bucket fragmentation lru: + */ + stripe_frag_ratio = div_u64(blocks_nonempty * (1ULL << 31), nr_data); + break; err: - ret2; + ret; })); - pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", - buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); + CLASS(btree_iter, iter)(trans, BTREE_ID_lru, lru_start(BCH_LRU_BUCKET_FRAGMENTATION), 0); + struct bkey_s_c lru_k; - return ret < 0 ? ret : 0; + lockrestart_do(trans, bkey_err(lru_k = bch2_btree_iter_peek_max(&iter, + lru_end(BCH_LRU_BUCKET_FRAGMENTATION)))); + + u64 bucket_frag_ratio = lru_k.k && !bkey_err(lru_k) ? lru_pos_time(lru_k.k->p) : 0; + + /* Prefer normal bucket copygc */ + return stripe_frag_ratio && stripe_frag_ratio * 2 < bucket_frag_ratio; } noinline @@ -213,7 +295,18 @@ static int bch2_copygc(struct moving_context *ctxt, u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); + move_buckets_wait(ctxt, buckets_in_flight, false); + + ret = bch2_btree_write_buffer_tryflush(trans); + if (bch2_err_matches(ret, EROFS)) + goto err; + + if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) + goto err; + + ret = should_do_ec_copygc(trans) + ? 
bch2_copygc_get_stripe_buckets(ctxt, buckets_in_flight) + : bch2_copygc_get_buckets(ctxt, buckets_in_flight); if (ret) goto err; @@ -265,7 +358,8 @@ static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) for (unsigned i = 0; i < BCH_DATA_NR; i++) if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; + fragmented += usage_full.d[i].buckets * ca->mi.bucket_size - + usage_full.d[i].sectors; return max(0LL, fragmented_allowed - fragmented); } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c0c5fe961a83..17ca56b0e2ac 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -292,12 +292,48 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, : 0; } -static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, - struct btree_iter *work_iter) +#define REBALANCE_WORK_BUF_NR 1024 +DEFINE_DARRAY_NAMED(darray_rebalance_work, struct bkey_i_cookie); + +static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, + darray_rebalance_work *buf, struct bpos *work_pos) { - return !kthread_should_stop() - ? bch2_btree_iter_peek(work_iter) - : bkey_s_c_null; + if (unlikely(!buf->nr)) { + /* + * Avoid contention with write buffer flush: buffer up rebalance + * work entries in a darray + */ + + BUG_ON(!buf->size);; + + bch2_trans_begin(trans); + + for_each_btree_key(trans, iter, BTREE_ID_rebalance_work, *work_pos, + BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ + /* we previously used darray_make_room */ + BUG_ON(bkey_bytes(k.k) > sizeof(buf->data[0])); + + bkey_reassemble(&darray_top(*buf).k_i, k); + buf->nr++; + + *work_pos = bpos_successor(iter.pos); + if (buf->nr == buf->size) + break; + 0; + })); + + if (!buf->nr) + return NULL; + + unsigned l = 0, r = buf->nr - 1; + while (l < r) { + swap(buf->data[l], buf->data[r]); + l++; + --r; + } + } + + return &(&darray_pop(buf))->k_i; } static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, @@ -464,10 +500,9 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) per_snapshot_io_opts_init(&snapshot_io_opts, c); int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - r->scan_start.pos, r->scan_end.pos, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents| - BTREE_ITER_prefetch, k, ({ + r->scan_start.pos, r->scan_end.pos, + BTREE_ITER_all_snapshots| + BTREE_ITER_prefetch, k, ({ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, @@ -524,49 +559,37 @@ static int do_rebalance(struct moving_context *ctxt) struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter extent_iter = { NULL }; - struct bkey_s_c k; + struct btree_iter extent_iter = {}; u32 kick = r->kick; - int ret = 0; - bch2_trans_begin(trans); + struct bpos work_pos = POS_MIN; + CLASS(darray_rebalance_work, work)(); + int ret = darray_make_room(&work, REBALANCE_WORK_BUF_NR); + if (ret) + return ret; bch2_move_stats_init(&r->work_stats, "rebalance_work"); bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - CLASS(btree_iter, rebalance_work_iter)(trans, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_all_snapshots); - while (!bch2_move_ratelimit(ctxt)) { if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); kthread_wait_freezable(bch2_rebalance_enabled(c) || kthread_should_stop()); + if (kthread_should_stop()) + break; } - if (kthread_should_stop()) + struct bkey_i *k = next_rebalance_entry(trans, 
&work, &work_pos); + if (!k) break; - bch2_trans_begin(trans); - - ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret || !k.k) - break; - - ret = k.k->type == KEY_TYPE_cookie - ? do_rebalance_scan(ctxt, k.k->p.inode, - le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) - : do_rebalance_extent(ctxt, k.k->p, &extent_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; + ret = k->k.type == KEY_TYPE_cookie + ? do_rebalance_scan(ctxt, k->k.p.inode, + le64_to_cpu(bkey_i_to_cookie(k)->v.cookie)) + : lockrestart_do(trans, do_rebalance_extent(ctxt, k->k.p, &extent_iter)); if (ret) break; - - bch2_btree_iter_advance(&rebalance_work_iter); } bch2_trans_iter_exit(&extent_iter); |