diff options
Diffstat (limited to 'fs/bcachefs/rebalance.c')
-rw-r--r-- | fs/bcachefs/rebalance.c | 665 |
1 files changed, 509 insertions, 156 deletions
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 67d6a90e86ef..27282616bcb0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -25,8 +26,29 @@ #include <linux/kthread.h> #include <linux/sched/cputime.h> +#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) + /* bch_extent_rebalance: */ +int bch2_extent_rebalance_validate(struct bch_fs *c, + struct bkey_s_c k, + struct bkey_validate_context from, + const struct bch_extent_rebalance *r) +{ + int ret = 0; + + bkey_fsck_err_on(r->pending && !(r->need_rb & BIT(BCH_REBALANCE_background_target)), + c, extent_rebalance_bad_pending, + "pending incorrectly set"); + + bkey_fsck_err_on(r->hipri && !(r->need_rb & BIT(BCH_REBALANCE_data_replicas)), + c, extent_rebalance_bad_pending, + "hipri incorrectly set"); + +fsck_err: + return ret; +} + static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { const union bch_extent_entry *entry; @@ -38,15 +60,30 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b return NULL; } -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) { return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); } +static const char * const rebalance_opts[] = { +#define x(n) #n, + BCH_REBALANCE_OPTS() +#undef x + NULL +}; + void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_rebalance *r) { - prt_printf(out, "replicas=%u", r->data_replicas); + prt_str(out, "need_rb="); + prt_bitflags(out, rebalance_opts, r->need_rb); + + if (r->hipri) + prt_str(out, " hipri"); + if (r->pending) + prt_str(out, " pending"); + + prt_printf(out, " replicas=%u", r->data_replicas); if (r->data_replicas_from_inode) prt_str(out, " (inode)"); @@ -92,32 +129,54 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_trigger_extent_rebalance(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; +/* + * XXX: check in bkey_validate that if r->hipri or r->pending are set, + * r->data_replicas are also set + */ - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; +static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r) +{ + if (!r) + return 0; + unsigned ret = r->need_rb; - s = bch2_bkey_sectors_need_rebalance(c, new); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; + if (r->hipri) + ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority); + if (r->pending) { + ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending); + ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target); + } + return ret; +} - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { +int __bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned old_r, unsigned new_r, + enum btree_iter_update_trigger_flags flags) +{ + int delta = (int) !!new_r - (int) !!old_r; + if ((flags & BTREE_TRIGGER_transactional) && delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0); + new.k->p, delta > 0); if (ret) return ret; } - if (need_rebalance_sectors_delta[0]) { + delta = old.k->size == new.k->size + ? old_r ^ new_r + : old_r | new_r; + while (delta) { + unsigned c = __ffs(delta); + delta ^= BIT(c); + + s64 v[1] = { 0 }; + if (old_r & BIT(c)) + v[0] -= (s64) old.k->size; + if (new_r & BIT(c)) + v[0] += (s64) new.k->size; + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work); + v, rebalance_work_v2, c); if (ret) return ret; } @@ -125,39 +184,48 @@ int bch2_trigger_extent_rebalance(struct btree_trans *trans, return 0; } -static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, - struct bch_inode_opts *io_opts, - unsigned *move_ptrs, - unsigned *compress_ptrs, - u64 *sectors) +static struct bch_extent_rebalance +bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, + struct bch_inode_opts *opts, + unsigned *move_ptrs, + unsigned *compress_ptrs, + unsigned *csum_ptrs, + bool may_update_indirect) { *move_ptrs = 0; *compress_ptrs = 0; - *sectors = 0; + *csum_ptrs = 0; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!io_opts && !rb_opts) - return; + struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) }; if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return; + return r; + + const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs); + if (old_r) { + r = *old_r; + r.need_rb = 0; + } + +#define x(_name) \ + if (k.k->type != KEY_TYPE_reflink_v || \ + may_update_indirect || \ + (!opts->_name##_from_inode && !r._name##_from_inode)) { \ + r._name = opts->_name; \ + r._name##_from_inode = opts->_name##_from_inode; \ + } + BCH_REBALANCE_OPTS() +#undef x + + unsigned compression_type = bch2_compression_opt_to_type(r.background_compression); + unsigned csum_type = bch2_data_checksum_type_rb(c, r); - unsigned compression_type = - bch2_compression_opt_to_type(io_opts - ? io_opts->background_compression - : rb_opts->background_compression); - unsigned target = io_opts - ? io_opts->background_target - : rb_opts->background_target; - if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target)) - target = 0; + bool incompressible = false, unwritten = false, ec = false; + unsigned durability = 0, min_durability = INT_MAX; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - bool incompressible = false, unwritten = false; - unsigned ptr_idx = 1; guard(rcu)(); @@ -166,102 +234,264 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, unwritten |= p.ptr.unwritten; if (!p.ptr.cached) { - if (p.crc.compression_type != compression_type) + if (p.crc.compression_type != compression_type) { *compress_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_compression); + } + + if (p.crc.csum_type != csum_type) { + *csum_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_data_checksum); + } - if (target && !bch2_dev_in_target(c, p.ptr.dev, target)) + if (r.background_target && + !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) { *move_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_target); + } + + unsigned d = bch2_extent_ptr_durability(c, &p); + durability += d; + min_durability = min(min_durability, d); + + ec |= p.has_ec; } ptr_idx <<= 1; } - if (unwritten) - *compress_ptrs = 0; - if (incompressible) + if (unwritten || incompressible) { *compress_ptrs = 0; + r.need_rb &= ~BIT(BCH_REBALANCE_background_compression); + } + + if (unwritten) { + *csum_ptrs = 0; + r.need_rb &= !BIT(BCH_REBALANCE_data_checksum); + } - unsigned rb_ptrs = *move_ptrs | *compress_ptrs; + if (durability < r.data_replicas || durability >= r.data_replicas + min_durability) + r.need_rb |= BIT(BCH_REBALANCE_data_replicas); + return r; +} - if (!rb_ptrs) - return; +static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool *v) +{ + if (v && *v) + return 1; - ptr_idx = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (rb_ptrs & ptr_idx) - *sectors += p.crc.compressed_size; - ptr_idx <<= 1; - } + /* + * If opts need to be propagated to the extent, a scan cookie should be + * present: + */ + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = k.k->type == KEY_TYPE_cookie; + if (v) + *v = ret; + return ret; } -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bkey_s_c k, + struct bch_devs_mask *v) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, §ors); - return sectors; + bkey_for_each_ptr(ptrs, ptr) + if (v && test_bit(ptr->dev, v->d)) + return 1; + + bkey_for_each_ptr(ptrs, ptr) { + int ret = check_rebalance_scan_cookie(trans, ptr->dev + 1, NULL); + if (ret < 0) + return ret; + if (ret) { + if (v) + __set_bit(ptr->dev, v->d); + return ret; + } + } + + return 0; } -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static int new_needs_rb_allowed(struct btree_trans *trans, + struct per_snapshot_io_opts *s, + struct bkey_s_c k, + enum set_needs_rebalance_ctx ctx, + unsigned opt_change_cookie, + const struct bch_extent_rebalance *old, + const struct bch_extent_rebalance *new, + unsigned new_need_rb) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + struct bch_fs *c = trans->c; + /* + * New need_rb - pointers that don't match the current io path options - + * are only allowed in certain situations: + * + * Propagating new options: from bch2_set_rebalance_needs_scan + * + * Foreground writes: background_compression and background_target are + * allowed + * + * Foreground writes: we may have raced with an option change: + * opt_change_cookie checks for this + * + * XXX: foreground writes should still match compression, + * foreground_target - figure out how to check for this + */ + if (ctx == SET_NEEDS_REBALANCE_opt_change || + ctx == SET_NEEDS_REBALANCE_opt_change_indirect) + return 0; + + if (ctx == SET_NEEDS_REBALANCE_foreground) { + new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)| + BIT(BCH_REBALANCE_background_target)); + + /* + * Foreground writes might end up degraded when a device is + * getting yanked: + * + * XXX: this is something we need to fix, but adding retries to + * the write path is something we have to do carefully. + */ + new_need_rb &= ~BIT(BCH_REBALANCE_data_replicas); + if (!new_need_rb) + return 0; + + if (opt_change_cookie != atomic_read(&c->opt_change_cookie)) + return 0; + } - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - return move_ptrs|compress_ptrs; + /* + * Either the extent data or the extent io options (from + * bch_extent_rebalance) should match the io_opts from the + * inode/filesystem, unless + * + * - There's a scan pending to propagate new options + * - It's an indirect extent: it may be referenced by inodes + * with inconsistent options + * + * For efficiency (so that we can cache checking for scan + * cookies), only check option consistency when we're called + * with snapshot_io_opts - don't bother when we're called from + * move_data_phys() -> get_io_opts_one() + * + * Note that we can cache the existence of a cookie, but not the + * non-existence, to avoid spurious false positives. + */ + int ret = check_rebalance_scan_cookie(trans, 0, s ? &s->fs_scan_cookie : NULL) ?: + check_rebalance_scan_cookie(trans, k.k->p.inode, s ? &s->inum_scan_cookie : NULL); + if (ret < 0) + return ret; + if (ret) + return 0; + + if (new_need_rb == BIT(BCH_REBALANCE_data_replicas)) { + ret = check_dev_rebalance_scan_cookie(trans, k, s ? &s->dev_cookie : NULL); + if (ret < 0) + return ret; + if (ret) + return 0; + } + + CLASS(printbuf, buf)(); + + prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n"); + bch2_bkey_val_to_text(&buf, c, k); + + const struct bch_extent_rebalance _old = {}; + if (!old) + old = &_old; + +#define x(_name) \ + if (new_need_rb & BIT(BCH_REBALANCE_##_name)) \ + prt_printf(&buf, "\n" #_name " %u != %u", old->_name, new->_name); + BCH_REBALANCE_OPTS() +#undef x + + fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf); +fsck_err: + return ret; } -static inline bool bkey_should_have_rb_opts(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static inline bool bkey_should_have_rb_opts(struct bkey_s_c k, + struct bch_extent_rebalance new) { if (k.k->type == KEY_TYPE_reflink_v) { -#define x(n) if (opts->n##_from_inode) return true; +#define x(n) if (new.n##_from_inode) return true; BCH_REBALANCE_OPTS() #undef x } - return bch2_bkey_ptrs_need_rebalance(c, opts, k); + return new.need_rb; } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, +int bch2_bkey_set_needs_rebalance(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bch_inode_opts *opts, struct bkey_i *_k, enum set_needs_rebalance_ctx ctx, - u32 change_cookie) + u32 opt_change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; + struct bch_fs *c = trans->c; struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *old = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (bkey_should_have_rb_opts(c, opts, k.s_c)) { + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); + + bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new); + + if (should_have_rb == !!old && + (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old)) + return 0; + + unsigned new_need_rb = new.need_rb & ~(old ? old->need_rb : 0); + + if (unlikely(new_need_rb)) { + int ret = new_needs_rb_allowed(trans, snapshot_io_opts, + k.s_c, ctx, opt_change_cookie, + old, &new, new_need_rb); + if (ret) + return ret; + } + + if (should_have_rb) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } + *old = new; + } else if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); return 0; } static int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *io_opts, struct btree_iter *iter, struct bkey_s_c k, enum set_needs_rebalance_ctx ctx) { struct bch_fs *c = trans->c; + int ret = 0; BUG_ON(iter->flags & BTREE_ITER_is_extents); BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); @@ -269,36 +499,24 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, if (!bkey_extent_is_direct_data(k.k)) return 0; - bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect; + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k); - /* - * If it's an indirect extent, and we walked to it directly, we won't - * have the options from the inode that were directly applied: options - * from the extent take precedence - unless the io_opts option came from - * the inode and may_update_indirect is true (walked from a - * REFLINK_P_MAY_UPDATE_OPTIONS pointer). - */ - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - if (old && k.k->type == KEY_TYPE_reflink_v) { -#define x(_name) \ - if (old->_name##_from_inode && \ - !(may_update_indirect && io_opts->_name##_from_inode)) { \ - io_opts->_name = old->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts); + bool should_have_rb = bkey_should_have_rb_opts(k, new); - if (bkey_should_have_rb_opts(c, io_opts, k) - ? old && !memcmp(old, &new, sizeof(new)) - : !old) + if (should_have_rb == !!old && + (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old)) return 0; struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); + ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -306,7 +524,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, /* On successfull transaction commit, @k was invalidated: */ - return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?: + return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, io_opts, n, ctx, 0) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: bch_err_throw(c, transaction_restart_commit); @@ -349,7 +567,8 @@ static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans, darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_pos.inode; + io_opts->cur_inum = extent_pos.inode; + io_opts->inum_scan_cookie = false; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -372,11 +591,13 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans, enum set_needs_rebalance_ctx ctx) { struct bch_inode_opts *opts = - bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k); + bch2_extent_get_io_opts(trans, snapshot_io_opts, + extent_pos, extent_iter, extent_k); if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level) return opts; - int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx); + int ret = bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, + extent_iter, extent_k, ctx); return ret ? ERR_PTR(ret) : opts; } @@ -420,11 +641,9 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans, if (ret || btree_iter_path(trans, extent_iter)->level) return ret; - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx); + return bch2_get_update_rebalance_opts(trans, NULL, io_opts, extent_iter, extent_k, ctx); } -#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - static const char * const bch2_rebalance_state_strs[] = { #define x(t) #t, BCH_REBALANCE_STATES() @@ -467,6 +686,11 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) return ret; } +int bch2_set_rebalance_needs_scan_device(struct bch_fs *c, unsigned dev) +{ + return bch2_set_rebalance_needs_scan(c, dev + 1); +} + int bch2_set_fs_needs_rebalance(struct bch_fs *c) { return bch2_set_rebalance_needs_scan(c, 0); @@ -535,23 +759,6 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, return &(&darray_pop(buf))->k_i; } -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; - - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - extent_entry_drop(bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, @@ -570,6 +777,10 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (bkey_err(k)) return k; + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + if (!r || !r->need_rb) /* Write buffer race? */ + return bkey_s_c_null; + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, snapshot_io_opts, extent_iter->pos, extent_iter, k, @@ -580,22 +791,66 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, *opts_ret = opts; + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, false); + memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); + data_opts->rewrite_ptrs = move_ptrs|compress_ptrs|csum_ptrs; data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; - if (!data_opts->rewrite_ptrs) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); + if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) { + unsigned durability = bch2_bkey_durability(c, k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned ptr_bit = 1; + + guard(rcu)(); + if (durability <= opts->data_replicas) { + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (ca && !ptr->cached && !ca->mi.durability) + data_opts->kill_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + data_opts->extra_replicas = opts->data_replicas - durability; + } else { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= opts->data_replicas) { + data_opts->kill_ptrs |= ptr_bit; + durability -= d; + } + + ptr_bit <<= 1; + } + + ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.has_ec && durability - p.ec.redundancy >= opts->data_replicas) { + data_opts->kill_ec_ptrs |= ptr_bit; + durability -= p.ec.redundancy; + } + + ptr_bit <<= 1; + } + } + } + + if (!data_opts->rewrite_ptrs && + !data_opts->kill_ptrs && + !data_opts->kill_ec_ptrs && + !data_opts->extra_replicas) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s", buf.buf); return bkey_s_c_null; } @@ -605,12 +860,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; - - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - if (move_ptrs) { prt_str(&buf, "move="); bch2_target_to_text(&buf, c, opts->background_target); @@ -627,6 +876,14 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, prt_newline(&buf); } + if (csum_ptrs) { + prt_str(&buf, "csum="); + bch2_prt_csum_opt(&buf, opts->data_checksum); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, csum_ptrs); + prt_newline(&buf); + } + trace_rebalance_extent(c, buf.buf); } count_event(c, rebalance_extent); @@ -688,8 +945,75 @@ out: return ret; } +static int do_rebalance_scan_bp(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) +{ + if (bp.v->level) /* metadata not supported yet */ + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!k.k) + return 0; + + struct bch_inode_opts io_opts; + ret = bch2_extent_get_io_opts_one(trans, &io_opts, &iter, k, + SET_NEEDS_REBALANCE_opt_change); + bch2_trans_iter_exit(&iter); + return ret; +} + +static int do_rebalance_scan_device(struct moving_context *ctxt, + unsigned dev, u64 cookie, + u64 *sectors_scanned) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &c->rebalance; + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + bch2_btree_write_buffer_flush_sync(trans); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, + POS(dev, 0), POS(dev, U64_MAX), + BTREE_ITER_prefetch, k, ({ + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + + if (k.k->type != KEY_TYPE_backpointer) + continue; + + do_rebalance_scan_bp(trans, bkey_s_c_to_backpointer(k), &last_flushed); + })) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, dev + 1, cookie)); + + *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen); + /* + * Ensure that the rebalance_work entries we created are seen by the + * next iteration of do_rebalance(), so we don't end up stuck in + * rebalance_wait(): + */ + *sectors_scanned += 1; + bch2_move_stats_exit(&r->scan_stats, c); + + bch2_btree_write_buffer_flush_sync(trans); + + bch2_bkey_buf_exit(&last_flushed, c); + return ret; +} + static int do_rebalance_scan_indirect(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *opts) { u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); @@ -702,7 +1026,7 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans, BTREE_ITER_not_extents, k, ({ if (bpos_ge(bkey_start_pos(k.k), POS(0, end))) break; - bch2_get_update_rebalance_opts(trans, opts, &iter, k, + bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, k, SET_NEEDS_REBALANCE_opt_change_indirect); })); if (ret) @@ -724,15 +1048,21 @@ static int do_rebalance_scan(struct moving_context *ctxt, bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ctxt->stats = &r->scan_stats; + r->state = BCH_REBALANCE_scanning; + if (!inum) { r->scan_start = BBPOS_MIN; r->scan_end = BBPOS_MAX; - } else { + } else if (inum >= BCACHEFS_ROOT_INO) { r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); - } + } else { + unsigned dev = inum - 1; + r->scan_start = BBPOS(BTREE_ID_backpointers, POS(dev, 0)); + r->scan_end = BBPOS(BTREE_ID_backpointers, POS(dev, U64_MAX)); - r->state = BCH_REBALANCE_scanning; + return do_rebalance_scan_device(ctxt, inum - 1, cookie, sectors_scanned); + } int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, r->scan_start.pos, r->scan_end.pos, @@ -750,7 +1080,8 @@ static int do_rebalance_scan(struct moving_context *ctxt, (inum && k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) - ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts) + ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), + snapshot_io_opts, opts) : 0); })); if (ret) @@ -1049,6 +1380,7 @@ int bch2_fs_rebalance_init(struct bch_fs *c) static int check_rebalance_work_one(struct btree_trans *trans, struct btree_iter *extent_iter, struct btree_iter *rebalance_iter, + struct per_snapshot_io_opts *snapshot_io_opts, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -1089,8 +1421,7 @@ static int check_rebalance_work_one(struct btree_trans *trans, extent_k.k = &deleted; } - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool should_have_rebalance = bch2_bkey_needs_rb(extent_k); bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; if (should_have_rebalance != have_rebalance) { @@ -1119,6 +1450,21 @@ static int check_rebalance_work_one(struct btree_trans *trans, return ret; } + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, + snapshot_io_opts, extent_iter->pos, extent_iter, extent_k, + SET_NEEDS_REBALANCE_other); + ret = PTR_ERR_OR_ZERO(opts); + if (ret == -BCH_ERR_transaction_restart_commit) { + /* + * If get_apply_io_opts() did work, just advance and check the + * next key; it may have updated the rebalance_work btree so + * we'd need a write buffer flush to check what it just did. + */ + ret = 0; + } + if (ret) + return ret; + if (cmp <= 0) bch2_btree_iter_advance(extent_iter); if (cmp >= 0) @@ -1131,10 +1477,14 @@ int bch2_check_rebalance_work(struct bch_fs *c) { CLASS(btree_trans, trans)(c); CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_not_extents| BTREE_ITER_prefetch); CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, BTREE_ITER_prefetch); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -1148,12 +1498,15 @@ int bch2_check_rebalance_work(struct bch_fs *c) bch2_trans_begin(trans); - ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, + &snapshot_io_opts, &last_flushed) ?: + bch2_trans_commit(trans, NULL, NULL, 0); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_bkey_buf_exit(&last_flushed, c); return ret < 0 ? ret : 0; } |