diff options
Diffstat (limited to 'libbcachefs/move.c')
-rw-r--r-- | libbcachefs/move.c | 276 |
1 file changed, 193 insertions, 83 deletions
diff --git a/libbcachefs/move.c b/libbcachefs/move.c index e5a46ba6..a176484a 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -58,6 +58,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) BKEY_PADDED(k) _new, _insert; struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; + unsigned nr_dirty; bool did_work = false; if (btree_iter_err(k)) { @@ -71,6 +72,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) m->ptr, m->offset)) goto nomatch; + if (m->data_cmd == DATA_REWRITE && + !bch2_extent_has_device(bkey_s_c_to_extent(k), + m->data_opts.rewrite_dev)) + goto nomatch; + bkey_reassemble(&_insert.k, k); insert = bkey_i_to_extent(&_insert.k); @@ -81,11 +87,12 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); - if (m->move_dev >= 0 && - (ptr = (struct bch_extent_ptr *) - bch2_extent_has_device(extent_i_to_s_c(insert), - m->move_dev))) + if (m->data_cmd == DATA_REWRITE) { + ptr = (struct bch_extent_ptr *) + bch2_extent_has_device(extent_i_to_s_c(insert), + m->data_opts.rewrite_dev); bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); + } extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) { if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) { @@ -108,10 +115,35 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_extent_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, extent_i_to_s(insert).s); - bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert)); + bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), + c->opts.data_replicas); + + /* + * It's possible we race, and for whatever reason the extent now + * has fewer replicas than when we last looked at it - meaning + * we need to get a disk reservation here: + */ + nr_dirty = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)); + if (m->nr_ptrs_reserved < nr_dirty) { + unsigned sectors = (nr_dirty - 
m->nr_ptrs_reserved) * + keylist_sectors(keys); + + /* + * can't call bch2_disk_reservation_add() with btree + * locks held, at least not without a song and dance + */ + bch2_btree_iter_unlock(&iter); + + ret = bch2_disk_reservation_add(c, &op->res, sectors, 0); + if (ret) + goto out; + + m->nr_ptrs_reserved = nr_dirty; + goto next; + } - ret = bch2_check_mark_super(c, BCH_DATA_USER, - bch2_extent_devs(extent_i_to_s_c(insert))); + ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, + extent_i_to_s_c(insert).s_c); if (ret) break; @@ -119,7 +151,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) NULL, op_journal_seq(op), BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| - m->btree_insert_flags, + BTREE_INSERT_USE_RESERVE| + m->data_opts.btree_insert_flags, BTREE_INSERT_ENTRY(&iter, &insert->k_i)); if (!ret) atomic_long_inc(&c->extent_migrate_done); @@ -150,8 +183,7 @@ out: return ret; } -void bch2_migrate_write_init(struct migrate_write *m, - struct bch_read_bio *rbio) +void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) { /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); @@ -162,16 +194,39 @@ void bch2_migrate_write_init(struct migrate_write *m, m->op.pos = rbio->pos; m->op.version = rbio->version; m->op.crc = rbio->pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { m->op.nonce = m->op.crc.nonce + m->op.crc.offset; m->op.csum_type = m->op.crc.csum_type; } - if (m->move_dev >= 0) - bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev); + if (m->data_cmd == DATA_REWRITE) + bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); +} + +int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, + struct bch_devs_mask *devs, + struct write_point_specifier wp, + struct bch_io_opts io_opts, + enum data_cmd data_cmd, + struct data_opts data_opts, + struct bkey_s_c k) +{ + int ret; - if (m->btree_insert_flags & 
BTREE_INSERT_USE_RESERVE) + m->data_cmd = data_cmd; + m->data_opts = data_opts; + m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k); + + bch2_write_op_init(&m->op, c); + m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum); + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.compression]; + m->op.devs = devs; + m->op.write_point = wp; + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) m->op.alloc_reserve = RESERVE_MOVINGGC; m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| @@ -180,10 +235,35 @@ void bch2_migrate_write_init(struct migrate_write *m, BCH_WRITE_DATA_ENCODED| BCH_WRITE_NOMARK_REPLICAS; - m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; m->op.nr_replicas = 1; m->op.nr_replicas_required = 1; m->op.index_update_fn = bch2_migrate_index_update; + + switch (data_cmd) { + case DATA_ADD_REPLICAS: + if (m->nr_ptrs_reserved < c->opts.data_replicas) { + m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved; + + ret = bch2_disk_reservation_get(c, &m->op.res, + k.k->size, + m->op.nr_replicas, 0); + if (ret) + return ret; + + m->nr_ptrs_reserved = c->opts.data_replicas; + } + break; + case DATA_REWRITE: + break; + case DATA_PROMOTE: + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + m->op.flags |= BCH_WRITE_CACHED; + break; + default: + BUG(); + } + + return 0; } static void move_free(struct closure *cl) @@ -210,7 +290,7 @@ static void move_write(struct closure *cl) struct moving_io *io = container_of(cl, struct moving_io, cl); if (likely(!io->rbio.bio.bi_status)) { - bch2_migrate_write_init(&io->write, &io->rbio); + bch2_migrate_read_done(&io->write, &io->rbio); closure_call(&io->write.op.cl, bch2_write, NULL, cl); } @@ -238,19 +318,19 @@ static void move_read_endio(struct bio *bio) } static int bch2_move_extent(struct bch_fs *c, - struct moving_context *ctxt, - struct bch_devs_mask *devs, - struct write_point_specifier wp, - int btree_insert_flags, - int move_device, - struct bch_io_opts opts, - struct 
bkey_s_c_extent e) + struct moving_context *ctxt, + struct bch_devs_mask *devs, + struct write_point_specifier wp, + struct bch_io_opts io_opts, + struct bkey_s_c_extent e, + enum data_cmd data_cmd, + struct data_opts data_opts) { struct extent_pick_ptr pick; struct moving_io *io; const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; - unsigned sectors = e.k->size, pages, nr_good; + unsigned sectors = e.k->size, pages; int ret = -ENOMEM; bch2_extent_pick_ptr(c, e.s_c, NULL, &pick); @@ -279,7 +359,7 @@ static int bch2_move_extent(struct bch_fs *c, if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) goto err_free; - io->rbio.opts = opts; + io->rbio.opts = io_opts; bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); io->rbio.bio.bi_iter.bi_size = sectors << 9; @@ -288,27 +368,10 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k); io->rbio.bio.bi_end_io = move_read_endio; - io->write.btree_insert_flags = btree_insert_flags; - io->write.move_dev = move_device; - - bch2_write_op_init(&io->write.op, c); - io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum); - io->write.op.compression_type = - bch2_compression_opt_to_type(opts.compression); - io->write.op.devs = devs; - io->write.op.write_point = wp; - - if (move_device < 0 && - ((nr_good = bch2_extent_nr_good_ptrs(c, e)) < - c->opts.data_replicas)) { - io->write.op.nr_replicas = c->opts.data_replicas - nr_good; - - ret = bch2_disk_reservation_get(c, &io->write.op.res, - e.k->size, - io->write.op.nr_replicas, 0); - if (ret) - goto err_free_pages; - } + ret = bch2_migrate_write_init(c, &io->write, devs, wp, + io_opts, data_cmd, data_opts, e.s_c); + if (ret) + goto err_free_pages; atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(e.k->size, &ctxt->stats->sectors_moved); @@ -369,8 +432,6 @@ int bch2_move_data(struct bch_fs *c, unsigned sectors_in_flight, struct 
bch_devs_mask *devs, struct write_point_specifier wp, - int btree_insert_flags, - int move_device, struct bpos start, struct bpos end, move_pred_fn pred, void *arg, @@ -378,12 +439,14 @@ int bch2_move_data(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct moving_context ctxt = { .stats = stats }; - struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; struct bkey_s_c k; struct bkey_s_c_extent e; + struct data_opts data_opts; + enum data_cmd data_cmd; u64 cur_inum = U64_MAX; - int ret = 0; + int ret = 0, ret2; closure_init_stack(&ctxt.cl); INIT_LIST_HEAD(&ctxt.reads); @@ -430,28 +493,44 @@ peek: /* don't hold btree locks while looking up inode: */ bch2_btree_iter_unlock(&stats->iter); - opts = bch2_opts_to_inode_opts(c->opts); + io_opts = bch2_opts_to_inode_opts(c->opts); if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) - bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode)); + bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); cur_inum = k.k->p.inode; goto peek; } - if (!pred(arg, e)) + switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e, + &io_opts, &data_opts))) { + case DATA_SKIP: goto next; + case DATA_SCRUB: + BUG(); + case DATA_ADD_REPLICAS: + case DATA_REWRITE: + case DATA_PROMOTE: + break; + default: + BUG(); + } /* unlock before doing IO: */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&stats->iter); - if (bch2_move_extent(c, &ctxt, devs, wp, - btree_insert_flags, - move_device, opts, - bkey_s_c_to_extent(k))) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); - continue; + ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts, + bkey_s_c_to_extent(k), + data_cmd, data_opts); + if (ret2) { + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(&ctxt); + continue; + } + + /* XXX 
signal failure */ + goto next; } if (rate) @@ -486,11 +565,11 @@ static int bch2_gc_data_replicas(struct bch_fs *c) int ret; mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); + bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { - ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k)); + ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); if (ret) break; } @@ -514,8 +593,8 @@ static int bch2_gc_btree_replicas(struct bch_fs *c) for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - ret = bch2_check_mark_super(c, BCH_DATA_BTREE, - bch2_bkey_devs(bkey_i_to_s_c(&b->key))); + ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, + bkey_i_to_s_c(&b->key)); bch2_btree_iter_cond_resched(&iter); } @@ -534,18 +613,35 @@ static int bch2_move_btree(struct bch_fs *c, void *arg, struct bch_move_stats *stats) { + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree *b; unsigned id; + struct data_opts data_opts; + enum data_cmd cmd; int ret = 0; stats->data_type = BCH_DATA_BTREE; for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - if (pred(arg, bkey_i_to_s_c_extent(&b->key))) - ret = bch2_btree_node_rewrite(c, &stats->iter, - b->data->keys.seq, 0) ?: ret; + switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE, + bkey_i_to_s_c_extent(&b->key), + &io_opts, + &data_opts))) { + case DATA_SKIP: + goto next; + case DATA_SCRUB: + BUG(); + case DATA_ADD_REPLICAS: + case DATA_REWRITE: + break; + default: + BUG(); + } + ret = bch2_btree_node_rewrite(c, &stats->iter, + b->data->keys.seq, 0) ?: ret; +next: bch2_btree_iter_cond_resched(&stats->iter); } @@ -556,32 +652,48 @@ static int bch2_move_btree(struct bch_fs *c, } #if 0 -static bool scrub_data_pred(void *arg, struct bkey_s_c_extent e) +static enum data_cmd 
scrub_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) { + return DATA_SCRUB; } #endif -static bool rereplicate_metadata_pred(void *arg, struct bkey_s_c_extent e) +static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) { - struct bch_fs *c = arg; unsigned nr_good = bch2_extent_nr_good_ptrs(c, e); + unsigned replicas = type == BKEY_TYPE_BTREE + ? c->opts.metadata_replicas + : c->opts.data_replicas; - return nr_good && nr_good < c->opts.metadata_replicas; -} + if (!nr_good || nr_good >= replicas) + return DATA_SKIP; -static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e) -{ - struct bch_fs *c = arg; - unsigned nr_good = bch2_extent_nr_good_ptrs(c, e); - - return nr_good && nr_good < c->opts.data_replicas; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; } -static bool migrate_pred(void *arg, struct bkey_s_c_extent e) +static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) { struct bch_ioctl_data *op = arg; - return bch2_extent_has_device(e, op->migrate.dev); + if (!bch2_extent_has_device(e, op->migrate.dev)) + return DATA_SKIP; + + data_opts->btree_insert_flags = 0; + data_opts->rewrite_dev = op->migrate.dev; + return DATA_REWRITE; } int bch2_data_job(struct bch_fs *c, @@ -595,16 +707,15 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_JOURNAL; ret = bch2_journal_flush_device(&c->journal, -1); - ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret; + ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret; ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE, NULL, writepoint_hashed((unsigned long) current), - 0, 
-1, op.start, op.end, - rereplicate_data_pred, c, stats) ?: ret; + rereplicate_pred, c, stats) ?: ret; ret = bch2_gc_data_replicas(c) ?: ret; break; case BCH_DATA_OP_MIGRATE: @@ -620,7 +731,6 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE, NULL, writepoint_hashed((unsigned long) current), - 0, -1, op.start, op.end, migrate_pred, &op, stats) ?: ret; |