Diffstat (limited to 'libbcachefs/io.c')
-rw-r--r-- | libbcachefs/io.c | 314
1 file changed, 143 insertions, 171 deletions
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 60a14fa1..82caaf51 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -16,6 +16,7 @@
 #include "checksum.h"
 #include "compress.h"
 #include "clock.h"
+#include "data_update.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "ec.h"
@@ -237,12 +238,14 @@ int bch2_extent_update(struct btree_trans *trans,
 		       struct btree_iter *iter,
 		       struct bkey_i *k,
 		       struct disk_reservation *disk_res,
-		       u64 *journal_seq,
 		       u64 new_i_size,
 		       s64 *i_sectors_delta_total,
 		       bool check_enospc)
 {
 	struct btree_iter inode_iter = { NULL };
+	struct bkey_s_c inode_k;
+	struct bkey_s_c_inode_v3 inode;
+	struct bkey_i_inode_v3 *new_inode;
 	struct bpos next_pos;
 	bool usage_increasing;
 	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
@@ -282,59 +285,51 @@ int bch2_extent_update(struct btree_trans *trans,
 			return ret;
 	}
 
-	if (new_i_size || i_sectors_delta) {
-		struct bkey_s_c k;
-		struct bkey_s_c_inode_v3 inode;
-		struct bkey_i_inode_v3 *new_inode;
-		bool i_size_update;
+	bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
+			     SPOS(0, inum.inum, iter->snapshot),
+			     BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+	inode_k = bch2_btree_iter_peek_slot(&inode_iter);
+	ret = bkey_err(inode_k);
+	if (unlikely(ret))
+		goto err;
 
-		bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
-				     SPOS(0, inum.inum, iter->snapshot),
-				     BTREE_ITER_INTENT|BTREE_ITER_CACHED);
-		k = bch2_btree_iter_peek_slot(&inode_iter);
-		ret = bkey_err(k);
-		if (unlikely(ret))
-			goto err;
+	ret = bkey_is_inode(inode_k.k) ? 0 : -ENOENT;
+	if (unlikely(ret))
+		goto err;
 
-		ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
+	if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) {
+		inode_k = bch2_inode_to_v3(trans, inode_k);
+		ret = bkey_err(inode_k);
 		if (unlikely(ret))
 			goto err;
+	}
 
-		if (unlikely(k.k->type != KEY_TYPE_inode_v3)) {
-			k = bch2_inode_to_v3(trans, k);
-			ret = bkey_err(k);
-			if (unlikely(ret))
-				goto err;
-		}
-
-		inode = bkey_s_c_to_inode_v3(k);
-		i_size_update = !(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
-			new_i_size > le64_to_cpu(inode.v->bi_size);
-
-		if (!i_sectors_delta && !i_size_update)
-			goto no_inode_update;
+	inode = bkey_s_c_to_inode_v3(inode_k);
 
-		new_inode = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-		ret = PTR_ERR_OR_ZERO(new_inode);
-		if (unlikely(ret))
-			goto err;
+	new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k));
+	ret = PTR_ERR_OR_ZERO(new_inode);
+	if (unlikely(ret))
+		goto err;
 
-		bkey_reassemble(&new_inode->k_i, k);
+	bkey_reassemble(&new_inode->k_i, inode.s_c);
 
-		if (i_size_update)
-			new_inode->v.bi_size = cpu_to_le64(new_i_size);
+	if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+	    new_i_size > le64_to_cpu(inode.v->bi_size))
+		new_inode->v.bi_size = cpu_to_le64(new_i_size);
 
-		le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
+	le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
 
-		new_inode->k.p.snapshot = iter->snapshot;
+	new_inode->k.p.snapshot = iter->snapshot;
 
-		ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0);
-		if (unlikely(ret))
-			goto err;
-	}
-no_inode_update:
-	ret = bch2_trans_update(trans, iter, k, 0) ?:
-		bch2_trans_commit(trans, disk_res, journal_seq,
+	/*
+	 * Note:
+	 * We always have to do an inode updated - even when i_size/i_sectors
+	 * aren't changing - for fsync to work properly; fsync relies on
+	 * inode->bi_journal_seq which is updated by the trigger code:
+	 */
+	ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0) ?:
+		bch2_trans_update(trans, iter, k, 0) ?:
+		bch2_trans_commit(trans, disk_res, NULL,
 				BTREE_INSERT_NOCHECK_RW|
 				BTREE_INSERT_NOFAIL);
 	if (unlikely(ret))
@@ -397,8 +392,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
 		bch2_cut_back(end_pos, &delete);
 
 		ret = bch2_extent_update(trans, inum, iter, &delete,
-				&disk_res, NULL,
-				0, i_sectors_delta, false);
+				&disk_res, 0, i_sectors_delta, false);
 		bch2_disk_reservation_put(c, &disk_res);
 	}
 
@@ -428,7 +422,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
 	return ret;
 }
 
-int bch2_write_index_default(struct bch_write_op *op)
+static int bch2_write_index_default(struct bch_write_op *op)
 {
 	struct bch_fs *c = op->c;
 	struct bkey_buf sk;
@@ -465,7 +459,7 @@ int bch2_write_index_default(struct bch_write_op *op)
 				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
 		ret = bch2_extent_update(&trans, inum, &iter, sk.k,
-					 &op->res, op_journal_seq(op),
+					 &op->res,
 					 op->new_i_size, &op->i_sectors_delta,
 					 op->flags & BCH_WRITE_CHECK_ENOSPC);
 		bch2_trans_iter_exit(&trans, &iter);
@@ -543,29 +537,22 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 	}
 }
 
-static void __bch2_write(struct closure *);
+static void __bch2_write(struct bch_write_op *);
 
 static void bch2_write_done(struct closure *cl)
 {
 	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 	struct bch_fs *c = op->c;
 
-	if (!op->error && (op->flags & BCH_WRITE_FLUSH))
-		op->error = bch2_journal_error(&c->journal);
-
 	bch2_disk_reservation_put(c, &op->res);
 	percpu_ref_put(&c->writes);
 	bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
 	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 
-	if (op->end_io) {
-		EBUG_ON(cl->parent);
-		closure_debug_destroy(cl);
+	closure_debug_destroy(cl);
+	if (op->end_io)
 		op->end_io(op);
-	} else {
-		closure_return(cl);
-	}
 }
 
 static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
@@ -603,7 +590,7 @@ static void __bch2_write_index(struct bch_write_op *op)
 	struct keylist *keys = &op->insert_keys;
 	struct bkey_i *k;
 	unsigned dev;
-	int ret;
+	int ret = 0;
 
 	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
 		ret = bch2_write_drop_io_error_ptrs(op);
@@ -626,7 +613,10 @@ static void __bch2_write_index(struct bch_write_op *op)
 
 	if (!bch2_keylist_empty(keys)) {
 		u64 sectors_start = keylist_sectors(keys);
-		int ret = op->index_update_fn(op);
+
+		ret = !(op->flags & BCH_WRITE_MOVE)
+			? bch2_write_index_default(op)
+			: bch2_data_update_index_update(op);
 
 		BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
 		BUG_ON(keylist_sectors(keys) && !ret);
@@ -636,7 +626,7 @@ static void __bch2_write_index(struct bch_write_op *op)
 		if (ret) {
 			bch_err_inum_ratelimited(c, op->pos.inode,
 				"write error while doing btree update: %s", bch2_err_str(ret));
-			op->error = ret;
+			goto err;
 		}
 	}
 out:
@@ -649,25 +639,45 @@ out:
 err:
 	keys->top = keys->keys;
 	op->error = ret;
+	op->flags |= BCH_WRITE_DONE;
 	goto out;
 }
 
 static void bch2_write_index(struct closure *cl)
 {
 	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-	struct bch_fs *c = op->c;
+	struct write_point *wp = op->wp;
+	struct workqueue_struct *wq = index_update_wq(op);
 
-	__bch2_write_index(op);
+	barrier();
+	op->btree_update_ready = true;
+	queue_work(wq, &wp->index_update_work);
+}
 
-	if (!(op->flags & BCH_WRITE_DONE)) {
-		continue_at(cl, __bch2_write, index_update_wq(op));
-	} else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
-		bch2_journal_flush_seq_async(&c->journal,
-					     *op_journal_seq(op),
-					     cl);
-		continue_at(cl, bch2_write_done, index_update_wq(op));
-	} else {
-		continue_at_nobarrier(cl, bch2_write_done, NULL);
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+	struct write_point *wp =
+		container_of(work, struct write_point, index_update_work);
+	struct bch_write_op *op;
+
+	while (1) {
+		spin_lock(&wp->writes_lock);
+		op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+		if (op && !op->btree_update_ready)
+			op = NULL;
+		if (op)
+			list_del(&op->wp_list);
+		spin_unlock(&wp->writes_lock);
+
+		if (!op)
+			break;
+
+		__bch2_write_index(op);
+
+		if (!(op->flags & BCH_WRITE_DONE))
+			__bch2_write(op);
+		else
+			bch2_write_done(&op->cl);
 	}
 }
@@ -700,12 +710,12 @@ static void bch2_write_endio(struct bio *bio)
 	if (wbio->put_bio)
 		bio_put(bio);
 
-	if (parent)
+	if (parent) {
 		bio_endio(&parent->bio);
-	else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
-		closure_put(cl);
-	else
-		continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
+		return;
+	}
+
+	closure_put(cl);
 }
 
 static void init_append_extent(struct bch_write_op *op,
@@ -1112,19 +1122,18 @@ err:
 	return ret;
 }
 
-static void __bch2_write(struct closure *cl)
+static void __bch2_write(struct bch_write_op *op)
 {
-	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 	struct bch_fs *c = op->c;
-	struct write_point *wp;
+	struct write_point *wp = NULL;
 	struct bio *bio = NULL;
-	bool skip_put = true;
 	unsigned nofs_flags;
 	int ret;
 
 	nofs_flags = memalloc_nofs_save();
again:
 	memset(&op->failed, 0, sizeof(op->failed));
+	op->btree_update_ready = false;
 
 	do {
 		struct bkey_i *key_to_write;
@@ -1134,76 +1143,60 @@ again:
 
 		/* +1 for possible cache device: */
 		if (op->open_buckets.nr + op->nr_replicas + 1 >
 		    ARRAY_SIZE(op->open_buckets.v))
-			goto flush_io;
+			break;
 
 		if (bch2_keylist_realloc(&op->insert_keys,
 					op->inline_keys,
 					ARRAY_SIZE(op->inline_keys),
 					BKEY_EXTENT_U64s_MAX))
-			goto flush_io;
+			break;
 
 		/*
 		 * The copygc thread is now global, which means it's no longer
 		 * freeing up space on specific disks, which means that
 		 * allocations for specific disks may hang arbitrarily long:
 		 */
-		wp = bch2_alloc_sectors_start(c,
-			op->target,
-			op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
-			op->write_point,
-			&op->devs_have,
-			op->nr_replicas,
-			op->nr_replicas_required,
-			op->alloc_reserve,
-			op->flags,
-			(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
-				      BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
-		EBUG_ON(!wp);
-
-		if (IS_ERR(wp)) {
-			if (unlikely(wp != ERR_PTR(-EAGAIN))) {
-				ret = PTR_ERR(wp);
-				goto err;
+		ret = bch2_trans_do(c, NULL, NULL, 0,
+			bch2_alloc_sectors_start_trans(&trans,
+				op->target,
+				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+				op->write_point,
+				&op->devs_have,
+				op->nr_replicas,
+				op->nr_replicas_required,
+				op->alloc_reserve,
+				op->flags,
+				(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+					      BCH_WRITE_ONLY_SPECIFIED_DEVS))
+				? NULL : &op->cl, &wp));
+		if (unlikely(ret)) {
+			if (unlikely(ret != -EAGAIN)) {
+				op->error = ret;
+				op->flags |= BCH_WRITE_DONE;
 			}
-			goto flush_io;
+			break;
 		}
 
-		/*
-		 * It's possible for the allocator to fail, put us on the
-		 * freelist waitlist, and then succeed in one of various retry
-		 * paths: if that happens, we need to disable the skip_put
-		 * optimization because otherwise there won't necessarily be a
-		 * barrier before we free the bch_write_op:
-		 */
-		if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
-			skip_put = false;
-
 		bch2_open_bucket_get(c, wp, &op->open_buckets);
 		ret = bch2_write_extent(op, wp, &bio);
 
-		bch2_alloc_sectors_done(c, wp);
-		if (ret < 0)
-			goto err;
+		bch2_alloc_sectors_done(c, wp);
 
-		if (ret) {
-			skip_put = false;
-		} else {
-			/*
-			 * for the skip_put optimization this has to be set
-			 * before we submit the bio:
-			 */
+		if (ret < 0) {
+			op->error = ret;
 			op->flags |= BCH_WRITE_DONE;
+			break;
 		}
 
+		if (!ret)
+			op->flags |= BCH_WRITE_DONE;
+
 		bio->bi_end_io	= bch2_write_endio;
 		bio->bi_private	= &op->cl;
 		bio->bi_opf |= REQ_OP_WRITE;
 
-		if (!skip_put)
-			closure_get(bio->bi_private);
-		else
-			op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+		closure_get(bio->bi_private);
 
 		key_to_write = (void *) (op->insert_keys.keys_p +
					 key_to_write_offset);
@@ -1212,48 +1205,34 @@ again:
 					  key_to_write);
 	} while (ret);
 
-	if (!skip_put)
-		continue_at(cl, bch2_write_index, index_update_wq(op));
-out:
-	memalloc_nofs_restore(nofs_flags);
-	return;
-err:
-	op->error = ret;
-	op->flags |= BCH_WRITE_DONE;
-
-	continue_at(cl, bch2_write_index, index_update_wq(op));
-	goto out;
-flush_io:
 	/*
-	 * If the write can't all be submitted at once, we generally want to
-	 * block synchronously as that signals backpressure to the caller.
+	 * Sync or no?
 	 *
-	 * However, if we're running out of a workqueue, we can't block here
-	 * because we'll be blocking other work items from completing:
+	 * If we're running asynchronously, wne may still want to block
+	 * synchronously here if we weren't able to submit all of the IO at
+	 * once, as that signals backpressure to the caller.
 	 */
-	if (current->flags & PF_WQ_WORKER) {
-		continue_at(cl, bch2_write_index, index_update_wq(op));
-		goto out;
-	}
-
-	closure_sync(cl);
-
-	if (!bch2_keylist_empty(&op->insert_keys)) {
+	if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) {
+		closure_sync(&op->cl);
 		__bch2_write_index(op);
 
-		if (op->error) {
-			op->flags |= BCH_WRITE_DONE;
-			continue_at_nobarrier(cl, bch2_write_done, NULL);
-			goto out;
-		}
+		if (!(op->flags & BCH_WRITE_DONE))
+			goto again;
+		bch2_write_done(&op->cl);
+	} else {
+		spin_lock(&wp->writes_lock);
+		op->wp = wp;
+		list_add_tail(&op->wp_list, &wp->writes);
+		spin_unlock(&wp->writes_lock);
+
+		continue_at(&op->cl, bch2_write_index, NULL);
 	}
 
-	goto again;
+	memalloc_nofs_restore(nofs_flags);
 }
 
 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 {
-	struct closure *cl = &op->cl;
 	struct bio *bio = &op->wbio.bio;
 	struct bvec_iter iter;
 	struct bkey_i_inline_data *id;
@@ -1290,8 +1269,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
 	op->flags |= BCH_WRITE_DONE;
 
-	continue_at_nobarrier(cl, bch2_write_index, NULL);
-	return;
+	__bch2_write_index(op);
 err:
 	bch2_write_done(&op->cl);
 }
@@ -1319,6 +1297,7 @@ void bch2_write(struct closure *cl)
 	struct bch_fs *c = op->c;
 	unsigned data_len;
 
+	EBUG_ON(op->cl.parent);
 	BUG_ON(!op->nr_replicas);
 	BUG_ON(!op->write_point.v);
 	BUG_ON(!bkey_cmp(op->pos, POS_MAX));
@@ -1352,24 +1331,19 @@ void bch2_write(struct closure *cl)
 		return;
 	}
 
-	continue_at_nobarrier(cl, __bch2_write, NULL);
+	__bch2_write(op);
 	return;
err:
 	bch2_disk_reservation_put(c, &op->res);
 
-	if (op->end_io) {
-		EBUG_ON(cl->parent);
-		closure_debug_destroy(cl);
+	closure_debug_destroy(&op->cl);
+	if (op->end_io)
 		op->end_io(op);
-	} else {
-		closure_return(cl);
-	}
 }
 
 /* Cache promotion on read */
 
 struct promote_op {
-	struct closure		cl;
 	struct rcu_head		rcu;
 	u64			start_time;
 
@@ -1423,10 +1397,10 @@ static void promote_free(struct bch_fs *c, struct promote_op *op)
 	kfree_rcu(op, rcu);
 }
 
-static void promote_done(struct closure *cl)
+static void promote_done(struct bch_write_op *wop)
 {
 	struct promote_op *op =
-		container_of(cl, struct promote_op, cl);
+		container_of(wop, struct promote_op, write.op);
 	struct bch_fs *c = op->write.op.c;
 
 	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
@@ -1438,7 +1412,6 @@ static void promote_done(struct closure *cl)
 
 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 {
-	struct closure *cl = &op->cl;
 	struct bio *bio = &op->write.op.wbio.bio;
 
 	trace_and_count(op->write.op.c, read_promote, &rbio->bio);
@@ -1451,9 +1424,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
 	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
 
-	closure_init(cl, NULL);
-	bch2_data_update_read_done(&op->write, rbio->pick.crc, cl);
-	closure_return_with_destructor(cl, promote_done);
+	bch2_data_update_read_done(&op->write, rbio->pick.crc);
 }
 
 static struct promote_op *__promote_alloc(struct bch_fs *c,
@@ -1518,6 +1489,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
 			}, btree_id, k);
 	BUG_ON(ret);
 
+	op->write.op.end_io = promote_done;
 	return op;
err:
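
The change above replaces the closure-chained write path (continue_at() hops between __bch2_write, bch2_write_index and bch2_write_done) with an explicit hand-off: a write op either finishes synchronously, or it is queued on its write_point's list and marked btree_update_ready, and bch2_write_point_do_index_updates() drains that list in order and performs the btree index updates. The stand-alone sketch below is only a simplified userspace model of that queue-and-drain shape under those assumptions; my_write_op, my_write_point, queue_write_op() and do_index_update() are illustrative stand-ins, not bcachefs APIs.

/*
 * Simplified model of the hand-off introduced above: write ops whose data IO
 * has completed are appended to a per-write-point list and flagged ready, and
 * a single drain routine performs the "index update" for each op in order.
 * All names here are hypothetical; this is not bcachefs code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct my_write_op {
	struct my_write_op	*next;
	int			id;
	bool			btree_update_ready;
};

struct my_write_point {
	pthread_mutex_t		lock;
	struct my_write_op	*head, *tail;
};

/* Stand-in for __bch2_write_index(): apply the op's keys to the index. */
static void do_index_update(struct my_write_op *op)
{
	printf("index update for write op %d\n", op->id);
}

/* Producer side: queue the op and mark it ready (cf. bch2_write_index()). */
static void queue_write_op(struct my_write_point *wp, struct my_write_op *op)
{
	pthread_mutex_lock(&wp->lock);
	op->next = NULL;
	if (wp->tail)
		wp->tail->next = op;
	else
		wp->head = op;
	wp->tail = op;
	op->btree_update_ready = true;
	pthread_mutex_unlock(&wp->lock);
}

/* Worker side: drain ready ops in submission order
 * (cf. bch2_write_point_do_index_updates()). */
static void do_index_updates(struct my_write_point *wp)
{
	while (1) {
		struct my_write_op *op;

		pthread_mutex_lock(&wp->lock);
		op = wp->head;
		if (op && !op->btree_update_ready)
			op = NULL;
		if (op) {
			wp->head = op->next;
			if (!wp->head)
				wp->tail = NULL;
		}
		pthread_mutex_unlock(&wp->lock);

		if (!op)
			break;

		do_index_update(op);
		free(op);
	}
}

int main(void)
{
	struct my_write_point wp = { .lock = PTHREAD_MUTEX_INITIALIZER };

	for (int i = 0; i < 3; i++) {
		struct my_write_op *op = calloc(1, sizeof(*op));

		op->id = i;
		queue_write_op(&wp, op);
	}

	do_index_updates(&wp);
	return 0;
}

Draining strictly from the head and stopping at the first op that is not yet ready preserves submission order for the index updates, which is the same property the per-write_point worker relies on in the patch.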