Diffstat (limited to 'libbcachefs/io.c')
-rw-r--r-- | libbcachefs/io.c | 832
1 file changed, 557 insertions, 275 deletions
diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 27e45081..bb656522 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -14,6 +14,7 @@ #include "compress.h" #include "clock.h" #include "debug.h" +#include "disk_groups.h" #include "error.h" #include "extents.h" #include "io.h" @@ -30,14 +31,71 @@ #include <trace/events/bcachefs.h> -/* Allocate, free from mempool: */ +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target); + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); -void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) { + atomic64_t *latency = &ca->cur_latency[rw]; u64 now = local_clock(); - unsigned io_latency = (now >> 10) - submit_time_us; - atomic_t *latency = &ca->latency[rw]; - unsigned old, new, v = atomic_read(latency); + u64 io_latency = time_after64(now, submit_time) + ? 
now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); do { old = v; @@ -51,10 +109,16 @@ void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) now & ~(~0 << 5)) break; - new = ewma_add((u64) old, io_latency, 6); - } while ((v = atomic_cmpxchg(latency, old, new)) != old); + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } +/* Allocate, free from mempool: */ + void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bio_vec *bv; @@ -169,22 +233,21 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } n->c = c; - n->ca = ca; - n->submit_time_us = local_clock_us(); + n->dev = ptr->dev; + n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; if (!journal_flushes_device(ca)) n->bio.bi_opf |= REQ_FUA; - if (likely(percpu_ref_tryget(&ca->io_ref))) { + if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); - n->have_io_ref = true; bio_set_dev(&n->bio, ca->disk_sb.bdev); submit_bio(&n->bio); } else { - n->have_io_ref = false; n->bio.bi_status = BLK_STS_REMOVED; bio_endio(&n->bio); } @@ -196,15 +259,18 @@ static void __bch2_write(struct closure *); static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch2_journal_error(&op->c->journal); + op->error = bch2_journal_error(&c->journal); if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(op->c, &op->res); - percpu_ref_put(&op->c->writes); + bch2_disk_reservation_put(c, &op->res); + percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); + bch2_time_stats_update(&c->data_write_time, op->start_time); + closure_return(cl); } @@ -318,15 +384,15 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; - - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) - set_bit(ca->dev_idx, op->failed.d); + set_bit(wbio->dev, op->failed.d); - if (wbio->have_io_ref) + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); percpu_ref_put(&ca->io_ref); + } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -821,6 +887,8 @@ void bch2_write(struct closure *cl) BUG_ON(!bkey_cmp(op->pos, POS_MAX)); BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); + op->start_time = local_clock(); + memset(&op->failed, 0, sizeof(op->failed)); bch2_keylist_init(&op->insert_keys, op->inline_keys); @@ -844,19 +912,72 @@ void bch2_write(struct closure *cl) struct promote_op { struct closure cl; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + struct migrate_write write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + if (!opts.promote_target) + return false; + + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + if (!bkey_extent_is_data(k.k)) + return false; + + if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target)) + return false; + + if (bch2_target_congested(c, opts.promote_target)) + return false; + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + percpu_ref_put(&c->writes); + kfree(op); +} + static void promote_done(struct closure *cl) { struct promote_op *op = container_of(cl, struct promote_op, cl); struct bch_fs *c = op->write.op.c; - percpu_ref_put(&c->writes); + bch2_time_stats_update(&c->data_promote_time, op->start_time); + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); - kfree(op); + promote_free(c, op); } static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) @@ -865,17 +986,15 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) struct closure *cl = &op->cl; struct bio *bio = &op->write.op.wbio.bio; - BUG_ON(!rbio->split || !rbio->bounce); - - if (!percpu_ref_tryget(&c->writes)) - return; - trace_promote(&rbio->bio); /* we now own pages: */ + BUG_ON(!rbio->bounce); BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - rbio->promote = NULL; bch2_migrate_read_done(&op->write, rbio); @@ -884,69 +1003,120 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) closure_return_with_destructor(cl, promote_done); } -/* - * XXX: multiple promotes can race with each other, wastefully. Keep a list of - * outstanding promotes? 
- */ -static struct promote_op *promote_alloc(struct bch_read_bio *rbio, - struct bkey_s_c k) +noinline +static struct promote_op *__promote_alloc(struct bch_fs *c, + struct bpos pos, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned rbio_sectors, + struct bch_read_bio **rbio) { - struct bch_fs *c = rbio->c; - struct promote_op *op; + struct promote_op *op = NULL; struct bio *bio; + unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS); /* data might have to be decompressed in the write path: */ - unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size, - PAGE_SECTORS); + unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size, + PAGE_SECTORS); int ret; - BUG_ON(!rbio->bounce); - BUG_ON(pages < rbio->bio.bi_vcnt); + if (!percpu_ref_tryget(&c->writes)) + return NULL; - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages, GFP_NOIO); if (!op) - return NULL; + goto err; - bio = &op->write.op.wbio.bio; - bio_init(bio, bio->bi_inline_vecs, pages); + op->start_time = local_clock(); + op->pos = pos; - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + /* + * promotes require bouncing, but if the extent isn't + * checksummed/compressed it might be too big for the mempool: + */ + if (rbio_sectors > c->sb.encoded_extent_max) { + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * rbio_pages, + GFP_NOIO); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, + rbio_pages); + + (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9; + bch2_bio_map(&(*rbio)->bio, NULL); + + if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + } + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, bio->bi_inline_vecs, wbio_pages); ret = bch2_migrate_write_init(c, &op->write, writepoint_hashed((unsigned long) current), - rbio->opts, + opts, DATA_PROMOTE, (struct data_opts) { - .target = rbio->opts.promote_target + .target = opts.promote_target }, - k); + bkey_s_c_null); BUG_ON(ret); return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + percpu_ref_put(&c->writes); + return NULL; } -static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, - unsigned flags, u16 target) +static inline struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) { - if (!target) - return false; - - if (!(flags & BCH_READ_MAY_PROMOTE)) - return false; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + unsigned sectors = promote_full + ? pick->crc.compressed_size + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? 
bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; - if (percpu_ref_is_dying(&c->writes)) - return false; + promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; - return bch2_extent_has_target(c, e, target) == NULL; + *bounce = true; + *read_full = promote_full; + return promote; } /* Read */ -static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *, - struct bvec_iter, u64, - struct bch_devs_mask *, unsigned); - #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -979,38 +1149,144 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { - struct bch_read_bio *parent = rbio->parent; - - BUG_ON(!rbio->split); + BUG_ON(rbio->bounce && !rbio->split); if (rbio->promote) - kfree(rbio->promote); + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - bio_put(&rbio->bio); - return parent; + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; } static void bch2_rbio_done(struct bch_read_bio *rbio) { - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; - - if (rbio->split) - rbio = bch2_rbio_free(rbio); + bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time); bio_endio(&rbio->bio); } +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k)) { + bch2_btree_iter_unlock(&iter); + goto err; + } + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (!bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; + goto out; +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + +static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; +retry: + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS, k) { + BKEY_PADDED(k) tmp; + unsigned bytes; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + bytes = min_t(unsigned, bvec_iter.bi_size, + (k.k->p.offset - bvec_iter.bi_sector) << 9); + swap(bvec_iter.bi_size, bytes); + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + switch (ret) { + case READ_RETRY: + goto retry; + case READ_ERR: + goto err; + }; + + if (bytes == bvec_iter.bi_size) + 
goto out; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } + + /* + * If we get here, it better have been because there was an error + * reading a btree node + */ + ret = bch2_btree_iter_unlock(&iter); + BUG_ON(!ret); + __bcache_io_error(c, "btree IO error %i", ret); +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->pos.inode; struct bch_devs_mask avoid; trace_read_retry(&rbio->bio); @@ -1018,26 +1294,19 @@ static void bch2_rbio_retry(struct work_struct *work) memset(&avoid, 0, sizeof(avoid)); if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ca->dev_idx, avoid.d); + __set_bit(rbio->pick.ptr.dev, avoid.d); - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; + rbio->bio.bi_status = 0; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - else - rbio->bio.bi_status = 0; + rbio = bch2_rbio_free(rbio); - if (!(flags & BCH_READ_NODECODE)) - flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); else - __bch2_read(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &avoid, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1049,7 +1318,9 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, return; if (retry == READ_ERR) { - bch2_rbio_parent(rbio)->bio.bi_status = error; + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; bch2_rbio_done(rbio); } else { bch2_rbio_punt(rbio, bch2_rbio_retry, @@ -1121,12 +1392,13 @@ out: bch2_btree_iter_unlock(&iter); } -static bool should_narrow_crcs(struct bkey_s_c_extent e, +static bool should_narrow_crcs(struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(e, pick->crc); + bkey_extent_is_data(k.k) && + bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); } /* Inner part that may run in process context */ @@ -1134,8 +1406,10 @@ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); @@ -1191,10 +1465,13 @@ static void __bch2_read_endio(struct work_struct *work) */ bch2_encrypt_bio(c, crc.csum_type, nonce, src); promote_start(rbio->promote, rbio); + rbio->promote = NULL; } nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); + } return; csum_err: /* @@ -1208,7 +1485,7 @@ csum_err: return; } - 
bch2_dev_io_error(rbio->pick.ca, + bch2_dev_io_error(ca, "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, @@ -1227,25 +1504,27 @@ static void bch2_read_endio(struct bio *bio) { struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ); - - percpu_ref_put(&rbio->pick.ca->io_ref); + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) { + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (rbio->pick.ptr.cached && (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) { + ptr_stale(ca, &rbio->pick.ptr))) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -1266,76 +1545,97 @@ static void bch2_read_endio(struct bio *bio) } int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, unsigned flags) + struct bvec_iter iter, struct bkey_s_c k, + struct bch_devs_mask *avoid, unsigned flags) { - struct bch_read_bio *rbio; - bool split = false, bounce = false, read_full = false; - bool promote = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(e.k); - int ret = 0; + struct extent_pick_ptr pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos pos = bkey_start_pos(k.k); + int pick_ret; - lg_local_lock(&c->usage_lock); - bucket_io_clock_reset(c, pick->ca, - PTR_BUCKET_NR(pick->ca, &pick->ptr), READ); - lg_local_unlock(&c->usage_lock); + pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; - narrow_crcs = should_narrow_crcs(e, pick, flags); + if (pick_ret < 0) + goto no_device; + + if (pick_ret > 0) + ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (flags & BCH_READ_NODECODE) { - BUG_ON(iter.bi_size < pick->crc.compressed_size << 9); - iter.bi_size = pick->crc.compressed_size << 9; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_sector = pos.offset; + iter.bi_size = pick.crc.compressed_size << 9; goto noclone; } + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = should_narrow_crcs(k, &pick, flags); + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector || - e.k->p.offset < bvec_iter_end_sector(iter)); + EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || + k.k->p.offset < bvec_iter_end_sector(iter)); - if (pick->crc.compression_type != BCH_COMPRESSION_NONE || - (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != 
pick->crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick->crc.csum_type) && + if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; bounce = true; } - promote = should_promote(c, e, flags, orig->opts.promote_target); - /* could also set read_full */ - if (promote) - bounce = true; + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); if (!read_full) { - EBUG_ON(pick->crc.compression_type); - EBUG_ON(pick->crc.csum_type && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - bvec_iter_sectors(iter) != pick->crc.live_size || - pick->crc.offset || + EBUG_ON(pick.crc.compression_type); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || iter.bi_sector != pos.offset)); - pick->ptr.offset += pick->crc.offset + + pick.ptr.offset += pick.crc.offset + (iter.bi_sector - pos.offset); - pick->crc.compressed_size = bvec_iter_sectors(iter); - pick->crc.uncompressed_size = bvec_iter_sectors(iter); - pick->crc.offset = 0; - pick->crc.live_size = bvec_iter_sectors(iter); + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); pos.offset = iter.bi_sector; } - if (bounce) { - unsigned sectors = pick->crc.compressed_size; + if (rbio) { + /* promote already allocated bounce rbio */ + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split), + DIV_ROUND_UP(sectors, PAGE_SECTORS), + &c->bio_read_split), orig->opts); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - split = true; + rbio->bounce = true; + rbio->split = true; } else if (flags & BCH_READ_MUST_CLONE) { /* * Have to clone if there were any splits, due to error @@ -1349,156 +1649,138 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, &c->bio_read_split), orig->opts); rbio->bio.bi_iter = iter; - split = true; + rbio->split = true; } else { noclone: rbio = orig; rbio->bio.bi_iter = iter; - split = false; BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size); + BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; - if (split) + rbio->submit_time = local_clock(); + if (rbio->split) rbio->parent = orig; else rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; - rbio->submit_time_us = local_clock_us(); rbio->flags = flags; - rbio->bounce = bounce; - rbio->split = split; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; rbio->retry = 0; rbio->context = 0; - rbio->devs_have = bch2_extent_devs(e); - rbio->pick = *pick; + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; rbio->pos = pos; - rbio->version = e.k->version; - rbio->promote = promote ? 
promote_alloc(rbio, e.s_c) : NULL; + rbio->version = k.k->version; + rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev); rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick->ptr.offset; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (bounce) + if (rbio->bounce) trace_read_bounce(&rbio->bio); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], + + if (!rbio->have_ioref) + goto no_device_postclone; + + lg_local_lock(&c->usage_lock); + bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); + lg_local_unlock(&c->usage_lock); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (!(flags & BCH_READ_LAST_FRAGMENT)) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } + submit_bio(&rbio->bio); + return 0; } else { + int ret; + submit_bio_wait(&rbio->bio); rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); ret = rbio->retry; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - if (!ret) - bch2_rbio_done(rbio); - } - - return ret; -} - -static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) -{ - struct extent_pick_ptr pick; - struct btree_iter iter; - BKEY_PADDED(k) tmp; - struct bkey_s_c k; - int ret; - - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); -retry: - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); - goto err; - } - - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + rbio = bch2_rbio_free(rbio); - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || - bkey_start_offset(k.k) != bvec_iter.bi_sector) - goto err; + if (ret == READ_RETRY_AVOID) { + __set_bit(pick.ptr.dev, avoid->d); + ret = READ_RETRY; + } - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; + return ret; } - if (!pick.ca) - goto err; +no_device_postclone: + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + bch2_rbio_free(rbio); +no_device: + __bcache_io_error(c, "no device to read from"); - if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) { - percpu_ref_put(&pick.ca->io_ref); - goto err; + if (likely(!(flags & BCH_READ_IN_RETRY))) { + orig->bio.bi_status = BLK_STS_IOERR; + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; + } else { + return READ_ERR; } - ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - bio_endio(&rbio->bio); - return; - }; - - return; -err: +hole: /* - * extent we wanted to read no longer exists, or - * was merged or partially overwritten (and thus - * possibly bigger than the memory that was - * originally allocated) + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we 
have to signal that: */ - rbio->bio.bi_status = BLK_STS_AGAIN; - bio_endio(&rbio->bio); - return; + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) +void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_iter iter; struct bkey_s_c k; + unsigned flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED; int ret; - EBUG_ON(flags & BCH_READ_NODECODE); -retry: + BUG_ON(rbio->_state); + BUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_IN_RETRY); + + rbio->c = c; + rbio->start_time = local_clock(); + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), + POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS, k) { BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - struct bvec_iter fragment; + unsigned bytes; /* * Unlock the iterator while the btree node's lock is still in @@ -1508,49 +1790,20 @@ retry: k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&iter); - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } - - fragment = bvec_iter; - fragment.bi_size = (min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector) << 9; + bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, + (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + swap(rbio->bio.bi_iter.bi_size, bytes); - if (pick.ca) { - if (fragment.bi_size != bvec_iter.bi_size) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; - ret = __bch2_read_extent(c, rbio, fragment, - bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - return; - }; - } else { - zero_fill_bio_iter(&rbio->bio, fragment); - - if (fragment.bi_size == bvec_iter.bi_size) - bio_endio(&rbio->bio); - } + bch2_read_extent(c, rbio, k, flags); - if (fragment.bi_size == bvec_iter.bi_size) + if (flags & BCH_READ_LAST_FRAGMENT) return; - bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size); + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); } /* @@ -1560,5 +1813,34 @@ retry: ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); + bch2_rbio_done(rbio); +} + +void bch2_fs_io_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->sb.encoded_extent_max) / + PAGE_SECTORS, 0) 
|| + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + + return 0; }
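
The congestion heuristic introduced near the top of the diff is easy to lose in the noise, so here is a compact userspace sketch of the same decision. In the new bch2_target_congested(), each device carries a congested counter that bch2_congested_acct() bumps on slow IOs and decrements on fast ones; the counter decays by roughly one unit per ~4us since the last bump (the >> 12 on a nanosecond clock), and the target is reported congested probabilistically by comparing a random value in [0, nr_devices * CONGESTED_MAX) against the summed, decayed counters. Everything below is illustrative only: the struct, the CONGESTED_MAX value and rand() stand in for the real struct bch_dev, the bcachefs constant, and bch2_rand_range().

/*
 * Illustrative sketch of the target-congestion check added as
 * bch2_target_congested()/bch2_congested_acct() in this diff.  The real
 * code walks the target's device mask under RCU and uses bch2_rand_range();
 * CONGESTED_MAX here is a placeholder, not the bcachefs value.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CONGESTED_MAX 1024	/* assumed placeholder */

struct dev {
	int64_t  congested;		/* bumped on slow IOs, decremented on fast ones */
	uint64_t congested_last;	/* ns timestamp of the last bump */
};

static bool target_congested(struct dev *devs, unsigned nr, uint64_t now_ns)
{
	uint64_t total = 0;
	unsigned i;

	for (i = 0; i < nr; i++) {
		int64_t c = devs[i].congested;

		/* decay: one unit per ~4us since the last bump (>> 12 of ns) */
		if (now_ns > devs[i].congested_last)
			c -= (int64_t) ((now_ns - devs[i].congested_last) >> 12);

		total += c > 0 ? c : 0;
	}

	/*
	 * Probabilistic answer: the more congested the target's devices,
	 * the more likely the caller (e.g. should_promote()) backs off.
	 */
	return (uint64_t) (rand() % (nr * CONGESTED_MAX)) < total;
}

int main(void)
{
	struct dev devs[2] = {
		{ .congested = 800, .congested_last = 0 },
		{ .congested =  10, .congested_last = 0 },
	};

	printf("congested at t=1ms: %d\n",
	       target_congested(devs, 2, 1000000));
	return 0;
}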
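The comment in bch2_congested_acct() notes that the bump is "approximately latency_over * 4 / latency_threshold" without paying for the divide; the code uses latency_over >> max(ilog2(latency_threshold) - 2, 0) instead. The standalone check below (arbitrary test values, not bcachefs code) shows why that is acceptable: because 2^ilog2(threshold) is within a factor of two of threshold, the shifted estimate stays within roughly a factor of two of the exact ratio.

/*
 * Why the shift in bch2_congested_acct() is a good enough stand-in for
 * latency_over * 4 / latency_threshold: 2^ilog2(threshold) is within a
 * factor of two of threshold, so the shifted estimate is too.  Test values
 * below are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

static int ilog2_u64(uint64_t v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	uint64_t threshold, over;

	for (threshold = 1000; threshold <= 1000000; threshold *= 10)
		for (over = threshold / 2; over <= threshold * 8; over *= 2) {
			int shift = ilog2_u64(threshold) - 2;
			uint64_t exact, approx;

			if (shift < 0)
				shift = 0;

			exact  = over * 4 / threshold;
			approx = over >> shift;

			printf("threshold %7llu over %8llu exact %3llu approx %3llu\n",
			       (unsigned long long) threshold,
			       (unsigned long long) over,
			       (unsigned long long) exact,
			       (unsigned long long) approx);
		}

	return 0;
}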
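Finally, both the rewritten bch2_read() and the new bch2_read_retry() walk a request one extent at a time with the same idiom: temporarily swap the iterator's remaining size down to the bytes covered by the current extent, issue that fragment (flagging the last one), then swap the full size back and advance. A toy version of that walk, with a plain array standing in for the btree and byte offsets standing in for the bio iterator:

/*
 * Toy version of the per-extent fragment walk in bch2_read()/
 * bch2_read_retry(): clamp the remaining size to the current extent via
 * swap, issue the fragment, restore, advance.  Extents and offsets here
 * are plain numbers, not bkeys or bvec_iters.
 */
#include <stdio.h>

#define SWAP(a, b) do { unsigned _t = (a); (a) = (b); (b) = _t; } while (0)

struct extent {
	unsigned start, end;	/* [start, end) in bytes */
};

static void read_fragment(unsigned offset, unsigned size, int last)
{
	printf("fragment: offset %u size %u%s\n",
	       offset, size, last ? " (last)" : "");
}

int main(void)
{
	struct extent extents[] = {
		{     0,  4096 },
		{  4096, 12288 },
		{ 12288, 65536 },
	};
	unsigned offset = 1024, size = 20480;	/* 20KiB request at 1KiB */
	unsigned i = 0;

	while (size) {
		/* bytes of the request covered by the current extent */
		unsigned bytes = extents[i].end - offset;

		if (bytes > size)
			bytes = size;

		SWAP(size, bytes);	/* size is now just this fragment */
		read_fragment(offset, size, size == bytes);
		SWAP(size, bytes);	/* back to the full remaining size */

		offset += bytes;
		size   -= bytes;
		i++;
	}
	return 0;
}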