diff options
Diffstat (limited to 'fs/bcachefs/io.c')
-rw-r--r-- | fs/bcachefs/io.c | 398 |
1 files changed, 240 insertions, 158 deletions
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f26d4041cdda..792f1df5d0a1 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Some low level IO code, and hacks for various block layer limitations * @@ -6,7 +7,7 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -15,6 +16,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "io.h" @@ -22,7 +24,6 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" -#include "replicas.h" #include "super.h" #include "super-io.h" @@ -121,10 +122,11 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, i, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -202,20 +204,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); const struct bch_extent_ptr *ptr; struct bch_write_bio *n; struct bch_dev *ca; BUG_ON(c->opts.nochanges); - extent_for_each_ptr(e, ptr) { + bkey_for_each_ptr(ptrs, ptr) { BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || !c->devs[ptr->dev]); ca = bch_dev_bkey_exists(c, ptr->dev); - if (ptr + 1 < &extent_entry_last(e)->ptr) { + if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, &ca->replica_set)); @@ -276,19 +278,44 @@ static void bch2_write_done(struct closure *cl) int bch2_write_index_default(struct bch_write_op *op) { + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter *iter; struct keylist *keys = &op->insert_keys; - struct btree_iter iter; int ret; - bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); + BUG_ON(bch2_keylist_empty(keys)); + bch2_verify_keylist_sorted(keys); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); + + do { + BKEY_PADDED(k) split; + + bkey_copy(&split.k, bch2_keylist_front(keys)); - ret = bch2_btree_insert_list_at(&iter, keys, &op->res, - NULL, op_journal_seq(op), + bch2_extent_trim_atomic(&split.k, iter); + + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &split.k)); + + ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); - bch2_btree_iter_unlock(&iter); + if (ret) + break; + + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); return ret; } @@ -300,31 +327,23 @@ static void __bch2_write_index(struct bch_write_op *op) { struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n, *k; + unsigned dev; int ret; for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); bkey_copy(dst, src); - e = bkey_i_to_s_extent(dst); - extent_for_each_ptr_backwards(e, ptr) - if (test_bit(ptr->dev, op->failed.d)) - bch2_extent_drop_ptr(e, ptr); + bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, + test_bit(ptr->dev, op->failed.d)); - if (!bch2_extent_nr_ptrs(e.c)) { + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { ret = -EIO; goto err; } - if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c); - if (ret) - goto err; - } - dst = bkey_next(dst); } @@ -352,7 +371,11 @@ static void __bch2_write_index(struct bch_write_op *op) } } out: - bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + + bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; @@ -411,16 +434,32 @@ static void init_append_extent(struct bch_write_op *op, struct bversion version, struct bch_extent_crc_unpacked crc) { + struct bch_fs *c = op->c; struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + struct extent_ptr_decoded p = { .crc = crc }; + struct open_bucket *ob; + unsigned i; op->pos.offset += crc.uncompressed_size; - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.version = version; - bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + + BUG_ON(crc.compressed_size > wp->sectors_free); + wp->sectors_free -= crc.compressed_size; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bch2_extent_crc_append(e, crc); - bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); + p.ptr = ob->ptr; + p.ptr.cached = !ca->mi.durability || + (op->flags & BCH_WRITE_CACHED) != 0; + p.ptr.offset += ca->mi.bucket_size - ob->sectors_free; + bch2_extent_ptr_decoded_append(e, &p); + + BUG_ON(crc.compressed_size > ob->sectors_free); + ob->sectors_free -= crc.compressed_size; + } bch2_keylist_push(&op->insert_keys); } @@ -428,7 +467,8 @@ static void init_append_extent(struct bch_write_op *op, static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct write_point *wp, struct bio *src, - bool *page_alloc_failed) + bool *page_alloc_failed, + void *buf) { struct bch_write_bio *wbio; struct bio *bio; @@ -438,11 +478,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); - wbio->bounce = true; wbio->put_bio = true; /* copy WRITE_SYNC flag */ wbio->bio.bi_opf = src->bi_opf; + if (buf) { + bio->bi_iter.bi_size = output_available; + bch2_bio_map(bio, buf); + return bio; + } + + wbio->bounce = true; + /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: @@ -607,14 +654,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; struct bkey_i *key_to_write; + void *ec_buf; unsigned key_to_write_offset = op->insert_keys.top_p - op->insert_keys.keys_p; - unsigned total_output = 0; - bool bounce = false, page_alloc_failed = false; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; int ret, more = 0; BUG_ON(!bio_sectors(src)); + ec_buf = bch2_writepoint_ec_buf(c, wp); + switch (bch2_write_prep_encoded_data(op, wp)) { case PREP_ENCODED_OK: break; @@ -624,16 +675,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } init_append_extent(op, wp, op->version, op->crc); goto do_write; } - if (op->compression_type || + if (ec_buf || + op->compression_type || (op->csum_type && !(op->flags & BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); bounce = true; } @@ -736,7 +797,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); - total_output += dst_len; + total_output += dst_len; + total_input += src_len; } while (dst->bi_iter.bi_size && src->bi_iter.bi_size && wp->sectors_free && @@ -749,16 +811,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter = saved_iter; - if (!bounce && more) { - dst = bio_split(src, total_output >> 9, + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, GFP_NOIO, &c->bio_write); - wbio_init(dst)->put_bio = true; + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; } dst->bi_iter.bi_size = total_output; /* Free unneeded pages after compressing: */ - if (bounce) + if (to_wbio(dst)->bounce) while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, &c->bio_bounce_pages); @@ -767,6 +833,10 @@ do_write: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + bch2_ec_add_backpointer(c, wp, + bkey_start_pos(&key_to_write->k), + total_input >> 9); + dst->bi_end_io = bch2_write_endio; dst->bi_private = &op->cl; bio_set_op_attrs(dst, REQ_OP_WRITE, 0); @@ -781,10 +851,10 @@ csum_err: "rewriting existing data (memory corruption?)"); ret = -EIO; err: - if (bounce) { + if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) bio_put(dst); - } return ret; } @@ -796,10 +866,12 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; int ret; again: + memset(&op->failed, 0, sizeof(op->failed)); + do { /* +1 for possible cache device: */ - if (op->open_buckets_nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets)) + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) goto flush_io; if (bch2_keylist_realloc(&op->insert_keys, @@ -810,6 +882,7 @@ again: wp = bch2_alloc_sectors_start(c, op->target, + op->opts.erasure_code, op->write_point, &op->devs_have, op->nr_replicas, @@ -830,11 +903,7 @@ again: ret = bch2_write_extent(op, wp); - BUG_ON(op->open_buckets_nr + wp->nr_ptrs - wp->first_ptr > - ARRAY_SIZE(op->open_buckets)); - bch2_open_bucket_get(c, wp, - &op->open_buckets_nr, - op->open_buckets); + bch2_open_bucket_get(c, wp, &op->open_buckets); bch2_alloc_sectors_done(c, wp); if (ret < 0) @@ -889,12 +958,9 @@ void bch2_write(struct closure *cl) BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); - BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); op->start_time = local_clock(); - memset(&op->failed, 0, sizeof(op->failed)); - bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(&op->wbio.bio)->put_bio = false; @@ -917,6 +983,7 @@ void bch2_write(struct closure *cl) struct promote_op { struct closure cl; + struct rcu_head rcu; u64 start_time; struct rhash_head hash; @@ -937,23 +1004,23 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts opts, unsigned flags) { - if (!opts.promote_target) + if (!bkey_extent_is_data(k.k)) return false; if (!(flags & BCH_READ_MAY_PROMOTE)) return false; - if (percpu_ref_is_dying(&c->writes)) - return false; - - if (!bkey_extent_is_data(k.k)) + if (!opts.promote_target) return false; - if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target)) + if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), + opts.promote_target)) return false; - if (bch2_target_congested(c, opts.promote_target)) + if (bch2_target_congested(c, opts.promote_target)) { + /* XXX trace this */ return false; + } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) @@ -970,7 +1037,7 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) bch_promote_params); BUG_ON(ret); percpu_ref_put(&c->writes); - kfree(op); + kfree_rcu(op, rcu); } static void promote_done(struct closure *cl) @@ -1012,7 +1079,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, struct bpos pos, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned rbio_sectors, struct bch_read_bio **rbio) @@ -1093,7 +1160,7 @@ err: static inline struct promote_op *promote_alloc(struct bch_fs *c, struct bvec_iter iter, struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned flags, struct bch_read_bio **rbio, @@ -1187,29 +1254,31 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, + unsigned flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; BKEY_PADDED(k) tmp; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - rbio->pos, BTREE_ITER_SLOTS); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) goto err; - } bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); if (!bkey_extent_is_data(k.k) || !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), @@ -1221,44 +1290,49 @@ retry: goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) goto err; - goto out; -err: - rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_rbio_done(rbio); + bch2_trans_exit(&trans); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; } static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret; + bch2_trans_init(&trans, c, 0, 0); + flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; retry: - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); bytes = min_t(unsigned, bvec_iter.bi_size, (k.k->p.offset - bvec_iter.bi_sector) << 9); swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); switch (ret) { case READ_RETRY: goto retry; @@ -1277,12 +1351,12 @@ retry: * If we get here, it better have been because there was an error * reading a btree node */ - ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); - __bcache_io_error(c, "btree IO error %i", ret); + __bcache_io_error(c, "btree IO error: %i", ret); err: rbio->bio.bi_status = BLK_STS_IOERR; out: + bch2_trans_exit(&trans); bch2_rbio_done(rbio); } @@ -1294,14 +1368,12 @@ static void bch2_rbio_retry(struct work_struct *work) struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; u64 inode = rbio->pos.inode; - struct bch_devs_mask avoid; + struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); - memset(&avoid, 0, sizeof(avoid)); - if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ptr.dev, avoid.d); + bch2_mark_io_failure(&failed, &rbio->pick); rbio->bio.bi_status = 0; @@ -1311,9 +1383,9 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); else - bch2_read_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &failed, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1338,21 +1410,25 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_extent *e; BKEY_PADDED(k) new; struct bch_extent_crc_unpacked new_crc; - unsigned offset; + u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; int ret; if (rbio->pick.crc.compression_type) return; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c, 0, 0); retry: - k = bch2_btree_iter_peek(&iter); + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(iter); if (IS_ERR_OR_NULL(k.k)) goto out; @@ -1363,24 +1439,19 @@ retry: e = bkey_i_to_extent(&new.k); if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || + rbio->pick.ptr, data_offset) || bversion_cmp(e->k.version, rbio->version)) goto out; /* Extent was merged? */ - if (bkey_start_offset(&e->k) < rbio->pos.offset || - e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size) + if (bkey_start_offset(&e->k) < data_offset || + e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; - /* The extent might have been partially overwritten since we read it: */ - offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset); - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - offset, e->k.size, - rbio->pick.crc.csum_type)) { + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(&e->k) - data_offset, e->k.size, + rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); goto out; } @@ -1388,19 +1459,19 @@ retry: if (!bch2_extent_narrow_crcs(e, new_crc)) goto out; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOWAIT, - BTREE_INSERT_ENTRY(&iter, &e->k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT); if (ret == -EINTR) goto retry; out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static bool should_narrow_crcs(struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && @@ -1553,9 +1624,9 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; struct bch_dev *ca; struct promote_op *promote = NULL; @@ -1563,14 +1634,16 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; - pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ if (!pick_ret) goto hole; - if (pick_ret < 0) - goto no_device; + if (pick_ret < 0) { + __bcache_io_error(c, "no device to read from"); + goto err; + } if (pick_ret > 0) ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1695,31 +1768,46 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (!rbio->have_ioref) - goto no_device_postclone; - - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read(&c->mark_lock); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read(&c->mark_lock); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], - bio_sectors(&rbio->bio)); + if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + __bcache_io_error(c, "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } - if (likely(!(flags & BCH_READ_IN_RETRY))) { - if (!(flags & BCH_READ_LAST_FRAGMENT)) { - bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; } - submit_bio(&rbio->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { return 0; } else { int ret; - submit_bio_wait(&rbio->bio); - rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); @@ -1727,29 +1815,19 @@ noclone: rbio = bch2_rbio_free(rbio); if (ret == READ_RETRY_AVOID) { - __set_bit(pick.ptr.dev, avoid->d); + bch2_mark_io_failure(failed, &pick); ret = READ_RETRY; } return ret; } -no_device_postclone: - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - bch2_rbio_free(rbio); -no_device: - __bcache_io_error(c, "no device to read from"); - - if (likely(!(flags & BCH_READ_IN_RETRY))) { - orig->bio.bi_status = BLK_STS_IOERR; - - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; - } else { +err: + if (flags & BCH_READ_IN_RETRY) return READ_ERR; - } + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; hole: /* @@ -1761,7 +1839,7 @@ hole: orig->hole = true; zero_fill_bio_iter(&orig->bio, iter); - +out_read_done: if (flags & BCH_READ_LAST_FRAGMENT) bch2_rbio_done(orig); return 0; @@ -1769,13 +1847,16 @@ hole: void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED; int ret; + bch2_trans_init(&trans, c, 0, 0); + BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); BUG_ON(flags & BCH_READ_IN_RETRY); @@ -1783,9 +1864,9 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; @@ -1795,7 +1876,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); @@ -1817,9 +1898,10 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) * If we get here, it better have been because there was an error * reading a btree node */ - ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); + + bch2_trans_exit(&trans); bch2_rbio_done(rbio); } |