Diffstat (limited to 'libbcachefs/io.c')
-rw-r--r--  libbcachefs/io.c | 1209
1 file changed, 743 insertions, 466 deletions
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index e5fc72da..0c41e411 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -29,6 +29,29 @@
/* Allocate, free from mempool: */
+void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw)
+{
+ u64 now = local_clock();
+ unsigned io_latency = (now >> 10) - submit_time_us;
+ atomic_t *latency = &ca->latency[rw];
+ unsigned old, new, v = atomic_read(latency);
+
+ do {
+ old = v;
+
+ /*
+ * If the IO latency is reasonably close to the current running
+ * average, skip the update and the atomic operation - but still
+ * do it occasionally (when the low bits of the clock are zero,
+ * roughly 1 time in 32) so the average keeps tracking:
+ */
+ if (abs((int) (old - io_latency)) < (old >> 1) &&
+ now & ~(~0 << 5))
+ break;
+
+ new = ewma_add((u64) old, io_latency, 6);
+ } while ((v = atomic_cmpxchg(latency, old, new)) != old);
+}
+
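bch2_latency_acct() above maintains a per-device running latency average without taking a lock: read the current value, compute the new exponentially weighted moving average, and publish it with a compare-and-swap, retrying if another IO completion raced with the update. A minimal userspace sketch of the same pattern, using C11 atomics and an illustrative ewma_update() helper (an assumption for this sketch, not the kernel's actual ewma_add()):

	#include <stdatomic.h>

	/* One EWMA step: new = old - old/2^weight + sample/2^weight */
	static unsigned ewma_update(unsigned old, unsigned sample, unsigned weight)
	{
		return old - (old >> weight) + (sample >> weight);
	}

	/* Lockless accumulator, same shape as the latency update above. */
	static void latency_sample(_Atomic unsigned *avg, unsigned sample)
	{
		unsigned old = atomic_load_explicit(avg, memory_order_relaxed);

		/* On failure the CAS reloads 'old' with the current value, so
		 * the new EWMA is recomputed from fresh state on every retry: */
		while (!atomic_compare_exchange_weak_explicit(avg, &old,
					ewma_update(old, sample, 6),
					memory_order_relaxed, memory_order_relaxed))
			;
	}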
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
struct bio_vec *bv;
@@ -63,10 +86,12 @@ pool_alloc:
}
void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t bytes)
+ size_t bytes)
{
bool using_mempool = false;
+ BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
+
bio->bi_iter.bi_size = bytes;
while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
@@ -76,7 +101,35 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
mutex_unlock(&c->bio_bounce_pages_lock);
}
-/* Bios with headers */
+void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
+ size_t bytes)
+{
+ while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+ BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
+
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (!bv->bv_page) {
+ /*
+ * We already allocated from the mempool; we can't allocate from it
+ * again without first freeing the pages we already took, or we
+ * could deadlock:
+ */
+ bch2_bio_free_pages_pool(c, bio);
+ bch2_bio_alloc_pages_pool(c, bio, bytes);
+ return;
+ }
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+ bio->bi_vcnt++;
+ }
+
+ bio->bi_iter.bi_size = bytes;
+}
+
+/* Writes */
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
@@ -137,17 +190,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
}
-/* IO errors */
-
-/* Writes */
-
-static struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->alloc_reserve == RESERVE_MOVINGGC
- ? op->c->copygc_wq
- : op->c->wq;
-}
-
static void __bch2_write(struct closure *);
static void bch2_write_done(struct closure *cl)
@@ -176,7 +218,7 @@ static u64 keylist_sectors(struct keylist *keys)
return ret;
}
-static int bch2_write_index_default(struct bch_write_op *op)
+int bch2_write_index_default(struct bch_write_op *op)
{
struct keylist *keys = &op->insert_keys;
struct btree_iter iter;
@@ -202,7 +244,6 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
- unsigned i;
op->flags |= BCH_WRITE_LOOPED;
@@ -220,13 +261,7 @@ static void bch2_write_index(struct closure *cl)
}
}
- for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
- if (op->open_buckets[i]) {
- bch2_open_bucket_put(c,
- c->open_buckets +
- op->open_buckets[i]);
- op->open_buckets[i] = 0;
- }
+ bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE))
continue_at(cl, __bch2_write, op->io_wq);
@@ -287,6 +322,8 @@ static void bch2_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
+ bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+
if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
set_bit(ca->dev_idx, op->failed.d);
set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
@@ -307,179 +344,364 @@ static void bch2_write_endio(struct bio *bio)
closure_put(cl);
}
-static struct nonce extent_nonce(struct bversion version,
- unsigned nonce,
- unsigned uncompressed_size,
- unsigned compression_type)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32((nonce << 12) |
- (uncompressed_size << 22)),
- [1] = cpu_to_le32(version.lo),
- [2] = cpu_to_le32(version.lo >> 32),
- [3] = cpu_to_le32(version.hi|
- (compression_type << 24))^BCH_NONCE_EXTENT,
- }};
-}
-
static void init_append_extent(struct bch_write_op *op,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type,
- struct open_bucket *ob)
+ struct write_point *wp,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc)
{
struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
- op->pos.offset += uncompressed_size;
+ op->pos.offset += crc.uncompressed_size;
e->k.p = op->pos;
- e->k.size = uncompressed_size;
- e->k.version = op->version;
+ e->k.size = crc.uncompressed_size;
+ e->k.version = version;
bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
- bch2_extent_crc_append(e, compressed_size,
- uncompressed_size,
- compression_type,
- nonce, csum, csum_type);
-
- bch2_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas,
- ob, compressed_size);
+ bch2_extent_crc_append(e, crc);
+ bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED));
bch2_keylist_push(&op->insert_keys);
}
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+ struct write_point *wp,
+ struct bio *src,
+ bool *page_alloc_failed)
{
- struct bch_fs *c = op->c;
- struct bio *orig = &op->wbio.bio;
- struct bio *bio;
struct bch_write_bio *wbio;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
- struct bkey_i *key_to_write;
- unsigned csum_type = op->csum_type;
- unsigned compression_type = op->compression_type;
- int ret, more;
+ struct bio *bio;
+ unsigned output_available =
+ min(wp->sectors_free << 9, src->bi_iter.bi_size);
+ unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
+
+ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
+ wbio = wbio_init(bio);
+ wbio->bounce = true;
+ wbio->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ wbio->bio.bi_opf = src->bi_opf;
+
+ /*
+ * We can't use mempool for more than c->sb.encoded_extent_max
+ * worth of pages, but we'd like to allocate more if we can:
+ */
+ while (bio->bi_iter.bi_size < output_available) {
+ unsigned len = min_t(unsigned, PAGE_SIZE,
+ output_available - bio->bi_iter.bi_size);
+ struct page *p;
+
+ p = alloc_page(GFP_NOIO);
+ if (!p) {
+ unsigned pool_max =
+ min_t(unsigned, output_available,
+ c->sb.encoded_extent_max << 9);
+
+ if (bio_sectors(bio) < pool_max)
+ bch2_bio_alloc_pages_pool(c, bio, pool_max);
+ break;
+ }
+
+ bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
+ .bv_page = p,
+ .bv_len = len,
+ .bv_offset = 0,
+ };
+ bio->bi_iter.bi_size += len;
+ }
- /* don't refetch csum type/compression type */
- barrier();
+ *page_alloc_failed = bio->bi_vcnt < pages;
+ return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+ struct bch_write_op *op,
+ unsigned new_csum_type)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bch_extent_crc_unpacked new_crc;
+ int ret;
- BUG_ON(!bio_sectors(orig));
+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type))
+ new_csum_type = op->crc.csum_type;
+
+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
+ if (ret)
+ return ret;
+
+ bio_advance(bio, op->crc.offset << 9);
+ bio->bi_iter.bi_size = op->crc.live_size << 9;
+ op->crc = new_crc;
+ return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ struct bch_csum csum;
+
+ if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+ return 0;
+
+ /*
+ * If we need to decrypt data in the write path, we'll no longer be able
+ * to verify the existing checksum (poly1305 mac, in this case) after
+ * it's decrypted - this is the last point we'll be able to reverify the
+ * checksum:
+ */
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ if (bch2_crc_cmp(op->crc.csum, csum))
+ return -EIO;
+
+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ return 0;
+}
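The comment in bch2_write_decrypt() captures an ordering constraint: the poly1305 MAC authenticates the encrypted bytes, so it can only be checked while those bytes still exist - decrypting in place destroys exactly the data the MAC covers, which is why the bio is checksummed first and crc.csum_type is cleared afterwards. A self-contained toy showing that verify-then-decrypt order (the "MAC" and "cipher" below are deliberately trivial stand-ins, not real cryptography):

	#include <stdint.h>
	#include <stddef.h>

	/* Toy MAC over the ciphertext (a real implementation would use poly1305): */
	static uint32_t toy_mac(const uint8_t *buf, size_t len)
	{
		uint32_t m = 0;

		while (len--)
			m = m * 31 + *buf++;
		return m;
	}

	/* Toy stream cipher applied in place (stand-in for a real cipher): */
	static void toy_stream_xor(uint8_t *buf, size_t len, uint8_t keystream)
	{
		while (len--)
			*buf++ ^= keystream;
	}

	static int verify_then_decrypt(uint8_t *buf, size_t len,
				       uint32_t expected_mac, uint8_t keystream)
	{
		/* Check the MAC while the ciphertext it covers is still intact: */
		if (toy_mac(buf, len) != expected_mac)
			return -1;			/* corrupt: leave data untouched */

		toy_stream_xor(buf, len, keystream);	/* now safe to overwrite */
		return 0;				/* old MAC no longer applies */
	}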
+
+static enum prep_encoded_ret {
+ PREP_ENCODED_OK,
+ PREP_ENCODED_ERR,
+ PREP_ENCODED_CHECKSUM_ERR,
+ PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+ struct bch_fs *c = op->c;
+ struct bio *bio = &op->wbio.bio;
- /* Need to decompress data? */
- if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
- (crc_uncompressed_size(NULL, &op->crc) != op->size ||
- crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) {
- int ret;
+ if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+ return PREP_ENCODED_OK;
- ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
- if (ret)
- return ret;
+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
- op->flags &= ~BCH_WRITE_DATA_COMPRESSED;
+ /* Can we just write the entire extent as is? */
+ if (op->crc.uncompressed_size == op->crc.live_size &&
+ op->crc.compressed_size <= wp->sectors_free &&
+ op->crc.compression_type == op->compression_type) {
+ if (!op->crc.compression_type &&
+ op->csum_type != op->crc.csum_type &&
+ bch2_write_rechecksum(c, op, op->csum_type))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_DO_WRITE;
}
- if (op->flags & BCH_WRITE_DATA_COMPRESSED) {
- init_append_extent(op,
- crc_compressed_size(NULL, &op->crc),
- crc_uncompressed_size(NULL, &op->crc),
- op->crc.compression_type,
- op->crc.nonce,
- op->crc.csum,
- op->crc.csum_type,
- wp->ob);
-
- bio = orig;
- wbio = wbio_init(bio);
- more = 0;
- } else if (csum_type != BCH_CSUM_NONE ||
- compression_type != BCH_COMPRESSION_NONE) {
- /* all units here in bytes */
- unsigned total_output = 0, output_available =
- min(wp->sectors_free << 9, orig->bi_iter.bi_size);
- unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
- ? op->nonce : 0;
+ /*
+ * If the data is compressed and we couldn't write the entire extent as
+ * is, we have to decompress it:
+ */
+ if (op->crc.compression_type) {
struct bch_csum csum;
- struct nonce nonce;
- bio = bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(output_available, PAGE_SIZE),
- &c->bio_write);
- wbio = wbio_init(bio);
- wbio->bounce = true;
- wbio->put_bio = true;
- /* copy WRITE_SYNC flag */
- wbio->bio.bi_opf = orig->bi_opf;
+ if (bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
- /*
- * XXX: can't use mempool for more than
- * BCH_COMPRESSED_EXTENT_MAX worth of pages
- */
- bch2_bio_alloc_pages_pool(c, bio, output_available);
+ /* Last point we can still verify checksum: */
+ csum = bch2_checksum_bio(c, op->crc.csum_type,
+ extent_nonce(op->version, op->crc),
+ bio);
+ if (bch2_crc_cmp(op->crc.csum, csum))
+ return PREP_ENCODED_CHECKSUM_ERR;
- do {
- unsigned fragment_compression_type = compression_type;
- size_t dst_len, src_len;
+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+ return PREP_ENCODED_ERR;
+ }
- bch2_bio_compress(c, bio, &dst_len,
- orig, &src_len,
- &fragment_compression_type);
+ /*
+ * No longer have compressed data after this point - data might be
+ * encrypted:
+ */
- nonce = extent_nonce(op->version,
- crc_nonce,
- src_len >> 9,
- fragment_compression_type);
+ /*
+ * If the data is checksummed and we're only writing a subset,
+ * rechecksum and adjust bio to point to currently live data:
+ */
+ if ((op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) &&
+ bch2_write_rechecksum(c, op, op->csum_type))
+ return PREP_ENCODED_CHECKSUM_ERR;
- swap(bio->bi_iter.bi_size, dst_len);
- bch2_encrypt_bio(c, csum_type, nonce, bio);
+ /*
+ * If we want to compress the data, it has to be decrypted:
+ */
+ if ((op->compression_type ||
+ bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(op->csum_type)) &&
+ bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
- csum = bch2_checksum_bio(c, csum_type, nonce, bio);
- swap(bio->bi_iter.bi_size, dst_len);
+ return PREP_ENCODED_OK;
+}
- init_append_extent(op,
- dst_len >> 9, src_len >> 9,
- fragment_compression_type,
- crc_nonce, csum, csum_type, wp->ob);
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
+{
+ struct bch_fs *c = op->c;
+ struct bio *src = &op->wbio.bio, *dst = src;
+ struct bvec_iter saved_iter;
+ struct bkey_i *key_to_write;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+ unsigned total_output = 0;
+ bool bounce = false, page_alloc_failed = false;
+ int ret, more = 0;
- total_output += dst_len;
- bio_advance(bio, dst_len);
- bio_advance(orig, src_len);
- } while (bio->bi_iter.bi_size &&
- orig->bi_iter.bi_size &&
- !bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
+ BUG_ON(!bio_sectors(src));
- BUG_ON(total_output > output_available);
+ switch (bch2_write_prep_encoded_data(op, wp)) {
+ case PREP_ENCODED_OK:
+ break;
+ case PREP_ENCODED_ERR:
+ ret = -EIO;
+ goto err;
+ case PREP_ENCODED_CHECKSUM_ERR:
+ goto csum_err;
+ case PREP_ENCODED_DO_WRITE:
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
+ }
- memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
- bio->bi_iter.bi_size = total_output;
+ if (op->compression_type ||
+ (op->csum_type &&
+ !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ (bch2_csum_type_is_encryption(op->csum_type) &&
+ !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
+ bounce = true;
+ }
- /*
- * Free unneeded pages after compressing:
- */
- while (bio->bi_vcnt * PAGE_SIZE >
- round_up(bio->bi_iter.bi_size, PAGE_SIZE))
- mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
- &c->bio_bounce_pages);
+ saved_iter = dst->bi_iter;
- more = orig->bi_iter.bi_size != 0;
- } else {
- bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO,
- &c->bio_write);
- wbio = wbio_init(bio);
- wbio->put_bio = bio != orig;
+ do {
+ struct bch_extent_crc_unpacked crc =
+ (struct bch_extent_crc_unpacked) { 0 };
+ struct bversion version = op->version;
+ size_t dst_len, src_len;
+
+ if (page_alloc_failed &&
+ bio_sectors(dst) < wp->sectors_free &&
+ bio_sectors(dst) < c->sb.encoded_extent_max)
+ break;
- init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
- compression_type, 0,
- (struct bch_csum) { 0 }, csum_type, wp->ob);
+ BUG_ON(op->compression_type &&
+ (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type));
+ BUG_ON(op->compression_type && !bounce);
+
+ crc.compression_type = op->compression_type
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+ op->compression_type)
+ : 0;
+ if (!crc.compression_type) {
+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+ if (op->csum_type)
+ dst_len = min_t(unsigned, dst_len,
+ c->sb.encoded_extent_max << 9);
+
+ if (bounce) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bio_copy_data(dst, src);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
- more = bio != orig;
+ src_len = dst_len;
+ }
+
+ BUG_ON(!src_len || !dst_len);
+
+ if (bch2_csum_type_is_encryption(op->csum_type)) {
+ if (bversion_zero(version)) {
+ version.lo = atomic64_inc_return(&c->key_version) + 1;
+ } else {
+ crc.nonce = op->nonce;
+ op->nonce += src_len >> 9;
+ }
+ }
+
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ !crc.compression_type &&
+ bch2_csum_type_is_encryption(op->crc.csum_type) ==
+ bch2_csum_type_is_encryption(op->csum_type)) {
+ /*
+ * Note: when we're using rechecksum(), we need to be
+ * checksumming @src because it has all the data our
+ * existing checksum covers - if we bounced (because we
+ * were trying to compress), @dst will only have the
+ * part of the data the new checksum will cover.
+ *
+ * But normally we want to be checksumming post bounce,
+ * because part of the reason for bouncing is so the
+ * data can't be modified (by userspace) while it's in
+ * flight.
+ */
+ if (bch2_rechecksum_bio(c, src, version, op->crc,
+ &crc, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->csum_type))
+ goto csum_err;
+ } else {
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_rechecksum_bio(c, src, version, op->crc,
+ NULL, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->crc.csum_type))
+ goto csum_err;
+
+ crc.compressed_size = dst_len >> 9;
+ crc.uncompressed_size = src_len >> 9;
+ crc.live_size = src_len >> 9;
+
+ swap(dst->bi_iter.bi_size, dst_len);
+ bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ crc.csum = bch2_checksum_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ crc.csum_type = op->csum_type;
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ init_append_extent(op, wp, version, crc);
+
+ if (dst != src)
+ bio_advance(dst, dst_len);
+ bio_advance(src, src_len);
+ total_output += dst_len;
+ } while (dst->bi_iter.bi_size &&
+ src->bi_iter.bi_size &&
+ wp->sectors_free &&
+ !bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX));
+
+ more = src->bi_iter.bi_size != 0;
+
+ dst->bi_iter = saved_iter;
+
+ if (!bounce && more) {
+ dst = bio_split(src, total_output >> 9,
+ GFP_NOIO, &c->bio_write);
+ wbio_init(dst)->put_bio = true;
}
+ dst->bi_iter.bi_size = total_output;
+
+ /* Free unneeded pages after compressing: */
+ if (bounce)
+ while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
+ mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
+ &c->bio_bounce_pages);
+do_write:
/* might have done a realloc... */
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
@@ -487,30 +709,40 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
BCH_DATA_USER);
if (ret)
- return ret;
+ goto err;
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ dst->bi_end_io = bch2_write_endio;
+ dst->bi_private = &op->cl;
+ bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
- closure_get(bio->bi_private);
+ closure_get(dst->bi_private);
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+ bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
key_to_write);
return more;
+csum_err:
+ bch_err(c, "error verifying existing checksum while "
+ "rewriting existing data (memory corruption?)");
+ ret = -EIO;
+err:
+ if (bounce) {
+ bch2_bio_free_pages_pool(c, dst);
+ bio_put(dst);
+ }
+
+ return ret;
}
static void __bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
- unsigned open_bucket_nr = 0;
struct write_point *wp;
- struct open_bucket *ob;
int ret;
do {
- if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
+ if (op->open_buckets_nr + op->nr_replicas >
+ ARRAY_SIZE(op->open_buckets))
continue_at(cl, bch2_write_index, index_update_wq(op));
/* for the device pointers and 1 for the chksum */
@@ -520,11 +752,12 @@ static void __bch2_write(struct closure *cl)
BKEY_EXTENT_U64s_MAX))
continue_at(cl, bch2_write_index, index_update_wq(op));
- wp = bch2_alloc_sectors_start(c, BCH_DATA_USER,
+ wp = bch2_alloc_sectors_start(c,
op->devs,
op->write_point,
+ &op->devs_have,
op->nr_replicas,
- c->opts.data_replicas_required,
+ op->nr_replicas_required,
op->alloc_reserve,
op->flags,
(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
@@ -565,14 +798,13 @@ static void __bch2_write(struct closure *cl)
continue;
}
- ob = wp->ob;
-
- BUG_ON(ob - c->open_buckets == 0 ||
- ob - c->open_buckets > U8_MAX);
- op->open_buckets[open_bucket_nr++] = ob - c->open_buckets;
-
ret = bch2_write_extent(op, wp);
+ BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use >
+ ARRAY_SIZE(op->open_buckets));
+ bch2_open_bucket_get(c, wp,
+ &op->open_buckets_nr,
+ op->open_buckets);
bch2_alloc_sectors_done(c, wp);
if (ret < 0)
@@ -603,30 +835,6 @@ err:
: bch2_write_done, index_update_wq(op));
}
-void bch2_wake_delayed_writes(unsigned long data)
-{
- struct bch_fs *c = (void *) data;
- struct bch_write_op *op;
- unsigned long flags;
-
- spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
-
- while ((op = c->write_wait_head)) {
- if (time_after(op->expires, jiffies)) {
- mod_timer(&c->foreground_write_wakeup, op->expires);
- break;
- }
-
- c->write_wait_head = op->next;
- if (!c->write_wait_head)
- c->write_wait_tail = NULL;
-
- closure_put(&op->cl);
- }
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
-}
-
/**
* bch_write - handle a write to a cache device or flash only volume
*
@@ -646,9 +854,17 @@ void bch2_wake_delayed_writes(unsigned long data)
void bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
- u64 inode = op->pos.inode;
+
+ BUG_ON(!op->nr_replicas);
+ BUG_ON(!op->write_point.v);
+ BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+ BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
+
+ memset(&op->failed, 0, sizeof(op->failed));
+
+ bch2_keylist_init(&op->insert_keys, op->inline_keys);
+ wbio_init(&op->wbio.bio)->put_bio = false;
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
@@ -658,102 +874,11 @@ void bch2_write(struct closure *cl)
closure_return(cl);
}
- if (bversion_zero(op->version) &&
- bch2_csum_type_is_encryption(op->csum_type))
- op->version.lo =
- atomic64_inc_return(&c->key_version) + 1;
-
- bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
- /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */
-
- if ((op->flags & BCH_WRITE_THROTTLE) &&
- c->foreground_write_ratelimit_enabled &&
- c->foreground_write_pd.rate.rate < (1 << 30)) {
- unsigned long flags;
- u64 delay;
-
- spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
- bch2_ratelimit_increment(&c->foreground_write_pd.rate,
- bio->bi_iter.bi_size);
-
- delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate);
-
- if (delay >= HZ / 100) {
- trace_write_throttle(c, inode, bio, delay);
-
- closure_get(&op->cl); /* list takes a ref */
-
- op->expires = jiffies + delay;
- op->next = NULL;
-
- if (c->write_wait_tail)
- c->write_wait_tail->next = op;
- else
- c->write_wait_head = op;
- c->write_wait_tail = op;
-
- if (!timer_pending(&c->foreground_write_wakeup))
- mod_timer(&c->foreground_write_wakeup,
- op->expires);
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock,
- flags);
- continue_at(cl, __bch2_write, index_update_wq(op));
- }
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
- }
+ bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
continue_at_nobarrier(cl, __bch2_write, NULL);
}
-void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct disk_reservation res,
- struct bch_devs_mask *devs,
- unsigned long write_point,
- struct bpos pos,
- u64 *journal_seq, unsigned flags)
-{
- EBUG_ON(res.sectors && !res.nr_replicas);
-
- op->c = c;
- op->io_wq = index_update_wq(op);
- op->written = 0;
- op->error = 0;
- op->flags = flags;
- op->csum_type = bch2_data_checksum_type(c);
- op->compression_type =
- bch2_compression_opt_to_type(c->opts.compression);
- op->nr_replicas = res.nr_replicas;
- op->alloc_reserve = RESERVE_NONE;
- op->nonce = 0;
- op->pos = pos;
- op->version = ZERO_VERSION;
- op->res = res;
- op->devs = devs;
- op->write_point = write_point;
-
- if (journal_seq) {
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
- } else {
- op->journal_seq = 0;
- }
-
- op->index_update_fn = bch2_write_index_default;
-
- memset(op->open_buckets, 0, sizeof(op->open_buckets));
- memset(&op->failed, 0, sizeof(op->failed));
-
- bch2_keylist_init(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys));
-
- if (version_stress_test(c))
- get_random_bytes(&op->version, sizeof(op->version));
-}
-
/* Cache promotion on read */
struct promote_op {
@@ -787,11 +912,20 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
trace_promote(&rbio->bio);
/* we now own pages: */
+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * bio->bi_vcnt);
rbio->promote = NULL;
+ __bch2_write_op_init(&op->write.op, c);
+
+ op->write.move_dev = -1;
+ op->write.op.devs = c->fastest_devs;
+ op->write.op.write_point = writepoint_hashed((unsigned long) current);
+ op->write.op.flags |= BCH_WRITE_ALLOC_NOWAIT;
+ op->write.op.flags |= BCH_WRITE_CACHED;
+
+ bch2_migrate_write_init(&op->write, rbio);
+
closure_init(cl, NULL);
closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
closure_return_with_destructor(cl, promote_done);
@@ -801,57 +935,27 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
* XXX: multiple promotes can race with each other, wastefully. Keep a list of
* outstanding promotes?
*/
-static struct promote_op *promote_alloc(struct bch_fs *c,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_pick_ptr *pick,
- bool read_full)
+static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
{
struct promote_op *op;
struct bio *bio;
- /*
- * biovec needs to be big enough to hold decompressed data, if
- * bch2_write_extent() has to decompress/recompress it:
- */
- unsigned sectors = max_t(unsigned, k.k->size,
- crc_uncompressed_size(NULL, &pick->crc));
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ /* data might have to be decompressed in the write path: */
+ unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
+ PAGE_SECTORS);
- op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ BUG_ON(!rbio->bounce);
+ BUG_ON(pages < rbio->bio.bi_vcnt);
+
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages,
+ GFP_NOIO);
if (!op)
return NULL;
bio = &op->write.op.wbio.bio;
bio_init(bio, bio->bi_inline_vecs, pages);
- bio->bi_iter = iter;
-
- if (pick->crc.compression_type) {
- op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
- op->write.op.crc = pick->crc;
- op->write.op.size = k.k->size;
- } else if (read_full) {
- /*
- * Adjust bio to correspond to _live_ portion of @k -
- * which might be less than what we're actually reading:
- */
- bio->bi_iter.bi_size = sectors << 9;
- bio_advance(bio, pick->crc.offset << 9);
- BUG_ON(bio_sectors(bio) < k.k->size);
- bio->bi_iter.bi_size = k.k->size << 9;
- } else {
- /*
- * Set insert pos to correspond to what we're actually
- * reading:
- */
- op->write.op.pos.offset = iter.bi_sector;
- }
- bch2_migrate_write_init(c, &op->write,
- c->fastest_devs,
- k, NULL,
- BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_CACHED);
- op->write.promote = true;
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
return op;
}
@@ -863,9 +967,6 @@ static bool should_promote(struct bch_fs *c,
if (!(flags & BCH_READ_MAY_PROMOTE))
return false;
- if (flags & BCH_READ_IN_RETRY)
- return false;
-
if (percpu_ref_is_dying(&c->writes))
return false;
@@ -875,10 +976,20 @@ static bool should_promote(struct bch_fs *c,
/* Read */
+static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *,
+ struct bvec_iter, u64,
+ struct bch_devs_mask *, unsigned);
+
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
+enum rbio_context {
+ RBIO_CONTEXT_NULL,
+ RBIO_CONTEXT_HIGHPRI,
+ RBIO_CONTEXT_UNBOUND,
+};
+
static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
@@ -887,14 +998,14 @@ bch2_rbio_parent(struct bch_read_bio *rbio)
__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+ enum rbio_context context,
struct workqueue_struct *wq)
{
-
- if (!wq || rbio->process_context) {
+ if (context <= rbio->context) {
fn(&rbio->work);
} else {
rbio->work.func = fn;
- rbio->process_context = true;
+ rbio->context = context;
queue_work(wq, &rbio->work);
}
}
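The rbio_context enum and bch2_rbio_punt() encode a simple rule: contexts are ordered from most to least restrictive, work runs inline if the current context is already permissive enough, and otherwise it is punted to a workqueue, recording the new context so it never needs punting again. A small self-contained sketch of that shape (struct work and queue_to_worker() here are illustrative stand-ins, not kernel API):

	#include <stdio.h>

	enum exec_context {
		CTX_NONE,	/* completion/interrupt-like: cannot block */
		CTX_HIGHPRI,	/* short, bounded amounts of work */
		CTX_UNBOUND,	/* may block for arbitrarily long */
	};

	struct work {
		void			(*fn)(struct work *);
		enum exec_context	context;	/* context we are currently in */
	};

	/* Stand-in for queue_work(): a real version hands off to a worker thread. */
	static void queue_to_worker(struct work *w)
	{
		printf("punted to context %d\n", w->context);
		w->fn(w);	/* the worker would run this later */
	}

	/* Run fn() now if we are already in a permissive-enough context,
	 * otherwise record the requirement and punt upward - mirroring the
	 * "if (context <= rbio->context) fn(); else queue" logic above. */
	static void punt(struct work *w, void (*fn)(struct work *),
			 enum exec_context needed)
	{
		if (needed <= w->context) {
			fn(w);
		} else {
			w->fn = fn;
			w->context = needed;
			queue_to_worker(w);
		}
	}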
@@ -932,7 +1043,7 @@ static void bch2_rbio_retry(struct work_struct *work)
struct bch_fs *c = rbio->c;
struct bvec_iter iter = rbio->bvec_iter;
unsigned flags = rbio->flags;
- u64 inode = rbio->inode;
+ u64 inode = rbio->pos.inode;
struct bch_devs_mask avoid;
trace_read_retry(&rbio->bio);
@@ -942,15 +1053,24 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->retry == READ_RETRY_AVOID)
__set_bit(rbio->pick.ca->dev_idx, avoid.d);
+ if (rbio->promote)
+ kfree(rbio->promote);
+ rbio->promote = NULL;
+
if (rbio->split)
rbio = bch2_rbio_free(rbio);
else
rbio->bio.bi_error = 0;
- flags |= BCH_READ_MUST_CLONE;
+ if (!(flags & BCH_READ_NODECODE))
+ flags |= BCH_READ_MUST_CLONE;
flags |= BCH_READ_IN_RETRY;
+ flags &= ~BCH_READ_MAY_PROMOTE;
- __bch2_read(c, rbio, iter, inode, &avoid, flags);
+ if (flags & BCH_READ_NODECODE)
+ bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags);
+ else
+ __bch2_read(c, rbio, iter, inode, &avoid, flags);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
@@ -964,108 +1084,175 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
bch2_rbio_parent(rbio)->bio.bi_error = error;
bch2_rbio_done(rbio);
} else {
- bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq);
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ }
+}
+
+static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = rbio->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_extent *e;
+ BKEY_PADDED(k) new;
+ struct bch_extent_crc_unpacked new_crc;
+ unsigned offset;
+ int ret;
+
+ if (rbio->pick.crc.compression_type)
+ return;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
+ BTREE_ITER_INTENT);
+retry:
+ k = bch2_btree_iter_peek(&iter);
+ if (IS_ERR_OR_NULL(k.k))
+ goto out;
+
+ if (!bkey_extent_is_data(k.k))
+ goto out;
+
+ bkey_reassemble(&new.k, k);
+ e = bkey_i_to_extent(&new.k);
+
+ if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset) ||
+ bversion_cmp(e->k.version, rbio->version))
+ goto out;
+
+ /* Extent was merged? */
+ if (bkey_start_offset(&e->k) < rbio->pos.offset ||
+ e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
+ goto out;
+
+ /* The extent might have been partially overwritten since we read it: */
+ offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
+
+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+ rbio->pick.crc, NULL, &new_crc,
+ offset, e->k.size,
+ rbio->pick.crc.csum_type)) {
+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+ goto out;
}
+
+ if (!bch2_extent_narrow_crcs(e, new_crc))
+ goto out;
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOWAIT,
+ BTREE_INSERT_ENTRY(&iter, &e->k_i));
+ if (ret == -EINTR)
+ goto retry;
+out:
+ bch2_btree_iter_unlock(&iter);
+}
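bch2_rbio_narrow_crcs() is an optimistic read-modify-write against the btree: re-look up the extent, check that it still matches the data that was just read and verified, build the narrowed key, and try to commit it, restarting from the top when the insert returns -EINTR because something changed underneath. The same loop shape reduced to a single-word record with C11 atomics (the tag check is an assumption standing in for the extent/version match):

	#include <stdatomic.h>
	#include <stdint.h>

	/*
	 * Verify the record still describes the data we worked from, then
	 * publish the improved version; a failed compare-and-swap reloads
	 * 'old', so the verification is redone on every retry (the -EINTR
	 * path in the hunk above).
	 */
	static void narrow_record(_Atomic uint64_t *record, uint32_t expected_tag,
				  uint64_t (*improve)(uint64_t))
	{
		uint64_t old = atomic_load(record);

		do {
			if ((uint32_t) (old >> 32) != expected_tag)
				return;	/* record changed since we read the data */
		} while (!atomic_compare_exchange_weak(record, &old, improve(old)));
	}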
+
+static bool should_narrow_crcs(struct bkey_s_c_extent e,
+ struct extent_pick_ptr *pick,
+ unsigned flags)
+{
+ return !(flags & BCH_READ_IN_RETRY) &&
+ bch2_can_narrow_extent_crcs(e, pick->crc);
}
-static int bch2_rbio_checksum_uncompress(struct bio *dst,
- struct bch_read_bio *rbio)
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio;
+ struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
- struct nonce nonce = extent_nonce(rbio->version,
- rbio->pick.crc.nonce,
- crc_uncompressed_size(NULL, &rbio->pick.crc),
- rbio->pick.crc.compression_type);
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
struct bch_csum csum;
- int ret = 0;
- /*
- * reset iterator for checksumming and copying bounced data: here we've
- * set rbio->compressed_size to the amount of data we actually read,
- * which was not necessarily the full extent if we were only bouncing
- * in order to promote
- */
+ /* Reset iterator for checksumming and copying bounced data: */
if (rbio->bounce) {
- src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->pick.crc) << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
+ src->bi_iter.bi_size = crc.compressed_size << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
} else {
- src->bi_iter = rbio->bvec_iter;
+ src->bi_iter = rbio->bvec_iter;
}
- csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src);
- if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum),
- rbio->pick.ca,
- "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
- rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9,
- rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo,
- rbio->pick.crc.csum_type))
- ret = -EIO;
+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+ goto csum_err;
- /*
- * If there was a checksum error, still copy the data back - unless it
- * was compressed, we don't want to decompress bad data:
- */
- if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) {
- if (!ret) {
- bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
- ret = bch2_bio_uncompress(c, src, dst,
- dst_iter, rbio->pick.crc);
- if (ret)
- __bcache_io_error(c, "decompression error");
- }
- } else if (rbio->bounce) {
- bio_advance(src, rbio->pick.crc.offset << 9);
-
- /* don't need to decrypt the entire bio: */
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
+ if (unlikely(rbio->narrow_crcs))
+ bch2_rbio_narrow_crcs(rbio);
- nonce = nonce_add(nonce, rbio->pick.crc.offset << 9);
+ if (rbio->flags & BCH_READ_NODECODE)
+ goto nodecode;
- bch2_encrypt_bio(c, rbio->pick.crc.csum_type,
- nonce, src);
+ /* Adjust crc to point to subset of data we want: */
+ crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset;
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
- bio_copy_data_iter(dst, &dst_iter,
- src, &src->bi_iter);
+ if (crc.compression_type != BCH_COMPRESSION_NONE) {
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
+ goto decompression_err;
} else {
- bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
- }
+ /* don't need to decrypt the entire bio: */
+ nonce = nonce_add(nonce, crc.offset << 9);
+ bio_advance(src, crc.offset << 9);
- return ret;
-}
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- int ret;
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio);
- if (ret) {
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, ret);
- } else {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, ret);
+ if (rbio->bounce) {
+ struct bvec_iter src_iter = src->bi_iter;
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
- return;
}
- if (rbio->promote)
+ if (rbio->promote) {
+ /*
+ * Re encrypt data we decrypted, so it's consistent with
+ * rbio->crc:
+ */
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
promote_start(rbio->promote, rbio);
-
+ }
+nodecode:
if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
bch2_rbio_done(rbio);
+ return;
+csum_err:
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+ rbio->flags |= BCH_READ_MUST_BOUNCE;
+ bch2_rbio_error(rbio, READ_RETRY, -EIO);
+ return;
+ }
+
+ bch2_dev_io_error(rbio->pick.ca,
+ "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
+ rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+ csum.hi, csum.lo, crc.csum_type);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
+ return;
+decompression_err:
+ __bcache_io_error(c, "decompression error, inode %llu offset %llu",
+ rbio->pos.inode,
+ (u64) rbio->bvec_iter.bi_sector);
+ bch2_rbio_error(rbio, READ_ERR, -EIO);
+ return;
}
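The csum_err path above distinguishes two very different causes of a mismatch: when the read went straight into user-mapped pages, userspace may simply have modified the buffer while the IO was in flight, so the read is retried through a bounce buffer; only a mismatch on a bounced (stable) copy is blamed on the device and steered away from it on the next attempt. That decision, restated as a tiny self-contained helper (names are illustrative, not bcachefs API):

	enum read_verdict {
		VERDICT_OK,
		VERDICT_RETRY_BOUNCED,	/* re-read through a private buffer first */
		VERDICT_DEVICE_ERROR,	/* stable copy still mismatched */
	};

	static enum read_verdict classify_csum_result(int csum_ok, int was_bounced,
						      int user_mapped)
	{
		if (csum_ok)
			return VERDICT_OK;

		/* Unbounced + user-mapped: data may have been scribbled on mid-read */
		if (!was_bounced && user_mapped)
			return VERDICT_RETRY_BOUNCED;

		return VERDICT_DEVICE_ERROR;
	}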
static void bch2_read_endio(struct bio *bio)
@@ -1074,6 +1261,9 @@ static void bch2_read_endio(struct bio *bio)
container_of(bio, struct bch_read_bio, bio);
struct bch_fs *c = rbio->c;
struct workqueue_struct *wq = NULL;
+ enum rbio_context context = RBIO_CONTEXT_NULL;
+
+ bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ);
percpu_ref_put(&rbio->pick.ca->io_ref);
@@ -1097,38 +1287,45 @@ static void bch2_read_endio(struct bio *bio)
return;
}
- if (rbio->pick.crc.compression_type ||
+ if (rbio->narrow_crcs ||
+ rbio->pick.crc.compression_type ||
bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- wq = system_unbound_wq;
+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
else if (rbio->pick.crc.csum_type)
- wq = system_highpri_wq;
+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
- bch2_rbio_punt(rbio, __bch2_read_endio, wq);
+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c k,
+ struct bvec_iter iter, struct bkey_s_c_extent e,
struct extent_pick_ptr *pick, unsigned flags)
{
struct bch_read_bio *rbio;
- struct promote_op *promote_op = NULL;
- unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
- bool bounce = false, split, read_full = false;
+ bool split = false, bounce = false, read_full = false;
+ bool promote = false, narrow_crcs = false;
+ struct bpos pos = bkey_start_pos(e.k);
int ret = 0;
- bch2_increment_clock(c, bio_sectors(&orig->bio), READ);
PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand;
- EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
- k.k->p.offset < bvec_iter_end_sector(iter));
+ narrow_crcs = should_narrow_crcs(e, pick, flags);
+
+ if (flags & BCH_READ_NODECODE) {
+ BUG_ON(iter.bi_size < pick->crc.compressed_size << 9);
+ iter.bi_size = pick->crc.compressed_size << 9;
+ goto noclone;
+ }
+
+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+ flags |= BCH_READ_MUST_BOUNCE;
+
+ EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector ||
+ e.k->p.offset < bvec_iter_end_sector(iter));
- /*
- * note: if compression_type and crc_type both == none, then
- * compressed/uncompressed size is zero
- */
if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
(pick->crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
+ (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick->crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
(flags & BCH_READ_MUST_BOUNCE)))) {
@@ -1136,17 +1333,30 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
bounce = true;
}
- if (should_promote(c, pick, flags))
- promote_op = promote_alloc(c, iter, k, pick, read_full);
-
+ promote = should_promote(c, pick, flags);
/* could also set read_full */
- if (promote_op)
+ if (promote)
bounce = true;
+ if (!read_full) {
+ EBUG_ON(pick->crc.compression_type);
+ EBUG_ON(pick->crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick->crc.live_size ||
+ pick->crc.offset ||
+ iter.bi_sector != pos.offset));
+
+ pick->ptr.offset += pick->crc.offset +
+ (iter.bi_sector - pos.offset);
+ pick->crc.compressed_size = bvec_iter_sectors(iter);
+ pick->crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick->crc.offset = 0;
+ pick->crc.live_size = bvec_iter_sectors(iter);
+ pos.offset = iter.bi_sector;
+ }
+
if (bounce) {
- unsigned sectors = read_full
- ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
- : bvec_iter_sectors(iter);
+ unsigned sectors = pick->crc.compressed_size;
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
@@ -1163,41 +1373,38 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = rbio_init(bio_clone_fast(&orig->bio,
- GFP_NOIO, &c->bio_read_split));
+ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
+ &c->bio_read_split));
rbio->bio.bi_iter = iter;
split = true;
} else {
+noclone:
rbio = orig;
rbio->bio.bi_iter = iter;
split = false;
BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- rbio->c = c;
+ BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size);
+ rbio->c = c;
if (split)
rbio->parent = orig;
else
rbio->end_io = orig->bio.bi_end_io;
-
rbio->bvec_iter = iter;
+ rbio->submit_time_us = local_clock_us();
rbio->flags = flags;
rbio->bounce = bounce;
rbio->split = split;
- rbio->process_context = false;
+ rbio->narrow_crcs = narrow_crcs;
rbio->retry = 0;
+ rbio->context = 0;
+ rbio->devs_have = bch2_extent_devs(e);
rbio->pick = *pick;
- /*
- * crc.compressed_size will be 0 if there wasn't any checksum
- * information, also we need to stash the original size of the bio if we
- * bounced (which isn't necessarily the original key size, if we bounced
- * only for promoting)
- */
- rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1;
- rbio->version = k.k->version;
- rbio->promote = promote_op;
- rbio->inode = k.k->p.inode;
+ rbio->pos = pos;
+ rbio->version = e.k->version;
+ rbio->promote = promote ? promote_alloc(rbio) : NULL;
INIT_WORK(&rbio->work, NULL);
rbio->bio.bi_bdev = pick->ca->disk_sb.bdev;
@@ -1205,16 +1412,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
- if (read_full)
- rbio->pick.crc.offset += skip;
- else
- rbio->bio.bi_iter.bi_sector += skip;
-
- rbio->submit_time_us = local_clock_us();
-
if (bounce)
trace_read_bounce(&rbio->bio);
+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
@@ -1223,7 +1424,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
} else {
submit_bio_wait(&rbio->bio);
- rbio->process_context = true;
+ rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
@@ -1234,6 +1435,79 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
return ret;
}
+static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
+{
+ struct extent_pick_ptr pick;
+ struct btree_iter iter;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector),
+ BTREE_ITER_WITH_HOLES);
+retry:
+ k = bch2_btree_iter_peek_with_holes(&iter);
+ if (btree_iter_err(k)) {
+ bch2_btree_iter_unlock(&iter);
+ goto err;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ if (!bkey_extent_is_data(k.k) ||
+ !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset) ||
+ bkey_start_offset(k.k) != bvec_iter.bi_sector)
+ goto err;
+
+ bch2_extent_pick_ptr(c, k, avoid, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, &rbio->bio, "no device to read from");
+ bio_endio(&rbio->bio);
+ return;
+ }
+
+ if (!pick.ca)
+ goto err;
+
+ if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) {
+ percpu_ref_put(&pick.ca->io_ref);
+ goto err;
+
+ }
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k),
+ &pick, flags);
+ switch (ret) {
+ case READ_RETRY_AVOID:
+ __set_bit(pick.ca->dev_idx, avoid->d);
+ case READ_RETRY:
+ goto retry;
+ case READ_ERR:
+ bio_endio(&rbio->bio);
+ return;
+ };
+
+ return;
+err:
+ /*
+ * extent we wanted to read no longer exists, or
+ * was merged or partially overwritten (and thus
+ * possibly bigger than the memory that was
+ * originally allocated)
+ */
+ rbio->bio.bi_error = -EINTR;
+ bio_endio(&rbio->bio);
+ return;
+}
+
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, u64 inode,
struct bch_devs_mask *avoid, unsigned flags)
@@ -1241,6 +1515,8 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct btree_iter iter;
struct bkey_s_c k;
int ret;
+
+ EBUG_ON(flags & BCH_READ_NODECODE);
retry:
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
@@ -1277,7 +1553,8 @@ retry:
}
ret = __bch2_read_extent(c, rbio, fragment,
- k, &pick, flags);
+ bkey_s_c_to_extent(k),
+ &pick, flags);
switch (ret) {
case READ_RETRY_AVOID:
__set_bit(pick.ca->dev_idx, avoid->d);