Diffstat (limited to 'libbcachefs/io.c')
-rw-r--r--	libbcachefs/io.c	832
1 file changed, 557 insertions(+), 275 deletions(-)
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 27e45081..bb656522 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -14,6 +14,7 @@
#include "compress.h"
#include "clock.h"
#include "debug.h"
+#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@@ -30,14 +31,71 @@
#include <trace/events/bcachefs.h>
-/* Allocate, free from mempool: */
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ const struct bch_devs_mask *devs;
+ unsigned d, nr = 0, total = 0;
+ u64 now = local_clock(), last;
+ s64 congested;
+ struct bch_dev *ca;
+
+ if (!target)
+ return false;
+
+ rcu_read_lock();
+ devs = bch2_target_to_mask(c, target);
+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+ ca = rcu_dereference(c->devs[d]);
+ if (!ca)
+ continue;
+
+ congested = atomic_read(&ca->congested);
+ last = READ_ONCE(ca->congested_last);
+ if (time_after64(now, last))
+ congested -= (now - last) >> 12;
+
+ total += max(congested, 0LL);
+ nr++;
+ }
+ rcu_read_unlock();
-void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw)
+ return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+ u64 now, int rw)
+{
+ u64 latency_capable =
+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+ /* ideally we'd be taking into account the device's variance here: */
+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+ s64 latency_over = io_latency - latency_threshold;
+
+ if (latency_threshold && latency_over > 0) {
+ /*
+ * bump up congested by approximately latency_over * 4 /
+ * latency_threshold - we don't need much accuracy here so don't
+ * bother with the divide:
+ */
+ if (atomic_read(&ca->congested) < CONGESTED_MAX)
+ atomic_add(latency_over >>
+ max_t(int, ilog2(latency_threshold) - 2, 0),
+ &ca->congested);
+
+ ca->congested_last = now;
+ } else if (atomic_read(&ca->congested) > 0) {
+ atomic_dec(&ca->congested);
+ }
+}
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
+ atomic64_t *latency = &ca->cur_latency[rw];
u64 now = local_clock();
- unsigned io_latency = (now >> 10) - submit_time_us;
- atomic_t *latency = &ca->latency[rw];
- unsigned old, new, v = atomic_read(latency);
+ u64 io_latency = time_after64(now, submit_time)
+ ? now - submit_time
+ : 0;
+ u64 old, new, v = atomic64_read(latency);
do {
old = v;
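
Note: bch2_target_congested() above throttles probabilistically: each device's
congestion counter decays by one unit per 4096 ns since its last bump (the
">> 12"), and the caller is told "congested" with probability
total / (nr * CONGESTED_MAX). A minimal userspace sketch of that arithmetic,
with CONGESTED_MAX, the RNG, and congestion_bump() as illustrative stand-ins
rather than the kernel's definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define CONGESTED_MAX	1024	/* assumed value, for illustration only */

/* stand-in for bch2_rand_range(): uniform value in [0, max) */
static uint64_t rand_range(uint64_t max)
{
	return max ? (uint64_t) rand() % max : 0;
}

/* one device's decayed counter: loses 1 unit per 4096 ns since last bump */
static int64_t decayed_congestion(int64_t congested, uint64_t now, uint64_t last)
{
	if (now > last)
		congested -= (now - last) >> 12;
	return congested > 0 ? congested : 0;
}

/* reads are treated as congested with probability total / (nr * CONGESTED_MAX) */
static bool target_congested(const int64_t *congested, const uint64_t *last,
			     unsigned nr, uint64_t now)
{
	uint64_t total = 0;
	unsigned d;

	for (d = 0; d < nr; d++)
		total += decayed_congestion(congested[d], now, last[d]);

	return rand_range((uint64_t) nr * CONGESTED_MAX) < total;
}

/*
 * bch2_congested_acct()'s bump approximates latency_over * 4 / threshold
 * with a shift: 2^ilog2(threshold) is within 2x of threshold, so the result
 * overshoots the exact quotient by at most a factor of two, which is plenty
 * for a heuristic. threshold must be nonzero; __builtin_clzll is a
 * gcc/clang builtin standing in for ilog2().
 */
static int64_t congestion_bump(int64_t latency_over, uint64_t threshold)
{
	int shift = 63 - __builtin_clzll(threshold) - 2;	/* ilog2 - 2 */

	return latency_over >> (shift > 0 ? shift : 0);
}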
@@ -51,10 +109,16 @@ void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw)
now & ~(~0 << 5))
break;
- new = ewma_add((u64) old, io_latency, 6);
- } while ((v = atomic_cmpxchg(latency, old, new)) != old);
+ new = ewma_add(old, io_latency, 5);
+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+ bch2_congested_acct(ca, io_latency, now, rw);
+
+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
}
+/* Allocate, free from mempool: */
+
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
struct bio_vec *bv;
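
Note: bch2_latency_acct() above now keeps per-device latency as a 64-bit EWMA
with weight 5 (each sample weighted 1/32), updated locklessly via
atomic64_cmpxchg(). A minimal sketch, assuming the usual shift-based
formulation of ewma_add(); the real macro lives in the bcachefs util headers:

#include <assert.h>
#include <stdint.h>

/* assumed formulation: new = (old * (2^weight - 1) + sample) / 2^weight */
static uint64_t ewma_add(uint64_t ewma, uint64_t val, unsigned weight)
{
	return ((ewma << weight) - ewma + val) >> weight;
}

int main(void)
{
	/* weight 5: each sample contributes 1/32; a spike decays ~3%/sample */
	assert(ewma_add(1000, 0, 5) == 968);
	assert(ewma_add(0, 3200, 5) == 100);
	return 0;
}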
@@ -169,22 +233,21 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
n->c = c;
- n->ca = ca;
- n->submit_time_us = local_clock_us();
+ n->dev = ptr->dev;
+ n->have_ioref = bch2_dev_get_ioref(ca, WRITE);
+ n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
if (!journal_flushes_device(ca))
n->bio.bi_opf |= REQ_FUA;
- if (likely(percpu_ref_tryget(&ca->io_ref))) {
+ if (likely(n->have_ioref)) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));
- n->have_io_ref = true;
bio_set_dev(&n->bio, ca->disk_sb.bdev);
submit_bio(&n->bio);
} else {
- n->have_io_ref = false;
n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
@@ -196,15 +259,18 @@ static void __bch2_write(struct closure *);
static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_fs *c = op->c;
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
- op->error = bch2_journal_error(&op->c->journal);
+ op->error = bch2_journal_error(&c->journal);
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
- bch2_disk_reservation_put(op->c, &op->res);
- percpu_ref_put(&op->c->writes);
+ bch2_disk_reservation_put(c, &op->res);
+ percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
+ bch2_time_stats_update(&c->data_write_time, op->start_time);
+
closure_return(cl);
}
@@ -318,15 +384,15 @@ static void bch2_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->ca;
-
- bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
- set_bit(ca->dev_idx, op->failed.d);
+ set_bit(wbio->dev, op->failed.d);
- if (wbio->have_io_ref)
+ if (wbio->have_ioref) {
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
percpu_ref_put(&ca->io_ref);
+ }
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -821,6 +887,8 @@ void bch2_write(struct closure *cl)
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
+ op->start_time = local_clock();
+
memset(&op->failed, 0, sizeof(op->failed));
bch2_keylist_init(&op->insert_keys, op->inline_keys);
@@ -844,19 +912,72 @@ void bch2_write(struct closure *cl)
struct promote_op {
struct closure cl;
+ u64 start_time;
+
+ struct rhash_head hash;
+ struct bpos pos;
+
struct migrate_write write;
struct bio_vec bi_inline_vecs[0]; /* must be last */
};
+static const struct rhashtable_params bch_promote_params = {
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+};
+
+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
+ struct bpos pos,
+ struct bch_io_opts opts,
+ unsigned flags)
+{
+ if (!opts.promote_target)
+ return false;
+
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return false;
+
+ if (percpu_ref_is_dying(&c->writes))
+ return false;
+
+ if (!bkey_extent_is_data(k.k))
+ return false;
+
+ if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
+ return false;
+
+ if (bch2_target_congested(c, opts.promote_target))
+ return false;
+
+ if (rhashtable_lookup_fast(&c->promote_table, &pos,
+ bch_promote_params))
+ return false;
+
+ return true;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+ int ret;
+
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ percpu_ref_put(&c->writes);
+ kfree(op);
+}
+
static void promote_done(struct closure *cl)
{
struct promote_op *op =
container_of(cl, struct promote_op, cl);
struct bch_fs *c = op->write.op.c;
- percpu_ref_put(&c->writes);
+ bch2_time_stats_update(&c->data_promote_time, op->start_time);
+
bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
- kfree(op);
+ promote_free(c, op);
}
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
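
Note: the promote_table rhashtable, keyed on struct bpos, is what resolves the
old "multiple promotes can race with each other" XXX: should_promote() does an
unlocked lookup, and __promote_alloc() then inserts atomically, so of two
racing readers exactly one promote wins. A condensed sketch of that protocol
(kernel context; promote_dedup() is an illustrative helper, not a function in
this file):

static struct promote_op *promote_dedup(struct bch_fs *c, struct bpos pos)
{
	struct promote_op *op = kzalloc(sizeof(*op), GFP_NOIO);

	if (!op)
		return NULL;

	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		/* lost the race: a promote for this pos is already in flight */
		kfree(op);
		return NULL;
	}

	/* winner: the entry is removed again in promote_free() */
	return op;
}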
@@ -865,17 +986,15 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
- BUG_ON(!rbio->split || !rbio->bounce);
-
- if (!percpu_ref_tryget(&c->writes))
- return;
-
trace_promote(&rbio->bio);
/* we now own pages: */
+ BUG_ON(!rbio->bounce);
BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- rbio->promote = NULL;
bch2_migrate_read_done(&op->write, rbio);
@@ -884,69 +1003,120 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
closure_return_with_destructor(cl, promote_done);
}
-/*
- * XXX: multiple promotes can race with each other, wastefully. Keep a list of
- * outstanding promotes?
- */
-static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
- struct bkey_s_c k)
+noinline
+static struct promote_op *__promote_alloc(struct bch_fs *c,
+ struct bpos pos,
+ struct extent_pick_ptr *pick,
+ struct bch_io_opts opts,
+ unsigned rbio_sectors,
+ struct bch_read_bio **rbio)
{
- struct bch_fs *c = rbio->c;
- struct promote_op *op;
+ struct promote_op *op = NULL;
struct bio *bio;
+ unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
/* data might have to be decompressed in the write path: */
- unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
- PAGE_SECTORS);
+ unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
+ PAGE_SECTORS);
int ret;
- BUG_ON(!rbio->bounce);
- BUG_ON(pages < rbio->bio.bi_vcnt);
+ if (!percpu_ref_tryget(&c->writes))
+ return NULL;
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages,
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
GFP_NOIO);
if (!op)
- return NULL;
+ goto err;
- bio = &op->write.op.wbio.bio;
- bio_init(bio, bio->bi_inline_vecs, pages);
+ op->start_time = local_clock();
+ op->pos = pos;
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+ /*
+ * promotes require bouncing, but if the extent isn't
+ * checksummed/compressed it might be too big for the mempool:
+ */
+ if (rbio_sectors > c->sb.encoded_extent_max) {
+ *rbio = kzalloc(sizeof(struct bch_read_bio) +
+ sizeof(struct bio_vec) * rbio_pages,
+ GFP_NOIO);
+ if (!*rbio)
+ goto err;
+
+ rbio_init(&(*rbio)->bio, opts);
+ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
+ rbio_pages);
+
+ (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
+ bch2_bio_map(&(*rbio)->bio, NULL);
+
+ if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
+ goto err;
+
+ (*rbio)->bounce = true;
+ (*rbio)->split = true;
+ (*rbio)->kmalloc = true;
+ }
+
+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+ bch_promote_params))
+ goto err;
+
+ bio = &op->write.op.wbio.bio;
+ bio_init(bio, bio->bi_inline_vecs, wbio_pages);
ret = bch2_migrate_write_init(c, &op->write,
writepoint_hashed((unsigned long) current),
- rbio->opts,
+ opts,
DATA_PROMOTE,
(struct data_opts) {
- .target = rbio->opts.promote_target
+ .target = opts.promote_target
},
- k);
+ bkey_s_c_null);
BUG_ON(ret);
return op;
+err:
+ if (*rbio)
+ bio_free_pages(&(*rbio)->bio);
+ kfree(*rbio);
+ *rbio = NULL;
+ kfree(op);
+ percpu_ref_put(&c->writes);
+ return NULL;
}
-static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
- unsigned flags, u16 target)
+static inline struct promote_op *promote_alloc(struct bch_fs *c,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_pick_ptr *pick,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_read_bio **rbio,
+ bool *bounce,
+ bool *read_full)
{
- if (!target)
- return false;
-
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return false;
+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ unsigned sectors = promote_full
+ ? pick->crc.compressed_size
+ : bvec_iter_sectors(iter);
+ struct bpos pos = promote_full
+ ? bkey_start_pos(k.k)
+ : POS(k.k->p.inode, iter.bi_sector);
+ struct promote_op *promote;
+
+ if (!should_promote(c, k, pos, opts, flags))
+ return NULL;
- if (percpu_ref_is_dying(&c->writes))
- return false;
+ promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
+ if (!promote)
+ return NULL;
- return bch2_extent_has_target(c, e, target) == NULL;
+ *bounce = true;
+ *read_full = promote_full;
+ return promote;
}
/* Read */
-static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *,
- struct bvec_iter, u64,
- struct bch_devs_mask *, unsigned);
-
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
@@ -979,38 +1149,144 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
- struct bch_read_bio *parent = rbio->parent;
-
- BUG_ON(!rbio->split);
+ BUG_ON(rbio->bounce && !rbio->split);
if (rbio->promote)
- kfree(rbio->promote);
+ promote_free(rbio->c, rbio->promote);
+ rbio->promote = NULL;
+
if (rbio->bounce)
bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
- bio_put(&rbio->bio);
- return parent;
+ if (rbio->split) {
+ struct bch_read_bio *parent = rbio->parent;
+
+ if (rbio->kmalloc)
+ kfree(rbio);
+ else
+ bio_put(&rbio->bio);
+
+ rbio = parent;
+ }
+
+ return rbio;
}
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
- if (rbio->promote)
- kfree(rbio->promote);
- rbio->promote = NULL;
-
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
+ bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time);
bio_endio(&rbio->bio);
}
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
+{
+ struct btree_iter iter;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+ rbio->pos, BTREE_ITER_SLOTS);
+retry:
+ rbio->bio.bi_status = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ if (btree_iter_err(k)) {
+ bch2_btree_iter_unlock(&iter);
+ goto err;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ if (!bkey_extent_is_data(k.k) ||
+ !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset)) {
+ /* extent we wanted to read no longer exists: */
+ rbio->hole = true;
+ goto out;
+ }
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+ if (ret == READ_RETRY)
+ goto retry;
+ if (ret)
+ goto err;
+ goto out;
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+ bch2_rbio_done(rbio);
+}
+
+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+retry:
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector),
+ BTREE_ITER_SLOTS, k) {
+ BKEY_PADDED(k) tmp;
+ unsigned bytes;
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ bytes = min_t(unsigned, bvec_iter.bi_size,
+ (k.k->p.offset - bvec_iter.bi_sector) << 9);
+ swap(bvec_iter.bi_size, bytes);
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+ switch (ret) {
+ case READ_RETRY:
+ goto retry;
+ case READ_ERR:
+ goto err;
+ };
+
+ if (bytes == bvec_iter.bi_size)
+ goto out;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch2_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ __bcache_io_error(c, "btree IO error %i", ret);
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+ bch2_rbio_done(rbio);
+}
+
static void bch2_rbio_retry(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- u64 inode = rbio->pos.inode;
+ struct bch_fs *c = rbio->c;
+ struct bvec_iter iter = rbio->bvec_iter;
+ unsigned flags = rbio->flags;
+ u64 inode = rbio->pos.inode;
struct bch_devs_mask avoid;
trace_read_retry(&rbio->bio);
@@ -1018,26 +1294,19 @@ static void bch2_rbio_retry(struct work_struct *work)
memset(&avoid, 0, sizeof(avoid));
if (rbio->retry == READ_RETRY_AVOID)
- __set_bit(rbio->pick.ca->dev_idx, avoid.d);
+ __set_bit(rbio->pick.ptr.dev, avoid.d);
- if (rbio->promote)
- kfree(rbio->promote);
- rbio->promote = NULL;
+ rbio->bio.bi_status = 0;
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
- else
- rbio->bio.bi_status = 0;
+ rbio = bch2_rbio_free(rbio);
- if (!(flags & BCH_READ_NODECODE))
- flags |= BCH_READ_MUST_CLONE;
flags |= BCH_READ_IN_RETRY;
flags &= ~BCH_READ_MAY_PROMOTE;
if (flags & BCH_READ_NODECODE)
- bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags);
+ bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags);
else
- __bch2_read(c, rbio, iter, inode, &avoid, flags);
+ bch2_read_retry(c, rbio, iter, inode, &avoid, flags);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
@@ -1049,7 +1318,9 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
return;
if (retry == READ_ERR) {
- bch2_rbio_parent(rbio)->bio.bi_status = error;
+ rbio = bch2_rbio_free(rbio);
+
+ rbio->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
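
Note: bch2_read_retry() above and bch2_read() below share the same fragment
trick: clamp bytes to the end of the current extent, swap() it with the
iterator's size so the low-level read sees only that fragment, then swap back
and advance. A standalone model of the loop (simplified types, userspace only;
extent_end[] holds each extent's end sector):

#include <stdint.h>
#include <stdio.h>

struct iter {
	uint64_t	sector;	/* bi_sector */
	uint32_t	size;	/* bi_size, in bytes */
};

static void read_fragments(struct iter it, const uint64_t *extent_end, unsigned n)
{
	unsigned i;

	for (i = 0; i < n && it.size; i++) {
		uint32_t bytes = (uint32_t) ((extent_end[i] - it.sector) << 9);

		if (bytes > it.size)
			bytes = it.size;

		/* swap(iter.bi_size, bytes): narrow the iter to one extent */
		uint32_t saved = it.size;
		it.size = bytes;

		printf("fragment: %u bytes at sector %llu\n",
		       it.size, (unsigned long long) it.sector);

		if (it.size == saved)	/* fragment covered all that was left */
			return;

		/* swap back, then advance past the fragment */
		it.size = saved - bytes;
		it.sector += bytes >> 9;
	}
}

For example, read_fragments((struct iter) { .sector = 0, .size = 24 << 9 },
(const uint64_t[]) { 8, 16, 32 }, 3) issues three 8-sector fragments, one per
extent, mirroring how the loops above walk BTREE_ID_EXTENTS.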
@@ -1121,12 +1392,13 @@ out:
bch2_btree_iter_unlock(&iter);
}
-static bool should_narrow_crcs(struct bkey_s_c_extent e,
+static bool should_narrow_crcs(struct bkey_s_c k,
struct extent_pick_ptr *pick,
unsigned flags)
{
return !(flags & BCH_READ_IN_RETRY) &&
- bch2_can_narrow_extent_crcs(e, pick->crc);
+ bkey_extent_is_data(k.k) &&
+ bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
}
/* Inner part that may run in process context */
@@ -1134,8 +1406,10 @@ static void __bch2_read_endio(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
@@ -1191,10 +1465,13 @@ static void __bch2_read_endio(struct work_struct *work)
*/
bch2_encrypt_bio(c, crc.csum_type, nonce, src);
promote_start(rbio->promote, rbio);
+ rbio->promote = NULL;
}
nodecode:
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+ rbio = bch2_rbio_free(rbio);
bch2_rbio_done(rbio);
+ }
return;
csum_err:
/*
@@ -1208,7 +1485,7 @@ csum_err:
return;
}
- bch2_dev_io_error(rbio->pick.ca,
+ bch2_dev_io_error(ca,
"data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
@@ -1227,25 +1504,27 @@ static void bch2_read_endio(struct bio *bio)
{
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
- bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ);
-
- percpu_ref_put(&rbio->pick.ca->io_ref);
+ if (rbio->have_ioref) {
+ bch2_latency_acct(ca, rbio->submit_time, READ);
+ percpu_ref_put(&ca->io_ref);
+ }
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
if (rbio->pick.ptr.cached &&
(((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) {
+ ptr_stale(ca, &rbio->pick.ptr))) {
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -1266,76 +1545,97 @@ static void bch2_read_endio(struct bio *bio)
}
int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c_extent e,
- struct extent_pick_ptr *pick, unsigned flags)
+ struct bvec_iter iter, struct bkey_s_c k,
+ struct bch_devs_mask *avoid, unsigned flags)
{
- struct bch_read_bio *rbio;
- bool split = false, bounce = false, read_full = false;
- bool promote = false, narrow_crcs = false;
- struct bpos pos = bkey_start_pos(e.k);
- int ret = 0;
+ struct extent_pick_ptr pick;
+ struct bch_read_bio *rbio = NULL;
+ struct bch_dev *ca;
+ struct promote_op *promote = NULL;
+ bool bounce = false, read_full = false, narrow_crcs = false;
+ struct bpos pos = bkey_start_pos(k.k);
+ int pick_ret;
- lg_local_lock(&c->usage_lock);
- bucket_io_clock_reset(c, pick->ca,
- PTR_BUCKET_NR(pick->ca, &pick->ptr), READ);
- lg_local_unlock(&c->usage_lock);
+ pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick);
+
+ /* hole or reservation - just zero fill: */
+ if (!pick_ret)
+ goto hole;
- narrow_crcs = should_narrow_crcs(e, pick, flags);
+ if (pick_ret < 0)
+ goto no_device;
+
+ if (pick_ret > 0)
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
if (flags & BCH_READ_NODECODE) {
- BUG_ON(iter.bi_size < pick->crc.compressed_size << 9);
- iter.bi_size = pick->crc.compressed_size << 9;
+ /*
+ * can happen if we retry, and the extent we were going to read
+ * has been merged in the meantime:
+ */
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ goto hole;
+
+ iter.bi_sector = pos.offset;
+ iter.bi_size = pick.crc.compressed_size << 9;
goto noclone;
}
+ if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_MUST_CLONE;
+
+ narrow_crcs = should_narrow_crcs(k, &pick, flags);
+
if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;
- EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector ||
- e.k->p.offset < bvec_iter_end_sector(iter));
+ EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
+ k.k->p.offset < bvec_iter_end_sector(iter));
- if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
- (pick->crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick->crc.csum_type) &&
+ if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
+ (pick.crc.csum_type != BCH_CSUM_NONE &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
(flags & BCH_READ_MUST_BOUNCE)))) {
read_full = true;
bounce = true;
}
- promote = should_promote(c, e, flags, orig->opts.promote_target);
- /* could also set read_full */
- if (promote)
- bounce = true;
+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+ &rbio, &bounce, &read_full);
if (!read_full) {
- EBUG_ON(pick->crc.compression_type);
- EBUG_ON(pick->crc.csum_type &&
- (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick->crc.live_size ||
- pick->crc.offset ||
+ EBUG_ON(pick.crc.compression_type);
+ EBUG_ON(pick.crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick.crc.live_size ||
+ pick.crc.offset ||
iter.bi_sector != pos.offset));
- pick->ptr.offset += pick->crc.offset +
+ pick.ptr.offset += pick.crc.offset +
(iter.bi_sector - pos.offset);
- pick->crc.compressed_size = bvec_iter_sectors(iter);
- pick->crc.uncompressed_size = bvec_iter_sectors(iter);
- pick->crc.offset = 0;
- pick->crc.live_size = bvec_iter_sectors(iter);
+ pick.crc.compressed_size = bvec_iter_sectors(iter);
+ pick.crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick.crc.offset = 0;
+ pick.crc.live_size = bvec_iter_sectors(iter);
pos.offset = iter.bi_sector;
}
- if (bounce) {
- unsigned sectors = pick->crc.compressed_size;
+ if (rbio) {
+ /* promote already allocated bounce rbio */
+ } else if (bounce) {
+ unsigned sectors = pick.crc.compressed_size;
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split),
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ &c->bio_read_split),
orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- split = true;
+ rbio->bounce = true;
+ rbio->split = true;
} else if (flags & BCH_READ_MUST_CLONE) {
/*
* Have to clone if there were any splits, due to error
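
Note: the read_full/bounce condition above, reduced to named booleans (a
paraphrase only; the parameters stand in for the crc fields and read flags):
the read must be bounced and widened to the whole extent when the data is
compressed, or when it is checksummed and the request covers only part of the
extent, targets user-mapped pages with encryption, or explicitly asks to
bounce.

#include <stdbool.h>

/* paraphrase of the condition in __bch2_read_extent(); illustrative only */
static bool must_read_full(bool compressed, bool csummed, bool partial_read,
			   bool encrypted, bool user_mapped, bool must_bounce)
{
	return compressed ||
	       (csummed && (partial_read ||
			    (encrypted && user_mapped) ||
			    must_bounce));
}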
@@ -1349,156 +1649,138 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
&c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter;
- split = true;
+ rbio->split = true;
} else {
noclone:
rbio = orig;
rbio->bio.bi_iter = iter;
- split = false;
BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size);
+ BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
rbio->c = c;
- if (split)
+ rbio->submit_time = local_clock();
+ if (rbio->split)
rbio->parent = orig;
else
rbio->end_io = orig->bio.bi_end_io;
rbio->bvec_iter = iter;
- rbio->submit_time_us = local_clock_us();
rbio->flags = flags;
- rbio->bounce = bounce;
- rbio->split = split;
+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
rbio->narrow_crcs = narrow_crcs;
+ rbio->hole = 0;
rbio->retry = 0;
rbio->context = 0;
- rbio->devs_have = bch2_extent_devs(e);
- rbio->pick = *pick;
+ rbio->devs_have = bch2_bkey_devs(k);
+ rbio->pick = pick;
rbio->pos = pos;
- rbio->version = e.k->version;
- rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL;
+ rbio->version = k.k->version;
+ rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
- bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
- if (bounce)
+ if (rbio->bounce)
trace_read_bounce(&rbio->bio);
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
+
+ if (!rbio->have_ioref)
+ goto no_device_postclone;
+
+ lg_local_lock(&c->usage_lock);
+ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
+ lg_local_unlock(&c->usage_lock);
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ if (!(flags & BCH_READ_LAST_FRAGMENT)) {
+ bio_inc_remaining(&orig->bio);
+ trace_read_split(&orig->bio);
+ }
+
submit_bio(&rbio->bio);
+ return 0;
} else {
+ int ret;
+
submit_bio_wait(&rbio->bio);
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
- if (!ret)
- bch2_rbio_done(rbio);
- }
-
- return ret;
-}
-
-static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
-{
- struct extent_pick_ptr pick;
- struct btree_iter iter;
- BKEY_PADDED(k) tmp;
- struct bkey_s_c k;
- int ret;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS);
-retry:
- k = bch2_btree_iter_peek_slot(&iter);
- if (btree_iter_err(k)) {
- bch2_btree_iter_unlock(&iter);
- goto err;
- }
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&iter);
+ rbio = bch2_rbio_free(rbio);
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
- rbio->pick.ptr,
- rbio->pos.offset -
- rbio->pick.crc.offset) ||
- bkey_start_offset(k.k) != bvec_iter.bi_sector)
- goto err;
+ if (ret == READ_RETRY_AVOID) {
+ __set_bit(pick.ptr.dev, avoid->d);
+ ret = READ_RETRY;
+ }
- bch2_extent_pick_ptr(c, k, avoid, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, &rbio->bio, "no device to read from");
- bio_endio(&rbio->bio);
- return;
+ return ret;
}
- if (!pick.ca)
- goto err;
+no_device_postclone:
+ if (!rbio->split)
+ rbio->bio.bi_end_io = rbio->end_io;
+ bch2_rbio_free(rbio);
+no_device:
+ __bcache_io_error(c, "no device to read from");
- if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) {
- percpu_ref_put(&pick.ca->io_ref);
- goto err;
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ orig->bio.bi_status = BLK_STS_IOERR;
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bch2_rbio_done(orig);
+ return 0;
+ } else {
+ return READ_ERR;
}
- ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k),
- &pick, flags);
- switch (ret) {
- case READ_RETRY_AVOID:
- __set_bit(pick.ca->dev_idx, avoid->d);
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- bio_endio(&rbio->bio);
- return;
- };
-
- return;
-err:
+hole:
/*
- * extent we wanted to read no longer exists, or
- * was merged or partially overwritten (and thus
- * possibly bigger than the memory that was
- * originally allocated)
+ * won't normally happen in the BCH_READ_NODECODE
+ * (bch2_move_extent()) path, but if we retry and the extent we wanted
+ * to read no longer exists we have to signal that:
*/
- rbio->bio.bi_status = BLK_STS_AGAIN;
- bio_endio(&rbio->bio);
- return;
+ if (flags & BCH_READ_NODECODE)
+ orig->hole = true;
+
+ zero_fill_bio_iter(&orig->bio, iter);
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bch2_rbio_done(orig);
+ return 0;
}
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
{
struct btree_iter iter;
struct bkey_s_c k;
+ unsigned flags = BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE|
+ BCH_READ_USER_MAPPED;
int ret;
- EBUG_ON(flags & BCH_READ_NODECODE);
-retry:
+ BUG_ON(rbio->_state);
+ BUG_ON(flags & BCH_READ_NODECODE);
+ BUG_ON(flags & BCH_READ_IN_RETRY);
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
+ POS(inode, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_SLOTS, k) {
BKEY_PADDED(k) tmp;
- struct extent_pick_ptr pick;
- struct bvec_iter fragment;
+ unsigned bytes;
/*
* Unlock the iterator while the btree node's lock is still in
@@ -1508,49 +1790,20 @@ retry:
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&iter);
- bch2_extent_pick_ptr(c, k, avoid, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, &rbio->bio, "no device to read from");
- bio_endio(&rbio->bio);
- return;
- }
-
- fragment = bvec_iter;
- fragment.bi_size = (min_t(u64, k.k->p.offset,
- bvec_iter_end_sector(bvec_iter)) -
- bvec_iter.bi_sector) << 9;
+ bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
+ (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
- if (pick.ca) {
- if (fragment.bi_size != bvec_iter.bi_size) {
- bio_inc_remaining(&rbio->bio);
- flags |= BCH_READ_MUST_CLONE;
- trace_read_split(&rbio->bio);
- }
+ if (rbio->bio.bi_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
- ret = __bch2_read_extent(c, rbio, fragment,
- bkey_s_c_to_extent(k),
- &pick, flags);
- switch (ret) {
- case READ_RETRY_AVOID:
- __set_bit(pick.ca->dev_idx, avoid->d);
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- rbio->bio.bi_status = BLK_STS_IOERR;
- bio_endio(&rbio->bio);
- return;
- };
- } else {
- zero_fill_bio_iter(&rbio->bio, fragment);
-
- if (fragment.bi_size == bvec_iter.bi_size)
- bio_endio(&rbio->bio);
- }
+ bch2_read_extent(c, rbio, k, flags);
- if (fragment.bi_size == bvec_iter.bi_size)
+ if (flags & BCH_READ_LAST_FRAGMENT)
return;
- bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
}
/*
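
Note: BCH_READ_LAST_FRAGMENT replaces the old per-split bio_inc_remaining()
bookkeeping in this loop: every fragment except the last bumps the parent
bio's remaining count inside __bch2_read_extent(), and each completion drops
it, so the parent's end_io runs exactly once. A toy model of that counting
(userspace; the real mechanism is the block layer's __bi_remaining):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_bio {
	atomic_int remaining;	/* starts at 1, like a fresh bio */
};

static void submit_fragment(struct toy_bio *parent, bool last_fragment)
{
	if (!last_fragment)	/* bio_inc_remaining(&orig->bio) */
		atomic_fetch_add(&parent->remaining, 1);
}

static void fragment_endio(struct toy_bio *parent)
{
	if (atomic_fetch_sub(&parent->remaining, 1) == 1)
		printf("parent completes\n");	/* bch2_rbio_done() */
}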
@@ -1560,5 +1813,34 @@ retry:
ret = bch2_btree_iter_unlock(&iter);
BUG_ON(!ret);
bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
- bio_endio(&rbio->bio);
+ bch2_rbio_done(rbio);
+}
+
+void bch2_fs_io_exit(struct bch_fs *c)
+{
+ if (c->promote_table.tbl)
+ rhashtable_destroy(&c->promote_table);
+ mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->bio_write);
+ bioset_exit(&c->bio_read_split);
+ bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->opts.btree_node_size,
+ c->sb.encoded_extent_max) /
+ PAGE_SECTORS, 0) ||
+ rhashtable_init(&c->promote_table, &bch_promote_params))
+ return -ENOMEM;
+
+ return 0;
}
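
Note: bch2_fs_io_init() sizes the bounce page pool for the largest buffer the
I/O paths might need to bounce: the larger of a btree node and the maximum
encoded extent, converted from 512-byte sectors to pages. A quick worked
example with assumed values:

#include <stdio.h>

int main(void)
{
	unsigned btree_node_size    = 512;	/* sectors: 256KiB nodes (assumed) */
	unsigned encoded_extent_max = 128;	/* sectors (assumed) */
	unsigned page_sectors       = 8;	/* 4KiB pages */
	unsigned pages = (btree_node_size > encoded_extent_max
			  ? btree_node_size : encoded_extent_max) / page_sectors;

	printf("bounce pool: %u pages\n", pages);	/* 64 */
	return 0;
}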