Diffstat (limited to 'drivers/md/bcache/bch2.c')
 drivers/md/bcache/bch2.c | 540 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 540 insertions(+), 0 deletions(-)
diff --git a/drivers/md/bcache/bch2.c b/drivers/md/bcache/bch2.c
new file mode 100644
index 000000000000..b0ada5a61564
--- /dev/null
+++ b/drivers/md/bcache/bch2.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Main bcache entry point - handle a read or a write request and decide what to
+ * do with it; the make_request functions are called by the block layer.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "backingdev.h"
+#include "bch2.h"
+
+#include "../../../fs/bcachefs/bcachefs.h"
+#include "../../../fs/bcachefs/alloc_foreground.h"
+#include "../../../fs/bcachefs/btree_update.h"
+#include "../../../fs/bcachefs/buckets.h"
+#include "../../../fs/bcachefs/io.h"
+#include "../../../fs/bcachefs/fs.h"
+#include "../../../fs/bcachefs/fs-common.h"
+#include "../../../fs/bcachefs/str_hash.h"
+
+#include "io.h"
+
+#include <linux/kthread.h>
+//#include <trace/events/bcache.h>
+
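+/*
+ * Filesystem space usage as a percentage of capacity; feeds the bypass and
+ * writeback heuristics below.
+ */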
+static unsigned fs_used_percent(struct bch_fs *c)
+{
+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
+
+ return div64_u64(usage.used * 100, usage.capacity);
+}
+
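+/* XXX: stub - checking against keys being written back isn't implemented yet */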
+static inline bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
+ struct bkey *end)
+{
+ return false;
+}
+
+/* Reads: */
+
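+/* State for a read sent down the bch2 read path; wraps the caller's original bio: */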
+struct bch_cached_dev_rbio {
+ struct bio *orig;
+ struct cached_dev *dc;
+ struct bch_read_bio rbio;
+};
+
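+/* Read completion: drop our bio and cached_dev refs, then complete the original bio: */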
+static void cached_dev_read_endio(struct bio *bio)
+{
+ struct bch_cached_dev_rbio *c_rbio =
+ container_of(bio, struct bch_cached_dev_rbio, rbio.bio);
+ struct bio *orig = c_rbio->orig;
+ struct cached_dev *dc = c_rbio->dc;
+
+ bio_put(bio);
+ cached_dev_put(dc);
+ bio_endio(orig);
+}
+
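+/*
+ * Clone the bio and send it down the bch2 read path. bch_check_should_bypass()
+ * will eventually gate promoting cache misses, but promotes from block devices
+ * aren't implemented yet.
+ */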
+static void cached_dev_read(struct cached_dev *dc, struct bio *bio)
+{
+ struct bch_fs *c = dc->disk.c2;
+ struct bch_read_bio *rbio;
+ struct bch_cached_dev_rbio *c_rbio;
+ struct bch_io_opts opts = { 0 };
+ unsigned flags = BCH_READ_RETRY_IF_STALE|
+ BCH_READ_USER_MAPPED|
+ BCH_READ_PASSTHROUGH_BLOCK_DEV;
+
+ if (!bch_check_should_bypass(dc, bio, c->opts.block_size, 0)) {
+ /* XXX: implement promotes from block devices in bch2:
+ flags |= BCH_READ_MAY_PROMOTE;
+ */
+ }
+
+ /* XXX: plumb through write point for promotes:
+ unsigned write_point = writepoint_hashed((unsigned long) current);
+ */
+
+ rbio = rbio_init(bio_clone_fast(bio, GFP_NOIO, &dc->bch2_bio_read), opts);
+ rbio->bio.bi_end_io = cached_dev_read_endio;
+ c_rbio = container_of(rbio, struct bch_cached_dev_rbio, rbio);
+ c_rbio->orig = bio;
+ c_rbio->dc = dc;
+
+ bch2_read(c, rbio, dc->disk.id, flags);
+}
+
+/* Writes: */
+
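+/*
+ * State for a write (or flush) to a cached device: tracks the original bio, the
+ * clone sent to the backing device, and the bch2 write op for the cache.
+ */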
+struct bch_write {
+ struct closure cl;
+
+ struct bcache_device *d;
+ struct bio *orig_bio;
+ struct bio backingdev_bio;
+
+ blk_status_t status;
+ unsigned long start_time;
+
+ unsigned int bypass:1;
+ unsigned int writeback:1;
+ struct bch_write_op op;
+};
+
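+/* Final completion: account the IO, complete the original bio and free our state: */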
+static void cached_dev_bio_complete(struct closure *cl)
+{
+ struct bch_write *io = container_of(cl, struct bch_write, cl);
+ struct cached_dev *dc = container_of(io->d, struct cached_dev, disk);
+
+ generic_end_io_acct(io->d->disk->queue, bio_op(io->orig_bio),
+ &io->d->disk->part0, io->start_time);
+
+ //trace_bcache_request_end(io->d, io->orig_bio);
+ io->orig_bio->bi_status = io->status;
+ bio_endio(io->orig_bio);
+
+ closure_debug_destroy(cl);
+ mempool_free(io, &dc->bch2_io_write);
+
+ cached_dev_put(dc);
+}
+
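+/* Completion for writes from cached_dev_write(), which holds writeback_lock: */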
+static void cached_dev_write_complete(struct closure *cl)
+{
+ struct bch_write *s = container_of(cl, struct bch_write, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ up_read_non_owner(&dc->writeback_lock);
+ cached_dev_bio_complete(cl);
+}
+
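+/* Endio for the clone sent to the backing device: record any error, drop our closure ref: */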
+static void backingdev_endio(struct bio *bio)
+{
+ struct bch_write *io = container_of(bio, struct bch_write, backingdev_bio);
+ struct cached_dev *dc = container_of(io->d, struct cached_dev, disk);
+
+ if (bio->bi_status) {
+ io->status = bio->bi_status;
+ bch_count_backing_io_errors(dc, bio);
+ }
+
+ closure_put(&io->cl);
+}
+
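+/* Clone the original bio and submit it to the backing device, skipping unsupported discards: */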
+static void submit_backingdev_io(struct bch_write *io)
+{
+ struct cached_dev *dc = container_of(io->d, struct cached_dev, disk);
+
+ /*
+ * If it's a discard and the backing device doesn't support discards, no
+ * need to submit it:
+ */
+ if (bio_op(io->orig_bio) == REQ_OP_DISCARD &&
+ !blk_queue_discard(bdev_get_queue(dc->bdev)))
+ return;
+
+ bio_init(&io->backingdev_bio, NULL, 0);
+ __bio_clone_fast(&io->backingdev_bio, io->orig_bio);
+ io->backingdev_bio.bi_end_io = backingdev_endio;
+
+ closure_get(&io->cl);
+ generic_make_request(&io->backingdev_bio);
+}
+
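+/*
+ * Writes: decide whether to bypass the cache, write through, or force writeback,
+ * then submit to the backing device and/or the bch2 cache accordingly.
+ */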
+static void cached_dev_write(struct cached_dev *dc, struct bio *orig_bio)
+{
+ struct bch_fs *c = dc->disk.c2;
+ struct bch_write *io;
+ struct bkey start = KEY(dc->disk.id, orig_bio->bi_iter.bi_sector, 0);
+ struct bkey end = KEY(dc->disk.id, bio_end_sector(orig_bio), 0);
+ struct bch_io_opts opts = { 0 };
+ unsigned in_use = fs_used_percent(c);
+
+ io = mempool_alloc(&dc->bch2_io_write, GFP_NOIO);
+ closure_init(&io->cl, NULL);
+ io->d = &dc->disk;
+ io->orig_bio = orig_bio;
+ io->status = 0;
+ io->start_time = jiffies;
+ io->bypass = bch_check_should_bypass(dc, orig_bio,
+ c->opts.block_size, in_use);
+ io->writeback = false;
+
+ down_read_non_owner(&dc->writeback_lock);
+ if (bch_keybuf_check_overlapping(dc->writeback_keys, &start, &end)) {
+ /*
+ * We overlap with some dirty data undergoing background
+ * writeback, force this write to writeback
+ */
+ io->bypass = false;
+ io->writeback = true;
+ }
+
+ /*
+ * Discards aren't _required_ to do anything, so skipping if
+ * check_overlapping returned true is ok
+ *
+ * But check_overlapping drops dirty keys for which io hasn't started,
+ * so we still want to call it.
+ */
+ if (bio_op(orig_bio) == REQ_OP_DISCARD) {
+ io->bypass = true;
+ io->writeback = false;
+ }
+
+ if (should_writeback(dc, io->orig_bio, cache_mode(dc),
+ io->bypass, in_use)) {
+ io->bypass = false;
+ io->writeback = true;
+ }
+
+ /*
+ * Submit IO to the backing device, if we're not doing a writeback write;
+ * submit_backingdev_io() skips discards the backing device doesn't support.
+ */
+ if (!io->writeback)
+ submit_backingdev_io(io);
+
+ /* If we're bypassing, delete the range we're writing to from the cache: */
+ if (io->bypass) {
+ u64 journal_seq = 0;
+
+ bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+ POS(dc->disk.id, orig_bio->bi_iter.bi_sector),
+ POS(dc->disk.id, bio_end_sector(orig_bio)),
+ &journal_seq);
+
+ if ((orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) &&
+ !(c->opts.journal_flush_disabled))
+ bch2_journal_flush_seq_async(&c->journal, journal_seq, &io->cl);
+ } else {
+ bch2_write_op_init(&io->op, c, opts);
+ bio_init(&io->op.wbio.bio, NULL, 0);
+ __bio_clone_fast(&io->op.wbio.bio, orig_bio);
+ io->op.nr_replicas = 1;
+ io->op.write_point = writepoint_hashed((unsigned long) current);
+ io->op.new_i_size = U64_MAX;
+ io->op.pos = POS(dc->disk.id, orig_bio->bi_iter.bi_sector);
+
+ if (orig_bio->bi_opf & (REQ_FUA|REQ_PREFLUSH))
+ io->op.flags |= BCH_WRITE_FLUSH;
+
+ if (io->writeback) {
+ int ret = bch2_disk_reservation_get(c, &io->op.res, bio_sectors(orig_bio),
+ io->op.nr_replicas, 0);
+ if (ret) {
+ io->status = BLK_STS_RESOURCE;
+ goto err;
+ }
+
+ /* Mark superblock dirty, if necessary: */
+ bch_writeback_add(dc);
+ } else {
+ io->op.flags |= BCH_WRITE_CACHED;
+ }
+
+ closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
+ }
+err:
+ continue_at(&io->cl, cached_dev_write_complete, NULL);
+}
+
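+/*
+ * Empty (flush) bios: flush the backing device and/or the bch2 journal,
+ * depending on cache mode and whether journal flushes are disabled.
+ */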
+static void cached_dev_nodata(struct cached_dev *dc, struct bio *orig_bio)
+{
+ struct bch_fs *c = dc->disk.c2;
+ bool flush_backingdev = cache_mode(dc) != CACHE_MODE_WRITEBACK;
+ bool flush_cache = !c->opts.journal_flush_disabled;
+ struct bch_write *io;
+
+ if (!(orig_bio->bi_opf & REQ_PREFLUSH)) {
+ generic_make_request(orig_bio);
+ return;
+ }
+
+ if (!flush_backingdev && !flush_cache) {
+ bio_endio(orig_bio);
+ return;
+ }
+
+ if (!flush_cache) {
+ generic_make_request(orig_bio);
+ return;
+ }
+
+ io = mempool_alloc(&dc->bch2_io_write, GFP_NOIO);
+ closure_init(&io->cl, NULL);
+ io->d = &dc->disk;
+ io->orig_bio = orig_bio;
+ io->status = 0;
+ io->start_time = jiffies;
+ io->bypass = false;
+ io->writeback = false;
+
+ if (flush_backingdev)
+ submit_backingdev_io(io);
+
+ bch2_journal_flush_async(&c->journal, &io->cl);
+ continue_at(&io->cl, cached_dev_bio_complete, NULL);
+}
+
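+/* Main request entry point: dispatch flushes, writes and reads for a cached device: */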
+void bch2_cached_dev_make_request(struct cached_dev *dc, struct bio *bio)
+{
+ //trace_bcache_request_start(d, bio);
+
+ if (!bio->bi_iter.bi_size)
+ cached_dev_nodata(dc, bio);
+ else if (bio_data_dir(bio) == WRITE)
+ cached_dev_write(dc, bio);
+ else
+ cached_dev_read(dc, bio);
+}
+
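+/*
+ * Look up - or, unless must_exist, create - the inode backing this device in
+ * the bcachefs root directory, returning its inode number in *inum.
+ */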
+static int bch2_dev_attach_trans(struct btree_trans *trans,
+ struct qstr *name,
+ u64 *inum,
+ bool must_exist)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root_inode;
+ struct bch_inode_unpacked dev_inode;
+ struct bch_hash_info root_hash_info;
+ struct btree_iter *iter;
+ int ret;
+
+ ret = bch2_inode_find_by_inum_trans(trans, BCACHEFS_ROOT_INO, &root_inode);
+ if (ret)
+ return ret;
+
+ root_hash_info = bch2_hash_info_init(c, &root_inode);
+
+ iter = __bch2_dirent_lookup_trans(trans, BCACHEFS_ROOT_INO,
+ &root_hash_info, name, 0);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret && ret != -ENOENT)
+ return ret;
+
+ if (!ret) {
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ *inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+ return 0;
+ }
+
+ if (must_exist)
+ return ret;
+
+ /* Doesn't exist, create it: */
+ bch2_inode_init_early(c, &dev_inode);
+
+ ret = bch2_create_trans(trans, BCACHEFS_ROOT_INO,
+ &root_inode, &dev_inode,
+ name, 0, 0, S_IFREG, 0, NULL, NULL) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ *inum = dev_inode.bi_inum;
+ return ret;
+}
+
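+/*
+ * Attach a backing device to a bch2 filesystem: its data lives in a
+ * "backing-device-<uuid>" file in the filesystem's root directory.
+ */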
+static int bch2_cached_dev_attach_one(struct cached_dev *dc, struct bch_fs *c,
+ uint8_t *fs_uuid)
+{
+ char backingdev_filename[80];
+ struct qstr backingdev_qstr;
+ struct inode *inode = NULL;
+ u64 inum;
+ int ret = 0;
+
+ snprintf(backingdev_filename, sizeof(backingdev_filename),
+ "backing-device-%pU", dc->sb.uuid);
+ backingdev_qstr = (struct qstr) QSTR_INIT(backingdev_filename,
+ strlen(backingdev_filename));
+
+ if (bcache_dev_is_attached(&dc->disk)) {
+ pr_err("Can't attach %s: already attached",
+ dc->backing_dev_name);
+ return -EINVAL;
+ }
+#if 0
+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
+ pr_err("Can't attach %s: shutting down",
+ dc->backing_dev_name);
+ return -EINVAL;
+ }
+#endif
+ if (dc->sb.block_size < c->opts.block_size) {
+ /* Will die */
+ pr_err("Can't attach %s: block size less than the filesystem's block size",
+ dc->backing_dev_name);
+ return -EINVAL;
+ }
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_dev_attach_trans(&trans, &backingdev_qstr, &inum,
+ BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY));
+ if (ret) {
+ pr_err("Error attaching %s: %i\n",
+ dc->backing_dev_name, ret);
+ return ret;
+ }
+
+ inode = bch2_vfs_inode_get(c, inum);
+ if (IS_ERR(inode)) {
+ pr_err("Can't attach %s: error getting inode %li",
+ dc->backing_dev_name, PTR_ERR(inode));
+ return PTR_ERR(inode);
+ }
+
+ ret = get_write_access(inode);
+ if (ret) {
+ pr_err("Can't attach %s: error getting write access %i",
+ dc->backing_dev_name, ret);
+ iput(inode);
+ return ret;
+ }
+
+ /* XXX should we be calling __mnt_want_write() too? */
+
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ ret = bch2_fpunch(c, inum, 0, U64_MAX, NULL, NULL);
+ if (ret) {
+ pr_err("Error attaching %s: error deleting existing data %i\n",
+ dc->backing_dev_name, ret);
+ /* Drop the write access and inode reference taken above: */
+ put_write_access(inode);
+ iput(inode);
+ return ret;
+ }
+
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ }
+
+ /*
+ * XXX: set inode size
+ */
+
+ dc->disk.id = inum;
+ dc->disk.inode = inode;
+ dc->disk.c2 = c;
+#if 0
+ bcache_device_attach(&dc->disk, c, inum);
+ list_move(&dc->list, &c->cached_devs);
+ calc_cached_dev_sectors(c);
+#endif
+ /*
+ * dc->disk.c2 must be set before dc->count != 0 - paired with the mb in
+ * cached_dev_get()
+ */
+ smp_wmb();
+ refcount_set(&dc->count, 1);
+#if 0
+ /* Block writeback thread, but spawn it */
+ down_write(&dc->writeback_lock);
+ if (bch_cached_dev_writeback_start(dc)) {
+ up_write(&dc->writeback_lock);
+ pr_err("Couldn't start writeback facilities for %s",
+ dc->disk.disk->disk_name);
+ return -ENOMEM;
+ }
+
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+ atomic_set(&dc->has_dirty, 1);
+ bch_writeback_queue(dc);
+ }
+
+ bch_sectors_dirty_init(&dc->disk);
+
+ ret = bch_cached_dev_run(dc);
+ if (ret && (ret != -EBUSY)) {
+ up_write(&dc->writeback_lock);
+ /*
+ * bch_register_lock is held, bcache_device_stop() is not
+ * able to be directly called. The kthread and kworker
+ * created previously in bch_cached_dev_writeback_start()
+ * have to be stopped manually here.
+ */
+ kthread_stop(dc->writeback_thread);
+ cancel_writeback_rate_update_dwork(dc);
+ pr_err("Couldn't run cached device %s",
+ dc->backing_dev_name);
+ return ret;
+ }
+
+ /* Allow the writeback thread to proceed */
+ up_write(&dc->writeback_lock);
+#endif
+
+#if 0
+ bcache_device_link(&dc->disk, c, "bdev");
+ atomic_inc(&c->attached_dev_nr);
+#endif
+
+ pr_info("Caching %s as %s on set %pU",
+ dc->backing_dev_name,
+ dc->disk.disk->disk_name,
+ &dc->disk.c2->sb.uuid);
+ return 0;
+}
+
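+/* Find a registered bch2 filesystem by uuid and attach the backing device to it: */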
+int bch2_cached_dev_attach(struct cached_dev *dc, uint8_t *fs_uuid)
+{
+ struct bch_fs *c;
+ int ret;
+
+ mutex_lock(&bch2_fs_list_lock);
+ list_for_each_entry(c, &bch2_fs_list, list) {
+ if (fs_uuid
+ ? !memcmp(fs_uuid, &c->sb.user_uuid, 16)
+ : !memcmp(dc->sb.set_uuid, &c->sb.uuid, 16)) {
+ closure_get(&c->cl);
+ mutex_unlock(&bch2_fs_list_lock);
+ goto found;
+ }
+ }
+ mutex_unlock(&bch2_fs_list_lock);
+ return -ENOENT;
+found:
+ ret = bch2_cached_dev_attach_one(dc, c, fs_uuid);
+ closure_put(&c->cl);
+ return ret;
+}
+
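+/* Per-device bioset/mempool setup and teardown for the bch2 request paths: */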
+void bch2_request_exit(struct cached_dev *dc)
+{
+ mempool_exit(&dc->bch2_io_write);
+ bioset_exit(&dc->bch2_bio_read);
+}
+
+int bch2_request_init(struct cached_dev *dc)
+{
+ return bioset_init(&dc->bch2_bio_read, 1,
+ offsetof(struct bch_cached_dev_rbio, rbio.bio),
+ BIOSET_NEED_RESCUER) ?:
+ mempool_init_kmalloc_pool(&dc->bch2_io_write, 1, sizeof(struct bch_write));
+}