diff options
Diffstat (limited to 'drivers/md')
95 files changed, 2185 insertions, 1489 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4a249ee86364..83b9362be09c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -178,7 +178,7 @@ config MD_FAULTY config MD_CLUSTER - tristate "Cluster Support for MD (EXPERIMENTAL)" + tristate "Cluster Support for MD" depends on BLK_DEV_MD depends on DLM default n @@ -188,7 +188,8 @@ config MD_CLUSTER nodes in the cluster can access the MD devices simultaneously. This brings the redundancy (and uptime) of RAID levels across the - nodes of the cluster. + nodes of the cluster. Currently, it can work with raid1 and raid10 + (limited support). If unsure, say N. diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 786ec9e86d65..f701bb211783 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the kernel software RAID and LVM drivers. # @@ -18,9 +19,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-smq-y += dm-cache-policy-smq.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o -md-mod-y += md.o bitmap.o +md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o +linear-y += md-linear.o +multipath-y += md-multipath.o +faulty-y += md-faulty.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index c488b846f831..d26b35195825 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_BCACHE) += bcache.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ca4abe1ccd8d..a27d85232ce1 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Primary bucket allocation code * @@ -68,6 +69,8 @@ #include <linux/random.h> #include <trace/events/bcache.h> +#define MAX_OPEN_BUCKETS 128 + /* Bucket heap / gen */ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) @@ -404,7 +407,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) finish_wait(&ca->set->bucket_wait, &w); out: - wake_up_process(ca->alloc_thread); + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); trace_bcache_alloc(ca, reserve); @@ -439,6 +443,11 @@ out: b->prio = INITIAL_PRIO; } + if (ca->set->avail_nbuckets > 0) { + ca->set->avail_nbuckets--; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } + return r; } @@ -446,6 +455,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + + if (ca->set->avail_nbuckets < ca->set->nbuckets) { + ca->set->avail_nbuckets++; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } } void bch_bucket_free(struct cache_set *c, struct bkey *k) @@ -598,7 +612,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, /* * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but + * second time we call pick_data_bucket(). If we allocated a bucket but * didn't use it, drop the refcount bch_bucket_alloc_set() took: */ if (KEY_PTRS(&alloc.key)) @@ -671,7 +685,7 @@ int bch_open_buckets_alloc(struct cache_set *c) spin_lock_init(&c->data_bucket_lock); - for (i = 0; i < 6; i++) { + for (i = 0; i < MAX_OPEN_BUCKETS; i++) { struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); if (!b) return -ENOMEM; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dee542fff68e..843877e017e1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_H #define _BCACHE_H @@ -184,6 +185,7 @@ #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/rwsem.h> +#include <linux/refcount.h> #include <linux/types.h> #include <linux/workqueue.h> @@ -265,9 +267,6 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - unsigned long sectors_dirty_last; - long sectors_dirty_derivative; - struct bio_set *bio_split; unsigned data_csum:1; @@ -299,7 +298,7 @@ struct cached_dev { struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. */ - atomic_t count; + refcount_t count; struct work_struct detach; /* @@ -333,6 +332,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; struct keybuf writeback_keys; @@ -361,12 +361,14 @@ struct cached_dev { uint64_t writeback_rate_target; int64_t writeback_rate_proportional; - int64_t writeback_rate_derivative; - int64_t writeback_rate_change; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; + int32_t writeback_rate_change; unsigned writeback_rate_update_seconds; - unsigned writeback_rate_d_term; + unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; }; enum alloc_reserve { @@ -580,6 +582,7 @@ struct cache_set { uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; + size_t avail_nbuckets; struct task_struct *gc_thread; /* Where in the btree gc currently is */ @@ -805,13 +808,13 @@ do { \ static inline void cached_dev_put(struct cached_dev *dc) { - if (atomic_dec_and_test(&dc->count)) + if (refcount_dec_and_test(&dc->count)) schedule_work(&dc->detach); } static inline bool cached_dev_get(struct cached_dev *dc) { - if (!atomic_inc_not_zero(&dc->count)) + if (!refcount_inc_not_zero(&dc->count)) return false; /* Paired with the mb in cached_dev_attach */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 18526d44688d..e56d3ecdbfcb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Code for working with individual keys, and sorted sets of keys with in a * btree node diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ae964624efb2..fa506c1aa524 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 866dcf78ff8e..11c5503d31dc 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * @@ -1240,6 +1241,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) __bch_btree_mark_key(c, level, k); } +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1651,9 +1657,8 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -static size_t bch_btree_gc_finish(struct cache_set *c) +static void bch_btree_gc_finish(struct cache_set *c) { - size_t available = 0; struct bucket *b; struct cache *ca; unsigned i; @@ -1690,6 +1695,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c) } rcu_read_unlock(); + c->avail_nbuckets = 0; for_each_cache(ca, c, i) { uint64_t *i; @@ -1711,18 +1717,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c) BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - available++; + c->avail_nbuckets++; } } mutex_unlock(&c->bucket_lock); - return available; } static void bch_btree_gc(struct cache_set *c) { int ret; - unsigned long available; struct gc_stat stats; struct closure writes; struct btree_op op; @@ -1745,14 +1749,14 @@ static void bch_btree_gc(struct cache_set *c) pr_warn("gc failed!"); } while (ret); - available = bch_btree_gc_finish(c); + bch_btree_gc_finish(c); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; - stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + bch_update_bucket_in_use(c, &stats); memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 73da1f5626cb..d211e2c25b6b 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_BTREE_H #define _BCACHE_BTREE_H @@ -305,5 +306,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); - +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 864e673aec39..1841d0359bac 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -64,27 +64,16 @@ EXPORT_SYMBOL(closure_put); void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; - struct closure *cl; + struct closure *cl, *t; struct llist_node *reverse = NULL; list = llist_del_all(&wait_list->list); /* We first reverse the list to preserve FIFO ordering and fairness */ - - while (list) { - struct llist_node *t = list; - list = llist_next(list); - - t->next = reverse; - reverse = t; - } + reverse = llist_reverse_order(list); /* Then do the wakeups */ - - while (reverse) { - cl = container_of(reverse, struct closure, list); - reverse = llist_next(reverse); - + llist_for_each_entry_safe(cl, t, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 1ec84ca81146..ccfbea6f9f6b 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CLOSURE_H #define _LINUX_CLOSURE_H @@ -251,6 +252,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, static inline void closure_queue(struct closure *cl) { struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); if (wq) { INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); @@ -312,8 +319,6 @@ static inline void closure_wake_up(struct closure_waitlist *list) * been dropped with closure_put()), it will resume execution at @fn running out * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). * - * NOTE: This macro expands to a return in the calling function! - * * This is because after calling continue_at() you no longer have a ref on @cl, * and whatever @cl owns may be freed out from under you - a running closure fn * has a ref on its own closure which continue_at() drops. @@ -340,8 +345,6 @@ do { \ * Causes @fn to be executed out of @cl, in @wq context (or called directly if * @wq is NULL). * - * NOTE: like continue_at(), this macro expands to a return in the caller! - * * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, * thus it's not safe to touch anything protected by @cl after a * continue_at_nobarrier(). diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 35a5a7210e51..c7a02c4900da 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Assorted bcache debug code * @@ -49,7 +50,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 1f63c195d247..acc48d3fa274 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_DEBUG_H #define _BCACHE_DEBUG_H diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 243de0bf15cd..41c238fc3733 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index e2ed54054e7a..0cd3575afa1d 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_EXTENTS_H #define _BCACHE_EXTENTS_H diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 6a9b85095e7b..fac97ec2d0e2 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Some low level IO code, and hacks for various block layer limitations * @@ -34,7 +35,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) struct bbio *b = container_of(bio, struct bbio, bio); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); - bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); b->submit_time_us = local_clock_us(); closure_bio_submit(bio, bio->bi_private); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 0352d05e495c..02a98ddb592d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache journalling code, for btree insertions * @@ -53,7 +54,7 @@ reread: left = ca->sb.bucket_size - offset; bio_reset(bio); bio->bi_iter.bi_sector = bucket + offset; - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; @@ -452,7 +453,7 @@ static void do_journal_discard(struct cache *ca) bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; @@ -623,7 +624,7 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio); bio->bi_iter.bi_sector = PTR_OFFSET(k, i); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index e3c39457afbb..b5788199188f 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_JOURNAL_H #define _BCACHE_JOURNAL_H diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index f633b30c962e..d50c1c97da68 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Moving/copying garbage collector * diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 019b3df9f1c6..3a7aed7282b2 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Main bcache entry point - handle a read or a write request and decide what to * do with it; the make_request functions are called by the block layer. @@ -26,12 +27,12 @@ struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *); -static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) +static unsigned cache_mode(struct cached_dev *dc) { return BDEV_CACHE_MODE(&dc->sb); } -static bool verify(struct cached_dev *dc, struct bio *bio) +static bool verify(struct cached_dev *dc) { return dc->verify; } @@ -196,12 +197,12 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) - wake_up_gc(op->c); - if (op->bypass) return bch_data_invalidate(cl); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) + wake_up_gc(op->c); + /* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. @@ -369,7 +370,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; - unsigned mode = cache_mode(dc, bio); + unsigned mode = cache_mode(dc); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; struct io *i; @@ -384,6 +385,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) op_is_write(bio_op(bio)))) goto skip; + /* + * Flag for bypass if the IO is for read-ahead or background, + * unless the read-ahead request is for metadata (eg, for gfs2). + */ + if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && + !(bio->bi_opf & REQ_META)) + goto skip; + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { pr_debug("skipping unaligned io"); @@ -400,12 +409,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) if (!congested && !dc->sequential_cutoff) goto rescale; - if (!congested && - mode == CACHE_MODE_WRITEBACK && - op_is_write(bio->bi_opf) && - op_is_sync(bio->bi_opf)) - goto rescale; - spin_lock(&dc->io_lock); hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) @@ -468,6 +471,7 @@ struct search { unsigned recoverable:1; unsigned write:1; unsigned read_dirty_data:1; + unsigned cache_missed:1; unsigned long start_time; @@ -607,7 +611,8 @@ static void request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(bio_data_dir(s->orig_bio), + struct request_queue *q = s->orig_bio->bi_disk->queue; + generic_end_io_acct(q, bio_data_dir(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -653,6 +658,7 @@ static inline struct search *search_alloc(struct bio *bio, s->orig_bio = bio; s->cache_miss = NULL; + s->cache_missed = 0; s->d = d; s->recoverable = 1; s->write = op_is_write(bio_op(bio)); @@ -702,8 +708,16 @@ static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - if (s->recoverable) { + /* + * If cache device is dirty (dc->has_dirty is non-zero), then + * recovery a failed read request from cached device may get a + * stale data back. So read failure recovery is only permitted + * when cache device is clean. + */ + if (s->recoverable && + (dc && !atomic_read(&dc->has_dirty))) { /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); @@ -734,7 +748,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio) { bio_reset(s->iop.bio); s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; - s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); @@ -744,7 +758,7 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) + if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); @@ -764,12 +778,12 @@ static void cached_dev_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s->iop.c, s->d, - !s->cache_miss, s->iop.bypass); + !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->iop.bio || verify(dc, &s->bio.bio)) + else if (s->iop.bio || verify(dc)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -783,6 +797,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; + s->cache_missed = 1; + if (s->cache_miss || s->iop.bypass) { miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; @@ -793,7 +809,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, !(bio->bi_opf & REQ_META) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, - bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); + get_capacity(bio->bi_disk) - bio_end_sector(bio)); s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); @@ -819,7 +835,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_submit; cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; - cache_bio->bi_bdev = miss->bi_bdev; + bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = request_endio; @@ -896,7 +912,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, - cache_mode(dc, bio), + cache_mode(dc), s->iop.bypass)) { s->iop.bypass = false; s->iop.writeback = true; @@ -918,7 +934,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, dc->disk.bio_split); - flush->bi_bdev = bio->bi_bdev; + bio_copy_dev(flush, bio); flush->bi_end_io = request_endio; flush->bi_private = cl; flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; @@ -955,13 +971,13 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct bcache_device *d = bio->bi_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - bio->bi_bdev = dc->bdev; + bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { @@ -1071,10 +1087,10 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, { struct search *s; struct closure *cl; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct bcache_device *d = bio->bi_disk->private_data; int rw = bio_data_dir(bio); - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 7689176951ce..dea0886b81c1 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 0ca072c20d0d..be119326297b 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache stats code * @@ -146,9 +147,9 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) } } -static void scale_accounting(unsigned long data) +static void scale_accounting(struct timer_list *t) { - struct cache_accounting *acc = (struct cache_accounting *) data; + struct cache_accounting *acc = from_timer(acc, t, timer); #define move_stat(name) do { \ unsigned t = atomic_xchg(&acc->collector.name, 0); \ @@ -233,9 +234,7 @@ void bch_cache_accounting_init(struct cache_accounting *acc, kobject_init(&acc->day.kobj, &bch_stats_ktype); closure_init(&acc->cl, parent); - init_timer(&acc->timer); + timer_setup(&acc->timer, scale_accounting, 0); acc->timer.expires = jiffies + accounting_delay; - acc->timer.data = (unsigned long) acc; - acc->timer.function = scale_accounting; add_timer(&acc->timer); } diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index adbff141c887..0b70f9de0c03 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_STATS_H_ #define _BCACHE_STATS_H_ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8352fad765f6..b4d28928dec5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); static int bcache_major; -static DEFINE_IDA(bcache_minor); +static DEFINE_IDA(bcache_device_idx); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) -#define BCACHE_MINORS 16 /* partition support */ +/* limitation of partitions number on single bcache device */ +#define BCACHE_MINORS 128 +/* limitation of bcache devices number on single system */ +#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS) /* Superblock */ @@ -257,7 +260,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_init(cl, parent); bio_reset(bio); - bio->bi_bdev = dc->bdev; + bio_set_dev(bio, dc->bdev); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; @@ -303,7 +306,7 @@ void bcache_write_super(struct cache_set *c) SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); bio_reset(bio); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_end_io = write_super_endio; bio->bi_private = ca; @@ -508,7 +511,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_init_stack(cl); bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = prio_endio; @@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, closure_get(&c->caching); } +static inline int first_minor_to_idx(int first_minor) +{ + return (first_minor/BCACHE_MINORS); +} + +static inline int idx_to_first_minor(int idx) +{ + return (idx * BCACHE_MINORS); +} + static void bcache_device_free(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); @@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d) if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); if (d->disk) { - ida_simple_remove(&bcache_minor, d->disk->first_minor); + ida_simple_remove(&bcache_device_idx, + first_minor_to_idx(d->disk->first_minor)); put_disk(d->disk); } @@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; - int minor; + int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; @@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->full_dirty_stripes) return -ENOMEM; - minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); - if (minor < 0) - return minor; - - minor *= BCACHE_MINORS; + idx = ida_simple_get(&bcache_device_idx, 0, + BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + if (idx < 0) + return idx; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS | BIOSET_NEED_RESCUER)) || !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_minor, minor); + ida_simple_remove(&bcache_device_idx, idx); return -ENOMEM; } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); d->disk->major = bcache_major; - d->disk->first_minor = minor; + d->disk->first_minor = idx_to_first_minor(idx); d->disk->fops = &bcache_ops; d->disk->private_data = d; @@ -889,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w) closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(atomic_read(&dc->count)); + BUG_ON(refcount_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -1016,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ - atomic_set(&dc->count, 1); + refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ down_write(&dc->writeback_lock); @@ -1026,9 +1039,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(dc); + bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); - atomic_inc(&dc->count); + refcount_inc(&dc->count); bch_writeback_queue(dc); } @@ -1059,6 +1072,8 @@ static void cached_dev_free(struct closure *cl) cancel_delayed_work_sync(&dc->writeback_rate_update); if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); mutex_lock(&bch_register_lock); @@ -1127,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) if (ret) return ret; - set_capacity(dc->disk.disk, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info->ra_pages = max(dc->disk.disk->queue->backing_dev_info->ra_pages, q->backing_dev_info->ra_pages); @@ -1228,6 +1240,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err; bcache_device_attach(d, c, u - c->uuids); + bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); add_disk(d->disk); @@ -1374,9 +1387,6 @@ static void cache_set_flush(struct closure *cl) struct btree *b; unsigned i; - if (!c) - closure_return(cl); - bch_cache_accounting_destroy(&c->accounting); kobject_put(&c->internal); @@ -1964,6 +1974,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (!IS_ERR(bdev)) + bdput(bdev); if (attr == &ksysfs_register_quiet) goto out; } @@ -2083,6 +2095,7 @@ static void bcache_exit(void) if (bcache_major) unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); } static int __init bcache_init(void) @@ -2101,14 +2114,15 @@ static int __init bcache_init(void) bcache_major = register_blkdev(0, "bcache"); if (bcache_major < 0) { unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); return bcache_major; } if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - sysfs_create_files(bcache_kobj, files) || bch_request_init() || - bch_debug_init(bcache_kobj)) + bch_debug_init(bcache_kobj) || + sysfs_create_files(bcache_kobj, files)) goto err; return 0; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index f90f13616980..b4184092c727 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache sysfs interfaces * @@ -81,8 +82,9 @@ rw_attribute(writeback_delay); rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); -rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -130,15 +132,16 @@ SHOW(__bch_cached_dev) sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); - var_print(writeback_rate_d_term); + var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { char rate[20]; char dirty[20]; char target[20]; char proportional[20]; - char derivative[20]; + char integral[20]; char change[20]; s64 next_io; @@ -146,7 +149,7 @@ SHOW(__bch_cached_dev) bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); bch_hprint(change, dc->writeback_rate_change << 9); next_io = div64_s64(dc->writeback_rate.next - local_clock(), @@ -157,11 +160,11 @@ SHOW(__bch_cached_dev) "dirty:\t\t%s\n" "target:\t\t%s\n" "proportional:\t%s\n" - "derivative:\t%s\n" + "integral:\t%s\n" "change:\t\t%s/sec\n" "next io:\t%llims\n", rate, dirty, target, proportional, - derivative, change, next_io); + integral, change, next_io); } sysfs_hprint(dirty_data, @@ -192,7 +195,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - unsigned v = size; + ssize_t v = size; struct cache_set *c; struct kobj_uevent_env *env; @@ -213,7 +216,7 @@ STORE(__cached_dev) dc->writeback_rate.rate, 1, INT_MAX); d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_d_term); + d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); @@ -227,7 +230,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + v = bch_read_string_list(buf, bch_cache_modes + 1); if (v < 0) return v; @@ -319,7 +322,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_percent, &sysfs_writeback_rate, &sysfs_writeback_rate_update_seconds, - &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, &sysfs_dirty_data, @@ -615,8 +618,21 @@ STORE(__bch_cache_set) bch_cache_accounting_clear(&c->accounting); } - if (attr == &sysfs_trigger_gc) + if (attr == &sysfs_trigger_gc) { + /* + * Garbage collection thread only works when sectors_to_gc < 0, + * when users write to sysfs entry trigger_gc, most of time + * they want to forcibly triger gargage collection. Here -1 is + * set to c->sectors_to_gc, to make gc_should_run() give a + * chance to permit gc thread to run. "give a chance" means + * before going into gc_should_run(), there is still chance + * that c->sectors_to_gc being set to other positive value. So + * writing sysfs entry trigger_gc won't always make sure gc + * thread takes effect. + */ + atomic_set(&c->sectors_to_gc, -1); wake_up_gc(c); + } if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -732,6 +748,11 @@ static struct attribute *bch_cache_set_internal_files[] = { }; KTYPE(bch_cache_set_internal); +static int __bch_cache_cmp(const void *l, const void *r) +{ + return *((uint16_t *)r) - *((uint16_t *)l); +} + SHOW(__bch_cache) { struct cache *ca = container_of(kobj, struct cache, kobj); @@ -756,9 +777,6 @@ SHOW(__bch_cache) CACHE_REPLACEMENT(&ca->sb)); if (attr == &sysfs_priority_stats) { - int cmp(const void *l, const void *r) - { return *((uint16_t *) r) - *((uint16_t *) l); } - struct bucket *b; size_t n = ca->sb.nbuckets, i; size_t unused = 0, available = 0, dirty = 0, meta = 0; @@ -787,7 +805,7 @@ SHOW(__bch_cache) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); - sort(p, n, sizeof(uint16_t), cmp, NULL); + sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL); while (n && !cached[n - 1]) diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index 0526fe92a683..b54fe9602529 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_SYSFS_H_ #define _BCACHE_SYSFS_H_ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index b7820b0d2621..a9a73f560c04 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcache.h" #include "btree.h" diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 8c3a938f4bf0..e548b8b51322 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -74,24 +74,44 @@ STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +/** + * bch_hprint() - formats @v to human readable string for sysfs. + * + * @v - signed 64 bit integer + * @buf - the (at least 8 byte) buffer to format the result into. + * + * Returns the number of bytes used by format. + */ ssize_t bch_hprint(char *buf, int64_t v) { static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; - int u, t = 0; - - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0 << 10); - v >>= 10; - } - - if (!u) - return sprintf(buf, "%llu", v); - - if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 100); - - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + int u = 0, t; + + uint64_t q; + + if (v < 0) + q = -v; + else + q = v; + + /* For as long as the number is more than 3 digits, but at least + * once, shift right / divide by 1024. Keep the remainder for + * a digit after the decimal point. + */ + do { + u++; + + t = q & ~(~0 << 10); + q >>= 10; + } while (q >= 1000); + + if (v < 0) + /* '-', up to 3 digits, '.', 1 digit, 1 character, null; + * yields 8 bytes. + */ + return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]); + else + return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); } ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], @@ -212,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) d->next += div_u64(done * NSEC_PER_SEC, d->rate); - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; + /* Bound the time. Don't let us fall further than 2 seconds behind + * (this prevents unnecessary backlog that would make it impossible + * to catch up). If we're ahead of the desired writeback rate, + * don't let us sleep more than 2.5 seconds (so we can notice/respond + * if the control system tells us to speed up!). + */ + if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next)) + d->next = now + NSEC_PER_SEC * 5LLU / 2LLU; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cb8d2ccbb6c6..ed5e8a412eb8 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_UTIL_H #define _BCACHE_UTIL_H @@ -441,10 +442,10 @@ struct bch_ratelimit { uint64_t next; /* - * Rate at which we want to do work, in units per nanosecond + * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - unsigned rate; + uint32_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 42c66e76f05e..56a37884ca8b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * background writeback - scan btree for dirty data and write it to the backing * device @@ -21,51 +22,67 @@ static void __update_writeback_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), c->cached_dev_sectors); - /* PD controller */ - + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. + */ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); - int64_t derivative = dirty - dc->disk.sectors_dirty_last; - int64_t proportional = dirty - target; - int64_t change; - - dc->disk.sectors_dirty_last = dirty; - - /* Scale to sectors per second */ - - proportional *= dc->writeback_rate_update_seconds; - proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - - derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - (dc->writeback_rate_d_term / - dc->writeback_rate_update_seconds) ?: 1, 0); - - derivative *= dc->writeback_rate_d_term; - derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - - change = proportional + derivative; + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled; + uint32_t new_rate; + + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. (Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. + */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } - /* Don't increase writeback rate if the device isn't keeping up */ - if (change > 0 && - time_after64(local_clock(), - dc->writeback_rate.next + NSEC_PER_MSEC)) - change = 0; + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); - dc->writeback_rate.rate = - clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, - 1, NSEC_PER_MSEC); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); - dc->writeback_rate_proportional = proportional; - dc->writeback_rate_derivative = derivative; - dc->writeback_rate_change = change; + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; + dc->writeback_rate.rate = new_rate; dc->writeback_rate_target = target; } @@ -178,15 +195,23 @@ static void write_dirty(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); - io->bio.bi_iter.bi_sector = KEY_START(&w->key); - io->bio.bi_bdev = io->dc->bdev; - io->bio.bi_end_io = dirty_endio; + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. + */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl); + closure_bio_submit(&io->bio, cl); + } - continue_at(cl, write_dirty_finish, system_wq); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } static void read_dirty_endio(struct bio *bio) @@ -206,7 +231,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty, system_wq); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); } static void read_dirty(struct cached_dev *dc) @@ -250,8 +275,7 @@ static void read_dirty(struct cached_dev *dc) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - io->bio.bi_bdev = PTR_CACHE(dc->disk.c, - &w->key, 0)->bdev; + bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); io->bio.bi_end_io = read_dirty_endio; if (bio_alloc_pages(&io->bio, GFP_KERNEL)) @@ -417,6 +441,8 @@ static int bch_writeback_thread(void *arg) struct cached_dev *dc = arg; bool searched_full_index; + bch_ratelimit_reset(&dc->writeback_rate); + while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || @@ -444,7 +470,6 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); - bch_ratelimit_reset(&dc->writeback_rate); read_dirty(dc); if (searched_full_index) { @@ -454,6 +479,8 @@ static int bch_writeback_thread(void *arg) !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); } } @@ -482,17 +509,15 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct cached_dev *dc) +void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; bch_btree_op_init(&op.op, -1); - op.inode = dc->disk.id; + op.inode = d->id; - bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - - dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -506,16 +531,22 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; + dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = 5; - dc->writeback_rate_d_term = 30; - dc->writeback_rate_p_term_inverse = 6000; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } int bch_cached_dev_writeback_start(struct cached_dev *dc) { + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM, 0); + if (!dc->writeback_write_wq) + return -ENOMEM; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 629bd1a502fd..a9e3ffb4b03c 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_WRITEBACK_H #define _BCACHE_WRITEBACK_H @@ -14,6 +15,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) +{ + uint64_t i, ret = 0; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + + if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) + continue; + ret += bcache_dev_sectors_dirty(d); + } + + mutex_unlock(&bch_register_lock); + + return ret; +} + static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { @@ -57,7 +77,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; + return (op_is_sync(bio->bi_opf) || + bio->bi_opf & (REQ_META|REQ_PRIO) || + in_use <= CUTOFF_WRITEBACK); } static inline void bch_writeback_queue(struct cached_dev *dc) @@ -70,7 +92,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); + refcount_inc(&dc->count); if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); @@ -84,7 +106,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); void bch_cached_dev_writeback_init(struct cached_dev *); int bch_cached_dev_writeback_start(struct cached_dev *); diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index dd3646111561..c82578af56a5 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -18,21 +18,24 @@ */ struct dm_bio_details { - struct block_device *bi_bdev; + struct gendisk *bi_disk; + u8 bi_partno; unsigned long bi_flags; struct bvec_iter bi_iter; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - bd->bi_bdev = bio->bi_bdev; + bd->bi_disk = bio->bi_disk; + bd->bi_partno = bio->bi_partno; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - bio->bi_bdev = bd->bi_bdev; + bio->bi_disk = bd->bi_disk; + bio->bi_partno = bd->bi_partno; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 850ff6c67994..b8ac591aaaa7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -64,6 +64,12 @@ #define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) /* + * Align buffer writes to this boundary. + * Tests show that SSDs have the highest IOPS when using 4k writes. + */ +#define DM_BUFIO_WRITE_ALIGN 4096 + +/* * dm_buffer->list_mode */ #define LIST_CLEAN 0 @@ -149,6 +155,10 @@ struct dm_buffer { blk_status_t write_error; unsigned long state; unsigned long last_accessed; + unsigned dirty_start; + unsigned dirty_end; + unsigned write_start; + unsigned write_end; struct dm_bufio_client *c; struct list_head write_list; struct bio bio; @@ -337,7 +347,7 @@ static void __cache_size_refresh(void) BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); BUG_ON(dm_bufio_client_count < 0); - dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size); + dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size); /* * Use default if set to 0 and report the actual cache size used. @@ -560,7 +570,7 @@ static void dmio_complete(unsigned long error, void *context) } static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, - unsigned n_sectors, bio_end_io_t *end_io) + unsigned n_sectors, unsigned offset, bio_end_io_t *end_io) { int r; struct dm_io_request io_req = { @@ -578,10 +588,10 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, if (b->data_mode != DATA_MODE_VMALLOC) { io_req.mem.type = DM_IO_KMEM; - io_req.mem.ptr.addr = b->data; + io_req.mem.ptr.addr = (char *)b->data + offset; } else { io_req.mem.type = DM_IO_VMA; - io_req.mem.ptr.vma = b->data; + io_req.mem.ptr.vma = (char *)b->data + offset; } b->bio.bi_end_io = end_io; @@ -609,14 +619,14 @@ static void inline_endio(struct bio *bio) } static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, - unsigned n_sectors, bio_end_io_t *end_io) + unsigned n_sectors, unsigned offset, bio_end_io_t *end_io) { char *ptr; - int len; + unsigned len; bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); b->bio.bi_iter.bi_sector = sector; - b->bio.bi_bdev = b->c->bdev; + bio_set_dev(&b->bio, b->c->bdev); b->bio.bi_end_io = inline_endio; /* * Use of .bi_private isn't a problem here because @@ -625,29 +635,20 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, b->bio.bi_private = end_io; bio_set_op_attrs(&b->bio, rw, 0); - /* - * We assume that if len >= PAGE_SIZE ptr is page-aligned. - * If len < PAGE_SIZE the buffer doesn't cross page boundary. - */ - ptr = b->data; + ptr = (char *)b->data + offset; len = n_sectors << SECTOR_SHIFT; - if (len >= PAGE_SIZE) - BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); - else - BUG_ON((unsigned long)ptr & (len - 1)); - do { - if (!bio_add_page(&b->bio, virt_to_page(ptr), - len < PAGE_SIZE ? len : PAGE_SIZE, + unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len); + if (!bio_add_page(&b->bio, virt_to_page(ptr), this_step, offset_in_page(ptr))) { BUG_ON(b->c->block_size <= PAGE_SIZE); - use_dmio(b, rw, sector, n_sectors, end_io); + use_dmio(b, rw, sector, n_sectors, offset, end_io); return; } - len -= PAGE_SIZE; - ptr += PAGE_SIZE; + len -= this_step; + ptr += this_step; } while (len > 0); submit_bio(&b->bio); @@ -657,18 +658,33 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) { unsigned n_sectors; sector_t sector; - - if (rw == WRITE && b->c->write_callback) - b->c->write_callback(b); + unsigned offset, end; sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; - n_sectors = 1 << b->c->sectors_per_block_bits; + + if (rw != WRITE) { + n_sectors = 1 << b->c->sectors_per_block_bits; + offset = 0; + } else { + if (b->c->write_callback) + b->c->write_callback(b); + offset = b->write_start; + end = b->write_end; + offset &= -DM_BUFIO_WRITE_ALIGN; + end += DM_BUFIO_WRITE_ALIGN - 1; + end &= -DM_BUFIO_WRITE_ALIGN; + if (unlikely(end > b->c->block_size)) + end = b->c->block_size; + + sector += offset >> SECTOR_SHIFT; + n_sectors = (end - offset) >> SECTOR_SHIFT; + } if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) && b->data_mode != DATA_MODE_VMALLOC) - use_inline_bio(b, rw, sector, n_sectors, end_io); + use_inline_bio(b, rw, sector, n_sectors, offset, end_io); else - use_dmio(b, rw, sector, n_sectors, end_io); + use_dmio(b, rw, sector, n_sectors, offset, end_io); } /*---------------------------------------------------------------- @@ -720,6 +736,9 @@ static void __write_dirty_buffer(struct dm_buffer *b, clear_bit(B_DIRTY, &b->state); wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); + b->write_start = b->dirty_start; + b->write_end = b->dirty_end; + if (!write_list) submit_io(b, WRITE, write_endio); else @@ -941,7 +960,7 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { + if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { if (mutex_trylock(&dm_bufio_clients_lock)) { __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); @@ -955,7 +974,8 @@ static void __get_memory_limit(struct dm_bufio_client *c, buffers = c->minimum_buffers; *limit_buffers = buffers; - *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; + *threshold_buffers = mult_frac(buffers, + DM_BUFIO_WRITEBACK_PERCENT, 100); } /* @@ -1221,19 +1241,37 @@ void dm_bufio_release(struct dm_buffer *b) } EXPORT_SYMBOL_GPL(dm_bufio_release); -void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) +void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b, + unsigned start, unsigned end) { struct dm_bufio_client *c = b->c; + BUG_ON(start >= end); + BUG_ON(end > b->c->block_size); + dm_bufio_lock(c); BUG_ON(test_bit(B_READING, &b->state)); - if (!test_and_set_bit(B_DIRTY, &b->state)) + if (!test_and_set_bit(B_DIRTY, &b->state)) { + b->dirty_start = start; + b->dirty_end = end; __relink_lru(b, LIST_DIRTY); + } else { + if (start < b->dirty_start) + b->dirty_start = start; + if (end > b->dirty_end) + b->dirty_end = end; + } dm_bufio_unlock(c); } +EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty); + +void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) +{ + dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size); +} EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) @@ -1258,8 +1296,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); */ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) { - blk_status_t a; - int f; + int a, f; unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; @@ -1399,6 +1436,8 @@ retry: wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); set_bit(B_DIRTY, &b->state); + b->dirty_start = 0; + b->dirty_end = c->block_size; __unlink_buffer(b); __link_buffer(b, new_block, LIST_DIRTY); } else { @@ -1562,7 +1601,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); } @@ -1609,7 +1648,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); - return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]); + return READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); } /* @@ -1780,7 +1819,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); static unsigned get_max_age_hz(void) { - unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); + unsigned max_age = READ_ONCE(dm_bufio_max_age); if (max_age > UINT_MAX / HZ) max_age = UINT_MAX / HZ; @@ -1872,19 +1911,15 @@ static int __init dm_bufio_init(void) memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); - mem = (__u64)((totalram_pages - totalhigh_pages) * - DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT; + mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, + DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; if (mem > ULONG_MAX) mem = ULONG_MAX; #ifdef CONFIG_MMU - /* - * Get the size of vmalloc space the same way as VMALLOC_TOTAL - * in fs/proc/internal.h - */ - if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100) - mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100; + if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100)) + mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100); #endif dm_bufio_default_cache_size = mem; diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h index b6d8f53ec15b..be732d3f8611 100644 --- a/drivers/md/dm-bufio.h +++ b/drivers/md/dm-bufio.h @@ -94,6 +94,15 @@ void dm_bufio_release(struct dm_buffer *b); void dm_bufio_mark_buffer_dirty(struct dm_buffer *b); /* + * Mark a part of the buffer dirty. + * + * The specified part of the buffer is scheduled to be written. dm-bufio may + * write the specified part of the buffer or it may write a larger superset. + */ +void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b, + unsigned start, unsigned end); + +/* * Initiate writing of dirty buffers, without waiting for completion. */ void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c); diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c index f092771878c2..8eb52e425141 100644 --- a/drivers/md/dm-builtin.c +++ b/drivers/md/dm-builtin.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "dm-core.h" /* diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c index 707233891291..1d0af0a21fc7 100644 --- a/drivers/md/dm-cache-background-tracker.c +++ b/drivers/md/dm-cache-background-tracker.c @@ -161,8 +161,17 @@ EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); static bool max_work_reached(struct background_tracker *b) { - // FIXME: finish - return false; + return atomic_read(&b->pending_promotes) + + atomic_read(&b->pending_writebacks) + + atomic_read(&b->pending_demotes) >= b->max_work; +} + +struct bt_work *alloc_work(struct background_tracker *b) +{ + if (max_work_reached(b)) + return NULL; + + return kmem_cache_alloc(b->work_cache, GFP_NOWAIT); } int btracker_queue(struct background_tracker *b, @@ -174,10 +183,7 @@ int btracker_queue(struct background_tracker *b, if (pwork) *pwork = NULL; - if (max_work_reached(b)) - return -ENOMEM; - - w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); + w = alloc_work(b); if (!w) return -ENOMEM; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 4a4e9c75fc4c..0d7212410e21 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -13,6 +13,7 @@ #include "persistent-data/dm-transaction-manager.h" #include <linux/device-mapper.h> +#include <linux/refcount.h> /*----------------------------------------------------------------*/ @@ -100,7 +101,7 @@ struct cache_disk_superblock { } __packed; struct dm_cache_metadata { - atomic_t ref_count; + refcount_t ref_count; struct list_head list; unsigned version; @@ -753,7 +754,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev, } cmd->version = metadata_version; - atomic_set(&cmd->ref_count, 1); + refcount_set(&cmd->ref_count, 1); init_rwsem(&cmd->root_lock); cmd->bdev = bdev; cmd->data_block_size = data_block_size; @@ -791,7 +792,7 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev) list_for_each_entry(cmd, &table, list) if (cmd->bdev == bdev) { - atomic_inc(&cmd->ref_count); + refcount_inc(&cmd->ref_count); return cmd; } @@ -862,7 +863,7 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, void dm_cache_metadata_close(struct dm_cache_metadata *cmd) { - if (atomic_dec_and_test(&cmd->ref_count)) { + if (refcount_dec_and_test(&cmd->ref_count)) { mutex_lock(&table_lock); list_del(&cmd->list); mutex_unlock(&table_lock); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index e5eb9c9b4bc8..4ab23d0075f6 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -213,6 +213,19 @@ static void l_del(struct entry_space *es, struct ilist *l, struct entry *e) l->nr_elts--; } +static struct entry *l_pop_head(struct entry_space *es, struct ilist *l) +{ + struct entry *e; + + for (e = l_head(es, l); e; e = l_next(es, e)) + if (!e->sentinel) { + l_del(es, l, e); + return e; + } + + return NULL; +} + static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l) { struct entry *e; @@ -719,7 +732,7 @@ static struct entry *alloc_entry(struct entry_alloc *ea) if (l_empty(&ea->free)) return NULL; - e = l_pop_tail(ea->es, &ea->free); + e = l_pop_head(ea->es, &ea->free); init_entry(e); ea->nr_allocated++; @@ -1158,13 +1171,13 @@ static void clear_pending(struct smq_policy *mq, struct entry *e) e->pending_work = false; } -static void queue_writeback(struct smq_policy *mq) +static void queue_writeback(struct smq_policy *mq, bool idle) { int r; struct policy_work work; struct entry *e; - e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed); + e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle); if (e) { mark_pending(mq, e); q_del(&mq->dirty, e); @@ -1174,12 +1187,16 @@ static void queue_writeback(struct smq_policy *mq) work.cblock = infer_cblock(mq, e); r = btracker_queue(mq->bg_work, &work, NULL); - WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. + if (r) { + clear_pending(mq, e); + q_push_front(&mq->dirty, e); + } } } static void queue_demotion(struct smq_policy *mq) { + int r; struct policy_work work; struct entry *e; @@ -1189,7 +1206,7 @@ static void queue_demotion(struct smq_policy *mq) e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); if (!e) { if (!clean_target_met(mq, true)) - queue_writeback(mq); + queue_writeback(mq, false); return; } @@ -1199,12 +1216,17 @@ static void queue_demotion(struct smq_policy *mq) work.op = POLICY_DEMOTE; work.oblock = e->oblock; work.cblock = infer_cblock(mq, e); - btracker_queue(mq->bg_work, &work, NULL); + r = btracker_queue(mq->bg_work, &work, NULL); + if (r) { + clear_pending(mq, e); + q_push_front(&mq->clean, e); + } } static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, struct policy_work **workp) { + int r; struct entry *e; struct policy_work work; @@ -1234,7 +1256,9 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, work.op = POLICY_PROMOTE; work.oblock = oblock; work.cblock = infer_cblock(mq, e); - btracker_queue(mq->bg_work, &work, workp); + r = btracker_queue(mq->bg_work, &work, workp); + if (r) + free_entry(&mq->cache_alloc, e); } /*----------------------------------------------------------------*/ @@ -1418,7 +1442,7 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle, r = btracker_issue(mq->bg_work, result); if (r == -ENODATA) { if (!clean_target_met(mq, idle)) { - queue_writeback(mq); + queue_writeback(mq, idle); r = btracker_issue(mq->bg_work, result); } } @@ -1778,7 +1802,7 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, mq->next_hotspot_period = jiffies; mq->next_cache_period = jiffies; - mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ + mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */ if (!mq->bg_work) goto bad_btracker; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c5ea03fc7ee1..cf23a14f9c6a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -408,9 +408,7 @@ struct cache { int sectors_per_block_shift; spinlock_t lock; - struct list_head deferred_cells; struct bio_list deferred_bios; - struct bio_list deferred_writethrough_bios; sector_t migration_threshold; wait_queue_head_t migration_wait; atomic_t nr_allocated_migrations; @@ -446,10 +444,10 @@ struct cache { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; struct work_struct deferred_bio_worker; - struct work_struct deferred_writethrough_worker; struct work_struct migration_worker; struct delayed_work waker; struct dm_bio_prison_v2 *prison; + struct bio_set *bs; mempool_t *migration_pool; @@ -490,15 +488,6 @@ struct per_bio_data { struct dm_bio_prison_cell_v2 *cell; struct dm_hook_info hook_info; sector_t len; - - /* - * writethrough fields. These MUST remain at the end of this - * structure and the 'cache' member must be the first as it - * is used to determine the offset of the writethrough fields. - */ - struct cache *cache; - dm_cblock_t cblock; - struct dm_bio_details bio_details; }; struct dm_cache_migration { @@ -515,19 +504,19 @@ struct dm_cache_migration { /*----------------------------------------------------------------*/ -static bool writethrough_mode(struct cache_features *f) +static bool writethrough_mode(struct cache *cache) { - return f->io_mode == CM_IO_WRITETHROUGH; + return cache->features.io_mode == CM_IO_WRITETHROUGH; } -static bool writeback_mode(struct cache_features *f) +static bool writeback_mode(struct cache *cache) { - return f->io_mode == CM_IO_WRITEBACK; + return cache->features.io_mode == CM_IO_WRITEBACK; } -static inline bool passthrough_mode(struct cache_features *f) +static inline bool passthrough_mode(struct cache *cache) { - return unlikely(f->io_mode == CM_IO_PASSTHROUGH); + return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); } /*----------------------------------------------------------------*/ @@ -537,14 +526,9 @@ static void wake_deferred_bio_worker(struct cache *cache) queue_work(cache->wq, &cache->deferred_bio_worker); } -static void wake_deferred_writethrough_worker(struct cache *cache) -{ - queue_work(cache->wq, &cache->deferred_writethrough_worker); -} - static void wake_migration_worker(struct cache *cache) { - if (passthrough_mode(&cache->features)) + if (passthrough_mode(cache)) return; queue_work(cache->wq, &cache->migration_worker); @@ -567,10 +551,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache) struct dm_cache_migration *mg; mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); - if (mg) { - mg->cache = cache; - atomic_inc(&mg->cache->nr_allocated_migrations); - } + if (!mg) + return NULL; + + memset(mg, 0, sizeof(*mg)); + + mg->cache = cache; + atomic_inc(&cache->nr_allocated_migrations); return mg; } @@ -618,27 +605,16 @@ static unsigned lock_level(struct bio *bio) * Per bio data *--------------------------------------------------------------*/ -/* - * If using writeback, leave out struct per_bio_data's writethrough fields. - */ -#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) -#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - -static size_t get_per_bio_data_size(struct cache *cache) +static struct per_bio_data *get_per_bio_data(struct bio *bio) { - return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; -} - -static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); BUG_ON(!pb); return pb; } -static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) +static struct per_bio_data *init_per_bio_data(struct bio *bio) { - struct per_bio_data *pb = get_per_bio_data(bio, data_size); + struct per_bio_data *pb = get_per_bio_data(bio); pb->tick = false; pb->req_nr = dm_bio_get_target_bio_nr(bio); @@ -678,7 +654,6 @@ static void defer_bios(struct cache *cache, struct bio_list *bios) static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) { bool r; - size_t pb_size; struct per_bio_data *pb; struct dm_cell_key_v2 key; dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); @@ -703,8 +678,7 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi if (cell != cell_prealloc) free_prison_cell(cache, cell_prealloc); - pb_size = get_per_bio_data_size(cache); - pb = get_per_bio_data(bio, pb_size); + pb = get_per_bio_data(bio); pb->cell = cell; return r; @@ -833,7 +807,7 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) *--------------------------------------------------------------*/ static void remap_to_origin(struct cache *cache, struct bio *bio) { - bio->bi_bdev = cache->origin_dev->bdev; + bio_set_dev(bio, cache->origin_dev->bdev); } static void remap_to_cache(struct cache *cache, struct bio *bio, @@ -842,7 +816,7 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, sector_t bi_sector = bio->bi_iter.bi_sector; sector_t block = from_cblock(cblock); - bio->bi_bdev = cache->cache_dev->bdev; + bio_set_dev(bio, cache->cache_dev->bdev); if (!block_size_is_power_of_two(cache)) bio->bi_iter.bi_sector = (block * cache->sectors_per_block) + @@ -856,28 +830,35 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { unsigned long flags; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb; spin_lock_irqsave(&cache->lock, flags); if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { + pb = get_per_bio_data(bio); pb->tick = true; cache->need_tick_bio = false; } spin_unlock_irqrestore(&cache->lock, flags); } -static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock) +static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, bool bio_has_pbd) { - // FIXME: this is called way too much. - check_if_tick_bio_needed(cache, bio); + if (bio_has_pbd) + check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) clear_discard(cache, oblock_to_dblock(cache, oblock)); } +static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock) +{ + // FIXME: check_if_tick_bio_needed() is called way too much through this interface + __remap_to_origin_clear_discard(cache, bio, oblock, true); +} + static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { @@ -908,10 +889,10 @@ static bool accountable_bio(struct cache *cache, struct bio *bio) static void accounted_begin(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb; if (accountable_bio(cache, bio)) { + pb = get_per_bio_data(bio); pb->len = bio_sectors(bio); iot_io_begin(&cache->tracker, pb->len); } @@ -919,8 +900,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio) static void accounted_complete(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); iot_io_end(&cache->tracker, pb->len); } @@ -937,57 +917,26 @@ static void issue_op(struct bio *bio, void *context) accounted_request(cache, bio); } -static void defer_writethrough_bio(struct cache *cache, struct bio *bio) -{ - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - bio_list_add(&cache->deferred_writethrough_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_deferred_writethrough_worker(cache); -} - -static void writethrough_endio(struct bio *bio) -{ - struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); - - dm_unhook_bio(&pb->hook_info, bio); - - if (bio->bi_status) { - bio_endio(bio); - return; - } - - dm_bio_restore(&pb->bio_details, bio); - remap_to_cache(pb->cache, bio, pb->cblock); - - /* - * We can't issue this bio directly, since we're in interrupt - * context. So it gets put on a bio list for processing by the - * worker thread. - */ - defer_writethrough_bio(pb->cache, bio); -} - /* - * FIXME: send in parallel, huge latency as is. * When running in writethrough mode we need to send writes to clean blocks - * to both the cache and origin devices. In future we'd like to clone the - * bio and send them in parallel, but for now we're doing them in - * series as this is easier. + * to both the cache and origin devices. Clone the bio and send them in parallel. */ -static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, - dm_oblock_t oblock, dm_cblock_t cblock) +static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) { - struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); + struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs); + + BUG_ON(!origin_bio); - pb->cache = cache; - pb->cblock = cblock; - dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); - dm_bio_record(&pb->bio_details, bio); + bio_chain(origin_bio, bio); + /* + * Passing false to __remap_to_origin_clear_discard() skips + * all code that might use per_bio_data (since clone doesn't have it) + */ + __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); + submit_bio(origin_bio); - remap_to_origin_clear_discard(pb->cache, bio, oblock); + remap_to_cache(cache, bio, cblock); } /*---------------------------------------------------------------- @@ -1201,6 +1150,18 @@ static void background_work_end(struct cache *cache) /*----------------------------------------------------------------*/ +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); +} + +static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) +{ + return writeback_mode(cache) && + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); +} + static void quiesce(struct dm_cache_migration *mg, void (*continuation)(struct work_struct *)) { @@ -1248,8 +1209,7 @@ static int copy(struct dm_cache_migration *mg, bool promote) static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) free_prison_cell(cache, pb->cell); @@ -1260,23 +1220,21 @@ static void overwrite_endio(struct bio *bio) { struct dm_cache_migration *mg = bio->bi_private; struct cache *cache = mg->cache; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); dm_unhook_bio(&pb->hook_info, bio); if (bio->bi_status) mg->k.input = bio->bi_status; - queue_continuation(mg->cache->wq, &mg->k); + queue_continuation(cache->wq, &mg->k); } static void overwrite(struct dm_cache_migration *mg, void (*continuation)(struct work_struct *)) { struct bio *bio = mg->overwrite_bio; - size_t pb_data_size = get_per_bio_data_size(mg->cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); @@ -1474,13 +1432,51 @@ static void mg_upgrade_lock(struct work_struct *ws) } } +static void mg_full_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + bool is_policy_promote = (op->op == POLICY_PROMOTE); + + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || + is_discarded_oblock(cache, op->oblock)) { + mg_upgrade_lock(ws); + return; + } + + init_continuation(&mg->k, mg_upgrade_lock); + + if (copy(mg, is_policy_promote)) { + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); + mg->k.input = BLK_STS_IOERR; + mg_complete(mg, false); + } +} + static void mg_copy(struct work_struct *ws) { - int r; struct dm_cache_migration *mg = ws_to_mg(ws); if (mg->overwrite_bio) { /* + * No exclusive lock was held when we last checked if the bio + * was optimisable. So we have to check again in case things + * have changed (eg, the block may no longer be discarded). + */ + if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) { + /* + * Fallback to a real full copy after doing some tidying up. + */ + bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); + BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */ + mg->overwrite_bio = NULL; + inc_io_migrations(mg->cache); + mg_full_copy(ws); + return; + } + + /* * It's safe to do this here, even though it's new data * because all IO has been locked out of the block. * @@ -1489,26 +1485,8 @@ static void mg_copy(struct work_struct *ws) */ overwrite(mg, mg_update_metadata_after_copy); - } else { - struct cache *cache = mg->cache; - struct policy_work *op = mg->op; - bool is_policy_promote = (op->op == POLICY_PROMOTE); - - if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || - is_discarded_oblock(cache, op->oblock)) { - mg_upgrade_lock(ws); - return; - } - - init_continuation(&mg->k, mg_upgrade_lock); - - r = copy(mg, is_policy_promote); - if (r) { - DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); - mg->k.input = BLK_STS_IOERR; - mg_complete(mg, false); - } - } + } else + mg_full_copy(ws); } static int mg_lock_writes(struct dm_cache_migration *mg) @@ -1567,9 +1545,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio return -ENOMEM; } - memset(mg, 0, sizeof(*mg)); - - mg->cache = cache; mg->op = op; mg->overwrite_bio = bio; @@ -1703,9 +1678,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock, return -ENOMEM; } - memset(mg, 0, sizeof(*mg)); - - mg->cache = cache; mg->overwrite_bio = bio; mg->invalidate_cblock = cblock; mg->invalidate_oblock = oblock; @@ -1748,26 +1720,12 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) /*----------------------------------------------------------------*/ -static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) -{ - return (bio_data_dir(bio) == WRITE) && - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); -} - -static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) -{ - return writeback_mode(&cache->features) && - (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); -} - static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, bool *commit_needed) { int r, data_dir; bool rb, background_queued; dm_cblock_t cblock; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); *commit_needed = false; @@ -1816,6 +1774,8 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, } if (r == -ENOENT) { + struct per_bio_data *pb = get_per_bio_data(bio); + /* * Miss. */ @@ -1823,7 +1783,6 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, if (pb->req_nr == 0) { accounted_begin(cache, bio); remap_to_origin_clear_discard(cache, bio, block); - } else { /* * This is a duplicate writethrough io that is no @@ -1842,18 +1801,17 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, * Passthrough always maps to the origin, invalidating any * cache blocks that are written to. */ - if (passthrough_mode(&cache->features)) { + if (passthrough_mode(cache)) { if (bio_data_dir(bio) == WRITE) { bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); invalidate_start(cache, cblock, block, bio); } else remap_to_origin_clear_discard(cache, bio, block); - } else { - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && !is_dirty(cache, cblock)) { - remap_to_origin_then_cache(cache, bio, block, cblock); + remap_to_origin_and_cache(cache, bio, block, cblock); accounted_begin(cache, bio); } else remap_to_cache_dirty(cache, bio, block, cblock); @@ -1922,8 +1880,7 @@ static blk_status_t commit_op(void *context) static bool process_flush_bio(struct cache *cache, struct bio *bio) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); if (!pb->req_nr) remap_to_origin(cache, bio); @@ -1983,28 +1940,6 @@ static void process_deferred_bios(struct work_struct *ws) schedule_commit(&cache->committer); } -static void process_deferred_writethrough_bios(struct work_struct *ws) -{ - struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); - - unsigned long flags; - struct bio_list bios; - struct bio *bio; - - bio_list_init(&bios); - - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_writethrough_bios); - bio_list_init(&cache->deferred_writethrough_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - /* - * These bios have already been through accounted_begin() - */ - while ((bio = bio_list_pop(&bios))) - generic_make_request(bio); -} - /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ @@ -2112,6 +2047,9 @@ static void destroy(struct cache *cache) kfree(cache->ctr_args[i]); kfree(cache->ctr_args); + if (cache->bs) + bioset_free(cache->bs); + kfree(cache); } @@ -2306,7 +2244,7 @@ static void init_features(struct cache_features *cf) static int parse_features(struct cache_args *ca, struct dm_arg_set *as, char **error) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 2, "Invalid number of cache feature arguments"}, }; @@ -2348,7 +2286,7 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, char **error) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "Invalid number of policy arguments"}, }; @@ -2555,8 +2493,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->discards_supported = true; ti->split_discard_bios = false; + ti->per_io_data_size = sizeof(struct per_bio_data); + cache->features = ca->features; - ti->per_io_data_size = get_per_bio_data_size(cache); + if (writethrough_mode(cache)) { + /* Create bioset for writethrough bios issued to origin */ + cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!cache->bs) + goto bad; + } cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -2618,7 +2563,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - if (passthrough_mode(&cache->features)) { + if (passthrough_mode(cache)) { bool all_clean; r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); @@ -2637,9 +2582,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) } spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->deferred_cells); bio_list_init(&cache->deferred_bios); - bio_list_init(&cache->deferred_writethrough_bios); atomic_set(&cache->nr_allocated_migrations, 0); atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); @@ -2678,8 +2621,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); - INIT_WORK(&cache->deferred_writethrough_worker, - process_deferred_writethrough_bios); INIT_WORK(&cache->migration_worker, check_migrations); INIT_DELAYED_WORK(&cache->waker, do_waker); @@ -2795,9 +2736,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio) int r; bool commit_needed; dm_oblock_t block = get_bio_block(cache, bio); - size_t pb_data_size = get_per_bio_data_size(cache); - init_per_bio_data(bio, pb_data_size); + init_per_bio_data(bio); if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { /* * This can only occur if the io goes to a partial block at @@ -2821,13 +2761,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } -static int cache_end_io(struct dm_target *ti, struct bio *bio, - blk_status_t *error) +static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) { struct cache *cache = ti->private; unsigned long flags; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + struct per_bio_data *pb = get_per_bio_data(bio); if (pb->tick) { policy_tick(cache->policy, false); @@ -3243,13 +3181,13 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("1 "); - if (writethrough_mode(&cache->features)) + if (writethrough_mode(cache)) DMEMIT("writethrough "); - else if (passthrough_mode(&cache->features)) + else if (passthrough_mode(cache)) DMEMIT("passthrough "); - else if (writeback_mode(&cache->features)) + else if (writeback_mode(cache)) DMEMIT("writeback "); else { @@ -3415,7 +3353,7 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun unsigned i; struct cblock_range range; - if (!passthrough_mode(&cache->features)) { + if (!passthrough_mode(cache)) { DMERR("%s: cache has to be in passthrough mode for invalidation", cache_device_name(cache)); return -EPERM; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 24eddbdf2ab4..6a14f945783c 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -29,7 +29,6 @@ struct dm_kobject_holder { * DM targets must _not_ deference a mapped_device to directly access its members! */ struct mapped_device { - struct srcu_struct io_barrier; struct mutex suspend_lock; /* @@ -127,6 +126,8 @@ struct mapped_device { struct blk_mq_tag_set *tag_set; bool use_blk_mq:1; bool init_tio_pdu:1; + + struct srcu_struct io_barrier; }; void dm_init_md_queue(struct mapped_device *md); @@ -149,5 +150,6 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen extern atomic_t dm_global_event_nr; extern wait_queue_head_t dm_global_eventq; +void dm_issue_global_event(void); #endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index cdf6b1e12460..9fc12f556534 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -758,9 +758,8 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, int i, r; /* xor whitening with sector number */ - memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE); - crypto_xor(buf, (u8 *)§or, 8); - crypto_xor(&buf[8], (u8 *)§or, 8); + crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); + crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ desc->tfm = tcw->crc32_tfm; @@ -805,10 +804,10 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, } /* Calculate IV */ - memcpy(iv, tcw->iv_seed, cc->iv_size); - crypto_xor(iv, (u8 *)§or, 8); + crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)§or, 8); if (cc->iv_size > 8) - crypto_xor(&iv[8], (u8 *)§or, cc->iv_size - 8); + crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, + cc->iv_size - 8); return r; } @@ -933,9 +932,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; - /* We own the metadata, do not let bio_free to release it */ - bip->bip_flags &= ~BIP_BLOCK_INTEGRITY; - ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), tag_len, offset_in_page(io->integrity_metadata)); if (unlikely(ret != tag_len)) @@ -1079,7 +1075,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); /* Reject unexpected unaligned bio. */ - if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + if (unlikely(bv_in.bv_len & (cc->sector_size - 1))) return -EIO; dmreq = dmreq_of_req(cc, req); @@ -1172,7 +1168,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, int r = 0; /* Reject unexpected unaligned bio. */ - if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + if (unlikely(bv_in.bv_len & (cc->sector_size - 1))) return -EIO; dmreq = dmreq_of_req(cc, req); @@ -1547,7 +1543,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; - clone->bi_bdev = cc->dev->bdev; + bio_set_dev(clone, cc->dev->bdev); clone->bi_opf = io->base_bio->bi_opf; } @@ -2470,6 +2466,7 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key kfree(cipher_api); return ret; } + kfree(cipher_api); return 0; bad_mem: @@ -2533,7 +2530,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar { struct crypt_config *cc = ti->private; struct dm_arg_set as; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 6, "Invalid number of feature args"}, }; unsigned int opt_params, val; @@ -2588,6 +2585,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar ti->error = "Invalid feature value for sector_size"; return -EINVAL; } + if (ti->len & ((cc->sector_size >> SECTOR_SHIFT) - 1)) { + ti->error = "Device size is not multiple of sector_size feature"; + return -EINVAL; + } cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; } else if (!strcasecmp(opt_string, "iv_large_sectors")) set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); @@ -2796,7 +2797,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) */ if (unlikely(bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)) { - bio->bi_bdev = cc->dev->bdev; + bio_set_dev(bio, cc->dev->bdev); if (bio_sectors(bio)) bio->bi_iter.bi_sector = cc->start + dm_target_offset(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index ae3158795d26..288386bfbfb5 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -44,9 +44,9 @@ struct dm_delay_info { static DEFINE_MUTEX(delayed_bios_lock); -static void handle_delayed_timer(unsigned long data) +static void handle_delayed_timer(struct timer_list *t) { - struct delay_c *dc = (struct delay_c *)data; + struct delay_c *dc = from_timer(dc, t, delay_timer); queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } @@ -195,7 +195,7 @@ out: goto bad_queue; } - setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); + timer_setup(&dc->delay_timer, handle_delayed_timer, 0); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); INIT_LIST_HEAD(&dc->delayed_bios); @@ -282,7 +282,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) struct delay_c *dc = ti->private; if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { - bio->bi_bdev = dc->dev_write->bdev; + bio_set_dev(bio, dc->dev_write->bdev); if (bio_sectors(bio)) bio->bi_iter.bi_sector = dc->start_write + dm_target_offset(ti, bio->bi_iter.bi_sector); @@ -290,7 +290,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, dc->write_delay, bio); } - bio->bi_bdev = dc->dev_read->bdev; + bio_set_dev(bio, dc->dev_read->bdev); bio->bi_iter.bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index e7ba89f98d8d..73a5c198113a 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1192,7 +1192,7 @@ static dm_block_t get_block(struct era *era, struct bio *bio) static void remap_to_origin(struct era *era, struct bio *bio) { - bio->bi_bdev = era->origin_dev->bdev; + bio_set_dev(bio, era->origin_dev->bdev); } /*---------------------------------------------------------------- @@ -1513,7 +1513,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->flush_supported = true; ti->num_discard_bios = 1; - ti->discards_supported = true; era->callbacks.congested_fn = era_is_congested; dm_table_add_target_callbacks(ti->table, &era->callbacks); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index e2c7234931bc..b82cb1ab1eaa 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -51,7 +51,7 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, unsigned argc; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 6, "Invalid number of feature args"}, {1, UINT_MAX, "Invalid corrupt bio byte"}, {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, @@ -178,7 +178,7 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, */ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, UINT_MAX, "Invalid up interval"}, {0, UINT_MAX, "Invalid down interval"}, }; @@ -274,7 +274,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) { struct flakey_c *fc = ti->private; - bio->bi_bdev = fc->dev->bdev; + bio_set_dev(bio, fc->dev->bdev); if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1b224aa9cf15..05c7bfd0c9d9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -6,6 +6,7 @@ * This file is released under the GPL. */ +#include <linux/compiler.h> #include <linux/module.h> #include <linux/device-mapper.h> #include <linux/dm-io.h> @@ -80,13 +81,13 @@ struct journal_entry { #define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) #if BITS_PER_LONG == 64 -#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) +#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #elif defined(CONFIG_LBDAF) -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #else -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0) #define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) #endif #define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) @@ -225,6 +226,8 @@ struct dm_integrity_c { struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; + + atomic64_t number_of_mismatches; }; struct dm_integrity_range { @@ -250,7 +253,8 @@ struct dm_integrity_io { struct completion *completion; - struct block_device *orig_bi_bdev; + struct gendisk *orig_bi_disk; + u8 orig_bi_partno; bio_end_io_t *orig_bi_end_io; struct bio_integrity_payload *orig_bi_integrity; struct bvec_iter orig_bi_iter; @@ -297,7 +301,7 @@ static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) /* * DM Integrity profile, protection is performed layer above (dm-crypt) */ -static struct blk_integrity_profile dm_integrity_profile = { +static const struct blk_integrity_profile dm_integrity_profile = { .name = "DM-DIF-EXT-TAG", .generate_fn = NULL, .verify_fn = NULL, @@ -309,13 +313,15 @@ static void dm_integrity_dtr(struct dm_target *ti); static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err) { + if (err == -EILSEQ) + atomic64_inc(&ic->number_of_mismatches); if (!cmpxchg(&ic->failed, 0, err)) DMERR("Error on %s: %d", msg, err); } static int dm_integrity_failed(struct dm_integrity_c *ic) { - return ACCESS_ONCE(ic->failed); + return READ_ONCE(ic->failed); } static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, @@ -769,13 +775,13 @@ static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsi unsigned i; io_comp.ic = ic; - io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); + init_completion(&io_comp.comp); if (commit_start + commit_sections <= ic->journal_sections) { io_comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (ic->journal_io) { crypt_comp_1.ic = ic; - crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + init_completion(&crypt_comp_1.comp); crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1); wait_for_completion_io(&crypt_comp_1.comp); @@ -791,18 +797,18 @@ static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsi to_end = ic->journal_sections - commit_start; if (ic->journal_io) { crypt_comp_1.ic = ic; - crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + init_completion(&crypt_comp_1.comp); crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); if (try_wait_for_completion(&crypt_comp_1.comp)) { rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); - crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + reinit_completion(&crypt_comp_1.comp); crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); wait_for_completion_io(&crypt_comp_1.comp); } else { crypt_comp_2.ic = ic; - crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp); + init_completion(&crypt_comp_2.comp); crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); wait_for_completion_io(&crypt_comp_1.comp); @@ -1040,7 +1046,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se memcpy(tag, dp, to_copy); } else if (op == TAG_WRITE) { memcpy(dp, tag, to_copy); - dm_bufio_mark_buffer_dirty(b); + dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); } else { /* e.g.: op == TAG_CMP */ if (unlikely(memcmp(dp, tag, to_copy))) { @@ -1088,9 +1094,9 @@ static void sleep_on_endio_wait(struct dm_integrity_c *ic) __remove_wait_queue(&ic->endio_wait, &wait); } -static void autocommit_fn(unsigned long data) +static void autocommit_fn(struct timer_list *t) { - struct dm_integrity_c *ic = (struct dm_integrity_c *)data; + struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer); if (likely(!dm_integrity_failed(ic))) queue_work(ic->commit_wq, &ic->commit_work); @@ -1164,7 +1170,8 @@ static void integrity_end_io(struct bio *bio) struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); bio->bi_iter = dio->orig_bi_iter; - bio->bi_bdev = dio->orig_bi_bdev; + bio->bi_disk = dio->orig_bi_disk; + bio->bi_partno = dio->orig_bi_partno; if (dio->orig_bi_integrity) { bio->bi_integrity = dio->orig_bi_integrity; bio->bi_opf |= REQ_INTEGRITY; @@ -1273,6 +1280,7 @@ again: DMERR("Checksum failed at sector 0x%llx", (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); r = -EILSEQ; + atomic64_inc(&ic->number_of_mismatches); } if (likely(checksums != checksums_onstack)) kfree(checksums); @@ -1369,7 +1377,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) struct bvec_iter iter; struct bio_vec bv; bio_for_each_segment(bv, bio, iter) { - if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { + if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", bv.bv_offset, bv.bv_len, ic->sectors_per_block); return DM_MAPIO_KILL; @@ -1538,7 +1546,7 @@ retry_kmap: smp_mb(); if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) wake_up(&ic->copy_to_journal_wait); - if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { queue_work(ic->commit_wq, &ic->commit_work); } else { schedule_autocommit(ic); @@ -1587,16 +1595,18 @@ retry: if (likely(ic->mode == 'J')) { if (dio->write) { unsigned next_entry, i, pos; - unsigned ws, we; + unsigned ws, we, range_sectors; - dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors); + dio->range.n_sectors = min(dio->range.n_sectors, + ic->free_sectors << ic->sb->log2_sectors_per_block); if (unlikely(!dio->range.n_sectors)) goto sleep; - ic->free_sectors -= dio->range.n_sectors; + range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block; + ic->free_sectors -= range_sectors; journal_section = ic->free_section; journal_entry = ic->free_section_entry; - next_entry = ic->free_section_entry + dio->range.n_sectors; + next_entry = ic->free_section_entry + range_sectors; ic->free_section_entry = next_entry % ic->journal_section_entries; ic->free_section += next_entry / ic->journal_section_entries; ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; @@ -1672,15 +1682,16 @@ sleep: dio->in_flight = (atomic_t)ATOMIC_INIT(2); if (need_sync_io) { - read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp); + init_completion(&read_comp); dio->completion = &read_comp; } else dio->completion = NULL; dio->orig_bi_iter = bio->bi_iter; - dio->orig_bi_bdev = bio->bi_bdev; - bio->bi_bdev = ic->dev->bdev; + dio->orig_bi_disk = bio->bi_disk; + dio->orig_bi_partno = bio->bi_partno; + bio_set_dev(bio, ic->dev->bdev); dio->orig_bi_integrity = bio_integrity(bio); bio->bi_integrity = NULL; @@ -1695,7 +1706,11 @@ sleep: if (need_sync_io) { wait_for_completion_io(&read_comp); - integrity_metadata(&dio->work); + if (likely(!bio->bi_status)) + integrity_metadata(&dio->work); + else + dec_in_flight(dio); + } else { INIT_WORK(&dio->work, integrity_metadata); queue_work(ic->metadata_wq, &dio->work); @@ -1727,6 +1742,8 @@ static void pad_uncommitted(struct dm_integrity_c *ic) wraparound_section(ic, &ic->free_section); ic->n_uncommitted_sections++; } + WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); } static void integrity_commit(struct work_struct *w) @@ -1782,7 +1799,7 @@ static void integrity_commit(struct work_struct *w) ic->n_committed_sections += commit_sections; spin_unlock_irq(&ic->endio_wait.lock); - if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) + if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) queue_work(ic->writer_wq, &ic->writer_work); release_flush_bios: @@ -1821,10 +1838,13 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, { unsigned i, j, n; struct journal_completion comp; + struct blk_plug plug; + + blk_start_plug(&plug); comp.ic = ic; comp.in_flight = (atomic_t)ATOMIC_INIT(1); - comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + init_completion(&comp.comp); i = write_start; for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) { @@ -1945,6 +1965,8 @@ skip_io: dm_bufio_write_dirty_buffers_async(ic->bufio); + blk_finish_plug(&plug); + complete_journal_op(&comp); wait_for_completion_io(&comp.comp); @@ -1959,7 +1981,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (ACCESS_ONCE(ic->suspending)) + if (READ_ONCE(ic->suspending)) return; spin_lock_irq(&ic->endio_wait.lock); @@ -2049,7 +2071,7 @@ static void replay_journal(struct dm_integrity_c *ic) if (ic->journal_io) { struct journal_completion crypt_comp; crypt_comp.ic = ic; - crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp); + init_completion(&crypt_comp.comp); crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp); wait_for_completion(&crypt_comp.comp); @@ -2221,7 +2243,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - result[0] = '\0'; + DMEMIT("%llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches)); break; case STATUSTYPE_TABLE: { @@ -2622,7 +2644,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) memset(iv, 0x00, ivsize); skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); - comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) wait_for_completion(&comp.comp); @@ -2679,7 +2701,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) sg_init_one(&sg, crypt_data, crypt_len); skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); - comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) wait_for_completion(&comp.comp); @@ -2766,7 +2788,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) int r; unsigned extra_args; struct dm_arg_set as; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; @@ -2794,6 +2816,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) bio_list_init(&ic->flush_bio_list); init_waitqueue_head(&ic->copy_to_journal_wait); init_completion(&ic->crypto_backoff); + atomic64_set(&ic->number_of_mismatches, 0); r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); if (r) { @@ -2919,7 +2942,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; - setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic); + timer_setup(&ic->autocommit_timer, autocommit_fn, 0); ic->io = dm_io_client_create(); if (IS_ERR(ic->io)) { @@ -3019,6 +3042,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Block size doesn't match the information in superblock"; goto bad; } + if (!le32_to_cpu(ic->sb->journal_sections)) { + r = -EINVAL; + ti->error = "Corrupted superblock, journal_sections is 0"; + goto bad; + } /* make sure that ti->max_io_len doesn't overflow */ if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { @@ -3185,7 +3213,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 25039607f3cb..b4357ed4d541 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -347,7 +347,7 @@ static void do_region(int op, int op_flags, unsigned region, bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); - bio->bi_bdev = where->bdev; + bio_set_dev(bio, where->bdev); bio->bi_end_io = endio; bio_set_op_attrs(bio, op, op_flags); store_io_and_region_in_bio(bio, io, region); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index e06f0ef7d2ec..e52676fa9832 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -477,9 +477,13 @@ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_si * Round up the ptr to an 8-byte boundary. */ #define ALIGN_MASK 7 +static inline size_t align_val(size_t val) +{ + return (val + ALIGN_MASK) & ~ALIGN_MASK; +} static inline void *align_ptr(void *ptr) { - return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); + return (void *)align_val((size_t)ptr); } /* @@ -505,7 +509,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ struct hash_cell *hc; size_t len, needed = 0; struct gendisk *disk; - struct dm_name_list *nl, *old_nl = NULL; + struct dm_name_list *orig_nl, *nl, *old_nl = NULL; uint32_t *event_nr; down_write(&_hash_lock); @@ -516,17 +520,15 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ */ for (i = 0; i < NUM_BUCKETS; i++) { list_for_each_entry (hc, _name_buckets + i, name_list) { - needed += sizeof(struct dm_name_list); - needed += strlen(hc->name) + 1; - needed += ALIGN_MASK; - needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK; + needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1); + needed += align_val(sizeof(uint32_t)); } } /* * Grab our output buffer. */ - nl = get_result_buffer(param, param_size, &len); + nl = orig_nl = get_result_buffer(param, param_size, &len); if (len < needed) { param->flags |= DM_BUFFER_FULL_FLAG; goto out; @@ -549,11 +551,16 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ strcpy(nl->name, hc->name); old_nl = nl; - event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1); + event_nr = align_ptr(nl->name + strlen(hc->name) + 1); *event_nr = dm_get_event_nr(hc->md); nl = align_ptr(event_nr + 1); } } + /* + * If mismatch happens, security may be compromised due to buffer + * overflow, so it's better to crash. + */ + BUG_ON((char *)nl - (char *)orig_nl != needed); out: up_write(&_hash_lock); @@ -1621,7 +1628,8 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para * which has a variable size, is not used by the function processing * the ioctl. */ -#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_ISSUE_GLOBAL_EVENT 2 /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char @@ -1629,18 +1637,18 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para *---------------------------------------------------------------*/ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) { - static struct { + static const struct { int cmd; int flags; ioctl_fn fn; } _ioctls[] = { {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ - {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, + {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, remove_all}, {DM_LIST_DEVICES_CMD, 0, list_devices}, - {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, - {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, - {DM_DEV_RENAME_CMD, 0, dev_rename}, + {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_create}, + {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_remove}, + {DM_DEV_RENAME_CMD, IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_rename}, {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, {DM_DEV_WAIT_CMD, 0, dev_wait}, @@ -1869,6 +1877,9 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); + if (!r && ioctl_flags & IOCTL_FLAGS_ISSUE_GLOBAL_EVENT) + dm_issue_global_event(); + /* * Copy the results back to userland. */ diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index cf2c67e35eaf..eb45cc3df31d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -107,7 +107,7 @@ static void io_job_start(struct dm_kcopyd_throttle *t) try_again: spin_lock_irq(&throttle_spinlock); - throttle = ACCESS_ONCE(t->throttle); + throttle = READ_ONCE(t->throttle); if (likely(throttle >= 100)) goto skip_limit; @@ -157,7 +157,7 @@ static void io_job_finish(struct dm_kcopyd_throttle *t) t->num_io_jobs--; - if (likely(ACCESS_ONCE(t->throttle) >= 100)) + if (likely(READ_ONCE(t->throttle) >= 100)) goto skip_limit; if (!t->num_io_jobs) { diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 41971a090e34..d5f8eff7c11d 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -88,7 +88,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; - bio->bi_bdev = lc->dev->bdev; + bio_set_dev(bio, lc->dev->bdev); if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); @@ -184,20 +184,6 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } -static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, - size_t size) -{ - struct linear_c *lc = ti->private; - struct block_device *bdev = lc->dev->bdev; - struct dax_device *dax_dev = lc->dev->dax_dev; - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; - - dev_sector = linear_map_sector(ti, sector); - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) - return; - dax_flush(dax_dev, pgoff, addr, size); -} - static struct target_type linear_target = { .name = "linear", .version = {1, 4, 0}, @@ -212,7 +198,6 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_copy_from_iter = linear_dax_copy_from_iter, - .dax_flush = linear_dax_flush, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index a1da0eb58a93..189badbeddaf 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -10,9 +10,11 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/dax.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/uio.h> #define DM_MSG_PREFIX "log-writes" @@ -100,6 +102,7 @@ struct log_writes_c { struct dm_dev *logdev; u64 logged_entries; u32 sectorsize; + u32 sectorshift; atomic_t io_blocks; atomic_t pending_blocks; sector_t next_sector; @@ -128,6 +131,18 @@ struct per_bio_data { struct pending_block *block; }; +static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc, + sector_t sectors) +{ + return sectors >> (lc->sectorshift - SECTOR_SHIFT); +} + +static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc, + sector_t sectors) +{ + return sectors << (lc->sectorshift - SECTOR_SHIFT); +} + static void put_pending_block(struct log_writes_c *lc) { if (atomic_dec_and_test(&lc->pending_blocks)) { @@ -198,7 +213,7 @@ static int write_metadata(struct log_writes_c *lc, void *entry, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -233,27 +248,108 @@ error: return -1; } +static int write_inline_data(struct log_writes_c *lc, void *entry, + size_t entrylen, void *data, size_t datalen, + sector_t sector) +{ + int num_pages, bio_pages, pg_datalen, pg_sectorlen, i; + struct page *page; + struct bio *bio; + size_t ret; + void *ptr; + + while (datalen) { + num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT; + bio_pages = min(num_pages, BIO_MAX_PAGES); + + atomic_inc(&lc->io_blocks); + + bio = bio_alloc(GFP_KERNEL, bio_pages); + if (!bio) { + DMERR("Couldn't alloc inline data bio"); + goto error; + } + + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, lc->logdev->bdev); + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + for (i = 0; i < bio_pages; i++) { + pg_datalen = min_t(int, datalen, PAGE_SIZE); + pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize); + + page = alloc_page(GFP_KERNEL); + if (!page) { + DMERR("Couldn't alloc inline data page"); + goto error_bio; + } + + ptr = kmap_atomic(page); + memcpy(ptr, data, pg_datalen); + if (pg_sectorlen > pg_datalen) + memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen); + kunmap_atomic(ptr); + + ret = bio_add_page(bio, page, pg_sectorlen, 0); + if (ret != pg_sectorlen) { + DMERR("Couldn't add page of inline data"); + __free_page(page); + goto error_bio; + } + + datalen -= pg_datalen; + data += pg_datalen; + } + submit_bio(bio); + + sector += bio_pages * PAGE_SECTORS; + } + return 0; +error_bio: + bio_free_pages(bio); + bio_put(bio); +error: + put_io_block(lc); + return -1; +} + static int log_one_block(struct log_writes_c *lc, struct pending_block *block, sector_t sector) { struct bio *bio; struct log_write_entry entry; - size_t ret; + size_t metadatalen, ret; int i; entry.sector = cpu_to_le64(block->sector); entry.nr_sectors = cpu_to_le64(block->nr_sectors); entry.flags = cpu_to_le64(block->flags); entry.data_len = cpu_to_le64(block->datalen); + + metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0; if (write_metadata(lc, &entry, sizeof(entry), block->data, - block->datalen, sector)) { + metadatalen, sector)) { free_pending_block(lc, block); return -1; } + sector += dev_to_bio_sectors(lc, 1); + + if (block->datalen && metadatalen == 0) { + if (write_inline_data(lc, &entry, sizeof(entry), block->data, + block->datalen, sector)) { + free_pending_block(lc, block); + return -1; + } + /* we don't support both inline data & bio data */ + goto out; + } + if (!block->vec_cnt) goto out; - sector++; atomic_inc(&lc->io_blocks); bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES)); @@ -263,7 +359,7 @@ static int log_one_block(struct log_writes_c *lc, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -285,7 +381,7 @@ static int log_one_block(struct log_writes_c *lc, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -354,10 +450,9 @@ static int log_writes_kthread(void *arg) goto next; sector = lc->next_sector; - if (block->flags & LOG_DISCARD_FLAG) - lc->next_sector++; - else - lc->next_sector += block->nr_sectors + 1; + if (!(block->flags & LOG_DISCARD_FLAG)) + lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors); + lc->next_sector += dev_to_bio_sectors(lc, 1); /* * Apparently the size of the device may not be known @@ -399,7 +494,7 @@ next: if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && - !atomic_read(&lc->pending_blocks)) + list_empty(&lc->logging_blocks)) schedule(); __set_current_state(TASK_RUNNING); } @@ -435,7 +530,6 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_LIST_HEAD(&lc->unflushed_blocks); INIT_LIST_HEAD(&lc->logging_blocks); init_waitqueue_head(&lc->wait); - lc->sectorsize = 1 << SECTOR_SHIFT; atomic_set(&lc->io_blocks, 0); atomic_set(&lc->pending_blocks, 0); @@ -455,6 +549,8 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + lc->sectorsize = bdev_logical_block_size(lc->dev->bdev); + lc->sectorshift = ilog2(lc->sectorsize); lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write"); if (IS_ERR(lc->log_kthread)) { ret = PTR_ERR(lc->log_kthread); @@ -464,8 +560,12 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* We put the super at sector 0, start logging at sector 1 */ - lc->next_sector = 1; + /* + * next_sector is in 512b sectors to correspond to what bi_sector expects. + * The super starts at sector 0, and the next_sector is the next logical + * one based on the sectorsize of the device. + */ + lc->next_sector = lc->sectorsize >> SECTOR_SHIFT; lc->logging_enabled = true; lc->end_sector = logdev_last_sector(lc); lc->device_supports_discard = true; @@ -510,6 +610,51 @@ static int log_mark(struct log_writes_c *lc, char *data) return 0; } +static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, + struct iov_iter *i) +{ + struct pending_block *block; + + if (!bytes) + return 0; + + block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); + if (!block) { + DMERR("Error allocating dax pending block"); + return -ENOMEM; + } + + block->data = kzalloc(bytes, GFP_KERNEL); + if (!block->data) { + DMERR("Error allocating dax data space"); + kfree(block); + return -ENOMEM; + } + + /* write data provided via the iterator */ + if (!copy_from_iter(block->data, bytes, i)) { + DMERR("Error copying dax data"); + kfree(block->data); + kfree(block); + return -EIO; + } + + /* rewind the iterator so that the block driver can use it */ + iov_iter_revert(i, bytes); + + block->datalen = bytes; + block->sector = bio_to_dev_sectors(lc, sector); + block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; + + atomic_inc(&lc->pending_blocks); + spin_lock_irq(&lc->blocks_lock); + list_add_tail(&block->list, &lc->unflushed_blocks); + spin_unlock_irq(&lc->blocks_lock); + wake_up_process(lc->log_kthread); + + return 0; +} + static void log_writes_dtr(struct dm_target *ti) { struct log_writes_c *lc = ti->private; @@ -539,7 +684,7 @@ static void normal_map_bio(struct dm_target *ti, struct bio *bio) { struct log_writes_c *lc = ti->private; - bio->bi_bdev = lc->dev->bdev; + bio_set_dev(bio, lc->dev->bdev); } static int log_writes_map(struct dm_target *ti, struct bio *bio) @@ -599,8 +744,8 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) if (discard_bio) block->flags |= LOG_DISCARD_FLAG; - block->sector = bio->bi_iter.bi_sector; - block->nr_sectors = bio_sectors(bio); + block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector); + block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio)); /* We don't need the data, just submit */ if (discard_bio) { @@ -767,14 +912,54 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit if (!q || !blk_queue_discard(q)) { lc->device_supports_discard = false; - limits->discard_granularity = 1 << SECTOR_SHIFT; + limits->discard_granularity = lc->sectorsize; limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT); } + limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev); + limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev); + limits->io_min = limits->physical_block_size; +} + +static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct log_writes_c *lc = ti->private; + sector_t sector = pgoff * PAGE_SECTORS; + int ret; + + ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff); + if (ret) + return ret; + return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn); +} + +static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, + struct iov_iter *i) +{ + struct log_writes_c *lc = ti->private; + sector_t sector = pgoff * PAGE_SECTORS; + int err; + + if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + + /* Don't bother doing anything if logging has been disabled */ + if (!lc->logging_enabled) + goto dax_copy; + + err = log_dax(lc, sector, bytes, i); + if (err) { + DMWARN("Error %d logging DAX write", err); + return 0; + } +dax_copy: + return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); } static struct target_type log_writes_target = { .name = "log-writes", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = log_writes_ctr, .dtr = log_writes_dtr, @@ -785,6 +970,8 @@ static struct target_type log_writes_target = { .message = log_writes_message, .iterate_devices = log_writes_iterate_devices, .io_hints = log_writes_io_hints, + .direct_access = log_writes_dax_direct_access, + .dax_copy_from_iter = log_writes_dax_copy_from_iter, }; static int __init dm_log_writes_init(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 0e8ab5bb3575..c8faa2b85842 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, pgpath = path_to_pgpath(path); - if (unlikely(lockless_dereference(m->current_pg) != pg)) { + if (unlikely(READ_ONCE(m->current_pg) != pg)) { /* Only update current_pgpath if pg changed */ spin_lock_irqsave(&m->lock, flags); m->current_pgpath = pgpath; @@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (lockless_dereference(m->next_pg)) { + if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { @@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) /* Don't change PG until it has no remaining paths */ check_current_pg: - pg = lockless_dereference(m->current_pg); + pg = READ_ONCE(m->current_pg); if (pg) { pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) @@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request *clone; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, nr_bytes); @@ -499,12 +499,9 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ bool queue_dying = blk_queue_dying(q); - DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing", - PTR_ERR(clone), queue_dying ? " (path offline)" : ""); if (queue_dying) { atomic_inc(&m->pg_init_in_progress); activate_or_offline_path(pgpath); - return DM_MAPIO_REQUEUE; } return DM_MAPIO_DELAY_REQUEUE; } @@ -536,7 +533,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bool queue_io; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, nr_bytes); @@ -566,7 +563,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m mpio->nr_bytes = nr_bytes; bio->bi_status = 0; - bio->bi_bdev = pgpath->path.dev->bdev; + bio_set_dev(bio, pgpath->path.dev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; if (pgpath->pg->ps.type->start_io) @@ -633,19 +630,15 @@ static void process_queued_bios(struct work_struct *work) case DM_MAPIO_REMAPPED: generic_make_request(bio); break; + case 0: + break; + default: + WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r); } } blk_finish_plug(&plug); } -static void assign_bit(bool value, long nr, unsigned long *addr) -{ - if (value) - set_bit(nr, addr); - else - clear_bit(nr, addr); -} - /* * If we run out of usable paths, should we queue I/O or error it? */ @@ -655,11 +648,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, unsigned long flags; spin_lock_irqsave(&m->lock, flags); - assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || - (!save_old_value && queue_if_no_path), - MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti), - MPATHF_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, + (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || + (!save_old_value && queue_if_no_path)); + assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, + queue_if_no_path || dm_noflush_suspending(m->ti)); spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -699,7 +692,7 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, struct path_selector_type *pst; unsigned ps_argc; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "invalid number of path selector args"}, }; @@ -823,7 +816,7 @@ retain: static struct priority_group *parse_priority_group(struct dm_arg_set *as, struct multipath *m) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {1, 1024, "invalid number of paths"}, {0, 1024, "invalid number of selector args"} }; @@ -899,7 +892,7 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) int ret; struct dm_target *ti = m->ti; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "invalid number of hardware handler args"}, }; @@ -951,7 +944,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) struct dm_target *ti = m->ti; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 8, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, @@ -1020,7 +1013,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) { /* target arguments */ - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "invalid number of priority groups"}, {0, 1024, "invalid initial priority group number"}, }; @@ -1380,6 +1373,7 @@ static void pg_init_done(void *data, int errors) case SCSI_DH_RETRY: /* Wait before retrying. */ delay_retry = 1; + /* fall through */ case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: if (pg_init_limit_reached(m, pgpath)) @@ -1458,7 +1452,6 @@ static int noretry_error(blk_status_t error) case BLK_STS_TARGET: case BLK_STS_NEXUS: case BLK_STS_MEDIUM: - case BLK_STS_RESOURCE: return 1; } @@ -1585,8 +1578,8 @@ static void multipath_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), - MPATHF_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); spin_unlock_irqrestore(&m->lock, flags); } @@ -1801,7 +1794,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct pgpath *current_pgpath; int r; - current_pgpath = lockless_dereference(m->current_pgpath); + current_pgpath = READ_ONCE(m->current_pgpath); if (!current_pgpath) current_pgpath = choose_pgpath(m, 0); @@ -1823,7 +1816,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } if (r == -ENOTCONN) { - if (!lockless_dereference(m->current_pg)) { + if (!READ_ONCE(m->current_pg)) { /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } @@ -1892,9 +1885,9 @@ static int multipath_busy(struct dm_target *ti) return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ - pg = lockless_dereference(m->current_pg); - next_pg = lockless_dereference(m->next_pg); - if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) pg = next_pg; if (!pg) { diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2e10c2f13a34..6319d846e0ad 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -12,7 +12,7 @@ #include "raid1.h" #include "raid5.h" #include "raid10.h" -#include "bitmap.h" +#include "md-bitmap.h" #include <linux/device-mapper.h> @@ -208,6 +208,7 @@ struct raid_dev { #define RT_FLAG_RS_BITMAP_LOADED 2 #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 +#define RT_FLAG_RS_SUSPENDED 5 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -564,9 +565,10 @@ static const char *raid10_md_layout_to_format(int layout) if (__raid10_near_copies(layout) > 1) return "near"; - WARN_ON(__raid10_far_copies(layout) < 2); + if (__raid10_far_copies(layout) > 1) + return "far"; - return "far"; + return "unknown"; } /* Return md raid10 algorithm for @name */ @@ -2141,13 +2143,6 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) struct dm_raid_superblock *refsb; uint64_t events_sb, events_refsb; - rdev->sb_start = 0; - rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); - if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) { - DMERR("superblock size of a logical block is no longer valid"); - return -EINVAL; - } - r = read_disk_sb(rdev, rdev->sb_size, false); if (r) return r; @@ -2492,6 +2487,17 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (test_bit(Journal, &rdev->flags)) continue; + if (!rdev->meta_bdev) + continue; + + /* Set superblock offset/size for metadata device. */ + rdev->sb_start = 0; + rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); + if (rdev->sb_size < sizeof(struct dm_raid_superblock) || rdev->sb_size > PAGE_SIZE) { + DMERR("superblock size of a logical block is no longer valid"); + return -EINVAL; + } + /* * Skipping super_load due to CTR_FLAG_SYNC will cause * the array to undergo initialization again as @@ -2504,9 +2510,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) continue; - if (!rdev->meta_bdev) - continue; - r = super_load(rdev, freshest); switch (r) { @@ -2540,11 +2543,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (!freshest) return 0; - if (validate_raid_redundancy(rs)) { - rs->ti->error = "Insufficient redundancy to activate array"; - return -EINVAL; - } - /* * Validation of the freshest device provides the source of * validation for the remaining devices. @@ -2553,6 +2551,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (super_validate(rs, freshest)) return -EINVAL; + if (validate_raid_redundancy(rs)) { + rs->ti->error = "Insufficient redundancy to activate array"; + return -EINVAL; + } + rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && rdev != freshest && @@ -2884,9 +2887,6 @@ static void configure_discard_support(struct raid_set *rs) bool raid456; struct dm_target *ti = rs->ti; - /* Assume discards not supported until after checks below. */ - ti->discards_supported = false; - /* * XXX: RAID level 4,5,6 require zeroing for safety. */ @@ -2911,9 +2911,6 @@ static void configure_discard_support(struct raid_set *rs) } } - /* All RAID members properly support discards */ - ti->discards_supported = true; - /* * RAID1 and RAID10 personalities require bio splitting, * RAID0/4/5/6 don't and process large discard bios properly. @@ -3168,6 +3165,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } mddev_suspend(&rs->md); + set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ if (rs_is_raid456(rs)) { @@ -3235,7 +3233,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); return DM_MAPIO_SUBMITTED; } @@ -3294,11 +3292,10 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, static sector_t rs_get_progress(struct raid_set *rs, sector_t resync_max_sectors, bool *array_in_sync) { - sector_t r, recovery_cp, curr_resync_completed; + sector_t r, curr_resync_completed; struct mddev *mddev = &rs->md; curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - recovery_cp = mddev->recovery_cp; *array_in_sync = false; if (rs_is_raid0(rs)) { @@ -3327,9 +3324,11 @@ static sector_t rs_get_progress(struct raid_set *rs, } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) r = curr_resync_completed; else - r = recovery_cp; + r = mddev->recovery_cp; - if (r == MaxSector) { + if ((r == MaxSector) || + (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + (mddev->curr_resync_completed == resync_max_sectors))) { /* * Sync complete. */ @@ -3625,8 +3624,11 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (!rs->md.suspended) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); + mddev_unlock(&rs->md); + } rs->md.ro = 1; } @@ -3759,7 +3761,7 @@ static int rs_start_reshape(struct raid_set *rs) return r; /* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */ - if (mddev->suspended) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) mddev_resume(mddev); /* @@ -3786,8 +3788,8 @@ static int rs_start_reshape(struct raid_set *rs) } /* Suspend because a resume will happen in raid_resume() */ - if (!mddev->suspended) - mddev_suspend(mddev); + set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); + mddev_suspend(mddev); /* * Now reshape got set up, update superblocks to @@ -3883,13 +3885,16 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (mddev->suspended) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(mddev); mddev_resume(mddev); + mddev_unlock(mddev); + } } static struct target_type raid_target = { .name = "raid", - .version = {1, 11, 1}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index a4fbd911d566..580c49cc8079 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -94,9 +94,9 @@ static void wakeup_mirrord(void *context) queue_work(ms->kmirrord_wq, &ms->kmirrord_work); } -static void delayed_wake_fn(unsigned long data) +static void delayed_wake_fn(struct timer_list *t) { - struct mirror_set *ms = (struct mirror_set *) data; + struct mirror_set *ms = from_timer(ms, t, timer); clear_bit(0, &ms->timer_pending); wakeup_mirrord(ms); @@ -108,8 +108,6 @@ static void delayed_wake(struct mirror_set *ms) return; ms->timer.expires = jiffies + HZ / 5; - ms->timer.data = (unsigned long) ms; - ms->timer.function = delayed_wake_fn; add_timer(&ms->timer); } @@ -145,7 +143,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list) struct dm_raid1_bio_record { struct mirror *m; - /* if details->bi_bdev == NULL, details were not saved */ + /* if details->bi_disk == NULL, details were not saved */ struct dm_bio_details details; region_t write_region; }; @@ -464,7 +462,7 @@ static sector_t map_sector(struct mirror *m, struct bio *bio) static void map_bio(struct mirror *m, struct bio *bio) { - bio->bi_bdev = m->dev->bdev; + bio_set_dev(bio, m->dev->bdev); bio->bi_iter.bi_sector = map_sector(m, bio); } @@ -1133,7 +1131,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_free_context; } INIT_WORK(&ms->kmirrord_work, do_mirror); - init_timer(&ms->timer); + timer_setup(&ms->timer, delayed_wake_fn, 0); ms->timer_pending = 0; INIT_WORK(&ms->trigger_event, trigger_event); @@ -1199,7 +1197,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) struct dm_raid1_bio_record *bio_record = dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; if (rw == WRITE) { /* Save region for mirror_end_io() handler */ @@ -1266,7 +1264,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, goto out; if (unlikely(*error)) { - if (!bio_record->details.bi_bdev) { + if (!bio_record->details.bi_disk) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other @@ -1291,7 +1289,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, bd = &bio_record->details; dm_bio_restore(bd, bio); - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; bio->bi_status = 0; queue_bio(ms, bio, rw); @@ -1301,7 +1299,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, } out: - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; return DM_ENDIO_DONE; } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index c6ebc5b1e00e..9d32f25489c2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) @@ -117,9 +117,9 @@ static void end_clone_bio(struct bio *clone) struct dm_rq_clone_bio_info *info = container_of(clone, struct dm_rq_clone_bio_info, clone); struct dm_rq_target_io *tio = info->tio; - struct bio *bio = info->orig; unsigned int nr_bytes = info->orig->bi_iter.bi_size; blk_status_t error = clone->bi_status; + bool is_last = !clone->bi_next; bio_put(clone); @@ -137,28 +137,23 @@ static void end_clone_bio(struct bio *clone) * when the request is completed. */ tio->error = error; - return; + goto exit; } /* * I/O for the bio successfully completed. * Notice the data completion to the upper layer. */ - - /* - * bios are processed from the head of the list. - * So the completing bio should always be rq->bio. - * If it's not, something wrong is happening. - */ - if (tio->orig->bio != bio) - DMERR("bio completion is going in the middle of the request"); + tio->completed += nr_bytes; /* * Update the original request. * Do not use blk_end_request() here, because it may complete * the original request before the clone, and break the ordering. */ - blk_update_request(tio->orig, BLK_STS_OK, nr_bytes); + if (is_last) + exit: + blk_update_request(tio->orig, BLK_STS_OK, tio->completed); } static struct dm_rq_target_io *tio_from_request(struct request *rq) @@ -237,14 +232,14 @@ static void dm_end_request(struct request *clone, blk_status_t error) /* * Requeue the original request of a clone. */ -static void dm_old_requeue_request(struct request *rq) +static void dm_old_requeue_request(struct request *rq, unsigned long delay_ms) { struct request_queue *q = rq->q; unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); blk_requeue_request(q, rq); - blk_run_queue_async(q); + blk_delay_queue(q, delay_ms); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -270,6 +265,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ struct mapped_device *md = tio->md; struct request *rq = tio->orig; int rw = rq_data_dir(rq); + unsigned long delay_ms = delay_requeue ? 100 : 0; rq_end_stats(md, rq); if (tio->clone) { @@ -278,9 +274,9 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ } if (!rq->q->mq_ops) - dm_old_requeue_request(rq); + dm_old_requeue_request(rq, delay_ms); else - dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0); + dm_mq_delay_requeue_request(rq, delay_ms); rq_completed(md, rw, false); } @@ -455,6 +451,7 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, tio->clone = NULL; tio->orig = rq; tio->error = 0; + tio->completed = 0; /* * Avoid initializing info for blk-mq; it passes * target-specific data through info.ptr diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index 9813922e4fe5..f43c45460aac 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -29,6 +29,7 @@ struct dm_rq_target_io { struct dm_stats_aux stats_aux; unsigned long duration_jiffies; unsigned n_sectors; + unsigned completed; }; /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1ba41048b438..1113b42e1eda 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1663,7 +1663,7 @@ __find_pending_exception(struct dm_snapshot *s, static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); bio->bi_iter.bi_sector = chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + (chunk - e->old_chunk)) + @@ -1681,7 +1681,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); if (bio->bi_opf & REQ_PREFLUSH) { - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); return DM_MAPIO_REMAPPED; } @@ -1769,7 +1769,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) goto out; } } else { - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); track_chunk(s, bio, chunk); } @@ -1802,9 +1802,9 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) if (bio->bi_opf & REQ_PREFLUSH) { if (!dm_bio_get_target_bio_nr(bio)) - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); else - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); return DM_MAPIO_REMAPPED; } @@ -1824,7 +1824,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk >= s->first_merging_chunk && chunk < (s->first_merging_chunk + s->num_merging_chunks)) { - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); bio_list_add(&s->bios_queued_during_merge, bio); r = DM_MAPIO_SUBMITTED; goto out_unlock; @@ -1838,7 +1838,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) } redirect_to_origin: - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); @@ -2285,7 +2285,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) struct dm_origin *o = ti->private; unsigned available_sectors; - bio->bi_bdev = o->dev->bdev; + bio_set_dev(bio, o->dev->bdev); if (unlikely(bio->bi_opf & REQ_PREFLUSH)) return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 6028d8247f58..29bc51084c82 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/numa.h> #include <linux/slab.h> @@ -431,7 +432,7 @@ do_sync_free: synchronize_rcu_expedited(); dm_stat_free(&s->rcu_head); } else { - ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; + WRITE_ONCE(dm_stat_need_rcu_barrier, 1); call_rcu(&s->rcu_head, dm_stat_free); } return 0; @@ -639,12 +640,12 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, */ last = raw_cpu_ptr(stats->last); stats_aux->merged = - (bi_sector == (ACCESS_ONCE(last->last_sector) && + (bi_sector == (READ_ONCE(last->last_sector) && ((bi_rw == WRITE) == - (ACCESS_ONCE(last->last_rw) == WRITE)) + (READ_ONCE(last->last_rw) == WRITE)) )); - ACCESS_ONCE(last->last_sector) = end_sector; - ACCESS_ONCE(last->last_rw) = bi_rw; + WRITE_ONCE(last->last_sector, end_sector); + WRITE_ONCE(last->last_rw, bi_rw); } rcu_read_lock(); @@ -693,22 +694,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared for_each_possible_cpu(cpu) { p = &s->stat_percpu[cpu][x]; - shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); - shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); - shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); - shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); - shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); - shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); - shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); - shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); - shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); - shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); - shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); - shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); + shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]); + shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]); + shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]); + shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]); + shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]); + shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]); + shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]); + shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]); + shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]); + shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]); + shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total); + shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue); if (s->n_histogram_entries) { unsigned i; for (i = 0; i < s->n_histogram_entries + 1; i++) - shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); + shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]); } } } diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h index f1c0956e3843..2ddfae678f32 100644 --- a/drivers/md/dm-stats.h +++ b/drivers/md/dm-stats.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef DM_STATS_H #define DM_STATS_H diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a0375530b07f..b5e892149c54 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -270,7 +270,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, stripe_map_range_sector(sc, bio_end_sector(bio), target_stripe, &end); if (begin < end) { - bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio_set_dev(bio, sc->stripe[target_stripe].dev->bdev); bio->bi_iter.bi_sector = begin + sc->stripe[target_stripe].physical_start; bio->bi_iter.bi_size = to_bytes(end - begin); @@ -291,7 +291,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) if (bio->bi_opf & REQ_PREFLUSH) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); - bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; + bio_set_dev(bio, sc->stripe[target_bio_nr].dev->bdev); return DM_MAPIO_REMAPPED; } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || @@ -306,7 +306,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) &stripe, &bio->bi_iter.bi_sector); bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start; - bio->bi_bdev = sc->stripe[stripe].dev->bdev; + bio_set_dev(bio, sc->stripe[stripe].dev->bdev); return DM_MAPIO_REMAPPED; } @@ -351,25 +351,6 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } -static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, - size_t size) -{ - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; - struct stripe_c *sc = ti->private; - struct dax_device *dax_dev; - struct block_device *bdev; - uint32_t stripe; - - stripe_map_sector(sc, sector, &stripe, &dev_sector); - dev_sector += sc->stripe[stripe].physical_start; - dax_dev = sc->stripe[stripe].dev->dax_dev; - bdev = sc->stripe[stripe].dev->bdev; - - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) - return; - dax_flush(dax_dev, pgoff, addr, size); -} - /* * Stripe status: * @@ -430,9 +411,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, return DM_ENDIO_DONE; memset(major_minor, 0, sizeof(major_minor)); - sprintf(major_minor, "%d:%d", - MAJOR(disk_devt(bio->bi_bdev->bd_disk)), - MINOR(disk_devt(bio->bi_bdev->bd_disk))); + sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); /* * Test to see which stripe drive triggered the event @@ -491,7 +470,6 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_copy_from_iter = stripe_dax_copy_from_iter, - .dax_flush = stripe_dax_flush, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 871c18fe000d..8d0ba879777e 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -144,7 +144,7 @@ static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long switch_get_position(sctx, region_nr, ®ion_index, &bit); - return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + return (READ_ONCE(sctx->region_table[region_index]) >> bit) & ((1 << sctx->region_table_entry_bits) - 1); } @@ -251,7 +251,7 @@ static void switch_dtr(struct dm_target *ti) */ static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, {1, UINT_MAX, "Invalid region size"}, {0, 0, "Invalid number of optional args"}, @@ -322,7 +322,7 @@ static int switch_map(struct dm_target *ti, struct bio *bio) sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); unsigned path_nr = switch_get_path_nr(sctx, offset); - bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; + bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev); bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a39bcd9b982a..88130b5d95f9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -20,6 +20,7 @@ #include <linux/atomic.h> #include <linux/blk-mq.h> #include <linux/mount.h> +#include <linux/dax.h> #define DM_MSG_PREFIX "table" @@ -450,15 +451,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, return r; } - atomic_set(&dd->count, 0); + refcount_set(&dd->count, 1); list_add(&dd->list, &t->devices); } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) return r; + refcount_inc(&dd->count); } - atomic_inc(&dd->count); *result = dd->dm_dev; return 0; @@ -514,7 +515,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) dm_device_name(ti->table->md), d->name); return; } - if (atomic_dec_and_test(&dd->count)) { + if (refcount_dec_and_test(&dd->count)) { dm_put_table_device(ti->table->md, d); list_del(&dd->list); kfree(dd); @@ -805,7 +806,8 @@ int dm_table_add_target(struct dm_table *t, const char *type, /* * Target argument parsing helpers. */ -static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, +static int validate_next_arg(const struct dm_arg *arg, + struct dm_arg_set *arg_set, unsigned *value, char **error, unsigned grouped) { const char *arg_str = dm_shift_arg(arg_set); @@ -823,14 +825,14 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, return 0; } -int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, +int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, unsigned *value, char **error) { return validate_next_arg(arg, arg_set, value, error, 0); } EXPORT_SYMBOL(dm_read_arg); -int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, +int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set, unsigned *value, char **error) { return validate_next_arg(arg, arg_set, value, error, 1); @@ -998,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1630,6 +1632,37 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) return false; } +static int device_dax_write_cache_enabled(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dax_device *dax_dev = dev->dax_dev; + + if (!dax_dev) + return false; + + if (dax_write_cache_enabled(dax_dev)) + return true; + return false; +} + +static int dm_table_supports_dax_write_cache(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, + device_dax_write_cache_enabled, NULL)) + return true; + } + + return false; +} + static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1725,13 +1758,12 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t) return true; } - -static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && blk_queue_discard(q); + return q && !blk_queue_discard(q); } static bool dm_table_supports_discards(struct dm_table *t) @@ -1739,28 +1771,24 @@ static bool dm_table_supports_discards(struct dm_table *t) struct dm_target *ti; unsigned i; - /* - * Unless any target used by the table set discards_supported, - * require at least one underlying device to support discards. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting discard selectively must provide. - */ for (i = 0; i < dm_table_get_num_targets(t); i++) { ti = dm_table_get_target(t, i); if (!ti->num_discard_bios) - continue; - - if (ti->discards_supported) - return true; + return false; - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_discard_capable, NULL)) - return true; + /* + * Either the target provides discard support (as implied by setting + * 'discards_supported') or it relies on _all_ data devices having + * discard support. + */ + if (!ti->discards_supported && + (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_discard_capable, NULL))) + return false; } - return false; + return true; } void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -1773,9 +1801,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ q->limits = *limits; - if (!dm_table_supports_discards(t)) + if (!dm_table_supports_discards(t)) { queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - else + /* Must also clear discard limits... */ + q->limits.max_discard_sectors = 0; + q->limits.max_hw_discard_sectors = 0; + q->limits.discard_granularity = 0; + q->limits.discard_alignment = 0; + q->limits.discard_misaligned = 0; + } else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { @@ -1785,6 +1819,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_dax_write_cache(t)) + dax_write_cache(t->md->dax_dev, true); + /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); @@ -1811,19 +1848,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 9dec2f8cc739..89e5dff9b4cf 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -679,7 +679,7 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) struct pool *pool = tc->pool; sector_t bi_sector = bio->bi_iter.bi_sector; - bio->bi_bdev = tc->pool_dev->bdev; + bio_set_dev(bio, tc->pool_dev->bdev); if (block_size_is_power_of_two(pool)) bio->bi_iter.bi_sector = (block << pool->sectors_per_block_shift) | @@ -691,7 +691,7 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) static void remap_to_origin(struct thin_c *tc, struct bio *bio) { - bio->bi_bdev = tc->origin_dev->bdev; + bio_set_dev(bio, tc->origin_dev->bdev); } static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) @@ -2431,7 +2431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) struct pool_c *pt = pool->ti->private; bool needs_check = dm_pool_metadata_needs_check(pool->pmd); enum pool_mode old_mode = get_pool_mode(pool); - unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; + unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ; /* * Never allow the pool to transition to PM_WRITE mode if user @@ -3041,7 +3041,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, unsigned argc; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 4, "Invalid number of pool feature arguments"}, }; @@ -3313,7 +3313,7 @@ static int pool_map(struct dm_target *ti, struct bio *bio) * As this is a singleton target, ti->begin is always zero. */ spin_lock_irqsave(&pool->lock, flags); - bio->bi_bdev = pt->data_dev->bdev; + bio_set_dev(bio, pt->data_dev->bdev); r = DM_MAPIO_REMAPPED; spin_unlock_irqrestore(&pool->lock, flags); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 504ba3fa328b..e13f90832b6b 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -308,19 +308,14 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) { unsigned n; - if (!fio->rs) { - fio->rs = mempool_alloc(v->fec->rs_pool, 0); - if (unlikely(!fio->rs)) { - DMERR("failed to allocate RS"); - return -ENOMEM; - } - } + if (!fio->rs) + fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO); fec_for_each_prealloc_buffer(n) { if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT); if (unlikely(!fio->bufs[n])) { DMERR("failed to allocate FEC buffer"); return -ENOMEM; @@ -332,22 +327,16 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) break; } fio->nbufs = n; - if (!fio->output) { + if (!fio->output) fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); - if (!fio->output) { - DMERR("failed to allocate FEC page"); - return -ENOMEM; - } - } - return 0; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b46705ebf01f..aedb8222836b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -92,74 +92,33 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } -/* - * Callback function for asynchrnous crypto API completion notification - */ -static void verity_op_done(struct crypto_async_request *base, int err) -{ - struct verity_result *res = (struct verity_result *)base->data; - - if (err == -EINPROGRESS) - return; - - res->err = err; - complete(&res->completion); -} - -/* - * Wait for async crypto API callback - */ -static inline int verity_complete_op(struct verity_result *res, int ret) -{ - switch (ret) { - case 0: - break; - - case -EINPROGRESS: - case -EBUSY: - ret = wait_for_completion_interruptible(&res->completion); - if (!ret) - ret = res->err; - reinit_completion(&res->completion); - break; - - default: - DMERR("verity_wait_hash: crypto op submission failed: %d", ret); - } - - if (unlikely(ret < 0)) - DMERR("verity_wait_hash: crypto op failed: %d", ret); - - return ret; -} - static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, - struct verity_result *res) + struct crypto_wait *wait) { struct scatterlist sg; sg_init_one(&sg, data, len); ahash_request_set_crypt(req, &sg, NULL, len); - return verity_complete_op(res, crypto_ahash_update(req)); + return crypto_wait_req(crypto_ahash_update(req), wait); } /* * Wrapper for crypto_ahash_init, which handles verity salting. */ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, - struct verity_result *res) + struct crypto_wait *wait) { int r; ahash_request_set_tfm(req, v->tfm); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG, - verity_op_done, (void *)res); - init_completion(&res->completion); + crypto_req_done, (void *)wait); + crypto_init_wait(wait); - r = verity_complete_op(res, crypto_ahash_init(req)); + r = crypto_wait_req(crypto_ahash_init(req), wait); if (unlikely(r < 0)) { DMERR("crypto_ahash_init failed: %d", r); @@ -167,18 +126,18 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, } if (likely(v->salt_size && (v->version >= 1))) - r = verity_hash_update(v, req, v->salt, v->salt_size, res); + r = verity_hash_update(v, req, v->salt, v->salt_size, wait); return r; } static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, - u8 *digest, struct verity_result *res) + u8 *digest, struct crypto_wait *wait) { int r; if (unlikely(v->salt_size && (!v->version))) { - r = verity_hash_update(v, req, v->salt, v->salt_size, res); + r = verity_hash_update(v, req, v->salt, v->salt_size, wait); if (r < 0) { DMERR("verity_hash_final failed updating salt: %d", r); @@ -187,7 +146,7 @@ static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, } ahash_request_set_crypt(req, NULL, digest, 0); - r = verity_complete_op(res, crypto_ahash_final(req)); + r = crypto_wait_req(crypto_ahash_final(req), wait); out: return r; } @@ -196,17 +155,17 @@ int verity_hash(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, u8 *digest) { int r; - struct verity_result res; + struct crypto_wait wait; - r = verity_hash_init(v, req, &res); + r = verity_hash_init(v, req, &wait); if (unlikely(r < 0)) goto out; - r = verity_hash_update(v, req, data, len, &res); + r = verity_hash_update(v, req, data, len, &wait); if (unlikely(r < 0)) goto out; - r = verity_hash_final(v, req, digest, &res); + r = verity_hash_final(v, req, digest, &wait); out: return r; @@ -389,7 +348,7 @@ out: * Calculates the digest for the given bio */ int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, struct verity_result *res) + struct bvec_iter *iter, struct crypto_wait *wait) { unsigned int todo = 1 << v->data_dev_block_bits; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); @@ -414,7 +373,7 @@ int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, */ sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); ahash_request_set_crypt(req, &sg, NULL, len); - r = verity_complete_op(res, crypto_ahash_update(req)); + r = crypto_wait_req(crypto_ahash_update(req), wait); if (unlikely(r < 0)) { DMERR("verity_for_io_block crypto op failed: %d", r); @@ -482,7 +441,7 @@ static int verity_verify_io(struct dm_verity_io *io) struct dm_verity *v = io->v; struct bvec_iter start; unsigned b; - struct verity_result res; + struct crypto_wait wait; for (b = 0; b < io->n_blocks; b++) { int r; @@ -507,17 +466,17 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash_init(v, req, &res); + r = verity_hash_init(v, req, &wait); if (unlikely(r < 0)) return r; start = io->iter; - r = verity_for_io_block(v, io, &io->iter, &res); + r = verity_for_io_block(v, io, &io->iter, &wait); if (unlikely(r < 0)) return r; r = verity_hash_final(v, req, verity_io_real_digest(v, io), - &res); + &wait); if (unlikely(r < 0)) return r; @@ -589,7 +548,7 @@ static void verity_prefetch_io(struct work_struct *work) verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { - unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); + unsigned cluster = READ_ONCE(dm_verity_prefetch_cluster); cluster >>= v->data_dev_block_bits; if (unlikely(!cluster)) @@ -637,7 +596,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) struct dm_verity *v = ti->private; struct dm_verity_io *io; - bio->bi_bdev = v->data_dev->bdev; + bio_set_dev(bio, v->data_dev->bdev); bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector); if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & @@ -839,7 +798,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) struct dm_target *ti = v->ti; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, }; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index a59e0ada6fd3..b675bc015512 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -90,11 +90,6 @@ struct dm_verity_io { */ }; -struct verity_result { - struct completion completion; - int err; -}; - static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, struct dm_verity_io *io) { diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 884ff7c170a0..70485de37b66 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -409,7 +409,7 @@ static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd, } bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); @@ -564,7 +564,7 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); @@ -586,7 +586,7 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, return -ENOMEM; bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); @@ -624,7 +624,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); return ret; } @@ -658,7 +658,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); return ret; } @@ -722,7 +722,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); goto out; } @@ -927,7 +927,7 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); } - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_NOIO); if (!page) return -ENOMEM; @@ -1183,7 +1183,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) /* Get zone information from disk */ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), - &blkz, &nr_blkz, GFP_KERNEL); + &blkz, &nr_blkz, GFP_NOIO); if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); @@ -1257,7 +1257,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) ret = blkdev_reset_zones(dev->bdev, dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_KERNEL); + dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", dmz_id(zmd, zone), ret); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 05c0a126f5c8..44a119e12f1a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -75,7 +75,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, nr_blocks = block - wp_block; ret = blkdev_issue_zeroout(zrc->dev->bdev, dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), - dmz_blk2sect(nr_blocks), GFP_NOFS, false); + dmz_blk2sect(nr_blocks), GFP_NOIO, 0); if (ret) { dmz_dev_err(zrc->dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 2b538fa817f4..6d7bda6f8190 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -238,7 +238,7 @@ static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone, struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); /* Setup and submit the BIO */ - bio->bi_bdev = dmz->dev->bdev; + bio_set_dev(bio, dmz->dev->bdev); bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); atomic_inc(&bioctx->ref); generic_make_request(bio); @@ -541,7 +541,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) int ret; /* Create a new chunk work */ - cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOFS); + cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO); if (!cw) goto out; @@ -586,9 +586,9 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)), (unsigned int)dmz_bio_blocks(bio)); - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); - if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE)) + if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) return DM_MAPIO_REMAPPED; /* The BIO should be block aligned */ @@ -603,7 +603,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bioctx->status = BLK_STS_OK; /* Set the BIO pending in the flush list */ - if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) { + if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) { spin_lock(&dmz->flush_lock); bio_list_add(&dmz->flush_list, bio); spin_unlock(&dmz->flush_lock); @@ -660,6 +660,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) struct dmz_target *dmz = ti->private; struct request_queue *q; struct dmz_dev *dev; + sector_t aligned_capacity; int ret; /* Get the target device */ @@ -685,15 +686,17 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) goto err; } + q = bdev_get_queue(dev->bdev); dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - if (ti->begin || (ti->len != dev->capacity)) { + aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1); + if (ti->begin || + ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { ti->error = "Partial mapping not supported"; ret = -EINVAL; goto err; } - q = bdev_get_queue(dev->bdev); - dev->zone_nr_sectors = q->limits.chunk_sectors; + dev->zone_nr_sectors = blk_queue_zone_sectors(q); dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors); dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); @@ -785,7 +788,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Chunk BIO work */ mutex_init(&dmz->chunk_lock); - INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOFS); + INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL); dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND, 0, dev->name); if (!dmz->chunk_wq) { @@ -929,8 +932,10 @@ static int dmz_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dmz_target *dmz = ti->private; + struct dmz_dev *dev = dmz->dev; + sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1); - return fn(ti, dmz->ddev, 0, dmz->dev->capacity, data); + return fn(ti, dmz->ddev, 0, capacity, data); } static struct target_type dmz_type = { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2edbcc2d7d3f..de17b7193299 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -24,19 +24,10 @@ #include <linux/delay.h> #include <linux/wait.h> #include <linux/pr.h> +#include <linux/refcount.h> #define DM_MSG_PREFIX "core" -#ifdef CONFIG_PRINTK -/* - * ratelimit state to be used in DMXXX_LIMIT(). - */ -DEFINE_RATELIMIT_STATE(dm_ratelimit_state, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); -EXPORT_SYMBOL(dm_ratelimit_state); -#endif - /* * Cookies are numeric values sent with CHANGE and REMOVE * uevents while resuming, removing or renaming the device. @@ -62,6 +53,12 @@ static struct workqueue_struct *deferred_remove_workqueue; atomic_t dm_global_event_nr = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); +void dm_issue_global_event(void) +{ + atomic_inc(&dm_global_event_nr); + wake_up(&dm_global_eventq); +} + /* * One of these is allocated per bio. */ @@ -102,7 +99,7 @@ struct dm_md_mempools { struct table_device { struct list_head list; - atomic_t count; + refcount_t count; struct dm_dev dm_dev; }; @@ -118,7 +115,7 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; static int __dm_get_module_param_int(int *module_param, int min, int max) { - int param = ACCESS_ONCE(*module_param); + int param = READ_ONCE(*module_param); int modified_param = 0; bool modified = true; @@ -140,7 +137,7 @@ static int __dm_get_module_param_int(int *module_param, int min, int max) unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { - unsigned param = ACCESS_ONCE(*module_param); + unsigned param = READ_ONCE(*module_param); unsigned modified_param = 0; if (!param) @@ -520,7 +517,7 @@ static void start_io_acct(struct dm_io *io) io->start_time = jiffies; cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); + part_round_stats(md->queue, cpu, &dm_disk(md)->part0); part_stat_unlock(); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); @@ -539,7 +536,7 @@ static void end_io_acct(struct dm_io *io) int pending; int rw = bio_data_dir(bio); - generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); + generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -689,10 +686,11 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, format_dev_t(td->dm_dev.name, dev); - atomic_set(&td->count, 0); + refcount_set(&td->count, 1); list_add(&td->list, &md->table_devices); + } else { + refcount_inc(&td->count); } - atomic_inc(&td->count); mutex_unlock(&md->table_devices_lock); *result = &td->dm_dev; @@ -705,7 +703,7 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) struct table_device *td = container_of(d, struct table_device, dm_dev); mutex_lock(&md->table_devices_lock); - if (atomic_dec_and_test(&td->count)) { + if (refcount_dec_and_test(&td->count)) { close_table_device(td, md); list_del(&td->list); kfree(td); @@ -722,7 +720,7 @@ static void free_table_devices(struct list_head *devices) struct table_device *td = list_entry(tmp, struct table_device, list); DMWARN("dm_destroy: %s still exists with %d references", - td->dm_dev.name, atomic_read(&td->count)); + td->dm_dev.name, refcount_read(&td->count)); kfree(td); } } @@ -851,10 +849,10 @@ static void clone_endio(struct bio *bio) if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + !bio->bi_disk->queue->limits.max_write_same_sectors) disable_write_same(md); if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + !bio->bi_disk->queue->limits.max_write_zeroes_sectors) disable_write_zeroes(md); } @@ -997,24 +995,6 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } -static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, - size_t size) -{ - struct mapped_device *md = dax_get_private(dax_dev); - sector_t sector = pgoff * PAGE_SECTORS; - struct dm_target *ti; - int srcu_idx; - - ti = dm_dax_get_live_target(md, sector, &srcu_idx); - - if (!ti) - goto out; - if (ti->type->dax_flush) - ti->type->dax_flush(ti, pgoff, addr, size); - out: - dm_put_live_table(md, srcu_idx); -} - /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH. @@ -1215,8 +1195,8 @@ static void __map_bio(struct dm_target_io *tio) break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, sector); + trace_block_bio_remap(clone->bi_disk->queue, clone, + bio_dev(tio->io->bio), sector); generic_make_request(clone); break; case DM_MAPIO_KILL: @@ -1523,7 +1503,7 @@ static void __split_and_process_bio(struct mapped_device *md, } /* drop the extra reference count */ - dec_pending(ci.io, error); + dec_pending(ci.io, errno_to_blk_status(error)); } /*----------------------------------------------------------------- * CRUD END @@ -1542,7 +1522,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) map = dm_get_live_table(md, &srcu_idx); - generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0); /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { @@ -1641,17 +1621,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used */ @@ -1717,7 +1686,7 @@ static struct mapped_device *alloc_dev(int minor) struct mapped_device *md; void *old_md; - md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); + md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; @@ -1796,7 +1765,7 @@ static struct mapped_device *alloc_dev(int minor) goto bad; bio_init(&md->flush_bio, NULL, 0); - md->flush_bio.bi_bdev = md->bdev; + bio_set_dev(&md->flush_bio, md->bdev); md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; dm_stats_init(&md->stats); @@ -1817,7 +1786,7 @@ bad_io_barrier: bad_minor: module_put(THIS_MODULE); bad_module_get: - kfree(md); + kvfree(md); return NULL; } @@ -1836,7 +1805,7 @@ static void free_dev(struct mapped_device *md) free_minor(minor); module_put(THIS_MODULE); - kfree(md); + kvfree(md); } static void __bind_mempools(struct mapped_device *md, struct dm_table *t) @@ -1893,9 +1862,8 @@ static void event_callback(void *context) dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); atomic_inc(&md->event_nr); - atomic_inc(&dm_global_event_nr); wake_up(&md->eventq); - wake_up(&dm_global_eventq); + dm_issue_global_event(); } /* @@ -2095,17 +2063,12 @@ struct mapped_device *dm_get_md(dev_t dev) spin_lock(&_minor_lock); md = idr_find(&_minor_idr, minor); - if (md) { - if ((md == MINOR_ALLOCED || - (MINOR(disk_devt(dm_disk(md))) != minor) || - dm_deleting_md(md) || - test_bit(DMF_FREEING, &md->flags))) { - md = NULL; - goto out; - } - dm_get(md); + if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || + test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; } - + dm_get(md); out: spin_unlock(&_minor_lock); @@ -2311,6 +2274,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) } map = __bind(md, table, &limits); + dm_issue_global_event(); out: mutex_unlock(&md->suspend_lock); @@ -2731,11 +2695,15 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) md = container_of(kobj, struct mapped_device, kobj_holder.kobj); - if (test_bit(DMF_FREEING, &md->flags) || - dm_deleting_md(md)) - return NULL; - + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; + } dm_get(md); +out: + spin_unlock(&_minor_lock); + return md; } @@ -3002,7 +2970,6 @@ static const struct block_device_operations dm_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .copy_from_iter = dm_dax_copy_from_iter, - .flush = dm_dax_flush, }; /* diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 38c84c0a35d4..36399bb875dd 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -19,6 +19,7 @@ #include <linux/hdreg.h> #include <linux/completion.h> #include <linux/kobject.h> +#include <linux/refcount.h> #include "dm-stats.h" @@ -38,7 +39,7 @@ */ struct dm_dev_internal { struct list_head list; - atomic_t count; + refcount_t count; struct dm_dev *dm_dev; }; diff --git a/drivers/md/bitmap.c b/drivers/md/md-bitmap.c index 40f3cd7eab0f..239c7bb3929b 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/md-bitmap.c @@ -29,7 +29,7 @@ #include <linux/seq_file.h> #include <trace/events/block.h> #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" static inline char *bmname(struct bitmap *bitmap) { @@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); + bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false); if (!bh) { ret = -ENOMEM; goto out; @@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - sb->state = cpu_to_le32(bitmap->flags); + /* + * clear BITMAP_WRITE_ERROR bit to protect against the case that + * a bitmap write error occurred but the later writes succeeded. + */ + sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); @@ -1816,6 +1820,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) BUG_ON(file && mddev->bitmap_info.offset); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/raid:%s: array with journal cannot have bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return ERR_PTR(-ENOMEM); @@ -2058,6 +2068,11 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, long pages; struct bitmap_page *new_bp; + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + if (chunksize == 0) { /* If there is enough space, leave the chunk size unchanged, * else increase by factor of two until there is enough space. @@ -2147,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, for (k = 0; k < page; k++) { kfree(new_bp[k].map); } + kfree(new_bp); /* restore some fields from old_counts */ bitmap->counts.bp = old_counts.bp; @@ -2197,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, block += old_blocks; } + if (bitmap->counts.bp != old_counts.bp) { + unsigned long k; + for (k = 0; k < old_counts.pages; k++) + if (!old_counts.bp[k].hijacked) + kfree(old_counts.bp[k].map); + kfree(old_counts.bp); + } + if (!init) { int i; while (block < (chunks << chunkshift)) { diff --git a/drivers/md/bitmap.h b/drivers/md/md-bitmap.h index d15721ac07a6..5df35ca90f58 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/md-bitmap.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 * diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 03082e17c65c..79bfbc840385 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -15,7 +15,7 @@ #include <linux/sched.h> #include <linux/raid/md_p.h> #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #define LVB_SIZE 64 @@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } @@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev, s->lo = lo; s->hi = hi; mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) @@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev) /* * return 0 if all the bitmaps have the same sync_size */ -int cluster_check_sync_size(struct mddev *mddev) +static int cluster_check_sync_size(struct mddev *mddev) { int i, rv; bitmap_super_t *sb; @@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = { static int __init cluster_init(void) { - pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); + pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); register_md_cluster_operations(&cluster_ops, THIS_MODULE); return 0; diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 274016177983..c0240708f443 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MD_CLUSTER_H diff --git a/drivers/md/faulty.c b/drivers/md/md-faulty.c index 06a64d5d8c6c..38264b38420f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/md-faulty.c @@ -216,12 +216,12 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) if (failit) { struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - b->bi_bdev = conf->rdev->bdev; + bio_set_dev(b, conf->rdev->bdev); b->bi_private = bio; b->bi_end_io = faulty_fail; bio = b; } else - bio->bi_bdev = conf->rdev->bdev; + bio_set_dev(bio, conf->rdev->bdev); generic_make_request(bio); return true; diff --git a/drivers/md/linear.c b/drivers/md/md-linear.c index 5f1eb9189542..773fc70dced7 100644 --- a/drivers/md/linear.c +++ b/drivers/md/md-linear.c @@ -23,7 +23,7 @@ #include <linux/slab.h> #include <trace/events/block.h> #include "md.h" -#include "linear.h" +#include "md-linear.h" /* * find which device holds a particular offset @@ -275,17 +275,17 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio = split; } - bio->bi_bdev = tmp_dev->rdev->bdev; + bio_set_dev(bio, tmp_dev->rdev->bdev); bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + data_offset; if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + !blk_queue_discard(bio->bi_disk->queue))) { /* Just ignore it */ bio_endio(bio); } else { if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + trace_block_bio_remap(bio->bi_disk->queue, bio, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); diff --git a/drivers/md/linear.h b/drivers/md/md-linear.h index 8d392e6098b3..8381d651d4ed 100644 --- a/drivers/md/linear.h +++ b/drivers/md/md-linear.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINEAR_H #define _LINEAR_H diff --git a/drivers/md/multipath.c b/drivers/md/md-multipath.c index 23a162ba6c56..e40065bdbfc8 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/md-multipath.c @@ -25,7 +25,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include "md.h" -#include "multipath.h" +#include "md-multipath.h" #define MAX_WORK_PER_DISK 128 @@ -134,7 +134,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) __bio_clone_fast(&mp_bh->bio, bio); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; - mp_bh->bio.bi_bdev = multipath->rdev->bdev; + bio_set_dev(&mp_bh->bio, multipath->rdev->bdev); mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; @@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf) static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; - struct request_queue *q; int err = -EEXIST; int path; struct multipath_info *p; @@ -257,7 +256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { - q = rdev->bdev->bd_disk->queue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -345,17 +343,17 @@ static void multipathd(struct md_thread *thread) if ((mp_bh->path = multipath_map (conf))<0) { pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", - bdevname(bio->bi_bdev,b), + bio_devname(bio, b), (unsigned long long)bio->bi_iter.bi_sector); multipath_end_bh_io(mp_bh, BLK_STS_IOERR); } else { pr_err("multipath: %s: redirecting sector %llu to another IO path\n", - bdevname(bio->bi_bdev,b), + bio_devname(bio, b), (unsigned long long)bio->bi_iter.bi_sector); *bio = *(mp_bh->master_bio); bio->bi_iter.bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; - bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; + bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; diff --git a/drivers/md/multipath.h b/drivers/md/md-multipath.h index 717c60f62898..0adb941f485a 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/md-multipath.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MULTIPATH_H #define _MULTIPATH_H diff --git a/drivers/md/md.c b/drivers/md/md.c index 8cdca0296749..41c050b59ec4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,7 +69,7 @@ #include <trace/events/block.h> #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #ifndef MODULE @@ -266,6 +266,52 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (mddev->suspended) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + +void md_handle_request(struct mddev *mddev, struct bio *bio) +{ +check_suspended: + rcu_read_lock(); + if (is_suspended(mddev, bio)) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!is_suspended(mddev, bio)) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} +EXPORT_SYMBOL(md_handle_request); + static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); @@ -285,23 +331,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } -check_suspended: - rcu_read_lock(); - if (mddev->suspended) { - DEFINE_WAIT(__wait); - for (;;) { - prepare_to_wait(&mddev->sb_wait, &__wait, - TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) - break; - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - } - finish_wait(&mddev->sb_wait, &__wait); - } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); /* * save the sectors now since our bio can @@ -310,20 +339,14 @@ check_suspended: sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - if (!mddev->pers->make_request(mddev, bio)) { - atomic_dec(&mddev->active_io); - wake_up(&mddev->sb_wait); - goto check_suspended; - } + + md_handle_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); part_stat_unlock(); - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); - return BLK_QC_T_NONE; } @@ -336,12 +359,17 @@ check_suspended: void mddev_suspend(struct mddev *mddev) { WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; synchronize_rcu(); wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + smp_mb__after_atomic(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); del_timer_sync(&mddev->safemode_timer); } @@ -349,6 +377,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; wake_up(&mddev->sb_wait); @@ -422,7 +451,7 @@ static void submit_flushes(struct work_struct *ws) bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; - bi->bi_bdev = rdev->bdev; + bio_set_dev(bi, rdev->bdev); bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; atomic_inc(&mddev->flush_pending); submit_bio(bi); @@ -439,16 +468,22 @@ static void md_submit_flush_data(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); } - - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); } void md_flush_request(struct mddev *mddev, struct bio *bio) @@ -506,7 +541,7 @@ static void mddev_put(struct mddev *mddev) bioset_free(sync_bs); } -static void md_safemode_timeout(unsigned long data); +static void md_safemode_timeout(struct timer_list *t); void mddev_init(struct mddev *mddev) { @@ -515,8 +550,7 @@ void mddev_init(struct mddev *mddev) mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); - setup_timer(&mddev->safemode_timer, md_safemode_timeout, - (unsigned long) mddev); + timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); @@ -649,6 +683,7 @@ void mddev_unlock(struct mddev *mddev) */ spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -772,7 +807,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); - bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; + bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev); bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -803,8 +838,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct bio *bio = md_bio_alloc_sync(rdev->mddev); int ret; - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? - rdev->meta_bdev : rdev->bdev; + if (metadata_op && rdev->meta_bdev) + bio_set_dev(bio, rdev->meta_bdev); + else + bio_set_dev(bio, rdev->bdev); bio_set_op_attrs(bio, op, op_flags); if (metadata_op) bio->bi_iter.bi_sector = sector + rdev->sb_start; @@ -1536,7 +1573,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if ((le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); rdev->ppl.size = le16_to_cpu(sb->ppl.size); rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; @@ -1655,10 +1693,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && + (le32_to_cpu(sb->feature_map) & + MD_FEATURE_MULTIPLE_PPLS)) + return -EINVAL; set_bit(MD_HAS_PPL, &mddev->flags); } } else if (mddev->pers == NULL) { @@ -1875,7 +1918,11 @@ retry: sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); if (test_bit(MD_HAS_PPL, &mddev->flags)) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); + else + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); sb->ppl.size = cpu_to_le16(rdev->ppl.size); } @@ -2287,7 +2334,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); + lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); @@ -2406,10 +2453,18 @@ repeat: } } - /* First make sure individual recovery_offsets are correct */ + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * that device addresses. + */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) @@ -2625,7 +2680,7 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ","; size_t len = 0; - unsigned long flags = ACCESS_ONCE(rdev->flags); + unsigned long flags = READ_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || (!test_bit(ExternalBbl, &flags) && @@ -4283,6 +4338,8 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) if (err) export_rdev(rdev); mddev_unlock(mddev); + if (!err) + md_new_event(mddev); return err ? err : len; } @@ -4796,7 +4853,7 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4812,16 +4869,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers == NULL || mddev->pers->quiesce == NULL) goto unlock; - old = mddev->suspend_lo; + mddev_suspend(mddev); mddev->suspend_lo = new; - if (new >= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -4839,7 +4890,7 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4852,19 +4903,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) + if (mddev->pers == NULL) goto unlock; - old = mddev->suspend_hi; + + mddev_suspend(mddev); mddev->suspend_hi = new; - if (new <= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -5329,7 +5374,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) return NULL; } -static int add_named_array(const char *val, struct kernel_param *kp) +static int add_named_array(const char *val, const struct kernel_param *kp) { /* * val must be "md_*" or "mdNNN". @@ -5358,9 +5403,9 @@ static int add_named_array(const char *val, struct kernel_param *kp) return -EINVAL; } -static void md_safemode_timeout(unsigned long data) +static void md_safemode_timeout(struct timer_list *t) { - struct mddev *mddev = (struct mddev *) data; + struct mddev *mddev = from_timer(mddev, t, safemode_timer); mddev->safemode = 1; if (mddev->external) @@ -5806,8 +5851,14 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->bio_set) + if (mddev->bio_set) { bioset_free(mddev->bio_set); + mddev->bio_set = NULL; + } + if (mddev->sync_set) { + bioset_free(mddev->sync_set); + mddev->sync_set = NULL; + } } EXPORT_SYMBOL_GPL(md_stop); @@ -6334,7 +6385,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) break; } } - if (has_journal) { + if (has_journal || mddev->bitmap) { export_rdev(rdev); return -EBUSY; } @@ -6590,22 +6641,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); } else err = PTR_ERR(bitmap); - } - if (fd < 0 || err) { + if (err) { + bitmap_destroy(mddev); + fd = -1; + } + mddev_resume(mddev); + } else if (fd < 0) { + mddev_suspend(mddev); bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + mddev_resume(mddev); } - mddev->pers->quiesce(mddev, 0); } if (fd < 0) { struct file *f = mddev->bitmap_info.file; @@ -6707,7 +6762,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; @@ -6889,8 +6944,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); bitmap = bitmap_create(mddev, -1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); @@ -6898,7 +6953,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -6921,9 +6976,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); } - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } @@ -7440,8 +7495,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) - wake_up(&thread->wqueue); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); @@ -7836,7 +7891,7 @@ static const struct file_operations md_seq_fops = { .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release, .poll = mdstat_poll, }; @@ -7996,7 +8051,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ - if (mddev->in_sync || !mddev->sync_checkers) { + if (mddev->in_sync || mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; @@ -8011,7 +8066,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || + mddev->suspended); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; @@ -8082,7 +8138,6 @@ void md_allow_write(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); @@ -8449,16 +8504,19 @@ void md_do_sync(struct md_thread *thread) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } } } skip: @@ -8656,6 +8714,9 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; + if (!mddev->external && mddev->safemode == 1) + mddev->safemode = 0; + if (mddev->ro) { struct md_rdev *rdev; if (!mddev->external && mddev->in_sync) @@ -8782,6 +8843,16 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); + } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); @@ -9243,11 +9314,11 @@ static __exit void md_exit(void) subsys_initcall(md_init); module_exit(md_exit) -static int get_ro(char *buffer, struct kernel_param *kp) +static int get_ro(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%d", start_readonly); } -static int set_ro(const char *val, struct kernel_param *kp) +static int set_ro(const char *val, const struct kernel_param *kp) { return kstrtouint(val, 10, (unsigned int *)&start_readonly); } diff --git a/drivers/md/md.h b/drivers/md/md.h index b50eb4ac1b82..7d6bcf0eba0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -236,6 +236,13 @@ enum mddev_flags { * never cause the array to become failed. */ MD_HAS_PPL, /* The raid array has PPL feature set */ + MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ + MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update + * the metadata without taking reconfig_mutex. + */ + MD_UPDATING_SB, /* md_check_recovery is updating the metadata + * without explicitly holding reconfig_mutex. + */ }; enum mddev_sb_flags { @@ -493,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev) mutex_lock(&mddev->reconfig_mutex); } -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); @@ -509,6 +511,11 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); } +static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bio->bi_disk->sync_io); +} + struct md_personality { char *name; @@ -532,12 +539,11 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved + /* quiesce suspends or resumes internal processing. + * 1 - stop new actions and wait for action io to complete + * 0 - return to normal behaviour */ - void (*quiesce) (struct mddev *mddev, int state); + void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. @@ -686,6 +692,7 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); +extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, @@ -721,68 +728,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + !bio->bi_disk->queue->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + !bio->bi_disk->queue->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } - -/* Maximum size of each resync request */ -#define RESYNC_BLOCK_SIZE (64*1024) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) - -/* for managing resync I/O pages */ -struct resync_pages { - unsigned idx; /* for get/put page from the pool */ - void *raid_bio; - struct page *pages[RESYNC_PAGES]; -}; - -static inline int resync_alloc_pages(struct resync_pages *rp, - gfp_t gfp_flags) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) { - rp->pages[i] = alloc_page(gfp_flags); - if (!rp->pages[i]) - goto out_free; - } - - return 0; - -out_free: - while (--i >= 0) - put_page(rp->pages[i]); - return -ENOMEM; -} - -static inline void resync_free_pages(struct resync_pages *rp) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) - put_page(rp->pages[i]); -} - -static inline void resync_get_all_pages(struct resync_pages *rp) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) - get_page(rp->pages[i]); -} - -static inline struct page *resync_fetch_page(struct resync_pages *rp, - unsigned idx) -{ - if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) - return NULL; - return rp->pages[idx]; -} #endif /* _MD_MD_H */ diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index ff528792c358..66be7c66479a 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o dm-persistent-data-objs := \ dm-array.o \ diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 4aed69d9dd17..aec449243966 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -11,6 +11,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/device-mapper.h> +#include <linux/kernel.h> #define DM_MSG_PREFIX "space map metadata" @@ -111,7 +112,7 @@ static bool brb_empty(struct bop_ring_buffer *brb) static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) { unsigned r = old + 1; - return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; + return r >= ARRAY_SIZE(brb->bops) ? 0 : r; } static int brb_push(struct bop_ring_buffer *brb, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 94d9ae9b0fd0..5ecba9eef441 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -30,7 +30,8 @@ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ (1L << MD_FAILFAST_SUPPORTED) |\ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) static int raid0_congested(struct mddev *mddev, int bits) { @@ -539,6 +540,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); + bio_clone_blkcg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), @@ -588,14 +590,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) zone = find_zone(mddev->private, §or); tmp_dev = map_sector(mddev, zone, sector, §or); - bio->bi_bdev = tmp_dev->bdev; + bio_set_dev(bio, tmp_dev->bdev); bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, disk_devt(mddev->gendisk), - bio_sector); + trace_block_bio_remap(bio->bi_disk->queue, bio, + disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); generic_make_request(bio); @@ -767,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev) return ERR_PTR(-EINVAL); } -static void raid0_quiesce(struct mddev *mddev, int state) +static void raid0_quiesce(struct mddev *mddev, int quiesce) { } diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 7127a623f5da..540e65d92642 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID0_H #define _RAID0_H diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c new file mode 100644 index 000000000000..400001b815db --- /dev/null +++ b/drivers/md/raid1-10.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Maximum size of each resync request */ +#define RESYNC_BLOCK_SIZE (64*1024) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) + +/* for managing resync I/O pages */ +struct resync_pages { + void *raid_bio; + struct page *pages[RESYNC_PAGES]; +}; + +static inline int resync_alloc_pages(struct resync_pages *rp, + gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) { + rp->pages[i] = alloc_page(gfp_flags); + if (!rp->pages[i]) + goto out_free; + } + + return 0; + +out_free: + while (--i >= 0) + put_page(rp->pages[i]); + return -ENOMEM; +} + +static inline void resync_free_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + put_page(rp->pages[i]); +} + +static inline void resync_get_all_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + get_page(rp->pages[i]); +} + +static inline struct page *resync_fetch_page(struct resync_pages *rp, + unsigned idx) +{ + if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) + return NULL; + return rp->pages[idx]; +} + +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + +/* generally called after bio_reset() for reseting bvec */ +static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, + int size) +{ + int idx = 0; + + /* initialize bvec table again */ + do { + struct page *page = resync_fetch_page(rp, idx); + int len = min_t(int, size, PAGE_SIZE); + + /* + * won't fail because the vec table is big + * enough to hold all these pages + */ + bio_add_page(bio, page, len, 0); + size -= len; + } while (idx++ < RESYNC_PAGES && size > 0); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3febfc8391fb..cc9d337a1ed3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,18 +37,18 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> -#include <linux/sched/signal.h> #include <trace/events/block.h> #include "md.h" #include "raid1.h" -#include "bitmap.h" +#include "md-bitmap.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) /* * Number of guaranteed r1bios in case of extreme VM load: @@ -81,14 +81,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) -/* - * 'strct resync_pages' stores actual pages used for doing the resync - * IO, and it is per-bio, so make .bi_private points to it. - */ -static inline struct resync_pages *get_resync_pages(struct bio *bio) -{ - return bio->bi_private; -} +#include "raid1-10.c" /* * for resync bio, r1bio pointer can be retrieved from the per-bio @@ -170,7 +163,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) resync_get_all_pages(rp); } - rp->idx = 0; rp->raid_bio = r1_bio; bio->bi_private = rp; } @@ -492,10 +484,6 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - /* we release behind master bio when all write are done */ - if (r1_bio->behind_master_bio == bio) - to_put = NULL; - if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -798,14 +786,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void *)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1002,14 +989,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr) _wait_barrier(conf, idx); } -static void wait_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _wait_barrier(conf, idx); -} - static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); @@ -1023,14 +1002,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr) _allow_barrier(conf, idx); } -static void allow_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _allow_barrier(conf, idx); -} - /* conf->resync_lock should be held */ static int get_unqueued_pending(struct r1conf *conf) { @@ -1088,7 +1059,7 @@ static void unfreeze_array(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, +static void alloc_behind_master_bio(struct r1bio *r1_bio, struct bio *bio) { int size = bio->bi_iter.bi_size; @@ -1098,11 +1069,13 @@ static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev); if (!behind_bio) - goto fail; + return; /* discard op, we don't support writezero/writesame yet */ - if (!bio_has_data(bio)) + if (!bio_has_data(bio)) { + behind_bio->bi_iter.bi_size = size; goto skip_copy; + } while (i < vcnt && size) { struct page *page; @@ -1123,14 +1096,13 @@ skip_copy: r1_bio->behind_master_bio = behind_bio;; set_bit(R1BIO_BehindIO, &r1_bio->state); - return behind_bio; + return; free_pages: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_iter.bi_size); bio_free_pages(behind_bio); -fail: - return behind_bio; + bio_put(behind_bio); } struct raid1_plug_cb { @@ -1285,7 +1257,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; + bio_set_dev(read_bio, mirror->rdev->bdev); read_bio->bi_end_io = raid1_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &mirror->rdev->flags) && @@ -1294,9 +1266,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), - read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); + trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, + disk_devt(mddev->gendisk), r1_bio->sector); generic_make_request(read_bio); } @@ -1315,42 +1286,28 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - - - if ((bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_iter.bi_sector < mddev->suspend_hi) || - (mddev_is_clustered(mddev) && + if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) { + bio->bi_iter.bi_sector, bio_end_sector(bio))) { - /* - * As the suspend_* range is controlled by userspace, we want - * an interruptible wait. - */ DEFINE_WAIT(w); for (;;) { - sigset_t full, old; prepare_to_wait(&conf->wait_barrier, - &w, TASK_INTERRUPTIBLE); - if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi || - (mddev_is_clustered(mddev) && - !md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio))) break; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); } finish_wait(&conf->wait_barrier, &w); } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ wait_barrier(conf, bio->bi_iter.bi_sector); r1_bio = alloc_r1bio(mddev, bio); @@ -1483,7 +1440,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { - mbio = alloc_behind_master_bio(r1_bio, bio); + alloc_behind_master_bio(r1_bio, bio); } bitmap_startwrite(bitmap, r1_bio->sector, @@ -1493,14 +1450,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 0; } - if (!mbio) { - if (r1_bio->behind_master_bio) - mbio = bio_clone_fast(r1_bio->behind_master_bio, - GFP_NOIO, - mddev->bio_set); - else - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - } + if (r1_bio->behind_master_bio) + mbio = bio_clone_fast(r1_bio->behind_master_bio, + GFP_NOIO, mddev->bio_set); + else + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); if (r1_bio->behind_master_bio) { if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) @@ -1511,7 +1465,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_iter.bi_sector = (r1_bio->sector + conf->mirrors[i].rdev->data_offset); - mbio->bi_bdev = conf->mirrors[i].rdev->bdev; + bio_set_dev(mbio, conf->mirrors[i].rdev->bdev); mbio->bi_end_io = raid1_end_write_request; mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && @@ -1523,11 +1477,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&r1_bio->remaining); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + trace_block_bio_remap(mbio->bi_disk->queue, mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)conf->mirrors[i].rdev; + mbio->bi_disk = (void *)conf->mirrors[i].rdev; cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) @@ -1669,8 +1623,12 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_all_barriers(conf); - allow_all_barriers(conf); + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { + _wait_barrier(conf, idx); + _allow_barrier(conf, idx); + } mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; @@ -2005,8 +1963,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) * Don't fail devices as that won't really help. */ pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), + mdname(mddev), bio_devname(bio, b), (unsigned long long)r1_bio->sector); for (d = 0; d < conf->raid_disks * 2; d++) { rdev = conf->mirrors[d].rdev; @@ -2086,10 +2043,7 @@ static void process_checks(struct r1bio *r1_bio) /* Fix variable parts of all bios */ vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); for (i = 0; i < conf->raid_disks * 2; i++) { - int j; - int size; blk_status_t status; - struct bio_vec *bi; struct bio *b = r1_bio->bios[i]; struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) @@ -2098,24 +2052,15 @@ static void process_checks(struct r1bio *r1_bio) status = b->bi_status; bio_reset(b); b->bi_status = status; - b->bi_vcnt = vcnt; - b->bi_iter.bi_size = r1_bio->sectors << 9; b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; - b->bi_bdev = conf->mirrors[i].rdev->bdev; + bio_set_dev(b, conf->mirrors[i].rdev->bdev); b->bi_end_io = end_sync_read; rp->raid_bio = r1_bio; b->bi_private = rp; - size = b->bi_iter.bi_size; - bio_for_each_segment_all(bi, b, j) { - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - } + /* initialize bvec table again */ + md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9); } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && @@ -2366,8 +2311,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio = bio_clone_fast(r1_bio->behind_master_bio, GFP_NOIO, mddev->bio_set); - /* We really need a _all clone */ - wbio->bi_iter = (struct bvec_iter){ 0 }; } else { wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, mddev->bio_set); @@ -2379,7 +2322,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; - wbio->bi_bdev = rdev->bdev; + bio_set_dev(wbio, rdev->bdev); if (submit_bio_wait(wbio) < 0) /* failure! */ @@ -2469,7 +2412,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct mddev *mddev = conf->mddev; struct bio *bio; struct md_rdev *rdev; - dev_t bio_dev; sector_t bio_sector; clear_bit(R1BIO_ReadError, &r1_bio->state); @@ -2483,7 +2425,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) */ bio = r1_bio->bios[r1_bio->read_disk]; - bio_dev = bio->bi_bdev->bd_dev; bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; bio_put(bio); r1_bio->bios[r1_bio->read_disk] = NULL; @@ -2593,6 +2534,23 @@ static int init_resync(struct r1conf *conf) return 0; } +static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) +{ + struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + struct resync_pages *rps; + struct bio *bio; + int i; + + for (i = conf->poolinfo->raid_disks; i--; ) { + bio = r1bio->bios[i]; + rps = bio->bi_private; + bio_reset(bio); + bio->bi_private = rps; + } + r1bio->master_bio = NULL; + return r1bio; +} + /* * perform a "sync" on one "block" * @@ -2619,6 +2577,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ int idx = sector_to_idx(sector_nr); + int page_idx = 0; if (!conf->r1buf_pool) if (init_resync(conf)) @@ -2677,7 +2636,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bitmap_cond_end_sync(mddev->bitmap, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + r1_bio = raid1_alloc_init_r1buf(conf); raise_barrier(conf, sector_nr); @@ -2755,7 +2714,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io) { atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; } @@ -2846,7 +2805,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; rp = get_resync_pages(bio); if (bio->bi_end_io) { - page = resync_fetch_page(rp, rp->idx++); + page = resync_fetch_page(rp, page_idx); /* * won't fail because the vec table is big @@ -2858,7 +2817,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, nr_sectors += len>>9; sector_nr += len>>9; sync_blocks -= (len>>9); - } while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES); + } while (++page_idx < RESYNC_PAGES); r1_bio->sectors = nr_sectors; @@ -2881,7 +2840,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { read_targets--; - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); @@ -2890,7 +2849,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); @@ -3291,21 +3250,14 @@ static int raid1_reshape(struct mddev *mddev) return 0; } -static void raid1_quiesce(struct mddev *mddev, int state) +static void raid1_quiesce(struct mddev *mddev, int quiesce) { struct r1conf *conf = mddev->private; - switch(state) { - case 2: /* wake for suspend */ - wake_up(&conf->wait_barrier); - break; - case 1: + if (quiesce) freeze_array(conf, 0); - break; - case 0: + else unfreeze_array(conf); - break; - } } static void *raid1_takeover(struct mddev *mddev) diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c8894ef1e9d2..c7294e7557e0 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID1_H #define _RAID1_H diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5026e7ad51d3..b9edbc747a95 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -29,7 +29,7 @@ #include "md.h" #include "raid10.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -110,14 +110,7 @@ static void end_reshape(struct r10conf *conf); #define raid10_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) -/* - * 'strct resync_pages' stores actual pages used for doing the resync - * IO, and it is per-bio, so make .bi_private points to it. - */ -static inline struct resync_pages *get_resync_pages(struct bio *bio) -{ - return bio->bi_private; -} +#include "raid1-10.c" /* * for resync bio, r10bio pointer can be retrieved from the per-bio @@ -143,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) /* * When performing a resync, we need to read and compare, so @@ -221,7 +217,6 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) resync_get_all_pages(rp); } - rp->idx = 0; rp->raid_bio = r10_bio; bio->bi_private = rp; if (rbio) { @@ -391,12 +386,11 @@ static void raid10_end_read_request(struct bio *bio) { int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; - int slot, dev; + int slot; struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; - dev = r10_bio->devs[slot].devnum; rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: @@ -756,7 +750,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); - sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; best_dist = MaxSector; @@ -769,8 +762,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, * the resync window. We take the first readable disk when * above the resync window. */ - if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) + if ((conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { @@ -909,14 +905,13 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void*)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1094,14 +1089,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void*)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1210,7 +1204,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); - read_bio->bi_bdev = rdev->bdev; + bio_set_dev(read_bio, rdev->bdev); read_bio->bi_end_io = raid10_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &rdev->flags) && @@ -1219,7 +1213,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r10_bio; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), + trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, disk_devt(mddev->gendisk), r10_bio->sector); generic_make_request(read_bio); @@ -1259,7 +1253,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; + bio_set_dev(mbio, rdev->bdev); mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); if (!replacement && test_bit(FailFast, @@ -1269,11 +1263,11 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_private = r10_bio; if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + trace_block_bio_remap(mbio->bi_disk->queue, mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void *)rdev; + mbio->bi_disk = (void *)rdev; atomic_inc(&r10_bio->remaining); @@ -1303,6 +1297,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sector_t sectors; int max_sectors; + if ((mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { + DEFINE_WAIT(w); + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -2087,8 +2097,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) rp = get_resync_pages(tbio); bio_reset(tbio); - tbio->bi_vcnt = vcnt; - tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; + md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); + rp->raid_bio = r10_bio; tbio->bi_private = rp; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; @@ -2104,7 +2114,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) tbio->bi_opf |= MD_FAILFAST; tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; - tbio->bi_bdev = conf->mirrors[d].rdev->bdev; + bio_set_dev(tbio, conf->mirrors[d].rdev->bdev); generic_make_request(tbio); } @@ -2562,7 +2572,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + choose_data_offset(r10_bio, rdev); - wbio->bi_bdev = rdev->bdev; + bio_set_dev(wbio, rdev->bdev); bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (submit_bio_wait(wbio) < 0) @@ -2585,8 +2595,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - dev_t bio_dev; - sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -2597,8 +2605,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. */ bio = r10_bio->devs[slot].bio; - bio_dev = bio->bi_bdev->bd_dev; - bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; @@ -2808,6 +2814,72 @@ static int init_resync(struct r10conf *conf) return 0; } +static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) +{ + struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + struct rsync_pages *rp; + struct bio *bio; + int nalloc; + int i; + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + for (i = 0; i < nalloc; i++) { + bio = r10bio->devs[i].bio; + rp = bio->bi_private; + bio_reset(bio); + bio->bi_private = rp; + bio = r10bio->devs[i].repl_bio; + if (bio) { + rp = bio->bi_private; + bio_reset(bio); + bio->bi_private = rp; + } + } + return r10bio; +} + +/* + * Set cluster_sync_high since we need other nodes to add the + * range [cluster_sync_low, cluster_sync_high] to suspend list. + */ +static void raid10_set_cluster_sync_high(struct r10conf *conf) +{ + sector_t window_size; + int extra_chunk, chunks; + + /* + * First, here we define "stripe" as a unit which across + * all member devices one time, so we get chunks by use + * raid_disks / near_copies. Otherwise, if near_copies is + * close to raid_disks, then resync window could increases + * linearly with the increase of raid_disks, which means + * we will suspend a really large IO window while it is not + * necessary. If raid_disks is not divisible by near_copies, + * an extra chunk is needed to ensure the whole "stripe" is + * covered. + */ + + chunks = conf->geo.raid_disks / conf->geo.near_copies; + if (conf->geo.raid_disks % conf->geo.near_copies == 0) + extra_chunk = 0; + else + extra_chunk = 1; + window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; + + /* + * At least use a 32M window to align with raid1's resync window + */ + window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? + CLUSTER_RESYNC_WINDOW_SECTORS : window_size; + + conf->cluster_sync_high = conf->cluster_sync_low + window_size; +} + /* * perform a "sync" on one "block" * @@ -2853,6 +2925,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sectors_skipped = 0; int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; + int page_idx = 0; if (!conf->r10buf_pool) if (init_resync(conf)) @@ -2879,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* If we aborted, we need to abort the * sync on the 'current' bitmap chucks (there can * be several when recovering multiple devices). @@ -2959,7 +3035,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that - * have bi_end_io, bi_sector, bi_bdev set, + * have bi_end_io, bi_sector, bi_disk set, * and bi_private set to the r10bio. * For recovery, we may actually create several r10bios * with 2 bios in each, that correspond to the bios in the main one. @@ -3036,7 +3112,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&mreplace->nr_pending); rcu_read_unlock(); - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; raise_barrier(conf, rb2 != NULL); atomic_set(&r10_bio->remaining, 0); @@ -3104,7 +3180,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, from_addr = r10_bio->devs[j].addr; bio->bi_iter.bi_sector = from_addr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); atomic_inc(&rdev->nr_pending); /* and we write to 'i' (if not in_sync) */ @@ -3126,7 +3202,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + mrdev->data_offset; - bio->bi_bdev = mrdev->bdev; + bio_set_dev(bio, mrdev->bdev); atomic_inc(&r10_bio->remaining); } else r10_bio->devs[1].bio->bi_end_io = NULL; @@ -3152,7 +3228,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + mreplace->data_offset; - bio->bi_bdev = mreplace->bdev; + bio_set_dev(bio, mreplace->bdev); atomic_inc(&r10_bio->remaining); break; } @@ -3233,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* resync. Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); + /* + * Since curr_resync_completed could probably not update in + * time, and we will set cluster_sync_low based on it. + * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for + * safety reason, which ensures curr_resync_completed is + * updated in bitmap_cond_end_sync. + */ + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && @@ -3245,7 +3331,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } if (sync_blocks < max_sync) max_sync = sync_blocks; - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; r10_bio->mddev = mddev; @@ -3298,7 +3384,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); count++; rdev = rcu_dereference(conf->mirrors[d].replacement); @@ -3320,7 +3406,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); count++; rcu_read_unlock(); } @@ -3355,7 +3441,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, break; for (bio= biolist ; bio ; bio=bio->bi_next) { struct resync_pages *rp = get_resync_pages(bio); - page = resync_fetch_page(rp, rp->idx++); + page = resync_fetch_page(rp, page_idx); /* * won't fail because the vec table is big enough * to hold all these pages @@ -3364,9 +3450,55 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } nr_sectors += len>>9; sector_nr += len>>9; - } while (get_resync_pages(biolist)->idx < RESYNC_PAGES); + } while (++page_idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* It is resync not recovery */ + if (conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + raid10_set_cluster_sync_high(conf); + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } else if (mddev_is_clustered(mddev)) { + /* This is recovery not resync */ + sector_t sect_va1, sect_va2; + bool broadcast_msg = false; + + for (i = 0; i < conf->geo.raid_disks; i++) { + /* + * sector_nr is a device address for recovery, so we + * need translate it to array address before compare + * with cluster_sync_high. + */ + sect_va1 = raid10_find_virt(conf, sector_nr, i); + + if (conf->cluster_sync_high < sect_va1 + nr_sectors) { + broadcast_msg = true; + /* + * curr_resync_completed is similar as + * sector_nr, so make the translation too. + */ + sect_va2 = raid10_find_virt(conf, + mddev->curr_resync_completed, i); + + if (conf->cluster_sync_low == 0 || + conf->cluster_sync_low > sect_va2) + conf->cluster_sync_low = sect_va2; + } + } + if (broadcast_msg) { + raid10_set_cluster_sync_high(conf); + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } + while (biolist) { bio = biolist; biolist = biolist->bi_next; @@ -3376,7 +3508,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; generic_make_request(bio); } @@ -3626,6 +3758,18 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; + if (mddev_is_clustered(conf->mddev)) { + int fc, fo; + + fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); + if (fc > 1 || fo > 0) { + pr_err("only near layout is supported by clustered" + " raid10\n"); + goto out; + } + } + mddev->thread = conf->thread; conf->thread = NULL; @@ -3814,18 +3958,14 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf); } -static void raid10_quiesce(struct mddev *mddev, int state) +static void raid10_quiesce(struct mddev *mddev, int quiesce) { struct r10conf *conf = mddev->private; - switch(state) { - case 1: + if (quiesce) raise_barrier(conf, 0); - break; - case 0: + else lower_barrier(conf); - break; - } } static int raid10_resize(struct mddev *mddev, sector_t sectors) @@ -4369,7 +4509,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, read_more: /* Now schedule reads for blocks from sector_nr to last */ - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; raise_barrier(conf, sectors_done != 0); atomic_set(&r10_bio->remaining, 0); @@ -4392,7 +4532,7 @@ read_more: read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); - read_bio->bi_bdev = rdev->bdev; + bio_set_dev(read_bio, rdev->bdev); read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; @@ -4426,7 +4566,7 @@ read_more: if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; - b->bi_bdev = rdev2->bdev; + bio_set_dev(b, rdev2->bdev); b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; b->bi_end_io = end_reshape_write; @@ -4458,7 +4598,7 @@ read_more: r10_bio->sectors = nr_sectors; /* Now submit the read */ - md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); + md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; generic_make_request(read_bio); @@ -4520,7 +4660,7 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) } atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - md_sync_acct(b->bi_bdev, r10_bio->sectors); + md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; generic_make_request(b); @@ -4560,15 +4700,18 @@ static int handle_reshape_read_error(struct mddev *mddev, /* Use sync reads to get the blocks from somewhere else */ int sectors = r10_bio->sectors; struct r10conf *conf = mddev->private; - struct { - struct r10bio r10_bio; - struct r10dev devs[conf->copies]; - } on_stack; - struct r10bio *r10b = &on_stack.r10_bio; + struct r10bio *r10b; int slot = 0; int idx = 0; struct page **pages; + r10b = kmalloc(sizeof(*r10b) + + sizeof(struct r10dev) * conf->copies, GFP_NOIO); + if (!r10b) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return -ENOMEM; + } + /* reshape IOs share pages from .devs[0].bio */ pages = get_resync_pages(r10_bio->devs[0].bio)->pages; @@ -4617,11 +4760,13 @@ static int handle_reshape_read_error(struct mddev *mddev, /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + kfree(r10b); return -EIO; } sectors -= s; idx++; } + kfree(r10b); return 0; } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 735ce1a3d260..db2ac22ac1b4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID10_H #define _RAID10_H @@ -88,6 +89,12 @@ struct r10conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* + * Keep track of cluster resync window to send to other nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; }; /* diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bfa1e907c472..f1c86d938502 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -23,7 +23,7 @@ #include <linux/types.h> #include "md.h" #include "raid5.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" /* @@ -236,9 +236,10 @@ struct r5l_io_unit { bool need_split_bio; struct bio *split_bio; - unsigned int has_flush:1; /* include flush request */ - unsigned int has_fua:1; /* include fua request */ - unsigned int has_null_flush:1; /* include empty flush request */ + unsigned int has_flush:1; /* include flush request */ + unsigned int has_fua:1; /* include fua request */ + unsigned int has_null_flush:1; /* include null flush request */ + unsigned int has_flush_payload:1; /* include flush payload */ /* * io isn't sent yet, flush/fua request can only be submitted till it's * the first IO in running_ios list @@ -538,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -554,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -571,6 +572,8 @@ static void r5l_log_endio(struct bio *bio) struct r5l_io_unit *io_deferred; struct r5l_log *log = io->log; unsigned long flags; + bool has_null_flush; + bool has_flush_payload; if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); @@ -580,6 +583,16 @@ static void r5l_log_endio(struct bio *bio) spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + + /* + * if the io doesn't not have null_flush or flush payload, + * it is not safe to access it after releasing io_list_lock. + * Therefore, it is necessary to check the condition with + * the lock held. + */ + has_null_flush = io->has_null_flush; + has_flush_payload = io->has_flush_payload; + if (log->need_cache_flush && !list_empty(&io->stripe_list)) r5l_move_to_end_ios(log); else @@ -600,19 +613,23 @@ static void r5l_log_endio(struct bio *bio) if (log->need_cache_flush) md_wakeup_thread(log->rdev->mddev->thread); - if (io->has_null_flush) { + /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ + if (has_null_flush) { struct bio *bi; WARN_ON(bio_list_empty(&io->flush_barriers)); while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { bio_endio(bi); - atomic_dec(&io->pending_stripe); + if (atomic_dec_and_test(&io->pending_stripe)) { + __r5l_stripe_write_finished(io); + return; + } } } - - /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ - if (atomic_read(&io->pending_stripe) == 0) - __r5l_stripe_write_finished(io); + /* decrease pending_stripe for flush payload */ + if (has_flush_payload) + if (atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); } static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) @@ -676,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work) struct r5l_log *log = container_of(work, struct r5l_log, disable_writeback_work); struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int locked = 0; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -684,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - - mddev_suspend(mddev); - log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); + conf->log == NULL || + (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && + (locked = mddev_trylock(mddev)))); + if (locked) { + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + mddev_unlock(mddev); + } } static void r5l_submit_current_io(struct r5l_log *log) @@ -728,7 +751,7 @@ static struct bio *r5l_bio_alloc(struct r5l_log *log) struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio->bi_bdev = log->rdev->bdev; + bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; return bio; @@ -881,6 +904,11 @@ static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) payload->size = cpu_to_le32(sizeof(__le64)); payload->flush_stripes[0] = cpu_to_le64(sect); io->meta_offset += meta_size; + /* multiple flush payloads count as one pending_stripe */ + if (!io->has_flush_payload) { + io->has_flush_payload = 1; + atomic_inc(&io->pending_stripe); + } mutex_unlock(&log->io_mutex); } @@ -1172,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log) { struct stripe_head *sh; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); if (!list_empty(&log->no_mem_stripes)) { sh = list_first_entry(&log->no_mem_stripes, @@ -1188,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) struct r5l_io_unit *io, *next; bool found = false; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { /* don't change list order */ @@ -1291,7 +1319,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) if (!do_flush) return; bio_reset(&log->flush_bio); - log->flush_bio.bi_bdev = log->rdev->bdev; + bio_set_dev(&log->flush_bio, log->rdev->bdev); log->flush_bio.bi_end_io = r5l_log_flush_endio; log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); @@ -1360,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) * raid5_release_stripe() while holding conf->device_lock */ BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); list_del_init(&sh->lru); atomic_inc(&sh->count); @@ -1387,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) int count; struct stripe_head *sh, *next; - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); if (!conf->log) return; @@ -1561,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) md_wakeup_thread(log->reclaim_thread); } -void r5l_quiesce(struct r5l_log *log, int state) +void r5l_quiesce(struct r5l_log *log, int quiesce) { struct mddev *mddev; - if (!log || state == 2) + if (!log) return; - if (state == 0) - kthread_unpark(log->reclaim_thread->tsk); - else if (state == 1) { + + if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); kthread_park(log->reclaim_thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); - } + } else + kthread_unpark(log->reclaim_thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) @@ -1669,7 +1697,7 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, sector_t offset) { bio_reset(ctx->ra_bio); - ctx->ra_bio->bi_bdev = log->rdev->bdev; + bio_set_dev(ctx->ra_bio, log->rdev->bdev); bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; @@ -2507,11 +2535,18 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; int ret; - if (!conf->log) + ret = mddev_lock(mddev); + if (ret) + return ret; + + conf = mddev->private; + if (!conf || !conf->log) { + mddev_unlock(mddev); return 0; + } switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2529,6 +2564,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } + mddev_unlock(mddev); return ret; } @@ -2540,23 +2576,32 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) */ int r5c_journal_mode_set(struct mddev *mddev, int mode) { - struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; - - if (!log) - return -ENODEV; + struct r5conf *conf; + int err; if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || mode > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf || !conf->log) { + mddev_unlock(mddev); + return -ENODEV; + } + if (raid5_calc_degraded(conf) > 0 && - mode == R5C_JOURNAL_MODE_WRITE_BACK) + mode == R5C_JOURNAL_MODE_WRITE_BACK) { + mddev_unlock(mddev); return -EINVAL; + } mddev_suspend(mddev); conf->log->r5c_journal_mode = mode; mddev_resume(mddev); + mddev_unlock(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", mdname(mddev), mode, r5c_journal_mode_str[mode]); @@ -3126,6 +3171,8 @@ void r5l_exit_log(struct r5conf *conf) conf->log = NULL; synchronize_rcu(); + /* Ensure disable_writeback_work wakes up and exits */ + wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 328d67aedda4..284578b0a349 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID5_LOG_H #define _RAID5_LOG_H @@ -8,7 +9,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log); extern void r5l_flush_stripe_to_raid(struct r5l_log *log); extern void r5l_stripe_write_finished(struct stripe_head *sh); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int state); +extern void r5l_quiesce(struct r5l_log *log, int quiesce); extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5c_is_writeback(struct r5l_log *log); extern int diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 44ad5baf3206..628c0bf7b9fd 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -87,6 +87,8 @@ * The current io_unit accepting new stripes is always at the end of the list. */ +#define PPL_SPACE_SIZE (128 * 1024) + struct ppl_conf { struct mddev *mddev; @@ -122,6 +124,10 @@ struct ppl_log { * always at the end of io_list */ spinlock_t io_list_lock; struct list_head io_list; /* all io_units of this log */ + + sector_t next_io_sector; + unsigned int entry_space; + bool use_multippl; }; #define PPL_IO_INLINE_BVECS 32 @@ -264,13 +270,12 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) int i; sector_t data_sector = 0; int data_disks = 0; - unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); /* check if current io_unit is full */ - if (io && (io->pp_size == entry_space || + if (io && (io->pp_size == log->entry_space || io->entries_count == PPL_HDR_MAX_ENTRIES)) { pr_debug("%s: add io_unit blocked by seq: %llu\n", __func__, io->seq); @@ -415,7 +420,7 @@ static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", __func__, io->seq, bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector, - bdevname(bio->bi_bdev, b)); + bio_devname(bio, b)); submit_bio(bio); } @@ -451,12 +456,25 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) pplhdr->entries_count = cpu_to_le32(io->entries_count); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + /* Rewind the buffer if current PPL is larger then remaining space */ + if (log->use_multippl && + log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector < + (PPL_HEADER_SIZE + io->pp_size) >> 9) + log->next_io_sector = log->rdev->ppl.sector; + + bio->bi_end_io = ppl_log_endio; bio->bi_opf = REQ_OP_WRITE | REQ_FUA; - bio->bi_bdev = log->rdev->bdev; - bio->bi_iter.bi_sector = log->rdev->ppl.sector; + bio_set_dev(bio, log->rdev->bdev); + bio->bi_iter.bi_sector = log->next_io_sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + pr_debug("%s: log->current_io_sector: %llu\n", __func__, + (unsigned long long)log->next_io_sector); + + if (log->use_multippl) + log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; + list_for_each_entry(sh, &io->stripe_list, log_list) { /* entries for full stripe writes have no partial parity */ if (test_bit(STRIPE_FULL_WRITE, &sh->state)) @@ -468,7 +486,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, ppl_conf->bs); bio->bi_opf = prev->bi_opf; - bio->bi_bdev = prev->bi_bdev; + bio_copy_dev(bio, prev); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); @@ -740,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)sector); rdev = conf->disks[dd_idx].rdev; - if (!rdev) { + if (!rdev || (!test_bit(In_sync, &rdev->flags) && + sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", __func__, indent, "", dd_idx); update_parity = false; @@ -813,12 +832,14 @@ out: return ret; } -static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr) +static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, + sector_t offset) { struct ppl_conf *ppl_conf = log->ppl_conf; struct md_rdev *rdev = log->rdev; struct mddev *mddev = rdev->mddev; - sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9); + sector_t ppl_sector = rdev->ppl.sector + offset + + (PPL_HEADER_SIZE >> 9); struct page *page; int i; int ret = 0; @@ -902,6 +923,9 @@ static int ppl_write_empty_header(struct ppl_log *log) return -ENOMEM; pplhdr = page_address(page); + /* zero out PPL space to avoid collision with old PPLs */ + blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector, + log->rdev->ppl.size, GFP_NOIO, 0); memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); @@ -922,63 +946,110 @@ static int ppl_load_distributed(struct ppl_log *log) struct ppl_conf *ppl_conf = log->ppl_conf; struct md_rdev *rdev = log->rdev; struct mddev *mddev = rdev->mddev; - struct page *page; - struct ppl_header *pplhdr; + struct page *page, *page2, *tmp; + struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL; u32 crc, crc_stored; u32 signature; - int ret = 0; + int ret = 0, i; + sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0; pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk); - - /* read PPL header */ + /* read PPL headers, find the recent one */ page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; - if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, - PAGE_SIZE, page, REQ_OP_READ, 0, false)) { - md_error(mddev, rdev); - ret = -EIO; - goto out; + page2 = alloc_page(GFP_KERNEL); + if (!page2) { + __free_page(page); + return -ENOMEM; } - pplhdr = page_address(page); - /* check header validity */ - crc_stored = le32_to_cpu(pplhdr->checksum); - pplhdr->checksum = 0; - crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + /* searching ppl area for latest ppl */ + while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) { + if (!sync_page_io(rdev, + rdev->ppl.sector - rdev->data_offset + + pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ, + 0, false)) { + md_error(mddev, rdev); + ret = -EIO; + /* if not able to read - don't recover any PPL */ + pplhdr = NULL; + break; + } + pplhdr = page_address(page); + + /* check header validity */ + crc_stored = le32_to_cpu(pplhdr->checksum); + pplhdr->checksum = 0; + crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + + if (crc_stored != crc) { + pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n", + __func__, crc_stored, crc, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } - if (crc_stored != crc) { - pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n", - __func__, crc_stored, crc); - ppl_conf->mismatch_count++; - goto out; - } + signature = le32_to_cpu(pplhdr->signature); - signature = le32_to_cpu(pplhdr->signature); + if (mddev->external) { + /* + * For external metadata the header signature is set and + * validated in userspace. + */ + ppl_conf->signature = signature; + } else if (ppl_conf->signature != signature) { + pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n", + __func__, signature, ppl_conf->signature, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } - if (mddev->external) { - /* - * For external metadata the header signature is set and - * validated in userspace. - */ - ppl_conf->signature = signature; - } else if (ppl_conf->signature != signature) { - pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n", - __func__, signature, ppl_conf->signature); - ppl_conf->mismatch_count++; - goto out; + if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) > + le64_to_cpu(pplhdr->generation)) { + /* previous was newest */ + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + prev_pplhdr_offset = pplhdr_offset; + prev_pplhdr = pplhdr; + + tmp = page; + page = page2; + page2 = tmp; + + /* calculate next potential ppl offset */ + for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) + pplhdr_offset += + le32_to_cpu(pplhdr->entries[i].pp_size) >> 9; + pplhdr_offset += PPL_HEADER_SIZE >> 9; } + /* no valid ppl found */ + if (!pplhdr) + ppl_conf->mismatch_count++; + else + pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n", + __func__, (unsigned long long)pplhdr_offset, + le64_to_cpu(pplhdr->generation)); + /* attempt to recover from log if we are starting a dirty array */ - if (!mddev->pers && mddev->recovery_cp != MaxSector) - ret = ppl_recover(log, pplhdr); -out: + if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + ret = ppl_recover(log, pplhdr, pplhdr_offset); + /* write empty header if we are starting the array */ if (!ret && !mddev->pers) ret = ppl_write_empty_header(log); __free_page(page); + __free_page(page2); pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", __func__, ret, ppl_conf->mismatch_count, @@ -1031,6 +1102,7 @@ static int ppl_load(struct ppl_conf *ppl_conf) static void __ppl_exit_log(struct ppl_conf *ppl_conf) { clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags); kfree(ppl_conf->child_logs); @@ -1099,6 +1171,22 @@ static int ppl_validate_rdev(struct md_rdev *rdev) return 0; } +static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) +{ + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + + PPL_HEADER_SIZE) * 2) { + log->use_multippl = true; + set_bit(MD_HAS_MULTIPLE_PPLS, + &log->ppl_conf->mddev->flags); + log->entry_space = PPL_SPACE_SIZE; + } else { + log->use_multippl = false; + log->entry_space = (log->rdev->ppl.size << 9) - + PPL_HEADER_SIZE; + } + log->next_io_sector = rdev->ppl.sector; +} + int ppl_init_log(struct r5conf *conf) { struct ppl_conf *ppl_conf; @@ -1196,6 +1284,7 @@ int ppl_init_log(struct r5conf *conf) q = bdev_get_queue(rdev->bdev); if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) need_cache_flush = true; + ppl_init_child_log(log, rdev); } } @@ -1208,8 +1297,7 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && - mddev->recovery_cp == 0 && !mddev->degraded && + } else if (!mddev->pers && mddev->recovery_cp == 0 && ppl_conf->recovered_entries > 0 && ppl_conf->mismatch_count == 0) { /* @@ -1261,6 +1349,7 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) if (!ret) { log->rdev = rdev; ret = ppl_write_empty_header(log); + ppl_init_child_log(log, rdev); } } else { log->rdev = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index aeeb8d6854e2..31dc25e2871a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,7 +55,6 @@ #include <linux/ratelimit.h> #include <linux/nodemask.h> #include <linux/flex_array.h> -#include <linux/sched/signal.h> #include <trace/events/block.h> #include <linux/list_sort.h> @@ -63,7 +62,7 @@ #include "md.h" #include "raid5.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) @@ -494,7 +493,6 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) return 0; } -static void raid5_build_block(struct stripe_head *sh, int i, int previous); static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, struct stripe_head *sh); @@ -530,7 +528,7 @@ retry: WARN_ON(1); } dev->flags = 0; - raid5_build_block(sh, i, previous); + dev->sector = raid5_compute_blocknr(sh, i, previous); } if (read_seqcount_retry(&conf->gen_lock, seq)) goto retry; @@ -812,6 +810,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh spin_unlock(&head->batch_head->batch_lock); goto unlock_out; } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; /* * at this point, head's BATCH_READY could be cleared, but we @@ -819,8 +825,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh */ list_add(&sh->batch_list, &head->batch_list); spin_unlock(&head->batch_head->batch_lock); - - sh->batch_head = head->batch_head; } else { head->batch_head = head; sh->batch_head = head->batch_head; @@ -1096,7 +1100,7 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bi->bi_bdev = rdev->bdev; + bio_set_dev(bi, rdev->bdev); bio_set_op_attrs(bi, op, op_flags); bi->bi_end_io = op_is_write(op) ? raid5_end_write_request @@ -1145,7 +1149,7 @@ again: set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + trace_block_bio_remap(bi->bi_disk->queue, bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); if (should_defer && op_is_write(op)) @@ -1160,7 +1164,7 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - rbi->bi_bdev = rrdev->bdev; + bio_set_dev(rbi, rrdev->bdev); bio_set_op_attrs(rbi, op, op_flags); BUG_ON(!op_is_write(op)); rbi->bi_end_io = raid5_end_write_request; @@ -1193,7 +1197,7 @@ again: if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + trace_block_bio_remap(rbi->bi_disk->queue, rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); if (should_defer && op_is_write(op)) @@ -1813,8 +1817,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { set_bit(R5_UPTODATE, &dev->flags); + if (test_bit(STRIPE_EXPAND_READY, &sh->state)) + set_bit(R5_Expanded, &dev->flags); + } if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -2662,14 +2669,6 @@ static void raid5_end_write_request(struct bio *bi) raid5_release_stripe(sh->batch_head); } -static void raid5_build_block(struct stripe_head *sh, int i, int previous) -{ - struct r5dev *dev = &sh->dev[i]; - - dev->flags = 0; - dev->sector = raid5_compute_blocknr(sh, i, previous); -} - static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; @@ -3381,9 +3380,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); - bio_endio(bi); + bio_io_error(bi); bi = nextbi; } if (bitmap_end) @@ -3403,9 +3401,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); - bio_endio(bi); + bio_io_error(bi); bi = bi2; } @@ -3429,8 +3426,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; - bio_endio(bi); + bio_io_error(bi); bi = nextbi; } } @@ -4611,7 +4607,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)), + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); sh->check_state = head_sh->check_state; @@ -5095,10 +5092,12 @@ static int raid5_congested(struct mddev *mddev, int bits) static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { struct r5conf *conf = mddev->private; - sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bio->bi_iter.bi_sector; unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); + WARN_ON_ONCE(bio->bi_partno); + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); @@ -5234,7 +5233,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; - align_bi->bi_bdev = rdev->bdev; + bio_set_dev(align_bi, rdev->bdev); bio_clear_flag(align_bi, BIO_SEG_VALID); if (is_badblock(rdev, align_bi->bi_iter.bi_sector, @@ -5256,7 +5255,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + trace_block_bio_remap(align_bi->bi_disk->queue, align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); generic_make_request(align_bi); @@ -5685,28 +5684,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; } - if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - raid5_release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - sigset_t full, old; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - do_prepare = true; - } - goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or @@ -5761,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk */ struct r5conf *conf = mddev->private; struct stripe_head *sh; + struct md_rdev *rdev; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -5883,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -5981,6 +5968,14 @@ finish: goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -6075,7 +6070,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n */ rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; @@ -6237,6 +6232,12 @@ static void raid5_do_work(struct work_struct *work) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); + + flush_deferred_bios(conf); + + r5l_flush_stripe_to_raid(conf->log); + + async_tx_issue_pending_all(); blk_finish_plug(&plug); pr_debug("--- raid5worker inactive\n"); @@ -6572,14 +6573,17 @@ static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf; - unsigned long new; + unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; - if (kstrtoul(page, 10, &new)) + if (kstrtouint(page, 10, &new)) + return -EINVAL; + /* 8192 should be big enough */ + if (new > 8192) return -EINVAL; err = mddev_lock(mddev); @@ -7150,6 +7154,13 @@ static int raid5_run(struct mddev *mddev) min_offset_diff = diff; } + if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && + (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { + pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Difficulties arise if the stripe we would write to @@ -7242,6 +7253,7 @@ static int raid5_run(struct mddev *mddev) pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", mdname(mddev)); clear_bit(MD_HAS_PPL, &mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); } if (mddev->private == NULL) @@ -7951,6 +7963,7 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -7958,6 +7971,11 @@ static void end_reshape(struct r5conf *conf) smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; + rdev_for_each(rdev, conf->mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) + rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -8013,16 +8031,12 @@ static void raid5_finish_reshape(struct mddev *mddev) } } -static void raid5_quiesce(struct mddev *mddev, int state) +static void raid5_quiesce(struct mddev *mddev, int quiesce) { struct r5conf *conf = mddev->private; - switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ + if (quiesce) { + /* stop all writes */ lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain @@ -8038,17 +8052,15 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ + } else { + /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); - break; } - r5l_quiesce(conf->log, state); + r5l_quiesce(conf->log, quiesce); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index f6536399677a..2e6123825095 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID5_H #define _RAID5_H |