summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorAlexander Graf <agraf@suse.de>2013-08-29 00:41:59 +0200
committerAlexander Graf <agraf@suse.de>2013-08-29 00:41:59 +0200
commitbf550fc93d9855872a95e69e4002256110d89858 (patch)
tree10876bb4304bffe54c4160a132e7b8de6577ac4e /drivers/md
parent7e48c101e0c53e6095c5f4f5e63d14df50aae8fc (diff)
parentcc2df20c7c4ce594c3e17e9cc260c330646012c8 (diff)
Merge remote-tracking branch 'origin/next' into kvm-ppc-next
Conflicts: mm/Kconfig CMA DMA split and ZSWAP introduction were conflicting, fix up manually.
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig14
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/bcache/bset.c4
-rw-r--r--drivers/md/bitmap.c8
-rw-r--r--drivers/md/dm-bufio.c75
-rw-r--r--drivers/md/dm-cache-target.c4
-rw-r--r--drivers/md/dm-flakey.c2
-rw-r--r--drivers/md/dm-ioctl.c127
-rw-r--r--drivers/md/dm-mpath.c8
-rw-r--r--drivers/md/dm-raid.c76
-rw-r--r--drivers/md/dm-switch.c538
-rw-r--r--drivers/md/dm-table.c35
-rw-r--r--drivers/md/dm-verity.c17
-rw-r--r--drivers/md/dm.c177
-rw-r--r--drivers/md/md.c53
-rw-r--r--drivers/md/md.h8
-rw-r--r--drivers/md/raid0.c1
-rw-r--r--drivers/md/raid1.c7
-rw-r--r--drivers/md/raid10.c83
-rw-r--r--drivers/md/raid5.c6
20 files changed, 999 insertions, 245 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3bfc8f1da9fe..30b426ed744b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -412,4 +412,18 @@ config DM_VERITY
If unsure, say N.
+config DM_SWITCH
+ tristate "Switch target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ ---help---
+ This device-mapper target creates a device that supports an arbitrary
+ mapping of fixed-size regions of I/O across a fixed set of paths.
+ The path used for any specific region can be switched dynamically
+ by sending the target a message.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-switch.
+
+ If unsure, say N.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1439fd4ad9b1..5ef78efc27f2 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
+obj-$(CONFIG_DM_SWITCH) += dm-switch.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index cb4578a327b9..1d27d3af3251 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -395,7 +395,7 @@ void inorder_test(void)
#endif
/*
- * Cacheline/offset <-> bkey pointer arithmatic:
+ * Cacheline/offset <-> bkey pointer arithmetic:
*
* t->tree is a binary search tree in an array; each node corresponds to a key
* in one cacheline in t->set (BSET_CACHELINE bytes).
@@ -404,7 +404,7 @@ void inorder_test(void)
* the binary tree points to; to_inorder() gives us the cacheline, and then
* bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
*
- * cacheline_to_bkey() and friends abstract out all the pointer arithmatic to
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
* make this work.
*
* To construct the bfloat for an arbitrary key we need to know what the key
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a2c75499824..a7fd82133b12 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -2002,9 +2002,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else {
int rv;
if (buf[0] == '+')
- rv = strict_strtoll(buf+1, 10, &offset);
+ rv = kstrtoll(buf+1, 10, &offset);
else
- rv = strict_strtoll(buf, 10, &offset);
+ rv = kstrtoll(buf, 10, &offset);
if (rv)
return rv;
if (offset == 0)
@@ -2139,7 +2139,7 @@ static ssize_t
backlog_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long backlog;
- int rv = strict_strtoul(buf, 10, &backlog);
+ int rv = kstrtoul(buf, 10, &backlog);
if (rv)
return rv;
if (backlog > COUNTER_MAX)
@@ -2165,7 +2165,7 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
unsigned long csize;
if (mddev->bitmap)
return -EBUSY;
- rv = strict_strtoul(buf, 10, &csize);
+ rv = kstrtoul(buf, 10, &csize);
if (rv)
return rv;
if (csize < 512 ||
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0387e05cdb98..5227e079a6e3 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -145,6 +145,7 @@ struct dm_buffer {
unsigned long state;
unsigned long last_accessed;
struct dm_bufio_client *c;
+ struct list_head write_list;
struct bio bio;
struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};
@@ -349,7 +350,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
if (gfp_mask & __GFP_NORETRY)
noio_flag = memalloc_noio_save();
- ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+ ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
if (gfp_mask & __GFP_NORETRY)
memalloc_noio_restore(noio_flag);
@@ -630,7 +631,8 @@ static int do_io_schedule(void *word)
* - Submit our write and don't wait on it. We set B_WRITING indicating
* that there is a write in progress.
*/
-static void __write_dirty_buffer(struct dm_buffer *b)
+static void __write_dirty_buffer(struct dm_buffer *b,
+ struct list_head *write_list)
{
if (!test_bit(B_DIRTY, &b->state))
return;
@@ -639,7 +641,24 @@ static void __write_dirty_buffer(struct dm_buffer *b)
wait_on_bit_lock(&b->state, B_WRITING,
do_io_schedule, TASK_UNINTERRUPTIBLE);
- submit_io(b, WRITE, b->block, write_endio);
+ if (!write_list)
+ submit_io(b, WRITE, b->block, write_endio);
+ else
+ list_add_tail(&b->write_list, write_list);
+}
+
+static void __flush_write_list(struct list_head *write_list)
+{
+ struct blk_plug plug;
+ blk_start_plug(&plug);
+ while (!list_empty(write_list)) {
+ struct dm_buffer *b =
+ list_entry(write_list->next, struct dm_buffer, write_list);
+ list_del(&b->write_list);
+ submit_io(b, WRITE, b->block, write_endio);
+ dm_bufio_cond_resched();
+ }
+ blk_finish_plug(&plug);
}
/*
@@ -655,7 +674,7 @@ static void __make_buffer_clean(struct dm_buffer *b)
return;
wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
- __write_dirty_buffer(b);
+ __write_dirty_buffer(b, NULL);
wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}
@@ -802,7 +821,8 @@ static void __free_buffer_wake(struct dm_buffer *b)
wake_up(&c->free_buffer_wait);
}
-static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
+static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
+ struct list_head *write_list)
{
struct dm_buffer *b, *tmp;
@@ -818,7 +838,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
if (no_wait && test_bit(B_WRITING, &b->state))
return;
- __write_dirty_buffer(b);
+ __write_dirty_buffer(b, write_list);
dm_bufio_cond_resched();
}
}
@@ -853,7 +873,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
* If we are over threshold_buffers, start freeing buffers.
* If we're over "limit_buffers", block until we get under the limit.
*/
-static void __check_watermark(struct dm_bufio_client *c)
+static void __check_watermark(struct dm_bufio_client *c,
+ struct list_head *write_list)
{
unsigned long threshold_buffers, limit_buffers;
@@ -872,7 +893,7 @@ static void __check_watermark(struct dm_bufio_client *c)
}
if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
- __write_dirty_buffers_async(c, 1);
+ __write_dirty_buffers_async(c, 1, write_list);
}
/*
@@ -897,7 +918,8 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
*--------------------------------------------------------------*/
static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
- enum new_flag nf, int *need_submit)
+ enum new_flag nf, int *need_submit,
+ struct list_head *write_list)
{
struct dm_buffer *b, *new_b = NULL;
@@ -924,7 +946,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
goto found_buffer;
}
- __check_watermark(c);
+ __check_watermark(c, write_list);
b = new_b;
b->hold_count = 1;
@@ -992,10 +1014,14 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
int need_submit;
struct dm_buffer *b;
+ LIST_HEAD(write_list);
+
dm_bufio_lock(c);
- b = __bufio_new(c, block, nf, &need_submit);
+ b = __bufio_new(c, block, nf, &need_submit, &write_list);
dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
+
if (!b)
return b;
@@ -1047,6 +1073,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
{
struct blk_plug plug;
+ LIST_HEAD(write_list);
+
BUG_ON(dm_bufio_in_request());
blk_start_plug(&plug);
@@ -1055,7 +1083,15 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
for (; n_blocks--; block++) {
int need_submit;
struct dm_buffer *b;
- b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+ b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
+ &write_list);
+ if (unlikely(!list_empty(&write_list))) {
+ dm_bufio_unlock(c);
+ blk_finish_plug(&plug);
+ __flush_write_list(&write_list);
+ blk_start_plug(&plug);
+ dm_bufio_lock(c);
+ }
if (unlikely(b != NULL)) {
dm_bufio_unlock(c);
@@ -1069,7 +1105,6 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
goto flush_plug;
dm_bufio_lock(c);
}
-
}
dm_bufio_unlock(c);
@@ -1126,11 +1161,14 @@ EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
+ LIST_HEAD(write_list);
+
BUG_ON(dm_bufio_in_request());
dm_bufio_lock(c);
- __write_dirty_buffers_async(c, 0);
+ __write_dirty_buffers_async(c, 0, &write_list);
dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
@@ -1147,8 +1185,13 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
unsigned long buffers_processed = 0;
struct dm_buffer *b, *tmp;
+ LIST_HEAD(write_list);
+
+ dm_bufio_lock(c);
+ __write_dirty_buffers_async(c, 0, &write_list);
+ dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
dm_bufio_lock(c);
- __write_dirty_buffers_async(c, 0);
again:
list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
@@ -1274,7 +1317,7 @@ retry:
BUG_ON(!b->hold_count);
BUG_ON(test_bit(B_READING, &b->state));
- __write_dirty_buffer(b);
+ __write_dirty_buffer(b, NULL);
if (b->hold_count == 1) {
wait_on_bit(&b->state, B_WRITING,
do_io_schedule, TASK_UNINTERRUPTIBLE);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index df44b60e66f2..0df3ec085ebb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -425,6 +425,10 @@ static bool block_size_is_power_of_two(struct cache *cache)
return cache->sectors_per_block_shift >= 0;
}
+/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
+#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+__always_inline
+#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
do_div(b, n);
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 7fcf21cb4ff8..c80a0ec5f126 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -176,7 +176,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
fc = kzalloc(sizeof(*fc), GFP_KERNEL);
if (!fc) {
- ti->error = "Cannot allocate linear context";
+ ti->error = "Cannot allocate context";
return -ENOMEM;
}
fc->start_time = jiffies;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index aa04f0224642..f1b758675ec7 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,6 +36,14 @@ struct hash_cell {
struct dm_table *new_map;
};
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+ int undefined__;
+};
+
struct vers_iter {
size_t param_size;
struct dm_target_versions *vers, *old_vers;
@@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
return -EBUSY;
}
-static void __hash_remove(struct hash_cell *hc)
+static struct dm_table *__hash_remove(struct hash_cell *hc)
{
struct dm_table *table;
+ int srcu_idx;
/* remove from the dev hash */
list_del(&hc->uuid_list);
@@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc)
dm_set_mdptr(hc->md, NULL);
mutex_unlock(&dm_hash_cells_mutex);
- table = dm_get_live_table(hc->md);
- if (table) {
+ table = dm_get_live_table(hc->md, &srcu_idx);
+ if (table)
dm_table_event(table);
- dm_table_put(table);
- }
+ dm_put_live_table(hc->md, srcu_idx);
+ table = NULL;
if (hc->new_map)
- dm_table_destroy(hc->new_map);
+ table = hc->new_map;
dm_put(hc->md);
free_cell(hc);
+
+ return table;
}
static void dm_hash_remove_all(int keep_open_devices)
@@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices)
int i, dev_skipped;
struct hash_cell *hc;
struct mapped_device *md;
+ struct dm_table *t;
retry:
dev_skipped = 0;
@@ -287,10 +299,14 @@ retry:
continue;
}
- __hash_remove(hc);
+ t = __hash_remove(hc);
up_write(&_hash_lock);
+ if (t) {
+ dm_sync_table(md);
+ dm_table_destroy(t);
+ }
dm_put(md);
if (likely(keep_open_devices))
dm_destroy(md);
@@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
struct dm_table *table;
struct mapped_device *md;
unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
+ int srcu_idx;
/*
* duplicate new.
@@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
/*
* Wake up any dm event waiters.
*/
- table = dm_get_live_table(hc->md);
- if (table) {
+ table = dm_get_live_table(hc->md, &srcu_idx);
+ if (table)
dm_table_event(table);
- dm_table_put(table);
- }
+ dm_put_live_table(hc->md, srcu_idx);
if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -620,11 +636,14 @@ static int check_name(const char *name)
* _hash_lock without first calling dm_table_put, because dm_table_destroy
* waits for this dm_table_put and could be called under this lock.
*/
-static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
{
struct hash_cell *hc;
struct dm_table *table = NULL;
+ /* increment rcu count, we don't care about the table pointer */
+ dm_get_live_table(md, srcu_idx);
+
down_read(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
@@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
}
table = hc->new_map;
- if (table)
- dm_table_get(table);
out:
up_read(&_hash_lock);
@@ -643,10 +660,11 @@ out:
}
static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
- struct dm_ioctl *param)
+ struct dm_ioctl *param,
+ int *srcu_idx)
{
return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
- dm_get_inactive_table(md) : dm_get_live_table(md);
+ dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx);
}
/*
@@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
{
struct gendisk *disk = dm_disk(md);
struct dm_table *table;
+ int srcu_idx;
param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
DM_ACTIVE_PRESENT_FLAG);
@@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
param->event_nr = dm_get_event_nr(md);
param->target_count = 0;
- table = dm_get_live_table(md);
+ table = dm_get_live_table(md, &srcu_idx);
if (table) {
if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
if (get_disk_ro(disk))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
}
- dm_table_put(table);
param->flags |= DM_ACTIVE_PRESENT_FLAG;
}
+ dm_put_live_table(md, srcu_idx);
if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
- table = dm_get_inactive_table(md);
+ int srcu_idx;
+ table = dm_get_inactive_table(md, &srcu_idx);
if (table) {
if (!(dm_table_get_mode(table) & FMODE_WRITE))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
- dm_table_put(table);
}
+ dm_put_live_table(md, srcu_idx);
}
}
@@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
struct hash_cell *hc;
struct mapped_device *md;
int r;
+ struct dm_table *t;
down_write(&_hash_lock);
hc = __find_device_hash_cell(param);
@@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
return r;
}
- __hash_remove(hc);
+ t = __hash_remove(hc);
up_write(&_hash_lock);
+ if (t) {
+ dm_sync_table(md);
+ dm_table_destroy(t);
+ }
+
if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param)
old_map = dm_swap_table(md, new_map);
if (IS_ERR(old_map)) {
+ dm_sync_table(md);
dm_table_destroy(new_map);
dm_put(md);
return PTR_ERR(old_map);
@@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param)
param->flags |= DM_UEVENT_GENERATED_FLAG;
}
+ /*
+ * Since dm_swap_table synchronizes RCU, nobody should be in
+ * read-side critical section already.
+ */
if (old_map)
dm_table_destroy(old_map);
@@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
int r = 0;
struct mapped_device *md;
struct dm_table *table;
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
*/
__dev_status(md, param);
- table = dm_get_live_or_inactive_table(md, param);
- if (table) {
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
retrieve_status(table, param, param_size);
- dm_table_put(table);
- }
+ dm_put_live_table(md, srcu_idx);
out:
dm_put(md);
@@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
{
int r;
struct hash_cell *hc;
- struct dm_table *t;
+ struct dm_table *t, *old_map = NULL;
struct mapped_device *md;
struct target_type *immutable_target_type;
@@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
DMWARN("device has been removed from the dev hash table.");
- dm_table_destroy(t);
up_write(&_hash_lock);
+ dm_table_destroy(t);
r = -ENXIO;
goto out;
}
if (hc->new_map)
- dm_table_destroy(hc->new_map);
+ old_map = hc->new_map;
hc->new_map = t;
up_write(&_hash_lock);
@@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
__dev_status(md, param);
out:
+ if (old_map) {
+ dm_sync_table(md);
+ dm_table_destroy(old_map);
+ }
+
dm_put(md);
return r;
@@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
struct mapped_device *md;
+ struct dm_table *old_map = NULL;
down_write(&_hash_lock);
@@ -1312,7 +1349,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
}
if (hc->new_map) {
- dm_table_destroy(hc->new_map);
+ old_map = hc->new_map;
hc->new_map = NULL;
}
@@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
__dev_status(hc->md, param);
md = hc->md;
up_write(&_hash_lock);
+ if (old_map) {
+ dm_sync_table(md);
+ dm_table_destroy(old_map);
+ }
dm_put(md);
return 0;
@@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
__dev_status(md, param);
- table = dm_get_live_or_inactive_table(md, param);
- if (table) {
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
retrieve_deps(table, param, param_size);
- dm_table_put(table);
- }
+ dm_put_live_table(md, srcu_idx);
dm_put(md);
@@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
__dev_status(md, param);
- table = dm_get_live_or_inactive_table(md, param);
- if (table) {
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
retrieve_status(table, param, param_size);
- dm_table_put(table);
- }
+ dm_put_live_table(md, srcu_idx);
dm_put(md);
@@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
struct dm_target_msg *tmsg = (void *) param + param->data_start;
size_t maxlen;
char *result = get_result_buffer(param, param_size, &maxlen);
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
if (r <= 1)
goto out_argv;
- table = dm_get_live_table(md);
+ table = dm_get_live_table(md, &srcu_idx);
if (!table)
- goto out_argv;
+ goto out_table;
if (dm_deleting_md(md)) {
r = -ENXIO;
@@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
}
out_table:
- dm_table_put(table);
+ dm_put_live_table(md, srcu_idx);
out_argv:
kfree(argv);
out:
@@ -1644,7 +1686,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
}
if (!dmi) {
- dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
+ unsigned noio_flag;
+ noio_flag = memalloc_noio_save();
+ dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
+ memalloc_noio_restore(noio_flag);
if (dmi)
*param_flags |= DM_PARAMS_VMALLOC;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index bdf26f5bd326..5adede17ddf6 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1561,7 +1561,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long flags;
int r;
-again:
bdev = NULL;
mode = 0;
r = 0;
@@ -1579,7 +1578,7 @@ again:
}
if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
- r = -EAGAIN;
+ r = -ENOTCONN;
else if (!bdev)
r = -EIO;
@@ -1591,11 +1590,8 @@ again:
if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
r = scsi_verify_blk_ioctl(NULL, cmd);
- if (r == -EAGAIN && !fatal_signal_pending(current)) {
+ if (r == -ENOTCONN && !fatal_signal_pending(current))
queue_work(kmultipathd, &m->process_queued_ios);
- msleep(10);
- goto again;
- }
return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1d3fe1a40a9b..4880b69e2e9e 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -380,7 +380,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
static int validate_raid_redundancy(struct raid_set *rs)
{
unsigned i, rebuild_cnt = 0;
- unsigned rebuilds_per_group, copies, d;
+ unsigned rebuilds_per_group = 0, copies, d;
unsigned group_size, last_group_start;
for (i = 0; i < rs->md.raid_disks; i++)
@@ -504,7 +504,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
* First, parse the in-order required arguments
* "chunk_size" is the only argument of this type.
*/
- if ((strict_strtoul(argv[0], 10, &value) < 0)) {
+ if ((kstrtoul(argv[0], 10, &value) < 0)) {
rs->ti->error = "Bad chunk size";
return -EINVAL;
} else if (rs->raid_type->level == 1) {
@@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
continue;
}
- if (strict_strtoul(argv[i], 10, &value) < 0) {
+ if (kstrtoul(argv[i], 10, &value) < 0) {
rs->ti->error = "Bad numerical argument given in raid params";
return -EINVAL;
}
@@ -1181,7 +1181,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
argv++;
/* number of RAID parameters */
- if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
+ if (kstrtoul(argv[0], 10, &num_raid_params) < 0) {
ti->error = "Cannot understand number of RAID parameters";
return -EINVAL;
}
@@ -1194,7 +1194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
return -EINVAL;
}
- if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+ if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
(num_raid_devs >= INT_MAX)) {
ti->error = "Cannot understand number of raid devices";
return -EINVAL;
@@ -1388,6 +1388,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* performing a "check" of the array.
*/
DMEMIT(" %llu",
+ (strcmp(rs->md.last_sync_action, "check")) ? 0 :
(unsigned long long)
atomic64_read(&rs->md.resync_mismatches));
break;
@@ -1572,6 +1573,62 @@ static void raid_postsuspend(struct dm_target *ti)
mddev_suspend(&rs->md);
}
+static void attempt_restore_of_faulty_devices(struct raid_set *rs)
+{
+ int i;
+ uint64_t failed_devices, cleared_failed_devices = 0;
+ unsigned long flags;
+ struct dm_raid_superblock *sb;
+ struct md_rdev *r;
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ r = &rs->dev[i].rdev;
+ if (test_bit(Faulty, &r->flags) && r->sb_page &&
+ sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) {
+ DMINFO("Faulty %s device #%d has readable super block."
+ " Attempting to revive it.",
+ rs->raid_type->name, i);
+
+ /*
+ * Faulty bit may be set, but sometimes the array can
+ * be suspended before the personalities can respond
+ * by removing the device from the array (i.e. calling
+ * 'hot_remove_disk'). If they haven't yet removed
+ * the failed device, its 'raid_disk' number will be
+ * '>= 0' - meaning we must call this function
+ * ourselves.
+ */
+ if ((r->raid_disk >= 0) &&
+ (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
+ /* Failed to revive this device, try next */
+ continue;
+
+ r->raid_disk = i;
+ r->saved_raid_disk = i;
+ flags = r->flags;
+ clear_bit(Faulty, &r->flags);
+ clear_bit(WriteErrorSeen, &r->flags);
+ clear_bit(In_sync, &r->flags);
+ if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+ r->raid_disk = -1;
+ r->saved_raid_disk = -1;
+ r->flags = flags;
+ } else {
+ r->recovery_offset = 0;
+ cleared_failed_devices |= 1 << i;
+ }
+ }
+ }
+ if (cleared_failed_devices) {
+ rdev_for_each(r, &rs->md) {
+ sb = page_address(r->sb_page);
+ failed_devices = le64_to_cpu(sb->failed_devices);
+ failed_devices &= ~cleared_failed_devices;
+ sb->failed_devices = cpu_to_le64(failed_devices);
+ }
+ }
+}
+
static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
@@ -1580,6 +1637,13 @@ static void raid_resume(struct dm_target *ti)
if (!rs->bitmap_loaded) {
bitmap_load(&rs->md);
rs->bitmap_loaded = 1;
+ } else {
+ /*
+ * A secondary resume while the device is active.
+ * Take this opportunity to check whether any failed
+ * devices are reachable again.
+ */
+ attempt_restore_of_faulty_devices(rs);
}
clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
@@ -1588,7 +1652,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 5, 0},
+ .version = {1, 5, 2},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
new file mode 100644
index 000000000000..ff9ac4be4721
--- /dev/null
+++ b/drivers/md/dm-switch.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
+ * Copyright (C) 2011-2013 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * dm-switch is a device-mapper target that maps IO to underlying block
+ * devices efficiently when there are a large number of fixed-sized
+ * address regions but there is no simple pattern to allow for a compact
+ * mapping representation such as dm-stripe.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One region_table_slot_t holds <region_entries_per_slot> region table
+ * entries each of which is <region_table_entry_bits> in size.
+ */
+typedef unsigned long region_table_slot_t;
+
+/*
+ * A device with the offset to its start sector.
+ */
+struct switch_path {
+ struct dm_dev *dmdev;
+ sector_t start;
+};
+
+/*
+ * Context block for a dm switch device.
+ */
+struct switch_ctx {
+ struct dm_target *ti;
+
+ unsigned nr_paths; /* Number of paths in path_list. */
+
+ unsigned region_size; /* Region size in 512-byte sectors */
+ unsigned long nr_regions; /* Number of regions making up the device */
+ signed char region_size_bits; /* log2 of region_size or -1 */
+
+ unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
+ unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
+ signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
+
+ region_table_slot_t *region_table; /* Region table */
+
+ /*
+ * Array of dm devices to switch between.
+ */
+ struct switch_path path_list[0];
+};
+
+static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
+ unsigned region_size)
+{
+ struct switch_ctx *sctx;
+
+ sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
+ GFP_KERNEL);
+ if (!sctx)
+ return NULL;
+
+ sctx->ti = ti;
+ sctx->region_size = region_size;
+
+ ti->private = sctx;
+
+ return sctx;
+}
+
+static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
+{
+ struct switch_ctx *sctx = ti->private;
+ sector_t nr_regions = ti->len;
+ sector_t nr_slots;
+
+ if (!(sctx->region_size & (sctx->region_size - 1)))
+ sctx->region_size_bits = __ffs(sctx->region_size);
+ else
+ sctx->region_size_bits = -1;
+
+ sctx->region_table_entry_bits = 1;
+ while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
+ (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
+ sctx->region_table_entry_bits++;
+
+ sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
+ if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
+ sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
+ else
+ sctx->region_entries_per_slot_bits = -1;
+
+ if (sector_div(nr_regions, sctx->region_size))
+ nr_regions++;
+
+ sctx->nr_regions = nr_regions;
+ if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
+ ti->error = "Region table too large";
+ return -EINVAL;
+ }
+
+ nr_slots = nr_regions;
+ if (sector_div(nr_slots, sctx->region_entries_per_slot))
+ nr_slots++;
+
+ if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
+ ti->error = "Region table too large";
+ return -EINVAL;
+ }
+
+ sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
+ if (!sctx->region_table) {
+ ti->error = "Cannot allocate region table";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
+ unsigned long *region_index, unsigned *bit)
+{
+ if (sctx->region_entries_per_slot_bits >= 0) {
+ *region_index = region_nr >> sctx->region_entries_per_slot_bits;
+ *bit = region_nr & (sctx->region_entries_per_slot - 1);
+ } else {
+ *region_index = region_nr / sctx->region_entries_per_slot;
+ *bit = region_nr % sctx->region_entries_per_slot;
+ }
+
+ *bit *= sctx->region_table_entry_bits;
+}
+
+/*
+ * Find which path to use at given offset.
+ */
+static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
+{
+ unsigned long region_index;
+ unsigned bit, path_nr;
+ sector_t p;
+
+ p = offset;
+ if (sctx->region_size_bits >= 0)
+ p >>= sctx->region_size_bits;
+ else
+ sector_div(p, sctx->region_size);
+
+ switch_get_position(sctx, p, &region_index, &bit);
+ path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+ ((1 << sctx->region_table_entry_bits) - 1);
+
+ /* This can only happen if the processor uses non-atomic stores. */
+ if (unlikely(path_nr >= sctx->nr_paths))
+ path_nr = 0;
+
+ return path_nr;
+}
+
+static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
+ unsigned value)
+{
+ unsigned long region_index;
+ unsigned bit;
+ region_table_slot_t pte;
+
+ switch_get_position(sctx, region_nr, &region_index, &bit);
+
+ pte = sctx->region_table[region_index];
+ pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
+ pte |= (region_table_slot_t)value << bit;
+ sctx->region_table[region_index] = pte;
+}
+
+/*
+ * Fill the region table with an initial round robin pattern.
+ */
+static void initialise_region_table(struct switch_ctx *sctx)
+{
+ unsigned path_nr = 0;
+ unsigned long region_nr;
+
+ for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
+ switch_region_table_write(sctx, region_nr, path_nr);
+ if (++path_nr >= sctx->nr_paths)
+ path_nr = 0;
+ }
+}
+
+static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
+{
+ struct switch_ctx *sctx = ti->private;
+ unsigned long long start;
+ int r;
+
+ r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+ &sctx->path_list[sctx->nr_paths].dmdev);
+ if (r) {
+ ti->error = "Device lookup failed";
+ return r;
+ }
+
+ if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
+ ti->error = "Invalid device starting offset";
+ dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+ return -EINVAL;
+ }
+
+ sctx->path_list[sctx->nr_paths].start = start;
+
+ sctx->nr_paths++;
+
+ return 0;
+}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+ struct switch_ctx *sctx = ti->private;
+
+ while (sctx->nr_paths--)
+ dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+
+ vfree(sctx->region_table);
+ kfree(sctx);
+}
+
+/*
+ * Constructor arguments:
+ * <num_paths> <region_size> <num_optional_args> [<optional_args>...]
+ * [<dev_path> <offset>]+
+ *
+ * Optional args are to allow for future extension: currently this
+ * parameter must be 0.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ static struct dm_arg _args[] = {
+ {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
+ {1, UINT_MAX, "Invalid region size"},
+ {0, 0, "Invalid number of optional args"},
+ };
+
+ struct switch_ctx *sctx;
+ struct dm_arg_set as;
+ unsigned nr_paths, region_size, nr_optional_args;
+ int r;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
+ if (r)
+ return r;
+
+ r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
+ if (r)
+ return r;
+ /* parse optional arguments here, if we add any */
+
+ if (as.argc != nr_paths * 2) {
+ ti->error = "Incorrect number of path arguments";
+ return -EINVAL;
+ }
+
+ sctx = alloc_switch_ctx(ti, nr_paths, region_size);
+ if (!sctx) {
+ ti->error = "Cannot allocate redirection context";
+ return -ENOMEM;
+ }
+
+ r = dm_set_target_max_io_len(ti, region_size);
+ if (r)
+ goto error;
+
+ while (as.argc) {
+ r = parse_path(&as, ti);
+ if (r)
+ goto error;
+ }
+
+ r = alloc_region_table(ti, nr_paths);
+ if (r)
+ goto error;
+
+ initialise_region_table(sctx);
+
+ /* For UNMAP, sending the request down any path is sufficient */
+ ti->num_discard_bios = 1;
+
+ return 0;
+
+error:
+ switch_dtr(ti);
+
+ return r;
+}
+
+static int switch_map(struct dm_target *ti, struct bio *bio)
+{
+ struct switch_ctx *sctx = ti->private;
+ sector_t offset = dm_target_offset(ti, bio->bi_sector);
+ unsigned path_nr = switch_get_path_nr(sctx, offset);
+
+ bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
+ bio->bi_sector = sctx->path_list[path_nr].start + offset;
+
+ return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers in the message as quickly as possible.
+ *
+ * This table-based hex parser improves performance.
+ * It improves a time to load 1000000 entries compared to the condition-based
+ * parser.
+ * table-based parser condition-based parser
+ * PA-RISC 0.29s 0.31s
+ * Opteron 0.0495s 0.0498s
+ */
+static const unsigned char hex_table[256] = {
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+static __always_inline unsigned long parse_hex(const char **string)
+{
+ unsigned char d;
+ unsigned long r = 0;
+
+ while ((d = hex_table[(unsigned char)**string]) < 16) {
+ r = (r << 4) | d;
+ (*string)++;
+ }
+
+ return r;
+}
+
+static int process_set_region_mappings(struct switch_ctx *sctx,
+ unsigned argc, char **argv)
+{
+ unsigned i;
+ unsigned long region_index = 0;
+
+ for (i = 1; i < argc; i++) {
+ unsigned long path_nr;
+ const char *string = argv[i];
+
+ if (*string == ':')
+ region_index++;
+ else {
+ region_index = parse_hex(&string);
+ if (unlikely(*string != ':')) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ }
+
+ string++;
+ if (unlikely(!*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+
+ path_nr = parse_hex(&string);
+ if (unlikely(*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ if (unlikely(region_index >= sctx->nr_regions)) {
+ DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
+ return -EINVAL;
+ }
+ if (unlikely(path_nr >= sctx->nr_paths)) {
+ DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
+ return -EINVAL;
+ }
+
+ switch_region_table_write(sctx, region_index, path_nr);
+ }
+
+ return 0;
+}
+
+/*
+ * Messages are processed one-at-a-time.
+ *
+ * Only set_region_mappings is supported.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ static DEFINE_MUTEX(message_mutex);
+
+ struct switch_ctx *sctx = ti->private;
+ int r = -EINVAL;
+
+ mutex_lock(&message_mutex);
+
+ if (!strcasecmp(argv[0], "set_region_mappings"))
+ r = process_set_region_mappings(sctx, argc, argv);
+ else
+ DMWARN("Unrecognised message received.");
+
+ mutex_unlock(&message_mutex);
+
+ return r;
+}
+
+static void switch_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct switch_ctx *sctx = ti->private;
+ unsigned sz = 0;
+ int path_nr;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
+ for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
+ DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
+ (unsigned long long)sctx->path_list[path_nr].start);
+ break;
+ }
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the path for sector 0
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg)
+{
+ struct switch_ctx *sctx = ti->private;
+ struct block_device *bdev;
+ fmode_t mode;
+ unsigned path_nr;
+ int r = 0;
+
+ path_nr = switch_get_path_nr(sctx, 0);
+
+ bdev = sctx->path_list[path_nr].dmdev->bdev;
+ mode = sctx->path_list[path_nr].dmdev->mode;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+static int switch_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct switch_ctx *sctx = ti->private;
+ int path_nr;
+ int r;
+
+ for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
+ r = fn(ti, sctx->path_list[path_nr].dmdev,
+ sctx->path_list[path_nr].start, ti->len, data);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static struct target_type switch_target = {
+ .name = "switch",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = switch_ctr,
+ .dtr = switch_dtr,
+ .map = switch_map,
+ .message = switch_message,
+ .status = switch_status,
+ .ioctl = switch_ioctl,
+ .iterate_devices = switch_iterate_devices,
+};
+
+static int __init dm_switch_init(void)
+{
+ int r;
+
+ r = dm_register_target(&switch_target);
+ if (r < 0)
+ DMERR("dm_register_target() failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_switch_exit(void)
+{
+ dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1ff252ab7d46..f221812b7dbc 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -26,22 +26,8 @@
#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
-/*
- * The table has always exactly one reference from either mapped_device->map
- * or hash_cell->new_map. This reference is not counted in table->holders.
- * A pair of dm_create_table/dm_destroy_table functions is used for table
- * creation/destruction.
- *
- * Temporary references from the other code increase table->holders. A pair
- * of dm_table_get/dm_table_put functions is used to manipulate it.
- *
- * When the table is about to be destroyed, we wait for table->holders to
- * drop to zero.
- */
-
struct dm_table {
struct mapped_device *md;
- atomic_t holders;
unsigned type;
/* btree table */
@@ -208,7 +194,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
INIT_LIST_HEAD(&t->devices);
INIT_LIST_HEAD(&t->target_callbacks);
- atomic_set(&t->holders, 0);
if (!num_targets)
num_targets = KEYS_PER_NODE;
@@ -246,10 +231,6 @@ void dm_table_destroy(struct dm_table *t)
if (!t)
return;
- while (atomic_read(&t->holders))
- msleep(1);
- smp_mb();
-
/* free the indexes */
if (t->depth >= 2)
vfree(t->index[t->depth - 2]);
@@ -274,22 +255,6 @@ void dm_table_destroy(struct dm_table *t)
kfree(t);
}
-void dm_table_get(struct dm_table *t)
-{
- atomic_inc(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_get);
-
-void dm_table_put(struct dm_table *t)
-{
- if (!t)
- return;
-
- smp_mb__before_atomic_dec();
- atomic_dec(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_put);
-
/*
* Checks to see if we need to extend highs or targets.
*/
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index b948fd864d45..4b7941db3aff 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -451,7 +451,7 @@ static void verity_prefetch_io(struct work_struct *work)
goto no_prefetch_cluster;
if (unlikely(cluster & (cluster - 1)))
- cluster = 1 << (fls(cluster) - 1);
+ cluster = 1 << __fls(cluster);
hash_block_start &= ~(sector_t)(cluster - 1);
hash_block_end |= cluster - 1;
@@ -695,8 +695,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
- num < 0 || num > 1) {
+ if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
+ num > 1) {
ti->error = "Invalid version";
r = -EINVAL;
goto bad;
@@ -723,7 +723,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = -EINVAL;
goto bad;
}
- v->data_dev_block_bits = ffs(num) - 1;
+ v->data_dev_block_bits = __ffs(num);
if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
!num || (num & (num - 1)) ||
@@ -733,7 +733,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = -EINVAL;
goto bad;
}
- v->hash_dev_block_bits = ffs(num) - 1;
+ v->hash_dev_block_bits = __ffs(num);
if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
(sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
@@ -812,7 +812,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
v->hash_per_block_bits =
- fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
+ __fls((1 << v->hash_dev_block_bits) / v->digest_size);
v->levels = 0;
if (v->data_blocks)
@@ -831,9 +831,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
for (i = v->levels - 1; i >= 0; i--) {
sector_t s;
v->hash_level_block[i] = hash_position;
- s = verity_position_at_level(v, v->data_blocks, i);
- s = (s >> v->hash_per_block_bits) +
- !!(s & ((1 << v->hash_per_block_bits) - 1));
+ s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1)
+ >> ((i + 1) * v->hash_per_block_bits);
if (hash_position + s < hash_position) {
ti->error = "Hash device offset overflow";
r = -E2BIG;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d5370a94b2c1..9e39d2b64bf8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -117,15 +117,29 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_MERGE_IS_OPTIONAL 6
/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+ int undefined__;
+};
+
+/*
* Work processed by per-device workqueue.
*/
struct mapped_device {
- struct rw_semaphore io_lock;
+ struct srcu_struct io_barrier;
struct mutex suspend_lock;
- rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
+ /*
+ * The current mapping.
+ * Use dm_get_live_table{_fast} or take suspend_lock for
+ * dereference.
+ */
+ struct dm_table *map;
+
unsigned long flags;
struct request_queue *queue;
@@ -155,11 +169,6 @@ struct mapped_device {
struct workqueue_struct *wq;
/*
- * The current mapping.
- */
- struct dm_table *map;
-
- /*
* io objects are allocated from here.
*/
mempool_t *io_pool;
@@ -386,10 +395,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct mapped_device *md = bdev->bd_disk->private_data;
- struct dm_table *map = dm_get_live_table(md);
+ int srcu_idx;
+ struct dm_table *map;
struct dm_target *tgt;
int r = -ENOTTY;
+retry:
+ map = dm_get_live_table(md, &srcu_idx);
+
if (!map || !dm_table_get_size(map))
goto out;
@@ -408,7 +421,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
r = tgt->type->ioctl(tgt, cmd, arg);
out:
- dm_table_put(map);
+ dm_put_live_table(md, srcu_idx);
+
+ if (r == -ENOTCONN) {
+ msleep(10);
+ goto retry;
+ }
return r;
}
@@ -502,20 +520,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
/*
* Everyone (including functions in this file), should use this
* function to access the md->map field, and make sure they call
- * dm_table_put() when finished.
+ * dm_put_live_table() when finished.
*/
-struct dm_table *dm_get_live_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
- struct dm_table *t;
- unsigned long flags;
+ *srcu_idx = srcu_read_lock(&md->io_barrier);
+
+ return srcu_dereference(md->map, &md->io_barrier);
+}
- read_lock_irqsave(&md->map_lock, flags);
- t = md->map;
- if (t)
- dm_table_get(t);
- read_unlock_irqrestore(&md->map_lock, flags);
+void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
+{
+ srcu_read_unlock(&md->io_barrier, srcu_idx);
+}
- return t;
+void dm_sync_table(struct mapped_device *md)
+{
+ synchronize_srcu(&md->io_barrier);
+ synchronize_rcu_expedited();
+}
+
+/*
+ * A fast alternative to dm_get_live_table/dm_put_live_table.
+ * The caller must not block between these two functions.
+ */
+static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
+{
+ rcu_read_lock();
+ return rcu_dereference(md->map);
+}
+
+static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
+{
+ rcu_read_unlock();
}
/*
@@ -1349,17 +1386,18 @@ static int __split_and_process_non_flush(struct clone_info *ci)
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
-static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
{
struct clone_info ci;
int error = 0;
- ci.map = dm_get_live_table(md);
- if (unlikely(!ci.map)) {
+ if (unlikely(!map)) {
bio_io_error(bio);
return;
}
+ ci.map = map;
ci.md = md;
ci.io = alloc_io(md);
ci.io->error = 0;
@@ -1386,7 +1424,6 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
/* drop the extra reference count */
dec_pending(ci.io, error);
- dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
* CRUD END
@@ -1397,7 +1434,7 @@ static int dm_merge_bvec(struct request_queue *q,
struct bio_vec *biovec)
{
struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_live_table(md);
+ struct dm_table *map = dm_get_live_table_fast(md);
struct dm_target *ti;
sector_t max_sectors;
int max_size = 0;
@@ -1407,7 +1444,7 @@ static int dm_merge_bvec(struct request_queue *q,
ti = dm_table_find_target(map, bvm->bi_sector);
if (!dm_target_is_valid(ti))
- goto out_table;
+ goto out;
/*
* Find maximum amount of I/O that won't need splitting
@@ -1436,10 +1473,8 @@ static int dm_merge_bvec(struct request_queue *q,
max_size = 0;
-out_table:
- dm_table_put(map);
-
out:
+ dm_put_live_table_fast(md);
/*
* Always allow an entire first page
*/
@@ -1458,8 +1493,10 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
int cpu;
+ int srcu_idx;
+ struct dm_table *map;
- down_read(&md->io_lock);
+ map = dm_get_live_table(md, &srcu_idx);
cpu = part_stat_lock();
part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
@@ -1468,7 +1505,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
- up_read(&md->io_lock);
+ dm_put_live_table(md, srcu_idx);
if (bio_rw(bio) != READA)
queue_io(md, bio);
@@ -1477,8 +1514,8 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
return;
}
- __split_and_process_bio(md, bio);
- up_read(&md->io_lock);
+ __split_and_process_bio(md, map, bio);
+ dm_put_live_table(md, srcu_idx);
return;
}
@@ -1664,7 +1701,8 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
static void dm_request_fn(struct request_queue *q)
{
struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_live_table(md);
+ int srcu_idx;
+ struct dm_table *map = dm_get_live_table(md, &srcu_idx);
struct dm_target *ti;
struct request *rq, *clone;
sector_t pos;
@@ -1719,7 +1757,7 @@ requeued:
delay_and_out:
blk_delay_queue(q, HZ / 10);
out:
- dm_table_put(map);
+ dm_put_live_table(md, srcu_idx);
}
int dm_underlying_device_busy(struct request_queue *q)
@@ -1732,14 +1770,14 @@ static int dm_lld_busy(struct request_queue *q)
{
int r;
struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_live_table(md);
+ struct dm_table *map = dm_get_live_table_fast(md);
if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
r = 1;
else
r = dm_table_any_busy_target(map);
- dm_table_put(map);
+ dm_put_live_table_fast(md);
return r;
}
@@ -1751,7 +1789,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
struct dm_table *map;
if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
- map = dm_get_live_table(md);
+ map = dm_get_live_table_fast(md);
if (map) {
/*
* Request-based dm cares about only own queue for
@@ -1762,9 +1800,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
bdi_bits;
else
r = dm_table_any_congested(map, bdi_bits);
-
- dm_table_put(map);
}
+ dm_put_live_table_fast(md);
}
return r;
@@ -1869,12 +1906,14 @@ static struct mapped_device *alloc_dev(int minor)
if (r < 0)
goto bad_minor;
+ r = init_srcu_struct(&md->io_barrier);
+ if (r < 0)
+ goto bad_io_barrier;
+
md->type = DM_TYPE_NONE;
- init_rwsem(&md->io_lock);
mutex_init(&md->suspend_lock);
mutex_init(&md->type_lock);
spin_lock_init(&md->deferred_lock);
- rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
atomic_set(&md->event_nr, 0);
@@ -1937,6 +1976,8 @@ bad_thread:
bad_disk:
blk_cleanup_queue(md->queue);
bad_queue:
+ cleanup_srcu_struct(&md->io_barrier);
+bad_io_barrier:
free_minor(minor);
bad_minor:
module_put(THIS_MODULE);
@@ -1960,6 +2001,7 @@ static void free_dev(struct mapped_device *md)
bioset_free(md->bs);
blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
+ cleanup_srcu_struct(&md->io_barrier);
free_minor(minor);
spin_lock(&_minor_lock);
@@ -2102,7 +2144,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct dm_table *old_map;
struct request_queue *q = md->queue;
sector_t size;
- unsigned long flags;
int merge_is_optional;
size = dm_table_get_size(t);
@@ -2131,9 +2172,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
merge_is_optional = dm_table_merge_is_optional(t);
- write_lock_irqsave(&md->map_lock, flags);
old_map = md->map;
- md->map = t;
+ rcu_assign_pointer(md->map, t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
dm_table_set_restrictions(t, q, limits);
@@ -2141,7 +2181,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
else
clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
- write_unlock_irqrestore(&md->map_lock, flags);
+ dm_sync_table(md);
return old_map;
}
@@ -2152,15 +2192,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
static struct dm_table *__unbind(struct mapped_device *md)
{
struct dm_table *map = md->map;
- unsigned long flags;
if (!map)
return NULL;
dm_table_event_callback(map, NULL, NULL);
- write_lock_irqsave(&md->map_lock, flags);
- md->map = NULL;
- write_unlock_irqrestore(&md->map_lock, flags);
+ rcu_assign_pointer(md->map, NULL);
+ dm_sync_table(md);
return map;
}
@@ -2312,11 +2350,12 @@ EXPORT_SYMBOL_GPL(dm_device_name);
static void __dm_destroy(struct mapped_device *md, bool wait)
{
struct dm_table *map;
+ int srcu_idx;
might_sleep();
spin_lock(&_minor_lock);
- map = dm_get_live_table(md);
+ map = dm_get_live_table(md, &srcu_idx);
idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
@@ -2326,6 +2365,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
dm_table_postsuspend_targets(map);
}
+ /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
+ dm_put_live_table(md, srcu_idx);
+
/*
* Rare, but there may be I/O requests still going to complete,
* for example. Wait for all references to disappear.
@@ -2340,7 +2382,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
dm_device_name(md), atomic_read(&md->holders));
dm_sysfs_exit(md);
- dm_table_put(map);
dm_table_destroy(__unbind(md));
free_dev(md);
}
@@ -2397,8 +2438,10 @@ static void dm_wq_work(struct work_struct *work)
struct mapped_device *md = container_of(work, struct mapped_device,
work);
struct bio *c;
+ int srcu_idx;
+ struct dm_table *map;
- down_read(&md->io_lock);
+ map = dm_get_live_table(md, &srcu_idx);
while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
@@ -2408,17 +2451,13 @@ static void dm_wq_work(struct work_struct *work)
if (!c)
break;
- up_read(&md->io_lock);
-
if (dm_request_based(md))
generic_make_request(c);
else
- __split_and_process_bio(md, c);
-
- down_read(&md->io_lock);
+ __split_and_process_bio(md, map, c);
}
- up_read(&md->io_lock);
+ dm_put_live_table(md, srcu_idx);
}
static void dm_queue_flush(struct mapped_device *md)
@@ -2450,10 +2489,10 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
* reappear.
*/
if (dm_table_has_no_data_devices(table)) {
- live_map = dm_get_live_table(md);
+ live_map = dm_get_live_table_fast(md);
if (live_map)
limits = md->queue->limits;
- dm_table_put(live_map);
+ dm_put_live_table_fast(md);
}
if (!live_map) {
@@ -2533,7 +2572,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
goto out_unlock;
}
- map = dm_get_live_table(md);
+ map = md->map;
/*
* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2554,7 +2593,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
if (!noflush && do_lockfs) {
r = lock_fs(md);
if (r)
- goto out;
+ goto out_unlock;
}
/*
@@ -2569,9 +2608,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
* (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
* flush_workqueue(md->wq).
*/
- down_write(&md->io_lock);
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- up_write(&md->io_lock);
+ synchronize_srcu(&md->io_barrier);
/*
* Stop md->queue before flushing md->wq in case request-based
@@ -2589,10 +2627,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
*/
r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
- down_write(&md->io_lock);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- up_write(&md->io_lock);
+ synchronize_srcu(&md->io_barrier);
/* were we interrupted ? */
if (r < 0) {
@@ -2602,7 +2639,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
start_queue(md->queue);
unlock_fs(md);
- goto out; /* pushback list is already flushed, so skip flush */
+ goto out_unlock; /* pushback list is already flushed, so skip flush */
}
/*
@@ -2615,9 +2652,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
dm_table_postsuspend_targets(map);
-out:
- dm_table_put(map);
-
out_unlock:
mutex_unlock(&md->suspend_lock);
return r;
@@ -2632,7 +2666,7 @@ int dm_resume(struct mapped_device *md)
if (!dm_suspended_md(md))
goto out;
- map = dm_get_live_table(md);
+ map = md->map;
if (!map || !dm_table_get_size(map))
goto out;
@@ -2656,7 +2690,6 @@ int dm_resume(struct mapped_device *md)
r = 0;
out:
- dm_table_put(map);
mutex_unlock(&md->suspend_lock);
return r;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9b82377a833b..dddc87bcf64a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -521,6 +521,7 @@ void mddev_init(struct mddev *mddev)
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
mddev->reshape_backwards = 0;
+ mddev->last_sync_action = "none";
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
@@ -2867,7 +2868,7 @@ static ssize_t
offset_store(struct md_rdev *rdev, const char *buf, size_t len)
{
unsigned long long offset;
- if (strict_strtoull(buf, 10, &offset) < 0)
+ if (kstrtoull(buf, 10, &offset) < 0)
return -EINVAL;
if (rdev->mddev->pers && rdev->raid_disk >= 0)
return -EBUSY;
@@ -2895,7 +2896,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
unsigned long long new_offset;
struct mddev *mddev = rdev->mddev;
- if (strict_strtoull(buf, 10, &new_offset) < 0)
+ if (kstrtoull(buf, 10, &new_offset) < 0)
return -EINVAL;
if (mddev->sync_thread)
@@ -2961,7 +2962,7 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
unsigned long long blocks;
sector_t new;
- if (strict_strtoull(buf, 10, &blocks) < 0)
+ if (kstrtoull(buf, 10, &blocks) < 0)
return -EINVAL;
if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
@@ -3069,7 +3070,7 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
if (cmd_match(buf, "none"))
recovery_start = MaxSector;
- else if (strict_strtoull(buf, 10, &recovery_start))
+ else if (kstrtoull(buf, 10, &recovery_start))
return -EINVAL;
if (rdev->mddev->pers &&
@@ -3497,7 +3498,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (clevel[len-1] == '\n')
len--;
clevel[len] = 0;
- if (strict_strtol(clevel, 10, &level))
+ if (kstrtol(clevel, 10, &level))
level = LEVEL_NONE;
if (request_module("md-%s", clevel) != 0)
@@ -4272,6 +4273,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return len;
}
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+static ssize_t
+last_sync_action_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%s\n", mddev->last_sync_action);
+}
+
+static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
+
static ssize_t
mismatch_cnt_show(struct mddev *mddev, char *page)
{
@@ -4280,10 +4292,6 @@ mismatch_cnt_show(struct mddev *mddev, char *page)
atomic64_read(&mddev->resync_mismatches));
}
-static struct md_sysfs_entry md_scan_mode =
-__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
-
-
static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
static ssize_t
@@ -4356,7 +4364,7 @@ sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
{
long n;
- if (strict_strtol(buf, 10, &n))
+ if (kstrtol(buf, 10, &n))
return -EINVAL;
if (n != 0 && n != 1)
@@ -4424,7 +4432,7 @@ static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long long min;
- if (strict_strtoull(buf, 10, &min))
+ if (kstrtoull(buf, 10, &min))
return -EINVAL;
if (min > mddev->resync_max)
return -EINVAL;
@@ -4461,7 +4469,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len)
mddev->resync_max = MaxSector;
else {
unsigned long long max;
- if (strict_strtoull(buf, 10, &max))
+ if (kstrtoull(buf, 10, &max))
return -EINVAL;
if (max < mddev->resync_min)
return -EINVAL;
@@ -4686,6 +4694,7 @@ static struct attribute *md_default_attrs[] = {
static struct attribute *md_redundancy_attrs[] = {
&md_scan_mode.attr,
+ &md_last_scan_mode.attr,
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
@@ -6405,6 +6414,12 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
/* need to ensure md_delayed_delete() has completed */
flush_workqueue(md_misc_wq);
+ if (cmd == HOT_REMOVE_DISK)
+ /* need to ensure recovery thread has run */
+ wait_event_interruptible_timeout(mddev->sb_wait,
+ !test_bit(MD_RECOVERY_NEEDED,
+ &mddev->flags),
+ msecs_to_jiffies(5000));
err = mddev_lock(mddev);
if (err) {
printk(KERN_INFO
@@ -7323,7 +7338,7 @@ void md_do_sync(struct md_thread *thread)
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
- char *desc;
+ char *desc, *action = NULL;
struct blk_plug plug;
/* just incase thread restarts... */
@@ -7333,17 +7348,21 @@ void md_do_sync(struct md_thread *thread)
return;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
desc = "data-check";
- else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ action = "check";
+ } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
desc = "requested-resync";
- else
+ action = "repair";
+ } else
desc = "resync";
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
desc = "reshape";
else
desc = "recovery";
+ mddev->last_sync_action = action ?: desc;
+
/* we overload curr_resync somewhat here.
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
@@ -7892,6 +7911,8 @@ void md_check_recovery(struct mddev *mddev)
md_new_event(mddev);
}
unlock:
+ wake_up(&mddev->sb_wait);
+
if (!mddev->sync_thread) {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (test_and_clear_bit(MD_RECOVERY_RECOVER,
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 653f992b687a..20f02c0b5f2d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -268,6 +268,14 @@ struct mddev {
struct md_thread *thread; /* management thread */
struct md_thread *sync_thread; /* doing resync or reconstruct */
+
+ /* 'last_sync_action' is initialized to "none". It is set when a
+ * sync operation (i.e "data-check", "requested-resync", "resync",
+ * "recovery", or "reshape") is started. It holds this value even
+ * when the sync thread is "frozen" (interrupted) or "idle" (stopped
+ * or finished). It is overwritten when a new sync operation is begun.
+ */
+ char *last_sync_action;
sector_t curr_resync; /* last block scheduled */
/* As resync requests can complete out of order, we cannot easily track
* how much resync has been completed. So we occasionally pause until
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index fcf65e512cf5..c4d420b7d2f4 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -597,6 +597,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
mdname(mddev));
return ERR_PTR(-EINVAL);
}
+ rdev->sectors = mddev->dev_sectors;
}
/* Set new parameters */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6e17f8181c4b..ec734588a1c6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1519,8 +1519,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
p = conf->mirrors+mirror;
if (!p->rdev) {
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
p->head_position = 0;
rdev->raid_disk = mirror;
@@ -1559,7 +1560,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
- if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
print_conf(conf);
return err;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6ddae2501b9a..cd066b63bdaf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -97,7 +97,7 @@ static int max_queued_requests = 1024;
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
-static int enough(struct r10conf *conf, int ignore);
+static int _enough(struct r10conf *conf, int previous, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
@@ -392,11 +392,9 @@ static void raid10_end_read_request(struct bio *bio, int error)
* than fail the last device. Here we redefine
* "uptodate" to mean "Don't want to retry"
*/
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (!enough(conf, rdev->raid_disk))
+ if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
+ rdev->raid_disk))
uptodate = 1;
- spin_unlock_irqrestore(&conf->device_lock, flags);
}
if (uptodate) {
raid_end_bio_io(r10_bio);
@@ -1632,37 +1630,58 @@ static void status(struct seq_file *seq, struct mddev *mddev)
* Don't consider the device numbered 'ignore'
* as we might be about to remove it.
*/
-static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
+static int _enough(struct r10conf *conf, int previous, int ignore)
{
int first = 0;
+ int has_enough = 0;
+ int disks, ncopies;
+ if (previous) {
+ disks = conf->prev.raid_disks;
+ ncopies = conf->prev.near_copies;
+ } else {
+ disks = conf->geo.raid_disks;
+ ncopies = conf->geo.near_copies;
+ }
+ rcu_read_lock();
do {
int n = conf->copies;
int cnt = 0;
int this = first;
while (n--) {
- if (conf->mirrors[this].rdev &&
- this != ignore)
+ struct md_rdev *rdev;
+ if (this != ignore &&
+ (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+ test_bit(In_sync, &rdev->flags))
cnt++;
- this = (this+1) % geo->raid_disks;
+ this = (this+1) % disks;
}
if (cnt == 0)
- return 0;
- first = (first + geo->near_copies) % geo->raid_disks;
+ goto out;
+ first = (first + ncopies) % disks;
} while (first != 0);
- return 1;
+ has_enough = 1;
+out:
+ rcu_read_unlock();
+ return has_enough;
}
static int enough(struct r10conf *conf, int ignore)
{
- return _enough(conf, &conf->geo, ignore) &&
- _enough(conf, &conf->prev, ignore);
+ /* when calling 'enough', both 'prev' and 'geo' must
+ * be stable.
+ * This is ensured if ->reconfig_mutex or ->device_lock
+ * is held.
+ */
+ return _enough(conf, 0, ignore) &&
+ _enough(conf, 1, ignore);
}
static void error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r10conf *conf = mddev->private;
+ unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
@@ -1670,18 +1689,18 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
* next level up know.
* else mark the drive as failed
*/
+ spin_lock_irqsave(&conf->device_lock, flags);
if (test_bit(In_sync, &rdev->flags)
- && !enough(conf, rdev->raid_disk))
+ && !enough(conf, rdev->raid_disk)) {
/*
* Don't fail the drive, just return an IO error.
*/
+ spin_unlock_irqrestore(&conf->device_lock, flags);
return;
+ }
if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
+ /*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -1689,6 +1708,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
printk(KERN_ALERT
"md/raid10:%s: Disk failure on %s, disabling device.\n"
"md/raid10:%s: Operation continuing on %d devices.\n",
@@ -1791,7 +1811,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* very different from resync
*/
return -EBUSY;
- if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
+ if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
return -EINVAL;
if (rdev->raid_disk >= 0)
@@ -1819,15 +1839,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
}
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -2909,14 +2931,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*/
if (mddev->bitmap == NULL &&
mddev->recovery_cp == MaxSector &&
+ mddev->reshape_position == MaxSector &&
+ !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
conf->fullsync == 0) {
*skipped = 1;
- max_sector = mddev->dev_sectors;
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
- test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- max_sector = mddev->resync_max_sectors;
- return max_sector - sector_nr;
+ return mddev->dev_sectors - sector_nr;
}
skipped:
@@ -3532,7 +3553,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
/* FIXME calc properly */
conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
- max(0,mddev->delta_disks)),
+ max(0,-mddev->delta_disks)),
GFP_KERNEL);
if (!conf->mirrors)
goto out;
@@ -3691,7 +3712,7 @@ static int run(struct mddev *mddev)
conf->geo.far_offset == 0)
goto out_free_conf;
if (conf->prev.far_copies != 1 &&
- conf->geo.far_offset == 0)
+ conf->prev.far_offset == 0)
goto out_free_conf;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 05e4a105b9c7..2bf094a587cb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4924,7 +4924,7 @@ raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
if (!conf)
return -ENODEV;
- if (strict_strtoul(page, 10, &new))
+ if (kstrtoul(page, 10, &new))
return -EINVAL;
err = raid5_set_cache_size(mddev, new);
if (err)
@@ -4957,7 +4957,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
if (!conf)
return -ENODEV;
- if (strict_strtoul(page, 10, &new))
+ if (kstrtoul(page, 10, &new))
return -EINVAL;
if (new > conf->max_nr_stripes)
return -EINVAL;
@@ -5914,7 +5914,7 @@ static int check_reshape(struct mddev *mddev)
return 0; /* nothing to do */
if (has_failed(conf))
return -EINVAL;
- if (mddev->delta_disks < 0) {
+ if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
/* We might be able to shrink, but the devices must
* be made bigger first.
* For raid6, 4 is the minimum size.