summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2009-06-03 13:43:25 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2009-06-03 13:43:25 +1000
commitfe371fb276db6ee6181205c26140ebf6bf09b117 (patch)
treeb8b8289f815d1ca00cd2b0acd6b50cee52e58248
parent75f71da8f65355f4a300542f706aa61e3b7d2453 (diff)
parentc855427a5ff13dc5b8aa8e72ed1495e9aac04abb (diff)
Merge branch 'quilt/device-mapper'
-rw-r--r--Documentation/device-mapper/dm-queue-length.txt39
-rw-r--r--drivers/md/Kconfig9
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/dm-crypt.c8
-rw-r--r--drivers/md/dm-delay.c6
-rw-r--r--drivers/md/dm-exception-store.h2
-rw-r--r--drivers/md/dm-io.c14
-rw-r--r--drivers/md/dm-ioctl.c14
-rw-r--r--drivers/md/dm-linear.c4
-rw-r--r--drivers/md/dm-log.c2
-rw-r--r--drivers/md/dm-mpath.c120
-rw-r--r--drivers/md/dm-path-selector.h8
-rw-r--r--drivers/md/dm-queue-length.c263
-rw-r--r--drivers/md/dm-region-hash.c2
-rw-r--r--drivers/md/dm-round-robin.c2
-rw-r--r--drivers/md/dm-snap-persistent.c2
-rw-r--r--drivers/md/dm-snap.c11
-rw-r--r--drivers/md/dm-stripe.c15
-rw-r--r--drivers/md/dm-sysfs.c9
-rw-r--r--drivers/md/dm-table.c134
-rw-r--r--drivers/md/dm.c228
-rw-r--r--drivers/md/dm.h3
-rw-r--r--include/linux/device-mapper.h11
-rw-r--r--include/linux/dm-ioctl.h14
24 files changed, 738 insertions, 183 deletions
diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt
new file mode 100644
index 000000000000..f4db2562175c
--- /dev/null
+++ b/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
+dm-queue-length
+===============
+
+dm-queue-length is a path selector module for device-mapper targets,
+which selects a path with the least number of in-flight I/Os.
+The path selector name is 'queue-length'.
+
+Table parameters for each path: [<repeat_count>]
+ <repeat_count>: The number of I/Os to dispatch using the selected
+ path before switching to the next path.
+ If not given, internal default is used. To check
+ the default value, see the activated table.
+
+Status for each path: <status> <fail-count> <in-flight>
+ <status>: 'A' if the path is active, 'F' if the path is failed.
+ <fail-count>: The number of path failures.
+ <in-flight>: The number of in-flight I/Os on the path.
+
+
+Algorithm
+=========
+
+dm-queue-length increments/decrements 'in-flight' when an I/O is
+dispatched/completed respectively.
+dm-queue-length selects a path with the minimum 'in-flight'.
+
+
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128.
+
+# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
+ dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675be9f7..3b311d273346 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -249,6 +249,15 @@ config DM_MULTIPATH
---help---
Allow volume managers to support multipath hardware.
+config DM_MULTIPATH_QL
+ tristate "I/O Path Selector based on the number of in-flight I/Os"
+ depends on DM_MULTIPATH
+ ---help---
+ This path selector is a dynamic load balancer which selects
+ the path with the least number of in-flight I/Os.
+
+ If unsure, say N.
+
config DM_DELAY
tristate "I/O delaying target (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 45cc5951d928..ff9f545dd516 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 53394e863c74..04db6c4004a8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_crypt_queue;
}
+ ti->num_flush_requests = 1;
ti->private = cc;
return 0;
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
struct dm_crypt_io *io;
+ struct crypt_config *cc;
+
+ if (unlikely(bio_empty_barrier(bio))) {
+ cc = ti->private;
+ bio->bi_bdev = cc->dev->bdev;
+ return DM_MAPIO_REMAPPED;
+ }
io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin);
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 559dbb52bc85..8ad8a9044bbf 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -197,6 +197,7 @@ out:
mutex_init(&dc->timer_lock);
atomic_set(&dc->may_delay, 1);
+ ti->num_flush_requests = 1;
ti->private = dc;
return 0;
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
bio->bi_bdev = dc->dev_write->bdev;
- bio->bi_sector = dc->start_write +
- (bio->bi_sector - ti->begin);
+ if (bio_sectors(bio))
+ bio->bi_sector = dc->start_write +
+ (bio->bi_sector - ti->begin);
return delay_bio(dc, dc->write_delay, bio);
}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 0a2e6e7f67b3..96a796be7c77 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
*/
static inline sector_t get_dev_size(struct block_device *bdev)
{
- return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+ return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
}
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index e73aabd61cd7..3a2e6a2f8bdd 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,7 @@ struct dm_io_client {
/* FIXME: can we shrink this ? */
struct io {
unsigned long error_bits;
+ unsigned long eopnotsupp_bits;
atomic_t count;
struct task_struct *sleeper;
struct dm_io_client *client;
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio)
*---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
- if (error)
+ if (error) {
set_bit(region, &io->error_bits);
+ if (error == -EOPNOTSUPP)
+ set_bit(region, &io->eopnotsupp_bits);
+ }
if (atomic_dec_and_test(&io->count)) {
if (io->sleeper)
@@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
return -EIO;
}
+retry:
io.error_bits = 0;
+ io.eopnotsupp_bits = 0;
atomic_set(&io.count, 1); /* see dispatch_io() */
io.sleeper = current;
io.client = client;
@@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
}
set_current_state(TASK_RUNNING);
+ if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
+ rw &= ~(1 << BIO_RW_BARRIER);
+ goto retry;
+ }
+
if (error_bits)
*error_bits = io.error_bits;
@@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
+ io->eopnotsupp_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = NULL;
io->client = client;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 823ceba6efa8..410f2d058902 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -276,7 +276,7 @@ retry:
up_write(&_hash_lock);
}
-static int dm_hash_rename(const char *old, const char *new)
+static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
{
char *new_name, *old_name;
struct hash_cell *hc;
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new)
dm_table_put(table);
}
- dm_kobject_uevent(hc->md);
+ dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie);
dm_put(hc->md);
up_write(&_hash_lock);
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
__hash_remove(hc);
up_write(&_hash_lock);
+
+ dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr);
+
dm_put(md);
param->data_size = 0;
return 0;
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
return r;
param->data_size = 0;
- return dm_hash_rename(param->name, new_name);
+ return dm_hash_rename(param->event_nr, param->name, new_name);
}
static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param)
if (dm_suspended(md))
r = dm_resume(md);
- if (!r)
+
+ if (!r) {
+ dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
r = __dev_status(md, param);
+ }
dm_put(md);
return r;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 79fb53e51c70..ecbb17421da4 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ ti->num_flush_requests = 1;
ti->private = lc;
return 0;
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
struct linear_c *lc = ti->private;
bio->bi_bdev = lc->dev->bdev;
- bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
+ if (bio_sectors(bio))
+ bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
}
static int linear_map(struct dm_target *ti, struct bio *bio,
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6fa8ccf91c70..6352a9ad4446 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -416,7 +416,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
bitset_size,
ti->limits.logical_block_size);
- if (buf_size > dev->bdev->bd_inode->i_size) {
+ if (buf_size > i_size_read(dev->bdev->bd_inode)) {
DMWARN("log device %s too small: need %llu bytes",
dev->name, (unsigned long long)buf_size);
kfree(lc);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6a386ab4f7eb..252bc09bfaba 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -35,6 +35,7 @@ struct pgpath {
struct dm_path path;
struct work_struct deactivate_path;
+ struct work_struct activate_path;
};
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -64,8 +65,6 @@ struct multipath {
spinlock_t lock;
const char *hw_handler_name;
- struct work_struct activate_path;
- struct pgpath *pgpath_to_activate;
unsigned nr_priority_groups;
struct list_head priority_groups;
unsigned pg_init_required; /* pg_init needs calling? */
@@ -102,6 +101,7 @@ struct multipath {
struct dm_mpath_io {
struct pgpath *pgpath;
struct dm_bio_details details;
+ size_t nr_bytes;
};
typedef int (*action_fn) (struct pgpath *pgpath);
@@ -128,6 +128,7 @@ static struct pgpath *alloc_pgpath(void)
if (pgpath) {
pgpath->is_active = 1;
INIT_WORK(&pgpath->deactivate_path, deactivate_path);
+ INIT_WORK(&pgpath->activate_path, activate_path);
}
return pgpath;
@@ -160,7 +161,6 @@ static struct priority_group *alloc_priority_group(void)
static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
- unsigned long flags;
struct pgpath *pgpath, *tmp;
struct multipath *m = ti->private;
@@ -169,10 +169,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
if (m->hw_handler_name)
scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
dm_put_device(ti, pgpath->path.dev);
- spin_lock_irqsave(&m->lock, flags);
- if (m->pgpath_to_activate == pgpath)
- m->pgpath_to_activate = NULL;
- spin_unlock_irqrestore(&m->lock, flags);
free_pgpath(pgpath);
}
}
@@ -202,7 +198,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->queue_io = 1;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
INIT_WORK(&m->trigger_event, trigger_event);
- INIT_WORK(&m->activate_path, activate_path);
m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
@@ -250,11 +245,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
m->pg_init_count = 0;
}
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
+ size_t nr_bytes)
{
struct dm_path *path;
- path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
+ path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
if (!path)
return -ENXIO;
@@ -266,7 +262,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
return 0;
}
-static void __choose_pgpath(struct multipath *m)
+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
{
struct priority_group *pg;
unsigned bypassed = 1;
@@ -278,12 +274,12 @@ static void __choose_pgpath(struct multipath *m)
if (m->next_pg) {
pg = m->next_pg;
m->next_pg = NULL;
- if (!__choose_path_in_pg(m, pg))
+ if (!__choose_path_in_pg(m, pg, nr_bytes))
return;
}
/* Don't change PG until it has no remaining paths */
- if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
+ if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
return;
/*
@@ -295,7 +291,7 @@ static void __choose_pgpath(struct multipath *m)
list_for_each_entry(pg, &m->priority_groups, list) {
if (pg->bypassed == bypassed)
continue;
- if (!__choose_path_in_pg(m, pg))
+ if (!__choose_path_in_pg(m, pg, nr_bytes))
return;
}
} while (bypassed--);
@@ -326,6 +322,7 @@ static int map_io(struct multipath *m, struct bio *bio,
struct dm_mpath_io *mpio, unsigned was_queued)
{
int r = DM_MAPIO_REMAPPED;
+ size_t nr_bytes = bio->bi_size;
unsigned long flags;
struct pgpath *pgpath;
@@ -334,7 +331,7 @@ static int map_io(struct multipath *m, struct bio *bio,
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
(!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
- __choose_pgpath(m);
+ __choose_pgpath(m, nr_bytes);
pgpath = m->current_pgpath;
@@ -359,6 +356,11 @@ static int map_io(struct multipath *m, struct bio *bio,
r = -EIO; /* Failed */
mpio->pgpath = pgpath;
+ mpio->nr_bytes = nr_bytes;
+
+ if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
+ pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
+ nr_bytes);
spin_unlock_irqrestore(&m->lock, flags);
@@ -427,8 +429,8 @@ static void process_queued_ios(struct work_struct *work)
{
struct multipath *m =
container_of(work, struct multipath, process_queued_ios);
- struct pgpath *pgpath = NULL;
- unsigned init_required = 0, must_queue = 1;
+ struct pgpath *pgpath = NULL, *tmp;
+ unsigned must_queue = 1;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
@@ -437,7 +439,7 @@ static void process_queued_ios(struct work_struct *work)
goto out;
if (!m->current_pgpath)
- __choose_pgpath(m);
+ __choose_pgpath(m, 0);
pgpath = m->current_pgpath;
@@ -446,19 +448,15 @@ static void process_queued_ios(struct work_struct *work)
must_queue = 0;
if (m->pg_init_required && !m->pg_init_in_progress && pgpath) {
- m->pgpath_to_activate = pgpath;
m->pg_init_count++;
m->pg_init_required = 0;
- m->pg_init_in_progress = 1;
- init_required = 1;
+ list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
+ queue_work(kmpath_handlerd, &tmp->activate_path);
+ m->pg_init_in_progress++;
+ }
}
-
out:
spin_unlock_irqrestore(&m->lock, flags);
-
- if (init_required)
- queue_work(kmpath_handlerd, &m->activate_path);
-
if (!must_queue)
dispatch_queued_ios(m);
}
@@ -553,6 +551,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
return -EINVAL;
}
+ if (ps_argc > as->argc) {
+ dm_put_path_selector(pst);
+ ti->error = "not enough arguments for path selector";
+ return -EINVAL;
+ }
+
r = pst->create(&pg->ps, ps_argc, as->argv);
if (r) {
dm_put_path_selector(pst);
@@ -591,9 +595,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
}
if (m->hw_handler_name) {
- r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev),
- m->hw_handler_name);
+ struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
+
+ r = scsi_dh_attach(q, m->hw_handler_name);
+ if (r == -EBUSY) {
+ /*
+ * Already attached to different hw_handler,
+ * try to reattach with correct one.
+ */
+ scsi_dh_detach(q);
+ r = scsi_dh_attach(q, m->hw_handler_name);
+ }
+
if (r < 0) {
+ ti->error = "error attaching hardware handler";
dm_put_device(ti, p->path.dev);
goto bad;
}
@@ -699,6 +714,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
if (!hw_argc)
return 0;
+ if (hw_argc > as->argc) {
+ ti->error = "not enough arguments for hardware handler";
+ return -EINVAL;
+ }
+
m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
request_module("scsi_dh_%s", m->hw_handler_name);
if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
@@ -823,6 +843,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
goto bad;
}
+ ti->num_flush_requests = 1;
+
return 0;
bad:
@@ -836,6 +858,7 @@ static void multipath_dtr(struct dm_target *ti)
flush_workqueue(kmpath_handlerd);
flush_workqueue(kmultipathd);
+ flush_scheduled_work();
free_multipath(m);
}
@@ -924,9 +947,13 @@ static int reinstate_path(struct pgpath *pgpath)
pgpath->is_active = 1;
- m->current_pgpath = NULL;
- if (!m->nr_valid_paths++ && m->queue_size)
+ if (!m->nr_valid_paths++ && m->queue_size) {
+ m->current_pgpath = NULL;
queue_work(kmultipathd, &m->process_queued_ios);
+ } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
+ m->pg_init_in_progress++;
+ queue_work(kmpath_handlerd, &pgpath->activate_path);
+ }
dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
pgpath->path.dev->name, m->nr_valid_paths);
@@ -1102,35 +1129,30 @@ static void pg_init_done(struct dm_path *path, int errors)
spin_lock_irqsave(&m->lock, flags);
if (errors) {
- DMERR("Could not failover device. Error %d.", errors);
- m->current_pgpath = NULL;
- m->current_pg = NULL;
+ if (pgpath == m->current_pgpath) {
+ DMERR("Could not failover device. Error %d.", errors);
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
} else if (!m->pg_init_required) {
m->queue_io = 0;
pg->bypassed = 0;
}
- m->pg_init_in_progress = 0;
- queue_work(kmultipathd, &m->process_queued_ios);
+ m->pg_init_in_progress--;
+ if (!m->pg_init_in_progress)
+ queue_work(kmultipathd, &m->process_queued_ios);
spin_unlock_irqrestore(&m->lock, flags);
}
static void activate_path(struct work_struct *work)
{
int ret;
- struct multipath *m =
- container_of(work, struct multipath, activate_path);
- struct dm_path *path;
- unsigned long flags;
+ struct pgpath *pgpath =
+ container_of(work, struct pgpath, activate_path);
- spin_lock_irqsave(&m->lock, flags);
- path = &m->pgpath_to_activate->path;
- m->pgpath_to_activate = NULL;
- spin_unlock_irqrestore(&m->lock, flags);
- if (!path)
- return;
- ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
- pg_init_done(path, ret);
+ ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev));
+ pg_init_done(&pgpath->path, ret);
}
/*
@@ -1195,7 +1217,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
- ps->type->end_io(ps, &pgpath->path);
+ ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
if (r != DM_ENDIO_INCOMPLETE)
mempool_free(mpio, m->mpio_pool);
@@ -1411,7 +1433,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pgpath)
- __choose_pgpath(m);
+ __choose_pgpath(m, 0);
if (m->current_pgpath) {
bdev = m->current_pgpath->path.dev->bdev;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 27357b85d73d..e7d1fa8b0459 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
* the path fails.
*/
struct dm_path *(*select_path) (struct path_selector *ps,
- unsigned *repeat_count);
+ unsigned *repeat_count,
+ size_t nr_bytes);
/*
* Notify the selector that a path has failed.
@@ -75,7 +76,10 @@ struct path_selector_type {
int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);
- int (*end_io) (struct path_selector *ps, struct dm_path *path);
+ int (*start_io) (struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes);
+ int (*end_io) (struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes);
};
/* Register a path selector */
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
new file mode 100644
index 000000000000..f92b6cea9d9c
--- /dev/null
+++ b/drivers/md/dm-queue-length.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector - choose a path with the least number of
+ * in-flight I/Os.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+
+#define DM_MSG_PREFIX "multipath queue-length"
+#define QL_MIN_IO 128
+#define QL_VERSION "0.1.0"
+
+struct selector {
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned repeat_count;
+ atomic_t qlen; /* the number of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+ }
+
+ return s;
+}
+
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector *s = alloc_selector();
+
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+ return 0;
+}
+
+static void ql_free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ kfree(pi);
+ }
+}
+
+static void ql_destroy(struct path_selector *ps)
+{
+ struct selector *s = ps->context;
+
+ ql_free_paths(&s->valid_paths);
+ ql_free_paths(&s->failed_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned maxlen)
+{
+ unsigned sz = 0;
+ struct path_info *pi;
+
+ /* When called with NULL path, return selector status/args. */
+ if (!path)
+ DMEMIT("0 ");
+ else {
+ pi = path->pscontext;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%d ", atomic_read(&pi->qlen));
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u ", pi->repeat_count);
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi;
+ unsigned repeat_count = QL_MIN_IO;
+
+ /*
+ * Arguments: [<repeat_count>]
+ * <repeat_count>: The number of I/Os before switching path.
+ * If not given, default (QL_MIN_IO) is used.
+ */
+ if (argc > 1) {
+ *error = "queue-length ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ *error = "queue-length ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ /* Allocate the path information structure */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "queue-length ps: Error allocating path information";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+ atomic_set(&pi->qlen, 0);
+
+ path->pscontext = pi;
+
+ list_add_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move(&pi->list, &s->failed_paths);
+}
+
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+/*
+ * Select a path having the minimum number of in-flight I/Os
+ */
+static struct dm_path *ql_select_path(struct path_selector *ps,
+ unsigned *repeat_count, size_t nr_bytes)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = NULL, *best = NULL;
+
+ if (list_empty(&s->valid_paths))
+ return NULL;
+
+ /* Change preferred (first in list) path to evenly balance. */
+ list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+ list_for_each_entry(pi, &s->valid_paths, list) {
+ if (!best ||
+ (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
+ best = pi;
+
+ if (!atomic_read(&best->qlen))
+ break;
+ }
+
+ if (!best)
+ return NULL;
+
+ *repeat_count = best->repeat_count;
+
+ return best->path;
+}
+
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_inc(&pi->qlen);
+
+ return 0;
+}
+
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_dec(&pi->qlen);
+
+ return 0;
+}
+
+static struct path_selector_type ql_ps = {
+ .name = "queue-length",
+ .module = THIS_MODULE,
+ .table_args = 1,
+ .info_args = 1,
+ .create = ql_create,
+ .destroy = ql_destroy,
+ .status = ql_status,
+ .add_path = ql_add_path,
+ .fail_path = ql_fail_path,
+ .reinstate_path = ql_reinstate_path,
+ .select_path = ql_select_path,
+ .start_io = ql_start_io,
+ .end_io = ql_end_io,
+};
+
+static int __init dm_ql_init(void)
+{
+ int r = dm_register_path_selector(&ql_ps);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ DMINFO("version " QL_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit dm_ql_exit(void)
+{
+ int r = dm_unregister_path_selector(&ql_ps);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+}
+
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+ "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
+ DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7b899be0b087..36dbe29f2fd6 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
if (unlikely(!nreg))
- nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
+ nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index cdfbf65b28cb..24752f449bef 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
}
static struct dm_path *rr_select_path(struct path_selector *ps,
- unsigned *repeat_count)
+ unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = NULL;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2662a41337e7..6e3fe4f14934 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
/*
* Commit exceptions to disk.
*/
- if (ps->valid && area_io(ps, WRITE))
+ if (ps->valid && area_io(ps, WRITE_BARRIER))
ps->valid = 0;
/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d73f17fc7778..d573165cd2b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = s;
ti->split_io = s->store->chunk_size;
+ ti->num_flush_requests = 1;
return 0;
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
+ if (unlikely(bio_empty_barrier(bio))) {
+ bio->bi_bdev = s->store->cow->bdev;
+ return DM_MAPIO_REMAPPED;
+ }
+
chunk = sector_to_chunk(s->store, bio->bi_sector);
/* Full snapshots are not usable */
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ti->private = dev;
+ ti->num_flush_requests = 1;
+
return 0;
}
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
struct dm_dev *dev = ti->private;
bio->bi_bdev = dev->bdev;
+ if (unlikely(bio_empty_barrier(bio)))
+ return DM_MAPIO_REMAPPED;
+
/* Only tell snapshots if this is a write */
return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 41569bc60abc..c64fe827a5f1 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sc->stripes = stripes;
sc->stripe_width = width;
ti->split_io = chunk_size;
+ ti->num_flush_requests = stripes;
sc->chunk_mask = ((sector_t) chunk_size) - 1;
for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
struct stripe_c *sc = (struct stripe_c *) ti->private;
+ sector_t offset, chunk;
+ uint32_t stripe;
- sector_t offset = bio->bi_sector - ti->begin;
- sector_t chunk = offset >> sc->chunk_shift;
- uint32_t stripe = sector_div(chunk, sc->stripes);
+ if (unlikely(bio_empty_barrier(bio))) {
+ BUG_ON(map_context->flush_request >= sc->stripes);
+ bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
+ return DM_MAPIO_REMAPPED;
+ }
+
+ offset = bio->bi_sector - ti->begin;
+ chunk = offset >> sc->chunk_shift;
+ stripe = sector_div(chunk, sc->stripes);
bio->bi_bdev = sc->stripe[stripe].dev->bdev;
bio->bi_sector = sc->stripe[stripe].physical_start +
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index a2a45e6c7c8b..4b045903a4e2 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
return strlen(buf);
}
+static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
+{
+ sprintf(buf, "%d\n", dm_suspended(md));
+
+ return strlen(buf);
+}
+
static DM_ATTR_RO(name);
static DM_ATTR_RO(uuid);
+static DM_ATTR_RO(suspended);
static struct attribute *dm_attrs[] = {
&dm_attr_name.attr,
&dm_attr_uuid.attr,
+ &dm_attr_suspended.attr,
NULL,
};
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e9a73bb242b0..e3bcfb8b15a1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -267,6 +267,8 @@ static void free_devices(struct list_head *devices)
list_for_each_safe(tmp, next, devices) {
struct dm_dev_internal *dd =
list_entry(tmp, struct dm_dev_internal, list);
+ DMWARN("dm_table_destroy: dm_put_device call missing for %s",
+ dd->dm_dev.name);
kfree(dd);
}
}
@@ -296,12 +298,8 @@ void dm_table_destroy(struct dm_table *t)
vfree(t->highs);
/* free the device list */
- if (t->devices.next != &t->devices) {
- DMWARN("devices still present during destroy: "
- "dm_table_remove_device calls missing");
-
+ if (t->devices.next != &t->devices)
free_devices(&t->devices);
- }
kfree(t);
}
@@ -385,15 +383,45 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
/*
* If possible, this checks an area of a destination device is valid.
*/
-static int check_device_area(struct dm_dev_internal *dd, sector_t start,
- sector_t len)
+static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev,
+ sector_t start, sector_t len)
{
- sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT;
+ sector_t dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+ unsigned short logical_block_size_sectors =
+ ti->limits.logical_block_size >> SECTOR_SHIFT;
+ char b[BDEVNAME_SIZE];
if (!dev_size)
return 1;
- return ((start < dev_size) && (len <= (dev_size - start)));
+ if ((start >= dev_size) || (start + len > dev_size)) {
+ DMWARN("%s: %s too small for target",
+ dm_device_name(ti->table->md), bdevname(bdev, b));
+ return 0;
+ }
+
+ if (logical_block_size_sectors <= 1)
+ return 1;
+
+ if (start & (logical_block_size_sectors - 1)) {
+ DMWARN("%s: start=%llu not aligned to h/w "
+ "logical block size %hu of %s",
+ dm_device_name(ti->table->md),
+ (unsigned long long)start,
+ ti->limits.logical_block_size, bdevname(bdev, b));
+ return 0;
+ }
+
+ if (len & (logical_block_size_sectors - 1)) {
+ DMWARN("%s: len=%llu not aligned to h/w "
+ "logical block size %hu of %s",
+ dm_device_name(ti->table->md),
+ (unsigned long long)len,
+ ti->limits.logical_block_size, bdevname(bdev, b));
+ return 0;
+ }
+
+ return 1;
}
/*
@@ -479,14 +507,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
}
atomic_inc(&dd->count);
- if (!check_device_area(dd, start, len)) {
- DMWARN("device %s too small for target", path);
- dm_put_device(ti, &dd->dm_dev);
- return -EINVAL;
- }
-
*result = &dd->dm_dev;
-
return 0;
}
@@ -555,8 +576,16 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
int r = __table_get_device(ti->table, ti, path,
start, len, mode, result);
- if (!r)
- dm_set_device_limits(ti, (*result)->bdev);
+ if (r)
+ return r;
+
+ dm_set_device_limits(ti, (*result)->bdev);
+
+ if (!device_area_is_valid(ti, (*result)->bdev, start, len)) {
+ dm_put_device(ti, *result);
+ *result = NULL;
+ return -EINVAL;
+ }
return r;
}
@@ -695,6 +724,71 @@ static void check_for_valid_limits(struct io_restrictions *rs)
rs->bounce_pfn = -1;
}
+/*
+ * Impose necessary and sufficient conditions on a devices's table such
+ * that any incoming bio which respects its logical_block_size can be
+ * processed successfully. If it falls across the boundary between
+ * two or more targets, the size of each piece it gets split into must
+ * be compatible with the logical_block_size of the target processing it.
+ */
+static int validate_hardware_logical_block_alignment(struct dm_table *table)
+{
+ /*
+ * This function uses arithmetic modulo the logical_block_size
+ * (in units of 512-byte sectors).
+ */
+ unsigned short device_logical_block_size_sects =
+ table->limits.logical_block_size >> SECTOR_SHIFT;
+
+ /*
+ * Offset of the start of the next table entry, mod logical_block_size.
+ */
+ unsigned short next_target_start = 0;
+
+ /*
+ * Given an aligned bio that extends beyond the end of a
+ * target, how many sectors must the next target handle?
+ */
+ unsigned short remaining = 0;
+
+ struct dm_target *uninitialized_var(ti);
+ unsigned i = 0;
+
+ /*
+ * Check each entry in the table in turn.
+ */
+ while (i < dm_table_get_num_targets(table)) {
+ ti = dm_table_get_target(table, i++);
+
+ /*
+ * If the remaining sectors fall entirely within this
+ * table entry are they compatible with its logical_block_size?
+ */
+ if (remaining < ti->len &&
+ remaining & ((ti->limits.logical_block_size >>
+ SECTOR_SHIFT) - 1))
+ break; /* Error */
+
+ next_target_start =
+ (unsigned short) ((next_target_start + ti->len) &
+ (device_logical_block_size_sects - 1));
+ remaining = next_target_start ?
+ device_logical_block_size_sects - next_target_start : 0;
+ }
+
+ if (remaining) {
+ DMWARN("%s: table line %u (start sect %llu len %llu) "
+ "not aligned to hardware logical block size %hu",
+ dm_device_name(table->md), i,
+ (unsigned long long) ti->begin,
+ (unsigned long long) ti->len,
+ table->limits.logical_block_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
int dm_table_add_target(struct dm_table *t, const char *type,
sector_t start, sector_t len, char *params)
{
@@ -794,6 +888,10 @@ int dm_table_complete(struct dm_table *t)
check_for_valid_limits(&t->limits);
+ r = validate_hardware_logical_block_alignment(t);
+ if (r)
+ return r;
+
/* how many indexes will the btree have ? */
leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 424f7b048c30..0f17c35592da 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,13 @@
#define DM_MSG_PREFIX "core"
+/*
+ * Cookies are numeric values sent with CHANGE and REMOVE
+ * uevents while resuming, removing or renaming the device.
+ */
+#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
+#define DM_COOKIE_LENGTH 24
+
static const char *_name = DM_NAME;
static unsigned int major = 0;
@@ -159,13 +166,16 @@ struct mapped_device {
* freeze/thaw support require holding onto a super block
*/
struct super_block *frozen_sb;
- struct block_device *suspended_bdev;
+ struct block_device *bdev;
/* forced geometry settings */
struct hd_geometry geometry;
/* sysfs handle */
struct kobject kobj;
+
+ /* zero-length barrier that will be cloned and submitted to targets */
+ struct bio barrier_bio;
};
#define MIN_IOS 256
@@ -393,11 +403,6 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
mempool_free(io, md->io_pool);
}
-static struct dm_target_io *alloc_tio(struct mapped_device *md)
-{
- return mempool_alloc(md->tio_pool, GFP_NOIO);
-}
-
static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
mempool_free(tio, md->tio_pool);
@@ -538,9 +543,11 @@ static void dec_pending(struct dm_io *io, int error)
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
- if (__noflush_suspending(md))
- bio_list_add_head(&md->deferred, io->bio);
- else
+ if (__noflush_suspending(md)) {
+ if (!bio_barrier(io->bio))
+ bio_list_add_head(&md->deferred,
+ io->bio);
+ } else
/* noflush suspend was interrupted. */
io->error = -EIO;
spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -555,7 +562,8 @@ static void dec_pending(struct dm_io *io, int error)
* a per-device variable for error reporting.
* Note that you can't touch the bio after end_io_acct
*/
- md->barrier_error = io_error;
+ if (!md->barrier_error && io_error != -EOPNOTSUPP)
+ md->barrier_error = io_error;
end_io_acct(io);
} else {
end_io_acct(io);
@@ -636,11 +644,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
sector_t sector;
struct mapped_device *md;
- /*
- * Sanity checks.
- */
- BUG_ON(!clone->bi_size);
-
clone->bi_end_io = clone_endio;
clone->bi_private = tio;
@@ -755,6 +758,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
return clone;
}
+static struct dm_target_io *alloc_tio(struct clone_info *ci,
+ struct dm_target *ti)
+{
+ struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
+
+ tio->io = ci->io;
+ tio->ti = ti;
+ memset(&tio->info, 0, sizeof(tio->info));
+
+ return tio;
+}
+
+static void __flush_target(struct clone_info *ci, struct dm_target *ti,
+ unsigned flush_nr)
+{
+ struct dm_target_io *tio = alloc_tio(ci, ti);
+ struct bio *clone;
+
+ tio->info.flush_request = flush_nr;
+
+ clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
+ __bio_clone(clone, ci->bio);
+ clone->bi_destructor = dm_bio_destructor;
+
+ __map_bio(ti, clone, tio);
+}
+
+static int __clone_and_map_empty_barrier(struct clone_info *ci)
+{
+ unsigned target_nr = 0, flush_nr;
+ struct dm_target *ti;
+
+ while ((ti = dm_table_get_target(ci->map, target_nr++)))
+ for (flush_nr = 0; flush_nr < ti->num_flush_requests;
+ flush_nr++)
+ __flush_target(ci, ti, flush_nr);
+
+ ci->sector_count = 0;
+
+ return 0;
+}
+
static int __clone_and_map(struct clone_info *ci)
{
struct bio *clone, *bio = ci->bio;
@@ -762,6 +807,9 @@ static int __clone_and_map(struct clone_info *ci)
sector_t len = 0, max;
struct dm_target_io *tio;
+ if (unlikely(bio_empty_barrier(bio)))
+ return __clone_and_map_empty_barrier(ci);
+
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
@@ -771,10 +819,7 @@ static int __clone_and_map(struct clone_info *ci)
/*
* Allocate a target io object.
*/
- tio = alloc_tio(ci->md);
- tio->io = ci->io;
- tio->ti = ti;
- memset(&tio->info, 0, sizeof(tio->info));
+ tio = alloc_tio(ci, ti);
if (ci->sector_count <= max) {
/*
@@ -830,10 +875,7 @@ static int __clone_and_map(struct clone_info *ci)
max = max_io_len(ci->md, ci->sector, ti);
- tio = alloc_tio(ci->md);
- tio->io = ci->io;
- tio->ti = ti;
- memset(&tio->info, 0, sizeof(tio->info));
+ tio = alloc_tio(ci, ti);
}
len = min(remaining, max);
@@ -868,7 +910,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
if (!bio_barrier(bio))
bio_io_error(bio);
else
- md->barrier_error = -EIO;
+ if (!md->barrier_error)
+ md->barrier_error = -EIO;
return;
}
@@ -881,6 +924,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
ci.io->md = md;
ci.sector = bio->bi_sector;
ci.sector_count = bio_sectors(bio);
+ if (unlikely(bio_empty_barrier(bio)))
+ ci.sector_count = 1;
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
@@ -928,6 +973,16 @@ static int dm_merge_bvec(struct request_queue *q,
*/
if (max_size && ti->type->merge)
max_size = ti->type->merge(ti, bvm, biovec, max_size);
+ /*
+ * If the target doesn't support merge method and some of the devices
+ * provided their merge_bvec method (we know this by looking at
+ * queue_max_hw_sectors), then we can't allow bios with multiple vector
+ * entries. So always set max_size to 0, and the code below allows
+ * just one page.
+ */
+ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
+
+ max_size = 0;
out_table:
dm_table_put(map);
@@ -1173,6 +1228,10 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->wq)
goto bad_thread;
+ md->bdev = bdget_disk(md->disk, 0);
+ if (!md->bdev)
+ goto bad_bdev;
+
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor);
@@ -1182,6 +1241,8 @@ static struct mapped_device *alloc_dev(int minor)
return md;
+bad_bdev:
+ destroy_workqueue(md->wq);
bad_thread:
put_disk(md->disk);
bad_disk:
@@ -1207,10 +1268,8 @@ static void free_dev(struct mapped_device *md)
{
int minor = MINOR(disk_devt(md->disk));
- if (md->suspended_bdev) {
- unlock_fs(md);
- bdput(md->suspended_bdev);
- }
+ unlock_fs(md);
+ bdput(md->bdev);
destroy_workqueue(md->wq);
mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool);
@@ -1252,9 +1311,9 @@ static void __set_size(struct mapped_device *md, sector_t size)
{
set_capacity(md->disk, size);
- mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
- i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
- mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
+ mutex_lock(&md->bdev->bd_inode->i_mutex);
+ i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+ mutex_unlock(&md->bdev->bd_inode->i_mutex);
}
static int __bind(struct mapped_device *md, struct dm_table *t)
@@ -1270,8 +1329,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
if (size != get_capacity(md->disk))
memset(&md->geometry, 0, sizeof(md->geometry));
- if (md->suspended_bdev)
- __set_size(md, size);
+ __set_size(md, size);
if (!size) {
dm_table_destroy(t);
@@ -1429,34 +1487,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
return r;
}
-static int dm_flush(struct mapped_device *md)
+static void dm_flush(struct mapped_device *md)
{
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
- return 0;
+
+ bio_init(&md->barrier_bio);
+ md->barrier_bio.bi_bdev = md->bdev;
+ md->barrier_bio.bi_rw = WRITE_BARRIER;
+ __split_and_process_bio(md, &md->barrier_bio);
+
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
static void process_barrier(struct mapped_device *md, struct bio *bio)
{
- int error = dm_flush(md);
+ md->barrier_error = 0;
- if (unlikely(error)) {
- bio_endio(bio, error);
- return;
- }
- if (bio_empty_barrier(bio)) {
- bio_endio(bio, 0);
- return;
- }
-
- __split_and_process_bio(md, bio);
+ dm_flush(md);
- error = dm_flush(md);
-
- if (!error && md->barrier_error)
- error = md->barrier_error;
+ if (!bio_empty_barrier(bio)) {
+ __split_and_process_bio(md, bio);
+ dm_flush(md);
+ }
if (md->barrier_error != DM_ENDIO_REQUEUE)
- bio_endio(bio, error);
+ bio_endio(bio, md->barrier_error);
+ else {
+ spin_lock_irq(&md->deferred_lock);
+ bio_list_add_head(&md->deferred, bio);
+ spin_unlock_irq(&md->deferred_lock);
+ }
}
/*
@@ -1513,11 +1573,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
if (!dm_suspended(md))
goto out;
- /* without bdev, the device size cannot be changed */
- if (!md->suspended_bdev)
- if (get_capacity(md->disk) != dm_table_get_size(table))
- goto out;
-
__unbind(md);
r = __bind(md, table);
@@ -1536,7 +1591,7 @@ static int lock_fs(struct mapped_device *md)
WARN_ON(md->frozen_sb);
- md->frozen_sb = freeze_bdev(md->suspended_bdev);
+ md->frozen_sb = freeze_bdev(md->bdev);
if (IS_ERR(md->frozen_sb)) {
r = PTR_ERR(md->frozen_sb);
md->frozen_sb = NULL;
@@ -1545,9 +1600,6 @@ static int lock_fs(struct mapped_device *md)
set_bit(DMF_FROZEN, &md->flags);
- /* don't bdput right now, we don't want the bdev
- * to go away while it is locked.
- */
return 0;
}
@@ -1556,7 +1608,7 @@ static void unlock_fs(struct mapped_device *md)
if (!test_bit(DMF_FROZEN, &md->flags))
return;
- thaw_bdev(md->suspended_bdev, md->frozen_sb);
+ thaw_bdev(md->bdev, md->frozen_sb);
md->frozen_sb = NULL;
clear_bit(DMF_FROZEN, &md->flags);
}
@@ -1594,24 +1646,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
/* This does not get reverted if there's an error later. */
dm_table_presuspend_targets(map);
- /* bdget() can stall if the pending I/Os are not flushed */
- if (!noflush) {
- md->suspended_bdev = bdget_disk(md->disk, 0);
- if (!md->suspended_bdev) {
- DMWARN("bdget failed in dm_suspend");
- r = -ENOMEM;
+ /*
+ * Flush I/O to the device. noflush supersedes do_lockfs,
+ * because lock_fs() needs to flush I/Os.
+ */
+ if (!noflush && do_lockfs) {
+ r = lock_fs(md);
+ if (r)
goto out;
- }
-
- /*
- * Flush I/O to the device. noflush supersedes do_lockfs,
- * because lock_fs() needs to flush I/Os.
- */
- if (do_lockfs) {
- r = lock_fs(md);
- if (r)
- goto out;
- }
}
/*
@@ -1668,11 +1710,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
set_bit(DMF_SUSPENDED, &md->flags);
out:
- if (r && md->suspended_bdev) {
- bdput(md->suspended_bdev);
- md->suspended_bdev = NULL;
- }
-
dm_table_put(map);
out_unlock:
@@ -1701,19 +1738,10 @@ int dm_resume(struct mapped_device *md)
unlock_fs(md);
- if (md->suspended_bdev) {
- bdput(md->suspended_bdev);
- md->suspended_bdev = NULL;
- }
-
clear_bit(DMF_SUSPENDED, &md->flags);
dm_table_unplug_all(map);
-
- dm_kobject_uevent(md);
-
r = 0;
-
out:
dm_table_put(map);
mutex_unlock(&md->suspend_lock);
@@ -1724,9 +1752,19 @@ out:
/*-----------------------------------------------------------------
* Event notification.
*---------------------------------------------------------------*/
-void dm_kobject_uevent(struct mapped_device *md)
-{
- kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+ unsigned cookie)
+{
+ char udev_cookie[DM_COOKIE_LENGTH];
+ char *envp[] = { udev_cookie, NULL };
+
+ if (!cookie)
+ kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
+ else {
+ snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
+ DM_COOKIE_ENV_VAR_NAME, cookie);
+ kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
+ }
}
uint32_t dm_next_uevent_seq(struct mapped_device *md)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a31506d93e91..b5935c610c44 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -92,7 +92,8 @@ void dm_stripe_exit(void);
int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md);
-void dm_kobject_uevent(struct mapped_device *md);
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+ unsigned cookie);
int dm_kcopyd_init(void);
void dm_kcopyd_exit(void);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 49c2362977fd..fc36a4d07723 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -21,6 +21,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
union map_info {
void *ptr;
unsigned long long ll;
+ unsigned flush_request;
};
/*
@@ -168,6 +169,16 @@ struct dm_target {
sector_t split_io;
/*
+ * A number of zero-length barrier requests that will be submitted
+ * to the target for the purpose of flushing cache.
+ *
+ * The request number will be placed in union map_info->flush_request.
+ * It is a responsibility of the target driver to remap these requests
+ * to the real underlying devices.
+ */
+ unsigned num_flush_requests;
+
+ /*
* These are automatically filled in by
* dm_table_get_device.
*/
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 48e44ee2b466..2ab84c83c31a 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -123,6 +123,16 @@ struct dm_ioctl {
__u32 target_count; /* in/out */
__s32 open_count; /* out */
__u32 flags; /* in/out */
+
+ /*
+ * event_nr holds either the event number (input and output) or the
+ * udev cookie value (input only).
+ * The DM_DEV_WAIT ioctl takes an event number as input.
+ * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls
+ * use the field as a cookie to return in the DM_COOKIE
+ * variable with the uevents they issue.
+ * For output, the ioctls return the event number, not the cookie.
+ */
__u32 event_nr; /* in/out */
__u32 padding;
@@ -256,9 +266,9 @@ enum {
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
-#define DM_VERSION_MINOR 14
+#define DM_VERSION_MINOR 15
#define DM_VERSION_PATCHLEVEL 0
-#define DM_VERSION_EXTRA "-ioctl (2008-04-23)"
+#define DM_VERSION_EXTRA "-ioctl (2009-04-01)"
/* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */