diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2009-06-03 13:43:25 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2009-06-03 13:43:25 +1000 |
commit | fe371fb276db6ee6181205c26140ebf6bf09b117 (patch) | |
tree | b8b8289f815d1ca00cd2b0acd6b50cee52e58248 | |
parent | 75f71da8f65355f4a300542f706aa61e3b7d2453 (diff) | |
parent | c855427a5ff13dc5b8aa8e72ed1495e9aac04abb (diff) |
Merge branch 'quilt/device-mapper'
-rw-r--r-- | Documentation/device-mapper/dm-queue-length.txt | 39 | ||||
-rw-r--r-- | drivers/md/Kconfig | 9 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 14 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 14 | ||||
-rw-r--r-- | drivers/md/dm-linear.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 120 | ||||
-rw-r--r-- | drivers/md/dm-path-selector.h | 8 | ||||
-rw-r--r-- | drivers/md/dm-queue-length.c | 263 | ||||
-rw-r--r-- | drivers/md/dm-region-hash.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-round-robin.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 15 | ||||
-rw-r--r-- | drivers/md/dm-sysfs.c | 9 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 134 | ||||
-rw-r--r-- | drivers/md/dm.c | 228 | ||||
-rw-r--r-- | drivers/md/dm.h | 3 | ||||
-rw-r--r-- | include/linux/device-mapper.h | 11 | ||||
-rw-r--r-- | include/linux/dm-ioctl.h | 14 |
24 files changed, 738 insertions, 183 deletions
diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt new file mode 100644 index 000000000000..f4db2562175c --- /dev/null +++ b/Documentation/device-mapper/dm-queue-length.txt @@ -0,0 +1,39 @@ +dm-queue-length +=============== + +dm-queue-length is a path selector module for device-mapper targets, +which selects a path with the least number of in-flight I/Os. +The path selector name is 'queue-length'. + +Table parameters for each path: [<repeat_count>] + <repeat_count>: The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + +Status for each path: <status> <fail-count> <in-flight> + <status>: 'A' if the path is active, 'F' if the path is failed. + <fail-count>: The number of path failures. + <in-flight>: The number of in-flight I/Os on the path. + + +Algorithm +========= + +dm-queue-length increments/decrements 'in-flight' when an I/O is +dispatched/completed respectively. +dm-queue-length selects a path with the minimum 'in-flight'. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128. + +# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ + dmsetup create test +# +# dmsetup table +test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 +# +# dmsetup status +test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 36e0675be9f7..3b311d273346 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -249,6 +249,15 @@ config DM_MULTIPATH ---help--- Allow volume managers to support multipath hardware. +config DM_MULTIPATH_QL + tristate "I/O Path Selector based on the number of in-flight I/Os" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path with the least number of in-flight I/Os. + + If unsure, say N. + config DM_DELAY tristate "I/O delaying target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 45cc5951d928..ff9f545dd516 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_ZERO) += dm-zero.o diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53394e863c74..04db6c4004a8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_crypt_queue; } + ti->num_flush_requests = 1; ti->private = cc; return 0; @@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { struct dm_crypt_io *io; + struct crypt_config *cc; + + if (unlikely(bio_empty_barrier(bio))) { + cc = ti->private; + bio->bi_bdev = cc->dev->bdev; + return DM_MAPIO_REMAPPED; + } io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 559dbb52bc85..8ad8a9044bbf 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -197,6 +197,7 @@ out: mutex_init(&dc->timer_lock); atomic_set(&dc->may_delay, 1); + ti->num_flush_requests = 1; ti->private = dc; return 0; @@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio, if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { bio->bi_bdev = dc->dev_write->bdev; - bio->bi_sector = dc->start_write + - (bio->bi_sector - ti->begin); + if (bio_sectors(bio)) + bio->bi_sector = dc->start_write + + (bio->bi_sector - ti->begin); return delay_bio(dc, dc->write_delay, bio); } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 0a2e6e7f67b3..96a796be7c77 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) */ static inline sector_t get_dev_size(struct block_device *bdev) { - return bdev->bd_inode->i_size >> SECTOR_SHIFT; + return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; } static inline chunk_t sector_to_chunk(struct dm_exception_store *store, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e73aabd61cd7..3a2e6a2f8bdd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -22,6 +22,7 @@ struct dm_io_client { /* FIXME: can we shrink this ? */ struct io { unsigned long error_bits; + unsigned long eopnotsupp_bits; atomic_t count; struct task_struct *sleeper; struct dm_io_client *client; @@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio) *---------------------------------------------------------------*/ static void dec_count(struct io *io, unsigned int region, int error) { - if (error) + if (error) { set_bit(region, &io->error_bits); + if (error == -EOPNOTSUPP) + set_bit(region, &io->eopnotsupp_bits); + } if (atomic_dec_and_test(&io->count)) { if (io->sleeper) @@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return -EIO; } +retry: io.error_bits = 0; + io.eopnotsupp_bits = 0; atomic_set(&io.count, 1); /* see dispatch_io() */ io.sleeper = current; io.client = client; @@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, } set_current_state(TASK_RUNNING); + if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { + rw &= ~(1 << BIO_RW_BARRIER); + goto retry; + } + if (error_bits) *error_bits = io.error_bits; @@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; + io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = NULL; io->client = client; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 823ceba6efa8..410f2d058902 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -276,7 +276,7 @@ retry: up_write(&_hash_lock); } -static int dm_hash_rename(const char *old, const char *new) +static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) { char *new_name, *old_name; struct hash_cell *hc; @@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) dm_table_put(table); } - dm_kobject_uevent(hc->md); + dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); dm_put(hc->md); up_write(&_hash_lock); @@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) __hash_remove(hc); up_write(&_hash_lock); + + dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); + dm_put(md); param->data_size = 0; return 0; @@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) return r; param->data_size = 0; - return dm_hash_rename(param->name, new_name); + return dm_hash_rename(param->event_nr, param->name, new_name); } static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) @@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) if (dm_suspended(md)) r = dm_resume(md); - if (!r) + + if (!r) { + dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); r = __dev_status(md, param); + } dm_put(md); return r; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 79fb53e51c70..ecbb17421da4 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + ti->num_flush_requests = 1; ti->private = lc; return 0; @@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) struct linear_c *lc = ti->private; bio->bi_bdev = lc->dev->bdev; - bio->bi_sector = linear_map_sector(ti, bio->bi_sector); + if (bio_sectors(bio)) + bio->bi_sector = linear_map_sector(ti, bio->bi_sector); } static int linear_map(struct dm_target *ti, struct bio *bio, diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 6fa8ccf91c70..6352a9ad4446 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -416,7 +416,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, bitset_size, ti->limits.logical_block_size); - if (buf_size > dev->bdev->bd_inode->i_size) { + if (buf_size > i_size_read(dev->bdev->bd_inode)) { DMWARN("log device %s too small: need %llu bytes", dev->name, (unsigned long long)buf_size); kfree(lc); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab4f7eb..252bc09bfaba 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -35,6 +35,7 @@ struct pgpath { struct dm_path path; struct work_struct deactivate_path; + struct work_struct activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -64,8 +65,6 @@ struct multipath { spinlock_t lock; const char *hw_handler_name; - struct work_struct activate_path; - struct pgpath *pgpath_to_activate; unsigned nr_priority_groups; struct list_head priority_groups; unsigned pg_init_required; /* pg_init needs calling? */ @@ -102,6 +101,7 @@ struct multipath { struct dm_mpath_io { struct pgpath *pgpath; struct dm_bio_details details; + size_t nr_bytes; }; typedef int (*action_fn) (struct pgpath *pgpath); @@ -128,6 +128,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; INIT_WORK(&pgpath->deactivate_path, deactivate_path); + INIT_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -160,7 +161,6 @@ static struct priority_group *alloc_priority_group(void) static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) { - unsigned long flags; struct pgpath *pgpath, *tmp; struct multipath *m = ti->private; @@ -169,10 +169,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) if (m->hw_handler_name) scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); dm_put_device(ti, pgpath->path.dev); - spin_lock_irqsave(&m->lock, flags); - if (m->pgpath_to_activate == pgpath) - m->pgpath_to_activate = NULL; - spin_unlock_irqrestore(&m->lock, flags); free_pgpath(pgpath); } } @@ -202,7 +198,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); - INIT_WORK(&m->activate_path, activate_path); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); @@ -250,11 +245,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) m->pg_init_count = 0; } -static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, + size_t nr_bytes) { struct dm_path *path; - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); + path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); if (!path) return -ENXIO; @@ -266,7 +262,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) return 0; } -static void __choose_pgpath(struct multipath *m) +static void __choose_pgpath(struct multipath *m, size_t nr_bytes) { struct priority_group *pg; unsigned bypassed = 1; @@ -278,12 +274,12 @@ static void __choose_pgpath(struct multipath *m) if (m->next_pg) { pg = m->next_pg; m->next_pg = NULL; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) return; } /* Don't change PG until it has no remaining paths */ - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) + if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) return; /* @@ -295,7 +291,7 @@ static void __choose_pgpath(struct multipath *m) list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) return; } } while (bypassed--); @@ -326,6 +322,7 @@ static int map_io(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio, unsigned was_queued) { int r = DM_MAPIO_REMAPPED; + size_t nr_bytes = bio->bi_size; unsigned long flags; struct pgpath *pgpath; @@ -334,7 +331,7 @@ static int map_io(struct multipath *m, struct bio *bio, /* Do we need to select a new pgpath? */ if (!m->current_pgpath || (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) - __choose_pgpath(m); + __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; @@ -359,6 +356,11 @@ static int map_io(struct multipath *m, struct bio *bio, r = -EIO; /* Failed */ mpio->pgpath = pgpath; + mpio->nr_bytes = nr_bytes; + + if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, + nr_bytes); spin_unlock_irqrestore(&m->lock, flags); @@ -427,8 +429,8 @@ static void process_queued_ios(struct work_struct *work) { struct multipath *m = container_of(work, struct multipath, process_queued_ios); - struct pgpath *pgpath = NULL; - unsigned init_required = 0, must_queue = 1; + struct pgpath *pgpath = NULL, *tmp; + unsigned must_queue = 1; unsigned long flags; spin_lock_irqsave(&m->lock, flags); @@ -437,7 +439,7 @@ static void process_queued_ios(struct work_struct *work) goto out; if (!m->current_pgpath) - __choose_pgpath(m); + __choose_pgpath(m, 0); pgpath = m->current_pgpath; @@ -446,19 +448,15 @@ static void process_queued_ios(struct work_struct *work) must_queue = 0; if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { - m->pgpath_to_activate = pgpath; m->pg_init_count++; m->pg_init_required = 0; - m->pg_init_in_progress = 1; - init_required = 1; + list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { + queue_work(kmpath_handlerd, &tmp->activate_path); + m->pg_init_in_progress++; + } } - out: spin_unlock_irqrestore(&m->lock, flags); - - if (init_required) - queue_work(kmpath_handlerd, &m->activate_path); - if (!must_queue) dispatch_queued_ios(m); } @@ -553,6 +551,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, return -EINVAL; } + if (ps_argc > as->argc) { + dm_put_path_selector(pst); + ti->error = "not enough arguments for path selector"; + return -EINVAL; + } + r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { dm_put_path_selector(pst); @@ -591,9 +595,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, } if (m->hw_handler_name) { - r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), - m->hw_handler_name); + struct request_queue *q = bdev_get_queue(p->path.dev->bdev); + + r = scsi_dh_attach(q, m->hw_handler_name); + if (r == -EBUSY) { + /* + * Already attached to different hw_handler, + * try to reattach with correct one. + */ + scsi_dh_detach(q); + r = scsi_dh_attach(q, m->hw_handler_name); + } + if (r < 0) { + ti->error = "error attaching hardware handler"; dm_put_device(ti, p->path.dev); goto bad; } @@ -699,6 +714,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) if (!hw_argc) return 0; + if (hw_argc > as->argc) { + ti->error = "not enough arguments for hardware handler"; + return -EINVAL; + } + m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); request_module("scsi_dh_%s", m->hw_handler_name); if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { @@ -823,6 +843,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } + ti->num_flush_requests = 1; + return 0; bad: @@ -836,6 +858,7 @@ static void multipath_dtr(struct dm_target *ti) flush_workqueue(kmpath_handlerd); flush_workqueue(kmultipathd); + flush_scheduled_work(); free_multipath(m); } @@ -924,9 +947,13 @@ static int reinstate_path(struct pgpath *pgpath) pgpath->is_active = 1; - m->current_pgpath = NULL; - if (!m->nr_valid_paths++ && m->queue_size) + if (!m->nr_valid_paths++ && m->queue_size) { + m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); + } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { + m->pg_init_in_progress++; + queue_work(kmpath_handlerd, &pgpath->activate_path); + } dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, pgpath->path.dev->name, m->nr_valid_paths); @@ -1102,35 +1129,30 @@ static void pg_init_done(struct dm_path *path, int errors) spin_lock_irqsave(&m->lock, flags); if (errors) { - DMERR("Could not failover device. Error %d.", errors); - m->current_pgpath = NULL; - m->current_pg = NULL; + if (pgpath == m->current_pgpath) { + DMERR("Could not failover device. Error %d.", errors); + m->current_pgpath = NULL; + m->current_pg = NULL; + } } else if (!m->pg_init_required) { m->queue_io = 0; pg->bypassed = 0; } - m->pg_init_in_progress = 0; - queue_work(kmultipathd, &m->process_queued_ios); + m->pg_init_in_progress--; + if (!m->pg_init_in_progress) + queue_work(kmultipathd, &m->process_queued_ios); spin_unlock_irqrestore(&m->lock, flags); } static void activate_path(struct work_struct *work) { int ret; - struct multipath *m = - container_of(work, struct multipath, activate_path); - struct dm_path *path; - unsigned long flags; + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path); - spin_lock_irqsave(&m->lock, flags); - path = &m->pgpath_to_activate->path; - m->pgpath_to_activate = NULL; - spin_unlock_irqrestore(&m->lock, flags); - if (!path) - return; - ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); - pg_init_done(path, ret); + ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); + pg_init_done(&pgpath->path, ret); } /* @@ -1195,7 +1217,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } if (r != DM_ENDIO_INCOMPLETE) mempool_free(mpio, m->mpio_pool); @@ -1411,7 +1433,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, spin_lock_irqsave(&m->lock, flags); if (!m->current_pgpath) - __choose_pgpath(m); + __choose_pgpath(m, 0); if (m->current_pgpath) { bdev = m->current_pgpath->path.dev->bdev; diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73d..e7d1fa8b0459 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -56,7 +56,8 @@ struct path_selector_type { * the path fails. */ struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count); + unsigned *repeat_count, + size_t nr_bytes); /* * Notify the selector that a path has failed. @@ -75,7 +76,10 @@ struct path_selector_type { int (*status) (struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen); - int (*end_io) (struct path_selector *ps, struct dm_path *path); + int (*start_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); + int (*end_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); }; /* Register a path selector */ diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 000000000000..f92b6cea9d9c --- /dev/null +++ b/drivers/md/dm-queue-length.c @@ -0,0 +1,263 @@ +/* + * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. + * Copyright (C) 2006-2009 NEC Corporation. + * + * dm-queue-length.c + * + * Module Author: Stefan Bader, IBM + * Modified by: Kiyoshi Ueda, NEC + * + * This file is released under the GPL. + * + * queue-length path selector - choose a path with the least number of + * in-flight I/Os. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <asm/atomic.h> + +#define DM_MSG_PREFIX "multipath queue-length" +#define QL_MIN_IO 128 +#define QL_VERSION "0.1.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + atomic_t qlen; /* the number of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int ql_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void ql_free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void ql_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + ql_free_paths(&s->valid_paths); + ql_free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int ql_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + /* When called with NULL path, return selector status/args. */ + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&pi->qlen)); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +static int ql_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = QL_MIN_IO; + + /* + * Arguments: [<repeat_count>] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (QL_MIN_IO) is used. + */ + if (argc > 1) { + *error = "queue-length ps: incorrect number of arguments"; + return -EINVAL; + } + + if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + *error = "queue-length ps: invalid repeat count"; + return -EINVAL; + } + + /* Allocate the path information structure */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "queue-length ps: Error allocating path information"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + atomic_set(&pi->qlen, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void ql_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Select a path having the minimum number of in-flight I/Os + */ +static struct dm_path *ql_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || + (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) + best = pi; + + if (!atomic_read(&best->qlen)) + break; + } + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int ql_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_inc(&pi->qlen); + + return 0; +} + +static int ql_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_dec(&pi->qlen); + + return 0; +} + +static struct path_selector_type ql_ps = { + .name = "queue-length", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ql_create, + .destroy = ql_destroy, + .status = ql_status, + .add_path = ql_add_path, + .fail_path = ql_fail_path, + .reinstate_path = ql_reinstate_path, + .select_path = ql_select_path, + .start_io = ql_start_io, + .end_io = ql_end_io, +}; + +static int __init dm_ql_init(void) +{ + int r = dm_register_path_selector(&ql_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " QL_VERSION " loaded"); + + return r; +} + +static void __exit dm_ql_exit(void) +{ + int r = dm_unregister_path_selector(&ql_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_ql_init); +module_exit(dm_ql_exit); + +MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); +MODULE_DESCRIPTION( + "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" + DM_NAME " path selector to balance the number of in-flight I/Os" +); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7b899be0b087..36dbe29f2fd6 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); if (unlikely(!nreg)) - nreg = kmalloc(sizeof(*nreg), GFP_NOIO); + nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65b28cb..24752f449bef 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) } static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count) + unsigned *repeat_count, size_t nr_bytes) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = NULL; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2662a41337e7..6e3fe4f14934 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. */ - if (ps->valid && area_io(ps, WRITE)) + if (ps->valid && area_io(ps, WRITE_BARRIER)) ps->valid = 0; /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d73f17fc7778..d573165cd2b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = s; ti->split_io = s->store->chunk_size; + ti->num_flush_requests = 1; return 0; @@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; + if (unlikely(bio_empty_barrier(bio))) { + bio->bi_bdev = s->store->cow->bdev; + return DM_MAPIO_REMAPPED; + } + chunk = sector_to_chunk(s->store, bio->bi_sector); /* Full snapshots are not usable */ @@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = dev; + ti->num_flush_requests = 1; + return 0; } @@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio, struct dm_dev *dev = ti->private; bio->bi_bdev = dev->bdev; + if (unlikely(bio_empty_barrier(bio))) + return DM_MAPIO_REMAPPED; + /* Only tell snapshots if this is a write */ return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 41569bc60abc..c64fe827a5f1 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) sc->stripes = stripes; sc->stripe_width = width; ti->split_io = chunk_size; + ti->num_flush_requests = stripes; sc->chunk_mask = ((sector_t) chunk_size) - 1; for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) @@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { struct stripe_c *sc = (struct stripe_c *) ti->private; + sector_t offset, chunk; + uint32_t stripe; - sector_t offset = bio->bi_sector - ti->begin; - sector_t chunk = offset >> sc->chunk_shift; - uint32_t stripe = sector_div(chunk, sc->stripes); + if (unlikely(bio_empty_barrier(bio))) { + BUG_ON(map_context->flush_request >= sc->stripes); + bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; + return DM_MAPIO_REMAPPED; + } + + offset = bio->bi_sector - ti->begin; + chunk = offset >> sc->chunk_shift; + stripe = sector_div(chunk, sc->stripes); bio->bi_bdev = sc->stripe[stripe].dev->bdev; bio->bi_sector = sc->stripe[stripe].physical_start + diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index a2a45e6c7c8b..4b045903a4e2 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) return strlen(buf); } +static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) +{ + sprintf(buf, "%d\n", dm_suspended(md)); + + return strlen(buf); +} + static DM_ATTR_RO(name); static DM_ATTR_RO(uuid); +static DM_ATTR_RO(suspended); static struct attribute *dm_attrs[] = { &dm_attr_name.attr, &dm_attr_uuid.attr, + &dm_attr_suspended.attr, NULL, }; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e9a73bb242b0..e3bcfb8b15a1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -267,6 +267,8 @@ static void free_devices(struct list_head *devices) list_for_each_safe(tmp, next, devices) { struct dm_dev_internal *dd = list_entry(tmp, struct dm_dev_internal, list); + DMWARN("dm_table_destroy: dm_put_device call missing for %s", + dd->dm_dev.name); kfree(dd); } } @@ -296,12 +298,8 @@ void dm_table_destroy(struct dm_table *t) vfree(t->highs); /* free the device list */ - if (t->devices.next != &t->devices) { - DMWARN("devices still present during destroy: " - "dm_table_remove_device calls missing"); - + if (t->devices.next != &t->devices) free_devices(&t->devices); - } kfree(t); } @@ -385,15 +383,45 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) /* * If possible, this checks an area of a destination device is valid. */ -static int check_device_area(struct dm_dev_internal *dd, sector_t start, - sector_t len) +static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev, + sector_t start, sector_t len) { - sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; + sector_t dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + unsigned short logical_block_size_sectors = + ti->limits.logical_block_size >> SECTOR_SHIFT; + char b[BDEVNAME_SIZE]; if (!dev_size) return 1; - return ((start < dev_size) && (len <= (dev_size - start))); + if ((start >= dev_size) || (start + len > dev_size)) { + DMWARN("%s: %s too small for target", + dm_device_name(ti->table->md), bdevname(bdev, b)); + return 0; + } + + if (logical_block_size_sectors <= 1) + return 1; + + if (start & (logical_block_size_sectors - 1)) { + DMWARN("%s: start=%llu not aligned to h/w " + "logical block size %hu of %s", + dm_device_name(ti->table->md), + (unsigned long long)start, + ti->limits.logical_block_size, bdevname(bdev, b)); + return 0; + } + + if (len & (logical_block_size_sectors - 1)) { + DMWARN("%s: len=%llu not aligned to h/w " + "logical block size %hu of %s", + dm_device_name(ti->table->md), + (unsigned long long)len, + ti->limits.logical_block_size, bdevname(bdev, b)); + return 0; + } + + return 1; } /* @@ -479,14 +507,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, } atomic_inc(&dd->count); - if (!check_device_area(dd, start, len)) { - DMWARN("device %s too small for target", path); - dm_put_device(ti, &dd->dm_dev); - return -EINVAL; - } - *result = &dd->dm_dev; - return 0; } @@ -555,8 +576,16 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start, int r = __table_get_device(ti->table, ti, path, start, len, mode, result); - if (!r) - dm_set_device_limits(ti, (*result)->bdev); + if (r) + return r; + + dm_set_device_limits(ti, (*result)->bdev); + + if (!device_area_is_valid(ti, (*result)->bdev, start, len)) { + dm_put_device(ti, *result); + *result = NULL; + return -EINVAL; + } return r; } @@ -695,6 +724,71 @@ static void check_for_valid_limits(struct io_restrictions *rs) rs->bounce_pfn = -1; } +/* + * Impose necessary and sufficient conditions on a devices's table such + * that any incoming bio which respects its logical_block_size can be + * processed successfully. If it falls across the boundary between + * two or more targets, the size of each piece it gets split into must + * be compatible with the logical_block_size of the target processing it. + */ +static int validate_hardware_logical_block_alignment(struct dm_table *table) +{ + /* + * This function uses arithmetic modulo the logical_block_size + * (in units of 512-byte sectors). + */ + unsigned short device_logical_block_size_sects = + table->limits.logical_block_size >> SECTOR_SHIFT; + + /* + * Offset of the start of the next table entry, mod logical_block_size. + */ + unsigned short next_target_start = 0; + + /* + * Given an aligned bio that extends beyond the end of a + * target, how many sectors must the next target handle? + */ + unsigned short remaining = 0; + + struct dm_target *uninitialized_var(ti); + unsigned i = 0; + + /* + * Check each entry in the table in turn. + */ + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + /* + * If the remaining sectors fall entirely within this + * table entry are they compatible with its logical_block_size? + */ + if (remaining < ti->len && + remaining & ((ti->limits.logical_block_size >> + SECTOR_SHIFT) - 1)) + break; /* Error */ + + next_target_start = + (unsigned short) ((next_target_start + ti->len) & + (device_logical_block_size_sects - 1)); + remaining = next_target_start ? + device_logical_block_size_sects - next_target_start : 0; + } + + if (remaining) { + DMWARN("%s: table line %u (start sect %llu len %llu) " + "not aligned to hardware logical block size %hu", + dm_device_name(table->md), i, + (unsigned long long) ti->begin, + (unsigned long long) ti->len, + table->limits.logical_block_size); + return -EINVAL; + } + + return 0; +} + int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params) { @@ -794,6 +888,10 @@ int dm_table_complete(struct dm_table *t) check_for_valid_limits(&t->limits); + r = validate_hardware_logical_block_alignment(t); + if (r) + return r; + /* how many indexes will the btree have ? */ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 424f7b048c30..0f17c35592da 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -24,6 +24,13 @@ #define DM_MSG_PREFIX "core" +/* + * Cookies are numeric values sent with CHANGE and REMOVE + * uevents while resuming, removing or renaming the device. + */ +#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" +#define DM_COOKIE_LENGTH 24 + static const char *_name = DM_NAME; static unsigned int major = 0; @@ -159,13 +166,16 @@ struct mapped_device { * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *suspended_bdev; + struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; /* sysfs handle */ struct kobject kobj; + + /* zero-length barrier that will be cloned and submitted to targets */ + struct bio barrier_bio; }; #define MIN_IOS 256 @@ -393,11 +403,6 @@ static void free_io(struct mapped_device *md, struct dm_io *io) mempool_free(io, md->io_pool); } -static struct dm_target_io *alloc_tio(struct mapped_device *md) -{ - return mempool_alloc(md->tio_pool, GFP_NOIO); -} - static void free_tio(struct mapped_device *md, struct dm_target_io *tio) { mempool_free(tio, md->tio_pool); @@ -538,9 +543,11 @@ static void dec_pending(struct dm_io *io, int error) * Target requested pushing back the I/O. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) - bio_list_add_head(&md->deferred, io->bio); - else + if (__noflush_suspending(md)) { + if (!bio_barrier(io->bio)) + bio_list_add_head(&md->deferred, + io->bio); + } else /* noflush suspend was interrupted. */ io->error = -EIO; spin_unlock_irqrestore(&md->deferred_lock, flags); @@ -555,7 +562,8 @@ static void dec_pending(struct dm_io *io, int error) * a per-device variable for error reporting. * Note that you can't touch the bio after end_io_acct */ - md->barrier_error = io_error; + if (!md->barrier_error && io_error != -EOPNOTSUPP) + md->barrier_error = io_error; end_io_acct(io); } else { end_io_acct(io); @@ -636,11 +644,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, sector_t sector; struct mapped_device *md; - /* - * Sanity checks. - */ - BUG_ON(!clone->bi_size); - clone->bi_end_io = clone_endio; clone->bi_private = tio; @@ -755,6 +758,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, return clone; } +static struct dm_target_io *alloc_tio(struct clone_info *ci, + struct dm_target *ti) +{ + struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); + + tio->io = ci->io; + tio->ti = ti; + memset(&tio->info, 0, sizeof(tio->info)); + + return tio; +} + +static void __flush_target(struct clone_info *ci, struct dm_target *ti, + unsigned flush_nr) +{ + struct dm_target_io *tio = alloc_tio(ci, ti); + struct bio *clone; + + tio->info.flush_request = flush_nr; + + clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); + __bio_clone(clone, ci->bio); + clone->bi_destructor = dm_bio_destructor; + + __map_bio(ti, clone, tio); +} + +static int __clone_and_map_empty_barrier(struct clone_info *ci) +{ + unsigned target_nr = 0, flush_nr; + struct dm_target *ti; + + while ((ti = dm_table_get_target(ci->map, target_nr++))) + for (flush_nr = 0; flush_nr < ti->num_flush_requests; + flush_nr++) + __flush_target(ci, ti, flush_nr); + + ci->sector_count = 0; + + return 0; +} + static int __clone_and_map(struct clone_info *ci) { struct bio *clone, *bio = ci->bio; @@ -762,6 +807,9 @@ static int __clone_and_map(struct clone_info *ci) sector_t len = 0, max; struct dm_target_io *tio; + if (unlikely(bio_empty_barrier(bio))) + return __clone_and_map_empty_barrier(ci); + ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; @@ -771,10 +819,7 @@ static int __clone_and_map(struct clone_info *ci) /* * Allocate a target io object. */ - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); + tio = alloc_tio(ci, ti); if (ci->sector_count <= max) { /* @@ -830,10 +875,7 @@ static int __clone_and_map(struct clone_info *ci) max = max_io_len(ci->md, ci->sector, ti); - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); + tio = alloc_tio(ci, ti); } len = min(remaining, max); @@ -868,7 +910,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) if (!bio_barrier(bio)) bio_io_error(bio); else - md->barrier_error = -EIO; + if (!md->barrier_error) + md->barrier_error = -EIO; return; } @@ -881,6 +924,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.io->md = md; ci.sector = bio->bi_sector; ci.sector_count = bio_sectors(bio); + if (unlikely(bio_empty_barrier(bio))) + ci.sector_count = 1; ci.idx = bio->bi_idx; start_io_acct(ci.io); @@ -928,6 +973,16 @@ static int dm_merge_bvec(struct request_queue *q, */ if (max_size && ti->type->merge) max_size = ti->type->merge(ti, bvm, biovec, max_size); + /* + * If the target doesn't support merge method and some of the devices + * provided their merge_bvec method (we know this by looking at + * queue_max_hw_sectors), then we can't allow bios with multiple vector + * entries. So always set max_size to 0, and the code below allows + * just one page. + */ + else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) + + max_size = 0; out_table: dm_table_put(map); @@ -1173,6 +1228,10 @@ static struct mapped_device *alloc_dev(int minor) if (!md->wq) goto bad_thread; + md->bdev = bdget_disk(md->disk, 0); + if (!md->bdev) + goto bad_bdev; + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -1182,6 +1241,8 @@ static struct mapped_device *alloc_dev(int minor) return md; +bad_bdev: + destroy_workqueue(md->wq); bad_thread: put_disk(md->disk); bad_disk: @@ -1207,10 +1268,8 @@ static void free_dev(struct mapped_device *md) { int minor = MINOR(disk_devt(md->disk)); - if (md->suspended_bdev) { - unlock_fs(md); - bdput(md->suspended_bdev); - } + unlock_fs(md); + bdput(md->bdev); destroy_workqueue(md->wq); mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); @@ -1252,9 +1311,9 @@ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); - i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); + mutex_lock(&md->bdev->bd_inode->i_mutex); + i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); + mutex_unlock(&md->bdev->bd_inode->i_mutex); } static int __bind(struct mapped_device *md, struct dm_table *t) @@ -1270,8 +1329,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) if (size != get_capacity(md->disk)) memset(&md->geometry, 0, sizeof(md->geometry)); - if (md->suspended_bdev) - __set_size(md, size); + __set_size(md, size); if (!size) { dm_table_destroy(t); @@ -1429,34 +1487,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) return r; } -static int dm_flush(struct mapped_device *md) +static void dm_flush(struct mapped_device *md) { dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - return 0; + + bio_init(&md->barrier_bio); + md->barrier_bio.bi_bdev = md->bdev; + md->barrier_bio.bi_rw = WRITE_BARRIER; + __split_and_process_bio(md, &md->barrier_bio); + + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } static void process_barrier(struct mapped_device *md, struct bio *bio) { - int error = dm_flush(md); + md->barrier_error = 0; - if (unlikely(error)) { - bio_endio(bio, error); - return; - } - if (bio_empty_barrier(bio)) { - bio_endio(bio, 0); - return; - } - - __split_and_process_bio(md, bio); + dm_flush(md); - error = dm_flush(md); - - if (!error && md->barrier_error) - error = md->barrier_error; + if (!bio_empty_barrier(bio)) { + __split_and_process_bio(md, bio); + dm_flush(md); + } if (md->barrier_error != DM_ENDIO_REQUEUE) - bio_endio(bio, error); + bio_endio(bio, md->barrier_error); + else { + spin_lock_irq(&md->deferred_lock); + bio_list_add_head(&md->deferred, bio); + spin_unlock_irq(&md->deferred_lock); + } } /* @@ -1513,11 +1573,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) if (!dm_suspended(md)) goto out; - /* without bdev, the device size cannot be changed */ - if (!md->suspended_bdev) - if (get_capacity(md->disk) != dm_table_get_size(table)) - goto out; - __unbind(md); r = __bind(md, table); @@ -1536,7 +1591,7 @@ static int lock_fs(struct mapped_device *md) WARN_ON(md->frozen_sb); - md->frozen_sb = freeze_bdev(md->suspended_bdev); + md->frozen_sb = freeze_bdev(md->bdev); if (IS_ERR(md->frozen_sb)) { r = PTR_ERR(md->frozen_sb); md->frozen_sb = NULL; @@ -1545,9 +1600,6 @@ static int lock_fs(struct mapped_device *md) set_bit(DMF_FROZEN, &md->flags); - /* don't bdput right now, we don't want the bdev - * to go away while it is locked. - */ return 0; } @@ -1556,7 +1608,7 @@ static void unlock_fs(struct mapped_device *md) if (!test_bit(DMF_FROZEN, &md->flags)) return; - thaw_bdev(md->suspended_bdev, md->frozen_sb); + thaw_bdev(md->bdev, md->frozen_sb); md->frozen_sb = NULL; clear_bit(DMF_FROZEN, &md->flags); } @@ -1594,24 +1646,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) /* This does not get reverted if there's an error later. */ dm_table_presuspend_targets(map); - /* bdget() can stall if the pending I/Os are not flushed */ - if (!noflush) { - md->suspended_bdev = bdget_disk(md->disk, 0); - if (!md->suspended_bdev) { - DMWARN("bdget failed in dm_suspend"); - r = -ENOMEM; + /* + * Flush I/O to the device. noflush supersedes do_lockfs, + * because lock_fs() needs to flush I/Os. + */ + if (!noflush && do_lockfs) { + r = lock_fs(md); + if (r) goto out; - } - - /* - * Flush I/O to the device. noflush supersedes do_lockfs, - * because lock_fs() needs to flush I/Os. - */ - if (do_lockfs) { - r = lock_fs(md); - if (r) - goto out; - } } /* @@ -1668,11 +1710,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) set_bit(DMF_SUSPENDED, &md->flags); out: - if (r && md->suspended_bdev) { - bdput(md->suspended_bdev); - md->suspended_bdev = NULL; - } - dm_table_put(map); out_unlock: @@ -1701,19 +1738,10 @@ int dm_resume(struct mapped_device *md) unlock_fs(md); - if (md->suspended_bdev) { - bdput(md->suspended_bdev); - md->suspended_bdev = NULL; - } - clear_bit(DMF_SUSPENDED, &md->flags); dm_table_unplug_all(map); - - dm_kobject_uevent(md); - r = 0; - out: dm_table_put(map); mutex_unlock(&md->suspend_lock); @@ -1724,9 +1752,19 @@ out: /*----------------------------------------------------------------- * Event notification. *---------------------------------------------------------------*/ -void dm_kobject_uevent(struct mapped_device *md) -{ - kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); +void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie) +{ + char udev_cookie[DM_COOKIE_LENGTH]; + char *envp[] = { udev_cookie, NULL }; + + if (!cookie) + kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + else { + snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", + DM_COOKIE_ENV_VAR_NAME, cookie); + kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); + } } uint32_t dm_next_uevent_seq(struct mapped_device *md) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d93e91..b5935c610c44 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -92,7 +92,8 @@ void dm_stripe_exit(void); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); -void dm_kobject_uevent(struct mapped_device *md); +void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie); int dm_kcopyd_init(void); void dm_kcopyd_exit(void); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 49c2362977fd..fc36a4d07723 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -21,6 +21,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; union map_info { void *ptr; unsigned long long ll; + unsigned flush_request; }; /* @@ -168,6 +169,16 @@ struct dm_target { sector_t split_io; /* + * A number of zero-length barrier requests that will be submitted + * to the target for the purpose of flushing cache. + * + * The request number will be placed in union map_info->flush_request. + * It is a responsibility of the target driver to remap these requests + * to the real underlying devices. + */ + unsigned num_flush_requests; + + /* * These are automatically filled in by * dm_table_get_device. */ diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 48e44ee2b466..2ab84c83c31a 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -123,6 +123,16 @@ struct dm_ioctl { __u32 target_count; /* in/out */ __s32 open_count; /* out */ __u32 flags; /* in/out */ + + /* + * event_nr holds either the event number (input and output) or the + * udev cookie value (input only). + * The DM_DEV_WAIT ioctl takes an event number as input. + * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls + * use the field as a cookie to return in the DM_COOKIE + * variable with the uevents they issue. + * For output, the ioctls return the event number, not the cookie. + */ __u32 event_nr; /* in/out */ __u32 padding; @@ -256,9 +266,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 14 +#define DM_VERSION_MINOR 15 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2008-04-23)" +#define DM_VERSION_EXTRA "-ioctl (2009-04-01)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ |