diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-02-14 12:33:30 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-02-14 12:33:30 +1100 |
commit | 7080014ae36e7a6067afcf12f133595708d11db8 (patch) | |
tree | 925e631a4c87073282d58fadebc85a5aad37ae2c /drivers/md | |
parent | e083aec2cf2cb3fbea0b9f68cfbc7c548c36710f (diff) | |
parent | 70cc472f9c0dbd12f4ed4e3e869bf763a59a413f (diff) |
Merge branch 'device-mapper/master'
Diffstat (limited to 'drivers/md')
30 files changed, 1940 insertions, 394 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 91a02eeeb319..7cdf359d6b23 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -210,7 +210,7 @@ config DM_DEBUG config DM_BUFIO tristate - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM ---help--- This interface allows you to do buffered I/O on a device and acts as a cache, holding recently-read blocks in memory and performing @@ -218,7 +218,7 @@ config DM_BUFIO config DM_BIO_PRISON tristate - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM ---help--- Some bio locking schemes used by other device-mapper targets including thin provisioning. @@ -251,8 +251,8 @@ config DM_SNAPSHOT Allow volume managers to take writable snapshots of a device. config DM_THIN_PROVISIONING - tristate "Thin provisioning target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Thin provisioning target" + depends on BLK_DEV_DM select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- @@ -302,8 +302,8 @@ config DM_RAID in one of the available parity distribution methods. config DM_LOG_USERSPACE - tristate "Mirror userspace logging (EXPERIMENTAL)" - depends on DM_MIRROR && EXPERIMENTAL && NET + tristate "Mirror userspace logging" + depends on DM_MIRROR && NET select CONNECTOR ---help--- The userspace logging module provides a mechanism for @@ -350,8 +350,8 @@ config DM_MULTIPATH_ST If unsure, say N. config DM_DELAY - tristate "I/O delaying target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "I/O delaying target" + depends on BLK_DEV_DM ---help--- A target that delays reads and/or writes and can send them to different devices. Useful for testing. @@ -365,14 +365,14 @@ config DM_UEVENT Generate udev events for DM events. config DM_FLAKEY - tristate "Flakey target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Flakey target" + depends on BLK_DEV_DM ---help--- A target that intermittently fails I/O for debugging purposes. config DM_VERITY - tristate "Verity target support (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Verity target support" + depends on BLK_DEV_DM select CRYPTO select CRYPTO_HASH select DM_BUFIO diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index aefb78e3cbf9..4d670bbe963c 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -14,14 +14,6 @@ /*----------------------------------------------------------------*/ -struct dm_bio_prison_cell { - struct hlist_node list; - struct dm_bio_prison *prison; - struct dm_cell_key key; - struct bio *holder; - struct bio_list bios; -}; - struct dm_bio_prison { spinlock_t lock; mempool_t *cell_pool; @@ -87,6 +79,19 @@ void dm_bio_prison_destroy(struct dm_bio_prison *prison) } EXPORT_SYMBOL_GPL(dm_bio_prison_destroy); +struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp) +{ + return mempool_alloc(prison->cell_pool, gfp); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell); + +void dm_bio_prison_free_cell(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell) +{ + mempool_free(cell, prison->cell_pool); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); + static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) { const unsigned long BIG_PRIME = 4294967291UL; @@ -115,91 +120,86 @@ static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, return NULL; } -/* - * This may block if a new cell needs allocating. You must ensure that - * cells will be unlocked even if the calling thread is blocked. - * - * Returns 1 if the cell was already held, 0 if @inmate is the new holder. - */ -int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, - struct bio *inmate, struct dm_bio_prison_cell **ref) +static void __setup_new_cell(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *holder, + uint32_t hash, + struct dm_bio_prison_cell *cell) { - int r = 1; - unsigned long flags; - uint32_t hash = hash_key(prison, key); - struct dm_bio_prison_cell *cell, *cell2; - - BUG_ON(hash > prison->nr_buckets); - - spin_lock_irqsave(&prison->lock, flags); - - cell = __search_bucket(prison->cells + hash, key); - if (cell) { - bio_list_add(&cell->bios, inmate); - goto out; - } + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = holder; + bio_list_init(&cell->bios); + hlist_add_head(&cell->list, prison->cells + hash); +} - /* - * Allocate a new cell - */ - spin_unlock_irqrestore(&prison->lock, flags); - cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); - spin_lock_irqsave(&prison->lock, flags); +static int __bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + uint32_t hash = hash_key(prison, key); + struct dm_bio_prison_cell *cell; - /* - * We've been unlocked, so we have to double check that - * nobody else has inserted this cell in the meantime. - */ cell = __search_bucket(prison->cells + hash, key); if (cell) { - mempool_free(cell2, prison->cell_pool); - bio_list_add(&cell->bios, inmate); - goto out; + if (inmate) + bio_list_add(&cell->bios, inmate); + *cell_result = cell; + return 1; } - /* - * Use new cell. - */ - cell = cell2; - - cell->prison = prison; - memcpy(&cell->key, key, sizeof(cell->key)); - cell->holder = inmate; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, prison->cells + hash); + __setup_new_cell(prison, key, inmate, hash, cell_prealloc); + *cell_result = cell_prealloc; + return 0; +} - r = 0; +static int bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + int r; + unsigned long flags; -out: + spin_lock_irqsave(&prison->lock, flags); + r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); spin_unlock_irqrestore(&prison->lock, flags); - *ref = cell; - return r; } + +int dm_bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + return bio_detain(prison, key, inmate, cell_prealloc, cell_result); +} EXPORT_SYMBOL_GPL(dm_bio_detain); /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) +static void __cell_release(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) { - struct dm_bio_prison *prison = cell->prison; - hlist_del(&cell->list); if (inmates) { - bio_list_add(inmates, cell->holder); + if (cell->holder) + bio_list_add(inmates, cell->holder); bio_list_merge(inmates, &cell->bios); } - - mempool_free(cell, prison->cell_pool); } -void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) +void dm_cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) { unsigned long flags; - struct dm_bio_prison *prison = cell->prison; spin_lock_irqsave(&prison->lock, flags); __cell_release(cell, bios); @@ -210,20 +210,18 @@ EXPORT_SYMBOL_GPL(dm_cell_release); /* * Sometimes we don't want the holder, just the additional bios. */ -static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates) +static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) { - struct dm_bio_prison *prison = cell->prison; - hlist_del(&cell->list); bio_list_merge(inmates, &cell->bios); - - mempool_free(cell, prison->cell_pool); } -void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates) +void dm_cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates) { unsigned long flags; - struct dm_bio_prison *prison = cell->prison; spin_lock_irqsave(&prison->lock, flags); __cell_release_no_holder(cell, inmates); @@ -231,9 +229,9 @@ void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); -void dm_cell_error(struct dm_bio_prison_cell *cell) +void dm_cell_error(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell) { - struct dm_bio_prison *prison = cell->prison; struct bio_list bios; struct bio *bio; unsigned long flags; diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 53d1a7a84e2f..981a02d3a055 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -22,7 +22,6 @@ * subsequently unlocked the bios become available. */ struct dm_bio_prison; -struct dm_bio_prison_cell; /* FIXME: this needs to be more abstract */ struct dm_cell_key { @@ -31,21 +30,51 @@ struct dm_cell_key { dm_block_t block; }; +/* + * Treat this as opaque, only in header so callers can manage allocation + * themselves. + */ +struct dm_bio_prison_cell { + struct hlist_node list; + struct dm_cell_key key; + struct bio *holder; + struct bio_list bios; +}; + struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); void dm_bio_prison_destroy(struct dm_bio_prison *prison); /* - * This may block if a new cell needs allocating. You must ensure that - * cells will be unlocked even if the calling thread is blocked. + * These two functions just wrap a mempool. This is a transitory step: + * Eventually all bio prison clients should manage their own cell memory. * - * Returns 1 if the cell was already held, 0 if @inmate is the new holder. + * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called + * in interrupt context or passed GFP_NOWAIT. */ -int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, - struct bio *inmate, struct dm_bio_prison_cell **ref); +struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, + gfp_t gfp); +void dm_bio_prison_free_cell(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell); -void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios); -void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates); -void dm_cell_error(struct dm_bio_prison_cell *cell); +/* + * An atomic op that combines retrieving a cell, and adding a bio to it. + * + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. + */ +int dm_bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result); + +void dm_cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *bios); +void dm_cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates); +void dm_cell_error(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell); /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 651ca79881dd..f12fdbcc4f76 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1193,7 +1193,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); int dm_bufio_issue_flush(struct dm_bufio_client *c) { struct dm_io_request io_req = { - .bi_rw = REQ_FLUSH, + .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c->dm_io, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f7369f9d8595..2f8312a043de 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1651,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (opt_params == 1 && opt_string && !strcasecmp(opt_string, "allow_discards")) - ti->num_discard_requests = 1; + ti->num_discard_bios = 1; else if (opt_params) { ret = -EINVAL; ti->error = "Invalid feature arguments"; @@ -1679,7 +1679,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->num_flush_requests = 1; + ti->num_flush_bios = 1; ti->discard_zeroes_data_unsupported = true; return 0; @@ -1746,7 +1746,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, cc->dev->name, (unsigned long long)cc->start); - if (ti->num_discard_requests) + if (ti->num_discard_bios) DMEMIT(" 1 allow_discards"); break; diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index cc1bd048acb2..6ff7a7e390c7 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -198,8 +198,8 @@ out: mutex_init(&dc->timer_lock); atomic_set(&dc->may_delay, 1); - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; ti->private = dc; return 0; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 9721f2ffb1a2..12548de0ebc8 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -216,8 +216,8 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; ti->per_bio_data_size = sizeof(struct per_bio_data); ti->private = fc; return 0; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 0666b5d14b88..0b37697585dc 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1474,39 +1474,52 @@ static int target_message(struct dm_ioctl *param, size_t param_size) return r; } +/* + * The ioctl parameter block consists of two parts, a dm_ioctl struct + * followed by a data buffer. This flag is set if the second part, + * which has a variable size, is not used by the function processing + * the ioctl. + */ +#define IOCTL_FLAGS_NO_PARAMS 1 + /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char * device. *---------------------------------------------------------------*/ -static ioctl_fn lookup_ioctl(unsigned int cmd) +static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) { static struct { int cmd; + int flags; ioctl_fn fn; } _ioctls[] = { - {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ - {DM_REMOVE_ALL_CMD, remove_all}, - {DM_LIST_DEVICES_CMD, list_devices}, - - {DM_DEV_CREATE_CMD, dev_create}, - {DM_DEV_REMOVE_CMD, dev_remove}, - {DM_DEV_RENAME_CMD, dev_rename}, - {DM_DEV_SUSPEND_CMD, dev_suspend}, - {DM_DEV_STATUS_CMD, dev_status}, - {DM_DEV_WAIT_CMD, dev_wait}, - - {DM_TABLE_LOAD_CMD, table_load}, - {DM_TABLE_CLEAR_CMD, table_clear}, - {DM_TABLE_DEPS_CMD, table_deps}, - {DM_TABLE_STATUS_CMD, table_status}, - - {DM_LIST_VERSIONS_CMD, list_versions}, - - {DM_TARGET_MSG_CMD, target_message}, - {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry} + {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, + {DM_LIST_DEVICES_CMD, 0, list_devices}, + + {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, + {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, + {DM_DEV_RENAME_CMD, 0, dev_rename}, + {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, + {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, + {DM_DEV_WAIT_CMD, 0, dev_wait}, + + {DM_TABLE_LOAD_CMD, 0, table_load}, + {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear}, + {DM_TABLE_DEPS_CMD, 0, table_deps}, + {DM_TABLE_STATUS_CMD, 0, table_status}, + + {DM_LIST_VERSIONS_CMD, 0, list_versions}, + + {DM_TARGET_MSG_CMD, 0, target_message}, + {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry} }; - return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; + if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) + return NULL; + + *ioctl_flags = _ioctls[cmd].flags; + return _ioctls[cmd].fn; } /* @@ -1543,7 +1556,8 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) return r; } -#define DM_PARAMS_VMALLOC 0x0001 /* Params alloced with vmalloc not kmalloc */ +#define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */ +#define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */ #define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) @@ -1551,66 +1565,80 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla if (param_flags & DM_WIPE_BUFFER) memset(param, 0, param_size); + if (param_flags & DM_PARAMS_KMALLOC) + kfree(param); if (param_flags & DM_PARAMS_VMALLOC) vfree(param); - else - kfree(param); } -static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags) +static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, + int ioctl_flags, + struct dm_ioctl **param, int *param_flags) { - struct dm_ioctl tmp, *dmi; + struct dm_ioctl *dmi; int secure_data; + const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data); - if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) + if (copy_from_user(param_kernel, user, minimum_data_size)) return -EFAULT; - if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) + if (param_kernel->data_size < minimum_data_size) return -EINVAL; - secure_data = tmp.flags & DM_SECURE_DATA_FLAG; + secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG; *param_flags = secure_data ? DM_WIPE_BUFFER : 0; + if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) { + dmi = param_kernel; + dmi->data_size = minimum_data_size; + goto data_copied; + } + /* * Try to avoid low memory issues when a device is suspended. * Use kmalloc() rather than vmalloc() when we can. */ dmi = NULL; - if (tmp.data_size <= KMALLOC_MAX_SIZE) - dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (param_kernel->data_size <= KMALLOC_MAX_SIZE) { + dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (dmi) + *param_flags |= DM_PARAMS_KMALLOC; + } if (!dmi) { - dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); - *param_flags |= DM_PARAMS_VMALLOC; + dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); + if (dmi) + *param_flags |= DM_PARAMS_VMALLOC; } if (!dmi) { - if (secure_data && clear_user(user, tmp.data_size)) + if (secure_data && clear_user(user, param_kernel->data_size)) return -EFAULT; return -ENOMEM; } - if (copy_from_user(dmi, user, tmp.data_size)) + if (copy_from_user(dmi, user, param_kernel->data_size)) goto bad; +data_copied: /* * Abort if something changed the ioctl data while it was being copied. */ - if (dmi->data_size != tmp.data_size) { + if (dmi->data_size != param_kernel->data_size) { DMERR("rejecting ioctl: data size modified while processing parameters"); goto bad; } /* Wipe the user buffer so we do not return it to userspace */ - if (secure_data && clear_user(user, tmp.data_size)) + if (secure_data && clear_user(user, param_kernel->data_size)) goto bad; *param = dmi; return 0; bad: - free_params(dmi, tmp.data_size, *param_flags); + free_params(dmi, param_kernel->data_size, *param_flags); return -EFAULT; } @@ -1648,11 +1676,13 @@ static int validate_params(uint cmd, struct dm_ioctl *param) static int ctl_ioctl(uint command, struct dm_ioctl __user *user) { int r = 0; + int ioctl_flags; int param_flags; unsigned int cmd; struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; size_t input_param_size; + struct dm_ioctl param_kernel; /* only root can play with this */ if (!capable(CAP_SYS_ADMIN)) @@ -1677,7 +1707,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (cmd == DM_VERSION_CMD) return 0; - fn = lookup_ioctl(cmd); + fn = lookup_ioctl(cmd, &ioctl_flags); if (!fn) { DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); return -ENOTTY; @@ -1686,7 +1716,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) /* * Copy the parameters into kernel space. */ - r = copy_params(user, ¶m, ¶m_flags); + r = copy_params(user, ¶m_kernel, ioctl_flags, ¶m, ¶m_flags); if (r) return r; @@ -1699,6 +1729,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) param->data_size = sizeof(*param); r = fn(param, input_param_size); + if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && + unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) + DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); + /* * Copy the results back to userland. */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 328cad5617ab..dd94a9866ba7 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -53,9 +53,9 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; - ti->num_write_same_requests = 1; + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 573bd04591bf..9522f47d4f2b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -905,8 +905,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; return 0; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9e58dbd8d8cb..a3e115b62186 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1151,7 +1151,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; - ti->num_flush_requests = 1; + ti->num_flush_bios = 1; mutex_lock(&rs->md.reconfig_mutex); ret = md_run(&rs->md); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index fa519185ebba..d6cdeef8c805 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1072,8 +1072,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto err_free_context; - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); ti->discard_zeroes_data_unsupported = true; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 59fc18ae52c2..9fe53a360b4e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1038,7 +1038,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int i; int r = -EINVAL; char *origin_path, *cow_path; - unsigned args_used, num_flush_requests = 1; + unsigned args_used, num_flush_bios = 1; fmode_t origin_mode = FMODE_READ; if (argc != 4) { @@ -1048,7 +1048,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } if (dm_target_is_snapshot_merge(ti)) { - num_flush_requests = 2; + num_flush_bios = 2; origin_mode = FMODE_WRITE; } @@ -1128,7 +1128,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); ti->private = s; - ti->num_flush_requests = num_flush_requests; + ti->num_flush_bios = num_flush_bios; ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); /* Add snapshot to the list of snapshots for this origin */ @@ -1692,7 +1692,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); if (bio->bi_rw & REQ_FLUSH) { - if (!dm_bio_get_target_request_nr(bio)) + if (!dm_bio_get_target_bio_nr(bio)) bio->bi_bdev = s->origin->bdev; else bio->bi_bdev = s->cow->bdev; @@ -2105,7 +2105,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = dev; - ti->num_flush_requests = 1; + ti->num_flush_bios = 1; return 0; } @@ -2307,3 +2307,5 @@ module_exit(dm_snapshot_exit); MODULE_DESCRIPTION(DM_NAME " snapshot target"); MODULE_AUTHOR("Joe Thornber"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("dm-snapshot-origin"); +MODULE_ALIAS("dm-snapshot-merge"); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c89cde86d400..afc547d63b03 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -160,9 +160,9 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) return r; - ti->num_flush_requests = stripes; - ti->num_discard_requests = stripes; - ti->num_write_same_requests = stripes; + ti->num_flush_bios = stripes; + ti->num_discard_bios = stripes; + ti->num_write_same_bios = stripes; sc->chunk_size = chunk_size; if (chunk_size & (chunk_size - 1)) @@ -276,19 +276,19 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) { struct stripe_c *sc = ti->private; uint32_t stripe; - unsigned target_request_nr; + unsigned target_bio_nr; if (bio->bi_rw & REQ_FLUSH) { - target_request_nr = dm_bio_get_target_request_nr(bio); - BUG_ON(target_request_nr >= sc->stripes); - bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; + target_bio_nr = dm_bio_get_target_bio_nr(bio); + BUG_ON(target_bio_nr >= sc->stripes); + bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; return DM_MAPIO_REMAPPED; } if (unlikely(bio->bi_rw & REQ_DISCARD) || unlikely(bio->bi_rw & REQ_WRITE_SAME)) { - target_request_nr = dm_bio_get_target_request_nr(bio); - BUG_ON(target_request_nr >= sc->stripes); - return stripe_map_range(sc, bio, target_request_nr); + target_bio_nr = dm_bio_get_target_bio_nr(bio); + BUG_ON(target_bio_nr >= sc->stripes); + return stripe_map_range(sc, bio, target_bio_nr); } stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index daf25d0890b3..e50dad0c65f4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -217,7 +217,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, if (alloc_targets(t, num_targets)) { kfree(t); - t = NULL; return -ENOMEM; } @@ -823,8 +822,8 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - if (!tgt->num_discard_requests && tgt->discards_supported) - DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", + if (!tgt->num_discard_bios && tgt->discards_supported) + DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", dm_device_name(t->md), type); return 0; @@ -1360,7 +1359,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); - if (!ti->num_flush_requests) + if (!ti->num_flush_bios) continue; if (ti->flush_supported) @@ -1439,7 +1438,7 @@ static bool dm_table_supports_write_same(struct dm_table *t) while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); - if (!ti->num_write_same_requests) + if (!ti->num_write_same_bios) return false; if (!ti->type->iterate_devices || @@ -1657,7 +1656,7 @@ bool dm_table_supports_discards(struct dm_table *t) while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); - if (!ti->num_discard_requests) + if (!ti->num_discard_bios) continue; if (ti->discards_supported) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 617d21a77256..37ba5db71cd9 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -116,7 +116,7 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) /* * Return error for discards instead of -EOPNOTSUPP */ - tt->num_discard_requests = 1; + tt->num_discard_bios = 1; return 0; } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 4d6e85367b84..00cee02f8fc9 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -280,7 +280,7 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) *t = v & ((1 << 24) - 1); } -static void data_block_inc(void *context, void *value_le) +static void data_block_inc(void *context, const void *value_le) { struct dm_space_map *sm = context; __le64 v_le; @@ -292,7 +292,7 @@ static void data_block_inc(void *context, void *value_le) dm_sm_inc_block(sm, b); } -static void data_block_dec(void *context, void *value_le) +static void data_block_dec(void *context, const void *value_le) { struct dm_space_map *sm = context; __le64 v_le; @@ -304,7 +304,7 @@ static void data_block_dec(void *context, void *value_le) dm_sm_dec_block(sm, b); } -static int data_block_equal(void *context, void *value1_le, void *value2_le) +static int data_block_equal(void *context, const void *value1_le, const void *value2_le) { __le64 v1_le, v2_le; uint64_t b1, b2; @@ -318,7 +318,7 @@ static int data_block_equal(void *context, void *value1_le, void *value2_le) return b1 == b2; } -static void subtree_inc(void *context, void *value) +static void subtree_inc(void *context, const void *value) { struct dm_btree_info *info = context; __le64 root_le; @@ -329,7 +329,7 @@ static void subtree_inc(void *context, void *value) dm_tm_inc(info->tm, root); } -static void subtree_dec(void *context, void *value) +static void subtree_dec(void *context, const void *value) { struct dm_btree_info *info = context; __le64 root_le; @@ -341,7 +341,7 @@ static void subtree_dec(void *context, void *value) DMERR("btree delete failed\n"); } -static int subtree_equal(void *context, void *value1_le, void *value2_le) +static int subtree_equal(void *context, const void *value1_le, const void *value2_le) { __le64 v1_le, v2_le; memcpy(&v1_le, value1_le, sizeof(v1_le)); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 5409607d4875..c15d13cb0ad7 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -227,6 +227,78 @@ struct thin_c { /*----------------------------------------------------------------*/ /* + * wake_worker() is used when new work is queued and when pool_resume is + * ready to continue deferred IO processing. + */ +static void wake_worker(struct pool *pool) +{ + queue_work(pool->wq, &pool->worker); +} + +/*----------------------------------------------------------------*/ + +static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio, + struct dm_bio_prison_cell **cell_result) +{ + int r; + struct dm_bio_prison_cell *cell_prealloc; + + /* + * Allocate a cell from the prison's mempool. + * This might block but it can't fail. + */ + cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO); + + r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result); + if (r) + /* + * We reused an old cell; we can get rid of + * the new one. + */ + dm_bio_prison_free_cell(pool->prison, cell_prealloc); + + return r; +} + +static void cell_release(struct pool *pool, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + dm_cell_release(pool->prison, cell, bios); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_release_no_holder(struct pool *pool, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + dm_cell_release_no_holder(pool->prison, cell, bios); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_defer_no_holder_no_free(struct thin_c *tc, + struct dm_bio_prison_cell *cell) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios); + spin_unlock_irqrestore(&pool->lock, flags); + + wake_worker(pool); +} + +static void cell_error(struct pool *pool, + struct dm_bio_prison_cell *cell) +{ + dm_cell_error(pool->prison, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + +/*----------------------------------------------------------------*/ + +/* * A global list of pools that uses a struct mapped_device as a key. */ static struct dm_thin_pool_table { @@ -330,14 +402,20 @@ static void requeue_io(struct thin_c *tc) * target. */ +static bool block_size_is_power_of_two(struct pool *pool) +{ + return pool->sectors_per_block_shift >= 0; +} + static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) { + struct pool *pool = tc->pool; sector_t block_nr = bio->bi_sector; - if (tc->pool->sectors_per_block_shift < 0) - (void) sector_div(block_nr, tc->pool->sectors_per_block); + if (block_size_is_power_of_two(pool)) + block_nr >>= pool->sectors_per_block_shift; else - block_nr >>= tc->pool->sectors_per_block_shift; + (void) sector_div(block_nr, pool->sectors_per_block); return block_nr; } @@ -348,12 +426,12 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) sector_t bi_sector = bio->bi_sector; bio->bi_bdev = tc->pool_dev->bdev; - if (tc->pool->sectors_per_block_shift < 0) - bio->bi_sector = (block * pool->sectors_per_block) + - sector_div(bi_sector, pool->sectors_per_block); - else + if (block_size_is_power_of_two(pool)) bio->bi_sector = (block << pool->sectors_per_block_shift) | (bi_sector & (pool->sectors_per_block - 1)); + else + bio->bi_sector = (block * pool->sectors_per_block) + + sector_div(bi_sector, pool->sectors_per_block); } static void remap_to_origin(struct thin_c *tc, struct bio *bio) @@ -420,15 +498,6 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, issue(tc, bio); } -/* - * wake_worker() is used when new work is queued and when pool_resume is - * ready to continue deferred IO processing. - */ -static void wake_worker(struct pool *pool) -{ - queue_work(pool->wq, &pool->worker); -} - /*----------------------------------------------------------------*/ /* @@ -515,14 +584,14 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) unsigned long flags; spin_lock_irqsave(&pool->lock, flags); - dm_cell_release(cell, &pool->deferred_bios); + cell_release(pool, cell, &pool->deferred_bios); spin_unlock_irqrestore(&tc->pool->lock, flags); wake_worker(pool); } /* - * Same as cell_defer except it omits the original holder of the cell. + * Same as cell_defer above, except it omits the original holder of the cell. */ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) { @@ -530,7 +599,7 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c unsigned long flags; spin_lock_irqsave(&pool->lock, flags); - dm_cell_release_no_holder(cell, &pool->deferred_bios); + cell_release_no_holder(pool, cell, &pool->deferred_bios); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); @@ -540,13 +609,15 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { if (m->bio) m->bio->bi_end_io = m->saved_bi_end_io; - dm_cell_error(m->cell); + cell_error(m->tc->pool, m->cell); list_del(&m->list); mempool_free(m, m->tc->pool->mapping_pool); } + static void process_prepared_mapping(struct dm_thin_new_mapping *m) { struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; struct bio *bio; int r; @@ -555,7 +626,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) bio->bi_end_io = m->saved_bi_end_io; if (m->err) { - dm_cell_error(m->cell); + cell_error(pool, m->cell); goto out; } @@ -567,7 +638,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { DMERR_LIMIT("dm_thin_insert_block() failed"); - dm_cell_error(m->cell); + cell_error(pool, m->cell); goto out; } @@ -585,7 +656,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) out: list_del(&m->list); - mempool_free(m, tc->pool->mapping_pool); + mempool_free(m, pool->mapping_pool); } static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) @@ -736,7 +807,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, if (r < 0) { mempool_free(m, pool->mapping_pool); DMERR_LIMIT("dm_kcopyd_copy() failed"); - dm_cell_error(cell); + cell_error(pool, cell); } } } @@ -802,7 +873,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, if (r < 0) { mempool_free(m, pool->mapping_pool); DMERR_LIMIT("dm_kcopyd_zero() failed"); - dm_cell_error(cell); + cell_error(pool, cell); } } } @@ -908,13 +979,13 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&pool->lock, flags); } -static void no_space(struct dm_bio_prison_cell *cell) +static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) { struct bio *bio; struct bio_list bios; bio_list_init(&bios); - dm_cell_release(cell, &bios); + cell_release(pool, cell, &bios); while ((bio = bio_list_pop(&bios))) retry_on_resume(bio); @@ -932,7 +1003,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) struct dm_thin_new_mapping *m; build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) + if (bio_detain(tc->pool, &key, bio, &cell)) return; r = dm_thin_find_block(tc->td, block, 1, &lookup_result); @@ -944,7 +1015,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) * on this block. */ build_data_key(tc->td, lookup_result.block, &key2); - if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) { + if (bio_detain(tc->pool, &key2, bio, &cell2)) { cell_defer_no_holder(tc, cell); break; } @@ -1020,13 +1091,13 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, break; case -ENOSPC: - no_space(cell); + no_space(tc->pool, cell); break; default: DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", __func__, r); - dm_cell_error(cell); + cell_error(tc->pool, cell); break; } } @@ -1044,7 +1115,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, * of being broken so we have nothing further to do here. */ build_data_key(tc->td, lookup_result->block, &key); - if (dm_bio_detain(pool->prison, &key, bio, &cell)) + if (bio_detain(pool, &key, bio, &cell)) return; if (bio_data_dir(bio) == WRITE && bio->bi_size) @@ -1065,12 +1136,13 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block { int r; dm_block_t data_block; + struct pool *pool = tc->pool; /* * Remap empty bios (flushes) immediately, without provisioning. */ if (!bio->bi_size) { - inc_all_io_entry(tc->pool, bio); + inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); remap_and_issue(tc, bio, 0); @@ -1097,14 +1169,14 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block break; case -ENOSPC: - no_space(cell); + no_space(pool, cell); break; default: DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", __func__, r); - set_pool_mode(tc->pool, PM_READ_ONLY); - dm_cell_error(cell); + set_pool_mode(pool, PM_READ_ONLY); + cell_error(pool, cell); break; } } @@ -1112,6 +1184,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block static void process_bio(struct thin_c *tc, struct bio *bio) { int r; + struct pool *pool = tc->pool; dm_block_t block = get_bio_block(tc, bio); struct dm_bio_prison_cell *cell; struct dm_cell_key key; @@ -1122,7 +1195,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) * being provisioned so we have nothing further to do here. */ build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) + if (bio_detain(pool, &key, bio, &cell)) return; r = dm_thin_find_block(tc->td, block, 1, &lookup_result); @@ -1130,9 +1203,9 @@ static void process_bio(struct thin_c *tc, struct bio *bio) case 0: if (lookup_result.shared) { process_shared_bio(tc, bio, block, &lookup_result); - cell_defer_no_holder(tc, cell); + cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ } else { - inc_all_io_entry(tc->pool, bio); + inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); remap_and_issue(tc, bio, lookup_result.block); @@ -1141,7 +1214,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) case -ENODATA: if (bio_data_dir(bio) == READ && tc->origin_dev) { - inc_all_io_entry(tc->pool, bio); + inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); remap_to_origin_and_issue(tc, bio); @@ -1378,7 +1451,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) dm_block_t block = get_bio_block(tc, bio); struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - struct dm_bio_prison_cell *cell1, *cell2; + struct dm_bio_prison_cell cell1, cell2; + struct dm_bio_prison_cell *cell_result; struct dm_cell_key key; thin_hook_bio(tc, bio); @@ -1420,18 +1494,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) } build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1)) + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) return DM_MAPIO_SUBMITTED; build_data_key(tc->td, result.block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) { - cell_defer_no_holder(tc, cell1); + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; } inc_all_io_entry(tc->pool, bio); - cell_defer_no_holder(tc, cell2); - cell_defer_no_holder(tc, cell1); + cell_defer_no_holder_no_free(tc, &cell2); + cell_defer_no_holder_no_free(tc, &cell1); remap(tc, bio, result.block); return DM_MAPIO_REMAPPED; @@ -1938,7 +2012,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) pt->data_dev = data_dev; pt->low_water_blocks = low_water_blocks; pt->adjusted_pf = pt->requested_pf = pf; - ti->num_flush_requests = 1; + ti->num_flush_bios = 1; /* * Only need to enable discards if the pool should pass @@ -1946,7 +2020,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) * processing will cause mappings to be removed from the btree. */ if (pf.discard_enabled && pf.discard_passdown) { - ti->num_discard_requests = 1; + ti->num_discard_bios = 1; /* * Setting 'discards_supported' circumvents the normal @@ -2414,11 +2488,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } -static bool block_size_is_power_of_two(struct pool *pool) -{ - return pool->sectors_per_block_shift >= 0; -} - static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) { struct pool *pool = pt->pool; @@ -2432,15 +2501,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) if (pt->adjusted_pf.discard_passdown) { data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; limits->discard_granularity = data_limits->discard_granularity; - } else if (block_size_is_power_of_two(pool)) + } else limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; - else - /* - * Use largest power of 2 that is a factor of sectors_per_block - * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS. - */ - limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1), - DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT; } static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -2588,17 +2650,17 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto bad_thin_open; - ti->num_flush_requests = 1; + ti->num_flush_bios = 1; ti->flush_supported = true; ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. */ if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; - ti->num_discard_requests = 1; + ti->num_discard_bios = 1; ti->discard_zeroes_data_unsupported = true; - /* Discard requests must be split on a block boundary */ - ti->split_discard_requests = true; + /* Discard bios must be split on a block boundary */ + ti->split_discard_bios = true; } dm_put(pool_md); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 69a5c3b3b340..c99003e0d47a 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -25,7 +25,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* * Silently drop discards, avoiding -EOPNOTSUPP. */ - ti->num_discard_requests = 1; + ti->num_discard_bios = 1; return 0; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3dd2df2f9f91..5b213550af0b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -986,12 +986,13 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); -static void __map_bio(struct dm_target *ti, struct dm_target_io *tio) +static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; struct mapped_device *md; struct bio *clone = &tio->clone; + struct dm_target *ti = tio->ti; clone->bi_end_io = clone_endio; clone->bi_private = tio; @@ -1032,32 +1033,54 @@ struct clone_info { unsigned short idx; }; +static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) +{ + bio->bi_sector = sector; + bio->bi_size = to_bytes(len); +} + +static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) +{ + bio->bi_idx = idx; + bio->bi_vcnt = idx + bv_count; + bio->bi_flags &= ~(1 << BIO_SEG_VALID); +} + +static void clone_bio_integrity(struct bio *bio, struct bio *clone, + unsigned short idx, unsigned len, unsigned offset, + unsigned trim) +{ + if (!bio_integrity(bio)) + return; + + bio_integrity_clone(clone, bio, GFP_NOIO); + + if (trim) + bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); +} + /* * Creates a little bio that just does part of a bvec. */ -static void split_bvec(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, unsigned int offset, - unsigned int len, struct bio_set *bs) +static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned short idx, + unsigned offset, unsigned len) { struct bio *clone = &tio->clone; struct bio_vec *bv = bio->bi_io_vec + idx; *clone->bi_io_vec = *bv; - clone->bi_sector = sector; + bio_setup_sector(clone, sector, len); + clone->bi_bdev = bio->bi_bdev; clone->bi_rw = bio->bi_rw; clone->bi_vcnt = 1; - clone->bi_size = to_bytes(len); clone->bi_io_vec->bv_offset = offset; clone->bi_io_vec->bv_len = clone->bi_size; clone->bi_flags |= 1 << BIO_CLONED; - if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO); - bio_integrity_trim(clone, - bio_sector_offset(bio, idx, offset), len); - } + clone_bio_integrity(bio, clone, idx, len, offset, 1); } /* @@ -1065,29 +1088,23 @@ static void split_bvec(struct dm_target_io *tio, struct bio *bio, */ static void clone_bio(struct dm_target_io *tio, struct bio *bio, sector_t sector, unsigned short idx, - unsigned short bv_count, unsigned int len, - struct bio_set *bs) + unsigned short bv_count, unsigned len) { struct bio *clone = &tio->clone; + unsigned trim = 0; __bio_clone(clone, bio); - clone->bi_sector = sector; - clone->bi_idx = idx; - clone->bi_vcnt = idx + bv_count; - clone->bi_size = to_bytes(len); - clone->bi_flags &= ~(1 << BIO_SEG_VALID); - - if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO); - - if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) - bio_integrity_trim(clone, - bio_sector_offset(bio, idx, 0), len); - } + bio_setup_sector(clone, sector, len); + bio_setup_bv(clone, idx, bv_count); + + if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) + trim = 1; + clone_bio_integrity(bio, clone, idx, len, 0, trim); } static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti, int nr_iovecs) + struct dm_target *ti, int nr_iovecs, + unsigned target_bio_nr) { struct dm_target_io *tio; struct bio *clone; @@ -1098,96 +1115,104 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, tio->io = ci->io; tio->ti = ti; memset(&tio->info, 0, sizeof(tio->info)); - tio->target_request_nr = 0; + tio->target_bio_nr = target_bio_nr; return tio; } -static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, - unsigned request_nr, sector_t len) +static void __clone_and_map_simple_bio(struct clone_info *ci, + struct dm_target *ti, + unsigned target_bio_nr, sector_t len) { - struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); + struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); struct bio *clone = &tio->clone; - tio->target_request_nr = request_nr; - /* * Discard requests require the bio's inline iovecs be initialized. * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. */ - __bio_clone(clone, ci->bio); - if (len) { - clone->bi_sector = ci->sector; - clone->bi_size = to_bytes(len); - } + if (len) + bio_setup_sector(clone, ci->sector, len); - __map_bio(ti, tio); + __map_bio(tio); } -static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, - unsigned num_requests, sector_t len) +static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, + unsigned num_bios, sector_t len) { - unsigned request_nr; + unsigned target_bio_nr; - for (request_nr = 0; request_nr < num_requests; request_nr++) - __issue_target_request(ci, ti, request_nr, len); + for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) + __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); } -static int __clone_and_map_empty_flush(struct clone_info *ci) +static int __send_empty_flush(struct clone_info *ci) { unsigned target_nr = 0; struct dm_target *ti; BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) - __issue_target_requests(ci, ti, ti->num_flush_requests, 0); + __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0); return 0; } -/* - * Perform all io with a single clone. - */ -static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) +static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, + sector_t sector, int nr_iovecs, + unsigned short idx, unsigned short bv_count, + unsigned offset, unsigned len, + unsigned split_bvec) { struct bio *bio = ci->bio; struct dm_target_io *tio; + unsigned target_bio_nr; + unsigned num_target_bios = 1; - tio = alloc_tio(ci, ti, bio->bi_max_vecs); - clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, - ci->sector_count, ci->md->bs); - __map_bio(ti, tio); - ci->sector_count = 0; + /* + * Does the target want to receive duplicate copies of the bio? + */ + if (bio_data_dir(bio) == WRITE && ti->num_write_bios) + num_target_bios = ti->num_write_bios(ti, bio); + + for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { + tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); + if (split_bvec) + clone_split_bio(tio, bio, sector, idx, offset, len); + else + clone_bio(tio, bio, sector, idx, bv_count, len); + __map_bio(tio); + } } -typedef unsigned (*get_num_requests_fn)(struct dm_target *ti); +typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); -static unsigned get_num_discard_requests(struct dm_target *ti) +static unsigned get_num_discard_bios(struct dm_target *ti) { - return ti->num_discard_requests; + return ti->num_discard_bios; } -static unsigned get_num_write_same_requests(struct dm_target *ti) +static unsigned get_num_write_same_bios(struct dm_target *ti) { - return ti->num_write_same_requests; + return ti->num_write_same_bios; } typedef bool (*is_split_required_fn)(struct dm_target *ti); static bool is_split_required_for_discard(struct dm_target *ti) { - return ti->split_discard_requests; + return ti->split_discard_bios; } -static int __clone_and_map_changing_extent_only(struct clone_info *ci, - get_num_requests_fn get_num_requests, - is_split_required_fn is_split_required) +static int __send_changing_extent_only(struct clone_info *ci, + get_num_bios_fn get_num_bios, + is_split_required_fn is_split_required) { struct dm_target *ti; sector_t len; - unsigned num_requests; + unsigned num_bios; do { ti = dm_table_find_target(ci->map, ci->sector); @@ -1200,8 +1225,8 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci, * reconfiguration might also have changed that since the * check was performed. */ - num_requests = get_num_requests ? get_num_requests(ti) : 0; - if (!num_requests) + num_bios = get_num_bios ? get_num_bios(ti) : 0; + if (!num_bios) return -EOPNOTSUPP; if (is_split_required && !is_split_required(ti)) @@ -1209,7 +1234,7 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci, else len = min(ci->sector_count, max_io_len(ci->sector, ti)); - __issue_target_requests(ci, ti, num_requests, len); + __send_duplicate_bios(ci, ti, num_bios, len); ci->sector += len; } while (ci->sector_count -= len); @@ -1217,108 +1242,129 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci, return 0; } -static int __clone_and_map_discard(struct clone_info *ci) +static int __send_discard(struct clone_info *ci) { - return __clone_and_map_changing_extent_only(ci, get_num_discard_requests, - is_split_required_for_discard); + return __send_changing_extent_only(ci, get_num_discard_bios, + is_split_required_for_discard); } -static int __clone_and_map_write_same(struct clone_info *ci) +static int __send_write_same(struct clone_info *ci) { - return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL); + return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); } -static int __clone_and_map(struct clone_info *ci) +/* + * Find maximum number of sectors / bvecs we can process with a single bio. + */ +static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx) { struct bio *bio = ci->bio; - struct dm_target *ti; - sector_t len = 0, max; - struct dm_target_io *tio; + sector_t bv_len, total_len = 0; - if (unlikely(bio->bi_rw & REQ_DISCARD)) - return __clone_and_map_discard(ci); - else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) - return __clone_and_map_write_same(ci); + for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) { + bv_len = to_sector(bio->bi_io_vec[*idx].bv_len); - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - max = max_io_len(ci->sector, ti); + if (bv_len > max) + break; - if (ci->sector_count <= max) { - /* - * Optimise for the simple case where we can do all of - * the remaining io with a single clone. - */ - __clone_and_map_simple(ci, ti); + max -= bv_len; + total_len += bv_len; + } - } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { - /* - * There are some bvecs that don't span targets. - * Do as many of these as possible. - */ - int i; - sector_t remaining = max; - sector_t bv_len; + return total_len; +} - for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { - bv_len = to_sector(bio->bi_io_vec[i].bv_len); +static int __split_bvec_across_targets(struct clone_info *ci, + struct dm_target *ti, sector_t max) +{ + struct bio *bio = ci->bio; + struct bio_vec *bv = bio->bi_io_vec + ci->idx; + sector_t remaining = to_sector(bv->bv_len); + unsigned offset = 0; + sector_t len; - if (bv_len > remaining) - break; + do { + if (offset) { + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; - remaining -= bv_len; - len += bv_len; + max = max_io_len(ci->sector, ti); } - tio = alloc_tio(ci, ti, bio->bi_max_vecs); - clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, - ci->md->bs); - __map_bio(ti, tio); + len = min(remaining, max); + + __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0, + bv->bv_offset + offset, len, 1); ci->sector += len; ci->sector_count -= len; - ci->idx = i; + offset += to_bytes(len); + } while (remaining -= len); - } else { - /* - * Handle a bvec that must be split between two or more targets. - */ - struct bio_vec *bv = bio->bi_io_vec + ci->idx; - sector_t remaining = to_sector(bv->bv_len); - unsigned int offset = 0; + ci->idx++; - do { - if (offset) { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; + return 0; +} - max = max_io_len(ci->sector, ti); - } +/* + * Select the correct strategy for processing a non-flush bio. + */ +static int __split_and_process_non_flush(struct clone_info *ci) +{ + struct bio *bio = ci->bio; + struct dm_target *ti; + sector_t len, max; + int idx; - len = min(remaining, max); + if (unlikely(bio->bi_rw & REQ_DISCARD)) + return __send_discard(ci); + else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) + return __send_write_same(ci); - tio = alloc_tio(ci, ti, 1); - split_bvec(tio, bio, ci->sector, ci->idx, - bv->bv_offset + offset, len, ci->md->bs); + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; - __map_bio(ti, tio); + max = max_io_len(ci->sector, ti); - ci->sector += len; - ci->sector_count -= len; - offset += to_bytes(len); - } while (remaining -= len); + /* + * Optimise for the simple case where we can do all of + * the remaining io with a single clone. + */ + if (ci->sector_count <= max) { + __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, + ci->idx, bio->bi_vcnt - ci->idx, 0, + ci->sector_count, 0); + ci->sector_count = 0; + return 0; + } - ci->idx++; + /* + * There are some bvecs that don't span targets. + * Do as many of these as possible. + */ + if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { + len = __len_within_target(ci, max, &idx); + + __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, + ci->idx, idx - ci->idx, 0, len, 0); + + ci->sector += len; + ci->sector_count -= len; + ci->idx = idx; + + return 0; } - return 0; + /* + * Handle a bvec that must be split between two or more targets. + */ + return __split_bvec_across_targets(ci, ti, max); } /* - * Split the bio into several clones and submit it to targets. + * Entry point to split a bio into clones and submit them to the targets. */ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) { @@ -1342,16 +1388,17 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.idx = bio->bi_idx; start_io_acct(ci.io); + if (bio->bi_rw & REQ_FLUSH) { ci.bio = &ci.md->flush_bio; ci.sector_count = 0; - error = __clone_and_map_empty_flush(&ci); + error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ } else { ci.bio = bio; ci.sector_count = bio_sectors(bio); while (ci.sector_count && !error) - error = __clone_and_map(&ci); + error = __split_and_process_non_flush(&ci); } /* drop the extra reference count */ @@ -2420,7 +2467,7 @@ static void dm_queue_flush(struct mapped_device *md) */ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { - struct dm_table *live_map, *map = ERR_PTR(-EINVAL); + struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); struct queue_limits limits; int r; @@ -2443,10 +2490,12 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) dm_table_put(live_map); } - r = dm_calculate_queue_limits(table, &limits); - if (r) { - map = ERR_PTR(r); - goto out; + if (!live_map) { + r = dm_calculate_queue_limits(table, &limits); + if (r) { + map = ERR_PTR(r); + goto out; + } } map = __bind(md, table, &limits); diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index ceb359050a59..19b268795415 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -1,6 +1,6 @@ config DM_PERSISTENT_DATA tristate - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM select LIBCRC32C select DM_BUFIO ---help--- diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index d8e7cb767c1e..ff528792c358 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o dm-persistent-data-objs := \ + dm-array.o \ + dm-bitset.o \ dm-block-manager.o \ dm-space-map-common.o \ dm-space-map-disk.o \ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c new file mode 100644 index 000000000000..172147eb1d40 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.c @@ -0,0 +1,808 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-array.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "array" + +/*----------------------------------------------------------------*/ + +/* + * The array is implemented as a fully populated btree, which points to + * blocks that contain the packed values. This is more space efficient + * than just using a btree since we don't store 1 key per value. + */ +struct array_block { + __le32 csum; + __le32 max_entries; + __le32 nr_entries; + __le32 value_size; + __le64 blocknr; /* Block this node is supposed to live in. */ +} __packed; + +/*----------------------------------------------------------------*/ + +/* + * Validator methods. As usual we calculate a checksum, and also write the + * block location into the header (paranoia about ssds remapping areas by + * mistake). + */ +#define CSUM_XOR 595846735 + +static void array_block_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + + bh_le->blocknr = cpu_to_le64(dm_block_location(b)); + bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); +} + +static int array_block_check(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) { + DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu", + (unsigned long long) le64_to_cpu(bh_le->blocknr), + (unsigned long long) dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); + if (csum_disk != bh_le->csum) { + DMERR_LIMIT("array_block_check failed: csum %u != wanted %u", + (unsigned) le32_to_cpu(csum_disk), + (unsigned) le32_to_cpu(bh_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator array_validator = { + .name = "array", + .prepare_for_write = array_block_prepare_for_write, + .check = array_block_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Functions for manipulating the array blocks. + */ + +/* + * Returns a pointer to a value within an array block. + * + * index - The index into _this_ specific block. + */ +static void *element_at(struct dm_array_info *info, struct array_block *ab, + unsigned index) +{ + unsigned char *entry = (unsigned char *) (ab + 1); + + entry += index * info->value_type.size; + + return entry; +} + +/* + * Utility function that calls one of the value_type methods on every value + * in an array block. + */ +static void on_entries(struct dm_array_info *info, struct array_block *ab, + void (*fn)(void *, const void *)) +{ + unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); + + for (i = 0; i < nr_entries; i++) + fn(info->value_type.context, element_at(info, ab, i)); +} + +/* + * Increment every value in an array block. + */ +static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->inc) + on_entries(info, ab, vt->inc); +} + +/* + * Decrement every value in an array block. + */ +static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->dec) + on_entries(info, ab, vt->dec); +} + +/* + * Each array block can hold this many values. + */ +static uint32_t calc_max_entries(size_t value_size, size_t size_of_block) +{ + return (size_of_block - sizeof(struct array_block)) / value_size; +} + +/* + * Allocate a new array block. The caller will need to unlock block. + */ +static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_new_block(info->btree_info.tm, &array_validator, block); + if (r) + return r; + + (*ab) = dm_block_data(*block); + (*ab)->max_entries = cpu_to_le32(max_entries); + (*ab)->nr_entries = cpu_to_le32(0); + (*ab)->value_size = cpu_to_le32(info->value_type.size); + + return 0; +} + +/* + * Pad an array block out with a particular value. Every instance will + * cause an increment of the value_type. new_nr must always be more than + * the current number of entries. + */ +static void fill_ablock(struct dm_array_info *info, struct array_block *ab, + const void *value, unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i < new_nr; i++) { + if (vt->inc) + vt->inc(vt->context, value); + memcpy(element_at(info, ab, i), value, vt->size); + } + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Remove some entries from the back of an array block. Every value + * removed will be decremented. new_nr must be <= the current number of + * entries. + */ +static void trim_ablock(struct dm_array_info *info, struct array_block *ab, + unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i > new_nr; i--) + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, i - 1)); + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Read locks a block, and coerces it to an array block. The caller must + * unlock 'block' when finished. + */ +static int get_ablock(struct dm_array_info *info, dm_block_t b, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block); + if (r) + return r; + + *ab = dm_block_data(*block); + return 0; +} + +/* + * Unlocks an array block. + */ +static int unlock_ablock(struct dm_array_info *info, struct dm_block *block) +{ + return dm_tm_unlock(info->btree_info.tm, block); +} + +/*----------------------------------------------------------------*/ + +/* + * Btree manipulation. + */ + +/* + * Looks up an array block in the btree, and then read locks it. + * + * index is the index of the index of the array_block, (ie. the array index + * / max_entries). + */ +static int lookup_ablock(struct dm_array_info *info, dm_block_t root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r; + uint64_t key = index; + __le64 block_le; + + r = dm_btree_lookup(&info->btree_info, root, &key, &block_le); + if (r) + return r; + + return get_ablock(info, le64_to_cpu(block_le), block, ab); +} + +/* + * Insert an array block into the btree. The block is _not_ unlocked. + */ +static int insert_ablock(struct dm_array_info *info, uint64_t index, + struct dm_block *block, dm_block_t *root) +{ + __le64 block_le = cpu_to_le64(dm_block_location(block)); + + __dm_bless_for_disk(block_le); + return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); +} + +/* + * Looks up an array block in the btree. Then shadows it, and updates the + * btree to point to this new shadow. 'root' is an input/output parameter + * for both the current root block, and the new one. + */ +static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r, inc; + uint64_t key = index; + dm_block_t b; + __le64 block_le; + + /* + * lookup + */ + r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); + if (r) + return r; + b = le64_to_cpu(block_le); + + /* + * shadow + */ + r = dm_tm_shadow_block(info->btree_info.tm, b, + &array_validator, block, &inc); + if (r) + return r; + + *ab = dm_block_data(*block); + if (inc) + inc_ablock_entries(info, *ab); + + /* + * Reinsert. + * + * The shadow op will often be a noop. Only insert if it really + * copied data. + */ + if (dm_block_location(*block) != b) + r = insert_ablock(info, index, *block, root); + + return r; +} + +/* + * Allocate an new array block, and fill it with some values. + */ +static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + unsigned block_index, uint32_t nr, + const void *value, dm_block_t *root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); + if (r) + return r; + + fill_ablock(info, ab, value, nr); + r = insert_ablock(info, block_index, block, root); + unlock_ablock(info, block); + + return r; +} + +static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block, + unsigned begin_block, unsigned end_block, + unsigned max_entries, const void *value, + dm_block_t *root) +{ + int r = 0; + + for (; !r && begin_block != end_block; begin_block++) + r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root); + + return r; +} + +/* + * There are a bunch of functions involved with resizing an array. This + * structure holds information that commonly needed by them. Purely here + * to reduce parameter count. + */ +struct resize { + /* + * Describes the array. + */ + struct dm_array_info *info; + + /* + * The current root of the array. This gets updated. + */ + dm_block_t root; + + /* + * Metadata block size. Used to calculate the nr entries in an + * array block. + */ + size_t size_of_block; + + /* + * Maximum nr entries in an array block. + */ + unsigned max_entries; + + /* + * nr of completely full blocks in the array. + * + * 'old' refers to before the resize, 'new' after. + */ + unsigned old_nr_full_blocks, new_nr_full_blocks; + + /* + * Number of entries in the final block. 0 iff only full blocks in + * the array. + */ + unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block; + + /* + * The default value used when growing the array. + */ + const void *value; +}; + +/* + * Removes a consecutive set of array blocks from the btree. The values + * in block are decremented as a side effect of the btree remove. + * + * begin_index - the index of the first array block to remove. + * end_index - the one-past-the-end value. ie. this block is not removed. + */ +static int drop_blocks(struct resize *resize, unsigned begin_index, + unsigned end_index) +{ + int r; + + while (begin_index != end_index) { + uint64_t key = begin_index++; + r = dm_btree_remove(&resize->info->btree_info, resize->root, + &key, &resize->root); + if (r) + return r; + } + + return 0; +} + +/* + * Calculates how many blocks are needed for the array. + */ +static unsigned total_nr_blocks_needed(unsigned nr_full_blocks, + unsigned nr_entries_in_last_block) +{ + return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0); +} + +/* + * Shrink an array. + */ +static int shrink(struct resize *resize) +{ + int r; + unsigned begin, end; + struct dm_block *block; + struct array_block *ab; + + /* + * Lose some blocks from the back? + */ + if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) { + begin = total_nr_blocks_needed(resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block); + end = total_nr_blocks_needed(resize->old_nr_full_blocks, + resize->old_nr_entries_in_last_block); + + r = drop_blocks(resize, begin, end); + if (r) + return r; + } + + /* + * Trim the new tail block + */ + if (resize->new_nr_entries_in_last_block) { + r = shadow_ablock(resize->info, &resize->root, + resize->new_nr_full_blocks, &block, &ab); + if (r) + return r; + + trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block); + unlock_ablock(resize->info, block); + } + + return 0; +} + +/* + * Grow an array. + */ +static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = shadow_ablock(resize->info, &resize->root, + resize->old_nr_full_blocks, &block, &ab); + if (r) + return r; + + fill_ablock(resize->info, ab, resize->value, new_nr_entries); + unlock_ablock(resize->info, block); + + return r; +} + +static int grow_add_tail_block(struct resize *resize) +{ + return insert_new_ablock(resize->info, resize->size_of_block, + resize->max_entries, + resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block, + resize->value, &resize->root); +} + +static int grow_needs_more_blocks(struct resize *resize) +{ + int r; + + if (resize->old_nr_entries_in_last_block > 0) { + r = grow_extend_tail_block(resize, resize->max_entries); + if (r) + return r; + } + + r = insert_full_ablocks(resize->info, resize->size_of_block, + resize->old_nr_full_blocks, + resize->new_nr_full_blocks, + resize->max_entries, resize->value, + &resize->root); + if (r) + return r; + + if (resize->new_nr_entries_in_last_block) + r = grow_add_tail_block(resize); + + return r; +} + +static int grow(struct resize *resize) +{ + if (resize->new_nr_full_blocks > resize->old_nr_full_blocks) + return grow_needs_more_blocks(resize); + + else if (resize->old_nr_entries_in_last_block) + return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block); + + else + return grow_add_tail_block(resize); +} + +/*----------------------------------------------------------------*/ + +/* + * These are the value_type functions for the btree elements, which point + * to array blocks. + */ +static void block_inc(void *context, const void *value) +{ + __le64 block_le; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); +} + +static void block_dec(void *context, const void *value) +{ + int r; + uint64_t b; + __le64 block_le; + uint32_t ref_count; + struct dm_block *block; + struct array_block *ab; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + b = le64_to_cpu(block_le); + + r = dm_tm_ref(info->btree_info.tm, b, &ref_count); + if (r) { + DMERR_LIMIT("couldn't get reference count for block %llu", + (unsigned long long) b); + return; + } + + if (ref_count == 1) { + /* + * We're about to drop the last reference to this ablock. + * So we need to decrement the ref count of the contents. + */ + r = get_ablock(info, b, &block, &ab); + if (r) { + DMERR_LIMIT("couldn't get array block %llu", + (unsigned long long) b); + return; + } + + dec_ablock_entries(info, ab); + unlock_ablock(info, block); + } + + dm_tm_dec(info->btree_info.tm, b); +} + +static int block_equal(void *context, const void *value1, const void *value2) +{ + return !memcmp(value1, value2, sizeof(__le64)); +} + +/*----------------------------------------------------------------*/ + +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + struct dm_btree_value_type *bvt = &info->btree_info.value_type; + + memcpy(&info->value_type, vt, sizeof(info->value_type)); + info->btree_info.tm = tm; + info->btree_info.levels = 1; + + bvt->context = info; + bvt->size = sizeof(__le64); + bvt->inc = block_inc; + bvt->dec = block_dec; + bvt->equal = block_equal; +} +EXPORT_SYMBOL_GPL(dm_array_info_init); + +int dm_array_empty(struct dm_array_info *info, dm_block_t *root) +{ + return dm_btree_empty(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_empty); + +static int array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) +{ + int r; + struct resize resize; + + if (old_size == new_size) + return 0; + + resize.info = info; + resize.root = root; + resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + resize.max_entries = calc_max_entries(info->value_type.size, + resize.size_of_block); + + resize.old_nr_full_blocks = old_size / resize.max_entries; + resize.old_nr_entries_in_last_block = old_size % resize.max_entries; + resize.new_nr_full_blocks = new_size / resize.max_entries; + resize.new_nr_entries_in_last_block = new_size % resize.max_entries; + resize.value = value; + + r = ((new_size > old_size) ? grow : shrink)(&resize); + if (r) + return r; + + *new_root = resize.root; + return 0; +} + +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r = array_resize(info, root, old_size, new_size, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_resize); + +int dm_array_del(struct dm_array_info *info, dm_block_t root) +{ + return dm_btree_del(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_del); + +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value_le) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned entry, max_entries; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = lookup_ablock(info, root, index / max_entries, &block, &ab); + if (r) + return r; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) + r = -ENODATA; + else + memcpy(value_le, element_at(info, ab, entry), + info->value_type.size); + + unlock_ablock(info, block); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_get_value); + +static int array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned max_entries; + unsigned entry; + void *old_value; + struct dm_btree_value_type *vt = &info->value_type; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = shadow_ablock(info, &root, index / max_entries, &block, &ab); + if (r) + return r; + *new_root = root; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) { + r = -ENODATA; + goto out; + } + + old_value = element_at(info, ab, entry); + if (vt->dec && + (!vt->equal || !vt->equal(vt->context, old_value, value))) { + vt->dec(vt->context, old_value); + if (vt->inc) + vt->inc(vt->context, value); + } + + memcpy(old_value, value, info->value_type.size); + +out: + unlock_ablock(info, block); + return r; +} + +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r; + + r = array_set_value(info, root, index, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_set_value); + +struct walk_info { + struct dm_array_info *info; + int (*fn)(void *context, uint64_t key, void *leaf); + void *context; +}; + +static int walk_ablock(void *context, uint64_t *keys, void *leaf) +{ + struct walk_info *wi = context; + + int r; + unsigned i; + __le64 block_le; + unsigned nr_entries, max_entries; + struct dm_block *block; + struct array_block *ab; + + memcpy(&block_le, leaf, sizeof(block_le)); + r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab); + if (r) + return r; + + max_entries = le32_to_cpu(ab->max_entries); + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = 0; i < nr_entries; i++) { + r = wi->fn(wi->context, keys[0] * max_entries + i, + element_at(wi->info, ab, i)); + + if (r) + break; + } + + unlock_ablock(wi->info, block); + return r; +} + +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *, uint64_t key, void *leaf), + void *context) +{ + struct walk_info wi; + + wi.info = info; + wi.fn = fn; + wi.context = context; + + return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi); +} +EXPORT_SYMBOL_GPL(dm_array_walk); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h new file mode 100644 index 000000000000..ea177d6fa58f --- /dev/null +++ b/drivers/md/persistent-data/dm-array.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_ARRAY_H +#define _LINUX_DM_ARRAY_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * The dm-array is a persistent version of an array. It packs the data + * more efficiently than a btree which will result in less disk space use, + * and a performance boost. The element get and set operations are still + * O(ln(n)), but with a much smaller constant. + * + * The value type structure is reused from the btree type to support proper + * reference counting of values. + * + * The arrays implicitly know their length, and bounds are checked for + * lookups and updated. It doesn't store this in an accessible place + * because it would waste a whole metadata block. Make sure you store the + * size along with the array root in your encompassing data. + * + * Array entries are indexed via an unsigned integer starting from zero. + * Arrays are not sparse; if you resize an array to have 'n' entries then + * 'n - 1' will be the last valid index. + * + * Typical use: + * + * a) initialise a dm_array_info structure. This describes the array + * values and ties it into a specific transaction manager. It holds no + * instance data; the same info can be used for many similar arrays if + * you wish. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an array. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty array with dm_array_empty(). + * + * Like the other data structures in this library, dm_array objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * c) resize an array with dm_array_resize(). + * + * d) Get a value from the array with dm_array_get_value(). + * + * e) Set a value in the array with dm_array_set_value(). + * + * f) Walk an array of values in index order with dm_array_walk(). More + * efficient than making many calls to dm_array_get_value(). + * + * g) Destroy the array with dm_array_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_array_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Describes an array. Don't initialise this structure yourself, use the + * init function below. + */ +struct dm_array_info { + struct dm_transaction_manager *tm; + struct dm_btree_value_type value_type; + struct dm_btree_info btree_info; +}; + +/* + * Sets up a dm_array_info structure. You don't need to do anything with + * this structure when you finish using it. + * + * info - the structure being filled in. + * tm - the transaction manager that should supervise this structure. + * vt - describes the leaf values. + */ +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + +/* + * Create an empty, zero length array. + * + * info - describes the array + * root - on success this will be filled out with the root block + */ +int dm_array_empty(struct dm_array_info *info, dm_block_t *root); + +/* + * Resizes the array. + * + * info - describes the array + * root - the root block of the array on disk + * old_size - the caller is responsible for remembering the size of + * the array + * new_size - can be bigger or smaller than old_size + * value - if we're growing the array the new entries will have this value + * new_root - on success, points to the new root block + * + * If growing the inc function for 'value' will be called the appropriate + * number of times. So if the caller is holding a reference they may want + * to drop it. + */ +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Frees a whole array. The value_type's decrement operation will be called + * for all values in the array + */ +int dm_array_del(struct dm_array_info *info, dm_block_t root); + +/* + * Lookup a value in the array + * + * info - describes the array + * root - root block of the array + * index - array index + * value - the value to be read. Will be in on-disk format of course. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value); + +/* + * Set an entry in the array. + * + * info - describes the array + * root - root block of the array + * index - array index + * value - value to be written to disk. Make sure you confirm the value is + * in on-disk format with__dm_bless_for_disk() before calling. + * new_root - the new root block + * + * The old value being overwritten will be decremented, the new value + * incremented. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Walk through all the entries in an array. + * + * info - describes the array + * root - root block of the array + * fn - called back for every element + * context - passed to the callback + */ +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t key, void *leaf), + void *context); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_ARRAY_H */ diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c new file mode 100644 index 000000000000..cd9a86d4cdf0 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-bitset.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "bitset" +#define BITS_PER_ARRAY_ENTRY 64 + +/*----------------------------------------------------------------*/ + +static struct dm_btree_value_type bitset_bvt = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL, +}; + +/*----------------------------------------------------------------*/ + +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info) +{ + dm_array_info_init(&info->array_info, tm, &bitset_bvt); + info->current_index_set = false; +} +EXPORT_SYMBOL_GPL(dm_disk_bitset_init); + +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) +{ + return dm_array_empty(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_empty); + +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root) +{ + uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY); + uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY); + __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0); + + __dm_bless_for_disk(&value); + return dm_array_resize(&info->array_info, root, old_blocks, new_blocks, + &value, new_root); +} +EXPORT_SYMBOL_GPL(dm_bitset_resize); + +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root) +{ + return dm_array_del(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_del); + +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root) +{ + int r; + __le64 value; + + if (!info->current_index_set) + return 0; + + value = cpu_to_le64(info->current_bits); + + __dm_bless_for_disk(&value); + r = dm_array_set_value(&info->array_info, root, info->current_index, + &value, new_root); + if (r) + return r; + + info->current_index_set = false; + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_flush); + +static int read_bits(struct dm_disk_bitset *info, dm_block_t root, + uint32_t array_index) +{ + int r; + __le64 value; + + r = dm_array_get_value(&info->array_info, root, array_index, &value); + if (r) + return r; + + info->current_bits = le64_to_cpu(value); + info->current_index_set = true; + info->current_index = array_index; + return 0; +} + +static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned array_index = index / BITS_PER_ARRAY_ENTRY; + + if (info->current_index_set) { + if (info->current_index == array_index) + return 0; + + r = dm_bitset_flush(info, root, new_root); + if (r) + return r; + } + + return read_bits(info, root, array_index); +} + +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + set_bit(b, (unsigned long *) &info->current_bits); + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_set_bit); + +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + clear_bit(b, (unsigned long *) &info->current_bits); + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); + +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + *result = test_bit(b, (unsigned long *) &info->current_bits); + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_test_bit); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h new file mode 100644 index 000000000000..e1b9bea14aa1 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.h @@ -0,0 +1,165 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BITSET_H +#define _LINUX_DM_BITSET_H + +#include "dm-array.h" + +/*----------------------------------------------------------------*/ + +/* + * This bitset type is a thin wrapper round a dm_array of 64bit words. It + * uses a tiny, one word cache to reduce the number of array lookups and so + * increase performance. + * + * Like the dm-array that it's based on, the caller needs to keep track of + * the size of the bitset separately. The underlying dm-array implicitly + * knows how many words it's storing and will return -ENODATA if you try + * and access an out of bounds word. However, an out of bounds bit in the + * final word will _not_ be detected, you have been warned. + * + * Bits are indexed from zero. + + * Typical use: + * + * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init(). + * This describes the bitset and includes the cache. It's not called it + * dm_bitset_info in line with other data structures because it does + * include instance data. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an bitset. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty bitset with dm_bitset_empty(). + * + * Like the other data structures in this library, dm_bitset objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * Even read operations may trigger the cache to be flushed and as such + * return a root for a new, updated bitset. + * + * c) resize a bitset with dm_bitset_resize(). + * + * d) Set a bit with dm_bitset_set_bit(). + * + * e) Clear a bit with dm_bitset_clear_bit(). + * + * f) Test a bit with dm_bitset_test_bit(). + * + * g) Flush all updates from the cache with dm_bitset_flush(). + * + * h) Destroy the bitset with dm_bitset_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_bitset_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Opaque object. Unlike dm_array_info, you should have one of these per + * bitset. Initialise with dm_disk_bitset_init(). + */ +struct dm_disk_bitset { + struct dm_array_info array_info; + + uint32_t current_index; + uint64_t current_bits; + + bool current_index_set:1; +}; + +/* + * Sets up a dm_disk_bitset structure. You don't need to do anything with + * this structure when you finish using it. + * + * tm - the transaction manager that should supervise this structure + * info - the structure being initialised + */ +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info); + +/* + * Create an empty, zero length bitset. + * + * info - describes the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); + +/* + * Resize the bitset. + * + * info - describes the bitset + * old_root - the root block of the array on disk + * old_nr_entries - the number of bits in the old bitset + * new_nr_entries - the number of bits you want in the new bitset + * default_value - the value for any new bits + * new_root - on success, points to the new root block + */ +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root); + +/* + * Frees the bitset. + */ +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root); + +/* + * Set a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Clears a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Tests a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block (cached values may have been written) + * result - the bit value you're after + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result); + +/* + * Flush any cached changes to disk. + * + * info - describes the bitset + * root - the root block of the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index accbb05f17b6..37d367bb9aa8 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -64,6 +64,7 @@ struct ro_spine { void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); int exit_ro_spine(struct ro_spine *s); int ro_step(struct ro_spine *s, dm_block_t new_child); +void ro_pop(struct ro_spine *s); struct btree_node *ro_node(struct ro_spine *s); struct shadow_spine { diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index f199a0c4ed04..cf9fd676ae44 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -164,6 +164,13 @@ int ro_step(struct ro_spine *s, dm_block_t new_child) return r; } +void ro_pop(struct ro_spine *s) +{ + BUG_ON(!s->count); + --s->count; + unlock_block(s->info, s->nodes[s->count]); +} + struct btree_node *ro_node(struct ro_spine *s) { struct dm_block *block; diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 4caf66918cdb..35865425e4b4 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -807,3 +807,55 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, return r ? r : count; } EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); + +/* + * FIXME: We shouldn't use a recursive algorithm when we have limited stack + * space. Also this only works for single level trees. + */ +static int walk_node(struct ro_spine *s, dm_block_t block, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + unsigned i, nr; + struct btree_node *n; + uint64_t keys; + + r = ro_step(s, block); + n = ro_node(s); + + nr = le32_to_cpu(n->header.nr_entries); + for (i = 0; i < nr; i++) { + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { + r = walk_node(s, value64(n, i), fn, context); + if (r) + goto out; + } else { + keys = le64_to_cpu(*key_ptr(n, i)); + r = fn(context, &keys, value_ptr(n, i)); + if (r) + goto out; + } + } + +out: + ro_pop(s); + return r; +} + +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + struct ro_spine spine; + + BUG_ON(info->levels > 1); + + init_ro_spine(&spine, info); + r = walk_node(&spine, root, fn, context); + exit_ro_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_walk); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index a2cd50441ca1..8672d159e0b5 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -58,21 +58,21 @@ struct dm_btree_value_type { * somewhere.) This method is _not_ called for insertion of a new * value: It is assumed the ref count is already 1. */ - void (*inc)(void *context, void *value); + void (*inc)(void *context, const void *value); /* * This value is being deleted. The btree takes care of freeing * the memory pointed to by @value. Often the del function just * needs to decrement a reference count somewhere. */ - void (*dec)(void *context, void *value); + void (*dec)(void *context, const void *value); /* * A test for equality between two values. When a value is * overwritten with a new one, the old one has the dec method * called _unless_ the new and old value are deemed equal. */ - int (*equal)(void *context, void *value1, void *value2); + int (*equal)(void *context, const void *value1, const void *value2); }; /* @@ -142,4 +142,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, uint64_t *result_keys); +/* + * Iterate through the a btree, calling fn() on each entry. + * It only works for single level trees and is internally recursive, so + * monitor stack usage carefully. + */ +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context); + #endif /* _LINUX_DM_BTREE_H */ |