diff options
-rw-r--r-- | Documentation/device-mapper/switch.txt | 12 | ||||
-rw-r--r-- | block/bio.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.h | 8 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 128 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 401 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 77 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-switch.c | 67 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 86 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 181 | ||||
-rw-r--r-- | drivers/md/dm.h | 1 |
12 files changed, 590 insertions, 384 deletions
diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt index 2fa749387be8..8897d0494838 100644 --- a/Documentation/device-mapper/switch.txt +++ b/Documentation/device-mapper/switch.txt @@ -106,6 +106,11 @@ which paths. The path number in the range 0 ... (<num_paths> - 1). Expressed in hexadecimal (WITHOUT any prefix like 0x). +R<n>,<m> + This parameter allows repetitive patterns to be loaded quickly. <n> and <m> + are hexadecimal numbers. The last <n> mappings are repeated in the next <m> + slots. + Status ====== @@ -124,3 +129,10 @@ Create a switch device with 64kB region size: Set mappings for the first 7 entries to point to devices switch0, switch1, switch2, switch0, switch1, switch2, switch1: dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 + +Set repetitive mapping. This command: + dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 +is equivalent to: + dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ + :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 + diff --git a/block/bio.c b/block/bio.c index 0ec61c9e536c..3e6331d25d90 100644 --- a/block/bio.c +++ b/block/bio.c @@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) bslab = &bio_slabs[entry]; snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); + slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN, NULL); if (!slab) goto out_unlock; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index d2899e7eb3aa..06709257adde 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); - disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); @@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, bool may_format_device) { int r; - cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE, + cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_METADATA_CACHE_SIZE, CACHE_MAX_CONCURRENT_LOCKS); if (IS_ERR(cmd->bm)) { diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index cd70a78623a3..7383c90ccdb8 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -9,19 +9,17 @@ #include "dm-cache-block-types.h" #include "dm-cache-policy-internal.h" +#include "persistent-data/dm-space-map-metadata.h" /*----------------------------------------------------------------*/ -#define DM_CACHE_METADATA_BLOCK_SIZE 4096 +#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE /* FIXME: remove this restriction */ /* * The metadata device is currently limited in size. - * - * We have one block of index, which can hold 255 index entries. Each - * index entry contains allocation info about 16k metadata blocks. */ -#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) +#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS /* * A metadata device larger than 16GB triggers a warning. diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2c63326638b6..1af40ee209e2 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio) return bio->bi_rw & (REQ_FLUSH | REQ_FUA); } +/* + * You must increment the deferred set whilst the prison cell is held. To + * encourage this, we ask for 'cell' to be passed in. + */ +static void inc_ds(struct cache *cache, struct bio *bio, + struct dm_bio_prison_cell *cell) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + BUG_ON(!cell); + BUG_ON(pb->all_io_entry); + + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); +} + static void issue(struct cache *cache, struct bio *bio) { unsigned long flags; @@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) +{ + inc_ds(cache, bio, cell); + issue(cache, bio); +} + static void defer_writethrough_bio(struct cache *cache, struct bio *bio) { unsigned long flags; @@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); + + /* + * No need to inc_ds() here, since the cell will be held for the + * duration of the io. + */ generic_make_request(bio); } @@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache, return; INIT_LIST_HEAD(&work); - if (pb->all_io_entry) - dm_deferred_entry_dec(pb->all_io_entry, &work); + dm_deferred_entry_dec(pb->all_io_entry, &work); if (!list_empty(&work)) queue_quiesced_migrations(cache, &work); @@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) else remap_to_cache(cache, bio, 0); + /* + * REQ_FLUSH is not directed at any particular block so we don't + * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH + * by dm-core. + */ issue(cache, bio); } @@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) &cache->stats.read_miss : &cache->stats.write_miss); } -static void issue_cache_bio(struct cache *cache, struct bio *bio, - struct per_bio_data *pb, - dm_oblock_t oblock, dm_cblock_t cblock) -{ - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - remap_to_cache_dirty(cache, bio, oblock, cblock); - issue(cache, bio); -} - static void process_bio(struct cache *cache, struct prealloc *structs, struct bio *bio) { @@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs, dm_oblock_t block = get_bio_block(cache, bio); struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; struct policy_result lookup_result; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); bool passthrough = passthrough_mode(&cache->features); bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); @@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, } else { /* FIXME: factor out issue_origin() */ - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); + inc_and_issue(cache, bio, new_ocell); } } else { inc_hit_counter(cache, bio); @@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs, if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) { - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - issue(cache, bio); - } else - issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); + inc_and_issue(cache, bio, new_ocell); + + } else { + remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + inc_and_issue(cache, bio, new_ocell); + } } break; case POLICY_MISS: inc_miss_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); + inc_and_issue(cache, bio, new_ocell); break; case POLICY_NEW: @@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) bio_list_init(&cache->deferred_flush_bios); spin_unlock_irqrestore(&cache->lock, flags); + /* + * These bios have already been through inc_ds() + */ while ((bio = bio_list_pop(&bios))) submit_bios ? generic_make_request(bio) : bio_io_error(bio); } @@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache) bio_list_init(&cache->deferred_writethrough_bios); spin_unlock_irqrestore(&cache->lock, flags); + /* + * These bios have already been through inc_ds() + */ while ((bio = bio_list_pop(&bios))) generic_make_request(bio); } @@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws) if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); + process_migrations(cache, &cache->need_commit_migrations, migration_failure); /* * FIXME: rollback metadata or just go into a @@ -2406,16 +2433,13 @@ out: return r; } -static int cache_map(struct dm_target *ti, struct bio *bio) +static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell) { - struct cache *cache = ti->private; - int r; dm_oblock_t block = get_bio_block(cache, bio); size_t pb_data_size = get_per_bio_data_size(cache); bool can_migrate = false; bool discarded_block; - struct dm_bio_prison_cell *cell; struct policy_result lookup_result; struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); @@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio) /* * Check to see if that block is currently migrating. */ - cell = alloc_prison_cell(cache); - if (!cell) { + *cell = alloc_prison_cell(cache); + if (!*cell) { defer_bio(cache, bio); return DM_MAPIO_SUBMITTED; } - r = bio_detain(cache, block, bio, cell, + r = bio_detain(cache, block, bio, *cell, (cell_free_fn) free_prison_cell, - cache, &cell); + cache, cell); if (r) { if (r < 0) defer_bio(cache, bio); @@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) r = policy_map(cache->policy, block, false, can_migrate, discarded_block, bio, &lookup_result); if (r == -EWOULDBLOCK) { - cell_defer(cache, cell, true); + cell_defer(cache, *cell, true); return DM_MAPIO_SUBMITTED; } else if (r) { DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); + cell_defer(cache, *cell, false); bio_io_error(bio); return DM_MAPIO_SUBMITTED; } @@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio) * We need to invalidate this block, so * defer for the worker thread. */ - cell_defer(cache, cell, true); + cell_defer(cache, *cell, true); r = DM_MAPIO_SUBMITTED; } else { - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); inc_miss_counter(cache, bio); remap_to_origin_clear_discard(cache, bio, block); - - cell_defer(cache, cell, false); } } else { inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - - cell_defer(cache, cell, false); } break; case POLICY_MISS: inc_miss_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (pb->req_nr != 0) { /* * This is a duplicate writethrough io that is no * longer needed because the block has been demoted. */ bio_endio(bio, 0); - cell_defer(cache, cell, false); - return DM_MAPIO_SUBMITTED; - } else { + cell_defer(cache, *cell, false); + r = DM_MAPIO_SUBMITTED; + + } else remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } + break; default: DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, (unsigned) lookup_result.op); + cell_defer(cache, *cell, false); bio_io_error(bio); r = DM_MAPIO_SUBMITTED; } @@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } +static int cache_map(struct dm_target *ti, struct bio *bio) +{ + int r; + struct dm_bio_prison_cell *cell; + struct cache *cache = ti->private; + + r = __cache_map(cache, bio, &cell); + if (r == DM_MAPIO_REMAPPED) { + inc_ds(cache, bio, cell); + cell_defer(cache, cell, false); + } + + return r; +} + static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) { struct cache *cache = ti->private; @@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", - (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), + (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, cache->sectors_per_block, @@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < cache->sectors_per_block || do_div(io_opt_sectors, cache->sectors_per_block)) { - blk_limits_io_min(limits, 0); + blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); } set_discard_limits(cache, limits); @@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4cba2d808afb..ba2fad80a302 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,9 +18,11 @@ #include <linux/slab.h> #include <linux/crypto.h> #include <linux/workqueue.h> +#include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/atomic.h> #include <linux/scatterlist.h> +#include <linux/rbtree.h> #include <asm/page.h> #include <asm/unaligned.h> #include <crypto/hash.h> @@ -55,11 +57,11 @@ struct dm_crypt_io { struct convert_context ctx; - atomic_t io_pending; int error; sector_t sector; - struct dm_crypt_io *base_io; -}; + + struct rb_node rb_node; +} CRYPTO_MINALIGN_ATTR; struct dm_crypt_request { struct convert_context *ctx; @@ -121,14 +123,18 @@ struct crypt_config { * pool for per bio private data, crypto requests and * encryption requeusts/buffer pages */ - mempool_t *io_pool; mempool_t *req_pool; mempool_t *page_pool; struct bio_set *bs; + struct mutex bio_alloc_lock; struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; + struct task_struct *write_thread; + wait_queue_head_t write_thread_wait; + struct rb_root write_tree; + char *cipher; char *cipher_string; @@ -162,6 +168,8 @@ struct crypt_config { */ unsigned int dmreq_start; + unsigned int per_bio_data_size; + unsigned long flags; unsigned int key_size; unsigned int key_parts; /* independent parts in key buffer */ @@ -170,9 +178,6 @@ struct crypt_config { }; #define MIN_IOS 16 -#define MIN_POOL_PAGES 32 - -static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); @@ -895,6 +900,15 @@ static void crypt_alloc_req(struct crypt_config *cc, kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } +static void crypt_free_req(struct crypt_config *cc, + struct ablkcipher_request *req, struct bio *base_bio) +{ + struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); + + if ((struct ablkcipher_request *)(io + 1) != req) + mempool_free(req, cc->req_pool); +} + /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ @@ -941,57 +955,71 @@ static int crypt_convert(struct crypt_config *cc, return 0; } +static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone); + /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations - * May return a smaller bio when running out of pages, indicated by - * *out_of_pages set to 1. + * + * This function may be called concurrently. If we allocate from the mempool + * concurrently, there is a possibility of deadlock. For example, if we have + * mempool of 256 pages, two processes, each wanting 256, pages allocate from + * the mempool concurrently, it may deadlock in a situation where both processes + * have allocated 128 pages and the mempool is exhausted. + * + * In order to avoid this scenario we allocate the pages under a mutex. + * + * In order to not degrade performance with excessive locking, we try + * non-blocking allocation without a mutex first and if it fails, we fallback + * to a blocking allocation with a mutex. */ -static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, - unsigned *out_of_pages) +static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) { struct crypt_config *cc = io->cc; struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; - unsigned i, len; + gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; + unsigned i, len, remaining_size; struct page *page; + struct bio_vec *bvec; + +retry: + if (unlikely(gfp_mask & __GFP_WAIT)) + mutex_lock(&cc->bio_alloc_lock); clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); if (!clone) - return NULL; + goto return_clone; clone_init(io, clone); - *out_of_pages = 0; + + remaining_size = size; for (i = 0; i < nr_iovecs; i++) { page = mempool_alloc(cc->page_pool, gfp_mask); if (!page) { - *out_of_pages = 1; - break; + BUG_ON(gfp_mask & __GFP_WAIT); + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + gfp_mask |= __GFP_WAIT; + goto retry; } - /* - * If additional pages cannot be allocated without waiting, - * return a partially-allocated bio. The caller will then try - * to allocate more bios while submitting this partial bio. - */ - gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; + len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; - len = (size > PAGE_SIZE) ? PAGE_SIZE : size; + bvec = &clone->bi_io_vec[clone->bi_vcnt++]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = 0; - if (!bio_add_page(clone, page, len, 0)) { - mempool_free(page, cc->page_pool); - break; - } + clone->bi_iter.bi_size += len; - size -= len; + remaining_size -= len; } - if (!clone->bi_iter.bi_size) { - bio_put(clone); - return NULL; - } +return_clone: + if (unlikely(gfp_mask & __GFP_WAIT)) + mutex_unlock(&cc->bio_alloc_lock); return clone; } @@ -1008,54 +1036,30 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) } } -static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, - struct bio *bio, sector_t sector) +static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, + struct bio *bio, sector_t sector) { - struct dm_crypt_io *io; - - io = mempool_alloc(cc->io_pool, GFP_NOIO); io->cc = cc; io->base_bio = bio; io->sector = sector; io->error = 0; - io->base_io = NULL; io->ctx.req = NULL; - atomic_set(&io->io_pending, 0); - - return io; -} - -static void crypt_inc_pending(struct dm_crypt_io *io) -{ - atomic_inc(&io->io_pending); } /* * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. - * If base_io is set, wait for the last fragment to complete. */ -static void crypt_dec_pending(struct dm_crypt_io *io) +static void crypt_end_io(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; struct bio *base_bio = io->base_bio; - struct dm_crypt_io *base_io = io->base_io; int error = io->error; - if (!atomic_dec_and_test(&io->io_pending)) - return; - if (io->ctx.req) - mempool_free(io->ctx.req, cc->req_pool); - mempool_free(io, cc->io_pool); - - if (likely(!base_io)) - bio_endio(base_bio, error); - else { - if (error && !base_io->error) - base_io->error = error; - crypt_dec_pending(base_io); - } + crypt_free_req(cc, io->ctx.req, base_bio); + + bio_endio(base_bio, error); } /* @@ -1100,7 +1104,7 @@ static void crypt_endio(struct bio *clone, int error) if (unlikely(error)) io->error = error; - crypt_dec_pending(io); + crypt_end_io(io); } static void clone_init(struct dm_crypt_io *io, struct bio *clone) @@ -1128,8 +1132,6 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) if (!clone) return 1; - crypt_inc_pending(io); - clone_init(io, clone); clone->bi_iter.bi_sector = cc->start + io->sector; @@ -1137,42 +1139,100 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) return 0; } +static void kcryptd_io_read_work(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; +} + +static void kcryptd_queue_read(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->cc; + + INIT_WORK(&io->work, kcryptd_io_read_work); + queue_work(cc->io_queue, &io->work); +} + static void kcryptd_io_write(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; + generic_make_request(clone); } -static void kcryptd_io(struct work_struct *work) +#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node) + +static int dmcrypt_write(void *data) { - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + struct crypt_config *cc = data; + struct dm_crypt_io *io; - if (bio_data_dir(io->base_bio) == READ) { - crypt_inc_pending(io); - if (kcryptd_io_read(io, GFP_NOIO)) - io->error = -ENOMEM; - crypt_dec_pending(io); - } else - kcryptd_io_write(io); -} + while (1) { + struct rb_root write_tree; + struct blk_plug plug; -static void kcryptd_queue_io(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->cc; + DECLARE_WAITQUEUE(wait, current); - INIT_WORK(&io->work, kcryptd_io); - queue_work(cc->io_queue, &io->work); + spin_lock_irq(&cc->write_thread_wait.lock); +continue_locked: + + if (!RB_EMPTY_ROOT(&cc->write_tree)) + goto pop_from_list; + + __set_current_state(TASK_INTERRUPTIBLE); + __add_wait_queue(&cc->write_thread_wait, &wait); + + spin_unlock_irq(&cc->write_thread_wait.lock); + + if (unlikely(kthread_should_stop())) { + set_task_state(current, TASK_RUNNING); + remove_wait_queue(&cc->write_thread_wait, &wait); + break; + } + + schedule(); + + set_task_state(current, TASK_RUNNING); + spin_lock_irq(&cc->write_thread_wait.lock); + __remove_wait_queue(&cc->write_thread_wait, &wait); + goto continue_locked; + +pop_from_list: + write_tree = cc->write_tree; + cc->write_tree = RB_ROOT; + spin_unlock_irq(&cc->write_thread_wait.lock); + + BUG_ON(rb_parent(write_tree.rb_node)); + + /* + * Note: we cannot walk the tree here with rb_next because + * the structures may be freed when kcryptd_io_write is called. + */ + blk_start_plug(&plug); + do { + io = crypt_io_from_node(rb_first(&write_tree)); + rb_erase(&io->rb_node, &write_tree); + kcryptd_io_write(io); + } while (!RB_EMPTY_ROOT(&write_tree)); + blk_finish_plug(&plug); + } + return 0; } -static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) +static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; struct crypt_config *cc = io->cc; + unsigned long flags; + sector_t sector; + struct rb_node **rbp, *parent; if (unlikely(io->error < 0)) { crypt_free_buffer_pages(cc, clone); bio_put(clone); - crypt_dec_pending(io); + crypt_end_io(io); return; } @@ -1181,109 +1241,62 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) clone->bi_iter.bi_sector = cc->start + io->sector; - if (async) - kcryptd_queue_io(io); - else - generic_make_request(clone); + spin_lock_irqsave(&cc->write_thread_wait.lock, flags); + rbp = &cc->write_tree.rb_node; + parent = NULL; + sector = io->sector; + while (*rbp) { + parent = *rbp; + + if (sector < crypt_io_from_node(parent)->sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + rb_link_node(&io->rb_node, parent, rbp); + rb_insert_color(&io->rb_node, &cc->write_tree); + + wake_up_locked(&cc->write_thread_wait); + spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags); } static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; struct bio *clone; - struct dm_crypt_io *new_io; int crypt_finished; - unsigned out_of_pages = 0; - unsigned remaining = io->base_bio->bi_iter.bi_size; sector_t sector = io->sector; int r; - /* - * Prevent io from disappearing until this function completes. - */ - crypt_inc_pending(io); crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); - /* - * The allocated buffers can be smaller than the whole bio, - * so repeat the whole process until all the data can be handled. - */ - while (remaining) { - clone = crypt_alloc_buffer(io, remaining, &out_of_pages); - if (unlikely(!clone)) { - io->error = -ENOMEM; - break; - } - - io->ctx.bio_out = clone; - io->ctx.iter_out = clone->bi_iter; - - remaining -= clone->bi_iter.bi_size; - sector += bio_sectors(clone); - - crypt_inc_pending(io); - - r = crypt_convert(cc, &io->ctx); - if (r < 0) - io->error = -EIO; - - crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); - - /* Encryption was already finished, submit io now */ - if (crypt_finished) { - kcryptd_crypt_write_io_submit(io, 0); - - /* - * If there was an error, do not try next fragments. - * For async, error is processed in async handler. - */ - if (unlikely(r < 0)) - break; + clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); + if (unlikely(!clone)) { + io->error = -EIO; + crypt_end_io(io); + return; + } - io->sector = sector; - } + io->ctx.bio_out = clone; + io->ctx.iter_out = clone->bi_iter; - /* - * Out of memory -> run queues - * But don't wait if split was due to the io size restriction - */ - if (unlikely(out_of_pages)) - congestion_wait(BLK_RW_ASYNC, HZ/100); + sector += bio_sectors(clone); - /* - * With async crypto it is unsafe to share the crypto context - * between fragments, so switch to a new dm_crypt_io structure. - */ - if (unlikely(!crypt_finished && remaining)) { - new_io = crypt_io_alloc(io->cc, io->base_bio, - sector); - crypt_inc_pending(new_io); - crypt_convert_init(cc, &new_io->ctx, NULL, - io->base_bio, sector); - new_io->ctx.iter_in = io->ctx.iter_in; - - /* - * Fragments after the first use the base_io - * pending count. - */ - if (!io->base_io) - new_io->base_io = io; - else { - new_io->base_io = io->base_io; - crypt_inc_pending(io->base_io); - crypt_dec_pending(io); - } + r = crypt_convert(cc, &io->ctx); + if (r) + io->error = -EIO; + crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); - io = new_io; - } + /* Encryption was already finished, submit io now */ + if (crypt_finished) { + kcryptd_crypt_write_io_submit(io); + io->sector = sector; } - - crypt_dec_pending(io); } static void kcryptd_crypt_read_done(struct dm_crypt_io *io) { - crypt_dec_pending(io); + crypt_end_io(io); } static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) @@ -1291,8 +1304,6 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) struct crypt_config *cc = io->cc; int r = 0; - crypt_inc_pending(io); - crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, io->sector); @@ -1302,8 +1313,6 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) if (atomic_dec_and_test(&io->ctx.cc_pending)) kcryptd_crypt_read_done(io); - - crypt_dec_pending(io); } static void kcryptd_async_done(struct crypto_async_request *async_req, @@ -1325,7 +1334,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (error < 0) io->error = -EIO; - mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); + crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); if (!atomic_dec_and_test(&ctx->cc_pending)) return; @@ -1333,7 +1342,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (bio_data_dir(io->base_bio) == READ) kcryptd_crypt_read_done(io); else - kcryptd_crypt_write_io_submit(io, 1); + kcryptd_crypt_write_io_submit(io); } static void kcryptd_crypt(struct work_struct *work) @@ -1480,6 +1489,9 @@ static void crypt_dtr(struct dm_target *ti) if (!cc) return; + if (cc->write_thread) + kthread_stop(cc->write_thread); + if (cc->io_queue) destroy_workqueue(cc->io_queue); if (cc->crypt_queue) @@ -1494,8 +1506,6 @@ static void crypt_dtr(struct dm_target *ti) mempool_destroy(cc->page_pool); if (cc->req_pool) mempool_destroy(cc->req_pool); - if (cc->io_pool) - mempool_destroy(cc->io_pool); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); @@ -1708,19 +1718,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret < 0) goto bad; - ret = -ENOMEM; - cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); - if (!cc->io_pool) { - ti->error = "Cannot allocate crypt io mempool"; - goto bad; - } - cc->dmreq_start = sizeof(struct ablkcipher_request); cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & ~(crypto_tfm_ctx_alignment() - 1); + ret = -ENOMEM; cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + sizeof(struct dm_crypt_request) + cc->iv_size); if (!cc->req_pool) { @@ -1728,7 +1732,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); + cc->per_bio_data_size = ti->per_bio_data_size = + sizeof(struct dm_crypt_io) + cc->dmreq_start + + sizeof(struct dm_crypt_request) + cc->iv_size; + + cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; goto bad; @@ -1740,6 +1748,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + mutex_init(&cc->bio_alloc_lock); + ret = -EINVAL; if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid iv_offset sector"; @@ -1790,12 +1800,24 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->crypt_queue = alloc_workqueue("kcryptd", - WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; } + init_waitqueue_head(&cc->write_thread_wait); + cc->write_tree = RB_ROOT; + + cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write"); + if (IS_ERR(cc->write_thread)) { + ret = PTR_ERR(cc->write_thread); + cc->write_thread = NULL; + ti->error = "Couldn't spawn write thread"; + goto bad; + } + wake_up_process(cc->write_thread); + ti->num_flush_bios = 1; ti->discard_zeroes_data_unsupported = true; @@ -1824,11 +1846,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } - io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + io = dm_per_bio_data(bio, cc->per_bio_data_size); + crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + io->ctx.req = (struct ablkcipher_request *)(io + 1); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) - kcryptd_queue_io(io); + kcryptd_queue_read(io); } else kcryptd_queue_crypt(io); @@ -1974,15 +1998,9 @@ static int __init dm_crypt_init(void) { int r; - _crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0); - if (!_crypt_io_pool) - return -ENOMEM; - r = dm_register_target(&crypt_target); - if (r < 0) { + if (r < 0) DMERR("register failed %d", r); - kmem_cache_destroy(_crypt_io_pool); - } return r; } @@ -1990,7 +2008,6 @@ static int __init dm_crypt_init(void) static void __exit dm_crypt_exit(void) { dm_unregister_target(&crypt_target); - kmem_cache_destroy(_crypt_io_pool); } module_init(dm_crypt_init); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index db404a0f7e2c..c09359db3a90 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -33,7 +33,6 @@ struct dm_io_client { struct io { unsigned long error_bits; atomic_t count; - struct completion *wait; struct dm_io_client *client; io_notify_fn callback; void *context; @@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, * We need an io object to keep track of the number of bios that * have been dispatched for a particular io. *---------------------------------------------------------------*/ -static void dec_count(struct io *io, unsigned int region, int error) +static void complete_io(struct io *io) { - if (error) - set_bit(region, &io->error_bits); + unsigned long error_bits = io->error_bits; + io_notify_fn fn = io->callback; + void *context = io->context; - if (atomic_dec_and_test(&io->count)) { - if (io->vma_invalidate_size) - invalidate_kernel_vmap_range(io->vma_invalidate_address, - io->vma_invalidate_size); + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); - if (io->wait) - complete(io->wait); + mempool_free(io, io->client->pool); + fn(error_bits, context); +} - else { - unsigned long r = io->error_bits; - io_notify_fn fn = io->callback; - void *context = io->context; +static void dec_count(struct io *io, unsigned int region, int error) +{ + if (error) + set_bit(region, &io->error_bits); - mempool_free(io, io->client->pool); - fn(r, context); - } - } + if (atomic_dec_and_test(&io->count)) + complete_io(io); } static void endio(struct bio *bio, int error) @@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } +struct sync_io { + unsigned long error_bits; + struct completion wait; +}; + +static void sync_io_complete(unsigned long error, void *context) +{ + struct sync_io *sio = context; + + sio->error_bits = error; + complete(&sio->wait); +} + static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, int rw, struct dpages *dp, unsigned long *error_bits) { - /* - * gcc <= 4.3 can't do the alignment for stack variables, so we must - * align it on our own. - * volatile prevents the optimizer from removing or reusing - * "io_" field from the stack frame (allowed in ANSI C). - */ - volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; - struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); - DECLARE_COMPLETION_ONSTACK(wait); + struct io *io; + struct sync_io sio; if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } + init_completion(&sio.wait); + + io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->wait = &wait; io->client = client; + io->callback = sync_io_complete; + io->context = &sio; io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; dispatch_io(rw, num_regions, where, dp, io, 1); - wait_for_completion_io(&wait); + wait_for_completion_io(&sio.wait); if (error_bits) - *error_bits = io->error_bits; + *error_bits = sio.error_bits; - return io->error_bits ? -EIO : 0; + return sio.error_bits ? -EIO : 0; } static int async_io(struct dm_io_client *client, unsigned int num_regions, @@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->wait = NULL; io->client = client; io->callback = fn; io->context = context; @@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set REQ_SYNC in -io_req->bi_rw. If you fail to do one of these, the IO will be submitted to - * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. + * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw. + * If you fail to do one of these, the IO will be submitted to the disk after + * q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, struct dm_io_region *where, unsigned long *sync_error_bits) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f4167b013d99..833d7e752f06 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m) dm_noflush_suspending(m->ti))); } -#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required) - /* * Map cloned requests */ @@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone, if (!__must_push_back(m)) r = -EIO; /* Failed */ goto out_unlock; - } - if (!pg_ready(m)) { + } else if (m->queue_io || m->pg_init_required) { __pg_init_all_paths(m); goto out_unlock; } + if (set_mapinfo(m, map_context) < 0) /* ENOMEM, requeue */ goto out_unlock; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 09a688b3d48c..50fca469cafd 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr *bit *= sctx->region_table_entry_bits; } +static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr) +{ + unsigned long region_index; + unsigned bit; + + switch_get_position(sctx, region_nr, ®ion_index, &bit); + + return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + ((1 << sctx->region_table_entry_bits) - 1); +} + /* * Find which path to use at given offset. */ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) { - unsigned long region_index; - unsigned bit, path_nr; + unsigned path_nr; sector_t p; p = offset; @@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) else sector_div(p, sctx->region_size); - switch_get_position(sctx, p, ®ion_index, &bit); - path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & - ((1 << sctx->region_table_entry_bits) - 1); + path_nr = switch_region_table_read(sctx, p); /* This can only happen if the processor uses non-atomic stores. */ if (unlikely(path_nr >= sctx->nr_paths)) @@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string) } static int process_set_region_mappings(struct switch_ctx *sctx, - unsigned argc, char **argv) + unsigned argc, char **argv) { unsigned i; unsigned long region_index = 0; @@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx, unsigned long path_nr; const char *string = argv[i]; + if ((*string & 0xdf) == 'R') { + unsigned long cycle_length, num_write; + + string++; + if (unlikely(*string == ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + cycle_length = parse_hex(&string); + if (unlikely(*string != ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + string++; + if (unlikely(!*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + num_write = parse_hex(&string); + if (unlikely(*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + + if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) { + DMWARN("invalid set_region_mappings cycle length: %lu > %lu", + cycle_length - 1, region_index); + return -EINVAL; + } + if (unlikely(region_index + num_write < region_index) || + unlikely(region_index + num_write >= sctx->nr_regions)) { + DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu", + region_index, num_write, sctx->nr_regions); + return -EINVAL; + } + + while (num_write--) { + region_index++; + path_nr = switch_region_table_read(sctx, region_index - cycle_length); + switch_region_table_write(sctx, region_index, path_nr); + } + + continue; + } + if (*string == ':') region_index++; else { @@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti, static struct target_type switch_target = { .name = "switch", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = switch_ctr, .dtr = switch_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5f59f1e3e5b1..f9c6cb8dbcf8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } +static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); +} + static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { @@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t) return true; } +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +static bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Unless any target used by the table set discards_supported, + * require at least one underlying device to support discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard selectively must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_discard_bios) + continue; + + if (ti->discards_supported) + return 1; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; + if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) + queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + dm_table_set_integrity(t); /* @@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t) } EXPORT_SYMBOL(dm_table_run_md_queue_async); -static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && blk_queue_discard(q); -} - -bool dm_table_supports_discards(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* - * Unless any target used by the table set discards_supported, - * require at least one underlying device to support discards. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting discard selectively must provide. - */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (!ti->num_discard_bios) - continue; - - if (ti->discards_supported) - return 1; - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_discard_capable, NULL)) - return 1; - } - - return 0; -} diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fc9c848a60c9..4843801173fe 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -227,6 +227,7 @@ struct thin_c { struct list_head list; struct dm_dev *pool_dev; struct dm_dev *origin_dev; + sector_t origin_size; dm_thin_id dev_id; struct pool *pool; @@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, struct dm_thin_new_mapping { struct list_head list; - bool quiesced:1; - bool prepared:1; bool pass_discard:1; bool definitely_not_shared:1; + /* + * Track quiescing, copying and zeroing preparation actions. When this + * counter hits zero the block is prepared and can be inserted into the + * btree. + */ + atomic_t prepare_actions; + int err; struct thin_c *tc; dm_block_t virt_block; @@ -575,43 +581,41 @@ struct dm_thin_new_mapping { bio_end_io_t *saved_bi_end_io; }; -static void __maybe_add_mapping(struct dm_thin_new_mapping *m) +static void __complete_mapping_preparation(struct dm_thin_new_mapping *m) { struct pool *pool = m->tc->pool; - if (m->quiesced && m->prepared) { + if (atomic_dec_and_test(&m->prepare_actions)) { list_add_tail(&m->list, &pool->prepared_mappings); wake_worker(pool); } } -static void copy_complete(int read_err, unsigned long write_err, void *context) +static void complete_mapping_preparation(struct dm_thin_new_mapping *m) { unsigned long flags; - struct dm_thin_new_mapping *m = context; struct pool *pool = m->tc->pool; - m->err = read_err || write_err ? -EIO : 0; - spin_lock_irqsave(&pool->lock, flags); - m->prepared = true; - __maybe_add_mapping(m); + __complete_mapping_preparation(m); spin_unlock_irqrestore(&pool->lock, flags); } +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + struct dm_thin_new_mapping *m = context; + + m->err = read_err || write_err ? -EIO : 0; + complete_mapping_preparation(m); +} + static void overwrite_endio(struct bio *bio, int err) { - unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct dm_thin_new_mapping *m = h->overwrite_mapping; - struct pool *pool = m->tc->pool; m->err = err; - - spin_lock_irqsave(&pool->lock, flags); - m->prepared = true; - __maybe_add_mapping(m); - spin_unlock_irqrestore(&pool->lock, flags); + complete_mapping_preparation(m); } /*----------------------------------------------------------------*/ @@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) return m; } +static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, + sector_t begin, sector_t end) +{ + int r; + struct dm_io_region to; + + to.bdev = tc->pool_dev->bdev; + to.sector = begin; + to.count = end - begin; + + r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); + if (r < 0) { + DMERR_LIMIT("dm_kcopyd_zero() failed"); + copy_complete(1, 1, m); + } +} + +/* + * A partial copy also needs to zero the uncopied region. + */ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_dev *origin, dm_block_t data_origin, dm_block_t data_dest, - struct dm_bio_prison_cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio, + sector_t len) { int r; struct pool *pool = tc->pool; @@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, m->data_block = data_dest; m->cell = cell; + /* + * quiesce action + copy action + an extra reference held for the + * duration of this function (we may need to inc later for a + * partial zero). + */ + atomic_set(&m->prepare_actions, 3); + if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) - m->quiesced = true; + complete_mapping_preparation(m); /* already quiesced */ /* * IO to pool_dev remaps to the pool target's data_dev. @@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, from.bdev = origin->bdev; from.sector = data_origin * pool->sectors_per_block; - from.count = pool->sectors_per_block; + from.count = len; to.bdev = tc->pool_dev->bdev; to.sector = data_dest * pool->sectors_per_block; - to.count = pool->sectors_per_block; + to.count = len; r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 0, copy_complete, m); if (r < 0) { - mempool_free(m, pool->mapping_pool); DMERR_LIMIT("dm_kcopyd_copy() failed"); - cell_error(pool, cell); + copy_complete(1, 1, m); + + /* + * We allow the zero to be issued, to simplify the + * error path. Otherwise we'd need to start + * worrying about decrementing the prepare_actions + * counter. + */ + } + + /* + * Do we need to zero a tail region? + */ + if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) { + atomic_inc(&m->prepare_actions); + ll_zero(tc, m, + data_dest * pool->sectors_per_block + len, + (data_dest + 1) * pool->sectors_per_block); } } + + complete_mapping_preparation(m); /* drop our ref */ } static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, @@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_bio_prison_cell *cell, struct bio *bio) { schedule_copy(tc, virt_block, tc->pool_dev, - data_origin, data_dest, cell, bio); -} - -static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_dest, - struct dm_bio_prison_cell *cell, struct bio *bio) -{ - schedule_copy(tc, virt_block, tc->origin_dev, - virt_block, data_dest, cell, bio); + data_origin, data_dest, cell, bio, + tc->pool->sectors_per_block); } static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, @@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - m->quiesced = true; - m->prepared = false; + atomic_set(&m->prepare_actions, 1); /* no need to quiesce */ m->tc = tc; m->virt_block = virt_block; m->data_block = data_block; @@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); inc_all_io_entry(pool, bio); remap_and_issue(tc, bio, data_block); - } else { - int r; - struct dm_io_region to; - to.bdev = tc->pool_dev->bdev; - to.sector = data_block * pool->sectors_per_block; - to.count = pool->sectors_per_block; + } else + ll_zero(tc, m, + data_block * pool->sectors_per_block, + (data_block + 1) * pool->sectors_per_block); +} - r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); - if (r < 0) { - mempool_free(m, pool->mapping_pool); - DMERR_LIMIT("dm_kcopyd_zero() failed"); - cell_error(pool, cell); - } - } +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + struct pool *pool = tc->pool; + sector_t virt_block_begin = virt_block * pool->sectors_per_block; + sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block; + + if (virt_block_end <= tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + pool->sectors_per_block); + + else if (virt_block_begin < tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + tc->origin_size - virt_block_begin); + + else + schedule_zero(tc, virt_block, data_dest, cell, bio); } /* @@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio) inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); - remap_to_origin_and_issue(tc, bio); + if (bio_end_sector(bio) <= tc->origin_size) + remap_to_origin_and_issue(tc, bio); + + else if (bio->bi_iter.bi_sector < tc->origin_size) { + zero_fill_bio(bio); + bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT; + remap_to_origin_and_issue(tc, bio); + + } else { + zero_fill_bio(bio); + bio_endio(bio, 0); + } } else provision_block(tc, bio, block, cell); break; @@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < pool->sectors_per_block || do_div(io_opt_sectors, pool->sectors_per_block)) { - blk_limits_io_min(limits, 0); + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } @@ -3141,7 +3206,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) { list_del(&m->list); - m->quiesced = true; - __maybe_add_mapping(m); + __complete_mapping_preparation(m); } spin_unlock_irqrestore(&pool->lock, flags); } @@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti) noflush_work(tc, do_noflush_stop); } +static int thin_preresume(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + if (tc->origin_dev) + tc->origin_size = get_dev_size(tc->origin_dev->bdev); + + return 0; +} + /* * <nr mapped sectors> <highest mapped sector> */ @@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, .map = thin_map, .end_io = thin_endio, + .preresume = thin_preresume, .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, diff --git a/drivers/md/dm.h b/drivers/md/dm.h index ed76126aac54..e81d2152fa68 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); -bool dm_table_supports_discards(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); |