diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2014-10-22 12:04:39 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2014-10-22 12:04:39 +1100 |
commit | 39ce83806631c64bd3ce9ea3a177c26b39f017bd (patch) | |
tree | f7a06d005224eba38fa8288c61b1d02e0d1eb9b1 /drivers | |
parent | eff27c4601dfc8506f7308a071c33b4c8a628e44 (diff) | |
parent | 330f26b9734599470354368e019d853a7b591fc5 (diff) |
Merge remote-tracking branch 'device-mapper/for-next'
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/dm-bio-prison.c | 186 | ||||
-rw-r--r-- | drivers/md/dm-bio-prison.h | 16 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 218 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 35 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.h | 9 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 661 | ||||
-rw-r--r-- | drivers/md/dm.c | 12 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-transaction-manager.c | 77 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-transaction-manager.h | 7 |
12 files changed, 885 insertions, 354 deletions
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index f752d12081ff..bbe22a5dc06b 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -14,68 +14,38 @@ /*----------------------------------------------------------------*/ -struct bucket { - spinlock_t lock; - struct hlist_head cells; -}; +#define MIN_CELLS 1024 struct dm_bio_prison { + spinlock_t lock; mempool_t *cell_pool; - - unsigned nr_buckets; - unsigned hash_mask; - struct bucket *buckets; + struct rb_root cells; }; -/*----------------------------------------------------------------*/ - -static uint32_t calc_nr_buckets(unsigned nr_cells) -{ - uint32_t n = 128; - - nr_cells /= 4; - nr_cells = min(nr_cells, 8192u); - - while (n < nr_cells) - n <<= 1; - - return n; -} - static struct kmem_cache *_cell_cache; -static void init_bucket(struct bucket *b) -{ - spin_lock_init(&b->lock); - INIT_HLIST_HEAD(&b->cells); -} +/*----------------------------------------------------------------*/ /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. */ -struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) +struct dm_bio_prison *dm_bio_prison_create(void) { - unsigned i; - uint32_t nr_buckets = calc_nr_buckets(nr_cells); - size_t len = sizeof(struct dm_bio_prison) + - (sizeof(struct bucket) * nr_buckets); - struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); + struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL); if (!prison) return NULL; - prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); + spin_lock_init(&prison->lock); + + prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); if (!prison->cell_pool) { kfree(prison); return NULL; } - prison->nr_buckets = nr_buckets; - prison->hash_mask = nr_buckets - 1; - prison->buckets = (struct bucket *) (prison + 1); - for (i = 0; i < nr_buckets; i++) - init_bucket(prison->buckets + i); + prison->cells = RB_ROOT; return prison; } @@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); -static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) +static void __setup_new_cell(struct dm_cell_key *key, + struct bio *holder, + struct dm_bio_prison_cell *cell) { - const unsigned long BIG_PRIME = 4294967291UL; - uint64_t hash = key->block * BIG_PRIME; - - return (uint32_t) (hash & prison->hash_mask); + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = holder; + bio_list_init(&cell->bios); } -static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) +static int cmp_keys(struct dm_cell_key *lhs, + struct dm_cell_key *rhs) { - return (lhs->virtual == rhs->virtual) && - (lhs->dev == rhs->dev) && - (lhs->block == rhs->block); -} + if (lhs->virtual < rhs->virtual) + return -1; -static struct bucket *get_bucket(struct dm_bio_prison *prison, - struct dm_cell_key *key) -{ - return prison->buckets + hash_key(prison, key); -} + if (lhs->virtual > rhs->virtual) + return 1; -static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, - struct dm_cell_key *key) -{ - struct dm_bio_prison_cell *cell; + if (lhs->dev < rhs->dev) + return -1; - hlist_for_each_entry(cell, &b->cells, list) - if (keys_equal(&cell->key, key)) - return cell; + if (lhs->dev > rhs->dev) + return 1; - return NULL; -} + if (lhs->block < rhs->block) + return -1; -static void __setup_new_cell(struct bucket *b, - struct dm_cell_key *key, - struct bio *holder, - struct dm_bio_prison_cell *cell) -{ - memcpy(&cell->key, key, sizeof(cell->key)); - cell->holder = holder; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, &b->cells); + if (lhs->block > rhs->block) + return 1; + + return 0; } -static int __bio_detain(struct bucket *b, +static int __bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, struct bio *inmate, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result) { - struct dm_bio_prison_cell *cell; - - cell = __search_bucket(b, key); - if (cell) { - if (inmate) - bio_list_add(&cell->bios, inmate); - *cell_result = cell; - return 1; + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell *cell = + container_of(*new, struct dm_bio_prison_cell, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + else if (r > 0) + new = &((*new)->rb_right); + else { + if (inmate) + bio_list_add(&cell->bios, inmate); + *cell_result = cell; + return 1; + } } - __setup_new_cell(b, key, inmate, cell_prealloc); + __setup_new_cell(key, inmate, cell_prealloc); *cell_result = cell_prealloc; + + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + return 0; } @@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison, { int r; unsigned long flags; - struct bucket *b = get_bucket(prison, key); - spin_lock_irqsave(&b->lock, flags); - r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); return r; } @@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell); /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct dm_bio_prison_cell *cell, +static void __cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); if (inmates) { if (cell->holder) @@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison, struct bio_list *bios) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release(cell, bios); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release(prison, cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release); /* * Sometimes we don't want the holder, just the additional bios. */ -static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, +static void __cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); bio_list_merge(inmates, &cell->bios); } @@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct bio_list *inmates) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(prison, cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); @@ -267,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_cell_error); +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + visit_fn(context, cell); + rb_erase(&cell->node, &prison->cells); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_visit_release); + /*----------------------------------------------------------------*/ #define DEFERRED_SET_SIZE 64 diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 6805a142b750..b03988667740 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -10,8 +10,8 @@ #include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ #include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ -#include <linux/list.h> #include <linux/bio.h> +#include <linux/rbtree.h> /*----------------------------------------------------------------*/ @@ -35,13 +35,15 @@ struct dm_cell_key { * themselves. */ struct dm_bio_prison_cell { - struct hlist_node list; + struct list_head user_list; /* for client use */ + struct rb_node node; + struct dm_cell_key key; struct bio *holder; struct bio_list bios; }; -struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); +struct dm_bio_prison *dm_bio_prison_create(void); void dm_bio_prison_destroy(struct dm_bio_prison *prison); /* @@ -87,6 +89,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, void dm_cell_error(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, int error); +/* + * Visits the cell and then releases. Guarantees no new inmates are + * inserted between the visit and release. + */ +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, struct dm_bio_prison_cell *cell); + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 825ca1f87639..9649e48f0bc4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -14,6 +14,7 @@ #include <linux/vmalloc.h> #include <linux/shrinker.h> #include <linux/module.h> +#include <linux/rbtree.h> #define DM_MSG_PREFIX "bufio" @@ -34,26 +35,23 @@ /* * Check buffer ages in this interval (seconds) */ -#define DM_BUFIO_WORK_TIMER_SECS 10 +#define DM_BUFIO_WORK_TIMER_SECS 30 /* * Free buffers when they are older than this (seconds) */ -#define DM_BUFIO_DEFAULT_AGE_SECS 60 +#define DM_BUFIO_DEFAULT_AGE_SECS 300 /* - * The number of bvec entries that are embedded directly in the buffer. - * If the chunk size is larger, dm-io is used to do the io. + * The nr of bytes of cached data to keep around. */ -#define DM_BUFIO_INLINE_VECS 16 +#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) /* - * Buffer hash + * The number of bvec entries that are embedded directly in the buffer. + * If the chunk size is larger, dm-io is used to do the io. */ -#define DM_BUFIO_HASH_BITS 20 -#define DM_BUFIO_HASH(block) \ - ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \ - ((1 << DM_BUFIO_HASH_BITS) - 1)) +#define DM_BUFIO_INLINE_VECS 16 /* * Don't try to use kmem_cache_alloc for blocks larger than this. @@ -106,7 +104,7 @@ struct dm_bufio_client { unsigned minimum_buffers; - struct hlist_head *cache_hash; + struct rb_root buffer_tree; wait_queue_head_t free_buffer_wait; int async_write_error; @@ -135,7 +133,7 @@ enum data_mode { }; struct dm_buffer { - struct hlist_node hash_list; + struct rb_node node; struct list_head lru_list; sector_t block; void *data; @@ -223,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock); * Buffers are freed after this timeout */ static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; +static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; static unsigned long dm_bufio_allocated_kmem_cache; @@ -253,6 +252,53 @@ static LIST_HEAD(dm_bufio_all_clients); */ static DEFINE_MUTEX(dm_bufio_clients_lock); +/*---------------------------------------------------------------- + * A red/black tree acts as an index for all the buffers. + *--------------------------------------------------------------*/ +static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + n = (b->block < block) ? n->rb_left : n->rb_right; + } + + return NULL; +} + +static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) +{ + struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; + struct dm_buffer *found; + + while (*new) { + found = container_of(*new, struct dm_buffer, node); + + if (found->block == b->block) { + BUG_ON(found != b); + return; + } + + parent = *new; + new = (found->block < b->block) ? + &((*new)->rb_left) : &((*new)->rb_right); + } + + rb_link_node(&b->node, parent, new); + rb_insert_color(&b->node, &c->buffer_tree); +} + +static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) +{ + rb_erase(&b->node, &c->buffer_tree); +} + /*----------------------------------------------------------------*/ static void adjust_total_allocated(enum data_mode data_mode, long diff) @@ -434,7 +480,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) b->block = block; b->list_mode = dirty; list_add(&b->lru_list, &c->lru[dirty]); - hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); + __insert(b->c, b); b->last_accessed = jiffies; } @@ -448,7 +494,7 @@ static void __unlink_buffer(struct dm_buffer *b) BUG_ON(!c->n_buffers[b->list_mode]); c->n_buffers[b->list_mode]--; - hlist_del(&b->hash_list); + __remove(b->c, b); list_del(&b->lru_list); } @@ -887,23 +933,6 @@ static void __check_watermark(struct dm_bufio_client *c, __write_dirty_buffers_async(c, 1, write_list); } -/* - * Find a buffer in the hash. - */ -static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) -{ - struct dm_buffer *b; - - hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)], - hash_list) { - dm_bufio_cond_resched(); - if (b->block == block) - return b; - } - - return NULL; -} - /*---------------------------------------------------------------- * Getting a buffer *--------------------------------------------------------------*/ @@ -1433,45 +1462,52 @@ static void drop_buffers(struct dm_bufio_client *c) } /* - * Test if the buffer is unused and too old, and commit it. - * At if noio is set, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to - * different bufio client. + * We may not be able to evict this buffer if IO pending or the client + * is still using it. Caller is expected to know buffer is too old. + * + * And if GFP_NOFS is used, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets + * rerouted to different bufio client. */ -static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, - unsigned long max_jiffies) +static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) { - if (jiffies - b->last_accessed < max_jiffies) - return 0; - - if (!(gfp & __GFP_IO)) { + if (!(gfp & __GFP_FS)) { if (test_bit(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) - return 0; + return false; } if (b->hold_count) - return 0; + return false; __make_buffer_clean(b); __unlink_buffer(b); __free_buffer_wake(b); - return 1; + return true; +} + +static unsigned get_retain_buffers(struct dm_bufio_client *c) +{ + unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + return retain_bytes / c->block_size; } -static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, - gfp_t gfp_mask) +static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, + gfp_t gfp_mask) { int l; struct dm_buffer *b, *tmp; - long freed = 0; + unsigned long freed = 0; + unsigned long count = nr_to_scan; + unsigned retain_target = get_retain_buffers(c); for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { - freed += __cleanup_old_buffer(b, gfp_mask, 0); - if (!--nr_to_scan) + if (__try_evict_buffer(b, gfp_mask)) + freed++; + if (!--nr_to_scan || ((count - freed) <= retain_target)) return freed; dm_bufio_cond_resched(); } @@ -1486,7 +1522,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return SHRINK_STOP; @@ -1503,7 +1539,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return 0; @@ -1533,11 +1569,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign r = -ENOMEM; goto bad_client; } - c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); - if (!c->cache_hash) { - r = -ENOMEM; - goto bad_hash; - } + c->buffer_tree = RB_ROOT; c->bdev = bdev; c->block_size = block_size; @@ -1556,9 +1588,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->n_buffers[i] = 0; } - for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) - INIT_HLIST_HEAD(&c->cache_hash[i]); - mutex_init(&c->lock); INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; @@ -1632,8 +1661,6 @@ bad_cache: } dm_io_client_destroy(c->dm_io); bad_dm_io: - vfree(c->cache_hash); -bad_hash: kfree(c); bad_client: return ERR_PTR(r); @@ -1660,9 +1687,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) mutex_unlock(&dm_bufio_clients_lock); - for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) - BUG_ON(!hlist_empty(&c->cache_hash[i])); - + BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree)); BUG_ON(c->need_reserved_buffers); while (!list_empty(&c->reserved_buffers)) { @@ -1680,36 +1705,60 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) BUG_ON(c->n_buffers[i]); dm_io_client_destroy(c->dm_io); - vfree(c->cache_hash); kfree(c); } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); -static void cleanup_old_buffers(void) +static unsigned get_max_age_hz(void) { - unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); - struct dm_bufio_client *c; + unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); - if (max_age > ULONG_MAX / HZ) - max_age = ULONG_MAX / HZ; + if (max_age > UINT_MAX / HZ) + max_age = UINT_MAX / HZ; - mutex_lock(&dm_bufio_clients_lock); - list_for_each_entry(c, &dm_bufio_all_clients, client_list) { - if (!dm_bufio_trylock(c)) - continue; + return max_age * HZ; +} - while (!list_empty(&c->lru[LIST_CLEAN])) { - struct dm_buffer *b; - b = list_entry(c->lru[LIST_CLEAN].prev, - struct dm_buffer, lru_list); - if (!__cleanup_old_buffer(b, 0, max_age * HZ)) - break; - dm_bufio_cond_resched(); - } +static bool older_than(struct dm_buffer *b, unsigned long age_hz) +{ + return (jiffies - b->last_accessed) >= age_hz; +} + +static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) +{ + struct dm_buffer *b, *tmp; + unsigned retain_target = get_retain_buffers(c); + unsigned count; + + dm_bufio_lock(c); + + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { + if (count <= retain_target) + break; + + if (!older_than(b, age_hz)) + break; + + if (__try_evict_buffer(b, 0)) + count--; - dm_bufio_unlock(c); dm_bufio_cond_resched(); } + + dm_bufio_unlock(c); +} + +static void cleanup_old_buffers(void) +{ + unsigned long max_age_hz = get_max_age_hz(); + struct dm_bufio_client *c; + + mutex_lock(&dm_bufio_clients_lock); + + list_for_each_entry(c, &dm_bufio_all_clients, client_list) + __evict_old_buffers(c, max_age_hz); + mutex_unlock(&dm_bufio_clients_lock); } @@ -1834,6 +1883,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); +module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); + module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 7130505c2425..69de8b43ca12 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) /*----------------------------------------------------------------*/ -#define PRISON_CELLS 1024 #define MIGRATION_POOL_SIZE 128 #define COMMIT_PERIOD HZ #define MIGRATION_COUNT_WINDOW 10 @@ -2327,7 +2326,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) INIT_DELAYED_WORK(&cache->waker, do_waker); cache->last_commit_jiffies = jiffies; - cache->prison = dm_bio_prison_create(PRISON_CELLS); + cache->prison = dm_bio_prison_create(); if (!cache->prison) { *error = "could not create bio prison"; goto bad; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 4857fa4a5484..a7cb9dd5f135 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -789,8 +789,7 @@ struct dm_raid_superblock { __le32 layout; __le32 stripe_sectors; - __u8 pad[452]; /* Round struct to 512 bytes. */ - /* Always set to 0 when writing. */ + /* Remainder of a logical block is zero-filled when writing (see super_sync()). */ } __packed; static int read_disk_sb(struct md_rdev *rdev, int size) @@ -827,7 +826,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) test_bit(Faulty, &(rs->dev[i].rdev.flags))) failed_devices |= (1ULL << i); - memset(sb, 0, sizeof(*sb)); + memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); sb->magic = cpu_to_le32(DM_RAID_MAGIC); sb->features = cpu_to_le32(0); /* No features yet */ @@ -862,7 +861,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) uint64_t events_sb, events_refsb; rdev->sb_start = 0; - rdev->sb_size = sizeof(*sb); + rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); + if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) { + DMERR("superblock size of a logical block is no longer valid"); + return -EINVAL; + } ret = read_disk_sb(rdev, rdev->sb_size); if (ret) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d1600d2aa2e2..f8b37d4c05d8 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -159,8 +159,10 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) sc->stripes_shift = __ffs(stripes); r = dm_set_target_max_io_len(ti, chunk_size); - if (r) + if (r) { + kfree(sc); return r; + } ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index e9d33ad59df5..43adbb863f5a 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) } int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result) + int can_issue_io, struct dm_thin_lookup_result *result) { - int r = -EINVAL; - uint64_t block_time = 0; + int r; __le64 value; struct dm_pool_metadata *pmd = td->pmd; dm_block_t keys[2] = { td->id, block }; struct dm_btree_info *info; - if (can_block) { - down_read(&pmd->root_lock); - info = &pmd->info; - } else if (down_read_trylock(&pmd->root_lock)) - info = &pmd->nb_info; - else - return -EWOULDBLOCK; - if (pmd->fail_io) - goto out; + return -EINVAL; - r = dm_btree_lookup(info, pmd->root, keys, &value); - if (!r) - block_time = le64_to_cpu(value); + down_read(&pmd->root_lock); -out: - up_read(&pmd->root_lock); + if (can_issue_io) { + info = &pmd->info; + } else + info = &pmd->nb_info; + r = dm_btree_lookup(info, pmd->root, keys, &value); if (!r) { + uint64_t block_time = 0; dm_block_t exception_block; uint32_t exception_time; + + block_time = le64_to_cpu(value); unpack_block_time(block_time, &exception_block, &exception_time); result->block = exception_block; result->shared = __snapshotted_since(td, exception_time); } + up_read(&pmd->root_lock); return r; } @@ -1813,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) return needs_check; } + +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) +{ + dm_tm_issue_prefetches(pmd->tm); +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index e3c857db195a..921d15ee56a0 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -139,12 +139,12 @@ struct dm_thin_lookup_result { /* * Returns: - * -EWOULDBLOCK iff @can_block is set and would block. + * -EWOULDBLOCK iff @can_issue_io is set and would issue IO * -ENODATA iff that mapping is not present. * 0 success */ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result); + int can_issue_io, struct dm_thin_lookup_result *result); /* * Obtain an unused block. @@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); +/* + * Issue any prefetches that may be useful. + */ +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4843801173fe..719b330b7eb8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -11,11 +11,13 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> +#include <linux/log2.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/sort.h> #include <linux/rbtree.h> #define DM_MSG_PREFIX "thin" @@ -25,7 +27,6 @@ */ #define ENDIO_HOOK_POOL_SIZE 1024 #define MAPPING_POOL_SIZE 1024 -#define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ #define NO_SPACE_TIMEOUT_SECS 60 @@ -127,6 +128,53 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, /*----------------------------------------------------------------*/ +#define THROTTLE_THRESHOLD (1 * HZ) + +struct throttle { + struct rw_semaphore lock; + unsigned long threshold; + bool throttle_applied; +}; + +static void throttle_init(struct throttle *t) +{ + init_rwsem(&t->lock); + t->throttle_applied = false; +} + +static void throttle_work_start(struct throttle *t) +{ + t->threshold = jiffies + THROTTLE_THRESHOLD; +} + +static void throttle_work_update(struct throttle *t) +{ + if (!t->throttle_applied && jiffies > t->threshold) { + down_write(&t->lock); + t->throttle_applied = true; + } +} + +static void throttle_work_complete(struct throttle *t) +{ + if (t->throttle_applied) { + t->throttle_applied = false; + up_write(&t->lock); + } +} + +static void throttle_lock(struct throttle *t) +{ + down_read(&t->lock); +} + +static void throttle_unlock(struct throttle *t) +{ + up_read(&t->lock); +} + +/*----------------------------------------------------------------*/ + /* * A pool device ties together a metadata device and a data device. It * also provides the interface for creating and destroying internal @@ -155,8 +203,11 @@ struct pool_features { struct thin_c; typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); +#define CELL_SORT_ARRAY_SIZE 8192 + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -176,6 +227,7 @@ struct pool { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; + struct throttle throttle; struct work_struct worker; struct delayed_work waker; struct delayed_work no_space_timeout; @@ -198,8 +250,13 @@ struct pool { process_bio_fn process_bio; process_bio_fn process_discard; + process_cell_fn process_cell; + process_cell_fn process_discard_cell; + process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; + + struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; }; static enum pool_mode get_pool_mode(struct pool *pool); @@ -234,6 +291,7 @@ struct thin_c { struct dm_thin_device *td; bool requeue_mode:1; spinlock_t lock; + struct list_head deferred_cells; struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; struct rb_root sort_bio_list; /* sorted list of deferred bios */ @@ -290,6 +348,15 @@ static void cell_release(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_visit_release(struct pool *pool, + void (*fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + dm_cell_visit_release(pool->prison, fn, context, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + static void cell_release_no_holder(struct pool *pool, struct dm_bio_prison_cell *cell, struct bio_list *bios) @@ -298,19 +365,6 @@ static void cell_release_no_holder(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } -static void cell_defer_no_holder_no_free(struct thin_c *tc, - struct dm_bio_prison_cell *cell) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&tc->lock, flags); - dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); - - wake_worker(pool); -} - static void cell_error_with_code(struct pool *pool, struct dm_bio_prison_cell *cell, int error_code) { @@ -323,6 +377,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) cell_error_with_code(pool, cell, -EIO); } +static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, 0); +} + +static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); +} + /*----------------------------------------------------------------*/ /* @@ -393,44 +457,65 @@ struct dm_thin_endio_hook { struct rb_node rb_node; }; -static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) +static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) +{ + bio_list_merge(bios, master); + bio_list_init(master); +} + +static void error_bio_list(struct bio_list *bios, int error) { struct bio *bio; + + while ((bio = bio_list_pop(bios))) + bio_endio(bio, error); +} + +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) +{ struct bio_list bios; unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, master); - bio_list_init(master); + __merge_bio_list(&bios, master); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, DM_ENDIO_REQUEUE); + error_bio_list(&bios, error); } -static void requeue_io(struct thin_c *tc) +static void requeue_deferred_cells(struct thin_c *tc) { - requeue_bio_list(tc, &tc->deferred_bio_list); - requeue_bio_list(tc, &tc->retry_on_resume_list); + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell, *tmp; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + list_for_each_entry_safe(cell, tmp, &cells, user_list) + cell_requeue(pool, cell); } -static void error_thin_retry_list(struct thin_c *tc) +static void requeue_io(struct thin_c *tc) { - struct bio *bio; - unsigned long flags; struct bio_list bios; + unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, &tc->retry_on_resume_list); - bio_list_init(&tc->retry_on_resume_list); + __merge_bio_list(&bios, &tc->deferred_bio_list); + __merge_bio_list(&bios, &tc->retry_on_resume_list); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + error_bio_list(&bios, DM_ENDIO_REQUEUE); + requeue_deferred_cells(tc); } static void error_retry_list(struct pool *pool) @@ -439,7 +524,7 @@ static void error_retry_list(struct pool *pool) rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) - error_thin_retry_list(tc); + error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO); rcu_read_unlock(); } @@ -629,33 +714,75 @@ static void overwrite_endio(struct bio *bio, int err) */ /* - * This sends the bios in the cell back to the deferred_bios list. + * This sends the bios in the cell, except the original holder, back + * to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct pool *pool = tc->pool; unsigned long flags; spin_lock_irqsave(&tc->lock, flags); - cell_release(pool, cell, &tc->deferred_bio_list); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); wake_worker(pool); } -/* - * Same as cell_defer above, except it omits the original holder of the cell. - */ -static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +struct remap_info { + struct thin_c *tc; + struct bio_list defer_bios; + struct bio_list issue_bios; +}; + +static void __inc_remap_and_issue_cell(void *context, + struct dm_bio_prison_cell *cell) { - struct pool *pool = tc->pool; - unsigned long flags; + struct remap_info *info = context; + struct bio *bio; - spin_lock_irqsave(&tc->lock, flags); - cell_release_no_holder(pool, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + while ((bio = bio_list_pop(&cell->bios))) { + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) + bio_list_add(&info->defer_bios, bio); + else { + inc_all_io_entry(info->tc->pool, bio); - wake_worker(pool); + /* + * We can't issue the bios with the bio prison lock + * held, so we add them to a list to issue on + * return from this function. + */ + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void inc_remap_and_issue_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + /* + * We have to be careful to inc any bios we're about to issue + * before the cell is released, and avoid a race with new bios + * being added to the cell. + */ + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(info.tc, bio, block); } static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) @@ -706,10 +833,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) * the bios in the cell. */ if (bio) { - cell_defer_no_holder(tc, m->cell); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); bio_endio(bio, 0); - } else - cell_defer(tc, m->cell); + } else { + inc_all_io_entry(tc->pool, m->cell->holder); + remap_and_issue(tc, m->cell->holder, m->data_block); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + } out: list_del(&m->list); @@ -842,6 +972,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, } } +static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, + dm_block_t data_block, + struct dm_thin_new_mapping *m) +{ + struct pool *pool = tc->pool; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_block); +} + /* * A partial copy also needs to zero the uncopied region. */ @@ -876,15 +1020,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * If the whole block of data is being overwritten, we can issue the * bio immediately. Otherwise we use kcopyd to clone the data first. */ - if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_dest); - } else { + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_dest, m); + else { struct dm_io_region from, to; from.bdev = origin->bdev; @@ -953,16 +1091,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, if (!pool->pf.zero_new_blocks) process_prepared_mapping(m); - else if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_block); + else if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_block, m); - } else + else ll_zero(tc, m, data_block * pool->sectors_per_block, (data_block + 1) * pool->sectors_per_block); @@ -1134,29 +1266,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c bio_list_init(&bios); cell_release(pool, cell, &bios); - error = should_error_unserviceable_bio(pool); - if (error) - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, error); - else - while ((bio = bio_list_pop(&bios))) - retry_on_resume(bio); + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); } -static void process_discard(struct thin_c *tc, struct bio *bio) +static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; + struct bio *bio = cell->holder; struct pool *pool = tc->pool; - struct dm_bio_prison_cell *cell, *cell2; - struct dm_cell_key key, key2; + struct dm_bio_prison_cell *cell2; + struct dm_cell_key key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; struct dm_thin_new_mapping *m; - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { @@ -1187,12 +1315,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio) m->cell2 = cell2; m->bio = bio; - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { - spin_lock_irqsave(&pool->lock, flags); - list_add_tail(&m->list, &pool->prepared_discards); - spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); - } + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + } else { inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); @@ -1227,6 +1352,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio) } } +static void process_discard_bio(struct thin_c *tc, struct bio *bio) +{ + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + dm_block_t block = get_bio_block(tc, bio); + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &cell)) + return; + + process_discard_cell(tc, cell); +} + static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, @@ -1255,11 +1393,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, } } +static void __remap_and_issue_shared_cell(void *context, + struct dm_bio_prison_cell *cell) +{ + struct remap_info *info = context; + struct bio *bio; + + while ((bio = bio_list_pop(&cell->bios))) { + if ((bio_data_dir(bio) == WRITE) || + (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))) + bio_list_add(&info->defer_bios, bio); + else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; + + h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); + inc_all_io_entry(info->tc->pool, bio); + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void remap_and_issue_shared_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + cell_visit_release(tc->pool, __remap_and_issue_shared_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(tc, bio, block); +} + static void process_shared_bio(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct dm_thin_lookup_result *lookup_result) + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *virt_cell) { - struct dm_bio_prison_cell *cell; + struct dm_bio_prison_cell *data_cell; struct pool *pool = tc->pool; struct dm_cell_key key; @@ -1268,19 +1448,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, * of being broken so we have nothing further to do here. */ build_data_key(tc->td, lookup_result->block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (bio_detain(pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return; + } - if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) - break_sharing(tc, bio, block, &key, lookup_result, cell); - else { + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { + break_sharing(tc, bio, block, &key, lookup_result, data_cell); + cell_defer_no_holder(tc, virt_cell); + } else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result->block); + + remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); + remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); } } @@ -1333,34 +1517,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block } } -static void process_bio(struct thin_c *tc, struct bio *bio) +static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; struct pool *pool = tc->pool; + struct bio *bio = cell->holder; dm_block_t block = get_bio_block(tc, bio); - struct dm_bio_prison_cell *cell; - struct dm_cell_key key; struct dm_thin_lookup_result lookup_result; - /* - * If cell is already occupied, then the block is already - * being provisioned so we have nothing further to do here. - */ - build_virtual_key(tc->td, block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared) { - process_shared_bio(tc, bio, block, &lookup_result); - cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ - } else { + if (lookup_result.shared) + process_shared_bio(tc, bio, block, &lookup_result, cell); + else { inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result.block); + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; @@ -1394,7 +1572,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio) } } -static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + dm_block_t block = get_bio_block(tc, bio); + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + process_cell(tc, cell); +} + +static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, + struct dm_bio_prison_cell *cell) { int r; int rw = bio_data_dir(bio); @@ -1404,15 +1601,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { handle_unserviceable_bio(tc->pool, bio); - else { + if (cell) + cell_defer_no_holder(tc, cell); + } else { inc_all_io_entry(tc->pool, bio); remap_and_issue(tc, bio, lookup_result.block); + if (cell) + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; case -ENODATA: + if (cell) + cell_defer_no_holder(tc, cell); if (rw != READ) { handle_unserviceable_bio(tc->pool, bio); break; @@ -1431,11 +1634,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) default: DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", __func__, r); + if (cell) + cell_defer_no_holder(tc, cell); bio_io_error(bio); break; } } +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + __process_bio_read_only(tc, bio, NULL); +} + +static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + __process_bio_read_only(tc, cell->holder, cell); +} + static void process_bio_success(struct thin_c *tc, struct bio *bio) { bio_endio(bio, 0); @@ -1446,6 +1661,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio) bio_io_error(bio); } +static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_success(tc->pool, cell); +} + +static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_error(tc->pool, cell); +} + /* * FIXME: should we also commit due to size of transaction, measured in * metadata blocks? @@ -1527,9 +1752,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) struct bio *bio; struct bio_list bios; struct blk_plug plug; + unsigned count = 0; if (tc->requeue_mode) { - requeue_bio_list(tc, &tc->deferred_bio_list); + error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); return; } @@ -1568,10 +1794,97 @@ static void process_thin_deferred_bios(struct thin_c *tc) pool->process_discard(tc, bio); else pool->process_bio(tc, bio); + + if ((count++ & 127) == 0) { + throttle_work_update(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + } } blk_finish_plug(&plug); } +static int cmp_cells(const void *lhs, const void *rhs) +{ + struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); + struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); + + BUG_ON(!lhs_cell->holder); + BUG_ON(!rhs_cell->holder); + + if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) + return -1; + + if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) + return 1; + + return 0; +} + +static unsigned sort_cells(struct pool *pool, struct list_head *cells) +{ + unsigned count = 0; + struct dm_bio_prison_cell *cell, *tmp; + + list_for_each_entry_safe(cell, tmp, cells, user_list) { + if (count >= CELL_SORT_ARRAY_SIZE) + break; + + pool->cell_sort_array[count++] = cell; + list_del(&cell->user_list); + } + + sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); + + return count; +} + +static void process_thin_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell; + unsigned i, j, count; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + if (list_empty(&cells)) + return; + + do { + count = sort_cells(tc->pool, &cells); + + for (i = 0; i < count; i++) { + cell = pool->cell_sort_array[i]; + BUG_ON(!cell->holder); + + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + for (j = i; j < count; j++) + list_add(&pool->cell_sort_array[j]->user_list, &cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice(&cells, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + return; + } + + if (cell->holder->bi_rw & REQ_DISCARD) + pool->process_discard_cell(tc, cell); + else + pool->process_cell(tc, cell); + } + } while (!list_empty(&cells)); +} + static void thin_get(struct thin_c *tc); static void thin_put(struct thin_c *tc); @@ -1620,6 +1933,7 @@ static void process_deferred_bios(struct pool *pool) tc = get_first_thin(pool); while (tc) { + process_thin_deferred_cells(tc); process_thin_deferred_bios(tc); tc = get_next_thin(pool, tc); } @@ -1653,9 +1967,15 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); + throttle_work_start(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); + throttle_work_update(&pool->throttle); process_deferred_bios(pool); + throttle_work_complete(&pool->throttle); } /* @@ -1792,6 +2112,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; + pool->process_cell = process_cell_fail; + pool->process_discard_cell = process_cell_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_fail; @@ -1804,6 +2126,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; pool->process_discard = process_bio_success; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_cell_success; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1822,7 +2146,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) if (old_mode != new_mode) notify_of_pool_mode_change(pool, "out-of-data-space"); pool->process_bio = process_bio_read_only; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1835,7 +2161,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) notify_of_pool_mode_change(pool, "write"); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard; break; @@ -1895,6 +2223,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } +static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + thin_defer_bio(tc, bio); + throttle_unlock(&pool->throttle); +} + +static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + spin_lock_irqsave(&tc->lock, flags); + list_add_tail(&cell->user_list, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + throttle_unlock(&pool->throttle); + + wake_worker(pool); +} + static void thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -1915,8 +2266,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) dm_block_t block = get_bio_block(tc, bio); struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - struct dm_bio_prison_cell cell1, cell2; - struct dm_bio_prison_cell *cell_result; + struct dm_bio_prison_cell *virt_cell, *data_cell; struct dm_cell_key key; thin_hook_bio(tc, bio); @@ -1932,10 +2282,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) } if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { - thin_defer_bio(tc, bio); + thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } + /* + * We must hold the virtual cell before doing the lookup, otherwise + * there's a race with discard. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &virt_cell)) + return DM_MAPIO_SUBMITTED; + r = dm_thin_find_block(td, block, 0, &result); /* @@ -1958,23 +2316,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * More distant ancestors are irrelevant. The * shared flag will be set in their case. */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; } - build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) - return DM_MAPIO_SUBMITTED; - build_data_key(tc->td, result.block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { - cell_defer_no_holder_no_free(tc, &cell1); + if (bio_detain(tc->pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } inc_all_io_entry(tc->pool, bio); - cell_defer_no_holder_no_free(tc, &cell2); - cell_defer_no_holder_no_free(tc, &cell1); + cell_defer_no_holder(tc, data_cell); + cell_defer_no_holder(tc, virt_cell); remap(tc, bio, result.block); return DM_MAPIO_REMAPPED; @@ -1986,16 +2340,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * of doing so. */ handle_unserviceable_bio(tc->pool, bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } /* fall through */ case -EWOULDBLOCK: - /* - * In future, the failed dm_thin_find_block above could - * provide the hint to load the metadata into cache. - */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; default: @@ -2005,6 +2356,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * pool is switched to fail-io mode. */ bio_io_error(bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } } @@ -2185,7 +2537,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->sectors_per_block_shift = __ffs(block_size); pool->low_water_blocks = 0; pool_features_init(&pool->pf); - pool->prison = dm_bio_prison_create(PRISON_CELLS); + pool->prison = dm_bio_prison_create(); if (!pool->prison) { *error = "Error creating pool's bio prison"; err_p = ERR_PTR(-ENOMEM); @@ -2211,6 +2563,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_wq; } + throttle_init(&pool->throttle); INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); @@ -3169,15 +3522,36 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * Adjust max_sectors_kb to highest possible power-of-2 + * factor of pool->sectors_per_block. + */ + if (limits->max_hw_sectors & (limits->max_hw_sectors - 1)) + limits->max_sectors = rounddown_pow_of_two(limits->max_hw_sectors); + else + limits->max_sectors = limits->max_hw_sectors; + + if (limits->max_sectors < pool->sectors_per_block) { + while (!is_factor(pool->sectors_per_block, limits->max_sectors)) + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } else if (block_size_is_power_of_two(pool)) { + /* max_sectors_kb is >= power-of-2 thinp blocksize */ + while (!is_factor(limits->max_sectors, pool->sectors_per_block)) + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } /* * If the system-determined stacked limits are compatible with the * pool's blocksize (io_opt is a factor) do not override them. */ if (io_opt_sectors < pool->sectors_per_block || - do_div(io_opt_sectors, pool->sectors_per_block)) { - blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); + !is_factor(io_opt_sectors, pool->sectors_per_block)) { + if (is_factor(pool->sectors_per_block, limits->max_sectors)) + blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + else + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } @@ -3206,7 +3580,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3295,6 +3669,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_unlock; } spin_lock_init(&tc->lock); + INIT_LIST_HEAD(&tc->deferred_cells); bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->retry_on_resume_list); tc->sort_bio_list = RB_ROOT; @@ -3533,6 +3908,21 @@ err: DMEMIT("Error"); } +static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct thin_c *tc = ti->private; + struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = tc->pool_dev->bdev; + bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -3557,7 +3947,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, @@ -3567,6 +3957,7 @@ static struct target_type thin_target = { .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, + .merge = thin_merge, .iterate_devices = thin_iterate_devices, }; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 58f3927fd7cc..0fee0e54d36f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1607,9 +1607,9 @@ static int dm_merge_bvec(struct request_queue *q, * Find maximum amount of I/O that won't need splitting */ max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) BIO_MAX_SECTORS); + (sector_t) queue_max_sectors(q)); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - if (max_size < 0) + if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ max_size = 0; /* @@ -1621,10 +1621,10 @@ static int dm_merge_bvec(struct request_queue *q, max_size = ti->type->merge(ti, bvm, biovec, max_size); /* * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking at - * queue_max_hw_sectors), then we can't allow bios with multiple vector - * entries. So always set max_size to 0, and the code below allows - * just one page. + * provided their merge_bvec method (we know this by looking for the + * max_hw_sectors that dm_set_device_limits may set), then we can't + * allow bios with multiple vector entries. So always set max_size + * to 0, and the code below allows just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) max_size = 0; diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 3bc30a0ae3d6..9cb797d800cf 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -10,6 +10,8 @@ #include "dm-persistent-data-internal.h" #include <linux/export.h> +#include <linux/mutex.h> +#include <linux/hash.h> #include <linux/slab.h> #include <linux/device-mapper.h> @@ -17,6 +19,61 @@ /*----------------------------------------------------------------*/ +#define PREFETCH_SIZE 128 +#define PREFETCH_BITS 7 +#define PREFETCH_SENTINEL ((dm_block_t) -1ULL) + +struct prefetch_set { + struct mutex lock; + dm_block_t blocks[PREFETCH_SIZE]; +}; + +static unsigned prefetch_hash(dm_block_t b) +{ + return hash_64(b, PREFETCH_BITS); +} + +static void prefetch_wipe(struct prefetch_set *p) +{ + unsigned i; + for (i = 0; i < PREFETCH_SIZE; i++) + p->blocks[i] = PREFETCH_SENTINEL; +} + +static void prefetch_init(struct prefetch_set *p) +{ + mutex_init(&p->lock); + prefetch_wipe(p); +} + +static void prefetch_add(struct prefetch_set *p, dm_block_t b) +{ + unsigned h = prefetch_hash(b); + + mutex_lock(&p->lock); + if (p->blocks[h] == PREFETCH_SENTINEL) + p->blocks[h] = b; + + mutex_unlock(&p->lock); +} + +static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm) +{ + unsigned i; + + mutex_lock(&p->lock); + + for (i = 0; i < PREFETCH_SIZE; i++) + if (p->blocks[i] != PREFETCH_SENTINEL) { + dm_bm_prefetch(bm, p->blocks[i]); + p->blocks[i] = PREFETCH_SENTINEL; + } + + mutex_unlock(&p->lock); +} + +/*----------------------------------------------------------------*/ + struct shadow_info { struct hlist_node hlist; dm_block_t where; @@ -37,6 +94,8 @@ struct dm_transaction_manager { spinlock_t lock; struct hlist_head buckets[DM_HASH_SIZE]; + + struct prefetch_set prefetches; }; /*----------------------------------------------------------------*/ @@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, for (i = 0; i < DM_HASH_SIZE; i++) INIT_HLIST_HEAD(tm->buckets + i); + prefetch_init(&tm->prefetches); + return tm; } @@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_validator *v, struct dm_block **blk) { - if (tm->is_clone) - return dm_bm_read_try_lock(tm->real->bm, b, v, blk); + if (tm->is_clone) { + int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk); + + if (r == -EWOULDBLOCK) + prefetch_add(&tm->real->prefetches, b); + + return r; + } return dm_bm_read_lock(tm->bm, b, v, blk); } @@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) return tm->bm; } +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm) +{ + prefetch_issue(&tm->prefetches, tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches); + /*----------------------------------------------------------------*/ static int dm_tm_create_internal(struct dm_block_manager *bm, diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 2772ed2a781a..2e0d4d66fb1b 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -109,6 +109,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); /* + * If you're using a non-blocking clone the tm will build up a list of + * requested blocks that weren't in core. This call will request those + * blocks to be prefetched. + */ +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm); + +/* * A little utility that ties the knot by producing a transaction manager * that has a space map managed by the transaction manager... * |