summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2014-10-22 12:04:39 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2014-10-22 12:04:39 +1100
commit39ce83806631c64bd3ce9ea3a177c26b39f017bd (patch)
treef7a06d005224eba38fa8288c61b1d02e0d1eb9b1 /drivers
parenteff27c4601dfc8506f7308a071c33b4c8a628e44 (diff)
parent330f26b9734599470354368e019d853a7b591fc5 (diff)
Merge remote-tracking branch 'device-mapper/for-next'
Diffstat (limited to 'drivers')
-rw-r--r--drivers/md/dm-bio-prison.c186
-rw-r--r--drivers/md/dm-bio-prison.h16
-rw-r--r--drivers/md/dm-bufio.c218
-rw-r--r--drivers/md/dm-cache-target.c3
-rw-r--r--drivers/md/dm-raid.c11
-rw-r--r--drivers/md/dm-stripe.c4
-rw-r--r--drivers/md/dm-thin-metadata.c35
-rw-r--r--drivers/md/dm-thin-metadata.h9
-rw-r--r--drivers/md/dm-thin.c661
-rw-r--r--drivers/md/dm.c12
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c77
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h7
12 files changed, 885 insertions, 354 deletions
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index f752d12081ff..bbe22a5dc06b 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,68 +14,38 @@
/*----------------------------------------------------------------*/
-struct bucket {
- spinlock_t lock;
- struct hlist_head cells;
-};
+#define MIN_CELLS 1024
struct dm_bio_prison {
+ spinlock_t lock;
mempool_t *cell_pool;
-
- unsigned nr_buckets;
- unsigned hash_mask;
- struct bucket *buckets;
+ struct rb_root cells;
};
-/*----------------------------------------------------------------*/
-
-static uint32_t calc_nr_buckets(unsigned nr_cells)
-{
- uint32_t n = 128;
-
- nr_cells /= 4;
- nr_cells = min(nr_cells, 8192u);
-
- while (n < nr_cells)
- n <<= 1;
-
- return n;
-}
-
static struct kmem_cache *_cell_cache;
-static void init_bucket(struct bucket *b)
-{
- spin_lock_init(&b->lock);
- INIT_HLIST_HEAD(&b->cells);
-}
+/*----------------------------------------------------------------*/
/*
* @nr_cells should be the number of cells you want in use _concurrently_.
* Don't confuse it with the number of distinct keys.
*/
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
+struct dm_bio_prison *dm_bio_prison_create(void)
{
- unsigned i;
- uint32_t nr_buckets = calc_nr_buckets(nr_cells);
- size_t len = sizeof(struct dm_bio_prison) +
- (sizeof(struct bucket) * nr_buckets);
- struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
+ struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
- prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
+ spin_lock_init(&prison->lock);
+
+ prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
if (!prison->cell_pool) {
kfree(prison);
return NULL;
}
- prison->nr_buckets = nr_buckets;
- prison->hash_mask = nr_buckets - 1;
- prison->buckets = (struct bucket *) (prison + 1);
- for (i = 0; i < nr_buckets; i++)
- init_bucket(prison->buckets + i);
+ prison->cells = RB_ROOT;
return prison;
}
@@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
-static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
+static void __setup_new_cell(struct dm_cell_key *key,
+ struct bio *holder,
+ struct dm_bio_prison_cell *cell)
{
- const unsigned long BIG_PRIME = 4294967291UL;
- uint64_t hash = key->block * BIG_PRIME;
-
- return (uint32_t) (hash & prison->hash_mask);
+ memcpy(&cell->key, key, sizeof(cell->key));
+ cell->holder = holder;
+ bio_list_init(&cell->bios);
}
-static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
+static int cmp_keys(struct dm_cell_key *lhs,
+ struct dm_cell_key *rhs)
{
- return (lhs->virtual == rhs->virtual) &&
- (lhs->dev == rhs->dev) &&
- (lhs->block == rhs->block);
-}
+ if (lhs->virtual < rhs->virtual)
+ return -1;
-static struct bucket *get_bucket(struct dm_bio_prison *prison,
- struct dm_cell_key *key)
-{
- return prison->buckets + hash_key(prison, key);
-}
+ if (lhs->virtual > rhs->virtual)
+ return 1;
-static struct dm_bio_prison_cell *__search_bucket(struct bucket *b,
- struct dm_cell_key *key)
-{
- struct dm_bio_prison_cell *cell;
+ if (lhs->dev < rhs->dev)
+ return -1;
- hlist_for_each_entry(cell, &b->cells, list)
- if (keys_equal(&cell->key, key))
- return cell;
+ if (lhs->dev > rhs->dev)
+ return 1;
- return NULL;
-}
+ if (lhs->block < rhs->block)
+ return -1;
-static void __setup_new_cell(struct bucket *b,
- struct dm_cell_key *key,
- struct bio *holder,
- struct dm_bio_prison_cell *cell)
-{
- memcpy(&cell->key, key, sizeof(cell->key));
- cell->holder = holder;
- bio_list_init(&cell->bios);
- hlist_add_head(&cell->list, &b->cells);
+ if (lhs->block > rhs->block)
+ return 1;
+
+ return 0;
}
-static int __bio_detain(struct bucket *b,
+static int __bio_detain(struct dm_bio_prison *prison,
struct dm_cell_key *key,
struct bio *inmate,
struct dm_bio_prison_cell *cell_prealloc,
struct dm_bio_prison_cell **cell_result)
{
- struct dm_bio_prison_cell *cell;
-
- cell = __search_bucket(b, key);
- if (cell) {
- if (inmate)
- bio_list_add(&cell->bios, inmate);
- *cell_result = cell;
- return 1;
+ int r;
+ struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+
+ while (*new) {
+ struct dm_bio_prison_cell *cell =
+ container_of(*new, struct dm_bio_prison_cell, node);
+
+ r = cmp_keys(key, &cell->key);
+
+ parent = *new;
+ if (r < 0)
+ new = &((*new)->rb_left);
+ else if (r > 0)
+ new = &((*new)->rb_right);
+ else {
+ if (inmate)
+ bio_list_add(&cell->bios, inmate);
+ *cell_result = cell;
+ return 1;
+ }
}
- __setup_new_cell(b, key, inmate, cell_prealloc);
+ __setup_new_cell(key, inmate, cell_prealloc);
*cell_result = cell_prealloc;
+
+ rb_link_node(&cell_prealloc->node, parent, new);
+ rb_insert_color(&cell_prealloc->node, &prison->cells);
+
return 0;
}
@@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison,
{
int r;
unsigned long flags;
- struct bucket *b = get_bucket(prison, key);
- spin_lock_irqsave(&b->lock, flags);
- r = __bio_detain(b, key, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+ spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
@@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
/*
* @inmates must have been initialised prior to this call
*/
-static void __cell_release(struct dm_bio_prison_cell *cell,
+static void __cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
- hlist_del(&cell->list);
+ rb_erase(&cell->node, &prison->cells);
if (inmates) {
if (cell->holder)
@@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison,
struct bio_list *bios)
{
unsigned long flags;
- struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&b->lock, flags);
- __cell_release(cell, bios);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release(prison, cell, bios);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release);
/*
* Sometimes we don't want the holder, just the additional bios.
*/
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+static void __cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
- hlist_del(&cell->list);
+ rb_erase(&cell->node, &prison->cells);
bio_list_merge(inmates, &cell->bios);
}
@@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
struct bio_list *inmates)
{
unsigned long flags;
- struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&b->lock, flags);
- __cell_release_no_holder(cell, inmates);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(prison, cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
@@ -267,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_cell_error);
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ visit_fn(context, cell);
+ rb_erase(&cell->node, &prison->cells);
+ spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_visit_release);
+
/*----------------------------------------------------------------*/
#define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 6805a142b750..b03988667740 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -10,8 +10,8 @@
#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
-#include <linux/list.h>
#include <linux/bio.h>
+#include <linux/rbtree.h>
/*----------------------------------------------------------------*/
@@ -35,13 +35,15 @@ struct dm_cell_key {
* themselves.
*/
struct dm_bio_prison_cell {
- struct hlist_node list;
+ struct list_head user_list; /* for client use */
+ struct rb_node node;
+
struct dm_cell_key key;
struct bio *holder;
struct bio_list bios;
};
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
+struct dm_bio_prison *dm_bio_prison_create(void);
void dm_bio_prison_destroy(struct dm_bio_prison *prison);
/*
@@ -87,6 +89,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
void dm_cell_error(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell, int error);
+/*
+ * Visits the cell and then releases it. Guarantees no new inmates are
+ * inserted between the visit and the release.
+ */
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context, struct dm_bio_prison_cell *cell);
+
/*----------------------------------------------------------------*/
/*
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 825ca1f87639..9649e48f0bc4 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
+#include <linux/rbtree.h>
#define DM_MSG_PREFIX "bufio"
@@ -34,26 +35,23 @@
/*
* Check buffer ages in this interval (seconds)
*/
-#define DM_BUFIO_WORK_TIMER_SECS 10
+#define DM_BUFIO_WORK_TIMER_SECS 30
/*
* Free buffers when they are older than this (seconds)
*/
-#define DM_BUFIO_DEFAULT_AGE_SECS 60
+#define DM_BUFIO_DEFAULT_AGE_SECS 300
/*
- * The number of bvec entries that are embedded directly in the buffer.
- * If the chunk size is larger, dm-io is used to do the io.
+ * The number of bytes of cached data to keep around.
*/
-#define DM_BUFIO_INLINE_VECS 16
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
/*
- * Buffer hash
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
*/
-#define DM_BUFIO_HASH_BITS 20
-#define DM_BUFIO_HASH(block) \
- ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
- ((1 << DM_BUFIO_HASH_BITS) - 1))
+#define DM_BUFIO_INLINE_VECS 16
/*
* Don't try to use kmem_cache_alloc for blocks larger than this.
@@ -106,7 +104,7 @@ struct dm_bufio_client {
unsigned minimum_buffers;
- struct hlist_head *cache_hash;
+ struct rb_root buffer_tree;
wait_queue_head_t free_buffer_wait;
int async_write_error;
@@ -135,7 +133,7 @@ enum data_mode {
};
struct dm_buffer {
- struct hlist_node hash_list;
+ struct rb_node node;
struct list_head lru_list;
sector_t block;
void *data;
@@ -223,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock);
* Buffers are freed after this timeout
*/
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
@@ -253,6 +252,53 @@ static LIST_HEAD(dm_bufio_all_clients);
*/
static DEFINE_MUTEX(dm_bufio_clients_lock);
+/*----------------------------------------------------------------
+ * A red/black tree acts as an index for all the buffers.
+ *--------------------------------------------------------------*/
+static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+{
+ struct rb_node *n = c->buffer_tree.rb_node;
+ struct dm_buffer *b;
+
+ while (n) {
+ b = container_of(n, struct dm_buffer, node);
+
+ if (b->block == block)
+ return b;
+
+ n = (b->block < block) ? n->rb_left : n->rb_right;
+ }
+
+ return NULL;
+}
+
+static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
+ struct dm_buffer *found;
+
+ while (*new) {
+ found = container_of(*new, struct dm_buffer, node);
+
+ if (found->block == b->block) {
+ BUG_ON(found != b);
+ return;
+ }
+
+ parent = *new;
+ new = (found->block < b->block) ?
+ &((*new)->rb_left) : &((*new)->rb_right);
+ }
+
+ rb_link_node(&b->node, parent, new);
+ rb_insert_color(&b->node, &c->buffer_tree);
+}
+
+static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ rb_erase(&b->node, &c->buffer_tree);
+}
+
/*----------------------------------------------------------------*/
static void adjust_total_allocated(enum data_mode data_mode, long diff)
@@ -434,7 +480,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
b->block = block;
b->list_mode = dirty;
list_add(&b->lru_list, &c->lru[dirty]);
- hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+ __insert(b->c, b);
b->last_accessed = jiffies;
}
@@ -448,7 +494,7 @@ static void __unlink_buffer(struct dm_buffer *b)
BUG_ON(!c->n_buffers[b->list_mode]);
c->n_buffers[b->list_mode]--;
- hlist_del(&b->hash_list);
+ __remove(b->c, b);
list_del(&b->lru_list);
}
@@ -887,23 +933,6 @@ static void __check_watermark(struct dm_bufio_client *c,
__write_dirty_buffers_async(c, 1, write_list);
}
-/*
- * Find a buffer in the hash.
- */
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
-{
- struct dm_buffer *b;
-
- hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
- hash_list) {
- dm_bufio_cond_resched();
- if (b->block == block)
- return b;
- }
-
- return NULL;
-}
-
/*----------------------------------------------------------------
* Getting a buffer
*--------------------------------------------------------------*/
@@ -1433,45 +1462,52 @@ static void drop_buffers(struct dm_bufio_client *c)
}
/*
- * Test if the buffer is unused and too old, and commit it.
- * At if noio is set, we must not do any I/O because we hold
- * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to
- * different bufio client.
+ * We may not be able to evict this buffer if IO is pending or the client
+ * is still using it. The caller is expected to know the buffer is too old.
+ *
+ * And if GFP_NOFS is used, we must not do any I/O because we hold
+ * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
+ * rerouted to a different bufio client.
*/
-static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
- unsigned long max_jiffies)
+static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
{
- if (jiffies - b->last_accessed < max_jiffies)
- return 0;
-
- if (!(gfp & __GFP_IO)) {
+ if (!(gfp & __GFP_FS)) {
if (test_bit(B_READING, &b->state) ||
test_bit(B_WRITING, &b->state) ||
test_bit(B_DIRTY, &b->state))
- return 0;
+ return false;
}
if (b->hold_count)
- return 0;
+ return false;
__make_buffer_clean(b);
__unlink_buffer(b);
__free_buffer_wake(b);
- return 1;
+ return true;
+}
+
+static unsigned get_retain_buffers(struct dm_bufio_client *c)
+{
+ unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ return retain_bytes / c->block_size;
}
-static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
- gfp_t gfp_mask)
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+ gfp_t gfp_mask)
{
int l;
struct dm_buffer *b, *tmp;
- long freed = 0;
+ unsigned long freed = 0;
+ unsigned long count = nr_to_scan;
+ unsigned retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
- freed += __cleanup_old_buffer(b, gfp_mask, 0);
- if (!--nr_to_scan)
+ if (__try_evict_buffer(b, gfp_mask))
+ freed++;
+ if (!--nr_to_scan || ((count - freed) <= retain_target))
return freed;
dm_bufio_cond_resched();
}
@@ -1486,7 +1522,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_IO)
+ if (sc->gfp_mask & __GFP_FS)
dm_bufio_lock(c);
else if (!dm_bufio_trylock(c))
return SHRINK_STOP;
@@ -1503,7 +1539,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_IO)
+ if (sc->gfp_mask & __GFP_FS)
dm_bufio_lock(c);
else if (!dm_bufio_trylock(c))
return 0;
@@ -1533,11 +1569,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
r = -ENOMEM;
goto bad_client;
}
- c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
- if (!c->cache_hash) {
- r = -ENOMEM;
- goto bad_hash;
- }
+ c->buffer_tree = RB_ROOT;
c->bdev = bdev;
c->block_size = block_size;
@@ -1556,9 +1588,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
c->n_buffers[i] = 0;
}
- for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
- INIT_HLIST_HEAD(&c->cache_hash[i]);
-
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->reserved_buffers);
c->need_reserved_buffers = reserved_buffers;
@@ -1632,8 +1661,6 @@ bad_cache:
}
dm_io_client_destroy(c->dm_io);
bad_dm_io:
- vfree(c->cache_hash);
-bad_hash:
kfree(c);
bad_client:
return ERR_PTR(r);
@@ -1660,9 +1687,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
mutex_unlock(&dm_bufio_clients_lock);
- for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
- BUG_ON(!hlist_empty(&c->cache_hash[i]));
-
+ BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
BUG_ON(c->need_reserved_buffers);
while (!list_empty(&c->reserved_buffers)) {
@@ -1680,36 +1705,60 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
BUG_ON(c->n_buffers[i]);
dm_io_client_destroy(c->dm_io);
- vfree(c->cache_hash);
kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
-static void cleanup_old_buffers(void)
+static unsigned get_max_age_hz(void)
{
- unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
- struct dm_bufio_client *c;
+ unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
- if (max_age > ULONG_MAX / HZ)
- max_age = ULONG_MAX / HZ;
+ if (max_age > UINT_MAX / HZ)
+ max_age = UINT_MAX / HZ;
- mutex_lock(&dm_bufio_clients_lock);
- list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
- if (!dm_bufio_trylock(c))
- continue;
+ return max_age * HZ;
+}
- while (!list_empty(&c->lru[LIST_CLEAN])) {
- struct dm_buffer *b;
- b = list_entry(c->lru[LIST_CLEAN].prev,
- struct dm_buffer, lru_list);
- if (!__cleanup_old_buffer(b, 0, max_age * HZ))
- break;
- dm_bufio_cond_resched();
- }
+static bool older_than(struct dm_buffer *b, unsigned long age_hz)
+{
+ return (jiffies - b->last_accessed) >= age_hz;
+}
+
+static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
+{
+ struct dm_buffer *b, *tmp;
+ unsigned retain_target = get_retain_buffers(c);
+ unsigned count;
+
+ dm_bufio_lock(c);
+
+ count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+ list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
+ if (count <= retain_target)
+ break;
+
+ if (!older_than(b, age_hz))
+ break;
+
+ if (__try_evict_buffer(b, 0))
+ count--;
- dm_bufio_unlock(c);
dm_bufio_cond_resched();
}
+
+ dm_bufio_unlock(c);
+}
+
+static void cleanup_old_buffers(void)
+{
+ unsigned long max_age_hz = get_max_age_hz();
+ struct dm_bufio_client *c;
+
+ mutex_lock(&dm_bufio_clients_lock);
+
+ list_for_each_entry(c, &dm_bufio_all_clients, client_list)
+ __evict_old_buffers(c, max_age_hz);
+
mutex_unlock(&dm_bufio_clients_lock);
}
@@ -1834,6 +1883,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
+module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
+
module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7130505c2425..69de8b43ca12 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
/*----------------------------------------------------------------*/
-#define PRISON_CELLS 1024
#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10
@@ -2327,7 +2326,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
INIT_DELAYED_WORK(&cache->waker, do_waker);
cache->last_commit_jiffies = jiffies;
- cache->prison = dm_bio_prison_create(PRISON_CELLS);
+ cache->prison = dm_bio_prison_create();
if (!cache->prison) {
*error = "could not create bio prison";
goto bad;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 4857fa4a5484..a7cb9dd5f135 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -789,8 +789,7 @@ struct dm_raid_superblock {
__le32 layout;
__le32 stripe_sectors;
- __u8 pad[452]; /* Round struct to 512 bytes. */
- /* Always set to 0 when writing. */
+ /* Remainder of a logical block is zero-filled when writing (see super_sync()). */
} __packed;
static int read_disk_sb(struct md_rdev *rdev, int size)
@@ -827,7 +826,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
test_bit(Faulty, &(rs->dev[i].rdev.flags)))
failed_devices |= (1ULL << i);
- memset(sb, 0, sizeof(*sb));
+ memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
sb->magic = cpu_to_le32(DM_RAID_MAGIC);
sb->features = cpu_to_le32(0); /* No features yet */
@@ -862,7 +861,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
uint64_t events_sb, events_refsb;
rdev->sb_start = 0;
- rdev->sb_size = sizeof(*sb);
+ rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
+ if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
+ DMERR("superblock size of a logical block is no longer valid");
+ return -EINVAL;
+ }
ret = read_disk_sb(rdev, rdev->sb_size);
if (ret)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index d1600d2aa2e2..f8b37d4c05d8 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -159,8 +159,10 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sc->stripes_shift = __ffs(stripes);
r = dm_set_target_max_io_len(ti, chunk_size);
- if (r)
+ if (r) {
+ kfree(sc);
return r;
+ }
ti->num_flush_bios = stripes;
ti->num_discard_bios = stripes;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index e9d33ad59df5..43adbb863f5a 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
}
int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
- int can_block, struct dm_thin_lookup_result *result)
+ int can_issue_io, struct dm_thin_lookup_result *result)
{
- int r = -EINVAL;
- uint64_t block_time = 0;
+ int r;
__le64 value;
struct dm_pool_metadata *pmd = td->pmd;
dm_block_t keys[2] = { td->id, block };
struct dm_btree_info *info;
- if (can_block) {
- down_read(&pmd->root_lock);
- info = &pmd->info;
- } else if (down_read_trylock(&pmd->root_lock))
- info = &pmd->nb_info;
- else
- return -EWOULDBLOCK;
-
if (pmd->fail_io)
- goto out;
+ return -EINVAL;
- r = dm_btree_lookup(info, pmd->root, keys, &value);
- if (!r)
- block_time = le64_to_cpu(value);
+ down_read(&pmd->root_lock);
-out:
- up_read(&pmd->root_lock);
+ if (can_issue_io) {
+ info = &pmd->info;
+ } else
+ info = &pmd->nb_info;
+ r = dm_btree_lookup(info, pmd->root, keys, &value);
if (!r) {
+ uint64_t block_time = 0;
dm_block_t exception_block;
uint32_t exception_time;
+
+ block_time = le64_to_cpu(value);
unpack_block_time(block_time, &exception_block,
&exception_time);
result->block = exception_block;
result->shared = __snapshotted_since(td, exception_time);
}
+ up_read(&pmd->root_lock);
return r;
}
@@ -1813,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
return needs_check;
}
+
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
+{
+ dm_tm_issue_prefetches(pmd->tm);
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index e3c857db195a..921d15ee56a0 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -139,12 +139,12 @@ struct dm_thin_lookup_result {
/*
* Returns:
- * -EWOULDBLOCK iff @can_block is set and would block.
+ * -EWOULDBLOCK iff @can_issue_io is not set and the lookup would need to issue IO
* -ENODATA iff that mapping is not present.
* 0 success
*/
int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
- int can_block, struct dm_thin_lookup_result *result);
+ int can_issue_io, struct dm_thin_lookup_result *result);
/*
* Obtain an unused block.
@@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+/*
+ * Issue any prefetches that may be useful.
+ */
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
+
/*----------------------------------------------------------------*/
#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 4843801173fe..719b330b7eb8 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,11 +11,13 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/sort.h>
#include <linux/rbtree.h>
#define DM_MSG_PREFIX "thin"
@@ -25,7 +27,6 @@
*/
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
-#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60
@@ -127,6 +128,53 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
/*----------------------------------------------------------------*/
+#define THROTTLE_THRESHOLD (1 * HZ)
+
+struct throttle {
+ struct rw_semaphore lock;
+ unsigned long threshold;
+ bool throttle_applied;
+};
+
+static void throttle_init(struct throttle *t)
+{
+ init_rwsem(&t->lock);
+ t->throttle_applied = false;
+}
+
+static void throttle_work_start(struct throttle *t)
+{
+ t->threshold = jiffies + THROTTLE_THRESHOLD;
+}
+
+static void throttle_work_update(struct throttle *t)
+{
+ if (!t->throttle_applied && jiffies > t->threshold) {
+ down_write(&t->lock);
+ t->throttle_applied = true;
+ }
+}
+
+static void throttle_work_complete(struct throttle *t)
+{
+ if (t->throttle_applied) {
+ t->throttle_applied = false;
+ up_write(&t->lock);
+ }
+}
+
+static void throttle_lock(struct throttle *t)
+{
+ down_read(&t->lock);
+}
+
+static void throttle_unlock(struct throttle *t)
+{
+ up_read(&t->lock);
+}
+
+/*----------------------------------------------------------------*/
+
/*
* A pool device ties together a metadata device and a data device. It
* also provides the interface for creating and destroying internal
@@ -155,8 +203,11 @@ struct pool_features {
struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
+#define CELL_SORT_ARRAY_SIZE 8192
+
struct pool {
struct list_head list;
struct dm_target *ti; /* Only set if a pool target is bound */
@@ -176,6 +227,7 @@ struct pool {
struct dm_kcopyd_client *copier;
struct workqueue_struct *wq;
+ struct throttle throttle;
struct work_struct worker;
struct delayed_work waker;
struct delayed_work no_space_timeout;
@@ -198,8 +250,13 @@ struct pool {
process_bio_fn process_bio;
process_bio_fn process_discard;
+ process_cell_fn process_cell;
+ process_cell_fn process_discard_cell;
+
process_mapping_fn process_prepared_mapping;
process_mapping_fn process_prepared_discard;
+
+ struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
};
static enum pool_mode get_pool_mode(struct pool *pool);
@@ -234,6 +291,7 @@ struct thin_c {
struct dm_thin_device *td;
bool requeue_mode:1;
spinlock_t lock;
+ struct list_head deferred_cells;
struct bio_list deferred_bio_list;
struct bio_list retry_on_resume_list;
struct rb_root sort_bio_list; /* sorted list of deferred bios */
@@ -290,6 +348,15 @@ static void cell_release(struct pool *pool,
dm_bio_prison_free_cell(pool->prison, cell);
}
+static void cell_visit_release(struct pool *pool,
+ void (*fn)(void *, struct dm_bio_prison_cell *),
+ void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ dm_cell_visit_release(pool->prison, fn, context, cell);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
static void cell_release_no_holder(struct pool *pool,
struct dm_bio_prison_cell *cell,
struct bio_list *bios)
@@ -298,19 +365,6 @@ static void cell_release_no_holder(struct pool *pool,
dm_bio_prison_free_cell(pool->prison, cell);
}
-static void cell_defer_no_holder_no_free(struct thin_c *tc,
- struct dm_bio_prison_cell *cell)
-{
- struct pool *pool = tc->pool;
- unsigned long flags;
-
- spin_lock_irqsave(&tc->lock, flags);
- dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
- spin_unlock_irqrestore(&tc->lock, flags);
-
- wake_worker(pool);
-}
-
static void cell_error_with_code(struct pool *pool,
struct dm_bio_prison_cell *cell, int error_code)
{
@@ -323,6 +377,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
cell_error_with_code(pool, cell, -EIO);
}
+static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, 0);
+}
+
+static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
+}
+
/*----------------------------------------------------------------*/
/*
@@ -393,44 +457,65 @@ struct dm_thin_endio_hook {
struct rb_node rb_node;
};
-static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
+{
+ bio_list_merge(bios, master);
+ bio_list_init(master);
+}
+
+static void error_bio_list(struct bio_list *bios, int error)
{
struct bio *bio;
+
+ while ((bio = bio_list_pop(bios)))
+ bio_endio(bio, error);
+}
+
+static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
+{
struct bio_list bios;
unsigned long flags;
bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags);
- bio_list_merge(&bios, master);
- bio_list_init(master);
+ __merge_bio_list(&bios, master);
spin_unlock_irqrestore(&tc->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- bio_endio(bio, DM_ENDIO_REQUEUE);
+ error_bio_list(&bios, error);
}
-static void requeue_io(struct thin_c *tc)
+static void requeue_deferred_cells(struct thin_c *tc)
{
- requeue_bio_list(tc, &tc->deferred_bio_list);
- requeue_bio_list(tc, &tc->retry_on_resume_list);
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice_init(&tc->deferred_cells, &cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ list_for_each_entry_safe(cell, tmp, &cells, user_list)
+ cell_requeue(pool, cell);
}
-static void error_thin_retry_list(struct thin_c *tc)
+static void requeue_io(struct thin_c *tc)
{
- struct bio *bio;
- unsigned long flags;
struct bio_list bios;
+ unsigned long flags;
bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags);
- bio_list_merge(&bios, &tc->retry_on_resume_list);
- bio_list_init(&tc->retry_on_resume_list);
+ __merge_bio_list(&bios, &tc->deferred_bio_list);
+ __merge_bio_list(&bios, &tc->retry_on_resume_list);
spin_unlock_irqrestore(&tc->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- bio_io_error(bio);
+ error_bio_list(&bios, DM_ENDIO_REQUEUE);
+ requeue_deferred_cells(tc);
}
static void error_retry_list(struct pool *pool)
@@ -439,7 +524,7 @@ static void error_retry_list(struct pool *pool)
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list)
- error_thin_retry_list(tc);
+ error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
rcu_read_unlock();
}
@@ -629,33 +714,75 @@ static void overwrite_endio(struct bio *bio, int err)
*/
/*
- * This sends the bios in the cell back to the deferred_bios list.
+ * This sends the bios in the cell, except the original holder, back
+ * to the deferred_bios list.
*/
-static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
struct pool *pool = tc->pool;
unsigned long flags;
spin_lock_irqsave(&tc->lock, flags);
- cell_release(pool, cell, &tc->deferred_bio_list);
+ cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
spin_unlock_irqrestore(&tc->lock, flags);
wake_worker(pool);
}
-/*
- * Same as cell_defer above, except it omits the original holder of the cell.
- */
-static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
+
+struct remap_info {
+ struct thin_c *tc;
+ struct bio_list defer_bios;
+ struct bio_list issue_bios;
+};
+
+static void __inc_remap_and_issue_cell(void *context,
+ struct dm_bio_prison_cell *cell)
{
- struct pool *pool = tc->pool;
- unsigned long flags;
+ struct remap_info *info = context;
+ struct bio *bio;
- spin_lock_irqsave(&tc->lock, flags);
- cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
+ bio_list_add(&info->defer_bios, bio);
+ else {
+ inc_all_io_entry(info->tc->pool, bio);
- wake_worker(pool);
+ /*
+ * We can't issue the bios with the bio prison lock
+ * held, so we add them to a list to issue on
+ * return from this function.
+ */
+ bio_list_add(&info->issue_bios, bio);
+ }
+ }
+}
+
+static void inc_remap_and_issue_cell(struct thin_c *tc,
+ struct dm_bio_prison_cell *cell,
+ dm_block_t block)
+{
+ struct bio *bio;
+ struct remap_info info;
+
+ info.tc = tc;
+ bio_list_init(&info.defer_bios);
+ bio_list_init(&info.issue_bios);
+
+ /*
+ * We have to be careful to inc any bios we're about to issue
+ * before the cell is released, and avoid a race with new bios
+ * being added to the cell.
+ */
+ cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
+ &info, cell);
+
+ while ((bio = bio_list_pop(&info.defer_bios)))
+ thin_defer_bio(tc, bio);
+
+ while ((bio = bio_list_pop(&info.issue_bios)))
+ remap_and_issue(info.tc, bio, block);
}
static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
@@ -706,10 +833,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
* the bios in the cell.
*/
if (bio) {
- cell_defer_no_holder(tc, m->cell);
+ inc_remap_and_issue_cell(tc, m->cell, m->data_block);
bio_endio(bio, 0);
- } else
- cell_defer(tc, m->cell);
+ } else {
+ inc_all_io_entry(tc->pool, m->cell->holder);
+ remap_and_issue(tc, m->cell->holder, m->data_block);
+ inc_remap_and_issue_cell(tc, m->cell, m->data_block);
+ }
out:
list_del(&m->list);
@@ -842,6 +972,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
}
}
+static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
+ dm_block_t data_block,
+ struct dm_thin_new_mapping *m)
+{
+ struct pool *pool = tc->pool;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ h->overwrite_mapping = m;
+ m->bio = bio;
+ save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+ inc_all_io_entry(pool, bio);
+ remap_and_issue(tc, bio, data_block);
+}
+
/*
* A partial copy also needs to zero the uncopied region.
*/
@@ -876,15 +1020,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* If the whole block of data is being overwritten, we can issue the
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
- if (io_overwrites_block(pool, bio)) {
- struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
-
- h->overwrite_mapping = m;
- m->bio = bio;
- save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_dest);
- } else {
+ if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_dest, m);
+ else {
struct dm_io_region from, to;
from.bdev = origin->bdev;
@@ -953,16 +1091,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
if (!pool->pf.zero_new_blocks)
process_prepared_mapping(m);
- else if (io_overwrites_block(pool, bio)) {
- struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
-
- h->overwrite_mapping = m;
- m->bio = bio;
- save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_block);
+ else if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_block, m);
- } else
+ else
ll_zero(tc, m,
data_block * pool->sectors_per_block,
(data_block + 1) * pool->sectors_per_block);
@@ -1134,29 +1266,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
bio_list_init(&bios);
cell_release(pool, cell, &bios);
- error = should_error_unserviceable_bio(pool);
- if (error)
- while ((bio = bio_list_pop(&bios)))
- bio_endio(bio, error);
- else
- while ((bio = bio_list_pop(&bios)))
- retry_on_resume(bio);
+ while ((bio = bio_list_pop(&bios)))
+ retry_on_resume(bio);
}
-static void process_discard(struct thin_c *tc, struct bio *bio)
+static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
int r;
- unsigned long flags;
+ struct bio *bio = cell->holder;
struct pool *pool = tc->pool;
- struct dm_bio_prison_cell *cell, *cell2;
- struct dm_cell_key key, key2;
+ struct dm_bio_prison_cell *cell2;
+ struct dm_cell_key key2;
dm_block_t block = get_bio_block(tc, bio);
struct dm_thin_lookup_result lookup_result;
struct dm_thin_new_mapping *m;
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(tc->pool, &key, bio, &cell))
+ if (tc->requeue_mode) {
+ cell_requeue(pool, cell);
return;
+ }
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
@@ -1187,12 +1315,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
m->cell2 = cell2;
m->bio = bio;
- if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
- spin_lock_irqsave(&pool->lock, flags);
- list_add_tail(&m->list, &pool->prepared_discards);
- spin_unlock_irqrestore(&pool->lock, flags);
- wake_worker(pool);
- }
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+
} else {
inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
@@ -1227,6 +1352,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
}
}
+static void process_discard_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct dm_bio_prison_cell *cell;
+ struct dm_cell_key key;
+ dm_block_t block = get_bio_block(tc, bio);
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool, &key, bio, &cell))
+ return;
+
+ process_discard_cell(tc, cell);
+}
+
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
struct dm_cell_key *key,
struct dm_thin_lookup_result *lookup_result,
@@ -1255,11 +1393,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
}
}
+static void __remap_and_issue_shared_cell(void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ struct remap_info *info = context;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if ((bio_data_dir(bio) == WRITE) ||
+ (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)))
+ bio_list_add(&info->defer_bios, bio);
+ else {
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
+
+ h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
+ inc_all_io_entry(info->tc->pool, bio);
+ bio_list_add(&info->issue_bios, bio);
+ }
+ }
+}
+
+static void remap_and_issue_shared_cell(struct thin_c *tc,
+ struct dm_bio_prison_cell *cell,
+ dm_block_t block)
+{
+ struct bio *bio;
+ struct remap_info info;
+
+ info.tc = tc;
+ bio_list_init(&info.defer_bios);
+ bio_list_init(&info.issue_bios);
+
+ cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
+ &info, cell);
+
+ while ((bio = bio_list_pop(&info.defer_bios)))
+ thin_defer_bio(tc, bio);
+
+ while ((bio = bio_list_pop(&info.issue_bios)))
+ remap_and_issue(tc, bio, block);
+}
+
static void process_shared_bio(struct thin_c *tc, struct bio *bio,
dm_block_t block,
- struct dm_thin_lookup_result *lookup_result)
+ struct dm_thin_lookup_result *lookup_result,
+ struct dm_bio_prison_cell *virt_cell)
{
- struct dm_bio_prison_cell *cell;
+ struct dm_bio_prison_cell *data_cell;
struct pool *pool = tc->pool;
struct dm_cell_key key;
@@ -1268,19 +1448,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
* of being broken so we have nothing further to do here.
*/
build_data_key(tc->td, lookup_result->block, &key);
- if (bio_detain(pool, &key, bio, &cell))
+ if (bio_detain(pool, &key, bio, &data_cell)) {
+ cell_defer_no_holder(tc, virt_cell);
return;
+ }
- if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
- break_sharing(tc, bio, block, &key, lookup_result, cell);
- else {
+ if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
+ break_sharing(tc, bio, block, &key, lookup_result, data_cell);
+ cell_defer_no_holder(tc, virt_cell);
+ } else {
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
-
remap_and_issue(tc, bio, lookup_result->block);
+
+ remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
+ remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
}
}
@@ -1333,34 +1517,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
}
}
-static void process_bio(struct thin_c *tc, struct bio *bio)
+static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
int r;
struct pool *pool = tc->pool;
+ struct bio *bio = cell->holder;
dm_block_t block = get_bio_block(tc, bio);
- struct dm_bio_prison_cell *cell;
- struct dm_cell_key key;
struct dm_thin_lookup_result lookup_result;
- /*
- * If cell is already occupied, then the block is already
- * being provisioned so we have nothing further to do here.
- */
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(pool, &key, bio, &cell))
+ if (tc->requeue_mode) {
+ cell_requeue(pool, cell);
return;
+ }
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
case 0:
- if (lookup_result.shared) {
- process_shared_bio(tc, bio, block, &lookup_result);
- cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
- } else {
+ if (lookup_result.shared)
+ process_shared_bio(tc, bio, block, &lookup_result, cell);
+ else {
inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
-
remap_and_issue(tc, bio, lookup_result.block);
+ inc_remap_and_issue_cell(tc, cell, lookup_result.block);
}
break;
@@ -1394,7 +1572,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
}
}
-static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+static void process_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_bio_prison_cell *cell;
+ struct dm_cell_key key;
+
+ /*
+ * If cell is already occupied, then the block is already
+ * being provisioned so we have nothing further to do here.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(pool, &key, bio, &cell))
+ return;
+
+ process_cell(tc, cell);
+}
+
+static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
+ struct dm_bio_prison_cell *cell)
{
int r;
int rw = bio_data_dir(bio);
@@ -1404,15 +1601,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
case 0:
- if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
+ if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
handle_unserviceable_bio(tc->pool, bio);
- else {
+ if (cell)
+ cell_defer_no_holder(tc, cell);
+ } else {
inc_all_io_entry(tc->pool, bio);
remap_and_issue(tc, bio, lookup_result.block);
+ if (cell)
+ inc_remap_and_issue_cell(tc, cell, lookup_result.block);
}
break;
case -ENODATA:
+ if (cell)
+ cell_defer_no_holder(tc, cell);
if (rw != READ) {
handle_unserviceable_bio(tc->pool, bio);
break;
@@ -1431,11 +1634,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
default:
DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
__func__, r);
+ if (cell)
+ cell_defer_no_holder(tc, cell);
bio_io_error(bio);
break;
}
}
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+ __process_bio_read_only(tc, bio, NULL);
+}
+
+static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ __process_bio_read_only(tc, cell->holder, cell);
+}
+
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
bio_endio(bio, 0);
@@ -1446,6 +1661,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio)
bio_io_error(bio);
}
+static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ cell_success(tc->pool, cell);
+}
+
+static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ cell_error(tc->pool, cell);
+}
+
/*
* FIXME: should we also commit due to size of transaction, measured in
* metadata blocks?
@@ -1527,9 +1752,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
struct bio *bio;
struct bio_list bios;
struct blk_plug plug;
+ unsigned count = 0;
if (tc->requeue_mode) {
- requeue_bio_list(tc, &tc->deferred_bio_list);
+ error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
return;
}
@@ -1568,10 +1794,97 @@ static void process_thin_deferred_bios(struct thin_c *tc)
pool->process_discard(tc, bio);
else
pool->process_bio(tc, bio);
+
+ if ((count++ & 127) == 0) {
+ throttle_work_update(&pool->throttle);
+ dm_pool_issue_prefetches(pool->pmd);
+ }
}
blk_finish_plug(&plug);
}
+static int cmp_cells(const void *lhs, const void *rhs)
+{
+ struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
+ struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
+
+ BUG_ON(!lhs_cell->holder);
+ BUG_ON(!rhs_cell->holder);
+
+ if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
+ return -1;
+
+ if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
+ return 1;
+
+ return 0;
+}
+
+static unsigned sort_cells(struct pool *pool, struct list_head *cells)
+{
+ unsigned count = 0;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ list_for_each_entry_safe(cell, tmp, cells, user_list) {
+ if (count >= CELL_SORT_ARRAY_SIZE)
+ break;
+
+ pool->cell_sort_array[count++] = cell;
+ list_del(&cell->user_list);
+ }
+
+ sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
+
+ return count;
+}
+
+static void process_thin_deferred_cells(struct thin_c *tc)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell;
+ unsigned i, j, count;
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice_init(&tc->deferred_cells, &cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ if (list_empty(&cells))
+ return;
+
+ do {
+ count = sort_cells(tc->pool, &cells);
+
+ for (i = 0; i < count; i++) {
+ cell = pool->cell_sort_array[i];
+ BUG_ON(!cell->holder);
+
+ /*
+ * If we've got no free new_mapping structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ if (ensure_next_mapping(pool)) {
+ for (j = i; j < count; j++)
+ list_add(&pool->cell_sort_array[j]->user_list, &cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice(&cells, &tc->deferred_cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ return;
+ }
+
+ if (cell->holder->bi_rw & REQ_DISCARD)
+ pool->process_discard_cell(tc, cell);
+ else
+ pool->process_cell(tc, cell);
+ }
+ } while (!list_empty(&cells));
+}
+
static void thin_get(struct thin_c *tc);
static void thin_put(struct thin_c *tc);
@@ -1620,6 +1933,7 @@ static void process_deferred_bios(struct pool *pool)
tc = get_first_thin(pool);
while (tc) {
+ process_thin_deferred_cells(tc);
process_thin_deferred_bios(tc);
tc = get_next_thin(pool, tc);
}
@@ -1653,9 +1967,15 @@ static void do_worker(struct work_struct *ws)
{
struct pool *pool = container_of(ws, struct pool, worker);
+ throttle_work_start(&pool->throttle);
+ dm_pool_issue_prefetches(pool->pmd);
+ throttle_work_update(&pool->throttle);
process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+ throttle_work_update(&pool->throttle);
process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
+ throttle_work_update(&pool->throttle);
process_deferred_bios(pool);
+ throttle_work_complete(&pool->throttle);
}
/*
@@ -1792,6 +2112,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_fail;
pool->process_discard = process_bio_fail;
+ pool->process_cell = process_cell_fail;
+ pool->process_discard_cell = process_cell_fail;
pool->process_prepared_mapping = process_prepared_mapping_fail;
pool->process_prepared_discard = process_prepared_discard_fail;
@@ -1804,6 +2126,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_bio_success;
+ pool->process_cell = process_cell_read_only;
+ pool->process_discard_cell = process_cell_success;
pool->process_prepared_mapping = process_prepared_mapping_fail;
pool->process_prepared_discard = process_prepared_discard_passdown;
@@ -1822,7 +2146,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "out-of-data-space");
pool->process_bio = process_bio_read_only;
- pool->process_discard = process_discard;
+ pool->process_discard = process_discard_bio;
+ pool->process_cell = process_cell_read_only;
+ pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
pool->process_prepared_discard = process_prepared_discard_passdown;
@@ -1835,7 +2161,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
notify_of_pool_mode_change(pool, "write");
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
- pool->process_discard = process_discard;
+ pool->process_discard = process_discard_bio;
+ pool->process_cell = process_cell;
+ pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
pool->process_prepared_discard = process_prepared_discard;
break;
@@ -1895,6 +2223,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
+static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+
+ throttle_lock(&pool->throttle);
+ thin_defer_bio(tc, bio);
+ throttle_unlock(&pool->throttle);
+}
+
+static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ unsigned long flags;
+ struct pool *pool = tc->pool;
+
+ throttle_lock(&pool->throttle);
+ spin_lock_irqsave(&tc->lock, flags);
+ list_add_tail(&cell->user_list, &tc->deferred_cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ throttle_unlock(&pool->throttle);
+
+ wake_worker(pool);
+}
+
static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -1915,8 +2266,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
dm_block_t block = get_bio_block(tc, bio);
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
- struct dm_bio_prison_cell cell1, cell2;
- struct dm_bio_prison_cell *cell_result;
+ struct dm_bio_prison_cell *virt_cell, *data_cell;
struct dm_cell_key key;
thin_hook_bio(tc, bio);
@@ -1932,10 +2282,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
}
if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
- thin_defer_bio(tc, bio);
+ thin_defer_bio_with_throttle(tc, bio);
return DM_MAPIO_SUBMITTED;
}
+ /*
+ * We must hold the virtual cell before doing the lookup, otherwise
+ * there's a race with discard.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool, &key, bio, &virt_cell))
+ return DM_MAPIO_SUBMITTED;
+
r = dm_thin_find_block(td, block, 0, &result);
/*
@@ -1958,23 +2316,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* More distant ancestors are irrelevant. The
* shared flag will be set in their case.
*/
- thin_defer_bio(tc, bio);
+ thin_defer_cell(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
- build_virtual_key(tc->td, block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
- return DM_MAPIO_SUBMITTED;
-
build_data_key(tc->td, result.block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
- cell_defer_no_holder_no_free(tc, &cell1);
+ if (bio_detain(tc->pool, &key, bio, &data_cell)) {
+ cell_defer_no_holder(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
inc_all_io_entry(tc->pool, bio);
- cell_defer_no_holder_no_free(tc, &cell2);
- cell_defer_no_holder_no_free(tc, &cell1);
+ cell_defer_no_holder(tc, data_cell);
+ cell_defer_no_holder(tc, virt_cell);
remap(tc, bio, result.block);
return DM_MAPIO_REMAPPED;
@@ -1986,16 +2340,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* of doing so.
*/
handle_unserviceable_bio(tc->pool, bio);
+ cell_defer_no_holder(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
/* fall through */
case -EWOULDBLOCK:
- /*
- * In future, the failed dm_thin_find_block above could
- * provide the hint to load the metadata into cache.
- */
- thin_defer_bio(tc, bio);
+ thin_defer_cell(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
default:
@@ -2005,6 +2356,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* pool is switched to fail-io mode.
*/
bio_io_error(bio);
+ cell_defer_no_holder(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
}
@@ -2185,7 +2537,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->sectors_per_block_shift = __ffs(block_size);
pool->low_water_blocks = 0;
pool_features_init(&pool->pf);
- pool->prison = dm_bio_prison_create(PRISON_CELLS);
+ pool->prison = dm_bio_prison_create();
if (!pool->prison) {
*error = "Error creating pool's bio prison";
err_p = ERR_PTR(-ENOMEM);
@@ -2211,6 +2563,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_wq;
}
+ throttle_init(&pool->throttle);
INIT_WORK(&pool->worker, do_worker);
INIT_DELAYED_WORK(&pool->waker, do_waker);
INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
@@ -3169,15 +3522,36 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+ sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+ /*
+ * Adjust max_sectors_kb to highest possible power-of-2
+ * factor of pool->sectors_per_block.
+ */
+ if (limits->max_hw_sectors & (limits->max_hw_sectors - 1))
+ limits->max_sectors = rounddown_pow_of_two(limits->max_hw_sectors);
+ else
+ limits->max_sectors = limits->max_hw_sectors;
+
+ if (limits->max_sectors < pool->sectors_per_block) {
+ while (!is_factor(pool->sectors_per_block, limits->max_sectors))
+ limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
+ } else if (block_size_is_power_of_two(pool)) {
+ /* max_sectors_kb is >= power-of-2 thinp blocksize */
+ while (!is_factor(limits->max_sectors, pool->sectors_per_block))
+ limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
+ }
/*
* If the system-determined stacked limits are compatible with the
* pool's blocksize (io_opt is a factor) do not override them.
*/
if (io_opt_sectors < pool->sectors_per_block ||
- do_div(io_opt_sectors, pool->sectors_per_block)) {
- blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ !is_factor(io_opt_sectors, pool->sectors_per_block)) {
+ if (is_factor(pool->sectors_per_block, limits->max_sectors))
+ blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
+ else
+ blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
}
@@ -3206,7 +3580,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -3295,6 +3669,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out_unlock;
}
spin_lock_init(&tc->lock);
+ INIT_LIST_HEAD(&tc->deferred_cells);
bio_list_init(&tc->deferred_bio_list);
bio_list_init(&tc->retry_on_resume_list);
tc->sort_bio_list = RB_ROOT;
@@ -3533,6 +3908,21 @@ err:
DMEMIT("Error");
}
+static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct thin_c *tc = ti->private;
+ struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = tc->pool_dev->bdev;
+ bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
static int thin_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -3557,7 +3947,7 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
@@ -3567,6 +3957,7 @@ static struct target_type thin_target = {
.presuspend = thin_presuspend,
.postsuspend = thin_postsuspend,
.status = thin_status,
+ .merge = thin_merge,
.iterate_devices = thin_iterate_devices,
};
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 58f3927fd7cc..0fee0e54d36f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1607,9 +1607,9 @@ static int dm_merge_bvec(struct request_queue *q,
* Find maximum amount of I/O that won't need splitting
*/
max_sectors = min(max_io_len(bvm->bi_sector, ti),
- (sector_t) BIO_MAX_SECTORS);
+ (sector_t) queue_max_sectors(q));
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
- if (max_size < 0)
+ if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
max_size = 0;
/*
@@ -1621,10 +1621,10 @@ static int dm_merge_bvec(struct request_queue *q,
max_size = ti->type->merge(ti, bvm, biovec, max_size);
/*
* If the target doesn't support merge method and some of the devices
- * provided their merge_bvec method (we know this by looking at
- * queue_max_hw_sectors), then we can't allow bios with multiple vector
- * entries. So always set max_size to 0, and the code below allows
- * just one page.
+ * provided their merge_bvec method (we know this by looking for the
+ * max_hw_sectors that dm_set_device_limits may set), then we can't
+ * allow bios with multiple vector entries. So always set max_size
+ * to 0, and the code below allows just one page.
*/
else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
max_size = 0;
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 3bc30a0ae3d6..9cb797d800cf 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -10,6 +10,8 @@
#include "dm-persistent-data-internal.h"
#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
@@ -17,6 +19,61 @@
/*----------------------------------------------------------------*/
+#define PREFETCH_SIZE 128
+#define PREFETCH_BITS 7
+#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)
+
+struct prefetch_set {
+ struct mutex lock;
+ dm_block_t blocks[PREFETCH_SIZE];
+};
+
+static unsigned prefetch_hash(dm_block_t b)
+{
+ return hash_64(b, PREFETCH_BITS);
+}
+
+static void prefetch_wipe(struct prefetch_set *p)
+{
+ unsigned i;
+ for (i = 0; i < PREFETCH_SIZE; i++)
+ p->blocks[i] = PREFETCH_SENTINEL;
+}
+
+static void prefetch_init(struct prefetch_set *p)
+{
+ mutex_init(&p->lock);
+ prefetch_wipe(p);
+}
+
+static void prefetch_add(struct prefetch_set *p, dm_block_t b)
+{
+ unsigned h = prefetch_hash(b);
+
+ mutex_lock(&p->lock);
+ if (p->blocks[h] == PREFETCH_SENTINEL)
+ p->blocks[h] = b;
+
+ mutex_unlock(&p->lock);
+}
+
+static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
+{
+ unsigned i;
+
+ mutex_lock(&p->lock);
+
+ for (i = 0; i < PREFETCH_SIZE; i++)
+ if (p->blocks[i] != PREFETCH_SENTINEL) {
+ dm_bm_prefetch(bm, p->blocks[i]);
+ p->blocks[i] = PREFETCH_SENTINEL;
+ }
+
+ mutex_unlock(&p->lock);
+}
+
+/*----------------------------------------------------------------*/
+
struct shadow_info {
struct hlist_node hlist;
dm_block_t where;
@@ -37,6 +94,8 @@ struct dm_transaction_manager {
spinlock_t lock;
struct hlist_head buckets[DM_HASH_SIZE];
+
+ struct prefetch_set prefetches;
};
/*----------------------------------------------------------------*/
@@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
for (i = 0; i < DM_HASH_SIZE; i++)
INIT_HLIST_HEAD(tm->buckets + i);
+ prefetch_init(&tm->prefetches);
+
return tm;
}
@@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
struct dm_block_validator *v,
struct dm_block **blk)
{
- if (tm->is_clone)
- return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+ if (tm->is_clone) {
+ int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+
+ if (r == -EWOULDBLOCK)
+ prefetch_add(&tm->real->prefetches, b);
+
+ return r;
+ }
return dm_bm_read_lock(tm->bm, b, v, blk);
}
@@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
return tm->bm;
}
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm)
+{
+ prefetch_issue(&tm->prefetches, tm->bm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches);
+
/*----------------------------------------------------------------*/
static int dm_tm_create_internal(struct dm_block_manager *bm,
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 2772ed2a781a..2e0d4d66fb1b 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -109,6 +109,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
/*
+ * If you're using a non-blocking clone the tm will build up a list of
+ * requested blocks that weren't in core. This call will request those
+ * blocks to be prefetched.
+ */
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm);
+
+/*
* A little utility that ties the knot by producing a transaction manager
* that has a space map managed by the transaction manager...
*