author    Kent Overstreet <kent.overstreet@gmail.com>    2016-08-27 02:26:15 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>    2016-08-27 05:24:18 -0800
commit    09e9374b5eef9da7466f7f19bb65c5d1b6a0c0aa
tree      fe5af06ca8fed3caa8d0b6b02d2a8e48581e1f1f
parent    0a9ad028970ba45acb365a559bfbd81c86918ec8
bcache: fix some issues with journal seq blacklisting
- the journalling change that increased pipelining meant it was possible for
  a particular btree node to have bsets with two different blacklisted
  sequence numbers - existing code didn't handle that at all

- the blacklisted seq flush code wasn't correctly waiting for nodes to be
  rewritten

- the way the code kept track of which btree nodes had bsets with a
  blacklisted sequence number and needed to be rewritten meant that they were
  pinned in memory - asking for deadlocks

all fixed
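The key change in the journal.c hunk below is to stop threading btree nodes onto a per-blacklist list and instead remember each affected node as a (seq, btree_id, pos) triple in a resizable array, so the nodes themselves no longer have to stay pinned in the btree node cache. What follows is a minimal userspace sketch of that bookkeeping only, with simplified stand-ins for struct bpos and the allocation helpers (the kernel code uses krealloc() under j->blacklist_lock); it is not the driver code itself.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct bpos { uint64_t inode, offset; };  /* stand-in for the kernel's struct bpos */

    struct blacklisted_node {
            uint64_t        seq;            /* sequence number of the btree node */
            unsigned        btree_id;
            struct bpos     pos;
    };

    struct blacklist {
            struct blacklisted_node *entries;
            size_t                  nr_entries;
    };

    static bool bpos_eq(struct bpos a, struct bpos b)
    {
            return a.inode == b.inode && a.offset == b.offset;
    }

    /*
     * Remember that the node at @pos in @btree_id carries a blacklisted bset:
     * record it once, growing the array in power-of-two steps.
     */
    static int blacklist_add_node(struct blacklist *bl, uint64_t seq,
                                  unsigned btree_id, struct bpos pos)
    {
            struct blacklisted_node *n;

            /* Already tracked? Duplicate entries would mean redundant rewrites. */
            for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
                    if (n->seq == seq &&
                        n->btree_id == btree_id &&
                        bpos_eq(n->pos, pos))
                            return 0;

            /* Grow only when crossing a power of two, as the patch does. */
            if (!bl->nr_entries ||
                (bl->nr_entries & (bl->nr_entries - 1)) == 0) {
                    size_t new_nr = bl->nr_entries ? bl->nr_entries * 2 : 8;

                    n = realloc(bl->entries, new_nr * sizeof(*n));
                    if (!n)
                            return -1;      /* the kernel code returns -ENOMEM */
                    bl->entries = n;
            }

            bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
                    .seq            = seq,
                    .btree_id       = btree_id,
                    .pos            = pos,
            };
            return 0;
    }

The power-of-two growth keeps reallocations logarithmic in the number of entries, and the linear duplicate scan is presumably acceptable because a blacklist covers only the handful of nodes touched since the unclean shutdown.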
-rw-r--r--  drivers/md/bcache/btree_cache.c    |   6
-rw-r--r--  drivers/md/bcache/btree_types.h    |   2
-rw-r--r--  drivers/md/bcache/btree_update.c   |  12
-rw-r--r--  drivers/md/bcache/journal.c        | 137
-rw-r--r--  drivers/md/bcache/journal_types.h  |  11
5 files changed, 116 insertions(+), 52 deletions(-)
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index f8f4b7e1b48a..2cda8776e903 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -89,7 +89,6 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp)
INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c;
sema_init(&b->io_mutex, 1);
- INIT_LIST_HEAD(&b->journal_seq_blacklisted);
b->writes[1].index = 1;
INIT_LIST_HEAD(&b->write_blocked);
@@ -104,7 +103,6 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp)
void mca_hash_remove(struct cache_set *c, struct btree *b)
{
BUG_ON(btree_node_dirty(b));
- BUG_ON(!list_empty_careful(&b->journal_seq_blacklisted));
b->keys.nsets = 0;
b->keys.set[0].data = NULL;
@@ -170,10 +168,6 @@ static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush)
if (!list_empty(&b->write_blocked))
goto out_unlock;
- /* XXX: we need a better solution for this, this will cause deadlocks */
- if (!list_empty_careful(&b->journal_seq_blacklisted))
- goto out_unlock;
-
if (!flush) {
if (btree_node_dirty(b))
goto out_unlock;
diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h
index 5bce26ba3f95..f294aa2965c9 100644
--- a/drivers/md/bcache/btree_types.h
+++ b/drivers/md/bcache/btree_types.h
@@ -77,8 +77,6 @@ struct btree {
struct delayed_work work;
struct btree_write writes[2];
-
- struct list_head journal_seq_blacklisted;
};
#define BTREE_FLAG(flag) \
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index 3fc2fcb6bd65..32a3915fd118 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -180,12 +180,6 @@ static void __btree_node_free(struct cache_set *c, struct btree *b,
clear_btree_node_dirty(b);
cancel_delayed_work(&b->work);
- if (!list_empty_careful(&b->journal_seq_blacklisted)) {
- mutex_lock(&c->journal.blacklist_lock);
- list_del_init(&b->journal_seq_blacklisted);
- mutex_unlock(&c->journal.blacklist_lock);
- }
-
mca_hash_remove(c, b);
mutex_lock(&c->btree_cache_lock);
@@ -874,8 +868,6 @@ static void btree_interior_update_pointers_written(struct closure *cl)
struct cache_set *c = as->c;
unsigned i;
- closure_wake_up(&as->wait);
-
journal_pin_drop(&c->journal, &as->journal);
mutex_lock(&c->btree_interior_update_lock);
@@ -890,6 +882,8 @@ static void btree_interior_update_pointers_written(struct closure *cl)
list_del(&as->list);
mutex_unlock(&c->btree_interior_update_lock);
+ closure_wake_up(&as->wait);
+
closure_return_with_destructor(cl, btree_interior_update_free);
}
@@ -1950,7 +1944,7 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
if (!bch_btree_iter_upgrade(iter))
return -EINTR;
- reserve = bch_btree_reserve_get(c, b, 1, true, cl);
+ reserve = bch_btree_reserve_get(c, b, 1, false, cl);
if (IS_ERR(reserve)) {
trace_bcache_btree_gc_rewrite_node_fail(b);
return PTR_ERR(reserve);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 50f3e781ed6e..30e1eb13dc77 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -169,41 +169,88 @@ static void journal_seq_blacklist_flush(struct journal *j,
container_of(j, struct cache_set, journal);
struct journal_seq_blacklist *bl =
container_of(pin, struct journal_seq_blacklist, pin);
- struct btree *b;
- struct btree_iter iter;
+ struct blacklisted_node n;
struct closure cl;
+ unsigned i;
+ int ret;
closure_init_stack(&cl);
- while (1) {
+ for (i = 0;; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
mutex_lock(&j->blacklist_lock);
- if (list_empty(&bl->nodes))
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
break;
-
- b = list_first_entry(&bl->nodes, struct btree,
- journal_seq_blacklisted);
+ }
+ n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
- /*
- * b might be changing underneath us, but it won't be _freed_
- * underneath us - and if the fields we're reading out of it to
- * traverse to it are garbage because we raced, that's ok
- */
-
- bch_btree_iter_init(&iter, c, b->btree_id, b->key.k.p);
+ bch_btree_iter_init(&iter, c, n.btree_id, n.pos);
iter.is_extents = false;
-
+redo_peek:
b = bch_btree_iter_peek_node(&iter);
- if (!list_empty_careful(&b->journal_seq_blacklisted))
- bch_btree_node_rewrite(&iter, b, &cl);
+ /* The node might have already been rewritten: */
+
+ if (b->data->keys.seq == n.seq &&
+ !bkey_cmp(b->key.k.p, n.pos)) {
+ ret = bch_btree_node_rewrite(&iter, b, &cl);
+ if (ret == -EAGAIN) {
+ bch_btree_iter_unlock(&iter);
+ closure_sync(&cl);
+ ret = -EINTR;
+ }
+ if (ret == -EINTR)
+ goto redo_peek;
+ if (ret)
+ BUG();
+ }
bch_btree_iter_unlock(&iter);
- closure_sync(&cl);
}
+ closure_sync(&cl);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ for (i = 0;; i++) {
+ struct btree_interior_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ /*
+ * Is the node on the list of pending interior node updates -
+ * being freed? If so, wait for that to finish:
+ */
+ for_each_pending_btree_node_free(c, as, d)
+ if (n.seq == d->seq &&
+ n.btree_id == d->btree_id &&
+ !d->level &&
+ !bkey_cmp(n.pos, d->key.k.p)) {
+ closure_wait(&as->wait, &cl);
+ mutex_unlock(&c->btree_interior_update_lock);
+ closure_sync(&cl);
+ break;
+ }
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ mutex_lock(&j->blacklist_lock);
+
journal_pin_drop(j, &bl->pin);
list_del(&bl->list);
+ kfree(bl->entries);
kfree(bl);
mutex_unlock(&j->blacklist_lock);
@@ -235,8 +282,7 @@ bch_journal_seq_blacklisted_new(struct cache_set *c, u64 seq)
if (!bl)
return NULL;
- bl->seq = seq;
- INIT_LIST_HEAD(&bl->nodes);
+ bl->seq = seq;
list_add_tail(&bl->list, &j->seq_blacklist);
return bl;
}
@@ -251,30 +297,33 @@ int bch_journal_seq_should_ignore(struct cache_set *c, u64 seq, struct btree *b)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl;
+ struct blacklisted_node *n;
int ret = 0;
+ /* Interior updates aren't journalled: */
+ if (b->level)
+ return 0;
+
/*
* After initial GC has finished, we've scanned the entire btree and
* found all the blacklisted sequence numbers - we won't be creating any
* new ones after startup:
*/
- if (test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags) &&
- list_empty_careful(&j->seq_blacklist))
- return 0;
+ if (test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags)) {
+ if (list_empty_careful(&j->seq_blacklist))
+ return 0;
- mutex_lock(&j->blacklist_lock);
+ mutex_lock(&j->blacklist_lock);
+ ret = journal_seq_blacklist_find(j, seq) != NULL;
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+ }
+ mutex_lock(&j->blacklist_lock);
bl = journal_seq_blacklist_find(j, seq);
if (bl)
goto found;
- /*
- * Don't look at j->seq after journal has started, since we aren't
- * holding appropriate lock:
- */
- if (test_bit(CACHE_SET_INITIAL_GC_DONE, &c->flags))
- goto out;
-
if (seq <= j->seq)
goto out;
@@ -291,8 +340,30 @@ int bch_journal_seq_should_ignore(struct cache_set *c, u64 seq, struct btree *b)
goto out;
}
found:
- if (list_empty(&b->journal_seq_blacklisted))
- list_add(&b->journal_seq_blacklisted, &bl->nodes);
+ for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
+ if (b->data->keys.seq == n->seq &&
+ b->btree_id == n->btree_id &&
+ !bkey_cmp(b->key.k.p, n->pos))
+ goto found_entry;
+
+ if (!bl->nr_entries ||
+ is_power_of_2(bl->nr_entries)) {
+ n = krealloc(bl->entries,
+ max(bl->nr_entries * 2, 8UL) * sizeof(*n),
+ GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ bl->entries = n;
+ }
+
+ bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
+ .seq = b->data->keys.seq,
+ .btree_id = b->btree_id,
+ .pos = b->key.k.p,
+ };
+found_entry:
ret = 1;
out:
mutex_unlock(&j->blacklist_lock);
diff --git a/drivers/md/bcache/journal_types.h b/drivers/md/bcache/journal_types.h
index 5011dd28fb9f..5c3ec3cb08d0 100644
--- a/drivers/md/bcache/journal_types.h
+++ b/drivers/md/bcache/journal_types.h
@@ -41,14 +41,21 @@ struct journal_entry_pin {
struct journal_entry_pin_list *pin_list;
};
+/* corresponds to a btree node with a blacklisted bset: */
+struct blacklisted_node {
+ __le64 seq;
+ enum btree_id btree_id;
+ struct bpos pos;
+};
+
struct journal_seq_blacklist {
struct list_head list;
u64 seq;
bool written;
struct journal_entry_pin pin;
- /* Btree nodes to be flushed: */
- struct list_head nodes;
+ struct blacklisted_node *entries;
+ size_t nr_entries;
};
struct journal_res {