summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2016-09-21 17:38:38 -0800
committerKent Overstreet <kent.overstreet@gmail.com>2016-09-21 17:38:38 -0800
commitb7cb7779de9609899fa53f7bab4e3cee69ac1f7e (patch)
tree5f0062453ca436472edfbf5fdda90592a93ebcf6
parentdcc5a0164ea0bee90e3a7e1c542d544d34620d40 (diff)
bcache: fix an ordering issue in btree_interior_update code
Also a bunch of refactoring
-rw-r--r--drivers/md/bcache/btree_gc.c6
-rw-r--r--drivers/md/bcache/btree_update.c180
-rw-r--r--drivers/md/bcache/btree_update.h20
3 files changed, 113 insertions, 93 deletions
diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c
index ea1e72fe14db..ec05c22857fd 100644
--- a/drivers/md/bcache/btree_gc.c
+++ b/drivers/md/bcache/btree_gc.c
@@ -509,10 +509,10 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
goto out;
}
- as = __bch_btree_interior_update_alloc(old_nodes, nr_old_nodes, iter);
+ as = bch_btree_interior_update_alloc(c);
for (i = 0; i < nr_old_nodes; i++)
- bch_btree_interior_update_will_free_node(as, old_nodes[i]);
+ bch_btree_interior_update_will_free_node(c, as, old_nodes[i]);
/* Repack everything with @new_format and sort down to one bset */
for (i = 0; i < nr_old_nodes; i++)
@@ -601,8 +601,6 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
struct bkey_i delete;
unsigned j;
- bch_btree_node_free_start(c, as, old_nodes[i]);
-
for (j = 0; j < nr_new_nodes; j++)
if (!bkey_cmp(old_nodes[i]->key.k.p,
new_nodes[j]->key.k.p))
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index eee969f74c61..5b4a743a1f8f 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -18,8 +18,9 @@
#include <linux/sort.h>
#include <trace/events/bcache.h>
-static void btree_interior_update_updated_root(struct btree_interior_update *,
- struct btree *);
+static void btree_interior_update_updated_root(struct cache_set *,
+ struct btree_interior_update *,
+ enum btree_id);
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -71,29 +72,6 @@ bool bch_btree_node_format_fits(struct btree *b, struct bkey_format *new_f)
/* Btree node freeing/allocation: */
/*
- * @b is going to be freed, allocate a pending_btree_node_free in @as:
- */
-void bch_btree_node_free_start(struct cache_set *c,
- struct btree_interior_update *as,
- struct btree *b)
-{
- struct pending_btree_node_free *d;
-
- BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
-
- mutex_lock(&c->btree_interior_update_lock);
-
- d = &as->pending[as->nr_pending++];
- d->index_update_done = false;
- d->seq = b->data->keys.seq;
- d->btree_id = b->btree_id;
- d->level = b->level;
- bkey_copy(&d->key, &b->key);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-/*
* We're doing the index update that makes @b unreachable, update stuff to
* reflect that:
*
@@ -448,7 +426,7 @@ static void bch_btree_set_root(struct btree_iter *iter, struct btree *b,
bch_btree_set_root_inmem(c, b, btree_reserve);
- btree_interior_update_updated_root(as, b);
+ btree_interior_update_updated_root(c, as, iter->btree_id);
/*
* Unlock old root after new root is visible:
@@ -778,12 +756,9 @@ relock:
/* Asynchronous interior node update machinery */
struct btree_interior_update *
-__bch_btree_interior_update_alloc(struct btree *nodes[], unsigned nr_nodes,
- struct btree_iter *iter)
+bch_btree_interior_update_alloc(struct cache_set *c)
{
- struct cache_set *c = iter->c;
struct btree_interior_update *as;
- unsigned i;
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
memset(as, 0, sizeof(*as));
@@ -794,15 +769,6 @@ __bch_btree_interior_update_alloc(struct btree *nodes[], unsigned nr_nodes,
bch_keylist_init(&as->parent_keys, as->inline_keys,
ARRAY_SIZE(as->inline_keys));
- for (i = 0; i < nr_nodes; i++) {
- bch_journal_pin_add_if_older(&c->journal,
- &nodes[i]->writes[0].journal,
- &as->journal, NULL);
- bch_journal_pin_add_if_older(&c->journal,
- &nodes[i]->writes[1].journal,
- &as->journal, NULL);
- }
-
mutex_lock(&c->btree_interior_update_lock);
list_add(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
@@ -810,12 +776,6 @@ __bch_btree_interior_update_alloc(struct btree *nodes[], unsigned nr_nodes,
return as;
}
-struct btree_interior_update *
-bch_btree_interior_update_alloc(struct btree *b, struct btree_iter *iter)
-{
- return __bch_btree_interior_update_alloc(&b, 1, iter);
-}
-
static void btree_interior_update_free(struct closure *cl)
{
struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
@@ -823,7 +783,7 @@ static void btree_interior_update_free(struct closure *cl)
mempool_free(as, &as->c->btree_interior_update_pool);
}
-static void btree_interior_update_pointers_written(struct closure *cl)
+static void btree_interior_update_nodes_reachable(struct closure *cl)
{
struct btree_interior_update *as =
container_of(cl, struct btree_interior_update, cl);
@@ -856,12 +816,19 @@ static void btree_interior_update_nodes_written(struct closure *cl)
struct cache_set *c = as->c;
struct btree *b;
+ if (bch_journal_error(&c->journal)) {
+ /* XXX what? */
+ }
+
+ /* XXX: missing error handling, damnit */
+
+ /* check for journal error, bail out if we flushed */
+
/*
* We did an update to a parent node where the pointers we added pointed
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
*/
-
retry:
mutex_lock(&c->btree_interior_update_lock);
switch (as->mode) {
@@ -921,7 +888,7 @@ retry:
/*
 * We don't have to wait on anything here (before
- * btree_interior_update_pointers_written frees the old nodes
+ * btree_interior_update_nodes_reachable frees the old nodes
* ondisk) - we've ensured that the very next journal write will
* have the pointer to the new root, and before the allocator
* can reuse the old nodes it'll have to do a journal commit:
@@ -930,17 +897,18 @@ retry:
}
mutex_unlock(&c->btree_interior_update_lock);
- continue_at(cl, btree_interior_update_pointers_written, system_wq);
+ continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
}
/*
* We're updating @b with pointers to nodes that haven't finished writing yet:
* block @b from being written until @as completes
*/
-static void btree_interior_update_updated_btree(struct btree_interior_update *as,
+static void btree_interior_update_updated_btree(struct cache_set *c,
+ struct btree_interior_update *as,
struct btree *b)
{
- mutex_lock(&as->c->btree_interior_update_lock);
+ mutex_lock(&c->btree_interior_update_lock);
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
BUG_ON(!btree_node_dirty(b));
@@ -949,23 +917,28 @@ static void btree_interior_update_updated_btree(struct btree_interior_update *as
as->b = b;
list_add(&as->write_blocked_list, &b->write_blocked);
- mutex_unlock(&as->c->btree_interior_update_lock);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ bch_journal_flush_seq_async(&c->journal, as->journal_seq, &as->cl);
- continue_at(&as->cl, btree_interior_update_nodes_written, system_freezable_wq);
+ continue_at(&as->cl, btree_interior_update_nodes_written,
+ system_freezable_wq);
}
-static void btree_interior_update_updated_root(struct btree_interior_update *as,
- struct btree *b)
+static void btree_interior_update_updated_root(struct cache_set *c,
+ struct btree_interior_update *as,
+ enum btree_id btree_id)
{
- struct btree_root *r = &as->c->btree_roots[b->btree_id];
+ struct btree_root *r = &c->btree_roots[btree_id];
- /*
- * XXX: if there's an outstanding btree_interior_update updating the
- * root, we have to do the dance with the old one
- */
+ mutex_lock(&c->btree_interior_update_lock);
- mutex_lock(&as->c->btree_interior_update_lock);
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ /*
+ * Old root might not be persistent yet - if so, redirect its
+ * btree_interior_update operation to point to us:
+ */
if (r->as) {
BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
@@ -975,14 +948,16 @@ static void btree_interior_update_updated_root(struct btree_interior_update *as,
closure_get(&as->cl);
}
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
- as->b = b;
+ as->b = r->b;
r->as = as;
- mutex_unlock(&as->c->btree_interior_update_lock);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ bch_journal_flush_seq_async(&c->journal, as->journal_seq, &as->cl);
- continue_at(&as->cl, btree_interior_update_nodes_written, system_freezable_wq);
+ continue_at(&as->cl, btree_interior_update_nodes_written,
+ system_freezable_wq);
}
/*
@@ -990,16 +965,51 @@ static void btree_interior_update_updated_root(struct btree_interior_update *as,
* nodes and thus outstanding btree_interior_updates - redirect @b's
* btree_interior_updates to point to this btree_interior_update:
*/
-void bch_btree_interior_update_will_free_node(struct btree_interior_update *as, struct btree *b)
+void bch_btree_interior_update_will_free_node(struct cache_set *c,
+ struct btree_interior_update *as,
+ struct btree *b)
{
- mutex_lock(&as->c->btree_interior_update_lock);
+ struct btree_interior_update *p, *n;
+ struct pending_btree_node_free *d;
+ struct bset_tree *t;
- while (!list_empty(&b->write_blocked)) {
- struct btree_interior_update *p =
- list_first_entry(&b->write_blocked,
- struct btree_interior_update,
- write_blocked_list);
+ mutex_lock(&c->btree_interior_update_lock);
+ /*
+ * Does this node have data that hasn't been written in the journal?
+ *
+ * If so, we have to wait for the corresponding journal entry to be
+ * written before making the new nodes reachable - we can't just carry
+ * over the bset->journal_seq tracking, since we'll be mixing those keys
+ * in with keys that aren't in the journal anymore:
+ */
+ for (t = b->keys.set; t <= b->keys.set + b->keys.nsets; t++)
+ as->journal_seq = max(as->journal_seq, t->data->journal_seq);
+
+ /*
+ * Does this node have unwritten data that has a pin on the journal?
+ *
+ * If so, transfer that pin to the btree_interior_update operation -
+ * note that if we're freeing multiple nodes, we only need to keep the
+ * oldest pin of any of the nodes we're freeing. We'll release the pin
+ * when the new nodes are persistent and reachable on disk:
+ */
+ bch_journal_pin_add_if_older(&c->journal,
+ &b->writes[0].journal,
+ &as->journal, NULL);
+ bch_journal_pin_add_if_older(&c->journal,
+ &b->writes[1].journal,
+ &as->journal, NULL);
+
+ /*
+ * Does this node have any btree_interior_update operations preventing
+ * it from being written?
+ *
+ * If so, redirect them to point to this btree_interior_update: we can
+ * write out our new nodes, but we won't make them visible until those
+ * operations complete
+ */
+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
p->mode = BTREE_INTERIOR_UPDATING_AS;
@@ -1009,7 +1019,17 @@ void bch_btree_interior_update_will_free_node(struct btree_interior_update *as,
closure_get(&as->cl);
}
- mutex_unlock(&as->c->btree_interior_update_lock);
+ /* Add this node to the list of nodes being freed: */
+ BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+ d = &as->pending[as->nr_pending++];
+ d->index_update_done = false;
+ d->seq = b->data->keys.seq;
+ d->btree_id = b->btree_id;
+ d->level = b->level;
+ bkey_copy(&d->key, &b->key);
+
+ mutex_unlock(&c->btree_interior_update_lock);
}
static void btree_node_interior_verify(struct btree *b)
@@ -1098,7 +1118,7 @@ bch_btree_insert_keys_interior(struct btree *b,
bch_keylist_dequeue(insert_keys);
}
- btree_interior_update_updated_btree(as, b);
+ btree_interior_update_updated_btree(iter->c, as, b);
btree_node_unlock_write(b, iter);
@@ -1237,7 +1257,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
BUG_ON(!parent && (b != btree_node_root(b)));
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(b)->level));
- bch_btree_interior_update_will_free_node(as, b);
+ bch_btree_interior_update_will_free_node(c, as, b);
n1 = btree_node_alloc_replacement(c, b, reserve);
@@ -1324,8 +1344,6 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
bch_btree_node_write(n1, &as->cl, NULL);
- bch_btree_node_free_start(c, as, b);
-
/* New nodes all written, now make them visible: */
if (parent) {
@@ -1435,7 +1453,7 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags,
goto out_get_locks;
}
- as = bch_btree_interior_update_alloc(b, iter);
+ as = bch_btree_interior_update_alloc(c);
btree_split(b, iter, NULL, reserve, as);
bch_btree_reserve_put(c, reserve);
@@ -1912,9 +1930,9 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
return PTR_ERR(reserve);
}
- as = bch_btree_interior_update_alloc(b, iter);
+ as = bch_btree_interior_update_alloc(c);
- bch_btree_interior_update_will_free_node(as, b);
+ bch_btree_interior_update_will_free_node(c, as, b);
n = btree_node_alloc_replacement(c, b, reserve);
six_unlock_write(&n->lock);
@@ -1923,8 +1941,6 @@ int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
bch_btree_node_write(n, &as->cl, NULL);
- bch_btree_node_free_start(c, as, b);
-
if (parent) {
bch_btree_insert_node(parent, iter,
&keylist_single(&n->key),
diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h
index 55b74b61c04b..97725296a2e8 100644
--- a/drivers/md/bcache/btree_update.h
+++ b/drivers/md/bcache/btree_update.h
@@ -91,8 +91,16 @@ struct btree_interior_update {
struct btree_interior_update *parent_as;
struct closure_waitlist wait;
+ /*
+ * We may be freeing nodes that were dirty, and thus had journal entries
+ * pinned: we need to transfer the oldest of those pins to the
+ * btree_interior_update operation, and release it when the new node(s)
+ * are all persistent and reachable:
+ */
struct journal_entry_pin journal;
+ u64 journal_seq;
+
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
@@ -114,9 +122,6 @@ struct btree_interior_update {
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
-void bch_btree_node_free_start(struct cache_set *, struct btree_interior_update *,
- struct btree *);
-
void bch_btree_node_free_inmem(struct btree_iter *, struct btree *);
void bch_btree_node_free_never_inserted(struct cache_set *, struct btree *);
@@ -129,11 +134,12 @@ struct btree *__btree_node_alloc_replacement(struct cache_set *,
struct btree *btree_node_alloc_replacement(struct cache_set *, struct btree *,
struct btree_reserve *);
-struct btree_interior_update *__bch_btree_interior_update_alloc(struct btree *[], unsigned,
- struct btree_iter *);
-struct btree_interior_update *bch_btree_interior_update_alloc(struct btree *, struct btree_iter *);
+struct btree_interior_update *
+bch_btree_interior_update_alloc(struct cache_set *);
-void bch_btree_interior_update_will_free_node(struct btree_interior_update *, struct btree *);
+void bch_btree_interior_update_will_free_node(struct cache_set *,
+ struct btree_interior_update *,
+ struct btree *);
void bch_btree_set_root_initial(struct cache_set *, struct btree *,
struct btree_reserve *);