summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Slava Pestov <sp@daterainc.com> 2015-01-30 18:08:24 -0800
committer Kent Overstreet <kmo@daterainc.com> 2015-02-12 23:43:32 -0800
commit 9f239f94fcea7ce48c75bd8ae9eb510b8d210a82 (patch)
tree 4eacfa385e8e7e93bb65a0aa23cad7b2bd05abde
parent 62743d00d3a0e9482bda4ac1cf34922c2e3431f6 (diff)
bcache: fix journal reclaim deadlock during journal replay
Journal reclaim has to work during journal replay, because the allocator might need to invalidate buckets and write out prios and gens, or because we might need to set a new btree root.

The recent patch "Fix journal replay" made this work by tracking reference counts on journal entries during replay in the same way that we do during normal operation, except that a reference is dropped once an entry has been replayed rather than when an entry has been written out.

The problem with that patch is that we might start replay with a completely full journal, and be unable to add any new journal entries until the first bucket of entries has been replayed. If replaying the first bucket of entries required allocating buckets, we would deadlock in the allocator thread while waiting on a journal entry to write out prios and gens, because we would be unable to reclaim any journal buckets -- no entries have been replayed yet.

Dig ourselves out of this hole by priming the allocator freelists with completely free buckets: the existing logic that primes the PRIO freelist is extended to prime all freelists.

Also, wake up any threads waiting on reclaim when we drop a journal entry's reference count.

Finally, add a BUG_ON() to ensure that flushing btree nodes makes forward progress during replay.

Change-Id: I608b03bdf196834d22cda16d427555da50c344e5
-rw-r--r-- drivers/md/bcache/alloc.c | 53
-rw-r--r-- drivers/md/bcache/btree.c | 2
-rw-r--r-- drivers/md/bcache/journal.c | 1
3 files changed, 36 insertions, 20 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 450969972863..7ced8ebc2611 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -536,12 +536,9 @@ static bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *g)
static void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *g)
{
lockdep_assert_held(&ca->freelist_lock);
- BUG_ON(!bch_can_invalidate_bucket(ca, g));
/* Ordering matters: see bch_mark_data_bucket() */
- /* this is what makes ptrs to the bucket invalid */
- ca->bucket_gens[g - ca->buckets]++;
/* bucket mark updates imply a write barrier */
bch_mark_alloc_bucket(ca, g);
@@ -555,6 +552,10 @@ static void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *g)
static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *g)
{
spin_lock(&ca->freelist_lock);
+
+ /* this is what makes ptrs to the bucket invalid */
+ ca->bucket_gens[g - ca->buckets]++;
+
__bch_invalidate_one_bucket(ca, g);
BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
spin_unlock(&ca->freelist_lock);
@@ -625,8 +626,10 @@ static void invalidate_buckets_lru(struct cache *ca)
* kick stuff and retry us
*/
while (!fifo_full(&ca->free_inc) &&
- heap_pop(&ca->heap, e, bucket_max_cmp))
+ heap_pop(&ca->heap, e, bucket_max_cmp)) {
+ BUG_ON(!bch_can_invalidate_bucket(ca, e.g));
bch_invalidate_one_bucket(ca, e.g);
+ }
mutex_unlock(&ca->set->bucket_lock);
mutex_unlock(&ca->heap_lock);
@@ -1381,38 +1384,48 @@ void bch_open_buckets_init(struct cache_set *c)
}
/*
- * bch_cache_allocator_start - put some unused buckets directly on the prio
- * freelist, start allocator
+ * bch_cache_allocator_start - fill freelists directly with completely unused
+ * buckets
*
* The allocator thread needs freed buckets to rewrite the prios and gens, and
* it needs to rewrite prios and gens in order to free buckets.
*
- * This is only safe for buckets that have no live data in them, which
- * there should always be some of when this function is called.
+ * Don't increment gens. We are only re-using completely free buckets here, so
+ * there are no existing pointers into them.
+ *
+ * Also, we can't increment gens until we re-write prios and gens, but we
+ * can't do that until we can write a journal entry.
+ *
+ * If the journal is completely full, we cannot write a journal entry until we
+ * reclaim a journal bucket, and we cannot do that until we possibly allocate
+ * some buckets for btree nodes.
+ *
+ * So dig ourselves out of that hole here.
+ *
+ * This is only safe for buckets that have no live data in them, which there
+ * should always be some of when this function is called, since the last time
+ * we shut down there should have been unused buckets stranded on freelists.
*/
const char *bch_cache_allocator_start(struct cache *ca)
{
struct task_struct *k;
struct bucket *g;
+ spin_lock(&ca->freelist_lock);
for_each_bucket(g, ca) {
- spin_lock(&ca->freelist_lock);
- if (fifo_used(&ca->free_inc) >= prio_buckets(ca)) {
- spin_unlock(&ca->freelist_lock);
- goto done;
- }
-
if (bch_can_invalidate_bucket(ca, g) &&
!g->mark.cached_sectors) {
- __bch_invalidate_one_bucket(ca, g);
- fifo_push(&ca->free_inc, g - ca->buckets);
+ if (__bch_allocator_push(ca, g - ca->buckets))
+ __bch_invalidate_one_bucket(ca, g);
+ else
+ break;
}
-
- spin_unlock(&ca->freelist_lock);
}
+ spin_unlock(&ca->freelist_lock);
+
+ if (!fifo_full(&ca->free[RESERVE_PRIO]))
+ return "couldn't find enough available buckets to write prios";
- return "couldn't find enough available buckets to write prios";
-done:
k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
if (IS_ERR(k))
return "error starting allocator thread";
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index dd9a2dc35f92..1baaa2a3bfc9 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -663,6 +663,8 @@ void bch_btree_write_oldest(struct cache_set *c, u64 oldest_seq)
closure_sync(&cl);
trace_bcache_journal_write_oldest_done(c, oldest_seq, written);
+
+ BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !written);
}
/*
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 5c7d1b41a307..d5ec535053cb 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -559,6 +559,7 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list)
}
atomic_dec(c->journal.cur_pin);
+ wake_up(&c->journal.wait);
n = i->j.seq + 1;
entries++;