author		Slava Pestov <sp@daterainc.com>		2015-01-31 21:40:34 -0800
committer	Kent Overstreet <kmo@daterainc.com>	2015-02-12 23:43:39 -0800
commit		cee9c8e9ee21cecb1339a567bca5c5781fbee29f (patch)
tree		b54ad5ffb74f5434930bc0d27e816f1b127ae641
parent		e5db677cbaf80db41437b94ce13414a62a131830 (diff)
bcache: do btree node flushing in a work item

This is the first patch preparing us for background journal reclaim.

Change-Id: Ic8622b75eba32ef99cf8694a68afc459a57cc238
-rw-r--r--	drivers/md/bcache/journal.c		172
-rw-r--r--	drivers/md/bcache/journal_types.h	  2
2 files changed, 117 insertions, 57 deletions
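
The patch splits journal reclaim in two: a fast, non-blocking pass (journal_reclaim_fast) that runs from IO submission context, and the slow part, flushing btree nodes, which moves into a work item (journal_reclaim_work) queued on system_long_wq. What follows is a minimal sketch of that deferral pattern; the demo_* names are hypothetical stand-ins, but container_of(), INIT_WORK(), queue_work() and system_long_wq are the same primitives the patch uses.

#include <linux/workqueue.h>
#include <linux/spinlock.h>

/* Hypothetical stand-in for struct journal: a lock guarding the
 * fast-path state, plus the work item this patch adds. */
struct demo_journal {
	spinlock_t		lock;
	struct work_struct	reclaim_work;
};

static void demo_reclaim_work(struct work_struct *work)
{
	/* Recover the containing object from the work pointer */
	struct demo_journal *j =
		container_of(work, struct demo_journal, reclaim_work);

	spin_lock(&j->lock);
	/* ... decide how much must be flushed ... */
	spin_unlock(&j->lock);

	/* Slow, blocking work (writing out btree nodes) runs here,
	 * outside the spinlock and outside IO submission context. */
}

static void demo_journal_init(struct demo_journal *j)
{
	spin_lock_init(&j->lock);
	INIT_WORK(&j->reclaim_work, demo_reclaim_work);
}

/* Called where blocking is forbidden: queueing is cheap, and the
 * handler runs later on system_long_wq, as in the patch. */
static void demo_journal_kick(struct demo_journal *j)
{
	queue_work(system_long_wq, &j->reclaim_work);
}
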
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 027293ba9bba..950e2bc2a951 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -724,29 +724,87 @@ static void journal_entry_no_room(struct cache_set *c)
}
/**
- * journal_next_bucket - move on to the next journal bucket if possible
+ * journal_reclaim_work - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
*/
-static void journal_next_bucket(struct cache_set *c)
+static void journal_reclaim_work(struct work_struct *work)
{
- struct bkey_i_extent *e = bkey_i_to_extent(&c->journal.key);
- struct bch_extent_ptr *ptr;
+ struct cache_set *c = container_of(work, struct cache_set,
+ journal.reclaim_work);
struct cache *ca;
- u64 last_seq;
u64 oldest_seq = 0;
unsigned iter;
- atomic_t p;
bool flush = false;
- bool discard_done = true;
pr_debug("started");
- lockdep_assert_held(&c->journal.lock);
+ spin_lock(&c->journal.lock);
- /*
- * only supposed to be called when we're out of space/haven't started a
- * new journal entry
- */
- BUG_ON(c->journal.u64s_remaining);
- BUG_ON(c->journal.cur->data->u64s);
+ rcu_read_lock();
+
+ for_each_cache_rcu(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+ unsigned next = (ja->cur_idx + 1) %
+ bch_nr_journal_buckets(&ca->sb);
+
+ if ((CACHE_TIER(&ca->mi) != 0)
+ || (CACHE_STATE(&ca->mi) != CACHE_ACTIVE))
+ continue;
+
+ /* No journal buckets available for writing on this device */
+ if (next == ja->last_idx) {
+ BUG_ON(ja->last_idx != ja->discard_idx);
+ oldest_seq = max_t(u64, oldest_seq,
+ ja->seq[next]);
+ flush = true;
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (fifo_free(&c->journal.pin) <= 1) {
+ size_t used = fifo_used(&c->journal.pin);
+
+ /*
+ * Write out enough btree nodes to free up ~1%
+ * the FIFO
+ */
+ oldest_seq = max_t(u64, oldest_seq,
+ last_seq(&c->journal)
+ + (used >> 8));
+ flush = true;
+ }
+
+ if (!flush) {
+ spin_unlock(&c->journal.lock);
+ return;
+ }
+
+ BUG_ON(oldest_seq == 0);
+ oldest_seq -= last_seq(&c->journal);
+ spin_unlock(&c->journal.lock);
+ bch_btree_write_oldest(c, oldest_seq);
+}
+
+/**
+ * journal_reclaim_fast - do the fast part of journal reclaim
+ *
+ * Called from IO submission context, does not block. Cleans up after btree
+ * write completions by advancing the journal pin and each cache's last_idx,
+ * and kicking off discards.
+ *
+ * Returns true if background reclaim needs to run.
+ */
+static bool journal_reclaim_fast(struct cache_set *c)
+{
+ struct cache *ca;
+ u64 last_seq;
+ atomic_t p;
+ unsigned iter;
+ bool need_reclaim = false;
+
+ lockdep_assert_held(&c->journal.lock);
/*
* Unpin journal entries whose reference counts reached zero, meaning
@@ -755,20 +813,21 @@ static void journal_next_bucket(struct cache_set *c)
while (!atomic_read(&fifo_front(&c->journal.pin)))
fifo_pop(&c->journal.pin, p);
+ if (fifo_free(&c->journal.pin) <= 1)
+ need_reclaim = true;
+
last_seq = last_seq(&c->journal);
+ rcu_read_lock();
+
/*
* Advance last_idx to point to the oldest journal entry containing
* btree node updates that have not yet been written out
- *
- * Advance discard_idx toward last_idx by discarding journal buckets
- * that have had all btree node updates written out
*/
-
- rcu_read_lock();
-
for_each_cache_rcu(ca, c, iter) {
struct journal_device *ja = &ca->journal;
+ unsigned next = (ja->cur_idx + 1) %
+ bch_nr_journal_buckets(&ca->sb);
if ((CACHE_TIER(&ca->mi) != 0)
|| (CACHE_STATE(&ca->mi) != CACHE_ACTIVE))
@@ -779,9 +838,42 @@ static void journal_next_bucket(struct cache_set *c)
ja->last_idx = (ja->last_idx + 1) %
bch_nr_journal_buckets(&ca->sb);
- discard_done &= do_journal_discard(ca);
+ if (next == ja->last_idx)
+ need_reclaim = true;
+
+ do_journal_discard(ca);
}
+ rcu_read_unlock();
+
+ return need_reclaim;
+}
+
+/**
+ * journal_next_bucket - move on to the next journal bucket if possible
+ */
+static void journal_next_bucket(struct cache_set *c)
+{
+ struct bkey_i_extent *e = bkey_i_to_extent(&c->journal.key);
+ struct bch_extent_ptr *ptr;
+ struct cache *ca;
+ unsigned iter;
+
+ pr_debug("started");
+ lockdep_assert_held(&c->journal.lock);
+
+ /*
+ * only supposed to be called when we're out of space/haven't started a
+ * new journal entry
+ */
+ BUG_ON(c->journal.u64s_remaining);
+ BUG_ON(c->journal.cur->data->u64s);
+
+ if (journal_reclaim_fast(c))
+ queue_work(system_long_wq, &c->journal.reclaim_work);
+
+ rcu_read_lock();
+
/*
* Drop any pointers to devices that have been removed, are no longer
* empty, or filled up their current journal bucket:
@@ -814,21 +906,8 @@ static void journal_next_bucket(struct cache_set *c)
continue;
/* No journal buckets available for writing on this device */
- if (next == ja->discard_idx) {
- /*
- * No work for discard to do -- tell the caller to
- * flush all btree nodes up to the end of the oldest
- * journal bucket
- */
- if (discard_done) {
- BUG_ON(ja->last_idx != ja->discard_idx);
- oldest_seq = max_t(u64, oldest_seq,
- ja->seq[next]);
- flush = true;
- }
-
+ if (next == ja->discard_idx)
continue;
- }
BUG_ON(bch_extent_ptrs(e) >= BKEY_EXTENT_PTRS_MAX);
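
Both the fast path and the work item apply the same ring arithmetic to each device's journal buckets: cur_idx is the bucket currently being written, last_idx the oldest bucket still pinning unflushed btree updates, and the ring is full when the slot after cur_idx collides with last_idx. A standalone sketch of that test (demo_* names are hypothetical; nr stands in for bch_nr_journal_buckets(&ca->sb)):

/* Hypothetical helper mirroring the (cur_idx + 1) % nr test above. */
static bool demo_journal_ring_full(unsigned cur_idx, unsigned last_idx,
				   unsigned nr)
{
	unsigned next = (cur_idx + 1) % nr;

	/* When the next write slot reaches the oldest un-reclaimed
	 * bucket, every bucket pins unwritten btree updates and
	 * background reclaim has to make progress. */
	return next == last_idx;
}
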
@@ -855,28 +934,6 @@ static void journal_next_bucket(struct cache_set *c)
c->journal.sectors_free = 0;
rcu_read_unlock();
-
- if (fifo_free(&c->journal.pin) <= 1) {
- size_t used = fifo_used(&c->journal.pin);
-
- trace_bcache_journal_fifo_full(c);
- /*
- * Write out enough btree nodes to free up ~1%
- * the FIFO
- */
- oldest_seq = max_t(u64, oldest_seq,
- last_seq(&c->journal)
- + (used >> 8));
- flush = true;
- }
-
- if (flush) {
- BUG_ON(oldest_seq == 0);
- oldest_seq -= last_seq(&c->journal);
- spin_unlock(&c->journal.lock);
- bch_btree_write_oldest(c, oldest_seq);
- spin_lock(&c->journal.lock);
- }
}
void bch_journal_next(struct journal *j)
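
journal_reclaim_work() folds two pressure signals into one flush target: a device whose bucket ring is full pins everything up to ja->seq[next], and a nearly full pin FIFO asks for used >> 8 entries back (used/256, so the "~1%" in the comment is approximate). The target is then converted to a distance from last_seq before bch_btree_write_oldest() is called. A sketch of that computation, with hypothetical stand-in scalars in place of the real per-device iteration:

#include <linux/kernel.h>	/* max_t */

/* Stand-in scalars replace the real cache iteration and fifo
 * accounting; the names here are illustrative only. */
static u64 demo_flush_target(u64 last_seq, u64 seq_pinned_by_full_ring,
			     size_t pin_used, bool pin_nearly_full)
{
	u64 oldest_seq = seq_pinned_by_full_ring;

	/* fifo_free(&pin) <= 1: ask for about used/256 entries back */
	if (pin_nearly_full)
		oldest_seq = max_t(u64, oldest_seq,
				   last_seq + (pin_used >> 8));

	/* bch_btree_write_oldest() takes a count of entries past
	 * last_seq, not an absolute sequence number. */
	return oldest_seq - last_seq;
}
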
@@ -1265,6 +1322,7 @@ int bch_journal_alloc(struct cache_set *c)
spin_lock_init(&j->lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ INIT_WORK(&j->reclaim_work, journal_reclaim_work);
c->journal.delay_ms = 10;
diff --git a/drivers/md/bcache/journal_types.h b/drivers/md/bcache/journal_types.h
index ec8b01c2acc9..2e7eea3f2dae 100644
--- a/drivers/md/bcache/journal_types.h
+++ b/drivers/md/bcache/journal_types.h
@@ -63,6 +63,8 @@ struct journal {
BKEY_PADDED(key);
+ struct work_struct reclaim_work;
+
/*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
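
With the work_struct in place in struct journal, any context that notices reclaim pressure can hand the blocking work off. In this patch the only such caller is journal_next_bucket(); a hypothetical additional caller would follow the same shape:

/* Hypothetical caller, mirroring journal_next_bucket() above: run
 * the non-blocking pass under the lock, and queue the work item
 * only when it reports pressure. */
static void demo_note_journal_pressure(struct cache_set *c)
{
	spin_lock(&c->journal.lock);
	if (journal_reclaim_fast(c))
		queue_work(system_long_wq, &c->journal.reclaim_work);
	spin_unlock(&c->journal.lock);
}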