author     Kent Overstreet <kent.overstreet@gmail.com>    2016-07-29 21:45:25 -0800
committer  Kent Overstreet <kent.overstreet@gmail.com>    2016-07-29 21:45:27 -0800
commit     a32b9fd27ff3caa3f5660c80db4ddb44f105204b (patch)
tree       02bd61c806ba1e50862988d68f5c8c88e81aae9f
parent     e4833edff3a486aa711f5ff8de8dae7e4f089d97 (diff)
bcache: Fix a bug with allocating journal writes
Before opening a new journal entry, we have to ensure there's enough space to write both the new entry and the previous entry (which won't have been written yet), since we don't allocate space for a journal entry until we actually do the write. This calculation was wrong, meaning we could go to write out a journal entry and discover we didn't have space for it.
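To make the new calculation easier to follow, here is a minimal standalone sketch of the per-device check this patch introduces in journal_entry_sectors(). The variable names (prev_sectors, sectors_available, sectors_free, bucket_size) follow the diff below, but struct journal_dev and the buckets_required() helper are hypothetical, added purely for illustration:

/*
 * Sketch of the per-device space check (names follow the diff;
 * struct journal_dev and buckets_required() are hypothetical helpers).
 */
#include <stdbool.h>

struct journal_dev {
	unsigned bucket_size;	/* sectors per journal bucket on this device */
	unsigned sectors_free;	/* sectors left in the currently open bucket */
	bool	 has_prev_entry;/* unwritten previous entry already points here */
};

/*
 * How many free journal buckets must this device have before we can
 * safely open a new entry of new_sectors, given that the previous
 * entry of prev_sectors hasn't been written out yet?
 */
static unsigned buckets_required(const struct journal_dev *d,
				 unsigned prev_sectors,
				 unsigned new_sectors)
{
	unsigned required = 0;

	if (d->has_prev_entry) {
		/* previous entry spills out of the open bucket if it doesn't fit */
		if (prev_sectors > d->sectors_free)
			required++;
		/* another bucket is needed if both entries don't fit */
		if (prev_sectors + new_sectors > d->sectors_free)
			required++;
	} else {
		/* previous entry needs a fresh bucket on this device */
		required++;
		/* and possibly a second one for the new entry */
		if (prev_sectors + new_sectors > d->bucket_size)
			required++;
	}

	return required;
}

In the patch itself, a device only counts toward nr_devs if journal_dev_buckets_available(j, ca) is at least this number, and if too few devices qualify the function now returns 0 sectors rather than failing outright.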
-rw-r--r--  drivers/md/bcache/journal.c     62
-rw-r--r--  include/uapi/linux/bcache.h     26
2 files changed, 64 insertions(+), 24 deletions(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index c33bdbcfbe29..5bdb3d551548 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -1036,27 +1036,61 @@ static int journal_entry_sectors(struct journal *j)
{
struct cache_set *c = container_of(j, struct cache_set, journal);
struct cache *ca;
- unsigned i, nr_buckets = UINT_MAX, sectors = JOURNAL_BUF_SECTORS,
- nr_devs = 0;
+ struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+ struct journal_buf *w = journal_prev_buf(j);
+ union journal_res_state s = j->reservations;
+ bool prev_unwritten = journal_state_count(s, !s.idx);
+ unsigned prev_sectors = prev_unwritten
+ ? __set_blocks(w->data,
+ le32_to_cpu(w->data->u64s),
+ block_bytes(c)) * c->sb.block_size
+ : 0;
+ unsigned sectors_available = JOURNAL_BUF_SECTORS;
+ unsigned i, nr_online = 0, nr_devs = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
- nr_devs++;
- nr_buckets = min(nr_buckets,
- journal_dev_buckets_available(j, ca));
- sectors = min_t(unsigned, sectors, ca->mi.bucket_size);
+ unsigned buckets_required = 0;
+
+ sectors_available = min_t(unsigned, sectors_available,
+ ca->mi.bucket_size);
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, if we haven't started the write
+ * for the previous entry we have to make sure we have space for
+ * it too:
+ */
+ if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+ if (prev_sectors > ca->journal.sectors_free)
+ buckets_required++;
+
+ if (prev_sectors + sectors_available >
+ ca->journal.sectors_free)
+ buckets_required++;
+ } else {
+ if (prev_sectors + sectors_available >
+ ca->mi.bucket_size)
+ buckets_required++;
+
+ buckets_required++;
+ }
+
+ if (journal_dev_buckets_available(j, ca) >= buckets_required)
+ nr_devs++;
+ nr_online++;
}
rcu_read_unlock();
- if (nr_devs < c->opts.metadata_replicas)
+ if (nr_online < c->opts.metadata_replicas)
return -EROFS;
- if (nr_buckets == UINT_MAX)
- nr_buckets = 0;
+ if (nr_devs < c->opts.metadata_replicas)
+ return 0;
- return nr_buckets ? sectors : 0;
+ return sectors_available;
}
/*
@@ -1288,7 +1322,8 @@ int bch_cache_journal_alloc(struct cache *ca)
* is smaller:
*/
ret = bch_set_nr_journal_buckets(ca,
- clamp_t(unsigned, ca->mi.nbuckets >> 8, 8,
+ clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ BCH_JOURNAL_BUCKETS_MIN,
min(1 << 10,
(1 << 20) / ca->mi.bucket_size)));
if (ret)
@@ -1684,7 +1719,10 @@ static void journal_write(struct closure *cl)
block_bytes(c)) * c->sb.block_size;
if (journal_write_alloc(j, sectors)) {
- BUG();
+ bch_journal_halt(j);
+ bch_err(c, "Unable to allocate journal write");
+ bch_fatal_error(c);
+ closure_return_with_destructor(cl, journal_write_done);
}
bch_check_mark_super(c, &j->key, true);
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index efc4271f8428..6006d50418cf 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -975,18 +975,8 @@ static inline __u64 bset_magic(struct cache_sb *sb)
return __le64_to_cpu(sb->set_magic) ^ BSET_MAGIC;
}
-/*
- * Journal
- *
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
+/* Journal */
+
#define BCACHE_JSET_VERSION_UUIDv1 1
#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */
@@ -1026,6 +1016,16 @@ enum {
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
};
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
struct jset {
__le64 csum;
__le64 magic;
@@ -1049,6 +1049,8 @@ struct jset {
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+#define BCH_JOURNAL_BUCKETS_MIN 20
+
/* Bucket prios/gens */
struct prio_set {