Diffstat (limited to 'libbcachefs/alloc.c')
-rw-r--r--	libbcachefs/alloc.c	898
1 file changed, 468 insertions(+), 430 deletions(-)
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 9d54dd80..5a258cb6 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -75,7 +75,6 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static void __bch2_bucket_free(struct bch_dev *, struct bucket *);
static void bch2_recalc_min_prio(struct bch_dev *, int);
/* Allocation groups: */
@@ -206,268 +205,244 @@ static void pd_controllers_update(struct work_struct *work)
c->pd_controllers_update_seconds * HZ);
}
-/*
- * Bucket priorities/gens:
- *
- * For each bucket, we store on disk its
- * 8 bit gen
- * 16 bit priority
- *
- * See alloc.c for an explanation of the gen. The priority is used to implement
- * lru (and in the future other) cache replacement policies; for most purposes
- * it's just an opaque integer.
- *
- * The gens and the priorities don't have a whole lot to do with each other, and
- * it's actually the gens that must be written out at specific times - it's no
- * big deal if the priorities don't get written, if we lose them we just reuse
- * buckets in suboptimal order.
- *
- * On disk they're stored in a packed array, and in as many buckets are required
- * to fit them all. The buckets we use to store them form a list; the journal
- * header points to the first bucket, the first bucket points to the second
- * bucket, et cetera.
- *
- * This code is used by the allocation code; periodically (whenever it runs out
- * of buckets to allocate from) the allocation code will invalidate some
- * buckets, but it can't use those buckets until their new gens are safely on
- * disk.
- */
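+/* Size of an alloc key's value, in u64s, given which optional fields are set: */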
+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
+{
+ unsigned bytes = offsetof(struct bch_alloc, data);
+
+ if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+ bytes += 2;
+ if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+ bytes += 2;
+
+ return DIV_ROUND_UP(bytes, sizeof(u64));
+}
-static int prio_io(struct bch_dev *ca, uint64_t bucket, int op)
+static const char *bch2_alloc_invalid(const struct bch_fs *c,
+ struct bkey_s_c k)
{
- bio_init(ca->bio_prio, ca->bio_prio->bi_inline_vecs, bucket_pages(ca));
- ca->bio_prio->bi_opf = op|REQ_SYNC|REQ_META;
- ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
- ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
- ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
- bch2_bio_map(ca->bio_prio, ca->disk_buckets);
-
- return submit_bio_wait(ca->bio_prio);
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
+
+ switch (k.k->type) {
+ case BCH_ALLOC: {
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+ if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
+ return "incorrect value size";
+ break;
+ }
+ default:
+ return "invalid type";
+ }
+
+ return NULL;
}
-static struct nonce prio_nonce(struct prio_set *p)
+static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
- return (struct nonce) {{
- [0] = 0,
- [1] = p->nonce[0],
- [2] = p->nonce[1],
- [3] = p->nonce[2]^BCH_NONCE_PRIO,
- }};
+ buf[0] = '\0';
+
+ switch (k.k->type) {
+ case BCH_ALLOC:
+ break;
+ }
}
-int bch2_prio_write(struct bch_dev *ca)
+const struct bkey_ops bch2_bkey_alloc_ops = {
+ .key_invalid = bch2_alloc_invalid,
+ .val_to_text = bch2_alloc_to_text,
+};
+
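+/* Read a 1/2/4 byte little-endian field from an alloc value and advance *p: */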
+static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
{
- struct bch_fs *c = ca->fs;
- struct journal *j = &c->journal;
- struct journal_res res = { 0 };
- bool need_new_journal_entry;
- int i, ret = 0;
+ unsigned v;
- if (c->opts.nochanges)
- return 0;
+ switch (bytes) {
+ case 1:
+ v = **p;
+ break;
+ case 2:
+ v = le16_to_cpup((void *) *p);
+ break;
+ case 4:
+ v = le32_to_cpup((void *) *p);
+ break;
+ default:
+ BUG();
+ }
- mutex_lock(&ca->prio_write_lock);
- trace_prio_write_start(ca);
+ *p += bytes;
+ return v;
+}
- ca->need_prio_write = false;
+static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v)
+{
+ switch (bytes) {
+ case 1:
+ **p = v;
+ break;
+ case 2:
+ *((__le16 *) *p) = cpu_to_le16(v);
+ break;
+ case 4:
+ *((__le32 *) *p) = cpu_to_le32(v);
+ break;
+ default:
+ BUG();
+ }
- atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
- &ca->meta_sectors_written);
+ *p += bytes;
+}
- for (i = prio_buckets(ca) - 1; i >= 0; --i) {
- struct bucket *g;
- struct prio_set *p = ca->disk_buckets;
- struct bucket_disk *d = p->data;
- struct bucket_disk *end = d + prios_per_bucket(ca);
- size_t r;
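+/* Initialize in-memory bucket state (gen, read/write prios) from an alloc key: */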
+static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_dev *ca;
+ struct bkey_s_c_alloc a;
+ struct bucket_mark new;
+ struct bucket *g;
+ const u8 *d;
- for (r = i * prios_per_bucket(ca);
- r < ca->mi.nbuckets && d < end;
- r++, d++) {
- g = ca->buckets + r;
- d->prio[READ] = cpu_to_le16(g->prio[READ]);
- d->prio[WRITE] = cpu_to_le16(g->prio[WRITE]);
- d->gen = ca->buckets[r].mark.gen;
- }
+ if (k.k->type != BCH_ALLOC)
+ return;
- p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
- p->magic = cpu_to_le64(pset_magic(c));
- get_random_bytes(&p->nonce, sizeof(p->nonce));
+ a = bkey_s_c_to_alloc(k);
+ ca = c->devs[a.k->p.inode];
- spin_lock(&ca->prio_buckets_lock);
- r = bch2_bucket_alloc(ca, RESERVE_PRIO);
- BUG_ON(!r);
+ if (a.k->p.offset >= ca->mi.nbuckets)
+ return;
- /*
- * goes here before dropping prio_buckets_lock to guard against
- * it getting gc'd from under us
- */
- ca->prio_buckets[i] = r;
- bch2_mark_metadata_bucket(ca, ca->buckets + r,
- BUCKET_PRIOS, false);
- spin_unlock(&ca->prio_buckets_lock);
-
- SET_PSET_CSUM_TYPE(p, bch2_meta_checksum_type(c));
-
- bch2_encrypt(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- p->encrypted_start,
- bucket_bytes(ca) -
- offsetof(struct prio_set, encrypted_start));
-
- p->csum = bch2_checksum(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- (void *) p + sizeof(p->csum),
- bucket_bytes(ca) - sizeof(p->csum));
-
- ret = prio_io(ca, r, REQ_OP_WRITE);
- if (bch2_dev_fatal_io_err_on(ret, ca,
- "prio write to bucket %zu", r) ||
- bch2_meta_write_fault("prio"))
- goto err;
- }
+ g = ca->buckets + a.k->p.offset;
+ bucket_cmpxchg(g, new, ({
+ new.gen = a.v->gen;
+ new.gen_valid = 1;
+ }));
+
+ d = a.v->data;
+ if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+ g->prio[READ] = get_alloc_field(&d, 2);
+ if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+ g->prio[WRITE] = get_alloc_field(&d, 2);
+}
- spin_lock(&j->lock);
- j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]);
- j->nr_prio_buckets = max_t(unsigned,
- ca->dev_idx + 1,
- j->nr_prio_buckets);
- spin_unlock(&j->lock);
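+/*
+ * Read bucket gens and prios from the alloc btree, plus any alloc keys still
+ * sitting in the journal awaiting replay:
+ */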
+int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+ struct journal_replay *r;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
- do {
- unsigned u64s = jset_u64s(0);
+ if (!c->btree_roots[BTREE_ID_ALLOC].b)
+ return 0;
- if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
- break;
+ for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
+ bch2_alloc_read_key(c, k);
+ bch2_btree_iter_cond_resched(&iter);
+ }
- ret = bch2_journal_res_get(j, &res, u64s, u64s);
- if (ret)
- goto err;
+ ret = bch2_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
- need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
- ca->dev_idx + 1;
- bch2_journal_res_put(j, &res);
+ list_for_each_entry(r, journal_replay_list, list) {
+ struct bkey_i *k, *n;
+ struct jset_entry *entry;
- ret = bch2_journal_flush_seq(j, res.seq);
- if (ret)
- goto err;
- } while (need_new_journal_entry);
+ for_each_jset_key(k, n, entry, &r->j)
+ if (entry->btree_id == BTREE_ID_ALLOC)
+ bch2_alloc_read_key(c, bkey_i_to_s_c(k));
+ }
- /*
- * Don't want the old priorities to get garbage collected until after we
- * finish writing the new ones, and they're journalled
- */
+ return 0;
+}
- spin_lock(&ca->prio_buckets_lock);
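+/*
+ * Write a single bucket's alloc key back to the alloc btree, retrying if the
+ * update races and returns -EINTR:
+ */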
+static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g, struct btree_iter *iter,
+ u64 *journal_seq)
+{
+ struct bucket_mark m = READ_ONCE(g->mark);
+ __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
+ struct bkey_i_alloc *a;
+ u8 *d;
+ int ret;
- for (i = 0; i < prio_buckets(ca); i++) {
- if (ca->prio_last_buckets[i])
- __bch2_bucket_free(ca,
- &ca->buckets[ca->prio_last_buckets[i]]);
+ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets));
- ca->prio_last_buckets[i] = ca->prio_buckets[i];
- }
+ do {
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ break;
- spin_unlock(&ca->prio_buckets_lock);
+ a = bkey_alloc_init(&alloc_key.k);
+ a->k.p = iter->pos;
+ a->v.fields = 0;
+ a->v.gen = m.gen;
+ set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
+
+ d = a->v.data;
+ if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+ put_alloc_field(&d, 2, g->prio[READ]);
+ if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+ put_alloc_field(&d, 2, g->prio[WRITE]);
+
+ bch2_btree_iter_set_pos(iter, a->k.p);
+ ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_NOWAIT,
+ BTREE_INSERT_ENTRY(iter, &a->k_i));
+ bch2_btree_iter_cond_resched(iter);
+ } while (ret == -EINTR);
- trace_prio_write_end(ca);
-err:
- mutex_unlock(&ca->prio_write_lock);
return ret;
}
-int bch2_prio_read(struct bch_dev *ca)
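+/* Write out the alloc key for a single bucket; used when replaying journalled alloc keys: */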
+int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
{
- struct bch_fs *c = ca->fs;
- struct prio_set *p = ca->disk_buckets;
- struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
- struct bucket_mark new;
- struct bch_csum csum;
- unsigned bucket_nr = 0;
- u64 bucket, expect, got;
- size_t b;
- int ret = 0;
+ struct bch_dev *ca;
+ struct bucket *g;
+ struct btree_iter iter;
+ int ret;
- if (ca->prio_read_done)
- return 0;
+ lockdep_assert_held(&c->state_lock);
- ca->prio_read_done = true;
+ if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+ return 0;
- spin_lock(&c->journal.lock);
- bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]);
- spin_unlock(&c->journal.lock);
+ ca = c->devs[pos.inode];
- /*
- * If the device hasn't been used yet, there won't be a prio bucket ptr
- */
- if (!bucket)
+ if (pos.offset >= ca->mi.nbuckets)
return 0;
- if (mustfix_fsck_err_on(bucket < ca->mi.first_bucket ||
- bucket >= ca->mi.nbuckets, c,
- "bad prio bucket %llu", bucket))
- return 0;
+ g = ca->buckets + pos.offset;
- for (b = 0; b < ca->mi.nbuckets; b++, d++) {
- if (d == end) {
- ca->prio_last_buckets[bucket_nr] = bucket;
- bucket_nr++;
-
- ret = prio_io(ca, bucket, REQ_OP_READ) ||
- bch2_meta_read_fault("prio");
-
- if (mustfix_fsck_err_on(ret, c,
- "IO error reading bucket gens (%i)",
- ret))
- return 0;
-
- got = le64_to_cpu(p->magic);
- expect = pset_magic(c);
- if (mustfix_fsck_err_on(got != expect, c,
- "bad magic (got %llu expect %llu) while reading prios from bucket %llu",
- got, expect, bucket))
- return 0;
-
- if (mustfix_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c,
- "prio bucket with unknown csum type %llu bucket %lluu",
- PSET_CSUM_TYPE(p), bucket))
- return 0;
-
- csum = bch2_checksum(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- (void *) p + sizeof(p->csum),
- bucket_bytes(ca) - sizeof(p->csum));
- if (fsck_err_on(bch2_crc_cmp(csum, p->csum), c,
- "bad checksum reading prios from bucket %llu",
- bucket))
- return 0;
-
- bch2_encrypt(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- p->encrypted_start,
- bucket_bytes(ca) -
- offsetof(struct prio_set, encrypted_start));
-
- bucket = le64_to_cpu(p->next_bucket);
- d = p->data;
- }
+ bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+ BTREE_ITER_INTENT);
- ca->buckets[b].prio[READ] = le16_to_cpu(d->prio[READ]);
- ca->buckets[b].prio[WRITE] = le16_to_cpu(d->prio[WRITE]);
+ ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL);
+ bch2_btree_iter_unlock(&iter);
+ return ret;
+}
- bucket_cmpxchg(&ca->buckets[b], new, ({
- new.gen = d->gen;
- new.gen_valid = 1;
- }));
- }
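+/* Write out alloc keys for every bucket on the given device: */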
+int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
+{
+ struct btree_iter iter;
+ struct bucket *g;
+ int ret = 0;
- mutex_lock(&c->bucket_lock);
- bch2_recalc_min_prio(ca, READ);
- bch2_recalc_min_prio(ca, WRITE);
- mutex_unlock(&c->bucket_lock);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+ BTREE_ITER_INTENT);
+
+ for_each_bucket(g, ca) {
+ ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
+ if (ret)
+ break;
+ }
- ret = 0;
-fsck_err:
+ bch2_btree_iter_unlock(&iter);
return ret;
}
@@ -516,9 +491,6 @@ static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket)
long i;
unsigned j;
- for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
- BUG_ON(ca->prio_buckets[iter] == bucket);
-
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
BUG_ON(i == bucket);
@@ -651,17 +623,37 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
{
- spin_lock(&ca->freelist_lock);
-
- bch2_invalidate_bucket(ca, g);
+ struct bch_fs *c = ca->fs;
+ struct bucket_mark m;
- g->prio[READ] = ca->fs->prio_clock[READ].hand;
- g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
+ spin_lock(&ca->freelist_lock);
+ if (!bch2_invalidate_bucket(ca, g, &m)) {
+ spin_unlock(&ca->freelist_lock);
+ return;
+ }
verify_not_on_freelist(ca, g - ca->buckets);
BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
-
spin_unlock(&ca->freelist_lock);
+
+ g->prio[READ] = c->prio_clock[READ].hand;
+ g->prio[WRITE] = c->prio_clock[WRITE].hand;
+
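+	/*
+	 * If the bucket contained cached data, the journal entry carrying the
+	 * new bucket gen must be flushed before the bucket can be reused;
+	 * otherwise reconstruct the bucket's full journal sequence number from
+	 * the 16 bits in the bucket mark, and track the newest one still
+	 * needing a flush:
+	 */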
+ if (m.cached_sectors) {
+ ca->allocator_invalidating_data = true;
+ } else if (m.journal_seq_valid) {
+ u64 journal_seq = atomic64_read(&c->journal.seq);
+ u64 bucket_seq = journal_seq;
+
+ bucket_seq &= ~((u64) U16_MAX);
+ bucket_seq |= m.journal_seq;
+
+ if (bucket_seq > journal_seq)
+ bucket_seq -= 1 << 16;
+
+ ca->allocator_journal_seq_flush =
+ max(ca->allocator_journal_seq_flush, bucket_seq);
+ }
}
/*
@@ -686,11 +678,23 @@ static unsigned long bucket_sort_key(struct bch_dev *ca,
struct bucket *g,
struct bucket_mark m)
{
+ /*
+ * Time since last read, scaled to [0, 8) where larger value indicates
+ * more recently read data:
+ */
unsigned long hotness =
(g->prio[READ] - ca->min_prio[READ]) * 7 /
(ca->fs->prio_clock[READ].hand - ca->min_prio[READ]);
- return (((hotness + 1) * bucket_sectors_used(m)) << 8) |
+ /* How much we want to keep the data in this bucket: */
+ unsigned long data_wantness =
+ (hotness + 1) * bucket_sectors_used(m);
+
+ unsigned long needs_journal_commit =
+ bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk);
+
+ return (data_wantness << 9) |
+ (needs_journal_commit << 8) |
bucket_gc_gen(ca, g);
}
@@ -790,8 +794,8 @@ static void invalidate_buckets_random(struct bch_dev *ca)
static void invalidate_buckets(struct bch_dev *ca)
{
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
+ ca->inc_gen_needs_gc = 0;
+ ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case CACHE_REPLACEMENT_LRU:
@@ -806,73 +810,82 @@ static void invalidate_buckets(struct bch_dev *ca)
}
}
-static bool __bch2_allocator_push(struct bch_dev *ca, long bucket)
+static int size_t_cmp(const void *_l, const void *_r)
{
- if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_NONE], bucket))
- goto success;
+ const size_t *l = _l, *r = _r;
- return false;
-success:
- closure_wake_up(&ca->fs->freelist_wait);
- return true;
+ return (*l > *r) - (*l < *r);
}
-static bool bch2_allocator_push(struct bch_dev *ca, long bucket)
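+/*
+ * Write new alloc keys for the buckets on free_inc: returns the number of
+ * buckets invalidated, or an error if none were:
+ */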
+static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
+ u64 *journal_seq)
{
- bool ret;
+ struct btree_iter iter;
+ unsigned nr_invalidated = 0;
+ size_t b, i;
+ int ret = 0;
- spin_lock(&ca->freelist_lock);
- ret = __bch2_allocator_push(ca, bucket);
- if (ret)
- fifo_pop(&ca->free_inc, bucket);
- spin_unlock(&ca->freelist_lock);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
+ BTREE_ITER_INTENT);
- return ret;
+ fifo_for_each_entry(b, &ca->free_inc, i) {
+ ret = __bch2_alloc_write_key(c, ca, ca->buckets + b,
+ &iter, journal_seq);
+ if (ret)
+ break;
+
+ nr_invalidated++;
+ }
+
+ bch2_btree_iter_unlock(&iter);
+ return nr_invalidated ?: ret;
}
-static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
+/*
+ * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
+ * then add it to the freelist, waiting until there's room if necessary:
+ */
+static void discard_invalidated_bucket(struct bch_dev *ca, long bucket)
{
- u16 last_seq_ondisk = c->journal.last_seq_ondisk;
- struct bucket *g;
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca, bucket),
+ ca->mi.bucket_size, GFP_NOIO, 0);
- for_each_bucket(g, ca) {
- struct bucket_mark m = READ_ONCE(g->mark);
- if (is_available_bucket(m) &&
- !m.cached_sectors &&
- !m.had_metadata &&
- !bucket_needs_journal_commit(m, last_seq_ondisk)) {
- spin_lock(&ca->freelist_lock);
+ while (1) {
+ bool pushed = false;
+ unsigned i;
- bch2_mark_alloc_bucket(ca, g, true);
- g->prio[READ] = c->prio_clock[READ].hand;
- g->prio[WRITE] = c->prio_clock[WRITE].hand;
+ set_current_state(TASK_INTERRUPTIBLE);
- verify_not_on_freelist(ca, g - ca->buckets);
- BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+ /*
+ * Don't remove from free_inc until after it's added to
+ * freelist, so gc can find it:
+ */
+ spin_lock(&ca->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++)
+ if (fifo_push(&ca->free[i], bucket)) {
+ fifo_pop(&ca->free_inc, bucket);
+ closure_wake_up(&ca->fs->freelist_wait);
+ pushed = true;
+ break;
+ }
+ spin_unlock(&ca->freelist_lock);
- spin_unlock(&ca->freelist_lock);
+ if (pushed)
+ break;
- if (fifo_full(&ca->free_inc))
- break;
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
}
+ schedule();
+ try_to_freeze();
}
-}
-
-static int size_t_cmp(const void *_l, const void *_r)
-{
- const size_t *l = _l, *r = _r;
- return (*l > *r) - (*l < *r);
+ __set_current_state(TASK_RUNNING);
}
/**
@@ -887,57 +900,26 @@ static int bch2_allocator_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
- long bucket;
+ size_t bucket;
int ret;
set_freezable();
- bch2_find_empty_buckets(c, ca);
-
- while (1) {
- /*
- * First, we pull buckets off of the free_inc list, possibly
- * issue discards to them, then we add the bucket to a
- * free list:
- */
-
- while (!fifo_empty(&ca->free_inc)) {
- bucket = fifo_peek(&ca->free_inc);
-
- /*
- * Don't remove from free_inc until after it's added
- * to freelist, so gc doesn't miss it while we've
- * dropped bucket lock
- */
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (bch2_allocator_push(ca, bucket))
- break;
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- goto out;
- }
- schedule();
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- }
-
- /* We've run out of free buckets! */
+ while (!kthread_should_stop()) {
+ u64 journal_seq = 0;
+ /* Reset front/back so we can easily sort fifo entries later: */
BUG_ON(fifo_used(&ca->free_inc));
- ca->free_inc.front = ca->free_inc.back = 0;
+ ca->free_inc.front = ca->free_inc.back = 0;
+ ca->allocator_journal_seq_flush = 0;
+ ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+ up_read(&c->gc_lock);
+ goto out;
+ }
+
while (1) {
/*
* Find some buckets that we can invalidate, either
@@ -947,7 +929,6 @@ static int bch2_allocator_thread(void *arg)
*/
invalidate_buckets(ca);
-
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
@@ -980,28 +961,32 @@ static int bch2_allocator_thread(void *arg)
spin_unlock(&ca->freelist_lock);
/*
- * free_inc is full of newly-invalidated buckets, must write out
- * prios and gens before they can be re-used
+ * free_inc is now full of newly-invalidated buckets: next,
+ * write out the new bucket gens:
*/
- ret = bch2_prio_write(ca);
- if (ret) {
- /*
- * Emergency read only - allocator thread has to
- * shutdown.
- *
- * N.B. we better be going into RO mode, else
- * allocations would hang indefinitely - whatever
- * generated the error will have sent us into RO mode.
- *
- * Clear out the free_inc freelist so things are
- * consistent-ish:
- */
- spin_lock(&ca->freelist_lock);
- while (fifo_pop(&ca->free_inc, bucket))
- bch2_mark_free_bucket(ca, ca->buckets + bucket);
- spin_unlock(&ca->freelist_lock);
- goto out;
+
+ while (!fifo_empty(&ca->free_inc) && !kthread_should_stop()) {
+ ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
+ if (bch2_fs_fatal_err_on(ret < 0, c,
+ "error invalidating buckets: %i", ret))
+ goto err;
+
+ if (ca->allocator_invalidating_data)
+ bch2_journal_flush_seq(&c->journal, journal_seq);
+ else if (ca->allocator_journal_seq_flush)
+ bch2_journal_flush_seq(&c->journal,
+ ca->allocator_journal_seq_flush);
+
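+			/*
+			 * ret buckets now have their new alloc keys written:
+			 * discard them and move them onto the in-memory
+			 * freelists:
+			 */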
+ while (ret && !kthread_should_stop()) {
+ BUG_ON(fifo_empty(&ca->free_inc));
+
+ bucket = fifo_peek(&ca->free_inc);
+ discard_invalidated_bucket(ca, bucket);
+ --ret;
+ }
}
+
+ ca->alloc_thread_started = true;
}
out:
/*
@@ -1010,50 +995,104 @@ out:
*/
synchronize_rcu();
return 0;
+err:
+ /*
+ * Emergency read only - allocator thread has to shutdown.
+ *
+ * N.B. we better be going into RO mode, else allocations would hang
+ * indefinitely - whatever generated the error will have sent us into RO
+ * mode.
+ *
+ * Clear out the free_inc freelist so things are consistent-ish:
+ */
+ spin_lock(&ca->freelist_lock);
+ while (fifo_pop(&ca->free_inc, bucket))
+ bch2_mark_free_bucket(ca, ca->buckets + bucket);
+ spin_unlock(&ca->freelist_lock);
+ goto out;
}
/* Allocation */
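+/*
+ * Startup path: before the allocator thread has filled the freelists, hand
+ * out buckets that are available and haven't yet been used this mount,
+ * straight from the in-memory bucket array:
+ */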
+static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bucket *g;
+ long r = -1;
+
+ if (!down_read_trylock(&c->gc_lock))
+ return r;
+
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+ goto out;
+
+ for_each_bucket(g, ca)
+ if (!g->mark.touched_this_mount &&
+ is_available_bucket(g->mark) &&
+ bch2_mark_alloc_bucket_startup(ca, g)) {
+ r = g - ca->buckets;
+ break;
+ }
+out:
+ up_read(&c->gc_lock);
+ return r;
+}
+
/**
* bch_bucket_alloc - allocate a single bucket from a specific device
*
- * Returns index of bucket on success, 0 on failure
+ * Returns index of bucket on success, -1 on failure
* */
-size_t bch2_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve)
+long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve)
{
- struct bucket *g;
- long r;
+ size_t r;
spin_lock(&ca->freelist_lock);
- if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
- fifo_pop(&ca->free[reserve], r))
+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], r)))
goto out;
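+	/*
+	 * The general freelist was empty; fall back to a reserved freelist,
+	 * depending on what this allocation is for:
+	 */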
+ switch (reserve) {
+ case RESERVE_ALLOC:
+ if (fifo_pop(&ca->free[RESERVE_BTREE], r))
+ goto out;
+ break;
+ case RESERVE_BTREE:
+ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
+ ca->free[RESERVE_BTREE].size &&
+ fifo_pop(&ca->free[RESERVE_BTREE], r))
+ goto out;
+ break;
+ case RESERVE_MOVINGGC:
+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r))
+ goto out;
+ break;
+ default:
+ break;
+ }
+
spin_unlock(&ca->freelist_lock);
+ if (unlikely(!ca->alloc_thread_started) &&
+ (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
+ verify_not_on_freelist(ca, r);
+ goto out2;
+ }
+
trace_bucket_alloc_fail(ca, reserve);
- return 0;
+ return -1;
out:
verify_not_on_freelist(ca, r);
spin_unlock(&ca->freelist_lock);
- trace_bucket_alloc(ca, reserve);
-
bch2_wake_allocator(ca);
+out2:
+ ca->buckets[r].prio[READ] = c->prio_clock[READ].hand;
+ ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand;
- g = ca->buckets + r;
-
- g->prio[READ] = ca->fs->prio_clock[READ].hand;
- g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
-
+ trace_bucket_alloc(ca, reserve);
return r;
}
-static void __bch2_bucket_free(struct bch_dev *ca, struct bucket *g)
-{
- bch2_mark_free_bucket(ca, g);
-}
-
enum bucket_alloc_ret {
ALLOC_SUCCESS,
NO_DEVICES, /* -EROFS */
@@ -1116,7 +1155,7 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
while (ob->nr_ptrs < nr_replicas) {
struct bch_dev *ca;
- u64 bucket;
+ long bucket;
if (!available) {
ret = NO_DEVICES;
@@ -1139,8 +1178,8 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
get_random_int() > devs->d[i].weight)
continue;
- bucket = bch2_bucket_alloc(ca, reserve);
- if (!bucket) {
+ bucket = bch2_bucket_alloc(c, ca, reserve);
+ if (bucket < 0) {
if (fail_idx == -1)
fail_idx = i;
continue;
@@ -1456,7 +1495,6 @@ struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c,
? 0 : BTREE_NODE_RESERVE;
int ret;
- BUG_ON(!reserve);
BUG_ON(!nr_replicas);
retry:
ob = lock_writepoint(c, wp);
@@ -1705,7 +1743,9 @@ set_capacity:
capacity *= (100 - c->opts.gc_reserve_percent);
capacity = div64_u64(capacity, 100);
- BUG_ON(capacity + reserved_sectors > total_capacity);
+ BUG_ON(reserved_sectors > total_capacity);
+
+ capacity = min(capacity, total_capacity - reserved_sectors);
c->capacity = capacity;
@@ -1725,10 +1765,9 @@ set_capacity:
closure_wake_up(&c->freelist_wait);
}
-static void bch2_stop_write_point(struct bch_dev *ca,
- struct write_point *wp)
+static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
+ struct write_point *wp)
{
- struct bch_fs *c = ca->fs;
struct open_bucket *ob;
struct bch_extent_ptr *ptr;
@@ -1750,9 +1789,8 @@ found:
bch2_open_bucket_put(c, ob);
}
-static bool bch2_dev_has_open_write_point(struct bch_dev *ca)
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
struct bch_extent_ptr *ptr;
struct open_bucket *ob;
@@ -1773,55 +1811,36 @@ static bool bch2_dev_has_open_write_point(struct bch_dev *ca)
}
/* device goes ro: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
- struct task_struct *p;
struct closure cl;
unsigned i;
+ BUG_ON(ca->alloc_thread);
+
closure_init_stack(&cl);
/* First, remove device from allocation groups: */
+ bch2_dev_group_remove(&c->journal.devs, ca);
bch2_dev_group_remove(tier, ca);
bch2_dev_group_remove(&c->all_devs, ca);
- bch2_recalc_capacity(c);
-
/*
- * Stopping the allocator thread comes after removing from allocation
- * groups, else pending allocations will hang:
- */
-
- p = ca->alloc_thread;
- ca->alloc_thread = NULL;
- smp_wmb();
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid a race with bch2_usage_update() -
- * the allocator thread itself does a synchronize_rcu() on exit.
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
+ * Capacity is calculated based off of devices in allocation groups:
*/
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
+ bch2_recalc_capacity(c);
/* Next, close write points that point to this device... */
-
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_stop_write_point(ca, &c->write_points[i]);
+ bch2_stop_write_point(c, ca, &c->write_points[i]);
- bch2_stop_write_point(ca, &ca->copygc_write_point);
- bch2_stop_write_point(ca, &c->promote_write_point);
- bch2_stop_write_point(ca, &ca->tiering_write_point);
- bch2_stop_write_point(ca, &c->migration_write_point);
- bch2_stop_write_point(ca, &c->btree_write_point);
+ bch2_stop_write_point(c, ca, &ca->copygc_write_point);
+ bch2_stop_write_point(c, ca, &c->promote_write_point);
+ bch2_stop_write_point(c, ca, &ca->tiering_write_point);
+ bch2_stop_write_point(c, ca, &c->migration_write_point);
+ bch2_stop_write_point(c, ca, &c->btree_write_point);
mutex_lock(&c->btree_reserve_cache_lock);
while (c->btree_reserve_cache_nr) {
@@ -1832,9 +1851,16 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
}
mutex_unlock(&c->btree_reserve_cache_lock);
- /* Avoid deadlocks.. */
-
+ /*
+ * Wake up threads that were blocked on allocation, so they can notice
+ * the device can no longer be removed and the capacity has changed:
+ */
closure_wake_up(&c->freelist_wait);
+
+ /*
+ * journal_res_get() can block waiting for free space in the journal -
+ * it needs to notice there may not be devices to allocate from anymore:
+ */
wake_up(&c->journal.wait);
/* Now wait for any in flight writes: */
@@ -1842,7 +1868,7 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
while (1) {
closure_wait(&c->open_buckets_wait, &cl);
- if (!bch2_dev_has_open_write_point(ca)) {
+ if (!bch2_dev_has_open_write_point(c, ca)) {
closure_wake_up(&c->open_buckets_wait);
break;
}
@@ -1851,32 +1877,15 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
}
}
-/*
- * Startup the allocator thread for transition to RW mode:
- */
-int bch2_dev_allocator_start(struct bch_dev *ca)
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
struct bch_sb_field_journal *journal_buckets;
bool has_journal;
- struct task_struct *k;
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- k = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
- if (IS_ERR(k))
- return 0;
-
- get_task_struct(k);
- ca->alloc_thread = k;
-
- bch2_dev_group_add(tier, ca);
bch2_dev_group_add(&c->all_devs, ca);
+ bch2_dev_group_add(tier, ca);
mutex_lock(&c->sb_lock);
journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
@@ -1886,15 +1895,44 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
if (has_journal)
bch2_dev_group_add(&c->journal.devs, ca);
+}
- bch2_recalc_capacity(c);
+/* stop allocator thread: */
+void bch2_dev_allocator_stop(struct bch_dev *ca)
+{
+ struct task_struct *p = ca->alloc_thread;
+
+ ca->alloc_thread = NULL;
+ smp_wmb();
+
+ /*
+ * We need an rcu barrier between setting ca->alloc_thread = NULL and
+ * the thread shutting down to avoid a race with bch2_usage_update() -
+ * the allocator thread itself does a synchronize_rcu() on exit.
+ *
+ * XXX: it would be better to have the rcu barrier be asynchronous
+ * instead of blocking us here
+ */
+ if (p)
+ kthread_stop(p);
+}
+
+/* start allocator thread: */
+int bch2_dev_allocator_start(struct bch_dev *ca)
+{
+ struct task_struct *p;
/*
- * Don't wake up allocator thread until after adding device to
- * allocator groups - otherwise, alloc thread could get a spurious
- * -EROFS due to prio_write() -> journal_meta() not finding any devices:
+ * allocator thread already started?
*/
- wake_up_process(k);
+ if (ca->alloc_thread)
+ return 0;
+
+ p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ ca->alloc_thread = p;
return 0;
}