Diffstat (limited to 'libbcachefs/alloc.c')
-rw-r--r-- | libbcachefs/alloc.c | 898 |
1 file changed, 468 insertions, 430 deletions
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index 9d54dd80..5a258cb6 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -75,7 +75,6 @@ #include <linux/sort.h> #include <trace/events/bcachefs.h> -static void __bch2_bucket_free(struct bch_dev *, struct bucket *); static void bch2_recalc_min_prio(struct bch_dev *, int); /* Allocation groups: */ @@ -206,268 +205,244 @@ static void pd_controllers_update(struct work_struct *work) c->pd_controllers_update_seconds * HZ); } -/* - * Bucket priorities/gens: - * - * For each bucket, we store on disk its - * 8 bit gen - * 16 bit priority - * - * See alloc.c for an explanation of the gen. The priority is used to implement - * lru (and in the future other) cache replacement policies; for most purposes - * it's just an opaque integer. - * - * The gens and the priorities don't have a whole lot to do with each other, and - * it's actually the gens that must be written out at specific times - it's no - * big deal if the priorities don't get written, if we lose them we just reuse - * buckets in suboptimal order. - * - * On disk they're stored in a packed array, and in as many buckets are required - * to fit them all. The buckets we use to store them form a list; the journal - * header points to the first bucket, the first bucket points to the second - * bucket, et cetera. - * - * This code is used by the allocation code; periodically (whenever it runs out - * of buckets to allocate from) the allocation code will invalidate some - * buckets, but it can't use those buckets until their new gens are safely on - * disk. - */ +static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +{ + unsigned bytes = offsetof(struct bch_alloc, data); + + if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + bytes += 2; + if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + bytes += 2; + + return DIV_ROUND_UP(bytes, sizeof(u64)); +} -static int prio_io(struct bch_dev *ca, uint64_t bucket, int op) +static const char *bch2_alloc_invalid(const struct bch_fs *c, + struct bkey_s_c k) { - bio_init(ca->bio_prio, ca->bio_prio->bi_inline_vecs, bucket_pages(ca)); - ca->bio_prio->bi_opf = op|REQ_SYNC|REQ_META; - ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size; - ca->bio_prio->bi_bdev = ca->disk_sb.bdev; - ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca); - bch2_bio_map(ca->bio_prio, ca->disk_buckets); - - return submit_bio_wait(ca->bio_prio); + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + switch (k.k->type) { + case BCH_ALLOC: { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k)) + return "incorrect value size"; + break; + } + default: + return "invalid type"; + } + + return NULL; } -static struct nonce prio_nonce(struct prio_set *p) +static void bch2_alloc_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { - return (struct nonce) {{ - [0] = 0, - [1] = p->nonce[0], - [2] = p->nonce[1], - [3] = p->nonce[2]^BCH_NONCE_PRIO, - }}; + buf[0] = '\0'; + + switch (k.k->type) { + case BCH_ALLOC: + break; + } } -int bch2_prio_write(struct bch_dev *ca) +const struct bkey_ops bch2_bkey_alloc_ops = { + .key_invalid = bch2_alloc_invalid, + .val_to_text = bch2_alloc_to_text, +}; + +static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) { - struct bch_fs *c = ca->fs; - struct journal *j = &c->journal; - struct journal_res res = { 0 }; - bool need_new_journal_entry; - int i, ret = 0; + unsigned v; - if (c->opts.nochanges) - return 
0; + switch (bytes) { + case 1: + v = **p; + break; + case 2: + v = le16_to_cpup((void *) *p); + break; + case 4: + v = le32_to_cpup((void *) *p); + break; + default: + BUG(); + } - mutex_lock(&ca->prio_write_lock); - trace_prio_write_start(ca); + *p += bytes; + return v; +} - ca->need_prio_write = false; +static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) +{ + switch (bytes) { + case 1: + **p = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + default: + BUG(); + } - atomic64_add(ca->mi.bucket_size * prio_buckets(ca), - &ca->meta_sectors_written); + *p += bytes; +} - for (i = prio_buckets(ca) - 1; i >= 0; --i) { - struct bucket *g; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data; - struct bucket_disk *end = d + prios_per_bucket(ca); - size_t r; +static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_dev *ca; + struct bkey_s_c_alloc a; + struct bucket_mark new; + struct bucket *g; + const u8 *d; - for (r = i * prios_per_bucket(ca); - r < ca->mi.nbuckets && d < end; - r++, d++) { - g = ca->buckets + r; - d->prio[READ] = cpu_to_le16(g->prio[READ]); - d->prio[WRITE] = cpu_to_le16(g->prio[WRITE]); - d->gen = ca->buckets[r].mark.gen; - } + if (k.k->type != BCH_ALLOC) + return; - p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]); - p->magic = cpu_to_le64(pset_magic(c)); - get_random_bytes(&p->nonce, sizeof(p->nonce)); + a = bkey_s_c_to_alloc(k); + ca = c->devs[a.k->p.inode]; - spin_lock(&ca->prio_buckets_lock); - r = bch2_bucket_alloc(ca, RESERVE_PRIO); - BUG_ON(!r); + if (a.k->p.offset >= ca->mi.nbuckets) + return; - /* - * goes here before dropping prio_buckets_lock to guard against - * it getting gc'd from under us - */ - ca->prio_buckets[i] = r; - bch2_mark_metadata_bucket(ca, ca->buckets + r, - BUCKET_PRIOS, false); - spin_unlock(&ca->prio_buckets_lock); - - SET_PSET_CSUM_TYPE(p, bch2_meta_checksum_type(c)); - - bch2_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - p->csum = bch2_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - - ret = prio_io(ca, r, REQ_OP_WRITE); - if (bch2_dev_fatal_io_err_on(ret, ca, - "prio write to bucket %zu", r) || - bch2_meta_write_fault("prio")) - goto err; - } + g = ca->buckets + a.k->p.offset; + bucket_cmpxchg(g, new, ({ + new.gen = a.v->gen; + new.gen_valid = 1; + })); + + d = a.v->data; + if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + g->prio[READ] = get_alloc_field(&d, 2); + if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + g->prio[WRITE] = get_alloc_field(&d, 2); +} - spin_lock(&j->lock); - j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]); - j->nr_prio_buckets = max_t(unsigned, - ca->dev_idx + 1, - j->nr_prio_buckets); - spin_unlock(&j->lock); +int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) +{ + struct journal_replay *r; + struct btree_iter iter; + struct bkey_s_c k; + int ret; - do { - unsigned u64s = jset_u64s(0); + if (!c->btree_roots[BTREE_ID_ALLOC].b) + return 0; - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) - break; + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { + bch2_alloc_read_key(c, k); + bch2_btree_iter_cond_resched(&iter); + } - ret = bch2_journal_res_get(j, &res, u64s, u64s); - if (ret) - goto err; + ret = bch2_btree_iter_unlock(&iter); + if (ret) + 
return ret; - need_new_journal_entry = j->buf[res.idx].nr_prio_buckets < - ca->dev_idx + 1; - bch2_journal_res_put(j, &res); + list_for_each_entry(r, journal_replay_list, list) { + struct bkey_i *k, *n; + struct jset_entry *entry; - ret = bch2_journal_flush_seq(j, res.seq); - if (ret) - goto err; - } while (need_new_journal_entry); + for_each_jset_key(k, n, entry, &r->j) + if (entry->btree_id == BTREE_ID_ALLOC) + bch2_alloc_read_key(c, bkey_i_to_s_c(k)); + } - /* - * Don't want the old priorities to get garbage collected until after we - * finish writing the new ones, and they're journalled - */ + return 0; +} - spin_lock(&ca->prio_buckets_lock); +static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct btree_iter *iter, + u64 *journal_seq) +{ + struct bucket_mark m = READ_ONCE(g->mark); + __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; + struct bkey_i_alloc *a; + u8 *d; + int ret; - for (i = 0; i < prio_buckets(ca); i++) { - if (ca->prio_last_buckets[i]) - __bch2_bucket_free(ca, - &ca->buckets[ca->prio_last_buckets[i]]); + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets)); - ca->prio_last_buckets[i] = ca->prio_buckets[i]; - } + do { + ret = bch2_btree_iter_traverse(iter); + if (ret) + break; - spin_unlock(&ca->prio_buckets_lock); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + a->v.fields = 0; + a->v.gen = m.gen; + set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + + d = a->v.data; + if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + put_alloc_field(&d, 2, g->prio[READ]); + if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + put_alloc_field(&d, 2, g->prio[WRITE]); + + bch2_btree_iter_set_pos(iter, a->k.p); + ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOWAIT, + BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_btree_iter_cond_resched(iter); + } while (ret == -EINTR); - trace_prio_write_end(ca); -err: - mutex_unlock(&ca->prio_write_lock); return ret; } -int bch2_prio_read(struct bch_dev *ca) +int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) { - struct bch_fs *c = ca->fs; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; - struct bucket_mark new; - struct bch_csum csum; - unsigned bucket_nr = 0; - u64 bucket, expect, got; - size_t b; - int ret = 0; + struct bch_dev *ca; + struct bucket *g; + struct btree_iter iter; + int ret; - if (ca->prio_read_done) - return 0; + lockdep_assert_held(&c->state_lock); - ca->prio_read_done = true; + if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + return 0; - spin_lock(&c->journal.lock); - bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]); - spin_unlock(&c->journal.lock); + ca = c->devs[pos.inode]; - /* - * If the device hasn't been used yet, there won't be a prio bucket ptr - */ - if (!bucket) + if (pos.offset >= ca->mi.nbuckets) return 0; - if (mustfix_fsck_err_on(bucket < ca->mi.first_bucket || - bucket >= ca->mi.nbuckets, c, - "bad prio bucket %llu", bucket)) - return 0; + g = ca->buckets + pos.offset; - for (b = 0; b < ca->mi.nbuckets; b++, d++) { - if (d == end) { - ca->prio_last_buckets[bucket_nr] = bucket; - bucket_nr++; - - ret = prio_io(ca, bucket, REQ_OP_READ) || - bch2_meta_read_fault("prio"); - - if (mustfix_fsck_err_on(ret, c, - "IO error reading bucket gens (%i)", - ret)) - return 0; - - got = le64_to_cpu(p->magic); - expect = 
pset_magic(c); - if (mustfix_fsck_err_on(got != expect, c, - "bad magic (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket)) - return 0; - - if (mustfix_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c, - "prio bucket with unknown csum type %llu bucket %lluu", - PSET_CSUM_TYPE(p), bucket)) - return 0; - - csum = bch2_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - if (fsck_err_on(bch2_crc_cmp(csum, p->csum), c, - "bad checksum reading prios from bucket %llu", - bucket)) - return 0; - - bch2_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - bucket = le64_to_cpu(p->next_bucket); - d = p->data; - } + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_INTENT); - ca->buckets[b].prio[READ] = le16_to_cpu(d->prio[READ]); - ca->buckets[b].prio[WRITE] = le16_to_cpu(d->prio[WRITE]); + ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL); + bch2_btree_iter_unlock(&iter); + return ret; +} - bucket_cmpxchg(&ca->buckets[b], new, ({ - new.gen = d->gen; - new.gen_valid = 1; - })); - } +int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq) +{ + struct btree_iter iter; + struct bucket *g; + int ret = 0; - mutex_lock(&c->bucket_lock); - bch2_recalc_min_prio(ca, READ); - bch2_recalc_min_prio(ca, WRITE); - mutex_unlock(&c->bucket_lock); + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_INTENT); + + for_each_bucket(g, ca) { + ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq); + if (ret) + break; + } - ret = 0; -fsck_err: + bch2_btree_iter_unlock(&iter); return ret; } @@ -516,9 +491,6 @@ static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) long i; unsigned j; - for (iter = 0; iter < prio_buckets(ca) * 2; iter++) - BUG_ON(ca->prio_buckets[iter] == bucket); - for (j = 0; j < RESERVE_NR; j++) fifo_for_each_entry(i, &ca->free[j], iter) BUG_ON(i == bucket); @@ -651,17 +623,37 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g, static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) { - spin_lock(&ca->freelist_lock); - - bch2_invalidate_bucket(ca, g); + struct bch_fs *c = ca->fs; + struct bucket_mark m; - g->prio[READ] = ca->fs->prio_clock[READ].hand; - g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand; + spin_lock(&ca->freelist_lock); + if (!bch2_invalidate_bucket(ca, g, &m)) { + spin_unlock(&ca->freelist_lock); + return; + } verify_not_on_freelist(ca, g - ca->buckets); BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - spin_unlock(&ca->freelist_lock); + + g->prio[READ] = c->prio_clock[READ].hand; + g->prio[WRITE] = c->prio_clock[WRITE].hand; + + if (m.cached_sectors) { + ca->allocator_invalidating_data = true; + } else if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + ca->allocator_journal_seq_flush = + max(ca->allocator_journal_seq_flush, bucket_seq); + } } /* @@ -686,11 +678,23 @@ static unsigned long bucket_sort_key(struct bch_dev *ca, struct bucket *g, struct bucket_mark m) { + /* + * Time since last read, scaled to [0, 8) where larger value indicates + * more recently read data: + */ unsigned long hotness = (g->prio[READ] - ca->min_prio[READ]) * 7 / (ca->fs->prio_clock[READ].hand - 
ca->min_prio[READ]); - return (((hotness + 1) * bucket_sectors_used(m)) << 8) | + /* How much we want to keep the data in this bucket: */ + unsigned long data_wantness = + (hotness + 1) * bucket_sectors_used(m); + + unsigned long needs_journal_commit = + bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk); + + return (data_wantness << 9) | + (needs_journal_commit << 8) | bucket_gc_gen(ca, g); } @@ -790,8 +794,8 @@ static void invalidate_buckets_random(struct bch_dev *ca) static void invalidate_buckets(struct bch_dev *ca) { - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; + ca->inc_gen_needs_gc = 0; + ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case CACHE_REPLACEMENT_LRU: @@ -806,73 +810,82 @@ static void invalidate_buckets(struct bch_dev *ca) } } -static bool __bch2_allocator_push(struct bch_dev *ca, long bucket) +static int size_t_cmp(const void *_l, const void *_r) { - if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_BTREE], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_NONE], bucket)) - goto success; + const size_t *l = _l, *r = _r; - return false; -success: - closure_wake_up(&ca->fs->freelist_wait); - return true; + return (*l > *r) - (*l < *r); } -static bool bch2_allocator_push(struct bch_dev *ca, long bucket) +static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, + u64 *journal_seq) { - bool ret; + struct btree_iter iter; + unsigned nr_invalidated = 0; + size_t b, i; + int ret = 0; - spin_lock(&ca->freelist_lock); - ret = __bch2_allocator_push(ca, bucket); - if (ret) - fifo_pop(&ca->free_inc, bucket); - spin_unlock(&ca->freelist_lock); + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), + BTREE_ITER_INTENT); - return ret; + fifo_for_each_entry(b, &ca->free_inc, i) { + ret = __bch2_alloc_write_key(c, ca, ca->buckets + b, + &iter, journal_seq); + if (ret) + break; + + nr_invalidated++; + } + + bch2_btree_iter_unlock(&iter); + return nr_invalidated ?: ret; } -static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca) +/* + * Given an invalidated, ready to use bucket: issue a discard to it if enabled, + * then add it to the freelist, waiting until there's room if necessary: + */ +static void discard_invalidated_bucket(struct bch_dev *ca, long bucket) { - u16 last_seq_ondisk = c->journal.last_seq_ondisk; - struct bucket *g; + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO, 0); - for_each_bucket(g, ca) { - struct bucket_mark m = READ_ONCE(g->mark); - if (is_available_bucket(m) && - !m.cached_sectors && - !m.had_metadata && - !bucket_needs_journal_commit(m, last_seq_ondisk)) { - spin_lock(&ca->freelist_lock); + while (1) { + bool pushed = false; + unsigned i; - bch2_mark_alloc_bucket(ca, g, true); - g->prio[READ] = c->prio_clock[READ].hand; - g->prio[WRITE] = c->prio_clock[WRITE].hand; + set_current_state(TASK_INTERRUPTIBLE); - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); + /* + * Don't remove from free_inc until after it's added to + * freelist, so gc can find it: + */ + spin_lock(&ca->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + closure_wake_up(&ca->fs->freelist_wait); + pushed = true; + 
break; + } + spin_unlock(&ca->freelist_lock); - spin_unlock(&ca->freelist_lock); + if (pushed) + break; - if (fifo_full(&ca->free_inc)) - break; + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; } + schedule(); + try_to_freeze(); } -} - -static int size_t_cmp(const void *_l, const void *_r) -{ - const size_t *l = _l, *r = _r; - return (*l > *r) - (*l < *r); + __set_current_state(TASK_RUNNING); } /** @@ -887,57 +900,26 @@ static int bch2_allocator_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; - long bucket; + size_t bucket; int ret; set_freezable(); - bch2_find_empty_buckets(c, ca); - - while (1) { - /* - * First, we pull buckets off of the free_inc list, possibly - * issue discards to them, then we add the bucket to a - * free list: - */ - - while (!fifo_empty(&ca->free_inc)) { - bucket = fifo_peek(&ca->free_inc); - - /* - * Don't remove from free_inc until after it's added - * to freelist, so gc doesn't miss it while we've - * dropped bucket lock - */ - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (bch2_allocator_push(ca, bucket)) - break; - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - goto out; - } - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - } - - /* We've run out of free buckets! */ + while (!kthread_should_stop()) { + u64 journal_seq = 0; + /* Reset front/back so we can easily sort fifo entries later: */ BUG_ON(fifo_used(&ca->free_inc)); - ca->free_inc.front = ca->free_inc.back = 0; + ca->free_inc.front = ca->free_inc.back = 0; + ca->allocator_journal_seq_flush = 0; + ca->allocator_invalidating_data = false; down_read(&c->gc_lock); + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { + up_read(&c->gc_lock); + goto out; + } + while (1) { /* * Find some buckets that we can invalidate, either @@ -947,7 +929,6 @@ static int bch2_allocator_thread(void *arg) */ invalidate_buckets(ca); - trace_alloc_batch(ca, fifo_used(&ca->free_inc), ca->free_inc.size); @@ -980,28 +961,32 @@ static int bch2_allocator_thread(void *arg) spin_unlock(&ca->freelist_lock); /* - * free_inc is full of newly-invalidated buckets, must write out - * prios and gens before they can be re-used + * free_inc is now full of newly-invalidated buckets: next, + * write out the new bucket gens: */ - ret = bch2_prio_write(ca); - if (ret) { - /* - * Emergency read only - allocator thread has to - * shutdown. - * - * N.B. we better be going into RO mode, else - * allocations would hang indefinitely - whatever - * generated the error will have sent us into RO mode. 
- * - * Clear out the free_inc freelist so things are - * consistent-ish: - */ - spin_lock(&ca->freelist_lock); - while (fifo_pop(&ca->free_inc, bucket)) - bch2_mark_free_bucket(ca, ca->buckets + bucket); - spin_unlock(&ca->freelist_lock); - goto out; + + while (!fifo_empty(&ca->free_inc) && !kthread_should_stop()) { + ret = bch2_invalidate_free_inc(c, ca, &journal_seq); + if (bch2_fs_fatal_err_on(ret < 0, c, + "error invalidating buckets: %i", ret)) + goto err; + + if (ca->allocator_invalidating_data) + bch2_journal_flush_seq(&c->journal, journal_seq); + else if (ca->allocator_journal_seq_flush) + bch2_journal_flush_seq(&c->journal, + ca->allocator_journal_seq_flush); + + while (ret && !kthread_should_stop()) { + BUG_ON(fifo_empty(&ca->free_inc)); + + bucket = fifo_peek(&ca->free_inc); + discard_invalidated_bucket(ca, bucket); + --ret; + } } + + ca->alloc_thread_started = true; } out: /* @@ -1010,50 +995,104 @@ out: */ synchronize_rcu(); return 0; +err: + /* + * Emergency read only - allocator thread has to shutdown. + * + * N.B. we better be going into RO mode, else allocations would hang + * indefinitely - whatever generated the error will have sent us into RO + * mode. + * + * Clear out the free_inc freelist so things are consistent-ish: + */ + spin_lock(&ca->freelist_lock); + while (fifo_pop(&ca->free_inc, bucket)) + bch2_mark_free_bucket(ca, ca->buckets + bucket); + spin_unlock(&ca->freelist_lock); + goto out; } /* Allocation */ +static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket *g; + long r = -1; + + if (!down_read_trylock(&c->gc_lock)) + return r; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + goto out; + + for_each_bucket(g, ca) + if (!g->mark.touched_this_mount && + is_available_bucket(g->mark) && + bch2_mark_alloc_bucket_startup(ca, g)) { + r = g - ca->buckets; + break; + } +out: + up_read(&c->gc_lock); + return r; +} + /** * bch_bucket_alloc - allocate a single bucket from a specific device * * Returns index of bucket on success, 0 on failure * */ -size_t bch2_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve) +long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve) { - struct bucket *g; - long r; + size_t r; spin_lock(&ca->freelist_lock); - if (fifo_pop(&ca->free[RESERVE_NONE], r) || - fifo_pop(&ca->free[reserve], r)) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], r))) goto out; + switch (reserve) { + case RESERVE_ALLOC: + if (fifo_pop(&ca->free[RESERVE_BTREE], r)) + goto out; + break; + case RESERVE_BTREE: + if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= + ca->free[RESERVE_BTREE].size && + fifo_pop(&ca->free[RESERVE_BTREE], r)) + goto out; + break; + case RESERVE_MOVINGGC: + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r)) + goto out; + break; + default: + break; + } + spin_unlock(&ca->freelist_lock); + if (unlikely(!ca->alloc_thread_started) && + (r = bch2_bucket_alloc_startup(c, ca)) >= 0) { + verify_not_on_freelist(ca, r); + goto out2; + } + trace_bucket_alloc_fail(ca, reserve); - return 0; + return -1; out: verify_not_on_freelist(ca, r); spin_unlock(&ca->freelist_lock); - trace_bucket_alloc(ca, reserve); - bch2_wake_allocator(ca); +out2: + ca->buckets[r].prio[READ] = c->prio_clock[READ].hand; + ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand; - g = ca->buckets + r; - - g->prio[READ] = ca->fs->prio_clock[READ].hand; - g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand; - + trace_bucket_alloc(ca, reserve); return r; } -static void __bch2_bucket_free(struct bch_dev *ca, struct 
bucket *g) -{ - bch2_mark_free_bucket(ca, g); -} - enum bucket_alloc_ret { ALLOC_SUCCESS, NO_DEVICES, /* -EROFS */ @@ -1116,7 +1155,7 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c, while (ob->nr_ptrs < nr_replicas) { struct bch_dev *ca; - u64 bucket; + long bucket; if (!available) { ret = NO_DEVICES; @@ -1139,8 +1178,8 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c, get_random_int() > devs->d[i].weight) continue; - bucket = bch2_bucket_alloc(ca, reserve); - if (!bucket) { + bucket = bch2_bucket_alloc(c, ca, reserve); + if (bucket < 0) { if (fail_idx == -1) fail_idx = i; continue; @@ -1456,7 +1495,6 @@ struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c, ? 0 : BTREE_NODE_RESERVE; int ret; - BUG_ON(!reserve); BUG_ON(!nr_replicas); retry: ob = lock_writepoint(c, wp); @@ -1705,7 +1743,9 @@ set_capacity: capacity *= (100 - c->opts.gc_reserve_percent); capacity = div64_u64(capacity, 100); - BUG_ON(capacity + reserved_sectors > total_capacity); + BUG_ON(reserved_sectors > total_capacity); + + capacity = min(capacity, total_capacity - reserved_sectors); c->capacity = capacity; @@ -1725,10 +1765,9 @@ set_capacity: closure_wake_up(&c->freelist_wait); } -static void bch2_stop_write_point(struct bch_dev *ca, - struct write_point *wp) +static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) { - struct bch_fs *c = ca->fs; struct open_bucket *ob; struct bch_extent_ptr *ptr; @@ -1750,9 +1789,8 @@ found: bch2_open_bucket_put(c, ob); } -static bool bch2_dev_has_open_write_point(struct bch_dev *ca) +static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; struct bch_extent_ptr *ptr; struct open_bucket *ob; @@ -1773,55 +1811,36 @@ static bool bch2_dev_has_open_write_point(struct bch_dev *ca) } /* device goes ro: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) +void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct task_struct *p; struct closure cl; unsigned i; + BUG_ON(ca->alloc_thread); + closure_init_stack(&cl); /* First, remove device from allocation groups: */ + bch2_dev_group_remove(&c->journal.devs, ca); bch2_dev_group_remove(tier, ca); bch2_dev_group_remove(&c->all_devs, ca); - bch2_recalc_capacity(c); - /* - * Stopping the allocator thread comes after removing from allocation - * groups, else pending allocations will hang: - */ - - p = ca->alloc_thread; - ca->alloc_thread = NULL; - smp_wmb(); - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid a race with bch2_usage_update() - - * the allocator thread itself does a synchronize_rcu() on exit. - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here + * Capacity is calculated based off of devices in allocation groups: */ - if (p) { - kthread_stop(p); - put_task_struct(p); - } + bch2_recalc_capacity(c); /* Next, close write points that point to this device... 
*/ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_stop_write_point(ca, &c->write_points[i]); + bch2_stop_write_point(c, ca, &c->write_points[i]); - bch2_stop_write_point(ca, &ca->copygc_write_point); - bch2_stop_write_point(ca, &c->promote_write_point); - bch2_stop_write_point(ca, &ca->tiering_write_point); - bch2_stop_write_point(ca, &c->migration_write_point); - bch2_stop_write_point(ca, &c->btree_write_point); + bch2_stop_write_point(c, ca, &ca->copygc_write_point); + bch2_stop_write_point(c, ca, &c->promote_write_point); + bch2_stop_write_point(c, ca, &ca->tiering_write_point); + bch2_stop_write_point(c, ca, &c->migration_write_point); + bch2_stop_write_point(c, ca, &c->btree_write_point); mutex_lock(&c->btree_reserve_cache_lock); while (c->btree_reserve_cache_nr) { @@ -1832,9 +1851,16 @@ void bch2_dev_allocator_stop(struct bch_dev *ca) } mutex_unlock(&c->btree_reserve_cache_lock); - /* Avoid deadlocks.. */ - + /* + * Wake up threads that were blocked on allocation, so they can notice + * the device can no longer be removed and the capacity has changed: + */ closure_wake_up(&c->freelist_wait); + + /* + * journal_res_get() can block waiting for free space in the journal - + * it needs to notice there may not be devices to allocate from anymore: + */ wake_up(&c->journal.wait); /* Now wait for any in flight writes: */ @@ -1842,7 +1868,7 @@ void bch2_dev_allocator_stop(struct bch_dev *ca) while (1) { closure_wait(&c->open_buckets_wait, &cl); - if (!bch2_dev_has_open_write_point(ca)) { + if (!bch2_dev_has_open_write_point(c, ca)) { closure_wake_up(&c->open_buckets_wait); break; } @@ -1851,32 +1877,15 @@ void bch2_dev_allocator_stop(struct bch_dev *ca) } } -/* - * Startup the allocator thread for transition to RW mode: - */ -int bch2_dev_allocator_start(struct bch_dev *ca) +/* device goes rw: */ +void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; struct dev_group *tier = &c->tiers[ca->mi.tier].devs; struct bch_sb_field_journal *journal_buckets; bool has_journal; - struct task_struct *k; - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - k = kthread_create(bch2_allocator_thread, ca, "bcache_allocator"); - if (IS_ERR(k)) - return 0; - - get_task_struct(k); - ca->alloc_thread = k; - - bch2_dev_group_add(tier, ca); bch2_dev_group_add(&c->all_devs, ca); + bch2_dev_group_add(tier, ca); mutex_lock(&c->sb_lock); journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb); @@ -1886,15 +1895,44 @@ int bch2_dev_allocator_start(struct bch_dev *ca) if (has_journal) bch2_dev_group_add(&c->journal.devs, ca); +} - bch2_recalc_capacity(c); +/* stop allocator thread: */ +void bch2_dev_allocator_stop(struct bch_dev *ca) +{ + struct task_struct *p = ca->alloc_thread; + + ca->alloc_thread = NULL; + smp_wmb(); + + /* + * We need an rcu barrier between setting ca->alloc_thread = NULL and + * the thread shutting down to avoid a race with bch2_usage_update() - + * the allocator thread itself does a synchronize_rcu() on exit. + * + * XXX: it would be better to have the rcu barrier be asynchronous + * instead of blocking us here + */ + if (p) + kthread_stop(p); +} + +/* start allocator thread: */ +int bch2_dev_allocator_start(struct bch_dev *ca) +{ + struct task_struct *p; /* - * Don't wake up allocator thread until after adding device to - * allocator groups - otherwise, alloc thread could get a spurious - * -EROFS due to prio_write() -> journal_meta() not finding any devices: + * allocator thread already started? 
*/ - wake_up_process(k); + if (ca->alloc_thread) + return 0; + + p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator"); + if (IS_ERR(p)) + return PTR_ERR(p); + + ca->alloc_thread = p; return 0; } |
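
The sketches below illustrate a few of the mechanisms this diff introduces. First, the new on-disk allocation information: instead of packed prio buckets, each bucket now gets a BCH_ALLOC key in BTREE_ID_ALLOC whose value carries a `fields` bitmask followed by the optional fields that are present, and bch_alloc_val_u64s() sizes the value from that bitmask. The following is a minimal standalone userspace sketch of that sizing and packing scheme, not the kernel code itself; the struct, the ALLOC_FIELD_* constants and the helper names are simplified stand-ins for struct bch_alloc, BCH_ALLOC_FIELD_READ_TIME/WRITE_TIME and get_alloc_field()/put_alloc_field().

/*
 * Sketch: size and pack a variable-width alloc value. Each optional
 * read/write time is a 2-byte little-endian field appended after the
 * fixed header, and the total is rounded up to whole u64s, mirroring
 * bch_alloc_val_u64s() in the diff above.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ALLOC_FIELD_READ_TIME	(1 << 0)	/* stand-in for BCH_ALLOC_FIELD_READ_TIME */
#define ALLOC_FIELD_WRITE_TIME	(1 << 1)	/* stand-in for BCH_ALLOC_FIELD_WRITE_TIME */

struct alloc_val {			/* simplified stand-in for struct bch_alloc */
	uint8_t fields;
	uint8_t gen;
	uint8_t data[8];		/* optional fields, packed in declaration order */
};

/* value size in u64s, derived from which optional fields are present */
static unsigned alloc_val_u64s(const struct alloc_val *a)
{
	unsigned bytes = offsetof(struct alloc_val, data);

	if (a->fields & ALLOC_FIELD_READ_TIME)
		bytes += 2;
	if (a->fields & ALLOC_FIELD_WRITE_TIME)
		bytes += 2;

	return (bytes + 7) / 8;		/* DIV_ROUND_UP(bytes, sizeof(u64)) */
}

/* append one 2-byte little-endian field, advancing the write cursor */
static void put_field16(uint8_t **p, unsigned v)
{
	(*p)[0] = v & 0xff;
	(*p)[1] = (v >> 8) & 0xff;
	*p += 2;
}

/* read one 2-byte little-endian field, advancing the read cursor */
static unsigned get_field16(const uint8_t **p)
{
	unsigned v = (*p)[0] | ((*p)[1] << 8);

	*p += 2;
	return v;
}

int main(void)
{
	struct alloc_val a = {
		.fields	= ALLOC_FIELD_READ_TIME|ALLOC_FIELD_WRITE_TIME,
		.gen	= 3,
	};
	uint8_t *w = a.data;
	const uint8_t *r = a.data;

	put_field16(&w, 1234);		/* read time */
	put_field16(&w, 5678);		/* write time */

	/* 2 header bytes + 2 + 2 field bytes round up to one u64 */
	printf("val u64s: %u\n", alloc_val_u64s(&a));
	printf("read %u write %u\n", get_field16(&r), get_field16(&r));
	return 0;
}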
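Second, the invalidation path: bch2_invalidate_one_bucket() only has the low 16 bits of the journal sequence number in the bucket mark, so it splices them into the current 64-bit sequence and steps back one 16-bit epoch if the result would lie in the future, then tracks the maximum as allocator_journal_seq_flush. A small worked sketch of that reconstruction follows; the function and parameter names are local to the example.

/*
 * Sketch: rebuild a full journal sequence number from the current 64-bit
 * sequence and the 16-bit snapshot stored in a bucket mark, as in
 * bch2_invalidate_one_bucket() above.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t bucket_journal_seq(uint64_t journal_seq, uint16_t mark_seq)
{
	/* replace the low 16 bits of the current sequence with the mark's bits */
	uint64_t bucket_seq = (journal_seq & ~(uint64_t) UINT16_MAX) | mark_seq;

	/* a "future" result means the mark is from the previous 16-bit epoch */
	if (bucket_seq > journal_seq)
		bucket_seq -= 1 << 16;

	return bucket_seq;
}

int main(void)
{
	/* current seq 0x12345; low bits 0x2344 fall in the current epoch */
	printf("%#llx\n", (unsigned long long) bucket_journal_seq(0x12345, 0x2344)); /* 0x12344 */

	/* low bits 0xfff0 would be "ahead", so they belong to the previous epoch */
	printf("%#llx\n", (unsigned long long) bucket_journal_seq(0x12345, 0xfff0)); /* 0xfff0 */
	return 0;
}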
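Third, the LRU sort key: bucket_sort_key() scales read "hotness" into [0, 8), multiplies it by the cached sectors in the bucket, and shifts that above a needs-journal-commit bit and the gc gen gap, so that buckets whose data is wanted or not yet committed are less preferred for invalidation. The sketch below reproduces that arithmetic with illustrative names and values only.

/*
 * Sketch: composite sort key as built by bucket_sort_key() above.
 * Smaller keys correspond to buckets that are better candidates to
 * invalidate.
 */
#include <stdio.h>

static unsigned long sort_key(unsigned prio_read, unsigned min_prio,
			      unsigned clock_hand, unsigned sectors_used,
			      unsigned needs_journal_commit, unsigned gc_gen)
{
	/* time since last read, scaled to [0, 8); larger = more recently read */
	unsigned long hotness =
		(unsigned long) (prio_read - min_prio) * 7 / (clock_hand - min_prio);

	/* how much we want to keep the data in this bucket */
	unsigned long data_wantness = (hotness + 1) * sectors_used;

	return (data_wantness << 9) | (needs_journal_commit << 8) | gc_gen;
}

int main(void)
{
	/* cold, empty bucket with nothing to flush: smallest key, invalidated first */
	printf("%lu\n", sort_key(100, 100, 200, 0, 0, 3));

	/* recently read bucket full of cached data, still needing a journal commit */
	printf("%lu\n", sort_key(199, 100, 200, 128, 1, 3));
	return 0;
}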