Diffstat (limited to 'libbcache/alloc.c')
-rw-r--r-- | libbcache/alloc.c | 1913 |
1 file changed, 0 insertions, 1913 deletions
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
deleted file mode 100644
index 2f892914..00000000
--- a/libbcache/alloc.c
+++ /dev/null
@@ -1,1913 +0,0 @@
-/*
- * Primary bucket allocation code
- *
- * Copyright 2012 Google, Inc.
- *
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- *
- * When we invalidate a bucket, we have to write its new gen to disk and wait
- * for that write to complete before we use it - otherwise after a crash we
- * could have pointers that appeared to be good but pointed to data that had
- * been overwritten.
- *
- * Since the gens and priorities are all stored contiguously on disk, we can
- * batch this up: We fill up the free_inc list with freshly invalidated buckets,
- * call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
- *
- * free_inc isn't the only freelist - if it was, we'd often have to sleep while
- * priorities and gens were being written before we could allocate. c->free is a
- * smaller freelist, and buckets on that list are always ready to be used.
- *
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
- * It's important to ensure that gens don't wrap around - with respect to
- * either the oldest gen in the btree or the gen on disk. This is quite
- * difficult to do in practice, but we explicitly guard against it anyways - if
- * a bucket is in danger of wrapping around we simply skip invalidating it that
- * time around, and we garbage collect or rewrite the priorities sooner than we
- * would have otherwise.
- *
- * bch_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- *
- * invalidate_buckets() drives all the processes described above. It's called
- * from bch_bucket_alloc() and a few other places that need to make sure free
- * buckets are ready.
- *
- * invalidate_buckets_(lru|fifo)() find buckets that are available to be
- * invalidated, and then invalidate them and stick them on the free_inc list -
- * in either lru or fifo order.
- */
-
-#include "bcache.h"
-#include "alloc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "super-io.h"
-
-#include <linux/blkdev.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rcupdate.h>
-#include <trace/events/bcache.h>
-
-static void __bch_bucket_free(struct bch_dev *, struct bucket *);
-static void bch_recalc_min_prio(struct bch_dev *, int);
-
-/* Allocation groups: */
-
-void bch_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca) {
-			grp->nr--;
-			memmove(&grp->d[i],
-				&grp->d[i + 1],
-				(grp->nr - i) * sizeof(grp->d[0]));
-			break;
-		}
-
-	spin_unlock(&grp->lock);
-}
-
-void bch_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca)
-			goto out;
-
-	BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
-
-	grp->d[grp->nr++].dev = ca;
-out:
-	spin_unlock(&grp->lock);
-}
-
-/* Ratelimiting/PD controllers */
-
-static void pd_controllers_update(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(to_delayed_work(work),
-					struct bch_fs,
-					pd_controllers_update);
-	struct bch_dev *ca;
-	unsigned i, iter;
-
-	/* All units are in bytes */
-	u64 faster_tiers_size = 0;
-	u64 faster_tiers_dirty = 0;
-
-	u64 fastest_tier_size = 0;
-	u64 fastest_tier_free = 0;
-	u64 copygc_can_free = 0;
-
-	rcu_read_lock();
-	for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
-		bch_pd_controller_update(&c->tiers[i].pd,
-				div_u64(faster_tiers_size *
					c->tiering_percent, 100),
-				faster_tiers_dirty,
-				-1);
-
-		spin_lock(&c->tiers[i].devs.lock);
-		group_for_each_dev(ca, &c->tiers[i].devs, iter) {
-			struct bch_dev_usage stats = bch_dev_usage_read(ca);
-			unsigned bucket_bits = ca->bucket_bits + 9;
-
-			u64 size = (ca->mi.nbuckets -
-				    ca->mi.first_bucket) << bucket_bits;
-			u64 dirty = stats.buckets_dirty << bucket_bits;
-			u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
-			/*
-			 * Bytes of internal fragmentation, which can be
-			 * reclaimed by copy GC
-			 */
-			s64 fragmented = ((stats.buckets_dirty +
-					   stats.buckets_cached) <<
-					  bucket_bits) -
-				((stats.sectors[S_DIRTY] +
-				  stats.sectors[S_CACHED]) << 9);
-
-			fragmented = max(0LL, fragmented);
-
-			bch_pd_controller_update(&ca->moving_gc_pd,
-						 free, fragmented, -1);
-
-			faster_tiers_size += size;
-			faster_tiers_dirty += dirty;
-
-			if (!c->fastest_tier ||
-			    c->fastest_tier == &c->tiers[i]) {
-				fastest_tier_size += size;
-				fastest_tier_free += free;
-			}
-
-			copygc_can_free += fragmented;
-		}
-		spin_unlock(&c->tiers[i].devs.lock);
-	}
-
-	rcu_read_unlock();
-
-	/*
-	 * Throttle foreground writes if tier 0 is running out of free buckets,
-	 * and either tiering or copygc can free up space.
-	 *
-	 * Target will be small if there isn't any work to do - we don't want to
-	 * throttle foreground writes if we currently have all the free space
-	 * we're ever going to have.
-	 *
-	 * Otherwise, if there's work to do, try to keep 20% of tier0 available
-	 * for foreground writes.
- */
-	if (c->fastest_tier)
-		copygc_can_free = U64_MAX;
-
-	bch_pd_controller_update(&c->foreground_write_pd,
-				 min(copygc_can_free,
-				     div_u64(fastest_tier_size *
-					     c->foreground_target_percent,
-					     100)),
-				 fastest_tier_free,
-				 -1);
-
-	schedule_delayed_work(&c->pd_controllers_update,
-			      c->pd_controllers_update_seconds * HZ);
-}
-
-/*
- * Bucket priorities/gens:
- *
- * For each bucket, we store on disk its
- *   8 bit gen
- *  16 bit priority
- *
- * See alloc.c for an explanation of the gen. The priority is used to implement
- * lru (and in the future other) cache replacement policies; for most purposes
- * it's just an opaque integer.
- *
- * The gens and the priorities don't have a whole lot to do with each other, and
- * it's actually the gens that must be written out at specific times - it's no
- * big deal if the priorities don't get written, if we lose them we just reuse
- * buckets in suboptimal order.
- *
- * On disk they're stored in a packed array, and in as many buckets are required
- * to fit them all. The buckets we use to store them form a list; the journal
- * header points to the first bucket, the first bucket points to the second
- * bucket, et cetera.
- *
- * This code is used by the allocation code; periodically (whenever it runs out
- * of buckets to allocate from) the allocation code will invalidate some
- * buckets, but it can't use those buckets until their new gens are safely on
- * disk.
- */
-
-static int prio_io(struct bch_dev *ca, uint64_t bucket, int op)
-{
-	bio_init(ca->bio_prio);
-	bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META);
-
-	ca->bio_prio->bi_max_vecs = bucket_pages(ca);
-	ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs;
-	ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
-	ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
-	ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
-	bch_bio_map(ca->bio_prio, ca->disk_buckets);
-
-	return submit_bio_wait(ca->bio_prio);
-}
-
-static struct nonce prio_nonce(struct prio_set *p)
-{
-	return (struct nonce) {{
-		[0] = 0,
-		[1] = p->nonce[0],
-		[2] = p->nonce[1],
-		[3] = p->nonce[2] ^ BCH_NONCE_PRIO,
-	}};
-}
-
-static int bch_prio_write(struct bch_dev *ca)
-{
-	struct bch_fs *c = ca->fs;
-	struct journal *j = &c->journal;
-	struct journal_res res = { 0 };
-	bool need_new_journal_entry;
-	int i, ret;
-
-	if (c->opts.nochanges)
-		return 0;
-
-	trace_bcache_prio_write_start(ca);
-
-	atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
-		     &ca->meta_sectors_written);
-
-	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
-		struct bucket *g;
-		struct prio_set *p = ca->disk_buckets;
-		struct bucket_disk *d = p->data;
-		struct bucket_disk *end = d + prios_per_bucket(ca);
-		size_t r;
-
-		for (r = i * prios_per_bucket(ca);
-		     r < ca->mi.nbuckets && d < end;
-		     r++, d++) {
-			g = ca->buckets + r;
-			d->read_prio = cpu_to_le16(g->read_prio);
-			d->write_prio = cpu_to_le16(g->write_prio);
-			d->gen = ca->buckets[r].mark.gen;
-		}
-
-		p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
-		p->magic = cpu_to_le64(pset_magic(c));
-		get_random_bytes(&p->nonce, sizeof(p->nonce));
-
-		spin_lock(&ca->prio_buckets_lock);
-		r = bch_bucket_alloc(ca, RESERVE_PRIO);
-		BUG_ON(!r);
-
-		/*
-		 * goes here before dropping prio_buckets_lock to guard against
-		 * it getting gc'd from under us
-		 */
-		ca->prio_buckets[i] = r;
-		bch_mark_metadata_bucket(ca, ca->buckets + r,
-					 BUCKET_PRIOS, false);
-		spin_unlock(&ca->prio_buckets_lock);
-
-		SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
-
-		bch_encrypt(c, PSET_CSUM_TYPE(p),
-			    prio_nonce(p),
p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - p->csum = bch_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - - ret = prio_io(ca, r, REQ_OP_WRITE); - if (bch_dev_fatal_io_err_on(ret, ca, - "prio write to bucket %zu", r) || - bch_meta_write_fault("prio")) - return ret; - } - - spin_lock(&j->lock); - j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]); - j->nr_prio_buckets = max_t(unsigned, - ca->dev_idx + 1, - j->nr_prio_buckets); - spin_unlock(&j->lock); - - do { - unsigned u64s = jset_u64s(0); - - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) - break; - - ret = bch_journal_res_get(j, &res, u64s, u64s); - if (ret) - return ret; - - need_new_journal_entry = j->buf[res.idx].nr_prio_buckets < - ca->dev_idx + 1; - bch_journal_res_put(j, &res); - - ret = bch_journal_flush_seq(j, res.seq); - if (ret) - return ret; - } while (need_new_journal_entry); - - /* - * Don't want the old priorities to get garbage collected until after we - * finish writing the new ones, and they're journalled - */ - - spin_lock(&ca->prio_buckets_lock); - - for (i = 0; i < prio_buckets(ca); i++) { - if (ca->prio_last_buckets[i]) - __bch_bucket_free(ca, - &ca->buckets[ca->prio_last_buckets[i]]); - - ca->prio_last_buckets[i] = ca->prio_buckets[i]; - } - - spin_unlock(&ca->prio_buckets_lock); - - trace_bcache_prio_write_end(ca); - return 0; -} - -int bch_prio_read(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; - struct bucket_mark new; - struct bch_csum csum; - unsigned bucket_nr = 0; - u64 bucket, expect, got; - size_t b; - int ret = 0; - - spin_lock(&c->journal.lock); - bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]); - spin_unlock(&c->journal.lock); - - /* - * If the device hasn't been used yet, there won't be a prio bucket ptr - */ - if (!bucket) - return 0; - - unfixable_fsck_err_on(bucket < ca->mi.first_bucket || - bucket >= ca->mi.nbuckets, c, - "bad prio bucket %llu", bucket); - - for (b = 0; b < ca->mi.nbuckets; b++, d++) { - if (d == end) { - ca->prio_last_buckets[bucket_nr] = bucket; - bucket_nr++; - - ret = prio_io(ca, bucket, REQ_OP_READ); - if (bch_dev_fatal_io_err_on(ret, ca, - "prior read from bucket %llu", - bucket) || - bch_meta_read_fault("prio")) - return -EIO; - - got = le64_to_cpu(p->magic); - expect = pset_magic(c); - unfixable_fsck_err_on(got != expect, c, - "bad magic (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket); - - unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c, - "prio bucket with unknown csum type %llu bucket %lluu", - PSET_CSUM_TYPE(p), bucket); - - csum = bch_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - unfixable_fsck_err_on(bch_crc_cmp(csum, p->csum), c, - "bad checksum reading prios from bucket %llu", - bucket); - - bch_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - bucket = le64_to_cpu(p->next_bucket); - d = p->data; - } - - ca->buckets[b].read_prio = le16_to_cpu(d->read_prio); - ca->buckets[b].write_prio = le16_to_cpu(d->write_prio); - - bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen); - } - - mutex_lock(&c->bucket_lock); - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - 
mutex_unlock(&c->bucket_lock); - - ret = 0; -fsck_err: - return ret; -} - -#define BUCKET_GC_GEN_MAX 96U - -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. - */ -static int wait_buckets_available(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = -1; - break; - } - - if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) { - if (c->gc_thread) { - trace_bcache_gc_cannot_inc_gens(ca->fs); - atomic_inc(&c->kick_gc); - wake_up_process(ca->fs->gc_thread); - } - - /* - * We are going to wait for GC to wake us up, even if - * bucket counters tell us enough buckets are available, - * because we are actually waiting for GC to rewrite - * nodes with stale pointers - */ - } else if (dev_buckets_available(ca) >= - fifo_free(&ca->free_inc)) - break; - - up_read(&ca->fs->gc_lock); - schedule(); - try_to_freeze(); - down_read(&ca->fs->gc_lock); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) -{ - if (expensive_debug_checks(ca->fs)) { - size_t iter; - long i; - unsigned j; - - for (iter = 0; iter < prio_buckets(ca) * 2; iter++) - BUG_ON(ca->prio_buckets[iter] == bucket); - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} - -/* Bucket heap / gen */ - -void bch_recalc_min_prio(struct bch_dev *ca, int rw) -{ - struct bch_fs *c = ca->fs; - struct prio_clock *clock = &c->prio_clock[rw]; - struct bucket *g; - u16 max_delta = 1; - unsigned i; - - lockdep_assert_held(&c->bucket_lock); - - /* Determine min prio for this particular cache */ - for_each_bucket(g, ca) - max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); - - ca->min_prio[rw] = clock->hand - max_delta; - - /* - * This may possibly increase the min prio for the whole cache, check - * that as well. 
- */ - max_delta = 1; - - for_each_member_device(ca, c, i) - max_delta = max(max_delta, - (u16) (clock->hand - ca->min_prio[rw])); - - clock->min_prio = clock->hand - max_delta; -} - -static void bch_rescale_prios(struct bch_fs *c, int rw) -{ - struct prio_clock *clock = &c->prio_clock[rw]; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_bcache_rescale_prios(c); - - for_each_member_device(ca, c, i) { - for_each_bucket(g, ca) - g->prio[rw] = clock->hand - - (clock->hand - g->prio[rw]) / 2; - - bch_recalc_min_prio(ca, rw); - } -} - -static void bch_inc_clock_hand(struct io_timer *timer) -{ - struct prio_clock *clock = container_of(timer, - struct prio_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, prio_clock[clock->rw]); - u64 capacity; - - mutex_lock(&c->bucket_lock); - - clock->hand++; - - /* if clock cannot be advanced more, rescale prio */ - if (clock->hand == (u16) (clock->min_prio - 1)) - bch_rescale_prios(c, clock->rw); - - mutex_unlock(&c->bucket_lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += capacity >> 10; - - bch_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch_prio_timer_init(struct bch_fs *c, int rw) -{ - struct prio_clock *clock = &c->prio_clock[rw]; - struct io_timer *timer = &clock->rescale; - - clock->rw = rw; - timer->fn = bch_inc_clock_hand; - timer->expire = c->capacity >> 10; -} - -/* - * Background allocation thread: scans for buckets to be invalidated, - * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. - */ - -static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g) -{ - return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX; -} - -static bool bch_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g) -{ - if (!is_available_bucket(READ_ONCE(g->mark))) - return false; - - if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1) - ca->inc_gen_needs_gc++; - - return can_inc_bucket_gen(ca, g); -} - -static void bch_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) -{ - spin_lock(&ca->freelist_lock); - - bch_invalidate_bucket(ca, g); - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; - - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - - spin_unlock(&ca->freelist_lock); -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - The difference between the bucket's current gen and oldest gen of any - * pointer into it, which gives us an indication of the cost of an eventual - * btree GC to rewrite nodes with stale pointers. 
- */ - -#define bucket_sort_key(g) \ -({ \ - unsigned long prio = g->read_prio - ca->min_prio[READ]; \ - prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - \ - ca->min_prio[READ]); \ - \ - (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\ -}) - -static void invalidate_buckets_lru(struct bch_dev *ca) -{ - struct bucket_heap_entry e; - struct bucket *g; - unsigned i; - - mutex_lock(&ca->heap_lock); - - ca->heap.used = 0; - - mutex_lock(&ca->fs->bucket_lock); - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for_each_bucket(g, ca) { - if (!bch_can_invalidate_bucket(ca, g)) - continue; - - bucket_heap_push(ca, g, bucket_sort_key(g)); - } - - /* Sort buckets by physical location on disk for better locality */ - for (i = 0; i < ca->heap.used; i++) { - struct bucket_heap_entry *e = &ca->heap.data[i]; - - e->val = e->g - ca->buckets; - } - - heap_resort(&ca->heap, bucket_max_cmp); - - /* - * If we run out of buckets to invalidate, bch_allocator_thread() will - * kick stuff and retry us - */ - while (!fifo_full(&ca->free_inc) && - heap_pop(&ca->heap, e, bucket_max_cmp)) { - BUG_ON(!bch_can_invalidate_bucket(ca, e.g)); - bch_invalidate_one_bucket(ca, e.g); - } - - mutex_unlock(&ca->fs->bucket_lock); - mutex_unlock(&ca->heap_lock); -} - -static void invalidate_buckets_fifo(struct bch_dev *ca) -{ - struct bucket *g; - size_t checked = 0; - - while (!fifo_full(&ca->free_inc)) { - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - g = ca->buckets + ca->fifo_last_bucket++; - - if (bch_can_invalidate_bucket(ca, g)) - bch_invalidate_one_bucket(ca, g); - - if (++checked >= ca->mi.nbuckets) - return; - } -} - -static void invalidate_buckets_random(struct bch_dev *ca) -{ - struct bucket *g; - size_t checked = 0; - - while (!fifo_full(&ca->free_inc)) { - size_t n = bch_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - g = ca->buckets + n; - - if (bch_can_invalidate_bucket(ca, g)) - bch_invalidate_one_bucket(ca, g); - - if (++checked >= ca->mi.nbuckets / 2) - return; - } -} - -static void invalidate_buckets(struct bch_dev *ca) -{ - ca->inc_gen_needs_gc = 0; - - switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(ca); - break; - case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(ca); - break; - case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(ca); - break; - } -} - -static bool __bch_allocator_push(struct bch_dev *ca, long bucket) -{ - if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_BTREE], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_NONE], bucket)) - goto success; - - return false; -success: - closure_wake_up(&ca->fs->freelist_wait); - return true; -} - -static bool bch_allocator_push(struct bch_dev *ca, long bucket) -{ - bool ret; - - spin_lock(&ca->freelist_lock); - ret = __bch_allocator_push(ca, bucket); - if (ret) - fifo_pop(&ca->free_inc, bucket); - spin_unlock(&ca->freelist_lock); - - return ret; -} - -static void bch_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - u16 last_seq_ondisk = c->journal.last_seq_ondisk; - struct bucket *g; - - for_each_bucket(g, 
ca) { - struct bucket_mark m = READ_ONCE(g->mark); - - if (is_available_bucket(m) && - !m.cached_sectors && - !m.had_metadata && - !bucket_needs_journal_commit(m, last_seq_ondisk)) { - spin_lock(&ca->freelist_lock); - - bch_mark_alloc_bucket(ca, g, true); - g->read_prio = c->prio_clock[READ].hand; - g->write_prio = c->prio_clock[WRITE].hand; - - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - - spin_unlock(&ca->freelist_lock); - - if (fifo_full(&ca->free_inc)) - break; - } - } -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by invalidate_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. - */ -static int bch_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - int ret; - - set_freezable(); - - bch_find_empty_buckets(c, ca); - - while (1) { - /* - * First, we pull buckets off of the free_inc list, possibly - * issue discards to them, then we add the bucket to a - * free list: - */ - - while (!fifo_empty(&ca->free_inc)) { - long bucket = fifo_peek(&ca->free_inc); - - /* - * Don't remove from free_inc until after it's added - * to freelist, so gc doesn't miss it while we've - * dropped bucket lock - */ - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (bch_allocator_push(ca, bucket)) - break; - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - goto out; - } - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - } - - down_read(&c->gc_lock); - - /* - * See if we have buckets we can reuse without invalidating them - * or forcing a journal commit: - */ - //bch_find_empty_buckets(c, ca); - - if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { - up_read(&c->gc_lock); - continue; - } - - /* We've run out of free buckets! */ - - while (!fifo_full(&ca->free_inc)) { - if (wait_buckets_available(ca)) { - up_read(&c->gc_lock); - goto out; - } - - /* - * Find some buckets that we can invalidate, either - * they're completely unused, or only contain clean data - * that's been written back to the backing device or - * another cache tier - */ - - invalidate_buckets(ca); - trace_bcache_alloc_batch(ca, fifo_used(&ca->free_inc), - ca->free_inc.size); - } - - up_read(&c->gc_lock); - - /* - * free_inc is full of newly-invalidated buckets, must write out - * prios and gens before they can be re-used - */ - ret = bch_prio_write(ca); - if (ret) { - /* - * Emergency read only - allocator thread has to - * shutdown. - * - * N.B. we better be going into RO mode, else - * allocations would hang indefinitely - whatever - * generated the error will have sent us into RO mode. 
- * - * Clear out the free_inc freelist so things are - * consistent-ish: - */ - spin_lock(&ca->freelist_lock); - while (!fifo_empty(&ca->free_inc)) { - long bucket; - - fifo_pop(&ca->free_inc, bucket); - bch_mark_free_bucket(ca, ca->buckets + bucket); - } - spin_unlock(&ca->freelist_lock); - goto out; - } - } -out: - /* - * Avoid a race with bch_usage_update() trying to wake us up after - * we've exited: - */ - synchronize_rcu(); - return 0; -} - -/* Allocation */ - -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -size_t bch_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve) -{ - struct bucket *g; - long r; - - spin_lock(&ca->freelist_lock); - if (fifo_pop(&ca->free[RESERVE_NONE], r) || - fifo_pop(&ca->free[reserve], r)) - goto out; - - spin_unlock(&ca->freelist_lock); - - trace_bcache_bucket_alloc_fail(ca, reserve); - return 0; -out: - verify_not_on_freelist(ca, r); - spin_unlock(&ca->freelist_lock); - - trace_bcache_bucket_alloc(ca, reserve); - - bch_wake_allocator(ca); - - g = ca->buckets + r; - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; - - return r; -} - -static void __bch_bucket_free(struct bch_dev *ca, struct bucket *g) -{ - bch_mark_free_bucket(ca, g); - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; -} - -enum bucket_alloc_ret { - ALLOC_SUCCESS, - NO_DEVICES, /* -EROFS */ - FREELIST_EMPTY, /* Allocator thread not keeping up */ -}; - -static void recalc_alloc_group_weights(struct bch_fs *c, - struct dev_group *devs) -{ - struct bch_dev *ca; - u64 available_buckets = 1; /* avoid a divide by zero... */ - unsigned i; - - for (i = 0; i < devs->nr; i++) { - ca = devs->d[i].dev; - - devs->d[i].weight = dev_buckets_free(ca); - available_buckets += devs->d[i].weight; - } - - for (i = 0; i < devs->nr; i++) { - const unsigned min_weight = U32_MAX >> 4; - const unsigned max_weight = U32_MAX; - - devs->d[i].weight = - min_weight + - div64_u64(devs->d[i].weight * - devs->nr * - (max_weight - min_weight), - available_buckets); - devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); - } -} - -static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c, - struct open_bucket *ob, - enum alloc_reserve reserve, - unsigned nr_replicas, - struct dev_group *devs, - long *devs_used) -{ - enum bucket_alloc_ret ret; - unsigned fail_idx = -1, i; - unsigned available = 0; - - BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); - - if (ob->nr_ptrs >= nr_replicas) - return ALLOC_SUCCESS; - - spin_lock(&devs->lock); - - for (i = 0; i < devs->nr; i++) - available += !test_bit(devs->d[i].dev->dev_idx, - devs_used); - - recalc_alloc_group_weights(c, devs); - - i = devs->cur_device; - - while (ob->nr_ptrs < nr_replicas) { - struct bch_dev *ca; - u64 bucket; - - if (!available) { - ret = NO_DEVICES; - goto err; - } - - i++; - i %= devs->nr; - - ret = FREELIST_EMPTY; - if (i == fail_idx) - goto err; - - ca = devs->d[i].dev; - - if (test_bit(ca->dev_idx, devs_used)) - continue; - - if (fail_idx == -1 && - get_random_int() > devs->d[i].weight) - continue; - - bucket = bch_bucket_alloc(ca, reserve); - if (!bucket) { - if (fail_idx == -1) - fail_idx = i; - continue; - } - - /* - * open_bucket_add_buckets expects new pointers at the head of - * the list: - */ - memmove(&ob->ptrs[1], - &ob->ptrs[0], - ob->nr_ptrs * sizeof(ob->ptrs[0])); - memmove(&ob->ptr_offset[1], - &ob->ptr_offset[0], - ob->nr_ptrs * 
sizeof(ob->ptr_offset[0])); - ob->nr_ptrs++; - ob->ptrs[0] = (struct bch_extent_ptr) { - .gen = ca->buckets[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), - .dev = ca->dev_idx, - }; - ob->ptr_offset[0] = 0; - - __set_bit(ca->dev_idx, devs_used); - available--; - devs->cur_device = i; - } - - ret = ALLOC_SUCCESS; -err: - EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); - spin_unlock(&devs->lock); - return ret; -} - -static enum bucket_alloc_ret __bch_bucket_alloc_set(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob, - unsigned nr_replicas, - enum alloc_reserve reserve, - long *devs_used) -{ - struct bch_tier *tier; - /* - * this should implement policy - for a given type of allocation, decide - * which devices to allocate from: - * - * XXX: switch off wp->type and do something more intelligent here - */ - if (wp->group) - return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - wp->group, devs_used); - - /* foreground writes: prefer fastest tier: */ - tier = READ_ONCE(c->fastest_tier); - if (tier) - bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &tier->devs, devs_used); - - return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &c->all_devs, devs_used); -} - -static int bch_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob, unsigned nr_replicas, - enum alloc_reserve reserve, long *devs_used, - struct closure *cl) -{ - bool waiting = false; - - while (1) { - switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs_used)) { - case ALLOC_SUCCESS: - if (waiting) - closure_wake_up(&c->freelist_wait); - - return 0; - - case NO_DEVICES: - if (waiting) - closure_wake_up(&c->freelist_wait); - return -EROFS; - - case FREELIST_EMPTY: - if (!cl || waiting) - trace_bcache_freelist_empty_fail(c, - reserve, cl); - - if (!cl) - return -ENOSPC; - - if (waiting) - return -EAGAIN; - - /* Retry allocation after adding ourself to waitlist: */ - closure_wait(&c->freelist_wait, cl); - waiting = true; - break; - default: - BUG(); - } - } -} - -/* Open buckets: */ - -/* - * Open buckets represent one or more buckets (on multiple devices) that are - * currently being allocated from. They serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. 
- */ - -static void __bch_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - const struct bch_extent_ptr *ptr; - - lockdep_assert_held(&c->open_buckets_lock); - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - - bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false); - } - - ob->nr_ptrs = 0; - - list_move(&ob->list, &c->open_buckets_free); - c->open_buckets_nr_free++; - closure_wake_up(&c->open_buckets_wait); -} - -void bch_open_bucket_put(struct bch_fs *c, struct open_bucket *b) -{ - if (atomic_dec_and_test(&b->pin)) { - spin_lock(&c->open_buckets_lock); - __bch_open_bucket_put(c, b); - spin_unlock(&c->open_buckets_lock); - } -} - -static struct open_bucket *bch_open_bucket_get(struct bch_fs *c, - unsigned nr_reserved, - struct closure *cl) -{ - struct open_bucket *ret; - - spin_lock(&c->open_buckets_lock); - - if (c->open_buckets_nr_free > nr_reserved) { - BUG_ON(list_empty(&c->open_buckets_free)); - ret = list_first_entry(&c->open_buckets_free, - struct open_bucket, list); - list_move(&ret->list, &c->open_buckets_open); - BUG_ON(ret->nr_ptrs); - - atomic_set(&ret->pin, 1); /* XXX */ - ret->has_full_ptrs = false; - - c->open_buckets_nr_free--; - trace_bcache_open_bucket_alloc(c, cl); - } else { - trace_bcache_open_bucket_alloc_fail(c, cl); - - if (cl) { - closure_wait(&c->open_buckets_wait, cl); - ret = ERR_PTR(-EAGAIN); - } else - ret = ERR_PTR(-ENOSPC); - } - - spin_unlock(&c->open_buckets_lock); - - return ret; -} - -static unsigned ob_ptr_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - struct bch_extent_ptr *ptr) -{ - struct bch_dev *ca = c->devs[ptr->dev]; - unsigned i = ptr - ob->ptrs; - unsigned bucket_size = ca->mi.bucket_size; - unsigned used = (ptr->offset & (bucket_size - 1)) + - ob->ptr_offset[i]; - - BUG_ON(used > bucket_size); - - return bucket_size - used; -} - -static unsigned open_bucket_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - unsigned nr_replicas) -{ - unsigned i, sectors_free = UINT_MAX; - - for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++) - sectors_free = min(sectors_free, - ob_ptr_sectors_free(c, ob, &ob->ptrs[i])); - - return sectors_free != UINT_MAX ? 
sectors_free : 0; -} - -static void open_bucket_copy_unused_ptrs(struct bch_fs *c, - struct open_bucket *new, - struct open_bucket *old) -{ - unsigned i; - - for (i = 0; i < old->nr_ptrs; i++) - if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) { - struct bch_extent_ptr tmp = old->ptrs[i]; - - tmp.offset += old->ptr_offset[i]; - new->ptrs[new->nr_ptrs] = tmp; - new->ptr_offset[new->nr_ptrs] = 0; - new->nr_ptrs++; - } -} - -static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) -{ -#ifdef CONFIG_BCACHE_DEBUG - const struct bch_extent_ptr *ptr; - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - - BUG_ON(ptr_stale(ca, ptr)); - } -#endif -} - -/* Sector allocator */ - -static struct open_bucket *lock_writepoint(struct bch_fs *c, - struct write_point *wp) -{ - struct open_bucket *ob; - - while ((ob = ACCESS_ONCE(wp->b))) { - mutex_lock(&ob->lock); - if (wp->b == ob) - break; - - mutex_unlock(&ob->lock); - } - - return ob; -} - -static int open_bucket_add_buckets(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; - unsigned i; - int ret; - - /* - * We might be allocating pointers to add to an existing extent - * (tiering/copygc/migration) - if so, some of the pointers in our - * existing open bucket might duplicate devices we already have. This is - * moderately annoying. - */ - - /* Short circuit all the fun stuff if posssible: */ - if (ob->nr_ptrs >= nr_replicas) - return 0; - - memset(devs_used, 0, sizeof(devs_used)); - - for (i = 0; i < ob->nr_ptrs; i++) - __set_bit(ob->ptrs[i].dev, devs_used); - - ret = bch_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs_used, cl); - - if (ret == -EROFS && - ob->nr_ptrs >= nr_replicas_required) - ret = 0; - - return ret; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -struct open_bucket *bch_alloc_sectors_start(struct bch_fs *c, - struct write_point *wp, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct open_bucket *ob; - unsigned open_buckets_reserved = wp == &c->btree_write_point - ? 0 : BTREE_NODE_RESERVE; - int ret; - - BUG_ON(!reserve); - BUG_ON(!nr_replicas); -retry: - ob = lock_writepoint(c, wp); - - /* - * If ob->sectors_free == 0, one or more of the buckets ob points to is - * full. We can't drop pointers from an open bucket - garbage collection - * still needs to find them; instead, we must allocate a new open bucket - * and copy any pointers to non-full buckets into the new open bucket. 
- */ - if (!ob || ob->has_full_ptrs) { - struct open_bucket *new_ob; - - new_ob = bch_open_bucket_get(c, open_buckets_reserved, cl); - if (IS_ERR(new_ob)) - return new_ob; - - mutex_lock(&new_ob->lock); - - /* - * We point the write point at the open_bucket before doing the - * allocation to avoid a race with shutdown: - */ - if (race_fault() || - cmpxchg(&wp->b, ob, new_ob) != ob) { - /* We raced: */ - mutex_unlock(&new_ob->lock); - bch_open_bucket_put(c, new_ob); - - if (ob) - mutex_unlock(&ob->lock); - goto retry; - } - - if (ob) { - open_bucket_copy_unused_ptrs(c, new_ob, ob); - mutex_unlock(&ob->lock); - bch_open_bucket_put(c, ob); - } - - ob = new_ob; - } - - ret = open_bucket_add_buckets(c, wp, ob, nr_replicas, - nr_replicas_required, - reserve, cl); - if (ret) { - mutex_unlock(&ob->lock); - return ERR_PTR(ret); - } - - ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); - - BUG_ON(!ob->sectors_free); - verify_not_stale(c, ob); - - return ob; -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, - unsigned nr_replicas, struct open_bucket *ob, - unsigned sectors) -{ - struct bch_extent_ptr tmp; - bool has_data = false; - unsigned i; - - /* - * We're keeping any existing pointer k has, and appending new pointers: - * __bch_write() will only write to the pointers we add here: - */ - - BUG_ON(sectors > ob->sectors_free); - - /* didn't use all the ptrs: */ - if (nr_replicas < ob->nr_ptrs) - has_data = true; - - for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) { - EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); - - tmp = ob->ptrs[i]; - tmp.cached = bkey_extent_is_cached(&e->k); - tmp.offset += ob->ptr_offset[i]; - extent_ptr_append(e, tmp); - - ob->ptr_offset[i] += sectors; - - this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors); - } -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch_alloc_sectors_done(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob) -{ - bool has_data = false; - unsigned i; - - for (i = 0; i < ob->nr_ptrs; i++) { - if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i])) - ob->has_full_ptrs = true; - else - has_data = true; - } - - if (likely(has_data)) - atomic_inc(&ob->pin); - else - BUG_ON(xchg(&wp->b, NULL) != ob); - - mutex_unlock(&ob->lock); -} - -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates k->size and k->offset (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, k->size indicates how many - * sectors were actually allocated. - * - * Return codes: - * - -EAGAIN: closure was added to waitlist - * - -ENOSPC: out of space and no closure provided - * - * @c - filesystem. - * @wp - write point to use for allocating sectors. - * @k - key to return the allocated space information. 
- * @cl - closure to wait for a bucket - */ -struct open_bucket *bch_alloc_sectors(struct bch_fs *c, - struct write_point *wp, - struct bkey_i_extent *e, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct open_bucket *ob; - - ob = bch_alloc_sectors_start(c, wp, nr_replicas, - nr_replicas_required, - reserve, cl); - if (IS_ERR_OR_NULL(ob)) - return ob; - - if (e->k.size > ob->sectors_free) - bch_key_resize(&e->k, ob->sectors_free); - - bch_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size); - - bch_alloc_sectors_done(c, wp, ob); - - return ob; -} - -/* Startup/shutdown (ro/rw): */ - -void bch_recalc_capacity(struct bch_fs *c) -{ - struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; - struct bch_dev *ca; - u64 total_capacity, capacity = 0, reserved_sectors = 0; - unsigned long ra_pages = 0; - unsigned i, j; - - for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = - blk_get_backing_dev_info(ca->disk_sb.bdev); - - ra_pages += bdi->ra_pages; - } - - c->bdi.ra_pages = ra_pages; - - /* Find fastest, slowest tiers with devices: */ - - for (tier = c->tiers; - tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { - if (!tier->devs.nr) - continue; - if (!fastest_tier) - fastest_tier = tier; - slowest_tier = tier; - } - - c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; - - c->promote_write_point.group = &fastest_tier->devs; - - if (!fastest_tier) - goto set_capacity; - - /* - * Capacity of the filesystem is the capacity of all the devices in the - * slowest (highest) tier - we don't include lower tier devices. - */ - spin_lock(&slowest_tier->devs.lock); - group_for_each_dev(ca, &slowest_tier->devs, i) { - size_t reserve = 0; - - /* - * We need to reserve buckets (from the number - * of currently available buckets) against - * foreground writes so that mainly copygc can - * make forward progress. - * - * We need enough to refill the various reserves - * from scratch - copygc will use its entire - * reserve all at once, then run against when - * its reserve is refilled (from the formerly - * available buckets). - * - * This reserve is just used when considering if - * allocations for foreground writes must wait - - * not -ENOSPC calculations. 
- */ - for (j = 0; j < RESERVE_NONE; j++) - reserve += ca->free[j].size; - - reserve += ca->free_inc.size; - - reserve += ARRAY_SIZE(c->write_points); - - if (ca->mi.tier) - reserve += 1; /* tiering write point */ - reserve += 1; /* btree write point */ - - reserved_sectors += reserve << ca->bucket_bits; - - capacity += (ca->mi.nbuckets - - ca->mi.first_bucket) << - ca->bucket_bits; - } - spin_unlock(&slowest_tier->devs.lock); -set_capacity: - total_capacity = capacity; - - capacity *= (100 - c->opts.gc_reserve_percent); - capacity = div64_u64(capacity, 100); - - BUG_ON(capacity + reserved_sectors > total_capacity); - - c->capacity = capacity; - - if (c->capacity) { - bch_io_timer_add(&c->io_clock[READ], - &c->prio_clock[READ].rescale); - bch_io_timer_add(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); - } else { - bch_io_timer_del(&c->io_clock[READ], - &c->prio_clock[READ].rescale); - bch_io_timer_del(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); - } - - /* Wake up case someone was waiting for buckets */ - closure_wake_up(&c->freelist_wait); -} - -static void bch_stop_write_point(struct bch_dev *ca, - struct write_point *wp) -{ - struct bch_fs *c = ca->fs; - struct open_bucket *ob; - struct bch_extent_ptr *ptr; - - ob = lock_writepoint(c, wp); - if (!ob) - return; - - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) - goto found; - - mutex_unlock(&ob->lock); - return; -found: - BUG_ON(xchg(&wp->b, NULL) != ob); - mutex_unlock(&ob->lock); - - /* Drop writepoint's ref: */ - bch_open_bucket_put(c, ob); -} - -static bool bch_dev_has_open_write_point(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_extent_ptr *ptr; - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) - if (atomic_read(&ob->pin)) { - mutex_lock(&ob->lock); - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) { - mutex_unlock(&ob->lock); - return true; - } - mutex_unlock(&ob->lock); - } - - return false; -} - -/* device goes ro: */ -void bch_dev_allocator_stop(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct task_struct *p; - struct closure cl; - unsigned i; - - closure_init_stack(&cl); - - /* First, remove device from allocation groups: */ - - bch_dev_group_remove(tier, ca); - bch_dev_group_remove(&c->all_devs, ca); - - bch_recalc_capacity(c); - - /* - * Stopping the allocator thread comes after removing from allocation - * groups, else pending allocations will hang: - */ - - p = ca->alloc_thread; - ca->alloc_thread = NULL; - smp_wmb(); - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid a race with bch_usage_update() - - * the allocator thread itself does a synchronize_rcu() on exit. - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - if (p) { - kthread_stop(p); - put_task_struct(p); - } - - /* Next, close write points that point to this device... 
*/ - - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch_stop_write_point(ca, &c->write_points[i]); - - bch_stop_write_point(ca, &ca->copygc_write_point); - bch_stop_write_point(ca, &c->promote_write_point); - bch_stop_write_point(ca, &ca->tiering_write_point); - bch_stop_write_point(ca, &c->migration_write_point); - bch_stop_write_point(ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch_open_bucket_put(c, a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - /* Avoid deadlocks.. */ - - closure_wake_up(&c->freelist_wait); - wake_up(&c->journal.wait); - - /* Now wait for any in flight writes: */ - - while (1) { - closure_wait(&c->open_buckets_wait, &cl); - - if (!bch_dev_has_open_write_point(ca)) { - closure_wake_up(&c->open_buckets_wait); - break; - } - - closure_sync(&cl); - } -} - -/* - * Startup the allocator thread for transition to RW mode: - */ -int bch_dev_allocator_start(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct bch_sb_field_journal *journal_buckets; - bool has_journal; - struct task_struct *k; - - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - k = kthread_create(bch_allocator_thread, ca, "bcache_allocator"); - if (IS_ERR(k)) - return 0; - - get_task_struct(k); - ca->alloc_thread = k; - - bch_dev_group_add(tier, ca); - bch_dev_group_add(&c->all_devs, ca); - - mutex_lock(&c->sb_lock); - journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); - has_journal = bch_nr_journal_buckets(journal_buckets) >= - BCH_JOURNAL_BUCKETS_MIN; - mutex_unlock(&c->sb_lock); - - if (has_journal) - bch_dev_group_add(&c->journal.devs, ca); - - bch_recalc_capacity(c); - - /* - * Don't wake up allocator thread until after adding device to - * allocator groups - otherwise, alloc thread could get a spurious - * -EROFS due to prio_write() -> journal_meta() not finding any devices: - */ - wake_up_process(k); - return 0; -} - -void bch_fs_allocator_init(struct bch_fs *c) -{ - unsigned i; - - INIT_LIST_HEAD(&c->open_buckets_open); - INIT_LIST_HEAD(&c->open_buckets_free); - spin_lock_init(&c->open_buckets_lock); - bch_prio_timer_init(c, READ); - bch_prio_timer_init(c, WRITE); - - /* open bucket 0 is a sentinal NULL: */ - mutex_init(&c->open_buckets[0].lock); - INIT_LIST_HEAD(&c->open_buckets[0].list); - - for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) { - mutex_init(&c->open_buckets[i].lock); - c->open_buckets_nr_free++; - list_add(&c->open_buckets[i].list, &c->open_buckets_free); - } - - spin_lock_init(&c->all_devs.lock); - - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) - spin_lock_init(&c->tiers[i].devs.lock); - - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - c->write_points[i].throttle = true; - - c->pd_controllers_update_seconds = 5; - INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); - - spin_lock_init(&c->foreground_write_pd_lock); - bch_pd_controller_init(&c->foreground_write_pd); - /* - * We do not want the write rate to have an effect on the computed - * rate, for two reasons: - * - * We do not call bch_ratelimit_delay() at all if the write rate - * exceeds 1GB/s. In this case, the PD controller will think we are - * not "keeping up" and not change the rate. 
- */
-	c->foreground_write_pd.backpressure = 0;
-	init_timer(&c->foreground_write_wakeup);
-
-	c->foreground_write_wakeup.data = (unsigned long) c;
-	c->foreground_write_wakeup.function = bch_wake_delayed_writes;
-}
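The deleted file's header comment describes the allocator's core invariant: a btree pointer is only considered valid while the 8-bit gen stored in the pointer matches the gen of the bucket it points into, so a bucket can be reused simply by bumping its gen. Below is a minimal standalone sketch of that rule; toy_bucket, toy_ptr and toy_ptr_stale() are hypothetical stand-ins, not the file's real struct bucket / struct bch_extent_ptr, which express the same idea via ptr_stale(), bucket gen marks and the BUCKET_GC_GEN_MAX bound.

/*
 * Illustrative sketch only - these types are hypothetical simplifications,
 * not part of the deleted libbcache/alloc.c.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_bucket {
	uint8_t gen;		/* bumped each time the bucket is invalidated and reused */
};

struct toy_ptr {
	uint64_t bucket;	/* index of the bucket the pointer lands in */
	uint8_t  gen;		/* copy of the bucket's gen taken at allocation time */
};

/*
 * A pointer is stale once the bucket's gen no longer matches the gen
 * recorded in the pointer, i.e. the bucket has been invalidated (and
 * possibly rewritten) since the pointer was created. The real code keeps
 * the distance between the two gens bounded (BUCKET_GC_GEN_MAX) so the
 * 8-bit counter cannot wrap far enough to alias an old pointer.
 */
static bool toy_ptr_stale(const struct toy_bucket *buckets,
			  const struct toy_ptr *ptr)
{
	return buckets[ptr->bucket].gen != ptr->gen;
}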