author     Kent Overstreet <kent.overstreet@gmail.com>   2017-12-27 16:41:52 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>   2017-12-28 02:41:05 -0500
commit     b1ee6ae0172cdc62d330109fdb2ac30e69ec3510 (patch)
tree       bdd4a2f8b226e31188e996f454965fe23f891707
parent     bb91fd783c9c15f10410cc463f6e0c3413cb714e (diff)
bcachefs: locking changes for device resize
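The locking model this commit introduces: the per-device bucket array moves behind an RCU pointer (struct bucket_array __rcu *buckets), and holding any one of c->usage_lock, c->gc_lock, or the new per-device bucket_lock rwsem pins it against a concurrent resize. A minimal sketch of the reader pattern callers are converted to (example_walk is a hypothetical name; the body mirrors bch2_bucket_seq_cleanup() and bch2_copygc() in the diff below):

    static void example_walk(struct bch_fs *c, struct bch_dev *ca)
    {
        struct bucket_array *buckets;
        struct bucket *g;

        down_read(&ca->bucket_lock);    /* pin the array against resize */
        buckets = bucket_array(ca);     /* rcu_dereference_check() inside */

        for_each_bucket(g, buckets) {
            struct bucket_mark m = READ_ONCE(g->mark);

            /* ... inspect m; indices stay meaningful across the
             * unlock, pointers into the array do not ... */
            (void) m;
        }

        up_read(&ca->bucket_lock);
    }

This is also why most internal interfaces below switch from passing struct bucket * to passing a size_t bucket index: an index can be revalidated against whatever array is current, a pointer cannot.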
 fs/bcachefs/alloc.c         | 351
 fs/bcachefs/alloc.h         |   1
 fs/bcachefs/alloc_types.h   |   3
 fs/bcachefs/bcachefs.h      |  35
 fs/bcachefs/btree_gc.c      |  49
 fs/bcachefs/buckets.c       |  84
 fs/bcachefs/buckets.h       |  72
 fs/bcachefs/buckets_types.h |   7
 fs/bcachefs/extents.c       |  25
 fs/bcachefs/fifo.h          |  12
 fs/bcachefs/io.c            |   5
 fs/bcachefs/journal.c       |   3
 fs/bcachefs/movinggc.c      |  22
 fs/bcachefs/super.c         |  18
 fs/bcachefs/sysfs.c         |  26

 15 files changed, 436 insertions(+), 277 deletions(-)
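Note that this patch only lays the groundwork: every ca->buckets dereference is funneled through bucket_array()/bucket() so that a later change can swap the array out. A sketch of what such a swap can look like under the rules established here (hypothetical function, not part of this commit; a real resize would also have to exclude usage_lock and gc_lock holders, per the bcachefs.h comment below):

    static int example_resize_buckets(struct bch_dev *ca, size_t nbuckets)
    {
        struct bucket_array *old, *new;

        new = kvpmalloc(sizeof(*new) + nbuckets * sizeof(new->b[0]),
                        GFP_KERNEL|__GFP_ZERO);
        if (!new)
            return -ENOMEM;

        new->first_bucket = ca->mi.first_bucket;
        new->nbuckets     = nbuckets;

        down_write(&ca->bucket_lock);   /* excludes all array walkers */
        old = rcu_dereference_protected(ca->buckets,
                        lockdep_is_held(&ca->bucket_lock));
        memcpy(new->b, old->b,
               min(nbuckets, old->nbuckets) * sizeof(new->b[0]));
        rcu_assign_pointer(ca->buckets, new);
        up_write(&ca->bucket_lock);

        /* lockless ptr_stale() readers may still hold the old array: */
        synchronize_rcu();
        kvpfree(old, sizeof(*old) + old->nbuckets * sizeof(old->b[0]));
        return 0;
    }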
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
index 53d764d97d74..ec02adc0d20c 100644
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
@@ -154,6 +154,8 @@ static void pd_controllers_update(struct work_struct *work)
                           c->pd_controllers_update_seconds * HZ);
 }
 
+/* Persistent alloc info: */
+
 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 {
     unsigned bytes = offsetof(struct bch_alloc, data);
@@ -262,7 +264,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
     if (a.k->p.offset >= ca->mi.nbuckets)
         return;
 
-    g = ca->buckets + a.k->p.offset;
+    lg_local_lock(&c->usage_lock);
+
+    g = bucket(ca, a.k->p.offset);
     bucket_cmpxchg(g, new, ({
         new.gen = a.v->gen;
         new.gen_valid = 1;
@@ -273,6 +277,8 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
         g->prio[READ] = get_alloc_field(&d, 2);
     if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
         g->prio[WRITE] = get_alloc_field(&d, 2);
+
+    lg_local_unlock(&c->usage_lock);
 }
 
 int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
@@ -306,35 +312,45 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
     }
 
     mutex_lock(&c->prio_clock[READ].lock);
-    for_each_member_device(ca, c, i)
+    for_each_member_device(ca, c, i) {
+        down_read(&ca->bucket_lock);
         bch2_recalc_min_prio(c, ca, READ);
+        up_read(&ca->bucket_lock);
+    }
     mutex_unlock(&c->prio_clock[READ].lock);
 
     mutex_lock(&c->prio_clock[WRITE].lock);
-    for_each_member_device(ca, c, i)
+    for_each_member_device(ca, c, i) {
+        down_read(&ca->bucket_lock);
         bch2_recalc_min_prio(c, ca, WRITE);
+        up_read(&ca->bucket_lock);
+    }
     mutex_unlock(&c->prio_clock[WRITE].lock);
 
     return 0;
 }
 
 static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
-                                  struct bucket *g, struct btree_iter *iter,
+                                  size_t b, struct btree_iter *iter,
                                   u64 *journal_seq)
 {
     struct bucket_mark m;
     __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
+    struct bucket *g;
     struct bkey_i_alloc *a;
     u8 *d;
     int ret;
 
-    bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets));
+    bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
 
     do {
         ret = bch2_btree_iter_traverse(iter);
         if (ret)
             break;
 
+        lg_local_lock(&c->usage_lock);
+        g = bucket(ca, b);
+
         /* read mark under btree node lock: */
         m = READ_ONCE(g->mark);
         a = bkey_alloc_init(&alloc_key.k);
@@ -348,8 +364,8 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
             put_alloc_field(&d, 2, g->prio[READ]);
         if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
             put_alloc_field(&d, 2, g->prio[WRITE]);
+        lg_local_unlock(&c->usage_lock);
 
-        bch2_btree_iter_set_pos(iter, a->k.p);
         ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
                                    BTREE_INSERT_ATOMIC|
                                    BTREE_INSERT_NOFAIL|
@@ -366,7 +382,6 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
 {
     struct bch_dev *ca;
-    struct bucket *g;
     struct btree_iter iter;
     int ret;
 
@@ -378,12 +393,10 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
     if (pos.offset >= ca->mi.nbuckets)
         return 0;
 
-    g = ca->buckets + pos.offset;
-
     bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
                          BTREE_ITER_INTENT);
 
-    ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL);
+    ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL);
     bch2_btree_iter_unlock(&iter);
     return ret;
 }
@@ -397,78 +410,26 @@ static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
     bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
                          BTREE_ITER_INTENT);
 
-    for_each_set_bit(bucket, ca->bucket_dirty, ca->mi.nbuckets) {
-        ret = __bch2_alloc_write_key(c, ca, ca->buckets + bucket,
-                                     &iter, journal_seq);
+    down_read(&ca->bucket_lock);
+    for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
+        ret = __bch2_alloc_write_key(c, ca, bucket, &iter, journal_seq);
         if (ret)
             break;
 
-        clear_bit(bucket, ca->bucket_dirty);
+        clear_bit(bucket, ca->buckets_dirty);
     }
+    up_read(&ca->bucket_lock);
 
     bch2_btree_iter_unlock(&iter);
     return ret;
 }
 
-#define BUCKET_GC_GEN_MAX   96U
-
-/**
- * wait_buckets_available - wait on reclaimable buckets
- *
- * If there aren't enough available buckets to fill up free_inc, wait until
- * there are.
- */
-static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
-{
-    unsigned long gc_count = c->gc_count;
-    int ret = 0;
-
-    while (1) {
-        set_current_state(TASK_INTERRUPTIBLE);
-        if (kthread_should_stop()) {
-            ret = -1;
-            break;
-        }
-
-        if (gc_count != c->gc_count)
-            ca->inc_gen_really_needs_gc = 0;
-
-        if ((ssize_t) (dev_buckets_available(c, ca) -
-                       ca->inc_gen_really_needs_gc) >=
-            (ssize_t) fifo_free(&ca->free_inc))
-            break;
-
-        up_read(&c->gc_lock);
-        schedule();
-        try_to_freeze();
-        down_read(&c->gc_lock);
-    }
-
-    __set_current_state(TASK_RUNNING);
-    return ret;
-}
-
-static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
-                                   size_t bucket)
-{
-    if (expensive_debug_checks(c)) {
-        size_t iter;
-        long i;
-        unsigned j;
-
-        for (j = 0; j < RESERVE_NR; j++)
-            fifo_for_each_entry(i, &ca->free[j], iter)
-                BUG_ON(i == bucket);
-        fifo_for_each_entry(i, &ca->free_inc, iter)
-            BUG_ON(i == bucket);
-    }
-}
-
-/* Bucket heap / gen */
+/* Bucket IO clocks: */
 
 static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
 {
     struct prio_clock *clock = &c->prio_clock[rw];
+    struct bucket_array *buckets = bucket_array(ca);
     struct bucket *g;
     u16 max_delta = 1;
     unsigned i;
@@ -476,7 +437,7 @@ static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
     lockdep_assert_held(&c->prio_clock[rw].lock);
 
     /* Determine min prio for this particular device */
-    for_each_bucket(g, ca)
+    for_each_bucket(g, buckets)
         max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
 
     ca->min_prio[rw] = clock->hand - max_delta;
@@ -497,6 +458,7 @@ static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
 static void bch2_rescale_prios(struct bch_fs *c, int rw)
 {
     struct prio_clock *clock = &c->prio_clock[rw];
+    struct bucket_array *buckets;
     struct bch_dev *ca;
     struct bucket *g;
     unsigned i;
@@ -504,20 +466,25 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
     trace_rescale_prios(c);
 
     for_each_member_device(ca, c, i) {
-        for_each_bucket(g, ca)
+        down_read(&ca->bucket_lock);
+        buckets = bucket_array(ca);
+
+        for_each_bucket(g, buckets)
             g->prio[rw] = clock->hand -
-                (clock->hand - g->prio[rw]) / 2;
+            (clock->hand - g->prio[rw]) / 2;
 
         bch2_recalc_min_prio(c, ca, rw);
+
+        up_read(&ca->bucket_lock);
     }
 }
 
 static void bch2_inc_clock_hand(struct io_timer *timer)
 {
     struct prio_clock *clock = container_of(timer,
-                    struct prio_clock, rescale);
+                        struct prio_clock, rescale);
     struct bch_fs *c = container_of(clock,
-                    struct bch_fs, prio_clock[clock->rw]);
+                        struct bch_fs, prio_clock[clock->rw]);
     u64 capacity;
 
     mutex_lock(&clock->lock);
@@ -559,50 +526,106 @@ static void bch2_prio_timer_init(struct bch_fs *c, int rw)
     mutex_init(&clock->lock);
 }
 
+/* Background allocator thread: */
+
 /*
- * Background allocation thread: scans for buckets to be invalidated,
- * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
- * then optionally issues discard commands to the newly free buckets, then puts
- * them on the various freelists.
+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
+ * (marking them as invalidated on disk), then optionally issues discard
+ * commands to the newly free buckets, then puts them on the various freelists.
  */
 
-static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g)
+static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
+                                   size_t bucket)
 {
-    return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX;
+    if (expensive_debug_checks(c)) {
+        size_t iter;
+        long i;
+        unsigned j;
+
+        for (j = 0; j < RESERVE_NR; j++)
+            fifo_for_each_entry(i, &ca->free[j], iter)
+                BUG_ON(i == bucket);
+        fifo_for_each_entry(i, &ca->free_inc, iter)
+            BUG_ON(i == bucket);
+    }
+}
+
+#define BUCKET_GC_GEN_MAX   96U
+
+/**
+ * wait_buckets_available - wait on reclaimable buckets
+ *
+ * If there aren't enough available buckets to fill up free_inc, wait until
+ * there are.
+ */
+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
+{
+    unsigned long gc_count = c->gc_count;
+    int ret = 0;
+
+    while (1) {
+        set_current_state(TASK_INTERRUPTIBLE);
+        if (kthread_should_stop()) {
+            ret = -1;
+            break;
+        }
+
+        if (gc_count != c->gc_count)
+            ca->inc_gen_really_needs_gc = 0;
+
+        if ((ssize_t) (dev_buckets_available(c, ca) -
+                       ca->inc_gen_really_needs_gc) >=
+            (ssize_t) fifo_free(&ca->free_inc))
+            break;
+
+        up_read(&c->gc_lock);
+        schedule();
+        try_to_freeze();
+        down_read(&c->gc_lock);
+    }
+
+    __set_current_state(TASK_RUNNING);
+    return ret;
 }
 
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
+                                       size_t bucket,
                                        struct bucket_mark mark)
 {
+    u8 gc_gen;
+
     if (!is_available_bucket(mark))
         return false;
 
-    if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX / 2)
+    gc_gen = bucket_gc_gen(ca, bucket);
+
+    if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
         ca->inc_gen_needs_gc++;
 
-    if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX)
+    if (gc_gen >= BUCKET_GC_GEN_MAX)
         ca->inc_gen_really_needs_gc++;
 
-    return can_inc_bucket_gen(ca, g);
+    return gc_gen < BUCKET_GC_GEN_MAX;
 }
 
 static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                       struct bucket *g)
+                                       size_t bucket)
 {
     struct bucket_mark m;
 
     spin_lock(&c->freelist_lock);
-    if (!bch2_invalidate_bucket(c, ca, g, &m)) {
+    if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
         spin_unlock(&c->freelist_lock);
         return;
     }
 
-    verify_not_on_freelist(c, ca, g - ca->buckets);
-    BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+    verify_not_on_freelist(c, ca, bucket);
+    BUG_ON(!fifo_push(&ca->free_inc, bucket));
     spin_unlock(&c->freelist_lock);
 
-    g->prio[READ] = c->prio_clock[READ].hand;
-    g->prio[WRITE] = c->prio_clock[WRITE].hand;
+    /* gc lock held: */
+    bucket_io_clock_reset(c, ca, bucket, READ);
+    bucket_io_clock_reset(c, ca, bucket, WRITE);
 
     if (m.cached_sectors) {
         ca->allocator_invalidating_data = true;
@@ -640,14 +663,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
  */
 
 static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
-                                     struct bucket *g, struct bucket_mark m)
+                                     size_t b, struct bucket_mark m)
 {
     /*
     * Time since last read, scaled to [0, 8) where larger value indicates
     * more recently read data:
     */
     unsigned long hotness =
-        (g->prio[READ] - ca->min_prio[READ]) * 7 /
+        (bucket(ca, b)->prio[READ] - ca->min_prio[READ]) * 7 /
         (c->prio_clock[READ].hand - ca->min_prio[READ]);
 
     /* How much we want to keep the data in this bucket: */
@@ -655,11 +678,11 @@ static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
         (hotness + 1) * bucket_sectors_used(m);
 
     unsigned long needs_journal_commit =
-        bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+        bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
 
     return  (data_wantness << 9) |
             (needs_journal_commit << 8) |
-            bucket_gc_gen(ca, g);
+            bucket_gc_gen(ca, b);
 }
 
 static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -671,12 +694,17 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
 
 static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
+    struct bucket_array *buckets;
     struct alloc_heap_entry e;
-    struct bucket *g;
+    size_t b;
 
     ca->alloc_heap.used = 0;
 
     mutex_lock(&c->prio_clock[READ].lock);
+    down_read(&ca->bucket_lock);
+
+    buckets = bucket_array(ca);
+
     bch2_recalc_min_prio(c, ca, READ);
 
     /*
@@ -684,20 +712,23 @@ static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
     * by read priority and repeatedly replacing the maximum element until
     * all buckets have been visited.
     */
-    for_each_bucket(g, ca) {
-        struct bucket_mark m = READ_ONCE(g->mark);
+    for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
+        struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
 
-        if (!bch2_can_invalidate_bucket(ca, g, m))
+        if (!bch2_can_invalidate_bucket(ca, b, m))
             continue;
 
         e = (struct alloc_heap_entry) {
-            .bucket = g - ca->buckets,
-            .key    = bucket_sort_key(c, ca, g, m)
+            .bucket = b,
+            .key    = bucket_sort_key(c, ca, b, m)
         };
 
         heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
     }
 
+    up_read(&ca->bucket_lock);
+    mutex_unlock(&c->prio_clock[READ].lock);
+
     heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
 
     /*
@@ -706,52 +737,48 @@ static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
     */
     while (!fifo_full(&ca->free_inc) &&
            heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
-        bch2_invalidate_one_bucket(c, ca, &ca->buckets[e.bucket]);
-
-    mutex_unlock(&c->prio_clock[READ].lock);
+        bch2_invalidate_one_bucket(c, ca, e.bucket);
 }
 
 static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
 {
+    struct bucket_array *buckets = bucket_array(ca);
     struct bucket_mark m;
-    struct bucket *g;
-    size_t checked = 0;
+    size_t b, checked;
 
-    while (!fifo_full(&ca->free_inc)) {
+    for (checked = 0;
+         checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc);
+         checked++) {
        if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
            ca->fifo_last_bucket >= ca->mi.nbuckets)
            ca->fifo_last_bucket = ca->mi.first_bucket;
 
-        g = ca->buckets + ca->fifo_last_bucket++;
-        m = READ_ONCE(g->mark);
+        b = ca->fifo_last_bucket++;
 
-        if (bch2_can_invalidate_bucket(ca, g, m))
-            bch2_invalidate_one_bucket(c, ca, g);
+        m = READ_ONCE(buckets->b[b].mark);
 
-        if (++checked >= ca->mi.nbuckets)
-            return;
+        if (bch2_can_invalidate_bucket(ca, b, m))
+            bch2_invalidate_one_bucket(c, ca, b);
     }
 }
 
 static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca)
 {
+    struct bucket_array *buckets = bucket_array(ca);
     struct bucket_mark m;
-    struct bucket *g;
-    size_t checked = 0;
+    size_t checked;
 
-    while (!fifo_full(&ca->free_inc)) {
-        size_t n = bch2_rand_range(ca->mi.nbuckets -
-                                   ca->mi.first_bucket) +
+    for (checked = 0;
+         checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc);
+         checked++) {
+        size_t b = bch2_rand_range(ca->mi.nbuckets -
+                                   ca->mi.first_bucket) +
             ca->mi.first_bucket;
 
-        g = ca->buckets + n;
-        m = READ_ONCE(g->mark);
-
-        if (bch2_can_invalidate_bucket(ca, g, m))
-            bch2_invalidate_one_bucket(c, ca, g);
+        m = READ_ONCE(buckets->b[b].mark);
 
-        if (++checked >= ca->mi.nbuckets / 2)
-            return;
+        if (bch2_can_invalidate_bucket(ca, b, m))
+            bch2_invalidate_one_bucket(c, ca, b);
     }
 }
 
@@ -761,15 +788,15 @@ static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
     ca->inc_gen_really_needs_gc = 0;
 
     switch (ca->mi.replacement) {
-    case CACHE_REPLACEMENT_LRU:
-        invalidate_buckets_lru(c, ca);
-        break;
-    case CACHE_REPLACEMENT_FIFO:
-        invalidate_buckets_fifo(c, ca);
-        break;
-    case CACHE_REPLACEMENT_RANDOM:
-        invalidate_buckets_random(c, ca);
-        break;
+    case CACHE_REPLACEMENT_LRU:
+        invalidate_buckets_lru(c, ca);
+        break;
+    case CACHE_REPLACEMENT_FIFO:
+        invalidate_buckets_fifo(c, ca);
+        break;
+    case CACHE_REPLACEMENT_RANDOM:
+        invalidate_buckets_random(c, ca);
+        break;
     }
 }
 
@@ -792,8 +819,7 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
                          BTREE_ITER_INTENT);
 
     fifo_for_each_entry(b, &ca->free_inc, i) {
-        ret = __bch2_alloc_write_key(c, ca, ca->buckets + b,
-                                     &iter, journal_seq);
+        ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq);
         if (ret)
             break;
 
@@ -983,8 +1009,8 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
     struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
     spin_lock(&ob->lock);
-    bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
-                           gc_pos_alloc(c, ob), 0);
+    bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
                           false, gc_pos_alloc(c, ob), 0);
     ob->valid = false;
     spin_unlock(&ob->lock);
@@ -1028,26 +1054,35 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
  */
 static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
 {
-    struct bucket *g;
-    long r = -1;
+    struct bucket_array *buckets;
+    ssize_t b;
 
     if (!down_read_trylock(&c->gc_lock))
-        return r;
+        return -1;
 
-    if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-        goto out;
+    if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+        up_read(&c->gc_lock);
+        return -1;
+    }
 
-    for_each_bucket(g, ca)
-        if (!g->mark.touched_this_mount &&
-            is_available_bucket(g->mark) &&
-            bch2_mark_alloc_bucket_startup(c, ca, g)) {
-            r = g - ca->buckets;
-            set_bit(r, ca->bucket_dirty);
-            break;
+    spin_unlock(&c->freelist_lock);
+
+    down_read(&ca->bucket_lock);
+    buckets = bucket_array(ca);
+
+    spin_lock(&c->freelist_lock);
+
+    for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
+        if (is_startup_available_bucket(buckets->b[b].mark) &&
+            bch2_mark_alloc_bucket_startup(c, ca, b)) {
+            set_bit(b, ca->buckets_dirty);
+            goto success;
         }
-out:
+    b = -1;
+success:
+    up_read(&ca->bucket_lock);
     up_read(&c->gc_lock);
-    return r;
+    return b;
 }
 
 static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
@@ -1072,6 +1107,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                       bool may_alloc_partial,
                       struct closure *cl)
 {
+    struct bucket_array *buckets;
     struct open_bucket *ob;
     long bucket;
 
@@ -1129,22 +1165,27 @@ out:
     ob = bch2_open_bucket_alloc(c);
 
     spin_lock(&ob->lock);
+    lg_local_lock(&c->usage_lock);
+    buckets = bucket_array(ca);
+
     ob->valid        = true;
     ob->sectors_free = ca->mi.bucket_size;
     ob->ptr          = (struct bch_extent_ptr) {
-        .gen    = ca->buckets[bucket].mark.gen,
+        .gen    = buckets->b[bucket].mark.gen,
        .offset = bucket_to_sector(ca, bucket),
        .dev    = ca->dev_idx,
     };
+
+    bucket_io_clock_reset(c, ca, bucket, READ);
+    bucket_io_clock_reset(c, ca, bucket, WRITE);
+
+    lg_local_unlock(&c->usage_lock);
     spin_unlock(&ob->lock);
 
     spin_unlock(&c->freelist_lock);
 
     bch2_wake_allocator(ca);
 
-    ca->buckets[bucket].prio[READ]  = c->prio_clock[READ].hand;
-    ca->buckets[bucket].prio[WRITE] = c->prio_clock[WRITE].hand;
-
     trace_bucket_alloc(ca, reserve);
     return ob - c->open_buckets;
 }
diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h
index 8dffb8643b06..ee771ee1c09a 100644
--- a/fs/bcachefs/alloc.h
+++ b/fs/bcachefs/alloc.h
@@ -5,7 +5,6 @@
 #include "alloc_types.h"
 
 struct bkey;
-struct bucket;
 struct bch_dev;
 struct bch_fs;
 struct bch_devs_list;
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 10e67d48e35a..6b0810403336 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -5,6 +5,7 @@
 #include <linux/spinlock.h>
 
 #include "clock_types.h"
+#include "fifo.h"
 
 /* There's two of these clocks, one for reads and one for writes: */
 struct prio_clock {
@@ -40,6 +41,8 @@ enum alloc_reserve {
     RESERVE_NR      = 3,
 };
 
+typedef FIFO(long)  alloc_fifo;
+
 /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
 #define OPEN_BUCKETS_COUNT  256
 #define WRITE_POINT_COUNT   32
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 648555241aec..02e38410f5dc 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -349,6 +349,22 @@ struct bch_dev {
     /* biosets used in cloned bios for writing multiple replicas */
     struct bio_set      replica_set;
 
+    /*
+     * Buckets:
+     * Per-bucket arrays are protected by c->usage_lock, bucket_lock and
+     * gc_lock, for device resize - holding any is sufficient for access:
+     * Or rcu_read_lock(), but only for ptr_stale():
+     */
+    struct bucket_array __rcu *buckets;
+    unsigned long       *buckets_dirty;
+    /* most out of date gen in the btree */
+    u8                  *oldest_gens;
+    struct rw_semaphore bucket_lock;
+
+    struct bch_dev_usage __percpu *usage_percpu;
+    struct bch_dev_usage usage_cached;
+
+    /* Allocator: */
     struct task_struct  *alloc_thread;
 
     /*
@@ -360,8 +376,8 @@ struct bch_dev {
     * gens/prios, they'll be moved to the free list (and possibly discarded
     * in the process)
     */
-    DECLARE_FIFO(long, free)[RESERVE_NR];
-    DECLARE_FIFO(long, free_inc);
+    alloc_fifo          free[RESERVE_NR];
+    alloc_fifo          free_inc;
     spinlock_t          freelist_lock;
     unsigned            nr_invalidated;
     bool                alloc_thread_started;
@@ -371,24 +387,9 @@ struct bch_dev {
 
     size_t              fifo_last_bucket;
 
-    /* Allocation stuff: */
-
-    /* most out of date gen in the btree */
-    u8                  *oldest_gens;
-    struct bucket       *buckets;
-    unsigned long       *bucket_dirty;
-
     /* last calculated minimum prio */
     u16                 min_prio[2];
 
-    /*
-     * Bucket book keeping. The first element is updated by GC, the
-     * second contains a saved copy of the stats from the beginning
-     * of GC.
-     */
-    struct bch_dev_usage __percpu *usage_percpu;
-    struct bch_dev_usage usage_cached;
-
     atomic_long_t       saturated_count;
     size_t              inc_gen_needs_gc;
     size_t              inc_gen_really_needs_gc;
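The DECLARE_FIFO() to FIFO() split (alloc_types.h above, fifo.h further down) exists so the fifo struct can be given a name with an ordinary typedef; bch_dev then declares its freelists as plain alloc_fifo fields. Roughly, the difference is:

    /* Before: every declaration expands to its own anonymous struct: */
    DECLARE_FIFO(long, free_inc);       /* struct { ... } free_inc; */

    /* After: name the struct once via a typedef... */
    typedef FIFO(long) alloc_fifo;

    /* ...then use it like any other type, as bch_dev now does: */
    alloc_fifo free[RESERVE_NR];
    alloc_fifo free_inc;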
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index e8e4f6d537b9..7d1be86f41cb 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -167,6 +167,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 
         extent_for_each_ptr(e, ptr) {
             struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+            size_t b = PTR_BUCKET_NR(ca, ptr);
             struct bucket *g = PTR_BUCKET(ca, ptr);
 
             if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@@ -176,7 +177,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                     ptr->gen)) {
                 g->_mark.gen = ptr->gen;
                 g->_mark.gen_valid = 1;
-                set_bit(g - ca->buckets, ca->bucket_dirty);
+                set_bit(b, ca->buckets_dirty);
             }
 
             if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
@@ -185,7 +186,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                     ptr->gen, g->mark.gen)) {
                 g->_mark.gen = ptr->gen;
                 g->_mark.gen_valid = 1;
-                set_bit(g - ca->buckets, ca->bucket_dirty);
+                set_bit(b, ca->buckets_dirty);
                 set_bit(BCH_FS_FIXED_GENS, &c->flags);
             }
 
@@ -194,7 +195,6 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
         }
     }
 
-
     atomic64_set(&c->key_version,
                  max_t(u64, k.k->version.lo,
                        atomic64_read(&c->key_version)));
@@ -302,8 +302,7 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
         unsigned sectors =
             min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
 
-        bch2_mark_metadata_bucket(c, ca, ca->buckets + b,
-                                  type, sectors,
+        bch2_mark_metadata_bucket(c, ca, b, type, sectors,
                                   gc_phase(GC_PHASE_SB), flags);
         b++;
         start += sectors;
@@ -335,8 +334,7 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
     for (i = 0; i < ca->journal.nr; i++) {
         b = ca->journal.buckets[i];
-        bch2_mark_metadata_bucket(c, ca, ca->buckets + b,
-                                  BCH_DATA_JOURNAL,
+        bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
                                   ca->mi.bucket_size,
                                   gc_phase(GC_PHASE_SB), flags);
     }
@@ -397,7 +395,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 
     for_each_member_device(ca, c, ci) {
         fifo_for_each_entry(i, &ca->free_inc, iter)
-            bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
+            bch2_mark_alloc_bucket(c, ca, i, true,
                                    gc_pos_alloc(c, NULL),
                                    BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                    BCH_BUCKET_MARK_GC_LOCK_HELD);
@@ -406,7 +404,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 
         for (j = 0; j < RESERVE_NR; j++)
             fifo_for_each_entry(i, &ca->free[j], iter)
-                bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
+                bch2_mark_alloc_bucket(c, ca, i, true,
                                        gc_pos_alloc(c, NULL),
                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
@@ -421,7 +419,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
         if (ob->valid) {
             gc_pos_set(c, gc_pos_alloc(c, ob));
             ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-            bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
+            bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
                                    gc_pos_alloc(c, ob),
                                    BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                    BCH_BUCKET_MARK_GC_LOCK_HELD);
@@ -433,9 +431,10 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 static void bch2_gc_start(struct bch_fs *c)
 {
     struct bch_dev *ca;
-    struct bucket *g;
+    struct bucket_array *buckets;
     struct bucket_mark new;
     unsigned i;
+    size_t b;
     int cpu;
 
     lg_global_lock(&c->usage_lock);
@@ -467,16 +466,21 @@ static void bch2_gc_start(struct bch_fs *c)
     lg_global_unlock(&c->usage_lock);
 
     /* Clear bucket marks: */
-    for_each_member_device(ca, c, i)
-        for_each_bucket(g, ca) {
-            bucket_cmpxchg(g, new, ({
+    for_each_member_device(ca, c, i) {
+        down_read(&ca->bucket_lock);
+        buckets = bucket_array(ca);
+
+        for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
+            bucket_cmpxchg(buckets->b + b, new, ({
                 new.owned_by_allocator  = 0;
                 new.data_type           = 0;
                 new.cached_sectors      = 0;
                 new.dirty_sectors       = 0;
             }));
-            ca->oldest_gens[g - ca->buckets] = new.gen;
+            ca->oldest_gens[b] = new.gen;
         }
+        up_read(&ca->bucket_lock);
+    }
 }
 
 /**
@@ -1020,7 +1024,7 @@ err:
     return bch2_btree_iter_unlock(&iter) ?: ret;
 }
 
-int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
+static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
 {
     unsigned iter = 0;
     enum btree_id id;
@@ -1044,7 +1048,7 @@ again:
 
     ret = bch2_journal_mark(c, journal);
     if (ret)
-    return ret;
+        return ret;
 
     if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
         if (iter++ > 2) {
@@ -1071,3 +1075,14 @@ again:
 
     return 0;
 }
+
+int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
+{
+    int ret;
+
+    down_write(&c->gc_lock);
+    ret = __bch2_initial_gc(c, journal);
+    up_write(&c->gc_lock);
+
+    return ret;
+}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 8899e3c6e284..58497d12414a 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -147,12 +147,16 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
 {
     u16 last_seq_ondisk = c->journal.last_seq_ondisk;
     struct bch_dev *ca;
+    struct bucket_array *buckets;
     struct bucket *g;
     struct bucket_mark m;
     unsigned i;
 
-    for_each_member_device(ca, c, i)
-        for_each_bucket(g, ca) {
+    for_each_member_device(ca, c, i) {
+        down_read(&ca->bucket_lock);
+        buckets = bucket_array(ca);
+
+        for_each_bucket(g, buckets) {
             bucket_cmpxchg(g, m, ({
                 if (!m.journal_seq_valid ||
                     bucket_needs_journal_commit(m, last_seq_ondisk))
@@ -161,6 +165,8 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
                 m.journal_seq_valid = 0;
             }));
         }
+        up_read(&ca->bucket_lock);
+    }
 }
 
 #define bch2_usage_add(_acc, _stats)                    \
@@ -319,20 +325,17 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
-                                  struct bucket *g, struct bucket_mark old,
-                                  struct bucket_mark new)
+                                  struct bucket_mark old, struct bucket_mark new)
 {
     struct bch_dev_usage *dev_usage;
 
-    BUG_ON((g - ca->buckets) < ca->mi.first_bucket ||
-           (g - ca->buckets) >= ca->mi.nbuckets);
+    lockdep_assert_held(&c->usage_lock);
 
     bch2_fs_inconsistent_on(old.data_type && new.data_type &&
                             old.data_type != new.data_type, c,
         "different types of data in same bucket: %u, %u",
        old.data_type, new.data_type);
 
-    preempt_disable();
     dev_usage = this_cpu_ptr(ca->usage_percpu);
 
     dev_usage->buckets[bucket_type(old)]--;
@@ -347,7 +350,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
     dev_usage->sectors[new.data_type] += new.dirty_sectors;
     dev_usage->sectors[BCH_DATA_CACHED] +=
        (int) new.cached_sectors - (int) old.cached_sectors;
-    preempt_enable();
 
     if (!is_available_bucket(old) && is_available_bucket(new))
         bch2_wake_allocator(ca);
@@ -359,16 +361,19 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
({                                                              \
    struct bucket_mark _old = bucket_cmpxchg(g, new, expr);     \
                                                                \
-   bch2_dev_usage_update(c, ca, g, _old, new);                 \
+   bch2_dev_usage_update(c, ca, _old, new);                    \
    _old;                                                       \
})
 
 bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
-                            struct bucket *g, struct bucket_mark *old)
+                            size_t b, struct bucket_mark *old)
 {
+    struct bucket *g;
     struct bucket_mark new;
 
     lg_local_lock(&c->usage_lock);
+    g = bucket(ca, b);
+
     *old = bucket_data_cmpxchg(c, ca, g, new, ({
         if (!is_available_bucket(new)) {
             lg_local_unlock(&c->usage_lock);
@@ -385,20 +390,22 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
     lg_local_unlock(&c->usage_lock);
 
     if (!old->owned_by_allocator && old->cached_sectors)
-        trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
+        trace_invalidate(ca, bucket_to_sector(ca, b),
                         old->cached_sectors);
     return true;
 }
 
 bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
-                                    struct bucket *g)
+                                    size_t b)
 {
+    struct bucket *g;
     struct bucket_mark new, old;
 
     lg_local_lock(&c->usage_lock);
+    g = bucket(ca, b);
+
     old = bucket_data_cmpxchg(c, ca, g, new, ({
-        if (new.touched_this_mount ||
-            !is_available_bucket(new)) {
+        if (!is_startup_available_bucket(new)) {
             lg_local_unlock(&c->usage_lock);
             return false;
         }
@@ -412,12 +419,15 @@ bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
 }
 
 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-                            struct bucket *g, bool owned_by_allocator,
+                            size_t b, bool owned_by_allocator,
                             struct gc_pos pos, unsigned flags)
 {
+    struct bucket *g;
     struct bucket_mark old, new;
 
     lg_local_lock(&c->usage_lock);
+    g = bucket(ca, b);
+
     if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
         gc_will_visit(c, pos)) {
         lg_local_unlock(&c->usage_lock);
@@ -448,15 +458,18 @@ do {                                                            \
 } while (0)
 
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
-                               struct bucket *g, enum bch_data_type type,
+                               size_t b, enum bch_data_type type,
                                unsigned sectors, struct gc_pos pos,
                                unsigned flags)
 {
+    struct bucket *g;
     struct bucket_mark old, new;
 
     BUG_ON(!type);
 
     lg_local_lock(&c->usage_lock);
+    g = bucket(ca, b);
+
     if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
         gc_will_visit(c, pos)) {
         lg_local_unlock(&c->usage_lock);
@@ -502,7 +515,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
     struct bucket_mark old, new;
     unsigned saturated;
     struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-    struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
+    struct bucket *g = PTR_BUCKET(ca, ptr);
     enum bch_data_type data_type = type == S_META ?
        BCH_DATA_BTREE : BCH_DATA_USER;
     u64 v;
@@ -584,7 +597,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
                   old.counter,
                   new.counter)) != old.counter);
 
-    bch2_dev_usage_update(c, ca, g, old, new);
+    bch2_dev_usage_update(c, ca, old, new);
 
     BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
            bucket_became_unavailable(c, old, new));
@@ -810,3 +823,38 @@ int bch2_disk_reservation_get(struct bch_fs *c,
 
     return bch2_disk_reservation_add(c, res, sectors, flags);
 }
+
+void bch2_dev_buckets_free(struct bch_dev *ca)
+{
+    free_percpu(ca->usage_percpu);
+    kvpfree(ca->buckets_dirty,
+            BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+    kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
+    kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
+}
+
+int bch2_dev_buckets_alloc(struct bch_dev *ca)
+{
+    struct bucket_array *buckets;
+
+    buckets = kvpmalloc(sizeof(struct bucket_array) +
+                        ca->mi.nbuckets * sizeof(struct bucket),
+                        GFP_KERNEL|__GFP_ZERO);
+    if (!buckets)
+        return -ENOMEM;
+
+    buckets->first_bucket   = ca->mi.first_bucket;
+    buckets->nbuckets       = ca->mi.nbuckets;
+    rcu_assign_pointer(ca->buckets, buckets);
+
+    if (!(ca->oldest_gens   = kvpmalloc(ca->mi.nbuckets * sizeof(u8),
+                                        GFP_KERNEL|__GFP_ZERO)) ||
+        !(ca->buckets_dirty = kvpmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
+                                        sizeof(unsigned long),
+                                        GFP_KERNEL|__GFP_ZERO)) ||
+        !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
+        return -ENOMEM;
+
+    return 0;
+}
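Every mark update in buckets.c above now follows the same shape: take the (local) usage lock first, and only then turn the bucket index into a pointer, so the pointer can never outlive the array it points into. Condensed (example_mark_update is hypothetical; compare bch2_invalidate_bucket(), bch2_mark_alloc_bucket() and bch2_mark_metadata_bucket()):

    static void example_mark_update(struct bch_fs *c, struct bch_dev *ca,
                                    size_t b)
    {
        struct bucket *g;
        struct bucket_mark new;

        lg_local_lock(&c->usage_lock);  /* pins the bucket array */
        g = bucket(ca, b);              /* index -> pointer, under the lock */

        bucket_data_cmpxchg(c, ca, g, new, ({
            new.owned_by_allocator = 1; /* whatever the caller changes */
        }));

        lg_local_unlock(&c->usage_lock);
    }

This is also why bch2_dev_usage_update() can drop its preempt_disable()/preempt_enable() pair in favor of lockdep_assert_held(&c->usage_lock): the lg-lock is a per-cpu spinlock, so it already keeps the updater on one CPU for the this_cpu_ptr() accounting.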
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index d0a9ec08d8e8..f067d1b01ad4 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -10,9 +10,9 @@
 #include "buckets_types.h"
 #include "super.h"
 
-#define for_each_bucket(b, ca)                                  \
-    for (b = (ca)->buckets + (ca)->mi.first_bucket;             \
-         b < (ca)->buckets + (ca)->mi.nbuckets; b++)
+#define for_each_bucket(_b, _buckets)                           \
+    for (_b = (_buckets)->b + (_buckets)->first_bucket;         \
+         _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
 
 #define bucket_cmpxchg(g, new, expr)                            \
 ({                                                              \
@@ -28,15 +28,36 @@
     _old;                                                       \
 })
 
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+{
+    return rcu_dereference_check(ca->buckets,
+                                 lockdep_is_held(&ca->fs->usage_lock) ||
+                                 lockdep_is_held(&ca->fs->gc_lock) ||
+                                 lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+{
+    struct bucket_array *buckets = bucket_array(ca);
+
+    BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+    return buckets->b + b;
+}
+
+static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
+                                         size_t b, int rw)
+{
+    bucket(ca, b)->prio[rw] = c->prio_clock[rw].hand;
+}
+
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree.
  */
-static inline u8 bucket_gc_gen(struct bch_dev *ca, struct bucket *g)
+static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
 {
-    unsigned long r = g - ca->buckets;
-    return g->mark.gen - ca->oldest_gens[r];
+    return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
 }
 
 static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -45,10 +66,22 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
     return sector_to_bucket(ca, ptr->offset);
 }
 
-static inline struct bucket *PTR_BUCKET(const struct bch_dev *ca,
+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
                                         const struct bch_extent_ptr *ptr)
 {
-    return ca->buckets + PTR_BUCKET_NR(ca, ptr);
+    return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
+                                                 const struct bch_extent_ptr *ptr)
+{
+    struct bucket_mark m;
+
+    rcu_read_lock();
+    m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark);
+    rcu_read_unlock();
+
+    return m;
 }
 
 static inline int gen_cmp(u8 a, u8 b)
@@ -67,10 +100,10 @@ static inline int gen_after(u8 a, u8 b)
  * ptr_stale() - check if a pointer points into a bucket that has been
  * invalidated.
  */
-static inline u8 ptr_stale(const struct bch_dev *ca,
+static inline u8 ptr_stale(struct bch_dev *ca,
                            const struct bch_extent_ptr *ptr)
 {
-    return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen);
+    return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
 }
 
 /* bucket gc marks */
@@ -159,6 +192,11 @@ static inline bool is_available_bucket(struct bucket_mark mark)
         !mark.nouse);
 }
 
+static inline bool is_startup_available_bucket(struct bucket_mark mark)
+{
+    return !mark.touched_this_mount && is_available_bucket(mark);
+}
+
 static inline bool bucket_needs_journal_commit(struct bucket_mark m,
                                                u16 last_seq_ondisk)
 {
@@ -169,15 +207,14 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 void bch2_bucket_seq_cleanup(struct bch_fs *);
 
 bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
-                            struct bucket *, struct bucket_mark *);
+                            size_t, struct bucket_mark *);
 bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *,
-                                    struct bucket *);
+                                    size_t);
 void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
-                            struct bucket *, bool,
-                            struct gc_pos, unsigned);
+                            size_t, bool, struct gc_pos, unsigned);
 void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
-                               struct bucket *, enum bch_data_type,
-                               unsigned, struct gc_pos, unsigned);
+                               size_t, enum bch_data_type, unsigned,
+                               struct gc_pos, unsigned);
 
 #define BCH_BUCKET_MARK_NOATOMIC                (1 << 0)
 #define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE    (1 << 1)
@@ -210,4 +247,7 @@ int bch2_disk_reservation_get(struct bch_fs *,
                               struct disk_reservation *,
                               unsigned, int);
 
+void bch2_dev_buckets_free(struct bch_dev *);
+int bch2_dev_buckets_alloc(struct bch_dev *);
+
 #endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 8a3c8c304beb..eaf4a3863605 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -40,6 +40,13 @@ struct bucket {
     };
 };
 
+struct bucket_array {
+    struct rcu_head     rcu;
+    size_t              first_bucket;
+    size_t              nbuckets;
+    struct bucket       b[];
+};
+
 struct bch_dev_usage {
     u64                 buckets[BCH_DATA_NR];
     u64                 buckets_alloc;
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index e055ee93f586..2b4a2dc2339e 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -636,14 +636,13 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
     unsigned seq;
     const char *err;
     char buf[160];
-    struct bucket *g;
+    struct bucket_mark mark;
     struct bch_dev *ca;
     unsigned replicas = 0;
     bool bad;
 
     extent_for_each_ptr(e, ptr) {
         ca = bch_dev_bkey_exists(c, ptr->dev);
-        g = PTR_BUCKET(ca, ptr);
         replicas++;
 
         if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
@@ -655,9 +654,11 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 
         do {
             seq = read_seqcount_begin(&c->gc_pos_lock);
+            mark = ptr_bucket_mark(ca, ptr);
+
             bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
-                (g->mark.data_type != BCH_DATA_BTREE ||
-                 g->mark.dirty_sectors < c->opts.btree_node_size);
+                (mark.data_type != BCH_DATA_BTREE ||
+                 mark.dirty_sectors < c->opts.btree_node_size);
         } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
         err = "inconsistent";
@@ -678,11 +679,9 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 err:
     bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
     bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
-                "gen %i last_gc %i mark %08x",
+                "gen %i mark %08x",
                err, buf, PTR_BUCKET_NR(ca, ptr),
-                PTR_BUCKET(ca, ptr)->mark.gen,
-                ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
-                (unsigned) g->mark.counter);
+                mark.gen, (unsigned) mark.counter);
 }
 
 static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
@@ -1732,7 +1731,6 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 {
     const struct bch_extent_ptr *ptr;
     struct bch_dev *ca;
-    struct bucket *g;
     struct bucket_mark mark;
     unsigned seq, stale;
     char buf[160];
@@ -1753,7 +1751,6 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 
     extent_for_each_ptr(e, ptr) {
         ca = bch_dev_bkey_exists(c, ptr->dev);
-        g = PTR_BUCKET(ca, ptr);
         replicas++;
         ptrs_per_tier[ca->mi.tier]++;
 
@@ -1768,7 +1765,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 
         do {
             seq = read_seqcount_begin(&c->gc_pos_lock);
-            mark = READ_ONCE(g->mark);
+            mark = ptr_bucket_mark(ca, ptr);
 
             /* between mark and bucket gen */
             smp_rmb();
@@ -1821,10 +1818,8 @@ bad_ptr:
     bch2_bkey_val_to_text(c, btree_node_type(b), buf,
                          sizeof(buf), e.s_c);
     bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
-                "gen %i last_gc %i type %u",
-                buf, PTR_BUCKET_NR(ca, ptr), mark.gen,
-                ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
-                mark.data_type);
+                "gen %i type %u", buf,
+                PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
     return;
 }
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index 0a9c0c9f19a8..98f22f6a58e4 100644
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
@@ -3,11 +3,13 @@
 
 #include "util.h"
 
-#define DECLARE_FIFO(type, name)                                \
-    struct {                                                    \
-        size_t front, back, size, mask;                         \
-        type *data;                                             \
-    } name
+#define FIFO(type)                                              \
+struct {                                                        \
+    size_t front, back, size, mask;                             \
+    type *data;                                                 \
+}
+
+#define DECLARE_FIFO(type, name)    FIFO(type) name
 
 #define fifo_buf_size(fifo)                                     \
    (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 26aa5e114d9f..e045eb20d686 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -1304,7 +1304,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
     struct bpos pos = bkey_start_pos(e.k);
     int ret = 0;
 
-    PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand;
+    lg_local_lock(&c->usage_lock);
+    bucket_io_clock_reset(c, pick->ca,
+                          PTR_BUCKET_NR(pick->ca, &pick->ptr), READ);
+    lg_local_unlock(&c->usage_lock);
 
     narrow_crcs = should_narrow_crcs(e, pick, flags);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 41390df1c836..829e06482882 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1629,8 +1629,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
         ja->nr++;
         spin_unlock(&j->lock);
 
-        bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket],
-                                  BCH_DATA_JOURNAL,
+        bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
                                   ca->mi.bucket_size,
                                   gc_phase(GC_PHASE_SB), 0);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index c6a9ac24c785..90eb4ca2959f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -99,10 +99,11 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
 {
     copygc_heap *h = &ca->copygc_heap;
     struct copygc_heap_entry e, *i;
-    struct bucket *g;
+    struct bucket_array *buckets;
     u64 keys_moved, sectors_moved;
     u64 sectors_to_move = 0, sectors_not_moved = 0;
     u64 buckets_to_move, buckets_not_moved = 0;
+    size_t b;
     int ret;
 
     closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
@@ -113,15 +114,18 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
     * and repeatedly replacing the maximum element until all
     * buckets have been visited.
     */
+    h->used = 0;
 
    /*
     * We need bucket marks to be up to date - gc can't be recalculating
     * them:
     */
     down_read(&c->gc_lock);
-    h->used = 0;
-    for_each_bucket(g, ca) {
-        struct bucket_mark m = READ_ONCE(g->mark);
+    down_read(&ca->bucket_lock);
+    buckets = bucket_array(ca);
+
+    for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
+        struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
         struct copygc_heap_entry e;
 
         if (m.owned_by_allocator ||
@@ -131,11 +135,12 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
             continue;
 
         e = (struct copygc_heap_entry) {
-            .offset = bucket_to_sector(ca, g - ca->buckets),
+            .offset = bucket_to_sector(ca, b),
            .mark   = m
         };
         heap_add_or_replace(h, e, -sectors_used_cmp);
     }
+    up_read(&ca->bucket_lock);
     up_read(&c->gc_lock);
 
     for (i = h->data; i < h->data + h->used; i++)
@@ -165,15 +170,18 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
                     &keys_moved,
                     &sectors_moved);
 
+    down_read(&ca->bucket_lock);
+    buckets = bucket_array(ca);
     for (i = h->data; i < h->data + h->used; i++) {
-        size_t bucket = sector_to_bucket(ca, i->offset);
-        struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark);
+        size_t b = sector_to_bucket(ca, i->offset);
+        struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
 
         if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
             sectors_not_moved += bucket_sectors_used(m);
             buckets_not_moved++;
         }
     }
+    up_read(&ca->bucket_lock);
 
     if (sectors_not_moved && !ret)
         bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 527773dcf380..2fea8787d9df 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -969,10 +969,7 @@ static void bch2_dev_free(struct bch_dev *ca)
     free_percpu(ca->io_done);
     bioset_exit(&ca->replica_set);
-    free_percpu(ca->usage_percpu);
-    kvpfree(ca->bucket_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
-    kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
-    kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
+    bch2_dev_buckets_free(ca);
     free_heap(&ca->copygc_heap);
     free_heap(&ca->alloc_heap);
     free_fifo(&ca->free_inc);
@@ -1076,6 +1073,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
     ca->dev_idx = dev_idx;
     __set_bit(ca->dev_idx, ca->self.d);
 
+    init_rwsem(&ca->bucket_lock);
+
     writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
 
     spin_lock_init(&ca->freelist_lock);
@@ -1111,6 +1110,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
                         0, GFP_KERNEL) ||
         percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
                         PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+        bch2_dev_buckets_alloc(ca) ||
         !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
                   GFP_KERNEL) ||
         !init_fifo(&ca->free[RESERVE_MOVINGGC],
@@ -1119,16 +1119,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
         !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
         !init_heap(&ca->alloc_heap, free_inc_reserve, GFP_KERNEL) ||
         !init_heap(&ca->copygc_heap,heap_size, GFP_KERNEL) ||
-        !(ca->oldest_gens   = kvpmalloc(ca->mi.nbuckets *
-                                        sizeof(u8),
-                                        GFP_KERNEL|__GFP_ZERO)) ||
-        !(ca->buckets       = kvpmalloc(ca->mi.nbuckets *
-                                        sizeof(struct bucket),
-                                        GFP_KERNEL|__GFP_ZERO)) ||
-        !(ca->bucket_dirty  = kvpmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
-                                        sizeof(unsigned long),
-                                        GFP_KERNEL|__GFP_ZERO)) ||
-        !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
         bioset_init(&ca->replica_set, 4,
                    offsetof(struct bch_write_bio, bio), 0) ||
         !(ca->io_done       = alloc_percpu(*ca->io_done)))
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 11112dc3a1ac..dc70fb0cdd67 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -606,26 +606,28 @@ struct attribute *bch2_fs_time_stats_files[] = {
     NULL
 };
 
-typedef unsigned (bucket_map_fn)(struct bch_dev *, struct bucket *, void *);
+typedef unsigned (bucket_map_fn)(struct bch_dev *, size_t, void *);
 
-static unsigned bucket_priority_fn(struct bch_dev *ca, struct bucket *g,
+static unsigned bucket_priority_fn(struct bch_dev *ca, size_t b,
                                    void *private)
 {
+    struct bucket *g = bucket(ca, b);
     int rw = (private ? 1 : 0);
 
     return ca->fs->prio_clock[rw].hand - g->prio[rw];
 }
 
-static unsigned bucket_sectors_used_fn(struct bch_dev *ca, struct bucket *g,
+static unsigned bucket_sectors_used_fn(struct bch_dev *ca, size_t b,
                                        void *private)
 {
+    struct bucket *g = bucket(ca, b);
+
     return bucket_sectors_used(g->mark);
 }
 
-static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, struct bucket *g,
+static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, size_t b,
                                      void *private)
 {
-    return bucket_gc_gen(ca, g);
+    return bucket_gc_gen(ca, b);
 }
 
 static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
@@ -634,19 +636,25 @@ static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
     int cmp(const void *l, const void *r)
     {   return *((unsigned *) r) - *((unsigned *) l); }
 
-    size_t n = ca->mi.nbuckets, i;
+    size_t i, n;
     /* Compute 31 quantiles */
     unsigned q[31], *p;
     ssize_t ret = 0;
 
-    p = vzalloc(ca->mi.nbuckets * sizeof(unsigned));
-    if (!p)
+    down_read(&ca->bucket_lock);
+    n = ca->mi.nbuckets;
+
+    p = vzalloc(n * sizeof(unsigned));
+    if (!p) {
+        up_read(&ca->bucket_lock);
         return -ENOMEM;
+    }
 
     for (i = ca->mi.first_bucket; i < n; i++)
-        p[i] = fn(ca, &ca->buckets[i], private);
+        p[i] = fn(ca, n, private);
 
     sort(p, n, sizeof(unsigned), cmp, NULL);
+    up_read(&ca->bucket_lock);
 
     while (n && !p[n - 1])
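One access deliberately stays lock-free: ptr_stale(), which now goes through ptr_bucket_mark() and copies the whole mark under rcu_read_lock() instead of holding a bucket pointer across the check. Staleness itself is 8-bit generation arithmetic; assuming the usual signed-difference definitions of gen_cmp()/gen_after() (their bodies are not shown in this diff), it is wraparound-safe:

    /*
     * Sketch of the gen comparison ptr_stale() relies on:
     *
     *   bucket gen 0x02, ptr gen 0xfe -> (s8)(0x02 - 0xfe) =  4 -> stale by 4
     *   bucket gen 0xfe, ptr gen 0x02 -> (s8)(0xfe - 0x02) = -4 -> not stale
     */
    static u8 example_staleness(u8 bucket_gen, u8 ptr_gen)
    {
        s8 d = (s8) (bucket_gen - ptr_gen); /* mirrors gen_cmp() */

        return d > 0 ? d : 0;               /* mirrors gen_after() */
    }

This is the case the bcachefs.h comment calls out: rcu_read_lock() alone is a valid way to reach the bucket array, but only for ptr_stale().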