From 1f383e88038c2038fd972e9da70190c602dd6840 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 20 Dec 2023 00:04:57 -0500 Subject: bcachefs: bucket_capacity() On zoned devices, zone capacity is variable. This patch implements a new data structure (eytzinger search tree) for getting a bucket's capacity. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 ++- fs/bcachefs/alloc_types.h | 1 + fs/bcachefs/bcachefs.h | 15 ++++++- fs/bcachefs/buckets.c | 9 +++-- fs/bcachefs/buckets.h | 8 ---- fs/bcachefs/ec.c | 5 +-- fs/bcachefs/extents.c | 14 +++---- fs/bcachefs/journal.c | 3 +- fs/bcachefs/journal_io.c | 53 +++++++++---------------- fs/bcachefs/super.c | 17 ++------ fs/bcachefs/zone.c | 90 +++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/zone.h | 40 ++++++++++++++++++- 12 files changed, 183 insertions(+), 78 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a1bfa9eaee42..64100f60af35 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -30,6 +30,7 @@ #include "movinggc.h" #include "nocow_locking.h" #include "trace.h" +#include "zone.h" #include #include @@ -254,7 +255,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_lock(&ob->lock); ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; + ob->bucket_size = bucket_capacity(ca, bucket); + ob->sectors_free = ob->bucket_size; ob->dev = ca->dev_idx; ob->gen = a->gen; ob->bucket = bucket; @@ -1497,7 +1499,7 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) .gen = ob->gen, .dev = ob->dev, .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - + ob->bucket_size - ob->sectors_free, }; } diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index b91b7a461056..57c4ff50926c 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -61,6 +61,7 @@ struct open_bucket { u8 dev; u8 gen; + u32 bucket_size; u32 
sectors_free; u64 bucket; struct ec_stripe_new *ec; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index afa86dd7ff1a..f44346c90f76 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -530,6 +530,15 @@ struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct bucket_capacities { /* table of per-zone capacities, one entry per run of equal-capacity zones */ + u32 nr, size; /* nr: entries in use in d[]; size: entries allocated for d[] */ + + struct bucket_capacity { + u32 start; /* index of the first zone (bucket) in this run */ + u32 sectors; /* zone capacity reported for this run */ + } *d; +}; + struct bch_dev { struct kobject kobj; struct percpu_ref ref; @@ -540,6 +549,8 @@ struct bch_dev { struct bch_fs *fs; u8 dev_idx; + __uuid_t uuid; + char name[BDEVNAME_SIZE]; /* * Cached version of this device's member info from superblock * Committed by bch2_write_super() -> bch_fs_mi_update() @@ -547,8 +558,8 @@ struct bch_dev { struct bch_member_cpu mi; atomic64_t errors[BCH_MEMBER_ERROR_NR]; - __uuid_t uuid; - char name[BDEVNAME_SIZE]; + struct bucket_capacities buckets; + u64 capacity; struct bch_sb_handle disk_sb; struct bch_sb *sb_read_scratch; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c0dac04253f7..73a96771316d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -22,6 +22,7 @@ #include "replicas.h" #include "subvolume.h" #include "trace.h" +#include "zone.h" #include @@ -66,9 +67,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c) for_each_member_device(c, ca) { struct bch_dev_usage dev = bch2_dev_usage_read(ca); - usage->hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * - ca->mi.bucket_size; + usage->hidden += dev.d[BCH_DATA_sb].sectors; + usage->hidden += dev.d[BCH_DATA_journal].sectors; } percpu_up_write(&c->mark_lock); @@ -1743,7 +1743,8 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, for (i = 0; i < ca->journal.nr; i++) { ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size); + BCH_DATA_journal, + bucket_capacity(ca, ca->journal.buckets[i])); if (ret) return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h 
index 379101d7e585..0bd2e32d661b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -22,14 +22,6 @@ static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) return ((sector_t) b) * ca->mi.bucket_size; } -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - u32 remainder; - - div_u64_rem(s, ca->mi.bucket_size, &remainder); - return remainder; -} - static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index e89185a28e08..c08038b59a1c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1216,16 +1216,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) { struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct bch_dev *ca; - unsigned offset; if (!ob) return NULL; BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); - ca = bch_dev_bkey_exists(c, ob->dev); - offset = ca->mi.bucket_size - ob->sectors_free; + unsigned offset = ob->bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 648f1daffb3b..699cc61e70b2 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -25,6 +25,7 @@ #include "super-io.h" #include "trace.h" #include "util.h" +#include "zone.h" static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, @@ -1063,8 +1064,6 @@ static int extent_ptr_invalid(struct bch_fs *c, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - u64 bucket; - u32 bucket_offset; struct bch_dev *ca; int ret = 0; @@ -1087,15 +1086,16 @@ static int extent_ptr_invalid(struct bch_fs *c, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + u32 bucket_offset; + u64 bucket = 
sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + bkey_fsck_err_on(bucket < ca->mi.first_bucket, c, err, + ptr_before_first_bucket, + "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); - bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, - ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_capacity(ca, bucket), c, err, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, ca->mi.bucket_size); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8538ef34f62b..d66e0ed47209 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -19,6 +19,7 @@ #include "journal_sb.h" #include "journal_seq_blacklist.h" #include "trace.h" +#include "zone.h" static const char * const bch2_journal_errors[] = { #define x(n) #n, @@ -852,7 +853,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size)); + bucket_capacity(ca, ob[nr_got]->bucket))); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); bch_err_msg(c, ret, "marking new journal buckets"); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c5bc58247146..b0ce77655596 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -16,6 +16,7 @@ #include "replicas.h" #include "sb-clean.h" #include "trace.h" +#include "zone.h" static struct nonce journal_nonce(const struct jset *jset) { @@ -932,7 +933,7 @@ static int journal_read_bucket(struct bch_dev *ca, struct jset *j = NULL; unsigned sectors, sectors_read 
= 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), - end = offset + ca->mi.bucket_size; + end = offset + bucket_capacity(ca, ja->buckets[bucket]); bool saw_bad = false, csum_good; int ret = 0; @@ -1061,7 +1062,8 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) struct journal_replay *r, **_r; struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; - unsigned i; + u64 cur_bucket; + unsigned i, wrote = 0, cur_bucket_capacity; int ret = 0; if (!ja->nr) @@ -1079,7 +1081,8 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) goto err; } - ja->sectors_free = ca->mi.bucket_size; + cur_bucket = ja->buckets[ja->cur_idx]; + cur_bucket_capacity = bucket_capacity(ca, cur_bucket); mutex_lock(&jlist->lock); genradix_for_each_reverse(&c->journal_entries, iter, _r) { @@ -1089,12 +1092,14 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) continue; for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + - vstruct_sectors(&r->j, c->block_bits); + if (r->ptrs[i].dev == ca->dev_idx && + sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { + wrote = max_t(u64, wrote, r->ptrs[i].sector - + bucket_to_sector(ca, cur_bucket) + + vstruct_sectors(&r->j, c->block_bits)); ja->cur_idx = r->ptrs[i].bucket; - ja->sectors_free = ca->mi.bucket_size - wrote; + ja->sectors_free = cur_bucket_capacity - wrote; goto found; } } @@ -1102,24 +1107,8 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) found: mutex_unlock(&jlist->lock); - if (ja->bucket_seq[ja->cur_idx] && - ja->sectors_free == ca->mi.bucket_size) { -#if 0 - /* - * Debug code for ZNS support, where we (probably) want to be - * correlated where we stopped in the journal to the zone write - * points: - */ - bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); - bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); - for (i = 0; i < 3; i++) { - unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; - - bch_err(c, 
"bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); - } -#endif - ja->sectors_free = 0; - } + BUG_ON(!wrote); + ja->sectors_free = cur_bucket_capacity - min(wrote, cur_bucket_capacity); /* * Set dirty_idx to indicate the entire journal is full and needs to be @@ -1147,11 +1136,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, unsigned i; for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); - u64 offset; - - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); - if (i) prt_printf(out, " "); prt_printf(out, "%u:%u:%u (sector %llu)", @@ -1401,6 +1385,7 @@ static void __journal_write_alloc(struct journal *j, struct journal_device *ja; struct bch_dev *ca; unsigned i; + u64 b; if (*replicas >= replicas_want) return; @@ -1424,12 +1409,12 @@ static void __journal_write_alloc(struct journal *j, continue; bch2_dev_stripe_increment(ca, &j->wp.stripe); + b = ja->buckets[ja->cur_idx]; bch2_bkey_append_ptr(&w->key, (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]) + - ca->mi.bucket_size - + .offset = bucket_to_sector(ca, b) + + bucket_capacity(ca, b) - ja->sectors_free, .dev = ca->dev_idx, }); @@ -1489,7 +1474,7 @@ retry: bch2_journal_dev_buckets_available(j, ja, journal_space_discarded)) { ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; + ja->sectors_free = bucket_capacity(ca, ja->buckets[ja->cur_idx]); /* * ja->bucket_seq[ja->cur_idx] must always have diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f27b93d0f56b..b600f6db3afc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1145,6 +1145,7 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + bch2_dev_zones_exit(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); @@ -1330,19 +1331,9 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) return 
-BCH_ERR_device_size_too_small;
 	}
 
-	ca->zoned = bdev_nr_zones(sb->bdev) != 0;
-	if (ca->zoned) {
-		struct blk_zone zone;
-
-		ret = bch2_zone_report(sb->bdev, 0, &zone);
-		if (ret)
-			return ret;
-
-		if (zone.len != ca->mi.bucket_size) {
-			bch_err(ca, "zone size doesn't match bucket size");
-			return -EINVAL;
-		}
-	}
+	ret = bch2_dev_zones_init(ca, sb);
+	if (ret)
+		return ret;
 
 	ret = bch2_dev_journal_init(ca, sb->sb);
 	if (ret)
diff --git a/fs/bcachefs/zone.c b/fs/bcachefs/zone.c
index b6ad8c9daaea..6f48f58a6c50 100644
--- a/fs/bcachefs/zone.c
+++ b/fs/bcachefs/zone.c
@@ -2,11 +2,12 @@
 
 #include "bcachefs.h"
 #include "buckets.h"
+#include "eytzinger.h"
 #include "zone.h"
 
 #include
 
-static int zone_report_cb(struct blk_zone *src, unsigned int idx, void *data)
+static int zone_report_cb(struct blk_zone *src, unsigned idx, void *data)
 {
 	struct blk_zone *dst = data;
 
@@ -14,7 +15,7 @@ static int zone_report_cb(struct blk_zone *src, unsigned int idx, void *data)
 	return 0;
 }
 
-int bch2_zone_report(struct block_device *bdev, sector_t sector, struct blk_zone *zone)
+static int bch2_zone_report(struct block_device *bdev, sector_t sector, struct blk_zone *zone)
 {
 	int ret = blkdev_report_zones(bdev, sector, 1, zone_report_cb, zone);
 
@@ -53,3 +54,104 @@ void bch2_bucket_finish(struct bch_dev *ca, u64 b)
 			       bucket_to_sector(ca, b),
 			       ca->mi.bucket_size, GFP_KERNEL);
 }
+
+/* Free the zone capacity table built by bch2_dev_zones_init(). */
+void bch2_dev_zones_exit(struct bch_dev *ca)
+{
+	kfree(ca->buckets.d);
+}
+
+/*
+ * blkdev_report_zones() callback: append a (first zone index, capacity)
+ * entry whenever a zone's capacity differs from the previous entry's, so
+ * each run of equal-capacity zones takes a single table slot.
+ */
+static int zone_report_capacity(struct blk_zone *src, unsigned idx, void *data)
+{
+	struct bucket_capacities *b = data;
+
+	if (b->nr &&
+	    b->d[b->nr - 1].sectors == src->capacity)
+		return 0;
+
+	if (b->nr == b->size) {
+		/* Start at 8 entries, then double; min() would never grow from 0 */
+		size_t new_size = max(b->size * 2, 8U);
+		struct bucket_capacity *d =
+			krealloc_array(b->d, new_size, sizeof(*d), GFP_KERNEL);
+		if (!d)
+			return -ENOMEM;
+
+		b->d = d;
+		b->size = new_size;
+	}
+
+	b->d[b->nr++] = (struct bucket_capacity) {
+		.start		= idx,
+		.sectors	= src->capacity,
+	};
+
+	return 0;
+}
+
+/*
+ * Detect whether the device is zoned and, if so, build the per-bucket
+ * capacity table and the total device capacity from the zone report.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_dev_zones_init(struct bch_dev *ca, struct bch_sb_handle *sb)
+{
+	struct bucket_capacities *b = &ca->buckets;
+	struct blk_zone zone;
+	unsigned i;
+	int ret;
+
+	ca->zoned = bdev_nr_zones(sb->bdev) != 0;
+	if (!ca->zoned) {
+		ca->capacity = ca->mi.bucket_size * ca->mi.nbuckets;
+		return 0;
+	}
+
+	ret = bch2_zone_report(sb->bdev, 0, &zone);
+	if (ret)
+		return ret;
+
+	if (zone.len != ca->mi.bucket_size) {
+		bch_err(ca, "zone size doesn't match bucket size");
+		return -EINVAL;
+	}
+
+	if (bdev_nr_zones(sb->bdev) < ca->mi.nbuckets) {
+		bch_err(ca, "member info nbuckets (%llu) greater than number of zones (%u)",
+			ca->mi.nbuckets,
+			bdev_nr_zones(sb->bdev));
+		return -EINVAL;
+	}
+
+	b->nr = 0;
+	ret = blkdev_report_zones(sb->bdev, 0, ca->mi.nbuckets,
+				  zone_report_capacity, &ca->buckets);
+	/* Success is the (positive) number of zones reported, not 0 */
+	if (ret < 0) {
+		bch_err(ca, "error getting zone capacities");
+		return ret;
+	}
+
+	/* Sum each run's length in buckets times its per-zone capacity */
+	ca->capacity = 0;
+	for (i = 0; i < b->nr; i++) {
+		u64 next = i + 1 < b->nr
+			? b->d[i + 1].start
+			: ca->mi.nbuckets;
+
+		ca->capacity += (next - b->d[i].start) * b->d[i].sectors;
+	}
+
+	BUG_ON(ca->capacity > ca->mi.bucket_size * ca->mi.nbuckets);
+
+	/* Reorder into eytzinger layout for bucket_capacity() lookups */
+	eytzinger0_sort(b->d, b->nr, sizeof(*b->d), bucket_capacity_cmp, NULL);
+
+	return 0;
+}
diff --git a/fs/bcachefs/zone.h b/fs/bcachefs/zone.h
index aa3653bdb59b..620efc4fcbdc 100644
--- a/fs/bcachefs/zone.h
+++ b/fs/bcachefs/zone.h
@@ -2,6 +2,8 @@
 #ifndef _BCACHEFS_ZONE_H
 #define _BCACHEFS_ZONE_H
 
+#include "eytzinger.h"
+
 static inline bool blk_zone_writeable(struct blk_zone zone)
 {
 	return (zone.cond == BLK_ZONE_COND_EMPTY ||
@@ -10,8 +12,51 @@ static inline bool blk_zone_writeable(struct blk_zone zone)
 		zone.cond == BLK_ZONE_COND_CLOSED);
 }
 
-int bch2_zone_report(struct block_device *, sector_t, struct blk_zone *);
+/* Order bucket_capacity entries by their starting bucket index. */
+static inline int bucket_capacity_cmp(const void *_l, const void *_r, size_t size)
+{
+	const struct bucket_capacity *l = _l;
+	const struct bucket_capacity *r = _r;
+
+	return cmp_int(l->start, r->start);
+}
+
+/*
+ * Usable sectors in @bucket: the full bucket size on non-zoned devices,
+ * otherwise the capacity of the table entry with the greatest start that
+ * is <= @bucket.
+ */
+static inline unsigned bucket_capacity(struct bch_dev *ca, size_t bucket)
+{
+	struct bucket_capacities *b = &ca->buckets;
+	struct bucket_capacity search = { .start = bucket };
+	ssize_t idx;
+
+	if (!ca->zoned)
+		return ca->mi.bucket_size;
+
+	idx = eytzinger0_find_le(b->d, b->nr,
+				 sizeof(b->d[0]),
+				 bucket_capacity_cmp, &search);
+
+	/* Debug cross-check: a linear scan must agree with the eytzinger search */
+	{
+		ssize_t j = -1, k;
+
+		for (k = 0; k < b->nr; k++)
+			if (b->d[k].start <= bucket &&
+			    (j < 0 || b->d[k].start > b->d[j].start))
+				j = k;
+
+		BUG_ON(idx != j);
+	}
+
+	return b->d[idx].sectors;
+}
+
 void bch2_bucket_discard(struct bch_dev *, u64);
 void bch2_bucket_finish(struct bch_dev *, u64);
+void bch2_dev_zones_exit(struct bch_dev *);
+int bch2_dev_zones_init(struct bch_dev *, struct bch_sb_handle *);
 
 #endif /* _BCACHEFS_ZONE_H */
-- 
cgit v1.2.3