author		Kent Overstreet <kent.overstreet@gmail.com>	2021-01-21 21:52:06 -0500
committer	Kent Overstreet <kent.overstreet@gmail.com>	2021-04-27 12:22:43 -0400
commit		71c16758d9dd3470e945bb2062ea5db67a60f1ed (patch)
tree		913f96e1244640a68c0cd3654a9271f1eabf801b /fs
parent		32737e888c2e78870ae0a175402c6586e2df5864 (diff)
bcachefs: Journal updates to dev usage
This eliminates the need to scan every bucket to regenerate dev_usage at
mount time.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
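
In broad strokes: per-device usage is now accounted against the journal, and the current totals are written into each journal entry as a new jset_entry_dev_usage entry (added in bcachefs_format.h below), so recovery can copy them straight into ca->usage_base instead of walking every bucket. Below is a minimal user-space sketch of that replay step; the field layout mirrors the structs added by this patch, but the helper name dev_usage_from_journal() and the NR_DATA_TYPES constant are illustrative assumptions, not kernel API.

```c
/*
 * Simplified, user-space sketch of decoding a dev_usage journal entry.
 * The layout mirrors struct jset_entry_dev_usage added by this patch;
 * NR_DATA_TYPES and dev_usage_from_journal() are illustrative only.
 */
#include <stdint.h>
#include <string.h>
#include <endian.h>	/* le64toh(); glibc-specific */

#define NR_DATA_TYPES 7	/* stands in for BCH_DATA_NR */

struct dev_usage_type { uint64_t buckets, sectors, fragmented; };

/* in-memory summary, analogous to struct bch_dev_usage */
struct dev_usage {
	uint64_t buckets_ec;
	uint64_t buckets_unavailable;
	struct dev_usage_type d[NR_DATA_TYPES];
};

/* on-disk payload: fixed header followed by one triple per data type */
struct disk_dev_usage_entry {
	uint32_t dev;
	uint32_t pad;
	uint64_t buckets_ec;
	uint64_t buckets_unavailable;
	struct dev_usage_type d[];	/* little-endian on disk */
};

/*
 * At mount, recovery reads the newest such entry for each device and
 * copies it into the device's base usage -- no bucket scan required.
 */
static void dev_usage_from_journal(struct dev_usage *u,
				   const struct disk_dev_usage_entry *e,
				   unsigned nr_types)
{
	unsigned i;

	memset(u, 0, sizeof(*u));
	u->buckets_ec		= le64toh(e->buckets_ec);
	u->buckets_unavailable	= le64toh(e->buckets_unavailable);

	for (i = 0; i < nr_types && i < NR_DATA_TYPES; i++) {
		u->d[i].buckets	   = le64toh(e->d[i].buckets);
		u->d[i].sectors	   = le64toh(e->d[i].sectors);
		u->d[i].fragmented = le64toh(e->d[i].fragmented);
	}
}
```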
Diffstat (limited to 'fs')
-rw-r--r--	fs/bcachefs/alloc_background.c	  4
-rw-r--r--	fs/bcachefs/bcachefs.h		  6
-rw-r--r--	fs/bcachefs/bcachefs_format.h	 20
-rw-r--r--	fs/bcachefs/btree_gc.c		 38
-rw-r--r--	fs/bcachefs/buckets.c		102
-rw-r--r--	fs/bcachefs/buckets.h		  7
-rw-r--r--	fs/bcachefs/journal_io.c	 37
-rw-r--r--	fs/bcachefs/recovery.c		 21
-rw-r--r--	fs/bcachefs/super-io.c		 22
-rw-r--r--	fs/bcachefs/super.c		 37
10 files changed, 219 insertions, 75 deletions
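
Before the patch body, a rough model of the accounting scheme it introduces may help: each device keeps a usage_base snapshot plus one set of percpu deltas per journal buffer. Updates go to the delta belonging to the journal sequence being marked, readers sum the base plus all outstanding deltas (under the c->usage_lock seqcount in the kernel), and when a journal buffer is written its delta is folded into the base, which is what ends up in the next jset_entry_dev_usage. The sketch below is a single-threaded user-space model with plain arrays standing in for the percpu counters; JOURNAL_BUFS and USAGE_U64S are illustrative stand-ins for JOURNAL_BUF_NR and dev_usage_u64s().

```c
/* Toy model of base-plus-per-journal-buffer accounting (not kernel code). */
#include <stdint.h>
#include <string.h>

#define JOURNAL_BUFS 2		/* stands in for JOURNAL_BUF_NR */
#define USAGE_U64S   4		/* stands in for dev_usage_u64s() */

struct dev_acct {
	uint64_t base[USAGE_U64S];
	uint64_t delta[JOURNAL_BUFS][USAGE_U64S];
};

/* updates go to the delta for the journal buffer owning this sequence */
static void acct_update(struct dev_acct *a, uint64_t journal_seq,
			unsigned field, int64_t change)
{
	a->delta[journal_seq & (JOURNAL_BUFS - 1)][field] += change;
}

/* readers see the base plus every outstanding delta */
static void acct_read(const struct dev_acct *a, uint64_t out[USAGE_U64S])
{
	unsigned i, j;

	memcpy(out, a->base, sizeof(a->base));
	for (i = 0; i < JOURNAL_BUFS; i++)
		for (j = 0; j < USAGE_U64S; j++)
			out[j] += a->delta[i][j];
}

/* when journal buffer idx is written, its delta is folded into the base */
static void acct_flush(struct dev_acct *a, unsigned idx)
{
	unsigned j;

	for (j = 0; j < USAGE_U64S; j++) {
		a->base[j] += a->delta[idx][j];
		a->delta[idx][j] = 0;
	}
}

int main(void)
{
	struct dev_acct a = {0};
	uint64_t snapshot[USAGE_U64S];

	acct_update(&a, 42, 0, +16);		/* mark against journal seq 42 */
	acct_read(&a, snapshot);		/* sees base + outstanding deltas */
	acct_flush(&a, 42 & (JOURNAL_BUFS - 1));/* fold delta when seq 42 is written */
	return snapshot[0] == 16 ? 0 : 1;
}
```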
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 367d93e5f2e1..a91caf04fc9a 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -350,10 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 		return ret;
 	}
 
-	percpu_down_write(&c->mark_lock);
-	bch2_dev_usage_from_buckets(c);
-	percpu_up_write(&c->mark_lock);
-
 	return 0;
 }
 
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f578d5e7442a..fa36e7641e32 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -429,7 +429,9 @@ struct bch_dev {
 	unsigned long		*buckets_nouse;
 	struct rw_semaphore	bucket_lock;
 
-	struct bch_dev_usage __percpu *usage[2];
+	struct bch_dev_usage		*usage_base;
+	struct bch_dev_usage __percpu	*usage[JOURNAL_BUF_NR];
+	struct bch_dev_usage __percpu	*usage_gc;
 
 	/* Allocator: */
 	struct task_struct __rcu *alloc_thread;
@@ -582,6 +584,8 @@ struct bch_fs {
 
 	struct journal_entry_res replicas_journal_res;
 
+	struct journal_entry_res dev_usage_journal_res;
+
 	struct bch_disk_groups_cpu __rcu *disk_groups;
 
 	struct bch_opts		opts;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 2df1949dc9da..30e77190d97a 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1505,7 +1505,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 	x(blacklist_v2,		4)		\
 	x(usage,		5)		\
 	x(data_usage,		6)		\
-	x(clock,		7)
+	x(clock,		7)		\
+	x(dev_usage,		8)
 
 enum {
 #define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
@@ -1560,6 +1561,23 @@ struct jset_entry_clock {
 	__le64			time;
 } __attribute__((packed));
 
+struct jset_entry_dev_usage_type {
+	__le64			buckets;
+	__le64			sectors;
+	__le64			fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+	struct jset_entry	entry;
+	__le32			dev;
+	__u32			pad;
+
+	__le64			buckets_ec;
+	__le64			buckets_unavailable;
+
+	struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 303ace78ced6..c2c8a34f735d 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
 			ca->mi.nbuckets * sizeof(struct bucket));
 		ca->buckets[1] = NULL;
 
-		free_percpu(ca->usage[1]);
-		ca->usage[1] = NULL;
+		free_percpu(ca->usage_gc);
+		ca->usage_gc = NULL;
 	}
 
 	free_percpu(c->usage_gc);
@@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
 	struct bch_dev *ca;
 	bool verify = (!initial ||
 		       (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
-	unsigned i;
+	unsigned i, dev;
 	int ret = 0;
 
 #define copy_field(_f, _msg, ...)					\
@@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
 		}
 	}
 
-	for_each_member_device(ca, c, i) {
+	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+		bch2_fs_usage_acc_to_base(c, i);
+
+	for_each_member_device(ca, c, dev) {
 		struct bucket_array *dst = __bucket_array(ca, 0);
 		struct bucket_array *src = __bucket_array(ca, 1);
 		size_t b;
@@ -801,12 +804,23 @@ static int bch2_gc_done(struct bch_fs *c,
 
 			dst->b[b].oldest_gen = src->b[b].oldest_gen;
 		}
-	};
 
-	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-		bch2_fs_usage_acc_to_base(c, i);
+		{
+			struct bch_dev_usage *dst = ca->usage_base;
+			struct bch_dev_usage *src = (void *)
+				bch2_acc_percpu_u64s((void *) ca->usage_gc,
+						     dev_usage_u64s());
+
+			copy_dev_field(buckets_ec,		"buckets_ec");
+			copy_dev_field(buckets_unavailable,	"buckets_unavailable");
 
-	bch2_dev_usage_from_buckets(c);
+			for (i = 0; i < BCH_DATA_NR; i++) {
+				copy_dev_field(d[i].buckets,	"%s buckets", bch2_data_types[i]);
+				copy_dev_field(d[i].sectors,	"%s sectors", bch2_data_types[i]);
+				copy_dev_field(d[i].fragmented,	"%s fragmented", bch2_data_types[i]);
+			}
+		}
+	};
 
 	{
 		unsigned nr = fs_usage_u64s(c);
@@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
 
 	for_each_member_device(ca, c, i) {
 		BUG_ON(ca->buckets[1]);
-		BUG_ON(ca->usage[1]);
+		BUG_ON(ca->usage_gc);
 
 		ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
 				ca->mi.nbuckets * sizeof(struct bucket),
@@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
 			return -ENOMEM;
 		}
 
-		ca->usage[1] = alloc_percpu(struct bch_dev_usage);
-		if (!ca->usage[1]) {
-			bch_err(c, "error allocating ca->usage[gc]");
+		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage_gc) {
+			bch_err(c, "error allocating ca->usage_gc");
 			percpu_ref_put(&ca->ref);
 			return -ENOMEM;
 		}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 0bce4bfff9e8..ef79f5cac64d 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
 	struct bch_fs_usage *usage;
+	struct bch_dev *ca;
 	unsigned i;
 
 	percpu_down_write(&c->mark_lock);
@@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
 		fs_usage_data_type_to_base(usage, e->data_type,
 					   usage->replicas[i]);
 	}
 
+	for_each_member_device(ca, c, i) {
+		struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+		usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+				  dev.d[BCH_DATA_journal].buckets) *
+			ca->mi.bucket_size;
+	}
+
 	percpu_up_write(&c->mark_lock);
 }
@@ -189,14 +198,27 @@ out_pool:
 	return ret;
 }
 
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+						  unsigned journal_seq,
+						  bool gc)
+{
+	return this_cpu_ptr(gc
+			    ? ca->usage_gc
+			    : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
 {
+	struct bch_fs *c = ca->fs;
 	struct bch_dev_usage ret;
+	unsigned seq, i, u64s = dev_usage_u64s();
 
-	memset(&ret, 0, sizeof(ret));
-	acc_u64s_percpu((u64 *) &ret,
-			(u64 __percpu *) ca->usage[0],
-			sizeof(ret) / sizeof(u64));
+	do {
+		seq = read_seqcount_begin(&c->usage_lock);
+		memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+		for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+			acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+	} while (read_seqcount_retry(&c->usage_lock, seq));
 
 	return ret;
 }
@@ -261,7 +283,8 @@ retry:
 
 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 {
-	unsigned u64s = fs_usage_u64s(c);
+	struct bch_dev *ca;
+	unsigned i, u64s = fs_usage_u64s(c);
 
 	BUG_ON(idx >= ARRAY_SIZE(c->usage));
 
@@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 			(u64 __percpu *) c->usage[idx], u64s);
 	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
 
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL) {
+		u64s = dev_usage_u64s();
+
+		acc_u64s_percpu((u64 *) ca->usage_base,
+				(u64 __percpu *) ca->usage[idx], u64s);
+		percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+	}
+	rcu_read_unlock();
+
 	write_seqcount_end(&c->usage_lock);
 	preempt_enable();
 }
@@ -454,14 +487,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 				  struct bch_fs_usage *fs_usage,
 				  struct bucket_mark old, struct bucket_mark new,
-				  bool gc)
+				  u64 journal_seq, bool gc)
 {
 	struct bch_dev_usage *u;
 
 	percpu_rwsem_assert_held(&c->mark_lock);
 
 	preempt_disable();
-	u = this_cpu_ptr(ca->usage[gc]);
+	u = dev_usage_ptr(ca, journal_seq, gc);
 
 	if (bucket_type(old))
 		account_bucket(fs_usage, u, bucket_type(old),
@@ -491,31 +524,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 		bch2_wake_allocator(ca);
 }
 
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
-	struct bch_dev *ca;
-	struct bucket_mark old = { .v.counter = 0 };
-	struct bucket_array *buckets;
-	struct bucket *g;
-	unsigned i;
-	int cpu;
-
-	c->usage_base->hidden = 0;
-
-	for_each_member_device(ca, c, i) {
-		for_each_possible_cpu(cpu)
-			memset(per_cpu_ptr(ca->usage[0], cpu), 0,
-			       sizeof(*ca->usage[0]));
-
-		buckets = bucket_array(ca);
-
-		for_each_bucket(g, buckets)
-			bch2_dev_usage_update(c, ca, c->usage_base,
-					      old, g->mark, false);
-	}
-}
-
 static inline int update_replicas(struct bch_fs *c,
 				  struct bch_fs_usage *fs_usage,
 				  struct bch_replicas_entry *r,
@@ -653,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.owned_by_allocator	= owned_by_allocator;
 	}));
 
-	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+	/*
+	 * XXX: this is wrong, this means we'll be doing updates to the percpu
+	 * buckets_alloc counter that don't have an open journal buffer and
+	 * we'll race with the machinery that accumulates that to ca->usage_base
+	 */
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
 
 	BUG_ON(!gc &&
 	       !owned_by_allocator && !old.owned_by_allocator);
@@ -717,7 +730,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
 	}
 	}));
 
-	bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+	bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
 
 	g->io_time[READ]	= u.read_time;
 	g->io_time[WRITE]	= u.write_time;
@@ -782,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
 	if (c)
 		bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
-				      old, new, gc);
+				      old, new, 0, gc);
 
 	return 0;
 }
@@ -963,7 +976,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
 	g->stripe		= k.k->p.offset;
 	g->stripe_redundancy	= s->nr_redundant;
 
-	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
 
 	return 0;
 }
@@ -1030,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
 			      old.v.counter,
 			      new.v.counter)) != old.v.counter);
 
-	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
 
 	BUG_ON(!gc && bucket_became_unavailable(old, new));
 
@@ -2396,13 +2409,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
 		    sizeof(struct bucket_array) +
 		    ca->mi.nbuckets * sizeof(struct bucket));
 
-	free_percpu(ca->usage[0]);
+	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+		free_percpu(ca->usage[i]);
+	kfree(ca->usage_base);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-	if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+	unsigned i;
+
+	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+	if (!ca->usage_base)
 		return -ENOMEM;
 
+	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage[i])
+			return -ENOMEM;
+	}
+
 	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
 }
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 659f1ba01b6f..6d15c455e7cc 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -162,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
 
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
 					  struct bch_dev_usage stats)
 {
@@ -207,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
 		READ_ONCE(c->replicas.nr);
 }
 
+static inline unsigned dev_usage_u64s(void)
+{
+	return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
 void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
 struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
 
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 39b8cbe178b0..2abca1644cdc 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -453,6 +453,43 @@ fsck_err:
 	return ret;
 }
 
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+					    struct jset *jset,
+					    struct jset_entry *entry,
+					    int write)
+{
+	struct jset_entry_dev_usage *u =
+		container_of(entry, struct jset_entry_dev_usage, entry);
+	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+	unsigned dev;
+	int ret = 0;
+
+	if (journal_entry_err_on(bytes < expected,
+				 c, "invalid journal entry dev usage: bad size (%u < %u)",
+				 bytes, expected)) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+	dev = le32_to_cpu(u->dev);
+
+	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+				 c, "invalid journal entry dev usage: bad dev")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+	if (journal_entry_err_on(u->pad,
+				 c, "invalid journal entry dev usage: bad pad")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+fsck_err:
+	return ret;
+}
+
 struct jset_entry_ops {
 	int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, int);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 55f7771e11c8..7ba098adcab9 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
 	case BCH_JSET_ENTRY_data_usage: {
 		struct jset_entry_data_usage *u =
 			container_of(entry, struct jset_entry_data_usage, entry);
+
 		ret = bch2_replicas_set_usage(c, &u->r,
 					      le64_to_cpu(u->v));
 		break;
 	}
+	case BCH_JSET_ENTRY_dev_usage: {
+		struct jset_entry_dev_usage *u =
+			container_of(entry, struct jset_entry_dev_usage, entry);
+		struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
+		unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+		unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+			sizeof(struct jset_entry_dev_usage_type);
+		unsigned i;
+
+		ca->usage_base->buckets_ec		= le64_to_cpu(u->buckets_ec);
+		ca->usage_base->buckets_unavailable	= le64_to_cpu(u->buckets_unavailable);
+
+		for (i = 0; i < nr_types; i++) {
+			ca->usage_base->d[i].buckets	= le64_to_cpu(u->d[i].buckets);
+			ca->usage_base->d[i].sectors	= le64_to_cpu(u->d[i].sectors);
+			ca->usage_base->d[i].fragmented	= le64_to_cpu(u->d[i].fragmented);
+		}
+
+		break;
+	}
 	case BCH_JSET_ENTRY_blacklist: {
 		struct jset_entry_blacklist *bl_entry =
 			container_of(entry, struct jset_entry_blacklist, entry);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 068262917e10..a510a25e2edb 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -983,7 +983,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
 					   struct jset_entry **end,
 					   u64 journal_seq)
 {
-	unsigned i;
+	struct bch_dev *ca;
+	unsigned i, dev;
 
 	percpu_down_write(&c->mark_lock);
 
@@ -1037,6 +1038,25 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
 		memcpy(&u->r, e, replicas_entry_bytes(e));
 	}
 
+	for_each_member_device(ca, c, dev) {
+		unsigned b = sizeof(struct jset_entry_dev_usage) +
+			sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+		struct jset_entry_dev_usage *u =
+			container_of(jset_entry_init(end, b),
+				     struct jset_entry_dev_usage, entry);
+
+		u->entry.type = BCH_JSET_ENTRY_dev_usage;
+		u->dev = cpu_to_le32(dev);
+		u->buckets_ec		= cpu_to_le64(ca->usage_base->buckets_ec);
+		u->buckets_unavailable	= cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+		for (i = 0; i < BCH_DATA_NR; i++) {
+			u->d[i].buckets		= cpu_to_le64(ca->usage_base->d[i].buckets);
+			u->d[i].sectors		= cpu_to_le64(ca->usage_base->d[i].sectors);
+			u->d[i].fragmented	= cpu_to_le64(ca->usage_base->d[i].fragmented);
+		}
+	}
+
 	percpu_up_write(&c->mark_lock);
 
 	for (i = 0; i < 2; i++) {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 0d35df66e8cf..c0b8e9cc9d5a 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -148,6 +148,22 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
 	return c;
 }
 
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i, nr = 0, u64s =
+		(sizeof(struct jset_entry_dev_usage) +
+		 sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR);
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL)
+		nr++;
+	rcu_read_unlock();
+
+	bch2_journal_entry_res_resize(&c->journal,
+			&c->dev_usage_journal_res, u64s * nr);
+}
+
 /* Filesystem RO/RW: */
 
 /*
@@ -772,6 +788,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_fsio_init(c))
 		goto err;
 
+	bch2_dev_usage_journal_reserve(c);
+
 	mi = bch2_sb_get_members(c->disk_sb.sb);
 	for (i = 0; i < c->sb.nr_devices; i++)
 		if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -1510,6 +1528,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 	mutex_unlock(&c->sb_lock);
 
 	up_write(&c->state_lock);
+
+	bch2_dev_usage_journal_reserve(c);
 	return 0;
 err:
 	if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@@ -1519,19 +1539,6 @@ err:
 	return ret;
 }
 
-static void dev_usage_clear(struct bch_dev *ca)
-{
-	struct bucket_array *buckets;
-
-	percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
-	down_read(&ca->bucket_lock);
-	buckets = bucket_array(ca);
-
-	memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
-	up_read(&ca->bucket_lock);
-}
-
 /* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
@@ -1589,8 +1596,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	if (ret)
 		goto err;
 
-	dev_usage_clear(ca);
-
 	down_write(&c->state_lock);
 	mutex_lock(&c->sb_lock);
 
@@ -1644,6 +1649,8 @@ have_slot:
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
 
+	bch2_dev_usage_journal_reserve(c);
+
 	err = "error marking superblock";
 	ret = bch2_trans_mark_dev_sb(c, NULL, ca);
 	if (ret)
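
A closing note on sizing: the validate and replay paths above derive the number of per-type counters from the entry's length rather than hard-coding BCH_DATA_NR, so an entry written with a different number of data types still parses. The arithmetic, using assumed sizes for illustration (an 8-byte jset_entry header; the kernel uses the real struct sizes), works out as follows:

```c
#include <stdio.h>

int main(void)
{
	/* Assumed sizes, for illustration only */
	unsigned hdr_bytes   = 8;		/* assumed sizeof(struct jset_entry) */
	unsigned fixed_bytes = hdr_bytes + 4 + 4 + 8 + 8;
						/* + dev, pad, buckets_ec, buckets_unavailable */
	unsigned type_bytes  = 3 * 8;		/* buckets, sectors, fragmented */
	unsigned nr_types    = 7;		/* BCH_DATA_NR at the time of this patch */

	/* size of the entry the writer emits (cf. super-io.c above) */
	unsigned entry_bytes = fixed_bytes + type_bytes * nr_types;

	/* what the reader recovers from the entry's length (cf. recovery.c above) */
	unsigned recovered = (entry_bytes - fixed_bytes) / type_bytes;

	printf("entry is %u bytes, carrying %u data types\n", entry_bytes, recovered);
	return 0;
}
```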