author     Kent Overstreet <kent.overstreet@gmail.com>    2017-03-19 15:56:34 -0800
committer  Kent Overstreet <kent.overstreet@gmail.com>    2017-03-19 17:31:47 -0800
commit     5ec39af8eaba49aee7bafa44c661da39e2f40dc3 (patch)
tree       1fb1a981602cbf22c7d2b2dba1168c715d7cecb5 /libbcachefs
parent     bb1941de5378a7b8122d3575dcbc7d0aeb6326f0 (diff)
Rename from bcache-tools to bcachefs-tools
Diffstat (limited to 'libbcachefs')
94 files changed, 44440 insertions, 0 deletions
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c new file mode 100644 index 00000000..6fcac72c --- /dev/null +++ b/libbcachefs/acl.c @@ -0,0 +1,225 @@ +#include "bcachefs.h" + +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(bch_acl_header)) + return ERR_PTR(-EINVAL); + if (((bch_acl_header *)value)->a_version != + cpu_to_le32(BCH_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(bch_acl_header); + count = bch2_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + bch_acl_entry *entry = + (bch_acl_entry *)value; + if ((char *)value + sizeof(bch_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(bch_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(bch_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(bch_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
+ */ +static void *bch2_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + bch_acl_header *ext_acl; + char *e; + size_t n; + + *size = bch2_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count * + sizeof(bch_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION); + e = (char *)ext_acl + sizeof(bch_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + bch_acl_entry *entry = (bch_acl_entry *)e; + + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(bch_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(bch_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +struct posix_acl *bch2_get_acl(struct inode *inode, int type) +{ + struct bch_fs *c = inode->i_sb->s_fs_info; + int name_index; + char *value = NULL; + struct posix_acl *acl; + int ret; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + ret = bch2_xattr_get(c, inode, "", NULL, 0, name_index); + if (ret > 0) { + value = kmalloc(ret, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + ret = bch2_xattr_get(c, inode, "", value, + ret, name_index); + } + if (ret > 0) + acl = bch2_acl_from_disk(value, ret); + else if (ret == -ENODATA || ret == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(ret); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct bch_fs *c = inode->i_sb->s_fs_info; + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &inode->i_mode); + if (ret < 0) + return ret; + else { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty(inode); + if (ret == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + value = bch2_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + ret = bch2_xattr_set(c, inode, "", value, size, 0, name_index); + + kfree(value); + + if (ret == -ERANGE) + ret = -E2BIG; + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h new file mode 100644 index 00000000..2e51726f --- /dev/null +++ b/libbcachefs/acl.h @@ -0,0 +1,56 @@ +/* + File: fs/bch/acl.h + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/posix_acl_xattr.h> + +#define BCH_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} bch_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} bch_acl_entry_short; + +typedef struct { + __le32 a_version; +} bch_acl_header; + +static inline size_t bch2_acl_size(int count) +{ + if (count <= 4) { + return sizeof(bch_acl_header) + + count * sizeof(bch_acl_entry_short); + } else { + return sizeof(bch_acl_header) + + 4 * sizeof(bch_acl_entry_short) + + (count - 4) * sizeof(bch_acl_entry); + } +} + +static inline int bch2_acl_count(size_t size) +{ + ssize_t s; + + size -= sizeof(bch_acl_header); + s = size - 4 * sizeof(bch_acl_entry_short); + if (s < 0) { + if (size % sizeof(bch_acl_entry_short)) + return -1; + return size / sizeof(bch_acl_entry_short); + } else { + if (s % sizeof(bch_acl_entry)) + return -1; + return s / sizeof(bch_acl_entry) + 4; + } +} + +extern struct posix_acl *bch2_get_acl(struct inode *, int); +extern int bch2_set_acl(struct inode *, struct posix_acl *, int); diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c new file mode 100644 index 00000000..3067181c --- /dev/null +++ b/libbcachefs/alloc.c @@ -0,0 +1,1913 @@ +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often have to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * It's important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. 
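+ * (Concretely: a gen is 8 bits, so after 256 invalidations of the same
+ * bucket the gen wraps around, and a pointer written before the wrap could
+ * appear valid again; the BUCKET_GC_GEN_MAX check further down in this file
+ * is the guard that keeps buckets from ever getting near that point.)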
This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch2_bucket_alloc() allocates a single bucket from a specific device. + * + * bch2_bucket_alloc_set() allocates one or more buckets from different devices + * in a given filesystem. + * + * invalidate_buckets() drives all the processes described above. It's called + * from bch2_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "journal.h" +#include "super-io.h" + +#include <linux/blkdev.h> +#include <linux/kthread.h> +#include <linux/math64.h> +#include <linux/random.h> +#include <linux/rcupdate.h> +#include <trace/events/bcachefs.h> + +static void __bch2_bucket_free(struct bch_dev *, struct bucket *); +static void bch2_recalc_min_prio(struct bch_dev *, int); + +/* Allocation groups: */ + +void bch2_dev_group_remove(struct dev_group *grp, struct bch_dev *ca) +{ + unsigned i; + + spin_lock(&grp->lock); + + for (i = 0; i < grp->nr; i++) + if (grp->d[i].dev == ca) { + grp->nr--; + memmove(&grp->d[i], + &grp->d[i + 1], + (grp->nr- i) * sizeof(grp->d[0])); + break; + } + + spin_unlock(&grp->lock); +} + +void bch2_dev_group_add(struct dev_group *grp, struct bch_dev *ca) +{ + unsigned i; + + spin_lock(&grp->lock); + for (i = 0; i < grp->nr; i++) + if (grp->d[i].dev == ca) + goto out; + + BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX); + + grp->d[grp->nr++].dev = ca; +out: + spin_unlock(&grp->lock); +} + +/* Ratelimiting/PD controllers */ + +static void pd_controllers_update(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, + pd_controllers_update); + struct bch_dev *ca; + unsigned i, iter; + + /* All units are in bytes */ + u64 faster_tiers_size = 0; + u64 faster_tiers_dirty = 0; + + u64 fastest_tier_size = 0; + u64 fastest_tier_free = 0; + u64 copygc_can_free = 0; + + rcu_read_lock(); + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + bch2_pd_controller_update(&c->tiers[i].pd, + div_u64(faster_tiers_size * + c->tiering_percent, 100), + faster_tiers_dirty, + -1); + + spin_lock(&c->tiers[i].devs.lock); + group_for_each_dev(ca, &c->tiers[i].devs, iter) { + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned bucket_bits = ca->bucket_bits + 9; + + u64 size = (ca->mi.nbuckets - + ca->mi.first_bucket) << bucket_bits; + u64 dirty = stats.buckets_dirty << bucket_bits; + u64 free = __dev_buckets_free(ca, stats) << bucket_bits; + /* + * Bytes of internal fragmentation, which can be + * reclaimed by copy GC + */ + s64 fragmented = ((stats.buckets_dirty + + stats.buckets_cached) << + bucket_bits) - + ((stats.sectors[S_DIRTY] + + stats.sectors[S_CACHED] ) << 9); + + fragmented = max(0LL, fragmented); + + bch2_pd_controller_update(&ca->moving_gc_pd, + free, fragmented, -1); + + faster_tiers_size += size; + faster_tiers_dirty += dirty; + + if (!c->fastest_tier || + c->fastest_tier == &c->tiers[i]) { + fastest_tier_size 
+= size; + fastest_tier_free += free; + } + + copygc_can_free += fragmented; + } + spin_unlock(&c->tiers[i].devs.lock); + } + + rcu_read_unlock(); + + /* + * Throttle foreground writes if tier 0 is running out of free buckets, + * and either tiering or copygc can free up space. + * + * Target will be small if there isn't any work to do - we don't want to + * throttle foreground writes if we currently have all the free space + * we're ever going to have. + * + * Otherwise, if there's work to do, try to keep 20% of tier0 available + * for foreground writes. + */ + if (c->fastest_tier) + copygc_can_free = U64_MAX; + + bch2_pd_controller_update(&c->foreground_write_pd, + min(copygc_can_free, + div_u64(fastest_tier_size * + c->foreground_target_percent, + 100)), + fastest_tier_free, + -1); + + schedule_delayed_work(&c->pd_controllers_update, + c->pd_controllers_update_seconds * HZ); +} + +/* + * Bucket priorities/gens: + * + * For each bucket, we store on disk its + * 8 bit gen + * 16 bit priority + * + * See alloc.c for an explanation of the gen. The priority is used to implement + * lru (and in the future other) cache replacement policies; for most purposes + * it's just an opaque integer. + * + * The gens and the priorities don't have a whole lot to do with each other, and + * it's actually the gens that must be written out at specific times - it's no + * big deal if the priorities don't get written, if we lose them we just reuse + * buckets in suboptimal order. + * + * On disk they're stored in a packed array, and in as many buckets are required + * to fit them all. The buckets we use to store them form a list; the journal + * header points to the first bucket, the first bucket points to the second + * bucket, et cetera. + * + * This code is used by the allocation code; periodically (whenever it runs out + * of buckets to allocate from) the allocation code will invalidate some + * buckets, but it can't use those buckets until their new gens are safely on + * disk. 
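+ * (Concretely: each on-disk entry is a struct bucket_disk holding the two
+ * 16 bit prios and the 8 bit gen; prios_per_bucket() in alloc.h computes
+ * how many of those fit in one bucket after the struct prio_set header, and
+ * prio_buckets() how many buckets the whole array needs.)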
+ */ + +static int prio_io(struct bch_dev *ca, uint64_t bucket, int op) +{ + bio_init(ca->bio_prio); + bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META); + + ca->bio_prio->bi_max_vecs = bucket_pages(ca); + ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs; + ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size; + ca->bio_prio->bi_bdev = ca->disk_sb.bdev; + ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca); + bch2_bio_map(ca->bio_prio, ca->disk_buckets); + + return submit_bio_wait(ca->bio_prio); +} + +static struct nonce prio_nonce(struct prio_set *p) +{ + return (struct nonce) {{ + [0] = 0, + [1] = p->nonce[0], + [2] = p->nonce[1], + [3] = p->nonce[2]^BCH_NONCE_PRIO, + }}; +} + +static int bch2_prio_write(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct journal *j = &c->journal; + struct journal_res res = { 0 }; + bool need_new_journal_entry; + int i, ret; + + if (c->opts.nochanges) + return 0; + + trace_prio_write_start(ca); + + atomic64_add(ca->mi.bucket_size * prio_buckets(ca), + &ca->meta_sectors_written); + + for (i = prio_buckets(ca) - 1; i >= 0; --i) { + struct bucket *g; + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data; + struct bucket_disk *end = d + prios_per_bucket(ca); + size_t r; + + for (r = i * prios_per_bucket(ca); + r < ca->mi.nbuckets && d < end; + r++, d++) { + g = ca->buckets + r; + d->read_prio = cpu_to_le16(g->read_prio); + d->write_prio = cpu_to_le16(g->write_prio); + d->gen = ca->buckets[r].mark.gen; + } + + p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]); + p->magic = cpu_to_le64(pset_magic(c)); + get_random_bytes(&p->nonce, sizeof(p->nonce)); + + spin_lock(&ca->prio_buckets_lock); + r = bch2_bucket_alloc(ca, RESERVE_PRIO); + BUG_ON(!r); + + /* + * goes here before dropping prio_buckets_lock to guard against + * it getting gc'd from under us + */ + ca->prio_buckets[i] = r; + bch2_mark_metadata_bucket(ca, ca->buckets + r, + BUCKET_PRIOS, false); + spin_unlock(&ca->prio_buckets_lock); + + SET_PSET_CSUM_TYPE(p, bch2_meta_checksum_type(c)); + + bch2_encrypt(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + p->encrypted_start, + bucket_bytes(ca) - + offsetof(struct prio_set, encrypted_start)); + + p->csum = bch2_checksum(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + (void *) p + sizeof(p->csum), + bucket_bytes(ca) - sizeof(p->csum)); + + ret = prio_io(ca, r, REQ_OP_WRITE); + if (bch2_dev_fatal_io_err_on(ret, ca, + "prio write to bucket %zu", r) || + bch2_meta_write_fault("prio")) + return ret; + } + + spin_lock(&j->lock); + j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]); + j->nr_prio_buckets = max_t(unsigned, + ca->dev_idx + 1, + j->nr_prio_buckets); + spin_unlock(&j->lock); + + do { + unsigned u64s = jset_u64s(0); + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) + break; + + ret = bch2_journal_res_get(j, &res, u64s, u64s); + if (ret) + return ret; + + need_new_journal_entry = j->buf[res.idx].nr_prio_buckets < + ca->dev_idx + 1; + bch2_journal_res_put(j, &res); + + ret = bch2_journal_flush_seq(j, res.seq); + if (ret) + return ret; + } while (need_new_journal_entry); + + /* + * Don't want the old priorities to get garbage collected until after we + * finish writing the new ones, and they're journalled + */ + + spin_lock(&ca->prio_buckets_lock); + + for (i = 0; i < prio_buckets(ca); i++) { + if (ca->prio_last_buckets[i]) + __bch2_bucket_free(ca, + &ca->buckets[ca->prio_last_buckets[i]]); + + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } + + spin_unlock(&ca->prio_buckets_lock); + + 
trace_prio_write_end(ca); + return 0; +} + +int bch2_prio_read(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; + struct bucket_mark new; + struct bch_csum csum; + unsigned bucket_nr = 0; + u64 bucket, expect, got; + size_t b; + int ret = 0; + + spin_lock(&c->journal.lock); + bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]); + spin_unlock(&c->journal.lock); + + /* + * If the device hasn't been used yet, there won't be a prio bucket ptr + */ + if (!bucket) + return 0; + + unfixable_fsck_err_on(bucket < ca->mi.first_bucket || + bucket >= ca->mi.nbuckets, c, + "bad prio bucket %llu", bucket); + + for (b = 0; b < ca->mi.nbuckets; b++, d++) { + if (d == end) { + ca->prio_last_buckets[bucket_nr] = bucket; + bucket_nr++; + + ret = prio_io(ca, bucket, REQ_OP_READ); + if (bch2_dev_fatal_io_err_on(ret, ca, + "prior read from bucket %llu", + bucket) || + bch2_meta_read_fault("prio")) + return -EIO; + + got = le64_to_cpu(p->magic); + expect = pset_magic(c); + unfixable_fsck_err_on(got != expect, c, + "bad magic (got %llu expect %llu) while reading prios from bucket %llu", + got, expect, bucket); + + unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c, + "prio bucket with unknown csum type %llu bucket %lluu", + PSET_CSUM_TYPE(p), bucket); + + csum = bch2_checksum(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + (void *) p + sizeof(p->csum), + bucket_bytes(ca) - sizeof(p->csum)); + unfixable_fsck_err_on(bch2_crc_cmp(csum, p->csum), c, + "bad checksum reading prios from bucket %llu", + bucket); + + bch2_encrypt(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + p->encrypted_start, + bucket_bytes(ca) - + offsetof(struct prio_set, encrypted_start)); + + bucket = le64_to_cpu(p->next_bucket); + d = p->data; + } + + ca->buckets[b].read_prio = le16_to_cpu(d->read_prio); + ca->buckets[b].write_prio = le16_to_cpu(d->write_prio); + + bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen); + } + + mutex_lock(&c->bucket_lock); + bch2_recalc_min_prio(ca, READ); + bch2_recalc_min_prio(ca, WRITE); + mutex_unlock(&c->bucket_lock); + + ret = 0; +fsck_err: + return ret; +} + +#define BUCKET_GC_GEN_MAX 96U + +/** + * wait_buckets_available - wait on reclaimable buckets + * + * If there aren't enough available buckets to fill up free_inc, wait until + * there are. 
+ */ +static int wait_buckets_available(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + int ret = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + ret = -1; + break; + } + + if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) { + if (c->gc_thread) { + trace_gc_cannot_inc_gens(ca->fs); + atomic_inc(&c->kick_gc); + wake_up_process(ca->fs->gc_thread); + } + + /* + * We are going to wait for GC to wake us up, even if + * bucket counters tell us enough buckets are available, + * because we are actually waiting for GC to rewrite + * nodes with stale pointers + */ + } else if (dev_buckets_available(ca) >= + fifo_free(&ca->free_inc)) + break; + + up_read(&ca->fs->gc_lock); + schedule(); + try_to_freeze(); + down_read(&ca->fs->gc_lock); + } + + __set_current_state(TASK_RUNNING); + return ret; +} + +static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) +{ + if (expensive_debug_checks(ca->fs)) { + size_t iter; + long i; + unsigned j; + + for (iter = 0; iter < prio_buckets(ca) * 2; iter++) + BUG_ON(ca->prio_buckets[iter] == bucket); + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + BUG_ON(i == bucket); + fifo_for_each_entry(i, &ca->free_inc, iter) + BUG_ON(i == bucket); + } +} + +/* Bucket heap / gen */ + +void bch2_recalc_min_prio(struct bch_dev *ca, int rw) +{ + struct bch_fs *c = ca->fs; + struct prio_clock *clock = &c->prio_clock[rw]; + struct bucket *g; + u16 max_delta = 1; + unsigned i; + + lockdep_assert_held(&c->bucket_lock); + + /* Determine min prio for this particular cache */ + for_each_bucket(g, ca) + max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); + + ca->min_prio[rw] = clock->hand - max_delta; + + /* + * This may possibly increase the min prio for the whole cache, check + * that as well. 
+ */ + max_delta = 1; + + for_each_member_device(ca, c, i) + max_delta = max(max_delta, + (u16) (clock->hand - ca->min_prio[rw])); + + clock->min_prio = clock->hand - max_delta; +} + +static void bch2_rescale_prios(struct bch_fs *c, int rw) +{ + struct prio_clock *clock = &c->prio_clock[rw]; + struct bch_dev *ca; + struct bucket *g; + unsigned i; + + trace_rescale_prios(c); + + for_each_member_device(ca, c, i) { + for_each_bucket(g, ca) + g->prio[rw] = clock->hand - + (clock->hand - g->prio[rw]) / 2; + + bch2_recalc_min_prio(ca, rw); + } +} + +static void bch2_inc_clock_hand(struct io_timer *timer) +{ + struct prio_clock *clock = container_of(timer, + struct prio_clock, rescale); + struct bch_fs *c = container_of(clock, + struct bch_fs, prio_clock[clock->rw]); + u64 capacity; + + mutex_lock(&c->bucket_lock); + + clock->hand++; + + /* if clock cannot be advanced more, rescale prio */ + if (clock->hand == (u16) (clock->min_prio - 1)) + bch2_rescale_prios(c, clock->rw); + + mutex_unlock(&c->bucket_lock); + + capacity = READ_ONCE(c->capacity); + + if (!capacity) + return; + + /* + * we only increment when 0.1% of the filesystem capacity has been read + * or written too, this determines if it's time + * + * XXX: we shouldn't really be going off of the capacity of devices in + * RW mode (that will be 0 when we're RO, yet we can still service + * reads) + */ + timer->expire += capacity >> 10; + + bch2_io_timer_add(&c->io_clock[clock->rw], timer); +} + +static void bch2_prio_timer_init(struct bch_fs *c, int rw) +{ + struct prio_clock *clock = &c->prio_clock[rw]; + struct io_timer *timer = &clock->rescale; + + clock->rw = rw; + timer->fn = bch2_inc_clock_hand; + timer->expire = c->capacity >> 10; +} + +/* + * Background allocation thread: scans for buckets to be invalidated, + * invalidates them, rewrites prios/gens (marking them as invalidated on disk), + * then optionally issues discard commands to the newly free buckets, then puts + * them on the various freelists. + */ + +static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g) +{ + return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX; +} + +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g) +{ + if (!is_available_bucket(READ_ONCE(g->mark))) + return false; + + if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1) + ca->inc_gen_needs_gc++; + + return can_inc_bucket_gen(ca, g); +} + +static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) +{ + spin_lock(&ca->freelist_lock); + + bch2_invalidate_bucket(ca, g); + + g->read_prio = ca->fs->prio_clock[READ].hand; + g->write_prio = ca->fs->prio_clock[WRITE].hand; + + verify_not_on_freelist(ca, g - ca->buckets); + BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); + + spin_unlock(&ca->freelist_lock); +} + +/* + * Determines what order we're going to reuse buckets, smallest bucket_key() + * first. + * + * + * - We take into account the read prio of the bucket, which gives us an + * indication of how hot the data is -- we scale the prio so that the prio + * farthest from the clock is worth 1/8th of the closest. + * + * - The number of sectors of cached data in the bucket, which gives us an + * indication of the cost in cache misses this eviction will cause. + * + * - The difference between the bucket's current gen and oldest gen of any + * pointer into it, which gives us an indication of the cost of an eventual + * btree GC to rewrite nodes with stale pointers. 
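+ * (Worked example for bucket_sort_key() below, with assumed numbers: with
+ * the read clock hand at 1024, min_prio[READ] at 0, and a bucket whose
+ * read_prio is 512, the scaled prio is (512 * 7) / 1024 = 3; if the bucket
+ * holds 100 cached sectors, the key is ((3 + 1) * 100) << 8 = 102400 plus
+ * the bucket's gc gen in the low 8 bits. Cold, mostly empty, low-gen
+ * buckets thus get the smallest keys and are invalidated first.)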
+ */ + +#define bucket_sort_key(g) \ +({ \ + unsigned long prio = g->read_prio - ca->min_prio[READ]; \ + prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - \ + ca->min_prio[READ]); \ + \ + (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\ +}) + +static void invalidate_buckets_lru(struct bch_dev *ca) +{ + struct bucket_heap_entry e; + struct bucket *g; + unsigned i; + + mutex_lock(&ca->heap_lock); + + ca->heap.used = 0; + + mutex_lock(&ca->fs->bucket_lock); + bch2_recalc_min_prio(ca, READ); + bch2_recalc_min_prio(ca, WRITE); + + /* + * Find buckets with lowest read priority, by building a maxheap sorted + * by read priority and repeatedly replacing the maximum element until + * all buckets have been visited. + */ + for_each_bucket(g, ca) { + if (!bch2_can_invalidate_bucket(ca, g)) + continue; + + bucket_heap_push(ca, g, bucket_sort_key(g)); + } + + /* Sort buckets by physical location on disk for better locality */ + for (i = 0; i < ca->heap.used; i++) { + struct bucket_heap_entry *e = &ca->heap.data[i]; + + e->val = e->g - ca->buckets; + } + + heap_resort(&ca->heap, bucket_max_cmp); + + /* + * If we run out of buckets to invalidate, bch2_allocator_thread() will + * kick stuff and retry us + */ + while (!fifo_full(&ca->free_inc) && + heap_pop(&ca->heap, e, bucket_max_cmp)) { + BUG_ON(!bch2_can_invalidate_bucket(ca, e.g)); + bch2_invalidate_one_bucket(ca, e.g); + } + + mutex_unlock(&ca->fs->bucket_lock); + mutex_unlock(&ca->heap_lock); +} + +static void invalidate_buckets_fifo(struct bch_dev *ca) +{ + struct bucket *g; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + if (ca->fifo_last_bucket < ca->mi.first_bucket || + ca->fifo_last_bucket >= ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + g = ca->buckets + ca->fifo_last_bucket++; + + if (bch2_can_invalidate_bucket(ca, g)) + bch2_invalidate_one_bucket(ca, g); + + if (++checked >= ca->mi.nbuckets) + return; + } +} + +static void invalidate_buckets_random(struct bch_dev *ca) +{ + struct bucket *g; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + size_t n = bch2_rand_range(ca->mi.nbuckets - + ca->mi.first_bucket) + + ca->mi.first_bucket; + + g = ca->buckets + n; + + if (bch2_can_invalidate_bucket(ca, g)) + bch2_invalidate_one_bucket(ca, g); + + if (++checked >= ca->mi.nbuckets / 2) + return; + } +} + +static void invalidate_buckets(struct bch_dev *ca) +{ + ca->inc_gen_needs_gc = 0; + + switch (ca->mi.replacement) { + case CACHE_REPLACEMENT_LRU: + invalidate_buckets_lru(ca); + break; + case CACHE_REPLACEMENT_FIFO: + invalidate_buckets_fifo(ca); + break; + case CACHE_REPLACEMENT_RANDOM: + invalidate_buckets_random(ca); + break; + } +} + +static bool __bch2_allocator_push(struct bch_dev *ca, long bucket) +{ + if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) + goto success; + + if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket)) + goto success; + + if (fifo_push(&ca->free[RESERVE_BTREE], bucket)) + goto success; + + if (fifo_push(&ca->free[RESERVE_NONE], bucket)) + goto success; + + return false; +success: + closure_wake_up(&ca->fs->freelist_wait); + return true; +} + +static bool bch2_allocator_push(struct bch_dev *ca, long bucket) +{ + bool ret; + + spin_lock(&ca->freelist_lock); + ret = __bch2_allocator_push(ca, bucket); + if (ret) + fifo_pop(&ca->free_inc, bucket); + spin_unlock(&ca->freelist_lock); + + return ret; +} + +static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + u16 last_seq_ondisk = c->journal.last_seq_ondisk; + struct bucket *g; + + 
for_each_bucket(g, ca) { + struct bucket_mark m = READ_ONCE(g->mark); + + if (is_available_bucket(m) && + !m.cached_sectors && + !m.had_metadata && + !bucket_needs_journal_commit(m, last_seq_ondisk)) { + spin_lock(&ca->freelist_lock); + + bch2_mark_alloc_bucket(ca, g, true); + g->read_prio = c->prio_clock[READ].hand; + g->write_prio = c->prio_clock[WRITE].hand; + + verify_not_on_freelist(ca, g - ca->buckets); + BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); + + spin_unlock(&ca->freelist_lock); + + if (fifo_full(&ca->free_inc)) + break; + } + } +} + +/** + * bch_allocator_thread - move buckets from free_inc to reserves + * + * The free_inc FIFO is populated by invalidate_buckets(), and + * the reserves are depleted by bucket allocation. When we run out + * of free_inc, try to invalidate some buckets and write out + * prios and gens. + */ +static int bch2_allocator_thread(void *arg) +{ + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; + int ret; + + set_freezable(); + + bch2_find_empty_buckets(c, ca); + + while (1) { + /* + * First, we pull buckets off of the free_inc list, possibly + * issue discards to them, then we add the bucket to a + * free list: + */ + + while (!fifo_empty(&ca->free_inc)) { + long bucket = fifo_peek(&ca->free_inc); + + /* + * Don't remove from free_inc until after it's added + * to freelist, so gc doesn't miss it while we've + * dropped bucket lock + */ + + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO, 0); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (bch2_allocator_push(ca, bucket)) + break; + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + goto out; + } + schedule(); + try_to_freeze(); + } + + __set_current_state(TASK_RUNNING); + } + + down_read(&c->gc_lock); + + /* + * See if we have buckets we can reuse without invalidating them + * or forcing a journal commit: + */ + //bch2_find_empty_buckets(c, ca); + + if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { + up_read(&c->gc_lock); + continue; + } + + /* We've run out of free buckets! */ + + while (!fifo_full(&ca->free_inc)) { + if (wait_buckets_available(ca)) { + up_read(&c->gc_lock); + goto out; + } + + /* + * Find some buckets that we can invalidate, either + * they're completely unused, or only contain clean data + * that's been written back to the backing device or + * another cache tier + */ + + invalidate_buckets(ca); + trace_alloc_batch(ca, fifo_used(&ca->free_inc), + ca->free_inc.size); + } + + up_read(&c->gc_lock); + + /* + * free_inc is full of newly-invalidated buckets, must write out + * prios and gens before they can be re-used + */ + ret = bch2_prio_write(ca); + if (ret) { + /* + * Emergency read only - allocator thread has to + * shutdown. + * + * N.B. we better be going into RO mode, else + * allocations would hang indefinitely - whatever + * generated the error will have sent us into RO mode. 
+ * + * Clear out the free_inc freelist so things are + * consistent-ish: + */ + spin_lock(&ca->freelist_lock); + while (!fifo_empty(&ca->free_inc)) { + long bucket; + + fifo_pop(&ca->free_inc, bucket); + bch2_mark_free_bucket(ca, ca->buckets + bucket); + } + spin_unlock(&ca->freelist_lock); + goto out; + } + } +out: + /* + * Avoid a race with bch2_usage_update() trying to wake us up after + * we've exited: + */ + synchronize_rcu(); + return 0; +} + +/* Allocation */ + +/** + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure + * */ +size_t bch2_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve) +{ + struct bucket *g; + long r; + + spin_lock(&ca->freelist_lock); + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) + goto out; + + spin_unlock(&ca->freelist_lock); + + trace_bucket_alloc_fail(ca, reserve); + return 0; +out: + verify_not_on_freelist(ca, r); + spin_unlock(&ca->freelist_lock); + + trace_bucket_alloc(ca, reserve); + + bch2_wake_allocator(ca); + + g = ca->buckets + r; + + g->read_prio = ca->fs->prio_clock[READ].hand; + g->write_prio = ca->fs->prio_clock[WRITE].hand; + + return r; +} + +static void __bch2_bucket_free(struct bch_dev *ca, struct bucket *g) +{ + bch2_mark_free_bucket(ca, g); + + g->read_prio = ca->fs->prio_clock[READ].hand; + g->write_prio = ca->fs->prio_clock[WRITE].hand; +} + +enum bucket_alloc_ret { + ALLOC_SUCCESS, + NO_DEVICES, /* -EROFS */ + FREELIST_EMPTY, /* Allocator thread not keeping up */ +}; + +static void recalc_alloc_group_weights(struct bch_fs *c, + struct dev_group *devs) +{ + struct bch_dev *ca; + u64 available_buckets = 1; /* avoid a divide by zero... */ + unsigned i; + + for (i = 0; i < devs->nr; i++) { + ca = devs->d[i].dev; + + devs->d[i].weight = dev_buckets_free(ca); + available_buckets += devs->d[i].weight; + } + + for (i = 0; i < devs->nr; i++) { + const unsigned min_weight = U32_MAX >> 4; + const unsigned max_weight = U32_MAX; + + devs->d[i].weight = + min_weight + + div64_u64(devs->d[i].weight * + devs->nr * + (max_weight - min_weight), + available_buckets); + devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); + } +} + +static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c, + struct open_bucket *ob, + enum alloc_reserve reserve, + unsigned nr_replicas, + struct dev_group *devs, + long *devs_used) +{ + enum bucket_alloc_ret ret; + unsigned fail_idx = -1, i; + unsigned available = 0; + + BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); + + if (ob->nr_ptrs >= nr_replicas) + return ALLOC_SUCCESS; + + spin_lock(&devs->lock); + + for (i = 0; i < devs->nr; i++) + available += !test_bit(devs->d[i].dev->dev_idx, + devs_used); + + recalc_alloc_group_weights(c, devs); + + i = devs->cur_device; + + while (ob->nr_ptrs < nr_replicas) { + struct bch_dev *ca; + u64 bucket; + + if (!available) { + ret = NO_DEVICES; + goto err; + } + + i++; + i %= devs->nr; + + ret = FREELIST_EMPTY; + if (i == fail_idx) + goto err; + + ca = devs->d[i].dev; + + if (test_bit(ca->dev_idx, devs_used)) + continue; + + if (fail_idx == -1 && + get_random_int() > devs->d[i].weight) + continue; + + bucket = bch2_bucket_alloc(ca, reserve); + if (!bucket) { + if (fail_idx == -1) + fail_idx = i; + continue; + } + + /* + * open_bucket_add_buckets expects new pointers at the head of + * the list: + */ + memmove(&ob->ptrs[1], + &ob->ptrs[0], + ob->nr_ptrs * sizeof(ob->ptrs[0])); + memmove(&ob->ptr_offset[1], + &ob->ptr_offset[0], + ob->nr_ptrs * 
sizeof(ob->ptr_offset[0])); + ob->nr_ptrs++; + ob->ptrs[0] = (struct bch_extent_ptr) { + .gen = ca->buckets[bucket].mark.gen, + .offset = bucket_to_sector(ca, bucket), + .dev = ca->dev_idx, + }; + ob->ptr_offset[0] = 0; + + __set_bit(ca->dev_idx, devs_used); + available--; + devs->cur_device = i; + } + + ret = ALLOC_SUCCESS; +err: + EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); + spin_unlock(&devs->lock); + return ret; +} + +static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, + struct write_point *wp, + struct open_bucket *ob, + unsigned nr_replicas, + enum alloc_reserve reserve, + long *devs_used) +{ + struct bch_tier *tier; + /* + * this should implement policy - for a given type of allocation, decide + * which devices to allocate from: + * + * XXX: switch off wp->type and do something more intelligent here + */ + if (wp->group) + return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas, + wp->group, devs_used); + + /* foreground writes: prefer fastest tier: */ + tier = READ_ONCE(c->fastest_tier); + if (tier) + bch2_bucket_alloc_group(c, ob, reserve, nr_replicas, + &tier->devs, devs_used); + + return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas, + &c->all_devs, devs_used); +} + +static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, + struct open_bucket *ob, unsigned nr_replicas, + enum alloc_reserve reserve, long *devs_used, + struct closure *cl) +{ + bool waiting = false; + + while (1) { + switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas, + reserve, devs_used)) { + case ALLOC_SUCCESS: + if (waiting) + closure_wake_up(&c->freelist_wait); + + return 0; + + case NO_DEVICES: + if (waiting) + closure_wake_up(&c->freelist_wait); + return -EROFS; + + case FREELIST_EMPTY: + if (!cl || waiting) + trace_freelist_empty_fail(c, + reserve, cl); + + if (!cl) + return -ENOSPC; + + if (waiting) + return -EAGAIN; + + /* Retry allocation after adding ourself to waitlist: */ + closure_wait(&c->freelist_wait, cl); + waiting = true; + break; + default: + BUG(); + } + } +} + +/* Open buckets: */ + +/* + * Open buckets represent one or more buckets (on multiple devices) that are + * currently being allocated from. They serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. 
+ */ + +static void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + const struct bch_extent_ptr *ptr; + + lockdep_assert_held(&c->open_buckets_lock); + + open_bucket_for_each_ptr(ob, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; + + bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false); + } + + ob->nr_ptrs = 0; + + list_move(&ob->list, &c->open_buckets_free); + c->open_buckets_nr_free++; + closure_wake_up(&c->open_buckets_wait); +} + +void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *b) +{ + if (atomic_dec_and_test(&b->pin)) { + spin_lock(&c->open_buckets_lock); + __bch2_open_bucket_put(c, b); + spin_unlock(&c->open_buckets_lock); + } +} + +static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c, + unsigned nr_reserved, + struct closure *cl) +{ + struct open_bucket *ret; + + spin_lock(&c->open_buckets_lock); + + if (c->open_buckets_nr_free > nr_reserved) { + BUG_ON(list_empty(&c->open_buckets_free)); + ret = list_first_entry(&c->open_buckets_free, + struct open_bucket, list); + list_move(&ret->list, &c->open_buckets_open); + BUG_ON(ret->nr_ptrs); + + atomic_set(&ret->pin, 1); /* XXX */ + ret->has_full_ptrs = false; + + c->open_buckets_nr_free--; + trace_open_bucket_alloc(c, cl); + } else { + trace_open_bucket_alloc_fail(c, cl); + + if (cl) { + closure_wait(&c->open_buckets_wait, cl); + ret = ERR_PTR(-EAGAIN); + } else + ret = ERR_PTR(-ENOSPC); + } + + spin_unlock(&c->open_buckets_lock); + + return ret; +} + +static unsigned ob_ptr_sectors_free(struct bch_fs *c, + struct open_bucket *ob, + struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = c->devs[ptr->dev]; + unsigned i = ptr - ob->ptrs; + unsigned bucket_size = ca->mi.bucket_size; + unsigned used = (ptr->offset & (bucket_size - 1)) + + ob->ptr_offset[i]; + + BUG_ON(used > bucket_size); + + return bucket_size - used; +} + +static unsigned open_bucket_sectors_free(struct bch_fs *c, + struct open_bucket *ob, + unsigned nr_replicas) +{ + unsigned i, sectors_free = UINT_MAX; + + for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++) + sectors_free = min(sectors_free, + ob_ptr_sectors_free(c, ob, &ob->ptrs[i])); + + return sectors_free != UINT_MAX ? 
sectors_free : 0; +} + +static void open_bucket_copy_unused_ptrs(struct bch_fs *c, + struct open_bucket *new, + struct open_bucket *old) +{ + unsigned i; + + for (i = 0; i < old->nr_ptrs; i++) + if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) { + struct bch_extent_ptr tmp = old->ptrs[i]; + + tmp.offset += old->ptr_offset[i]; + new->ptrs[new->nr_ptrs] = tmp; + new->ptr_offset[new->nr_ptrs] = 0; + new->nr_ptrs++; + } +} + +static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + const struct bch_extent_ptr *ptr; + + open_bucket_for_each_ptr(ob, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; + + BUG_ON(ptr_stale(ca, ptr)); + } +#endif +} + +/* Sector allocator */ + +static struct open_bucket *lock_writepoint(struct bch_fs *c, + struct write_point *wp) +{ + struct open_bucket *ob; + + while ((ob = ACCESS_ONCE(wp->b))) { + mutex_lock(&ob->lock); + if (wp->b == ob) + break; + + mutex_unlock(&ob->lock); + } + + return ob; +} + +static int open_bucket_add_buckets(struct bch_fs *c, + struct write_point *wp, + struct open_bucket *ob, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + struct closure *cl) +{ + long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; + unsigned i; + int ret; + + /* + * We might be allocating pointers to add to an existing extent + * (tiering/copygc/migration) - if so, some of the pointers in our + * existing open bucket might duplicate devices we already have. This is + * moderately annoying. + */ + + /* Short circuit all the fun stuff if posssible: */ + if (ob->nr_ptrs >= nr_replicas) + return 0; + + memset(devs_used, 0, sizeof(devs_used)); + + for (i = 0; i < ob->nr_ptrs; i++) + __set_bit(ob->ptrs[i].dev, devs_used); + + ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas, + reserve, devs_used, cl); + + if (ret == -EROFS && + ob->nr_ptrs >= nr_replicas_required) + ret = 0; + + return ret; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ +struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c, + struct write_point *wp, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + struct closure *cl) +{ + struct open_bucket *ob; + unsigned open_buckets_reserved = wp == &c->btree_write_point + ? 0 : BTREE_NODE_RESERVE; + int ret; + + BUG_ON(!reserve); + BUG_ON(!nr_replicas); +retry: + ob = lock_writepoint(c, wp); + + /* + * If ob->sectors_free == 0, one or more of the buckets ob points to is + * full. We can't drop pointers from an open bucket - garbage collection + * still needs to find them; instead, we must allocate a new open bucket + * and copy any pointers to non-full buckets into the new open bucket. 
+ */ + if (!ob || ob->has_full_ptrs) { + struct open_bucket *new_ob; + + new_ob = bch2_open_bucket_get(c, open_buckets_reserved, cl); + if (IS_ERR(new_ob)) + return new_ob; + + mutex_lock(&new_ob->lock); + + /* + * We point the write point at the open_bucket before doing the + * allocation to avoid a race with shutdown: + */ + if (race_fault() || + cmpxchg(&wp->b, ob, new_ob) != ob) { + /* We raced: */ + mutex_unlock(&new_ob->lock); + bch2_open_bucket_put(c, new_ob); + + if (ob) + mutex_unlock(&ob->lock); + goto retry; + } + + if (ob) { + open_bucket_copy_unused_ptrs(c, new_ob, ob); + mutex_unlock(&ob->lock); + bch2_open_bucket_put(c, ob); + } + + ob = new_ob; + } + + ret = open_bucket_add_buckets(c, wp, ob, nr_replicas, + nr_replicas_required, + reserve, cl); + if (ret) { + mutex_unlock(&ob->lock); + return ERR_PTR(ret); + } + + ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); + + BUG_ON(!ob->sectors_free); + verify_not_stale(c, ob); + + return ob; +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, + unsigned nr_replicas, struct open_bucket *ob, + unsigned sectors) +{ + struct bch_extent_ptr tmp; + bool has_data = false; + unsigned i; + + /* + * We're keeping any existing pointer k has, and appending new pointers: + * __bch2_write() will only write to the pointers we add here: + */ + + BUG_ON(sectors > ob->sectors_free); + + /* didn't use all the ptrs: */ + if (nr_replicas < ob->nr_ptrs) + has_data = true; + + for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) { + EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); + + tmp = ob->ptrs[i]; + tmp.cached = bkey_extent_is_cached(&e->k); + tmp.offset += ob->ptr_offset[i]; + extent_ptr_append(e, tmp); + + ob->ptr_offset[i] += sectors; + + this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors); + } +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp, + struct open_bucket *ob) +{ + bool has_data = false; + unsigned i; + + for (i = 0; i < ob->nr_ptrs; i++) { + if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i])) + ob->has_full_ptrs = true; + else + has_data = true; + } + + if (likely(has_data)) + atomic_inc(&ob->pin); + else + BUG_ON(xchg(&wp->b, NULL) != ob); + + mutex_unlock(&ob->lock); +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates k->size and k->offset (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, k->size indicates how many + * sectors were actually allocated. + * + * Return codes: + * - -EAGAIN: closure was added to waitlist + * - -ENOSPC: out of space and no closure provided + * + * @c - filesystem. + * @wp - write point to use for allocating sectors. + * @k - key to return the allocated space information. 
+ * @cl - closure to wait for a bucket + */ +struct open_bucket *bch2_alloc_sectors(struct bch_fs *c, + struct write_point *wp, + struct bkey_i_extent *e, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + struct closure *cl) +{ + struct open_bucket *ob; + + ob = bch2_alloc_sectors_start(c, wp, nr_replicas, + nr_replicas_required, + reserve, cl); + if (IS_ERR_OR_NULL(ob)) + return ob; + + if (e->k.size > ob->sectors_free) + bch2_key_resize(&e->k, ob->sectors_free); + + bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size); + + bch2_alloc_sectors_done(c, wp, ob); + + return ob; +} + +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) +{ + struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; + struct bch_dev *ca; + u64 total_capacity, capacity = 0, reserved_sectors = 0; + unsigned long ra_pages = 0; + unsigned i, j; + + for_each_online_member(ca, c, i) { + struct backing_dev_info *bdi = + blk_get_backing_dev_info(ca->disk_sb.bdev); + + ra_pages += bdi->ra_pages; + } + + c->bdi.ra_pages = ra_pages; + + /* Find fastest, slowest tiers with devices: */ + + for (tier = c->tiers; + tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; + if (!fastest_tier) + fastest_tier = tier; + slowest_tier = tier; + } + + c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; + + c->promote_write_point.group = &fastest_tier->devs; + + if (!fastest_tier) + goto set_capacity; + + /* + * Capacity of the filesystem is the capacity of all the devices in the + * slowest (highest) tier - we don't include lower tier devices. + */ + spin_lock(&slowest_tier->devs.lock); + group_for_each_dev(ca, &slowest_tier->devs, i) { + size_t reserve = 0; + + /* + * We need to reserve buckets (from the number + * of currently available buckets) against + * foreground writes so that mainly copygc can + * make forward progress. + * + * We need enough to refill the various reserves + * from scratch - copygc will use its entire + * reserve all at once, then run against when + * its reserve is refilled (from the formerly + * available buckets). + * + * This reserve is just used when considering if + * allocations for foreground writes must wait - + * not -ENOSPC calculations. 
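+ * (Arithmetic, with assumed sizes for illustration: if the RESERVE_PRIO,
+ * RESERVE_BTREE and RESERVE_MOVINGGC freelists hold 8 buckets each,
+ * free_inc holds 16, and there are 16 write points plus the tiering and
+ * btree write points, then reserve = 8*3 + 16 + 16 + 1 + 1 = 58 buckets,
+ * i.e. reserved_sectors grows by 58 << bucket_bits.)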
+ */ + for (j = 0; j < RESERVE_NONE; j++) + reserve += ca->free[j].size; + + reserve += ca->free_inc.size; + + reserve += ARRAY_SIZE(c->write_points); + + if (ca->mi.tier) + reserve += 1; /* tiering write point */ + reserve += 1; /* btree write point */ + + reserved_sectors += reserve << ca->bucket_bits; + + capacity += (ca->mi.nbuckets - + ca->mi.first_bucket) << + ca->bucket_bits; + } + spin_unlock(&slowest_tier->devs.lock); +set_capacity: + total_capacity = capacity; + + capacity *= (100 - c->opts.gc_reserve_percent); + capacity = div64_u64(capacity, 100); + + BUG_ON(capacity + reserved_sectors > total_capacity); + + c->capacity = capacity; + + if (c->capacity) { + bch2_io_timer_add(&c->io_clock[READ], + &c->prio_clock[READ].rescale); + bch2_io_timer_add(&c->io_clock[WRITE], + &c->prio_clock[WRITE].rescale); + } else { + bch2_io_timer_del(&c->io_clock[READ], + &c->prio_clock[READ].rescale); + bch2_io_timer_del(&c->io_clock[WRITE], + &c->prio_clock[WRITE].rescale); + } + + /* Wake up case someone was waiting for buckets */ + closure_wake_up(&c->freelist_wait); +} + +static void bch2_stop_write_point(struct bch_dev *ca, + struct write_point *wp) +{ + struct bch_fs *c = ca->fs; + struct open_bucket *ob; + struct bch_extent_ptr *ptr; + + ob = lock_writepoint(c, wp); + if (!ob) + return; + + for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) + if (ptr->dev == ca->dev_idx) + goto found; + + mutex_unlock(&ob->lock); + return; +found: + BUG_ON(xchg(&wp->b, NULL) != ob); + mutex_unlock(&ob->lock); + + /* Drop writepoint's ref: */ + bch2_open_bucket_put(c, ob); +} + +static bool bch2_dev_has_open_write_point(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_extent_ptr *ptr; + struct open_bucket *ob; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) + if (atomic_read(&ob->pin)) { + mutex_lock(&ob->lock); + for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) + if (ptr->dev == ca->dev_idx) { + mutex_unlock(&ob->lock); + return true; + } + mutex_unlock(&ob->lock); + } + + return false; +} + +/* device goes ro: */ +void bch2_dev_allocator_stop(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct dev_group *tier = &c->tiers[ca->mi.tier].devs; + struct task_struct *p; + struct closure cl; + unsigned i; + + closure_init_stack(&cl); + + /* First, remove device from allocation groups: */ + + bch2_dev_group_remove(tier, ca); + bch2_dev_group_remove(&c->all_devs, ca); + + bch2_recalc_capacity(c); + + /* + * Stopping the allocator thread comes after removing from allocation + * groups, else pending allocations will hang: + */ + + p = ca->alloc_thread; + ca->alloc_thread = NULL; + smp_wmb(); + + /* + * We need an rcu barrier between setting ca->alloc_thread = NULL and + * the thread shutting down to avoid a race with bch2_usage_update() - + * the allocator thread itself does a synchronize_rcu() on exit. + * + * XXX: it would be better to have the rcu barrier be asynchronous + * instead of blocking us here + */ + if (p) { + kthread_stop(p); + put_task_struct(p); + } + + /* Next, close write points that point to this device... 
*/ + + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_stop_write_point(ca, &c->write_points[i]); + + bch2_stop_write_point(ca, &ca->copygc_write_point); + bch2_stop_write_point(ca, &c->promote_write_point); + bch2_stop_write_point(ca, &ca->tiering_write_point); + bch2_stop_write_point(ca, &c->migration_write_point); + bch2_stop_write_point(ca, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_bucket_put(c, a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + /* Avoid deadlocks.. */ + + closure_wake_up(&c->freelist_wait); + wake_up(&c->journal.wait); + + /* Now wait for any in flight writes: */ + + while (1) { + closure_wait(&c->open_buckets_wait, &cl); + + if (!bch2_dev_has_open_write_point(ca)) { + closure_wake_up(&c->open_buckets_wait); + break; + } + + closure_sync(&cl); + } +} + +/* + * Startup the allocator thread for transition to RW mode: + */ +int bch2_dev_allocator_start(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct dev_group *tier = &c->tiers[ca->mi.tier].devs; + struct bch_sb_field_journal *journal_buckets; + bool has_journal; + struct task_struct *k; + + /* + * allocator thread already started? + */ + if (ca->alloc_thread) + return 0; + + k = kthread_create(bch2_allocator_thread, ca, "bcache_allocator"); + if (IS_ERR(k)) + return 0; + + get_task_struct(k); + ca->alloc_thread = k; + + bch2_dev_group_add(tier, ca); + bch2_dev_group_add(&c->all_devs, ca); + + mutex_lock(&c->sb_lock); + journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb); + has_journal = bch2_nr_journal_buckets(journal_buckets) >= + BCH_JOURNAL_BUCKETS_MIN; + mutex_unlock(&c->sb_lock); + + if (has_journal) + bch2_dev_group_add(&c->journal.devs, ca); + + bch2_recalc_capacity(c); + + /* + * Don't wake up allocator thread until after adding device to + * allocator groups - otherwise, alloc thread could get a spurious + * -EROFS due to prio_write() -> journal_meta() not finding any devices: + */ + wake_up_process(k); + return 0; +} + +void bch2_fs_allocator_init(struct bch_fs *c) +{ + unsigned i; + + INIT_LIST_HEAD(&c->open_buckets_open); + INIT_LIST_HEAD(&c->open_buckets_free); + spin_lock_init(&c->open_buckets_lock); + bch2_prio_timer_init(c, READ); + bch2_prio_timer_init(c, WRITE); + + /* open bucket 0 is a sentinal NULL: */ + mutex_init(&c->open_buckets[0].lock); + INIT_LIST_HEAD(&c->open_buckets[0].list); + + for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) { + mutex_init(&c->open_buckets[i].lock); + c->open_buckets_nr_free++; + list_add(&c->open_buckets[i].list, &c->open_buckets_free); + } + + spin_lock_init(&c->all_devs.lock); + + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) + spin_lock_init(&c->tiers[i].devs.lock); + + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + c->write_points[i].throttle = true; + + c->pd_controllers_update_seconds = 5; + INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); + + spin_lock_init(&c->foreground_write_pd_lock); + bch2_pd_controller_init(&c->foreground_write_pd); + /* + * We do not want the write rate to have an effect on the computed + * rate, for two reasons: + * + * We do not call bch2_ratelimit_delay() at all if the write rate + * exceeds 1GB/s. In this case, the PD controller will think we are + * not "keeping up" and not change the rate. 
+ */ + c->foreground_write_pd.backpressure = 0; + init_timer(&c->foreground_write_wakeup); + + c->foreground_write_wakeup.data = (unsigned long) c; + c->foreground_write_wakeup.function = bch2_wake_delayed_writes; +} diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h new file mode 100644 index 00000000..08638b25 --- /dev/null +++ b/libbcachefs/alloc.h @@ -0,0 +1,85 @@ +#ifndef _BCACHE_ALLOC_H +#define _BCACHE_ALLOC_H + +#include "alloc_types.h" + +struct bkey; +struct bucket; +struct bch_dev; +struct bch_fs; +struct dev_group; + +static inline size_t prios_per_bucket(const struct bch_dev *ca) +{ + return (bucket_bytes(ca) - sizeof(struct prio_set)) / + sizeof(struct bucket_disk); +} + +static inline size_t prio_buckets(const struct bch_dev *ca) +{ + return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca)); +} + +void bch2_dev_group_remove(struct dev_group *, struct bch_dev *); +void bch2_dev_group_add(struct dev_group *, struct bch_dev *); + +int bch2_prio_read(struct bch_dev *); + +size_t bch2_bucket_alloc(struct bch_dev *, enum alloc_reserve); + +void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *, + struct write_point *, + unsigned, unsigned, + enum alloc_reserve, + struct closure *); + +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *, + unsigned, struct open_bucket *, unsigned); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *, + struct open_bucket *); + +struct open_bucket *bch2_alloc_sectors(struct bch_fs *, struct write_point *, + struct bkey_i_extent *, unsigned, unsigned, + enum alloc_reserve, struct closure *); + +static inline void bch2_wake_allocator(struct bch_dev *ca) +{ + struct task_struct *p; + + rcu_read_lock(); + if ((p = ACCESS_ONCE(ca->alloc_thread))) + wake_up_process(p); + rcu_read_unlock(); +} + +static inline struct bch_dev *dev_group_next(struct dev_group *devs, + unsigned *iter) +{ + struct bch_dev *ret = NULL; + + while (*iter < devs->nr && + !(ret = rcu_dereference_check(devs->d[*iter].dev, + lockdep_is_held(&devs->lock)))) + (*iter)++; + + return ret; +} + +#define group_for_each_dev(ca, devs, iter) \ + for ((iter) = 0; \ + ((ca) = dev_group_next((devs), &(iter))); \ + (iter)++) + +#define open_bucket_for_each_ptr(_ob, _ptr) \ + for ((_ptr) = (_ob)->ptrs; \ + (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \ + (_ptr)++) + +void bch2_recalc_capacity(struct bch_fs *); +void bch2_dev_allocator_stop(struct bch_dev *); +int bch2_dev_allocator_start(struct bch_dev *); +void bch2_fs_allocator_init(struct bch_fs *); + +#endif /* _BCACHE_ALLOC_H */ diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h new file mode 100644 index 00000000..1bf48ef9 --- /dev/null +++ b/libbcachefs/alloc_types.h @@ -0,0 +1,102 @@ +#ifndef _BCACHE_ALLOC_TYPES_H +#define _BCACHE_ALLOC_TYPES_H + +#include <linux/mutex.h> + +#include "clock_types.h" + +/* + * There's two of these clocks, one for reads and one for writes: + * + * All fields protected by bucket_lock + */ +struct prio_clock { + /* + * "now" in (read/write) IO time - incremented whenever we do X amount + * of reads or writes. + * + * Goes with the bucket read/write prios: when we read or write to a + * bucket we reset the bucket's prio to the current hand; thus hand - + * prio = time since bucket was last read/written. 
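+ * (For example: if the read hand is at 1000 and a bucket's read prio is
+ * 800, roughly 200 units of read IO have happened since that bucket was
+ * last read.)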
+ * + * The units are some amount (bytes/sectors) of data read/written, and + * the units can change on the fly if we need to rescale to fit + * everything in a u16 - your only guarantee is that the units are + * consistent. + */ + u16 hand; + u16 min_prio; + + int rw; + + struct io_timer rescale; +}; + +/* There is one reserve for each type of btree, one for prios and gens + * and one for moving GC */ +enum alloc_reserve { + RESERVE_PRIO, + RESERVE_BTREE, + RESERVE_METADATA_LAST = RESERVE_BTREE, + RESERVE_MOVINGGC, + + RESERVE_NONE, + RESERVE_NR, +}; + +static inline bool allocation_is_metadata(enum alloc_reserve id) +{ + return id <= RESERVE_METADATA_LAST; +} + +struct dev_group { + spinlock_t lock; + unsigned nr; + unsigned cur_device; + struct { + u64 weight; + struct bch_dev *dev; + } d[BCH_SB_MEMBERS_MAX]; +}; + +/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ +#define OPEN_BUCKETS_COUNT 256 + +#define WRITE_POINT_COUNT 16 + +struct open_bucket { + struct list_head list; + struct mutex lock; + atomic_t pin; + bool has_full_ptrs; + /* + * recalculated every time we allocate from this open_bucket based on + * how many pointers we're actually going to use: + */ + unsigned sectors_free; + unsigned nr_ptrs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + unsigned ptr_offset[BCH_REPLICAS_MAX]; +}; + +struct write_point { + struct open_bucket *b; + + /* + * Throttle writes to this write point if tier 0 is full? + */ + bool throttle; + + /* + * If not NULL, cache group for tiering, promotion and moving GC - + * always allocates a single replica + */ + struct dev_group *group; + + /* + * Otherwise do a normal replicated bucket allocation that could come + * from any device in tier 0 (foreground write) + */ +}; + +#endif /* _BCACHE_ALLOC_TYPES_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h new file mode 100644 index 00000000..6e08947c --- /dev/null +++ b/libbcachefs/bcachefs.h @@ -0,0 +1,794 @@ +#ifndef _BCACHE_H +#define _BCACHE_H + +/* + * SOME HIGH LEVEL CODE DOCUMENTATION: + * + * Bcache mostly works with cache sets, cache devices, and backing devices. + * + * Support for multiple cache devices hasn't quite been finished off yet, but + * it's about 95% plumbed through. A cache set and its cache devices is sort of + * like a md raid array and its component devices. Most of the code doesn't care + * about individual cache devices, the main abstraction is the cache set. + * + * Multiple cache devices is intended to give us the ability to mirror dirty + * cached data and metadata, without mirroring clean cached data. + * + * Backing devices are different, in that they have a lifetime independent of a + * cache set. When you register a newly formatted backing device it'll come up + * in passthrough mode, and then you can attach and detach a backing device from + * a cache set at runtime - while it's mounted and in use. Detaching implicitly + * invalidates any cached data for that backing device. + * + * A cache set can have multiple (many) backing devices attached to it. + * + * There's also flash only volumes - this is the reason for the distinction + * between struct cached_dev and struct bcache_device. A flash only volume + * works much like a bcache device that has a backing device, except the + * "cached" data is always dirty. The end result is that we get thin + * provisioning with very little additional code. + * + * Flash only volumes work but they're not production ready because the moving + * garbage collector needs more work. 
More on that later. + * + * BUCKETS/ALLOCATION: + * + * Bcache is primarily designed for caching, which means that in normal + * operation all of our available space will be allocated. Thus, we need an + * efficient way of deleting things from the cache so we can write new things to + * it. + * + * To do this, we first divide the cache device up into buckets. A bucket is the + * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ + * works efficiently. + * + * Each bucket has a 16 bit priority, and an 8 bit generation associated with + * it. The gens and priorities for all the buckets are stored contiguously and + * packed on disk (in a linked list of buckets - aside from the superblock, all + * of bcache's metadata is stored in buckets). + * + * The priority is used to implement an LRU. We reset a bucket's priority when + * we allocate it or on a cache hit, and every so often we decrement the priority + * of each bucket. It could be used to implement something more sophisticated, + * if anyone ever gets around to it. + * + * The generation is used for invalidating buckets. Each pointer also has an 8 + * bit generation embedded in it; for a pointer to be considered valid, its gen + * must match the gen of the bucket it points into. Thus, to reuse a bucket all + * we have to do is increment its gen (and write its new gen to disk; we batch + * this up). + * + * Bcache is entirely COW - we never write twice to a bucket, even buckets that + * contain metadata (including btree nodes). + * + * THE BTREE: + * + * Bcache is in large part designed around the btree. + * + * At a high level, the btree is just an index of key -> ptr tuples. + * + * Keys represent extents, and thus have a size field. Keys also have a variable + * number of pointers attached to them (potentially zero, which is handy for + * invalidating the cache). + * + * The key itself is an inode:offset pair. The inode number corresponds to a + * backing device or a flash only volume. The offset is the ending offset of the + * extent within the inode - not the starting offset; this makes lookups + * slightly more convenient. + * + * Pointers contain the cache device id, the offset on that device, and an 8 bit + * generation number. More on the gen later. + * + * Index lookups are not fully abstracted - cache lookups in particular are + * still somewhat mixed in with the btree code, but things are headed in that + * direction. + * + * Updates are fairly well abstracted, though. There are two different ways of + * updating the btree; insert and replace. + * + * BTREE_INSERT will just take a list of keys and insert them into the btree - + * overwriting (possibly only partially) any extents they overlap with. This is + * used to update the index after a write. + * + * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is + * overwriting a key that matches another given key. This is used for inserting + * data into the cache after a cache miss, and for background writeback, and for + * the moving garbage collector. + * + * There is no "delete" operation; deleting things from the index is + * accomplished either by invalidating pointers (by incrementing a bucket's + * gen) or by inserting a key with 0 pointers - which will overwrite anything + * previously present at that location in the index. + * + * This means that there are always stale/invalid keys in the btree. They're + * filtered out by the code that iterates through a btree node, and removed when + * a btree node is rewritten.
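+ * + * As a small worked example of the key layout (illustrative numbers): an 8 + * sector extent covering sectors [20, 28) of inode 5 is indexed by its end + * offset, i.e. + * + * struct bkey k = KEY(5, 28, 8); /* inode, end offset, size */ + * + * so looking up any sector in [20, 28) means searching for the first key + * with an offset strictly greater than that sector.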
+ * + * BTREE NODES: + * + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. + * + * (If buckets are really big we'll only use part of the bucket for a btree node + * - no less than 1/4th - but a bucket still contains no more than a single + * btree node. I'd actually like to change this, but for now we rely on the + * bucket's gen for deleting btree nodes when we rewrite/split a node.) + * + * Anyways, btree nodes are big - big enough to be inefficient with a textbook + * btree implementation. + * + * The way this is solved is that btree nodes are internally log structured; we + * can append new keys to an existing btree node without rewriting it. This + * means each set of keys we write is sorted, but the node is not. + * + * We maintain this log structure in memory - keeping 1Mb of keys sorted would + * be expensive, and we have to distinguish between the keys we have written and + * the keys we haven't. So to do a lookup in a btree node, we have to search + * each sorted set. But we do merge written sets together lazily, so the cost of + * these extra searches is quite low (normally most of the keys in a btree node + * will be in one big set, and then there'll be one or two sets that are much + * smaller). + * + * This log structure makes bcache's btree more of a hybrid between a + * conventional btree and a compacting data structure, with some of the + * advantages of both. + * + * GARBAGE COLLECTION: + * + * We can't just invalidate any bucket - it might contain dirty data or + * metadata. If it once contained dirty data, other writes might overwrite it + * later, leaving no valid pointers into that bucket in the index. + * + * Thus, the primary purpose of garbage collection is to find buckets to reuse. + * It also counts how much valid data each bucket currently contains, so that + * allocation can reuse buckets sooner when they've been mostly overwritten. + * + * It also does some things that are really internal to the btree + * implementation. If a btree node contains pointers that are stale by more than + * some threshold, it rewrites the btree node to avoid the bucket's generation + * wrapping around. It also merges adjacent btree nodes if they're empty enough. + * + * THE JOURNAL: + * + * Bcache's journal is not necessary for consistency; we always strictly + * order metadata writes so that the btree and everything else is consistent on + * disk in the event of an unclean shutdown, and in fact bcache had writeback + * caching (with recovery from unclean shutdown) before journalling was + * implemented. + * + * Rather, the journal is purely a performance optimization; we can't complete a + * write until we've updated the index on disk, otherwise the cache would be + * inconsistent in the event of an unclean shutdown. This means that without the + * journal, on random write workloads we constantly have to update all the leaf + * nodes in the btree, and those writes will be mostly empty (appending at most + * a few keys each) - highly inefficient in terms of amount of metadata writes, + * and it puts more strain on the various btree resorting/compacting code. + * + * The journal is just a log of keys we've inserted; on startup we just reinsert + * all the keys in the open journal entries. That means that when we're updating + * a node in the btree, we can wait until a 4k block of keys fills up before + * writing them out.
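+ * + * Conceptually, then, journal replay is just a reinsertion loop (a sketch; + * the helper names here are illustrative, not the actual interfaces): + * + * for_each_journal_key(j, entry, k) + * btree_insert(c, entry->btree_id, k); + * + * which works because inserts simply overwrite whatever is already present + * in the index - replaying a key that did make it to the btree before the + * crash is harmless.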
+ * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#undef pr_fmt +#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ + +#include <linux/bug.h> +#include <linux/bio.h> +#include <linux/closure.h> +#include <linux/kobject.h> +#include <linux/lglock.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/percpu-refcount.h> +#include <linux/radix-tree.h> +#include <linux/rbtree.h> +#include <linux/rhashtable.h> +#include <linux/rwsem.h> +#include <linux/seqlock.h> +#include <linux/shrinker.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "bcachefs_format.h" +#include "bset.h" +#include "fifo.h" +#include "opts.h" +#include "util.h" + +#include <linux/dynamic_fault.h> + +#define bch2_fs_init_fault(name) \ + dynamic_fault("bcachefs:bch_fs_init:" name) +#define bch2_meta_read_fault(name) \ + dynamic_fault("bcachefs:meta:read:" name) +#define bch2_meta_write_fault(name) \ + dynamic_fault("bcachefs:meta:write:" name) + +#ifndef bch2_fmt +#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#endif + +#define bch_info(c, fmt, ...) \ + printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_notice(c, fmt, ...) \ + printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn(c, fmt, ...) \ + printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err(c, fmt, ...) \ + printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + +#define bch_verbose(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose_recovery) \ + bch_info(c, fmt, ##__VA_ARGS__); \ +} while (0) + +/* Parameters that are useful for debugging, but should always be compiled in: */ +#define BCH_DEBUG_PARAMS_ALWAYS() \ + BCH_DEBUG_PARAM(key_merging_disabled, \ + "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ + "Causes mark and sweep to compact and rewrite every " \ + "btree node it traverses") \ + BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ + "Disables rewriting of btree nodes during mark and sweep")\ + BCH_DEBUG_PARAM(btree_gc_coalesce_disabled, \ + "Disables coalescing of btree nodes") \ + BCH_DEBUG_PARAM(btree_shrinker_disabled, \ + "Disables the shrinker callback for the btree node cache") + +/* Parameters that should only be compiled in in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ + BCH_DEBUG_PARAM(expensive_debug_checks, \ + "Enables various runtime debugging checks that " \ + "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_bkeys, \ + "Run bkey_debugcheck (primarily checking GC/allocation "\ + "information) when iterating over keys") \ + BCH_DEBUG_PARAM(version_stress_test, \ + "Assigns random version numbers to newly written " \ + "extents, to test overlapping extent cases") \ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ + "done in memory") \ + +#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() +#else +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() +#endif + +/* name, frequency_units, duration_units */ +#define BCH_TIME_STATS() \ + BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \ + BCH_TIME_STAT(btree_gc, sec, 
ms) \ + BCH_TIME_STAT(btree_coalesce, sec, ms) \ + BCH_TIME_STAT(btree_split, sec, us) \ + BCH_TIME_STAT(btree_sort, ms, us) \ + BCH_TIME_STAT(btree_read, ms, us) \ + BCH_TIME_STAT(journal_write, us, us) \ + BCH_TIME_STAT(journal_delay, ms, us) \ + BCH_TIME_STAT(journal_blocked, sec, ms) \ + BCH_TIME_STAT(journal_flush_seq, us, us) + +#include "alloc_types.h" +#include "buckets_types.h" +#include "clock_types.h" +#include "io_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "move_types.h" +#include "super_types.h" + +/* 256k, in sectors */ +#define BTREE_NODE_SIZE_MAX 512 + +/* + * Number of nodes we might have to allocate in a worst case btree split + * operation - we split all the way up to the root, then allocate a new root. + */ +#define btree_reserve_required_nodes(depth) (((depth) + 1) * 2 + 1) + +/* Number of nodes btree coalesce will try to coalesce at once */ +#define GC_MERGE_NODES 4U + +/* Maximum number of nodes we might need to allocate atomically: */ +#define BTREE_RESERVE_MAX \ + (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES) + +/* Size of the freelist we allocate btree nodes from: */ +#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2) + +struct btree; +struct crypto_blkcipher; +struct crypto_ahash; + +enum gc_phase { + GC_PHASE_SB_METADATA = BTREE_ID_NR + 1, + GC_PHASE_PENDING_DELETE, + GC_PHASE_DONE +}; + +struct gc_pos { + enum gc_phase phase; + struct bpos pos; + unsigned level; +}; + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u8 state; + u8 tier; + u8 has_metadata; + u8 has_data; + u8 replacement; + u8 discard; + u8 valid; +}; + +struct bch_dev { + struct kobject kobj; + struct percpu_ref ref; + struct percpu_ref io_ref; + struct completion stop_complete; + struct completion offline_complete; + + struct bch_fs *fs; + + u8 dev_idx; + /* + * Cached version of this device's member info from superblock + * Committed by bch2_write_super() -> bch_fs_mi_update() + */ + struct bch_member_cpu mi; + uuid_le uuid; + char name[BDEVNAME_SIZE]; + + struct bcache_superblock disk_sb; + + struct dev_group self; + + /* biosets used in cloned bios for replicas and moving_gc */ + struct bio_set replica_set; + + struct task_struct *alloc_thread; + + struct prio_set *disk_buckets; + + /* + * When allocating new buckets, prio_write() gets first dibs - since we + * may not be able to allocate at all without writing priorities and gens. + * prio_last_buckets[] contains the last buckets we wrote priorities to + * (so gc can mark them as metadata). + */ + u64 *prio_buckets; + u64 *prio_last_buckets; + spinlock_t prio_buckets_lock; + struct bio *bio_prio; + + /* + * free: Buckets that are ready to be used + * + * free_inc: Incoming buckets - these are buckets that currently have + * cached data in them, and we can't reuse them until after we write + * their new gen to disk. After prio_write() finishes writing the new + * gens/prios, they'll be moved to the free list (and possibly discarded + * in the process) + */ + DECLARE_FIFO(long, free)[RESERVE_NR]; + DECLARE_FIFO(long, free_inc); + spinlock_t freelist_lock; + + size_t fifo_last_bucket; + + /* Allocation stuff: */ + + /* most out of date gen in the btree */ + u8 *oldest_gens; + struct bucket *buckets; + unsigned short bucket_bits; /* ilog2(bucket_size) */ + + /* last calculated minimum prio */ + u16 min_prio[2]; + + /* + * Bucket bookkeeping.
The first element is updated by GC, the + * second contains a saved copy of the stats from the beginning + * of GC. + */ + struct bch_dev_usage __percpu *usage_percpu; + struct bch_dev_usage usage_cached; + + atomic_long_t saturated_count; + size_t inc_gen_needs_gc; + + struct mutex heap_lock; + DECLARE_HEAP(struct bucket_heap_entry, heap); + + /* Moving GC: */ + struct task_struct *moving_gc_read; + + struct bch_pd_controller moving_gc_pd; + + /* Tiering: */ + struct write_point tiering_write_point; + + struct write_point copygc_write_point; + + struct journal_device journal; + + struct work_struct io_error_work; + + /* The rest of this all shows up in sysfs */ + atomic64_t meta_sectors_written; + atomic64_t btree_sectors_written; + u64 __percpu *sectors_written; +}; + +/* + * Flag bits for what phase of startup/shutdown the cache set is at, how we're + * shutting down, etc.: + * + * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching + * all the backing devices first (their cached data gets invalidated, and they + * won't automatically reattach). + */ +enum { + BCH_FS_INITIAL_GC_DONE, + BCH_FS_EMERGENCY_RO, + BCH_FS_WRITE_DISABLE_COMPLETE, + BCH_FS_GC_STOPPING, + BCH_FS_GC_FAILURE, + BCH_FS_BDEV_MOUNTED, + BCH_FS_ERROR, + BCH_FS_FSCK_FIXED_ERRORS, +}; + +struct btree_debug { + unsigned id; + struct dentry *btree; + struct dentry *btree_format; + struct dentry *failed; +}; + +struct bch_tier { + unsigned idx; + struct task_struct *migrate; + struct bch_pd_controller pd; + + struct dev_group devs; +}; + +enum bch_fs_state { + BCH_FS_STARTING = 0, + BCH_FS_STOPPING, + BCH_FS_RO, + BCH_FS_RW, +}; + +struct bch_fs { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject internal; + struct kobject opts_dir; + struct kobject time_stats; + unsigned long flags; + + int minor; + struct device *chardev; + struct super_block *vfs_sb; + char name[40]; + + /* ro/rw, add/remove devices: */ + struct mutex state_lock; + enum bch_fs_state state; + + /* Counts outstanding writes, for clean transition to read-only */ + struct percpu_ref writes; + struct work_struct read_only_work; + + struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + + struct bch_opts opts; + + /* Updated by bch2_sb_update():*/ + struct { + uuid_le uuid; + uuid_le user_uuid; + + u16 block_size; + u16 btree_node_size; + + u8 nr_devices; + u8 clean; + + u8 meta_replicas_have; + u8 data_replicas_have; + + u8 str_hash_type; + u8 encryption_type; + + u64 time_base_lo; + u32 time_base_hi; + u32 time_precision; + } sb; + + struct bch_sb *disk_sb; + unsigned disk_sb_order; + + unsigned short block_bits; /* ilog2(block_size) */ + + struct closure sb_write; + struct mutex sb_lock; + + struct backing_dev_info bdi; + + /* BTREE CACHE */ + struct bio_set btree_read_bio; + + struct btree_root btree_roots[BTREE_ID_NR]; + struct mutex btree_root_lock; + + bool btree_cache_table_init_done; + struct rhashtable btree_cache_table; + + /* + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. 
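+ * + * So allocating a struct btree amounts to (a sketch of the idea, not the + * actual helper): + * + * b = list_first_entry_or_null(&c->btree_cache_freed, + * struct btree, list); + * if (!b) + * b = kzalloc(sizeof(*b), GFP_NOIO); + * + * with the node's data buffers managed separately, since those are what + * the shrinker actually frees under memory pressure.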
+ * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. + */ + struct mutex btree_cache_lock; + struct list_head btree_cache; + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; + + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned btree_cache_used; + unsigned btree_cache_reserve; + struct shrinker btree_cache_shrink; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: + */ + struct closure_waitlist mca_wait; + struct task_struct *btree_cache_alloc_lock; + + mempool_t btree_reserve_pool; + + /* + * Cache of allocated btree nodes - if we allocate a btree node and + * don't use it, if we free it that space can't be reused until going + * _all_ the way through the allocator (which exposes us to a livelock + * when allocating btree reserves fail halfway through) - instead, we + * can stick them here: + */ + struct btree_alloc { + struct open_bucket *ob; + BKEY_PADDED(k); + } btree_reserve_cache[BTREE_NODE_RESERVE * 2]; + unsigned btree_reserve_cache_nr; + struct mutex btree_reserve_cache_lock; + + mempool_t btree_interior_update_pool; + struct list_head btree_interior_update_list; + struct mutex btree_interior_update_lock; + + struct workqueue_struct *wq; + /* copygc needs its own workqueue for index updates.. */ + struct workqueue_struct *copygc_wq; + + /* ALLOCATION */ + struct bch_pd_controller foreground_write_pd; + struct delayed_work pd_controllers_update; + unsigned pd_controllers_update_seconds; + spinlock_t foreground_write_pd_lock; + struct bch_write_op *write_wait_head; + struct bch_write_op *write_wait_tail; + + struct timer_list foreground_write_wakeup; + + /* + * These contain all r/w devices - i.e. devices we can currently + * allocate from: + */ + struct dev_group all_devs; + struct bch_tier tiers[BCH_TIER_MAX]; + /* NULL if we only have devices in one tier: */ + struct bch_tier *fastest_tier; + + u64 capacity; /* sectors */ + + /* + * When capacity _decreases_ (due to a disk being removed), we + * increment capacity_gen - this invalidates outstanding reservations + * and forces them to be revalidated + */ + u32 capacity_gen; + + atomic64_t sectors_available; + + struct bch_fs_usage __percpu *usage_percpu; + struct bch_fs_usage usage_cached; + struct lglock usage_lock; + + struct mutex bucket_lock; + + struct closure_waitlist freelist_wait; + + /* + * When we invalidate buckets, we use both the priority and the amount + * of good data to determine which buckets to reuse first - to weight + * those together consistently we keep track of the smallest nonzero + * priority of any bucket. 
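+ * + * As an illustration only (not the exact formula used), the resulting + * sort key is something like + * + * key = (u64) (prio - min_prio) * bucket_sectors_used(g); + * + * buckets that are both old and mostly overwritten sort first, and + * subtracting the smallest nonzero prio keeps the weighting stable when + * prios are rescaled.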
+ */ + struct prio_clock prio_clock[2]; + + struct io_clock io_clock[2]; + + /* SECTOR ALLOCATOR */ + struct list_head open_buckets_open; + struct list_head open_buckets_free; + unsigned open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + spinlock_t open_buckets_lock; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + + struct write_point btree_write_point; + + struct write_point write_points[WRITE_POINT_COUNT]; + struct write_point promote_write_point; + + /* + * This write point is used for migrating data off a device + * and can point to any other device. + * We can't use the normal write points because those will + * gang up n replicas, and for migration we want only one new + * replica. + */ + struct write_point migration_write_point; + + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; + atomic_t kick_gc; + + /* + * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] + * has been marked by GC. + * + * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) + * + * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not + * currently running, and gc marks are currently valid + * + * Protected by gc_pos_lock. Only written to by GC thread, so GC thread + * can read without a lock. + */ + seqcount_t gc_pos_lock; + struct gc_pos gc_pos; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. + */ + struct rw_semaphore gc_lock; + + /* IO PATH */ + struct bio_set bio_read; + struct bio_set bio_read_split; + struct bio_set bio_write; + struct mutex bio_bounce_pages_lock; + mempool_t bio_bounce_pages; + + mempool_t lz4_workspace_pool; + void *zlib_workspace; + struct mutex zlib_workspace_lock; + mempool_t compression_bounce[2]; + + struct crypto_shash *sha256; + struct crypto_blkcipher *chacha20; + struct crypto_shash *poly1305; + + atomic64_t key_version; + + struct bio_list read_retry_list; + struct work_struct read_retry_work; + spinlock_t read_retry_lock; + + /* FILESYSTEM */ + wait_queue_head_t writeback_wait; + atomic_t writeback_pages; + unsigned writeback_pages_max; + atomic_long_t nr_inodes; + + /* DEBUG JUNK */ + struct dentry *debug; + struct btree_debug btree_debug[BTREE_ID_NR]; +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree *verify_data; + struct btree_node *verify_ondisk; + struct mutex verify_lock; +#endif + + u64 unused_inode_hint; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - have to dynamically allocate them + */ + mempool_t fill_iter; + + mempool_t btree_bounce_pool; + + struct journal journal; + + unsigned bucket_journal_seq; + + /* The rest of this all shows up in sysfs */ + atomic_long_t cache_read_races; + + unsigned foreground_write_ratelimit_enabled:1; + unsigned copy_gc_enabled:1; + unsigned tiering_enabled:1; + unsigned tiering_percent; + + /* + * foreground writes will be throttled when the number of free + * buckets is below this percentage + */ + unsigned foreground_target_percent; + +#define BCH_DEBUG_PARAM(name, description) bool name; + BCH_DEBUG_PARAMS_ALL() +#undef BCH_DEBUG_PARAM + +#define BCH_TIME_STAT(name, frequency_units, duration_units) \ + struct time_stats name##_time; + BCH_TIME_STATS() +#undef BCH_TIME_STAT +}; + +static inline bool bch2_fs_running(struct bch_fs *c) +{ + return c->state == BCH_FS_RO || c->state == BCH_FS_RW; +} + +static inline unsigned bucket_pages(const struct bch_dev *ca) +{ + return ca->mi.bucket_size / PAGE_SECTORS; +} + +static inline unsigned 
bucket_bytes(const struct bch_dev *ca) +{ + return ca->mi.bucket_size << 9; +} + +static inline unsigned block_bytes(const struct bch_fs *c) +{ + return c->sb.block_size << 9; +} + +#endif /* _BCACHE_H */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h new file mode 100644 index 00000000..0a0dc870 --- /dev/null +++ b/libbcachefs/bcachefs_format.h @@ -0,0 +1,1352 @@ +#ifndef _LINUX_BCACHE_H +#define _LINUX_BCACHE_H + +/* + * Bcache on disk data structures + */ + +#ifdef __cplusplus +typedef bool _Bool; +extern "C" { +#endif + +#include <asm/types.h> +#include <asm/byteorder.h> +#include <linux/uuid.h> + +#define LE32_BITMASK(name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ +static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (__le32_to_cpu(k->field) >> offset) & \ + ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + __u64 new = __le32_to_cpu(k->field); \ + \ + new &= ~(~(~0ULL << (end - offset)) << offset); \ + new |= (v & ~(~0ULL << (end - offset))) << offset; \ + k->field = __cpu_to_le32(new); \ +} + +#define LE64_BITMASK(name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ +static const __u64 name##_MAX = (1ULL << (end - offset)) - 1; \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (__le64_to_cpu(k->field) >> offset) & \ + ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + __u64 new = __le64_to_cpu(k->field); \ + \ + new &= ~(~(~0ULL << (end - offset)) << offset); \ + new |= (v & ~(~0ULL << (end - offset))) << offset; \ + k->field = __cpu_to_le64(new); \ +} + +struct bkey_format { + __u8 key_u64s; + __u8 nr_fields; + /* One unused slot for now: */ + __u8 bits_per_field[6]; + __le64 field_offset[6]; +}; + +/* Btree keys - all units are in sectors */ + +struct bpos { + /* Word order matches machine byte order */ +#if defined(__LITTLE_ENDIAN) + __u32 snapshot; + __u64 offset; + __u64 inode; +#elif defined(__BIG_ENDIAN) + __u64 inode; + __u64 offset; /* Points to end of extent - sectors */ + __u32 snapshot; +#else +#error edit for your odd byteorder. +#endif +} __attribute__((packed, aligned(4))); + +#define KEY_INODE_MAX ((__u64)~0ULL) +#define KEY_OFFSET_MAX ((__u64)~0ULL) +#define KEY_SNAPSHOT_MAX ((__u32)~0U) + +static inline struct bpos POS(__u64 inode, __u64 offset) +{ + struct bpos ret; + + ret.inode = inode; + ret.offset = offset; + ret.snapshot = 0; + + return ret; +} + +#define POS_MIN POS(0, 0) +#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) + +/* Empty placeholder struct, for container_of() */ +struct bch_val { + __u64 __nothing[0]; +}; + +struct bversion { +#if defined(__LITTLE_ENDIAN) + __u64 lo; + __u32 hi; +#elif defined(__BIG_ENDIAN) + __u32 hi; + __u64 lo; +#endif +} __attribute__((packed, aligned(4))); + +struct bkey { + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#else +#error edit for your odd byteorder. 
+#endif + + /* Type of the value */ + __u8 type; + +#if defined(__LITTLE_ENDIAN) + __u8 pad[1]; + + struct bversion version; + __u32 size; /* extent size, in sectors */ + struct bpos p; +#elif defined(__BIG_ENDIAN) + struct bpos p; + __u32 size; /* extent size, in sectors */ + struct bversion version; + + __u8 pad[1]; +#endif +} __attribute__((packed, aligned(8))); + +struct bkey_packed { + __u64 _data[0]; + + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ + + /* + * XXX: next incompat on disk format change, switch format and + * needs_whiteout - bkey_packed() will be cheaper if format is the high + * bits of the bitfield + */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#endif + + /* Type of the value */ + __u8 type; + __u8 key_start[0]; + + /* + * We copy bkeys with struct assignment in various places, and while + * that shouldn't be done with packed bkeys we can't disallow it in C, + * and it's legal to cast a bkey to a bkey_packed - so padding it out + * to the same size as struct bkey should hopefully be safest. + */ + __u8 pad[sizeof(struct bkey) - 3]; +} __attribute__((packed, aligned(8))); + +#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define KEY_PACKED_BITS_START 24 + +#define KEY_SIZE_MAX ((__u32)~0U) + +#define KEY_FORMAT_LOCAL_BTREE 0 +#define KEY_FORMAT_CURRENT 1 + +enum bch_bkey_fields { + BKEY_FIELD_INODE, + BKEY_FIELD_OFFSET, + BKEY_FIELD_SNAPSHOT, + BKEY_FIELD_SIZE, + BKEY_FIELD_VERSION_HI, + BKEY_FIELD_VERSION_LO, + BKEY_NR_FIELDS, +}; + +#define bkey_format_field(name, field) \ + [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) + +#define BKEY_FORMAT_CURRENT \ +((struct bkey_format) { \ + .key_u64s = BKEY_U64s, \ + .nr_fields = BKEY_NR_FIELDS, \ + .bits_per_field = { \ + bkey_format_field(INODE, p.inode), \ + bkey_format_field(OFFSET, p.offset), \ + bkey_format_field(SNAPSHOT, p.snapshot), \ + bkey_format_field(SIZE, size), \ + bkey_format_field(VERSION_HI, version.hi), \ + bkey_format_field(VERSION_LO, version.lo), \ + }, \ +}) + +/* bkey with inline value */ +struct bkey_i { + __u64 _data[0]; + + union { + struct { + /* Size of combined key and value, in u64s */ + __u8 u64s; + }; + struct { + struct bkey k; + struct bch_val v; + }; + }; +}; + +#ifndef __cplusplus + +#define KEY(_inode, _offset, _size) \ +((struct bkey) { \ + .u64s = BKEY_U64s, \ + .format = KEY_FORMAT_CURRENT, \ + .p = POS(_inode, _offset), \ + .size = _size, \ +}) + +#else + +static inline struct bkey KEY(__u64 inode, __u64 offset, __u64 size) +{ + struct bkey ret; + + memset(&ret, 0, sizeof(ret)); + ret.u64s = BKEY_U64s; + ret.format = KEY_FORMAT_CURRENT; + ret.p.inode = inode; + ret.p.offset = offset; + ret.size = size; + + return ret; +} + +#endif + +static inline void bkey_init(struct bkey *k) +{ + *k = KEY(0, 0, 0); +} + +#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) + +#define __BKEY_PADDED(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + +#define BKEY_VAL_TYPE(name, nr) \ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +} + +/* + * - DELETED keys are used internally to mark keys that should be ignored but + * override keys in composition order. Their version number is ignored. + * + * - DISCARDED keys indicate that the data is all 0s because it has been + * discarded. 
DISCARDs may have a version; if the version is nonzero the key + * will be persistent, otherwise the key will be dropped whenever the btree + * node is rewritten (like DELETED keys). + * + * - ERROR: any read of the data returns a read error, as the data was lost due + * to a failing device. Like DISCARDED keys, they can be removed (overridden) + * by new writes or cluster-wide GC. Node repair can also overwrite them with + * the same or a more recent version number, but not with an older version + * number. +*/ +#define KEY_TYPE_DELETED 0 +#define KEY_TYPE_DISCARD 1 +#define KEY_TYPE_ERROR 2 +#define KEY_TYPE_COOKIE 3 +#define KEY_TYPE_PERSISTENT_DISCARD 4 +#define KEY_TYPE_GENERIC_NR 128 + +struct bch_cookie { + struct bch_val v; + __le64 cookie; +}; +BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); + +/* Extents */ + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the start of the data that + * is currently live. The size field in struct bkey records the current (live) + * size of the extent, and is also used to mean "size of region on disk that we + * point to" in this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. 
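+ * + * A worked example (made-up numbers, and ignoring that the on disk size + * fields are stored biased by 1): say 128 sectors were written, compressed + * to 60 sectors on disk, and the first 32 sectors are later overwritten. + * The surviving pointer still references the whole 60 sector region, with + * + * crc._uncompressed_size = 128; /* as originally written */ + * crc._compressed_size = 60; /* space used on disk */ + * crc.offset = 32; /* start of live data */ + * k.size = 96; /* currently live sectors */ + * + * so a read has to fetch and decompress all 60 sectors, then return just + * the live 96 sector region.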
+ * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. + */ + +/* 128 bits, sufficient for cryptographic MACs: */ +struct bch_csum { + __le64 lo; + __le64 hi; +} __attribute__((packed, aligned(8))); + +#define BCH_CSUM_NONE 0U +#define BCH_CSUM_CRC32C 1U +#define BCH_CSUM_CRC64 2U +#define BCH_CSUM_CHACHA20_POLY1305_80 3U +#define BCH_CSUM_CHACHA20_POLY1305_128 4U +#define BCH_CSUM_NR 5U + +static inline _Bool bch2_csum_type_is_encryption(unsigned type) +{ + switch (type) { + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: + return true; + default: + return false; + } +} + +enum bch_extent_entry_type { + BCH_EXTENT_ENTRY_ptr = 0, + BCH_EXTENT_ENTRY_crc32 = 1, + BCH_EXTENT_ENTRY_crc64 = 2, + BCH_EXTENT_ENTRY_crc128 = 3, +}; + +#define BCH_EXTENT_ENTRY_MAX 4 + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __attribute__((packed, aligned(8))); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __attribute__((packed, aligned(8))); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __attribute__((packed, aligned(8))); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * Max size of an extent that may require bouncing to read or write + * (checksummed, compressed): 64k + */ +#define BCH_ENCODED_EXTENT_MAX 128U + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) +
__u64 type:1, + cached:1, + erasure_coded:1, + reservation:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + reservation:1, + erasure_coded:1, + cached:1, + type:1; +#endif +} __attribute__((packed, aligned(8))); + +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + unused:23, + replicas:4, + generation:32; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 generation:32, + replicas:4, + unused:23, + type:5; +#endif +}; + +union bch_extent_entry { +#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; + struct bch_extent_ptr ptr; +}; + +enum { + BCH_EXTENT = 128, + + /* + * This is kind of a hack, we're overloading the type for a boolean that + * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED + * have the same value type: + */ + BCH_EXTENT_CACHED = 129, + + /* + * Persistent reservation: + */ + BCH_RESERVATION = 130, +}; + +struct bch_extent { + struct bch_val v; + + union bch_extent_entry start[0]; + __u64 _data[0]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(extent, BCH_EXTENT); + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(reservation, BCH_RESERVATION); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(u64)) + +/* Maximum possible size of an entire extent value: */ +/* There's a hack in the keylist code that needs to be fixed.. 
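+ * + * For reference, the worst case arithmetic (from the struct definitions + * above): a bch_extent_crc128 is 24 bytes (3 u64s) and a bch_extent_ptr is + * 8 bytes (1 u64), so a pointer plus its own crc costs at most 4 u64s; + * with BCH_REPLICAS_MAX = 4 that is 16 u64s of value, and BKEY_U64s = 5 + * more for the key itself - 21 u64s total.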
*/ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +/* Inodes */ + +#define BLOCKDEV_INODE_MAX 4096 + +#define BCACHE_ROOT_INO 4096 + +enum bch_inode_types { + BCH_INODE_FS = 128, + BCH_INODE_BLOCKDEV = 129, +}; + +struct bch_inode { + struct bch_val v; + + __le64 i_hash_seed; + __le32 i_flags; + __le16 i_mode; + __u8 fields[0]; +} __attribute__((packed)); +BKEY_VAL_TYPE(inode, BCH_INODE_FS); + +#define BCH_INODE_FIELDS() \ + BCH_INODE_FIELD(i_atime, 64) \ + BCH_INODE_FIELD(i_ctime, 64) \ + BCH_INODE_FIELD(i_mtime, 64) \ + BCH_INODE_FIELD(i_otime, 64) \ + BCH_INODE_FIELD(i_size, 64) \ + BCH_INODE_FIELD(i_sectors, 64) \ + BCH_INODE_FIELD(i_uid, 32) \ + BCH_INODE_FIELD(i_gid, 32) \ + BCH_INODE_FIELD(i_nlink, 32) \ + BCH_INODE_FIELD(i_generation, 32) \ + BCH_INODE_FIELD(i_dev, 32) + +enum { + /* + * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL + * flags) + */ + __BCH_INODE_SYNC = 0, + __BCH_INODE_IMMUTABLE = 1, + __BCH_INODE_APPEND = 2, + __BCH_INODE_NODUMP = 3, + __BCH_INODE_NOATIME = 4, + + __BCH_INODE_I_SIZE_DIRTY= 5, + __BCH_INODE_I_SECTORS_DIRTY= 6, + + /* not implemented yet: */ + __BCH_INODE_HAS_XATTRS = 7, /* has xattrs in xattr btree */ + + /* bits 20+ reserved for packed fields below: */ +}; + +#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) +#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) +#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) +#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) +#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) +#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) +#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) +#define BCH_INODE_HAS_XATTRS (1 << __BCH_INODE_HAS_XATTRS) + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, i_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, i_flags, 24, 32); + +struct bch_inode_blockdev { + struct bch_val v; + + __le64 i_size; + __le64 i_flags; + + /* Seconds: */ + __le64 i_ctime; + __le64 i_mtime; + + uuid_le i_uuid; + __u8 i_label[32]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); + +/* Thin provisioned volume, or cache for another block device? */ +LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) + +/* Dirents */ + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. 
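+ * + * A lookup is conceptually (a sketch - the helper names here are + * illustrative, not the actual interfaces): + * + * for (k = btree_lookup(BTREE_ID_DIRENTS, POS(dir_inum, hash(name))); + * k.type == BCH_DIRENT || k.type == BCH_DIRENT_WHITEOUT; + * k = btree_next(k)) + * if (k.type == BCH_DIRENT && name_matches(k, name)) + * return k; + * + * i.e. start at the hashed offset and probe forward until the name matches + * or the chain of occupied slots ends.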
+ * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +enum { + BCH_DIRENT = 128, + BCH_DIRENT_WHITEOUT = 129, +}; + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + __le64 d_inum; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __attribute__((packed)); +BKEY_VAL_TYPE(dirent, BCH_DIRENT); + +/* Xattrs */ + +enum { + BCH_XATTR = 128, + BCH_XATTR_WHITEOUT = 129, +}; + +#define BCH_XATTR_INDEX_USER 0 +#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define BCH_XATTR_INDEX_TRUSTED 3 +#define BCH_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __attribute__((packed)); +BKEY_VAL_TYPE(xattr, BCH_XATTR); + +/* Superblock */ + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + * Version 5: All the incompat changes + * Version 6: Cache device UUIDs all in superblock, another incompat bset change + * Version 7: Encryption (expanded checksum fields), other random things + */ +#define BCACHE_SB_VERSION_CDEV_V0 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_VERSION_CDEV_V2 5 +#define BCACHE_SB_VERSION_CDEV_V3 6 +#define BCACHE_SB_VERSION_CDEV_V4 7 +#define BCACHE_SB_VERSION_CDEV 7 +#define BCACHE_SB_MAX_VERSION 7 + +#define BCH_SB_SECTOR 8 +#define BCH_SB_LABEL_SIZE 32 +#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ + +struct bch_member { + uuid_le uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __le32 pad; + __le64 last_mount; /* time_t */ + + __le64 flags[2]; +}; + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) +LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8) +LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9) +LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10) +LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15); + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +enum bch_member_state { + BCH_MEMBER_STATE_RW = 0, + BCH_MEMBER_STATE_RO = 1, + BCH_MEMBER_STATE_FAILED = 2, + BCH_MEMBER_STATE_SPARE = 3, + BCH_MEMBER_STATE_NR = 4, +}; + +#define BCH_TIER_MAX 4U + +enum cache_replacement { + CACHE_REPLACEMENT_LRU = 0, + CACHE_REPLACEMENT_FIFO = 1, + CACHE_REPLACEMENT_RANDOM = 2, + CACHE_REPLACEMENT_NR = 3, +}; + +struct bch_sb_layout { + uuid_le magic; /* bcachefs superblock UUID */ + __u8 layout_type; + __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ + __u8 nr_superblocks; + __u8 pad[5]; + __u64 sb_offset[61]; +} __attribute__((packed)); + +#define BCH_SB_LAYOUT_SECTOR 7 + +struct bch_sb_field { + __u64 _data[0]; + __le32 u64s; + __le32 type; +}; + +enum bch_sb_field_type { + BCH_SB_FIELD_journal = 0, + BCH_SB_FIELD_members = 1, + BCH_SB_FIELD_crypt = 2, + BCH_SB_FIELD_NR = 3, +}; + +struct bch_sb_field_journal { + struct bch_sb_field field; + 
__le64 buckets[0]; +}; + +struct bch_sb_field_members { + struct bch_sb_field field; + struct bch_member members[0]; +}; + +/* Crypto: */ + +struct nonce { + __le32 d[4]; +}; + +struct bch_key { + __le64 key[4]; +}; + +#define BCH_KEY_MAGIC \ + (((u64) 'b' << 0)|((u64) 'c' << 8)| \ + ((u64) 'h' << 16)|((u64) '*' << 24)| \ + ((u64) '*' << 32)|((u64) 'k' << 40)| \ + ((u64) 'e' << 48)|((u64) 'y' << 56)) + +struct bch_encrypted_key { + __le64 magic; + struct bch_key key; +}; + +/* + * If this field is present in the superblock, it stores an encryption key which + * is used to encrypt all other data/metadata. The key will normally be encrypted + * with the key userspace provides, but if encryption has been turned off we'll + * just store the master key unencrypted in the superblock so we can access the + * previously encrypted data. + */ +struct bch_sb_field_crypt { + struct bch_sb_field field; + + __le64 flags; + __le64 kdf_flags; + struct bch_encrypted_key key; +}; + +LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); + +enum bch_kdf_types { + BCH_KDF_SCRYPT = 0, + BCH_KDF_NR = 1, +}; + +/* stored as base 2 log of scrypt params: */ +LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); +LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); +LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); + +struct bch_sb_field_replication { + struct bch_sb_field field; +}; + +/* + * @offset - sector where this sb was written + * @version - on disk format version + * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) + * @uuid - used for generating various magic numbers and identifying + * member devices, never changes + * @user_uuid - user visible UUID, may be changed + * @label - filesystem label + * @seq - identifies most recent superblock, incremented each time + * superblock is written + * @features - enabled incompatible features + */ +struct bch_sb { + struct bch_csum csum; + __le64 version; + uuid_le magic; + uuid_le uuid; + uuid_le user_uuid; + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 offset; + __le64 seq; + + __le16 block_size; + __u8 dev_idx; + __u8 nr_devices; + __le32 u64s; + + __le64 time_base_lo; + __le32 time_base_hi; + __le32 time_precision; + + __le64 flags[8]; + __le64 features[2]; + __le64 compat[2]; + + struct bch_sb_layout layout; + + union { + struct bch_sb_field start[0]; + __le64 _data[0]; + }; +} __attribute__((packed, aligned(8))); + +/* + * Flags: + * BCH_SB_INITIALIZED - set on first mount + * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect + * behaviour of mount/recovery path: + * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits + * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 + * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides + * DATA/META_CSUM_TYPE. Also indicates encryption
Also indicates encryption + * algorithm in use, if/when we get more than one + */ + +LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); +LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); +LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); +LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); + +LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); + +LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); +LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); + +LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); +LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); + +LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); + +LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64); + +LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); + +LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); +LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); +LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE, struct bch_sb, flags[1], 14, 20); + +LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); + +/* Features: */ +enum bch_sb_features { + BCH_FEATURE_LZ4 = 0, + BCH_FEATURE_GZIP = 1, +}; + +/* options: */ + +#define BCH_REPLICAS_MAX 4U + +#if 0 +#define BCH_ERROR_ACTIONS() \ + x(BCH_ON_ERROR_CONTINUE, 0, "continue") \ + x(BCH_ON_ERROR_RO, 1, "remount-ro") \ + x(BCH_ON_ERROR_PANIC, 2, "panic") \ + x(BCH_NR_ERROR_ACTIONS, 3, NULL) + +enum bch_error_actions { +#define x(_opt, _nr, _str) _opt = _nr, + BCH_ERROR_ACTIONS() +#undef x +}; +#endif + +enum bch_error_actions { + BCH_ON_ERROR_CONTINUE = 0, + BCH_ON_ERROR_RO = 1, + BCH_ON_ERROR_PANIC = 2, + BCH_NR_ERROR_ACTIONS = 3, +}; + +enum bch_csum_opts { + BCH_CSUM_OPT_NONE = 0, + BCH_CSUM_OPT_CRC32C = 1, + BCH_CSUM_OPT_CRC64 = 2, + BCH_CSUM_OPT_NR = 3, +}; + +enum bch_str_hash_opts { + BCH_STR_HASH_CRC32C = 0, + BCH_STR_HASH_CRC64 = 1, + BCH_STR_HASH_SIPHASH = 2, + BCH_STR_HASH_NR = 3, +}; + +enum bch_compression_opts { + BCH_COMPRESSION_NONE = 0, + BCH_COMPRESSION_LZ4 = 1, + BCH_COMPRESSION_GZIP = 2, + BCH_COMPRESSION_NR = 3, +}; + +/* backing device specific stuff: */ + +struct backingdev_sb { + __le64 csum; + __le64 offset; /* sector where this sb was written */ + __le64 version; /* of on disk format */ + + uuid_le magic; /* bcachefs superblock UUID */ + + uuid_le disk_uuid; + + /* + * Internal cache set UUID - xored with various magic numbers and thus + * must never change: + */ + union { + uuid_le set_uuid; + __le64 set_magic; + }; + __u8 label[BCH_SB_LABEL_SIZE]; + + __le64 flags; + + /* Incremented each time superblock is written: */ + __le64 seq; + + /* + * User visible UUID for identifying the cache set the user is allowed + * to change: + * + * XXX hooked up? 
+ */ + uuid_le user_uuid; + __le64 pad1[6]; + + __le64 data_offset; + __le16 block_size; /* sectors */ + __le16 pad2[3]; + + __le32 last_mount; /* time_t */ + __le16 pad3; + /* size of variable length portion - always 0 for backingdev superblock */ + __le16 u64s; + __u64 _data[0]; +}; + +LE64_BITMASK(BDEV_CACHE_MODE, struct backingdev_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U + +LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +static inline _Bool __SB_IS_BDEV(__u64 version) +{ + return version == BCACHE_SB_VERSION_BDEV + || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +static inline _Bool SB_IS_BDEV(const struct bch_sb *sb) +{ + return __SB_IS_BDEV(sb->version); +} + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define BCACHE_MAGIC \ + UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) + +#define BCACHE_STATFS_MAGIC 0xca451a4e + +#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) +#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL) +#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) + +static inline __le64 __bch2_sb_magic(struct bch_sb *sb) +{ + __le64 ret; + memcpy(&ret, &sb->uuid, sizeof(ret)); + return ret; +} + +static inline __u64 __jset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); +} + +static inline __u64 __pset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC); +} + +static inline __u64 __bset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); +} + +/* Journal */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION_JKEYS 2 +#define BCACHE_JSET_VERSION 2 + +struct jset_entry { + __le16 u64s; + __u8 btree_id; + __u8 level; + __le32 flags; /* designates what this jset holds */ + + union { + struct bkey_i start[0]; + __u64 _data[0]; + }; +}; + +#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) + +LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8); +enum { + JOURNAL_ENTRY_BTREE_KEYS = 0, + JOURNAL_ENTRY_BTREE_ROOT = 1, + JOURNAL_ENTRY_PRIO_PTRS = 2, + + /* + * Journal sequence numbers can be blacklisted: bsets record the max + * sequence number of all the journal entries they contain updates for, + * so that on recovery we can ignore those bsets that contain index + * updates newer than what made it into the journal. + * + * This means that we can't reuse that journal_seq - we have to skip it, + * and then record that we skipped it so that the next time we crash and + * recover we don't think there was a missing journal entry. + */ + JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3, +}; + +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes.
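+ * + * For example (made-up numbers): entries seq 100..110 with last_seq = 105 + * in the newest entry mean that everything in entries 100-104 has already + * been flushed to the btree, so replay only needs to consider seq 105 and + * up - last_seq is effectively the journal's tail pointer.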
+ */ +struct jset { + struct bch_csum csum; + + __le64 magic; + __le64 seq; + __le32 version; + __le32 flags; + + __le32 u64s; /* size of d[] in u64s */ + + __u8 encrypted_start[0]; + + __le16 read_clock; + __le16 write_clock; + + /* Sequence number of oldest dirty journal entry */ + __le64 last_seq; + + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +} __attribute__((packed)); + +LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); +LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); + +#define BCH_JOURNAL_BUCKETS_MIN 20 + +/* Bucket prios/gens */ + +struct prio_set { + struct bch_csum csum; + + __le64 magic; + __le32 nonce[3]; + __le16 version; + __le16 flags; + + __u8 encrypted_start[0]; + + __le64 next_bucket; + + struct bucket_disk { + __le16 read_prio; + __le16 write_prio; + __u8 gen; + } __attribute__((packed)) data[]; +} __attribute__((packed)); + +LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4); + +/* Btree: */ + +#define DEFINE_BCH_BTREE_IDS() \ + DEF_BTREE_ID(EXTENTS, 0, "extents") \ + DEF_BTREE_ID(INODES, 1, "inodes") \ + DEF_BTREE_ID(DIRENTS, 2, "dirents") \ + DEF_BTREE_ID(XATTRS, 3, "xattrs") + +#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, + +enum btree_id { + DEFINE_BCH_BTREE_IDS() + BTREE_ID_NR +}; + +#undef DEF_BTREE_ID + +#define BTREE_MAX_DEPTH 4U + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_KEY_v1 2 +#define BCACHE_BSET_JOURNAL_SEQ 3 +#define BCACHE_BSET_VERSION 3 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __le64 seq; + + /* + * Highest journal entry this bset contains keys for. + * If on recovery we don't see that journal entry, this bset is ignored: + * this allows us to preserve the order of all index updates after a + * crash, since the journal records a total order of all index updates + * and anything that didn't make it to the journal doesn't get used. 
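+	 *
+	 * (this field is what journal sequence number blacklisting - see
+	 * JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED above - is built around)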
+ */ + __le64 journal_seq; + + __le32 flags; + __le16 version; + __le16 u64s; /* count of d[] in u64s */ + + union { + struct bkey_packed start[0]; + __u64 _data[0]; + }; +} __attribute__((packed)); + +LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); + +LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); +LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, + struct bset, flags, 5, 6); + +struct btree_node { + struct bch_csum csum; + __le64 magic; + + /* this flags field is encrypted, unlike bset->flags: */ + __le64 flags; + + /* Closed interval: */ + struct bpos min_key; + struct bpos max_key; + struct bch_extent_ptr ptr; + struct bkey_format format; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; +} __attribute__((packed)); + +LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); + +struct btree_node_entry { + struct bch_csum csum; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; +} __attribute__((packed)); + +#ifdef __cplusplus +} +#endif +#endif /* _LINUX_BCACHE_H */ + +/* vim: set foldnestmax=2: */ diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h new file mode 100644 index 00000000..2218a00b --- /dev/null +++ b/libbcachefs/bcachefs_ioctl.h @@ -0,0 +1,104 @@ +#ifndef _LINUX_BCACHE_IOCTL_H +#define _LINUX_BCACHE_IOCTL_H + +#include <linux/uuid.h> +#include "bcachefs_format.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BCH_FORCE_IF_DATA_LOST (1 << 0) +#define BCH_FORCE_IF_METADATA_LOST (1 << 1) +#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) +#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) + +#define BCH_FORCE_IF_DEGRADED \ + (BCH_FORCE_IF_DATA_DEGRADED| \ + BCH_FORCE_IF_METADATA_DEGRADED) + +#define BCH_BY_UUID (1 << 4) + +/* global control dev: */ + +#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) +#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) + +struct bch_ioctl_assemble { + __u32 flags; + __u32 nr_devs; + __u64 pad; + __u64 devs[]; +}; + +struct bch_ioctl_incremental { + __u32 flags; + __u64 pad; + __u64 dev; +}; + +/* filesystem ioctls: */ + +#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) +#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) +#define BCH_IOCTL_STOP _IO(0xbc, 3) +#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) +#define BCH_IOCTL_DISK_EVACUATE _IOW(0xbc, 9, struct bch_ioctl_disk) +#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) + +struct bch_ioctl_query_uuid { + uuid_le uuid; +}; + +struct bch_ioctl_start { + __u32 flags; + __u32 pad; +}; + +struct bch_ioctl_disk { + __u32 flags; + __u32 pad; + __u64 dev; +}; + +struct bch_ioctl_disk_set_state { + __u32 flags; + __u8 new_state; + __u8 pad[3]; + __u64 dev; +}; + +#define BCH_REWRITE_INCREASE_REPLICAS (1 << 0) +#define BCH_REWRITE_DECREASE_REPLICAS (1 << 1) + +#define BCH_REWRITE_RECOMPRESS (1 << 0) +#define BCH_REWRITE_DECREASE_REPLICAS (1 << 1) + +enum bch_data_ops { + BCH_DATA_SCRUB, +}; + +struct bch_data_op { + __u8 type; +}; + +struct bch_ioctl_data { + __u32 flags; + __u32 pad; + + __u64 start_inode; + __u64 
start_offset; + + __u64 end_inode; + __u64 end_offset; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _LINUX_BCACHE_IOCTL_H */ diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c new file mode 100644 index 00000000..b9ceb6ea --- /dev/null +++ b/libbcachefs/bkey.c @@ -0,0 +1,1164 @@ + +#include "bcachefs.h" +#include "bkey.h" +#include "bset.h" +#include "util.h" + +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, + const struct bkey_packed *); + +void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) +{ + unsigned bit = high_bit_offset, done = 0; + + while (1) { + while (bit < 64) { + if (done && !(done % 8)) + *out++ = ' '; + *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; + bit++; + done++; + if (done == nr_bits) { + *out++ = '\0'; + return; + } + } + + p = next_word(p); + bit = 0; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) +{ + struct bkey tmp; + + BUG_ON(bkeyp_val_u64s(format, packed) != + bkey_val_u64s(unpacked)); + + BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); + + tmp = __bch2_bkey_unpack_key(format, packed); + + if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { + char buf1[160], buf2[160]; + char buf3[160], buf4[160]; + + bch2_bkey_to_text(buf1, sizeof(buf1), unpacked); + bch2_bkey_to_text(buf2, sizeof(buf2), &tmp); + bch2_to_binary(buf3, (void *) unpacked, 80); + bch2_to_binary(buf4, high_word(format, packed), 80); + + panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", + format->key_u64s, + format->bits_per_field[0], + format->bits_per_field[1], + format->bits_per_field[2], + format->bits_per_field[3], + format->bits_per_field[4], + buf1, buf2, buf3, buf4); + } +} + +#else +static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) {} +#endif + +int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k) +{ + char *out = buf, *end = buf + size; + +#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + + p("u64s %u type %u %llu:%llu snap %u len %u ver %llu", + k->u64s, k->type, k->p.inode, k->p.offset, + k->p.snapshot, k->size, k->version.lo); + + BUG_ON(bkey_packed(k)); + + switch (k->type) { + case KEY_TYPE_DELETED: + p(" deleted"); + break; + case KEY_TYPE_DISCARD: + p(" discard"); + break; + case KEY_TYPE_ERROR: + p(" error"); + break; + case KEY_TYPE_COOKIE: + p(" cookie"); + break; + } +#undef p + + return out - buf; +} + +struct pack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct pack_state pack_state_init(const struct bkey_format *format, + struct bkey_packed *k) +{ + u64 *p = high_word(format, k); + + return (struct pack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = 0, + .p = p, + }; +} + +__always_inline +static void pack_state_finish(struct pack_state *state, + struct bkey_packed *k) +{ + EBUG_ON(state->p < k->_data); + EBUG_ON(state->p >= k->_data + state->format->key_u64s); + + *state->p = state->w; +} + +struct unpack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + const u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct unpack_state unpack_state_init(const struct bkey_format *format, + const struct bkey_packed *k) +{ + const u64 *p = high_word(format, k); + + return (struct unpack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = *p << high_bit_offset, + .p = p, + }; +} + +__always_inline +static u64 get_inc_field(struct unpack_state *state, unsigned field) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); + + if (bits >= state->bits) { + v = state->w >> (64 - bits); + bits -= state->bits; + + state->p = next_word(state->p); + state->w = *state->p; + state->bits = 64; + } + + /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ + v |= (state->w >> 1) >> (63 - bits); + state->w <<= bits; + state->bits -= bits; + + return v + offset; +} + +__always_inline +static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + + if (v < offset) + return false; + + v -= offset; + + if (fls64(v) > bits) + return false; + + if (bits > state->bits) { + bits -= state->bits; + /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + + return true; +} + +/* + * Note: does NOT set out->format (we don't know what it should be here!) 
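+ * (so the caller must set it itself - typically to KEY_FORMAT_LOCAL_BTREE
+ * when repacking into a btree node's local format)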
+ * + * Also: doesn't work on extents - it doesn't preserve the invariant that + * if k is packed bkey_start_pos(k) will successfully pack + */ +static bool bch2_bkey_transform_key(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + struct pack_state out_s = pack_state_init(out_f, out); + struct unpack_state in_s = unpack_state_init(in_f, in); + unsigned i; + + out->_data[0] = 0; + + for (i = 0; i < BKEY_NR_FIELDS; i++) + if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) + return false; + + /* Can't happen because the val would be too big to unpack: */ + EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); + + pack_state_finish(&out_s, out); + out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + return true; +} + +bool bch2_bkey_transform(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + if (!bch2_bkey_transform_key(out_f, out, in_f, in)) + return false; + + memcpy_u64s((u64 *) out + out_f->key_u64s, + (u64 *) in + in_f->key_u64s, + (in->u64s - in_f->key_u64s)); + return true; +} + +#define bkey_fields() \ + x(BKEY_FIELD_INODE, p.inode) \ + x(BKEY_FIELD_OFFSET, p.offset) \ + x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ + x(BKEY_FIELD_SIZE, size) \ + x(BKEY_FIELD_VERSION_HI, version.hi) \ + x(BKEY_FIELD_VERSION_LO, version.lo) + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bkey out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); + + out.u64s = BKEY_U64s + in->u64s - format->key_u64s; + out.format = KEY_FORMAT_CURRENT; + out.needs_whiteout = in->needs_whiteout; + out.type = in->type; + out.pad[0] = 0; + +#define x(id, field) out.field = get_inc_field(&state, id); + bkey_fields() +#undef x + + return out; +} + +#ifndef HAVE_BCACHE_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bpos out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + + out.inode = get_inc_field(&state, BKEY_FIELD_INODE); + out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); + out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); + + return out; +} +#endif + +/** + * bch2_bkey_pack_key -- pack just the key, not the value + */ +bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, + const struct bkey_format *format) +{ + struct pack_state state = pack_state_init(format, out); + + EBUG_ON((void *) in == (void *) out); + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->format != KEY_FORMAT_CURRENT); + + out->_data[0] = 0; + +#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; + bkey_fields() +#undef x + + /* + * Extents - we have to guarantee that if an extent is packed, a trimmed + * version will also pack: + */ + if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET]) + return false; + + pack_state_finish(&state, out); + out->u64s = format->key_u64s + in->u64s - BKEY_U64s; + out->format = 
KEY_FORMAT_LOCAL_BTREE; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + bch2_bkey_pack_verify(out, in, format); + return true; +} + +/** + * bch2_bkey_unpack -- unpack the key and the value + */ +void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, + const struct bkey_packed *src) +{ + dst->k = bkey_unpack_key(b, src); + + memcpy_u64s(&dst->v, + bkeyp_val(&b->format, src), + bkeyp_val_u64s(&b->format, src)); +} + +/** + * bch2_bkey_pack -- pack the key and the value + */ +bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, + const struct bkey_format *format) +{ + struct bkey_packed tmp; + + if (!bch2_bkey_pack_key(&tmp, &in->k, format)) + return false; + + memmove_u64s((u64 *) out + format->key_u64s, + &in->v, + bkey_val_u64s(&in->k)); + memcpy_u64s(out, &tmp, format->key_u64s); + + return true; +} + +__always_inline +static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + bool ret = true; + + EBUG_ON(v < offset); + v -= offset; + + if (fls64(v) > bits) { + v = ~(~0ULL << bits); + ret = false; + } + + if (bits > state->bits) { + bits -= state->bits; + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + + return ret; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +static bool bkey_packed_successor(struct bkey_packed *out, + const struct btree *b, + struct bkey_packed k) +{ + const struct bkey_format *f = &b->format; + unsigned nr_key_bits = b->nr_key_bits; + unsigned first_bit, offset; + u64 *p; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + if (!nr_key_bits) + return false; + + *out = k; + + first_bit = high_bit_offset + nr_key_bits - 1; + p = nth_word(high_word(f, out), first_bit >> 6); + offset = 63 - (first_bit & 63); + + while (nr_key_bits) { + unsigned bits = min(64 - offset, nr_key_bits); + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if ((*p & mask) != mask) { + *p += 1ULL << offset; + EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); + return true; + } + + *p &= ~mask; + p = prev_word(p); + nr_key_bits -= bits; + offset = 0; + } + + return false; +} +#endif + +/* + * Returns a packed key that compares <= in + * + * This is used in bset_search_tree(), where we need a packed pos in order to be + * able to compare against the keys in the auxiliary search tree - and it's + * legal to use a packed pos that isn't equivalent to the original pos, + * _provided_ it compares <= to the original pos. 
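+ *
+ * An illustrative sketch of using the return value - example_pack_search_pos()
+ * is not a real function in this codebase:
+ */
+
+static inline bool example_pack_search_pos(struct bkey_packed *out,
+					   struct bpos in,
+					   const struct btree *b)
+{
+	switch (bch2_bkey_pack_pos_lossy(out, in, b)) {
+	case BKEY_PACK_POS_EXACT:
+	case BKEY_PACK_POS_SMALLER:
+		/* out compares <= in: usable as a packed search key */
+		return true;
+	case BKEY_PACK_POS_FAIL:
+	default:
+		/* in not representable: fall back to unpacked comparisons */
+		return false;
+	}
+}
+
+/*
+ * (the actual implementation:)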
+ */ +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, + struct bpos in, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + struct pack_state state = pack_state_init(f, out); +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos orig = in; +#endif + bool exact = true; + + out->_data[0] = 0; + + if (unlikely(in.snapshot < + le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { + if (!in.offset-- && + !in.inode--) + return BKEY_PACK_POS_FAIL; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.offset < + le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { + if (!in.inode--) + return BKEY_PACK_POS_FAIL; + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.inode < + le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) + return BKEY_PACK_POS_FAIL; + + if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) + exact = false; + + pack_state_finish(&state, out); + out->u64s = f->key_u64s; + out->format = KEY_FORMAT_LOCAL_BTREE; + out->type = KEY_TYPE_DELETED; + +#ifdef CONFIG_BCACHEFS_DEBUG + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; + + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0); + } +#endif + + return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; +} + +void bch2_bkey_format_init(struct bkey_format_state *s) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) + s->field_min[i] = U64_MAX; + + for (i = 0; i < ARRAY_SIZE(s->field_max); i++) + s->field_max[i] = 0; + + /* Make sure we can store a size of 0: */ + s->field_min[BKEY_FIELD_SIZE] = 0; +} + +static void __bkey_format_add(struct bkey_format_state *s, + unsigned field, u64 v) +{ + s->field_min[field] = min(s->field_min[field], v); + s->field_max[field] = max(s->field_max[field], v); +} + +/* + * Changes @format so that @k can be successfully packed with @format + */ +void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) +{ +#define x(id, field) __bkey_format_add(s, id, k->field); + bkey_fields() +#undef x + __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) +{ + unsigned field = 0; + + __bkey_format_add(s, field++, p.inode); + __bkey_format_add(s, field++, p.offset); + __bkey_format_add(s, field++, p.snapshot); +} + +/* + * We don't want it to be possible for the packed format to represent fields + * bigger than a u64... that will cause confusion and issues (like with + * bkey_packed_successor()) + */ +static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, + unsigned bits, u64 offset) +{ + offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); + + f->bits_per_field[i] = bits; + f->field_offset[i] = cpu_to_le64(offset); +} + +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + struct bkey_format ret = { + .nr_fields = BKEY_NR_FIELDS, + }; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { + s->field_min[i] = min(s->field_min[i], s->field_max[i]); + + set_format_field(&ret, i, + fls64(s->field_max[i] - s->field_min[i]), + s->field_min[i]); + + bits += ret.bits_per_field[i]; + } + + /* allow for extent merging: */ + if (ret.bits_per_field[BKEY_FIELD_SIZE]) { + ret.bits_per_field[BKEY_FIELD_SIZE] += 4; + bits += 4; + } + + ret.key_u64s = DIV_ROUND_UP(bits, 64); + + /* if we have enough spare bits, round fields up to nearest byte */ + bits = ret.key_u64s * 64 - bits; + + for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { + unsigned r = round_up(ret.bits_per_field[i], 8) - + ret.bits_per_field[i]; + + if (r <= bits) { + set_format_field(&ret, i, + ret.bits_per_field[i] + r, + le64_to_cpu(ret.field_offset[i])); + bits -= r; + } + } + + EBUG_ON(bch2_bkey_format_validate(&ret)); + return ret; +} + +const char *bch2_bkey_format_validate(struct bkey_format *f) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + + if (f->nr_fields != BKEY_NR_FIELDS) + return "invalid format: incorrect number of fields"; + + for (i = 0; i < f->nr_fields; i++) { + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f->bits_per_field[i] > 64) + return "invalid format: field too large"; + + if (field_offset && + (f->bits_per_field[i] == 64 || + (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < + field_offset))) + return "invalid format: offset + bits overflow"; + + bits += f->bits_per_field[i]; + } + + if (f->key_u64s != DIV_ROUND_UP(bits, 64)) + return "invalid format: incorrect key_u64s"; + + return NULL; +} + +/* + * Most significant differing bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, + const struct bkey_packed *l_k, + const struct bkey_packed *r_k) +{ + const u64 *l = high_word(&b->format, l_k); + const u64 *r = high_word(&b->format, r_k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned word_bits = 64 - high_bit_offset; + u64 l_v, r_v; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + /* for big endian, skip past header */ + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (nr_key_bits) { + if (nr_key_bits < word_bits) { + l_v >>= word_bits - nr_key_bits; + r_v >>= word_bits - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= word_bits; + } + + if (l_v != r_v) + return fls64(l_v ^ r_v) - 1 + nr_key_bits; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + word_bits = 64; + } + + return 0; +} + +/* + * First set bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) +{ + const u64 *p = high_word(&b->format, k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned ret = 0, offset; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + offset = nr_key_bits; + while (offset > 64) { + p = next_word(p); + offset -= 64; + } + + offset = 64 - offset; + + while (nr_key_bits) { + unsigned bits = nr_key_bits + offset < 64 + ? 
nr_key_bits + : 64 - offset; + + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if (*p & mask) + return ret + __ffs64(*p & mask) - offset; + + p = prev_word(p); + nr_key_bits -= bits; + ret += bits; + offset = 0; + } + + return 0; +} + +#ifdef CONFIG_X86_64 + +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + long d0, d1, d2, d3; + int cmp; + + /* we shouldn't need asm for this, but gcc is being retarded: */ + + asm(".intel_syntax noprefix;" + "xor eax, eax;" + "xor edx, edx;" + "1:;" + "mov r8, [rdi];" + "mov r9, [rsi];" + "sub ecx, 64;" + "jl 2f;" + + "cmp r8, r9;" + "jnz 3f;" + + "lea rdi, [rdi - 8];" + "lea rsi, [rsi - 8];" + "jmp 1b;" + + "2:;" + "not ecx;" + "shr r8, 1;" + "shr r9, 1;" + "shr r8, cl;" + "shr r9, cl;" + "cmp r8, r9;" + + "3:\n" + "seta al;" + "setb dl;" + "sub eax, edx;" + ".att_syntax prefix;" + : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) + : "0" (l), "1" (r), "3" (nr_key_bits) + : "r8", "r9", "cc", "memory"); + + return cmp; +} + +#define I(_x) (*(out)++ = (_x)) +#define I1(i0) I(i0) +#define I2(i0, i1) (I1(i0), I(i1)) +#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) +#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) +#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) + +static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, + enum bch_bkey_fields field, + unsigned dst_offset, unsigned dst_size, + bool *eax_zeroed) +{ + unsigned byte = format->key_u64s * sizeof(u64); + unsigned bits = format->bits_per_field[field]; + u64 offset = format->field_offset[field]; + unsigned i, bit_offset = 0; + unsigned shl, shr; + + if (!bits && !offset) { + if (!*eax_zeroed) { + /* xor eax, eax */ + I2(0x31, 0xc0); + } + + *eax_zeroed = true; + goto set_field; + } + + if (!bits) { + /* just return offset: */ + + switch (dst_size) { + case 8: + if (offset > S32_MAX) { + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + + I3(0xc7, 0x47, dst_offset + 4); + memcpy(out, (void *) &offset + 4, 4); + out += 4; + } else { + /* mov [rdi + dst_offset], offset */ + /* sign extended */ + I4(0x48, 0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + } + break; + case 4: + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + break; + default: + BUG(); + } + + return out; + } + + for (i = 0; i <= field; i++) + bit_offset += format->bits_per_field[i]; + + byte -= DIV_ROUND_UP(bit_offset, 8); + bit_offset = round_up(bit_offset, 8) - bit_offset; + + *eax_zeroed = false; + + if (bit_offset == 0 && bits == 8) { + /* movzx eax, BYTE PTR [rsi + imm8] */ + I4(0x0f, 0xb6, 0x46, byte); + } else if (bit_offset == 0 && bits == 16) { + /* movzx eax, WORD PTR [rsi + imm8] */ + I4(0x0f, 0xb7, 0x46, byte); + } else if (bit_offset + bits <= 32) { + /* mov eax, [rsi + imm8] */ + I3(0x8b, 0x46, byte); + + if (bit_offset) { + /* shr eax, imm8 */ + I3(0xc1, 0xe8, bit_offset); + } + + if (bit_offset + bits < 32) { + unsigned mask = ~0U >> (32 - bits); + + /* and eax, imm32 */ + I1(0x25); + memcpy(out, &mask, 4); + out += 4; + } + } else if (bit_offset + bits <= 64) { + /* mov rax, [rsi + imm8] */ + I4(0x48, 0x8b, 0x46, byte); + + shl = 64 - bit_offset - bits; + shr = bit_offset + shl; + + if (shl) { + /* shl rax, imm8 */ + I4(0x48, 0xc1, 0xe0, shl); + } + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } else { + /* mov rax, [rsi + byte] */ + I4(0x48, 0x8b, 0x46, byte); + + /* mov edx, [rsi + 
byte + 8] */ + I3(0x8b, 0x56, byte + 8); + + /* bits from next word: */ + shr = bit_offset + bits - 64; + BUG_ON(shr > bit_offset); + + /* shr rax, bit_offset */ + I4(0x48, 0xc1, 0xe8, shr); + + /* shl rdx, imm8 */ + I4(0x48, 0xc1, 0xe2, 64 - shr); + + /* or rax, rdx */ + I3(0x48, 0x09, 0xd0); + + shr = bit_offset - shr; + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } + + /* rax += offset: */ + if (offset > S32_MAX) { + /* mov rdx, imm64 */ + I2(0x48, 0xba); + memcpy(out, &offset, 8); + out += 8; + /* add %rdx, %rax */ + I3(0x48, 0x01, 0xd0); + } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { + /* add rax, imm32 */ + I2(0x48, 0x05); + memcpy(out, &offset, 4); + out += 4; + } else if (offset) { + /* add eax, imm32 */ + I1(0x05); + memcpy(out, &offset, 4); + out += 4; + } +set_field: + switch (dst_size) { + case 8: + /* mov [rdi + dst_offset], rax */ + I4(0x48, 0x89, 0x47, dst_offset); + break; + case 4: + /* mov [rdi + dst_offset], eax */ + I3(0x89, 0x47, dst_offset); + break; + default: + BUG(); + } + + return out; +} + +int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) +{ + bool eax_zeroed = false; + u8 *out = _out; + + /* + * rdi: dst - unpacked key + * rsi: src - packed key + */ + + /* k->u64s, k->format, k->type */ + + /* mov eax, [rsi] */ + I2(0x8b, 0x06); + + /* add eax, BKEY_U64s - format->key_u64s */ + I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); + + /* and eax, imm32: mask out k->pad: */ + I5(0x25, 0xff, 0xff, 0xff, 0); + + /* mov [rdi], eax */ + I2(0x89, 0x07); + +#define x(id, field) \ + out = compile_bkey_field(format, out, id, \ + offsetof(struct bkey, field), \ + sizeof(((struct bkey *) NULL)->field), \ + &eax_zeroed); + bkey_fields() +#undef x + + /* retq */ + I1(0xc3); + + return (void *) out - _out; +} + +#else +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + u64 l_v, r_v; + + if (!nr_key_bits) + return 0; + + /* for big endian, skip past header */ + nr_key_bits += high_bit_offset; + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (1) { + if (nr_key_bits < 64) { + l_v >>= 64 - nr_key_bits; + r_v >>= 64 - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= 64; + } + + if (l_v != r_v) + return l_v < r_v ? 
-1 : 1; + + if (!nr_key_bits) + return 0; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + } +} +#endif + +__pure +int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + int ret; + + EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + ret = __bkey_cmp_bits(high_word(f, l), + high_word(f, r), + b->nr_key_bits); + + EBUG_ON(ret != bkey_cmp(bkey_unpack_key_format_checked(b, l).p, + bkey_unpack_key_format_checked(b, r).p)); + return ret; +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); +} + +__pure __flatten +int __bch2_bkey_cmp_packed(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + int packed = bkey_lr_packed(l, r); + + if (likely(packed == BKEY_PACKED_BOTH)) + return __bch2_bkey_cmp_packed_format_checked(l, r, b); + + switch (packed) { + case BKEY_PACKED_NONE: + return bkey_cmp(((struct bkey *) l)->p, + ((struct bkey *) r)->p); + case BKEY_PACKED_LEFT: + return __bch2_bkey_cmp_left_packed_format_checked(b, + (struct bkey_packed *) l, + &((struct bkey *) r)->p); + case BKEY_PACKED_RIGHT: + return -__bch2_bkey_cmp_left_packed_format_checked(b, + (struct bkey_packed *) r, + &((struct bkey *) l)->p); + default: + unreachable(); + } +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + const struct bkey *l_unpacked; + + return unlikely(l_unpacked = packed_to_bkey_c(l)) + ? bkey_cmp(l_unpacked->p, *r) + : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +void bch2_bpos_swab(struct bpos *p) +{ + u8 *l = (u8 *) p; + u8 *h = ((u8 *) &p[1]) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) +{ + const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; + u8 *l = k->key_start; + u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void) +{ + struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); + struct bkey_packed p; + + struct bkey_format test_format = { + .key_u64s = 2, + .nr_fields = BKEY_NR_FIELDS, + .bits_per_field = { + 13, + 64, + }, + }; + + struct unpack_state in_s = + unpack_state_init(&bch2_bkey_format_current, (void *) &t); + struct pack_state out_s = pack_state_init(&test_format, &p); + unsigned i; + + for (i = 0; i < out_s.format->nr_fields; i++) { + u64 a, v = get_inc_field(&in_s, i); + + switch (i) { +#define x(id, field) case id: a = t.field; break; + bkey_fields() +#undef x + default: + BUG(); + } + + if (a != v) + panic("got %llu actual %llu i %u\n", v, a, i); + + if (!set_inc_field(&out_s, i, v)) + panic("failed at %u\n", i); + } + + BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); +} +#endif diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h new file mode 100644 index 00000000..1383c96b --- /dev/null +++ b/libbcachefs/bkey.h @@ -0,0 +1,613 @@ +#ifndef _BCACHE_BKEY_H +#define _BCACHE_BKEY_H + +#include <linux/bug.h> +#include "bcachefs_format.h" + +#include "util.h" +#include "vstructs.h" + +void bch2_to_binary(char *, const u64 *, unsigned); +int bch2_bkey_to_text(char *, size_t, const struct bkey *); + +#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_next(_k) vstruct_next(_k) + +static inline unsigned bkey_val_u64s(const struct bkey *k) +{ + return k->u64s - BKEY_U64s; +} + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + k->u64s = BKEY_U64s + val_u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); +} + +/* + * Mark a key as deleted without changing the size of the value (i.e. 
modifying
+ * keys in the btree in place)
+ */
+static inline void __set_bkey_deleted(struct bkey *k)
+{
+	k->type = KEY_TYPE_DELETED;
+}
+
+static inline void set_bkey_deleted(struct bkey *k)
+{
+	__set_bkey_deleted(k);
+	set_bkey_val_u64s(k, 0);
+}
+
+#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_DELETED)
+
+#define bkey_whiteout(_k)	\
+	((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
+
+#define bkey_packed_typecheck(_k)					\
+({									\
+	BUILD_BUG_ON(!type_is(_k, struct bkey *) &&			\
+		     !type_is(_k, struct bkey_packed *));		\
+	type_is(_k, struct bkey_packed *);				\
+})
+
+enum bkey_lr_packed {
+	BKEY_PACKED_BOTH,
+	BKEY_PACKED_RIGHT,
+	BKEY_PACKED_LEFT,
+	BKEY_PACKED_NONE,
+};
+
+#define bkey_lr_packed_typecheck(_l, _r)				\
+	(!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
+
+#define bkey_lr_packed(_l, _r)						\
+	((_l)->format + ((_r)->format << 1))
+
+#define bkey_copy(_dst, _src)					\
+do {								\
+	BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) &&		\
+		     !type_is(_dst, struct bkey_packed *));	\
+	BUILD_BUG_ON(!type_is(_src, struct bkey_i *) &&		\
+		     !type_is(_src, struct bkey_packed *));	\
+	EBUG_ON((u64 *) (_dst) > (u64 *) (_src) &&		\
+		(u64 *) (_dst) < (u64 *) (_src) +		\
+		((struct bkey *) (_src))->u64s);		\
+								\
+	__memmove_u64s_down((_dst), (_src),			\
+			    ((struct bkey *) (_src))->u64s);	\
+} while (0)
+
+struct btree;
+
+struct bkey_format_state {
+	u64 field_min[BKEY_NR_FIELDS];
+	u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+const char *bch2_bkey_format_validate(struct bkey_format *);
+
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
+					  const struct bkey_packed *,
+					  const struct bkey_packed *);
+__pure
+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
+					  const struct bkey_packed *,
+					  const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
+					       const struct bkey_packed *,
+					       const struct bpos *);
+
+__pure
+int __bch2_bkey_cmp_packed(const struct bkey_packed *,
+			   const struct bkey_packed *,
+			   const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed(const struct btree *,
+				const struct bkey_packed *,
+				const struct bpos *);
+
+static inline __pure
+int bkey_cmp_left_packed(const struct btree *b,
+			 const struct bkey_packed *l, const struct bpos *r)
+{
+	return __bch2_bkey_cmp_left_packed(b, l, r);
+}
+
+/*
+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to
+ * pass it by val...
as much as I hate c++, const ref would be nice here: + */ +__pure __flatten +static inline int bkey_cmp_left_packed_byval(const struct btree *b, + const struct bkey_packed *l, + struct bpos r) +{ + return bkey_cmp_left_packed(b, l, &r); +} + +/* + * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to + * skip dispatching on k->format: + */ +#define bkey_cmp_packed(_b, _l, _r) \ +({ \ + int _cmp; \ + \ + switch (bkey_lr_packed_typecheck(_l, _r)) { \ + case BKEY_PACKED_NONE: \ + _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ + ((struct bkey *) (_r))->p); \ + break; \ + case BKEY_PACKED_LEFT: \ + _cmp = bkey_cmp_left_packed((_b), \ + (struct bkey_packed *) (_l), \ + &((struct bkey *) (_r))->p); \ + break; \ + case BKEY_PACKED_RIGHT: \ + _cmp = -bkey_cmp_left_packed((_b), \ + (struct bkey_packed *) (_r), \ + &((struct bkey *) (_l))->p); \ + break; \ + case BKEY_PACKED_BOTH: \ + _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ + (void *) (_r), (_b)); \ + break; \ + } \ + _cmp; \ +}) + +#if 1 +static __always_inline int bkey_cmp(struct bpos l, struct bpos r) +{ + if (l.inode != r.inode) + return l.inode < r.inode ? -1 : 1; + if (l.offset != r.offset) + return l.offset < r.offset ? -1 : 1; + if (l.snapshot != r.snapshot) + return l.snapshot < r.snapshot ? -1 : 1; + return 0; +} +#else +int bkey_cmp(struct bpos l, struct bpos r); +#endif + +static inline struct bpos bpos_min(struct bpos l, struct bpos r) +{ + return bkey_cmp(l, r) < 0 ? l : r; +} + +void bch2_bpos_swab(struct bpos *); +void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); + +static __always_inline int bversion_cmp(struct bversion l, struct bversion r) +{ + if (l.hi != r.hi) + return l.hi < r.hi ? -1 : 1; + if (l.lo != r.lo) + return l.lo < r.lo ? -1 : 1; + return 0; +} + +#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) + +static __always_inline int bversion_zero(struct bversion v) +{ + return !bversion_cmp(v, ZERO_VERSION); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +/* statement expressions confusing unlikely()? */ +#define bkey_packed(_k) \ + ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ + (_k)->format != KEY_FORMAT_CURRENT; }) +#else +#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) +#endif + +/* + * It's safe to treat an unpacked bkey as a packed one, but not the reverse + */ +static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) +{ + return (struct bkey_packed *) k; +} + +static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) +{ + return (const struct bkey_packed *) k; +} + +static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) +{ + return bkey_packed(k) ? NULL : (struct bkey_i *) k; +} + +static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) +{ + return bkey_packed(k) ? 
NULL : (const struct bkey *) k; +} + +static inline unsigned bkey_format_key_bits(const struct bkey_format *format) +{ + return format->bits_per_field[BKEY_FIELD_INODE] + + format->bits_per_field[BKEY_FIELD_OFFSET] + + format->bits_per_field[BKEY_FIELD_SNAPSHOT]; +} + +static inline struct bpos bkey_successor(struct bpos p) +{ + struct bpos ret = p; + + if (!++ret.offset) + BUG_ON(!++ret.inode); + + return ret; +} + +static inline u64 bkey_start_offset(const struct bkey *k) +{ + return k->p.offset - k->size; +} + +static inline struct bpos bkey_start_pos(const struct bkey *k) +{ + return (struct bpos) { + .inode = k->p.inode, + .offset = bkey_start_offset(k), + .snapshot = k->p.snapshot, + }; +} + +/* Packed helpers */ + +static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; + + EBUG_ON(k->u64s < ret); + return ret; +} + +static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_key_u64s(format, k) * sizeof(u64); +} + +static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return k->u64s - bkeyp_key_u64s(format, k); +} + +static inline size_t bkeyp_val_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_val_u64s(format, k) * sizeof(u64); +} + +static inline void set_bkeyp_val_u64s(const struct bkey_format *format, + struct bkey_packed *k, unsigned val_u64s) +{ + k->u64s = bkeyp_key_u64s(format, k) + val_u64s; +} + +#define bkeyp_val(_format, _k) \ + ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) + +extern const struct bkey_format bch2_bkey_format_current; + +bool bch2_bkey_transform(const struct bkey_format *, + struct bkey_packed *, + const struct bkey_format *, + const struct bkey_packed *); + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, + const struct bkey_packed *); + +#ifndef HAVE_BCACHE_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *, + const struct bkey_packed *); +#endif + +bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, + const struct bkey_format *); + +enum bkey_pack_pos_ret { + BKEY_PACK_POS_EXACT, + BKEY_PACK_POS_SMALLER, + BKEY_PACK_POS_FAIL, +}; + +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, + const struct btree *); + +static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, + const struct btree *b) +{ + return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; +} + +void bch2_bkey_unpack(const struct btree *, struct bkey_i *, + const struct bkey_packed *); +bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, + const struct bkey_format *); + +static inline u64 bkey_field_max(const struct bkey_format *f, + enum bch_bkey_fields nr) +{ + return f->bits_per_field[nr] < 64 + ? 
f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr]) + : U64_MAX; +} + +#ifdef CONFIG_X86_64 +#define HAVE_BCACHE_COMPILED_UNPACK 1 + +int bch2_compile_bkey_format(const struct bkey_format *, void *); + +#else + +static inline int bch2_compile_bkey_format(const struct bkey_format *format, + void *out) { return 0; } + +#endif + +static inline void bkey_reassemble(struct bkey_i *dst, + struct bkey_s_c src) +{ + BUG_ON(bkey_packed(src.k)); + dst->k = *src.k; + memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k)); +} + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + _assert(k->k.type, nr); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + _assert(k->k.type, nr); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + _assert(k.k->type, nr); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + _assert(k.k->type, nr); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + _assert(k->k.type, nr); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ 
\ + _assert(k->k.type, nr); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bch_##name * \ +bkey_p_##name##_val(const struct bkey_format *f, \ + struct bkey_packed *k) \ +{ \ + return container_of(bkeyp_val(f, k), struct bch_##name, v); \ +} \ + \ +static inline const struct bch_##name * \ +bkey_p_c_##name##_val(const struct bkey_format *f, \ + const struct bkey_packed *k) \ +{ \ + return container_of(bkeyp_val(f, k), struct bch_##name, v); \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = nr; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr) + +#define BKEY_VAL_ACCESSORS(name, _nr) \ + static inline void __bch_##name##_assert(u8 type, u8 nr) \ + { \ + EBUG_ON(type != _nr); \ + } \ + \ + __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert) + +BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE); + +static inline void __bch2_extent_assert(u8 type, u8 nr) +{ + EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED); +} + +__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert); +BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION); + +BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); +BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); + +BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); + +BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); + +/* byte order helpers */ + +#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) +#error edit for your odd byteorder. +#endif + +#ifdef __LITTLE_ENDIAN + +#define high_bit_offset 0 +#define __high_word(u64s, k) ((k)->_data + (u64s) - 1) +#define nth_word(p, n) ((p) - (n)) + +#else + +#define high_bit_offset KEY_PACKED_BITS_START +#define __high_word(u64s, k) ((k)->_data) +#define nth_word(p, n) ((p) + (n)) + +#endif + +#define high_word(format, k) __high_word((format)->key_u64s, k) +#define next_word(p) nth_word(p, 1) +#define prev_word(p) nth_word(p, -1) + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void); +#else +static inline void bch2_bkey_pack_test(void) {} +#endif + +#endif /* _BCACHE_BKEY_H */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c new file mode 100644 index 00000000..51a13fca --- /dev/null +++ b/libbcachefs/bkey_methods.c @@ -0,0 +1,127 @@ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_types.h" +#include "dirent.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "xattr.h" + +const struct bkey_ops *bch2_bkey_ops[] = { + [BKEY_TYPE_EXTENTS] = &bch2_bkey_extent_ops, + [BKEY_TYPE_INODES] = &bch2_bkey_inode_ops, + [BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops, + [BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops, + [BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops, +}; + +/* Returns string indicating reason for being invalid, or NULL if valid: */ +const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + const struct bkey_ops *ops = bch2_bkey_ops[type]; + + if (k.k->u64s < BKEY_U64s) + return "u64s too small"; + + if (k.k->size && + (bkey_deleted(k.k) || !ops->is_extents)) + return "nonzero size field"; + + switch (k.k->type) { + case KEY_TYPE_DELETED: + case KEY_TYPE_DISCARD: + return NULL; + + case KEY_TYPE_ERROR: + return bkey_val_bytes(k.k) != 0 + ? 
"value size should be zero" + : NULL; + + case KEY_TYPE_COOKIE: + return bkey_val_bytes(k.k) != sizeof(struct bch_cookie) + ? "incorrect value size" + : NULL; + + default: + if (k.k->type < KEY_TYPE_GENERIC_NR) + return "invalid type"; + + return ops->key_invalid(c, k); + } +} + +const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) +{ + if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) + return "key before start of btree node"; + + if (bkey_cmp(k.k->p, b->data->max_key) > 0) + return "key past end of btree node"; + + if (k.k->p.snapshot) + return "nonzero snapshot"; + + return bch2_bkey_invalid(c, btree_node_type(b), k); +} + +void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) +{ + enum bkey_type type = btree_node_type(b); + const struct bkey_ops *ops = bch2_bkey_ops[type]; + const char *invalid; + + BUG_ON(!k.k->u64s); + + invalid = bch2_btree_bkey_invalid(c, b, k); + if (invalid) { + char buf[160]; + + bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k); + bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); + return; + } + + if (k.k->type >= KEY_TYPE_GENERIC_NR && + ops->key_debugcheck) + ops->key_debugcheck(c, b, k); +} + +void bch2_val_to_text(struct bch_fs *c, enum bkey_type type, + char *buf, size_t size, struct bkey_s_c k) +{ + const struct bkey_ops *ops = bch2_bkey_ops[type]; + + if (k.k->type >= KEY_TYPE_GENERIC_NR && + ops->val_to_text) + ops->val_to_text(c, buf, size, k); +} + +void bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type, + char *buf, size_t size, struct bkey_s_c k) +{ + const struct bkey_ops *ops = bch2_bkey_ops[type]; + char *out = buf, *end = buf + size; + + out += bch2_bkey_to_text(out, end - out, k.k); + + if (k.k->type >= KEY_TYPE_GENERIC_NR && + ops->val_to_text) { + out += scnprintf(out, end - out, " -> "); + ops->val_to_text(c, out, end - out, k); + } +} + +void bch2_bkey_swab(enum bkey_type type, + const struct bkey_format *f, + struct bkey_packed *k) +{ + const struct bkey_ops *ops = bch2_bkey_ops[type]; + + bch2_bkey_swab_key(f, k); + + if (ops->swab) + ops->swab(f, k); +} diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h new file mode 100644 index 00000000..d372fa61 --- /dev/null +++ b/libbcachefs/bkey_methods.h @@ -0,0 +1,82 @@ +#ifndef _BCACHE_BKEY_METHODS_H +#define _BCACHE_BKEY_METHODS_H + +#include "bkey.h" + +#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val, + +enum bkey_type { + DEFINE_BCH_BTREE_IDS() + BKEY_TYPE_BTREE, +}; + +/* Type of a key in btree @id at level @level: */ +static inline enum bkey_type bkey_type(unsigned level, enum btree_id id) +{ + return level ? 
BKEY_TYPE_BTREE : id;
+}
+
+static inline bool btree_type_has_ptrs(enum bkey_type type)
+{
+	switch (type) {
+	case BKEY_TYPE_BTREE:
+	case BKEY_TYPE_EXTENTS:
+		return true;
+	default:
+		return false;
+	}
+}
+
+struct bch_fs;
+struct btree;
+struct bkey;
+
+enum merge_result {
+	BCH_MERGE_NOMERGE,
+
+	/*
+	 * The keys were mergeable, but would have overflowed size - so instead
+	 * l was changed to the maximum size, and both keys were modified:
+	 */
+	BCH_MERGE_PARTIAL,
+	BCH_MERGE_MERGE,
+};
+
+typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
+			      struct bkey_s);
+typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
+					  struct btree *,
+					  struct bkey_i *, struct bkey_i *);
+
+struct bkey_ops {
+	/* Returns reason for being invalid if invalid, else NULL: */
+	const char *	(*key_invalid)(const struct bch_fs *,
+				       struct bkey_s_c);
+	void		(*key_debugcheck)(struct bch_fs *, struct btree *,
+					  struct bkey_s_c);
+	void		(*val_to_text)(struct bch_fs *, char *,
+				       size_t, struct bkey_s_c);
+	void		(*swab)(const struct bkey_format *, struct bkey_packed *);
+	key_filter_fn	key_normalize;
+	key_merge_fn	key_merge;
+	bool		is_extents;
+};
+
+const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
+const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
+				    struct bkey_s_c);
+
+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+void bch2_val_to_text(struct bch_fs *, enum bkey_type,
+		      char *, size_t, struct bkey_s_c);
+void bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+			   char *, size_t, struct bkey_s_c);
+
+void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
+		    struct bkey_packed *);
+
+extern const struct bkey_ops *bch2_bkey_ops[];
+
+#undef DEF_BTREE_ID
+
+#endif /* _BCACHE_BKEY_METHODS_H */
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
new file mode 100644
index 00000000..280dcf3e
--- /dev/null
+++ b/libbcachefs/bset.c
@@ -0,0 +1,1843 @@
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "bset.h"
+#include "eytzinger.h"
+#include "util.h"
+
+#include <asm/unaligned.h>
+#include <linux/dynamic_fault.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+/* hack.. */
+#include "alloc_types.h"
+#include <trace/events/bcachefs.h>
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+	struct bset_tree *t;
+
+	for_each_bset(b, t)
+		if (k >= btree_bkey_first(b, t) &&
+		    k < btree_bkey_last(b, t))
+			return t;
+
+	BUG();
+}
+
+/*
+ * There are never duplicate live keys in the btree - but including keys that
+ * have been flagged as deleted (and will be cleaned up later) we _will_ see
+ * duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
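+ *
+ * An illustrative sketch of that ordering - example_bset_sort_cmp() is not
+ * the real sort comparison function, just a restatement of the above:
+ */
+
+static inline int example_bset_sort_cmp(struct btree *b,
+					struct bkey_packed *l,
+					struct bkey_packed *r)
+{
+	int cmp = bkey_cmp_packed(b, l, r);
+
+	/* on ties, deleted keys sort before the (at most one) live key: */
+	return cmp ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
+}
+
+/*
+ * (see above for why duplicates exist at all)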
+ */ + +void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) +{ + struct bkey_packed *_k, *_n; + struct bkey k, n; + char buf[120]; + + if (!i->u64s) + return; + + for (_k = i->start, k = bkey_unpack_key(b, _k); + _k < vstruct_last(i); + _k = _n, k = n) { + _n = bkey_next(_k); + + bch2_bkey_to_text(buf, sizeof(buf), &k); + printk(KERN_ERR "block %u key %zi/%u: %s\n", set, + _k->_data - i->_data, i->u64s, buf); + + if (_n == vstruct_last(i)) + continue; + + n = bkey_unpack_key(b, _n); + + if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) { + printk(KERN_ERR "Key skipped backwards\n"); + continue; + } + + /* + * Weird check for duplicate non extent keys: extents are + * deleted iff they have 0 size, so if it has zero size and it's + * not deleted these aren't extents: + */ + if (((!k.size && !bkey_deleted(&k)) || + (!n.size && !bkey_deleted(&n))) && + !bkey_deleted(&k) && + !bkey_cmp(n.p, k.p)) + printk(KERN_ERR "Duplicate keys\n"); + } +} + +void bch2_dump_btree_node(struct btree *b) +{ + struct bset_tree *t; + + console_lock(); + for_each_bset(b, t) + bch2_dump_bset(b, bset(b, t), t - b->set); + console_unlock(); +} + +void bch2_dump_btree_node_iter(struct btree *b, + struct btree_node_iter *iter) +{ + struct btree_node_iter_set *set; + + printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets); + + btree_node_iter_for_each(iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk = bkey_unpack_key(b, k); + char buf[100]; + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, + k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +static bool keys_out_of_order(struct btree *b, + const struct bkey_packed *prev, + const struct bkey_packed *next, + bool is_extents) +{ + struct bkey nextu = bkey_unpack_key(b, next); + + return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 || + ((is_extents + ? 
!bkey_deleted(next) + : !bkey_deleted(prev)) && + !bkey_cmp_packed(b, prev, next)); +} + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct bset_tree *t; + struct bkey_packed *k; + struct btree_nr_keys nr = { 0 }; + + for_each_bset(b, t) + for (k = btree_bkey_first(b, t); + k != btree_bkey_last(b, t); + k = bkey_next(k)) + if (!bkey_whiteout(k)) + btree_keys_account_key_add(&nr, t - b->set, k); + + BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); +} + +static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + struct btree *b, + struct bkey_packed *k) +{ + const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b); + + bkey_unpack_key(b, k); + + if (n && + keys_out_of_order(b, k, n, iter->is_extents)) { + struct bkey ku = bkey_unpack_key(b, k); + struct bkey nu = bkey_unpack_key(b, n); + char buf1[80], buf2[80]; + + bch2_dump_btree_node(b); + bch2_bkey_to_text(buf1, sizeof(buf1), &ku); + bch2_bkey_to_text(buf2, sizeof(buf2), &nu); + panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2); + } +} + +void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) +{ + struct btree_node_iter_set *set; + struct bset_tree *t; + struct bkey_packed *k, *first; + + BUG_ON(iter->used > MAX_BSETS); + + if (!iter->used) + return; + + btree_node_iter_for_each(iter, set) { + k = __btree_node_offset_to_key(b, set->k); + t = bch2_bkey_to_bset(b, k); + + BUG_ON(__btree_node_offset_to_key(b, set->end) != + btree_bkey_last(b, t)); + + BUG_ON(set + 1 < iter->data + iter->used && + btree_node_iter_cmp(iter, b, set[0], set[1]) > 0); + } + + first = __btree_node_offset_to_key(b, iter->data[0].k); + + for_each_bset(b, t) + if (bch2_btree_node_iter_bset_pos(iter, b, t) == + btree_bkey_last(b, t) && + (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)))) + BUG_ON(__btree_node_iter_cmp(iter->is_extents, b, + k, first) > 0); +} + +void bch2_verify_key_order(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *k, *prev; + struct bkey uk, uw = bkey_unpack_key(b, where); + + k = bch2_bkey_prev_all(b, t, where); + if (k && + keys_out_of_order(b, k, where, iter->is_extents)) { + char buf1[100], buf2[100]; + + bch2_dump_btree_node(b); + uk = bkey_unpack_key(b, k); + bch2_bkey_to_text(buf1, sizeof(buf1), &uk); + bch2_bkey_to_text(buf2, sizeof(buf2), &uw); + panic("out of order with prev:\n%s\n%s\n", + buf1, buf2); + } + + k = bkey_next(where); + BUG_ON(k != btree_bkey_last(b, t) && + keys_out_of_order(b, where, k, iter->is_extents)); + + for_each_bset(b, t) { + if (where >= btree_bkey_first(b, t) || + where < btree_bkey_last(b, t)) + continue; + + k = bch2_btree_node_iter_bset_pos(iter, b, t); + + if (k == btree_bkey_last(b, t)) + k = bch2_bkey_prev_all(b, t, k); + + while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 && + (prev = bch2_bkey_prev_all(b, t, k))) + k = prev; + + for (; + k != btree_bkey_last(b, t); + k = bkey_next(k)) { + uk = bkey_unpack_key(b, k); + + if (iter->is_extents) { + BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 || + bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0)); + } else { + BUG_ON(!bkey_cmp(uw.p, uk.p) && + !bkey_deleted(&uk)); + } + + if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0) + break; + } + } +} + +#else + +static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + struct btree *b, + struct bkey_packed *k) {} + +#endif + +/* Auxiliary search trees */ + +#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) +#define 
BFLOAT_FAILED_PREV	(U8_MAX - 1)
+#define BFLOAT_FAILED_OVERFLOW	(U8_MAX - 2)
+#define BFLOAT_FAILED		(U8_MAX - 2)
+
+#define KEY_WORDS		BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+
+struct bkey_float {
+	u8		exponent;
+	u8		key_offset;
+	union {
+		u32	mantissa32;
+		struct {
+			u16	mantissa16;
+			u16	_pad;
+		};
+	};
+} __packed;
+
+#define BFLOAT_32BIT_NR		32U
+
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+	int d = (idx - BFLOAT_32BIT_NR) << 1;
+
+	d &= ~(d >> 31);
+
+	return idx * 6 - d;
+}
+
+struct ro_aux_tree {
+	struct bkey_float	_d[0];
+};
+
+struct rw_aux_tree {
+	u16		offset;
+	struct bpos	k;
+};
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree
+ * we have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE	128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree *b)
+{
+	return PAGE_SIZE << b->page_order;
+}
+
+static inline size_t btree_keys_cachelines(struct btree *b)
+{
+	return btree_keys_bytes(b) / BSET_CACHELINE;
+}
+
+static inline size_t btree_aux_data_bytes(struct btree *b)
+{
+	return btree_keys_cachelines(b) * 8;
+}
+
+static inline size_t btree_aux_data_u64s(struct btree *b)
+{
+	return btree_aux_data_bytes(b) / sizeof(u64);
+}
+
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{
+	BUG_ON(t->aux_data_offset == U16_MAX);
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_NO_AUX_TREE:
+		return t->aux_data_offset;
+	case BSET_RO_AUX_TREE:
+		return t->aux_data_offset +
+			DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
+				     sizeof(u8) * t->size, 8);
+	case BSET_RW_AUX_TREE:
+		return t->aux_data_offset +
+			DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
+	default:
+		BUG();
+	}
+}
+
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+					const struct bset_tree *t)
+{
+	return t == b->set
+		? 
DIV_ROUND_UP(b->unpack_fn_len, 8) + : bset_aux_tree_buf_end(t - 1); +} + +static void *__aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + return b->aux_data + t->aux_data_offset * 8; +} + +static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t); +} + +static u8 *ro_aux_tree_prev(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); +} + +static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, + unsigned idx) +{ + return (void *) b + bkey_float_byte_offset(idx); +} + +static struct bkey_float *bkey_float(const struct btree *b, + const struct bset_tree *t, + unsigned idx) +{ + return bkey_float_get(ro_aux_tree_base(b, t), idx); +} + +static void bset_aux_tree_verify(struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bset_tree *t; + + for_each_bset(b, t) { + if (t->aux_data_offset == U16_MAX) + continue; + + BUG_ON(t != b->set && + t[-1].aux_data_offset == U16_MAX); + + BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); + BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); + BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); + } +#endif +} + +/* Memory allocation */ + +void bch2_btree_keys_free(struct btree *b) +{ + vfree(b->aux_data); + b->aux_data = NULL; +} + +int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) +{ + b->page_order = page_order; + b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, + PAGE_KERNEL_EXEC); + if (!b->aux_data) + return -ENOMEM; + + return 0; +} + +void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) +{ + unsigned i; + + b->nsets = 0; + memset(&b->nr, 0, sizeof(b->nr)); +#ifdef CONFIG_BCACHEFS_DEBUG + b->expensive_debug_checks = expensive_debug_checks; +#endif + for (i = 0; i < MAX_BSETS; i++) + b->set[i].data_offset = U16_MAX; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +/* Binary tree stuff for auxiliary search trees */ + +/* + * Cacheline/offset <-> bkey pointer arithmetic: + * + * t->tree is a binary search tree in an array; each node corresponds to a key + * in one cacheline in t->set (BSET_CACHELINE bytes). + * + * This means we don't have to store the full index of the key that a node in + * the binary tree points to; eytzinger_to_inorder() gives us the cacheline, and + * then bkey_float->m gives us the offset within that cacheline, in units of 8 + * bytes. + * + * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to + * make this work. + * + * To construct the bfloat for an arbitrary key we need to know what the key + * immediately preceding it is: we have to check if the two keys differ in the + * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size + * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
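+ *
+ * As a hedged sketch of that arithmetic (illustrative only, not part of the
+ * original patch - note that in this file the offset field is
+ * bkey_float->key_offset, and t->tree/t->prev live in the node's aux data,
+ * accessed via bkey_float() and ro_aux_tree_prev()):
+ *
+ *	inorder	= __eytzinger_to_inorder(j, t->size, t->extra);
+ *	k	= bset_cacheline(b, t, inorder)
+ *		  + bkey_float(b, t, j)->key_offset * 8;
+ *	prev	= (void *) k - ro_aux_tree_prev(b, t)[j] * 8;
+ *
+ * which is what tree_to_bkey() and tree_to_prev_bkey() below implement.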
+ */ + +static inline void *bset_cacheline(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline) +{ + return (void *) round_down((unsigned long) btree_bkey_first(b, t), + L1_CACHE_BYTES) + + cacheline * BSET_CACHELINE; +} + +static struct bkey_packed *cacheline_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + unsigned offset) +{ + return bset_cacheline(b, t, cacheline) + offset * 8; +} + +static unsigned bkey_to_cacheline(const struct btree *b, + const struct bset_tree *t, + const struct bkey_packed *k) +{ + return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; +} + +static ssize_t __bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); +} + +static unsigned bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); + + EBUG_ON(m > U8_MAX); + return m; +} + +static inline struct bkey_packed *tree_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + return cacheline_to_bkey(b, t, + __eytzinger_to_inorder(j, t->size, t->extra), + bkey_float(b, t, j)->key_offset); +} + +static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; + + return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); +} + +static struct rw_aux_tree *rw_aux_tree(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + + return __aux_tree_base(b, t); +} + +/* + * For the write set - the one we're currently inserting keys into - we don't + * maintain a full search tree, we just keep a simple lookup table in t->prev. 
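+ *
+ * (In this version the table actually lives in the node's aux data and is
+ * accessed via rw_aux_tree(), rather than a separate t->prev array.) A
+ * hedged sketch of a lookup by node offset, using the helpers just below:
+ *
+ *	unsigned j = rw_aux_tree_bsearch(b, t, offset);
+ *	struct bkey_packed *k =
+ *		__btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
+ *
+ * i.e. a binary search over a flat array of (offset, unpacked pos) pairs,
+ * followed by a short linear scan forward from the key it points at.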
+ */ +static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, + struct bset_tree *t, + unsigned j) +{ + return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); +} + +static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, + unsigned j, struct bkey_packed *k) +{ + BUG_ON(k >= btree_bkey_last(b, t)); + + rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { + .offset = __btree_node_key_to_offset(b, k), + .k = bkey_unpack_pos(b, k), + }; +} + +static void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + struct bkey_packed *k = btree_bkey_first(b, t); + unsigned j = 0; + + if (!btree_keys_expensive_checks(b)) + return; + + BUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + BUG_ON(t->size < 1); + BUG_ON(rw_aux_to_bkey(b, t, j) != k); + + goto start; + while (1) { + if (rw_aux_to_bkey(b, t, j) == k) { + BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, + bkey_unpack_pos(b, k))); +start: + if (++j == t->size) + break; + + BUG_ON(rw_aux_tree(b, t)[j].offset <= + rw_aux_tree(b, t)[j - 1].offset); + } + + k = bkey_next(k); + BUG_ON(k >= btree_bkey_last(b, t)); + } +} + +/* returns idx of first entry >= offset: */ +static unsigned rw_aux_tree_bsearch(struct btree *b, + struct bset_tree *t, + unsigned offset) +{ + unsigned l = 0, r = t->size; + + BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + + while (l < r) { + unsigned m = (l + r) >> 1; + + if (rw_aux_tree(b, t)[m].offset < offset) + l = m + 1; + else + r = m; + } + + BUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset < offset); + BUG_ON(l && + rw_aux_tree(b, t)[l - 1].offset >= offset); + + BUG_ON(l > r); + BUG_ON(l > t->size); + + return l; +} + +static inline unsigned bfloat_mantissa(const struct bkey_float *f, + unsigned idx) +{ + return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; +} + +static inline void bfloat_mantissa_set(struct bkey_float *f, + unsigned idx, unsigned mantissa) +{ + if (idx < BFLOAT_32BIT_NR) + f->mantissa32 = mantissa; + else + f->mantissa16 = mantissa; +} + +static inline unsigned bkey_mantissa(const struct bkey_packed *k, + const struct bkey_float *f, + unsigned idx) +{ + u64 v; + + EBUG_ON(!bkey_packed(k)); + + v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); + + /* + * In little endian, we're shifting off low bits (and then the bits we + * want are at the low end), in big endian we're shifting off high bits + * (and then the bits we want are at the high end, so we shift them + * back down): + */ +#ifdef __LITTLE_ENDIAN + v >>= f->exponent & 7; +#else + v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); +#endif + return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; +} + +static void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); + struct bkey_packed *p = tree_to_prev_bkey(b, t, j); + struct bkey_packed *l, *r; + unsigned bits = j < BFLOAT_32BIT_NR ? 
32 : 16; + unsigned mantissa; + int shift, exponent; + + EBUG_ON(bkey_next(p) != m); + + if (is_power_of_2(j)) { + l = min_key; + + if (!l->u64s) { + if (!bkey_pack_pos(l, b->data->min_key, b)) { + struct bkey_i tmp; + + bkey_init(&tmp.k); + tmp.k.p = b->data->min_key; + bkey_copy(l, &tmp); + } + } + } else { + l = tree_to_prev_bkey(b, t, j >> ffs(j)); + + EBUG_ON(m < l); + } + + if (is_power_of_2(j + 1)) { + r = max_key; + + if (!r->u64s) { + if (!bkey_pack_pos(r, t->max_key, b)) { + struct bkey_i tmp; + + bkey_init(&tmp.k); + tmp.k.p = t->max_key; + bkey_copy(r, &tmp); + } + } + } else { + r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); + + EBUG_ON(m > r); + } + + /* + * for failed bfloats, the lookup code falls back to comparing against + * the original key. + */ + + if (!bkey_packed(l) || !bkey_packed(r) || + !bkey_packed(p) || !bkey_packed(m)) { + f->exponent = BFLOAT_FAILED_UNPACKED; + return; + } + + /* + * The greatest differing bit of l and r is the first bit we must + * include in the bfloat mantissa we're creating in order to do + * comparisons - that bit always becomes the high bit of + * bfloat->mantissa, and thus the exponent we're calculating here is + * the position of what will become the low bit in bfloat->mantissa: + * + * Note that this may be negative - we may be running off the low end + * of the key: we handle this later: + */ + exponent = (int) bch2_bkey_greatest_differing_bit(b, l, r) - (bits - 1); + + /* + * Then we calculate the actual shift value, from the start of the key + * (k->_data), to get the key bits starting at exponent: + */ +#ifdef __LITTLE_ENDIAN + shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; + + EBUG_ON(shift + bits > b->format.key_u64s * 64); +#else + shift = high_bit_offset + + b->nr_key_bits - + exponent - + bits; + + EBUG_ON(shift < KEY_PACKED_BITS_START); +#endif + EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); + + f->exponent = shift; + mantissa = bkey_mantissa(m, f, j); + + /* + * If we've got garbage bits, set them to all 1s - it's legal for the + * bfloat to compare larger than the original key, but not smaller: + */ + if (exponent < 0) + mantissa |= ~(~0U << -exponent); + + bfloat_mantissa_set(f, j, mantissa); + + /* + * The bfloat must be able to tell its key apart from the previous key - + * if its key and the previous key don't differ in the required bits, + * flag as failed - unless the keys are actually equal, in which case + * we aren't required to return a specific one: + */ + if (exponent > 0 && + bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && + bkey_cmp_packed(b, p, m)) { + f->exponent = BFLOAT_FAILED_PREV; + return; + } + + /* + * f->mantissa must compare >= the original key - for transitivity with + * the comparison in bset_search_tree. If we're dropping set bits, + * increment it: + */ + if (exponent > (int) bch2_bkey_ffs(b, m)) { + if (j < BFLOAT_32BIT_NR + ? 
f->mantissa32 == U32_MAX + : f->mantissa16 == U16_MAX) + f->exponent = BFLOAT_FAILED_OVERFLOW; + + if (j < BFLOAT_32BIT_NR) + f->mantissa32++; + else + f->mantissa16++; + } +} + +/* bytes remaining - only valid for last bset: */ +static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) +{ + bset_aux_tree_verify(b); + + return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); +} + +static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) +{ + unsigned bytes = __bset_tree_capacity(b, t); + + if (bytes < 7 * BFLOAT_32BIT_NR) + return bytes / 7; + + bytes -= 7 * BFLOAT_32BIT_NR; + + return BFLOAT_32BIT_NR + bytes / 5; +} + +static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); +} + +static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *k; + + t->size = 1; + t->extra = BSET_RW_AUX_TREE_VAL; + rw_aux_tree(b, t)[0].offset = + __btree_node_key_to_offset(b, btree_bkey_first(b, t)); + + for (k = btree_bkey_first(b, t); + k != btree_bkey_last(b, t); + k = bkey_next(k)) { + if (t->size == bset_rw_tree_capacity(b, t)) + break; + + if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > + L1_CACHE_BYTES) + rw_aux_tree_set(b, t, t->size++, k); + } +} + +static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); + struct bkey_packed min_key, max_key; + unsigned j, cacheline = 1; + + /* signal to make_bfloat() that they're uninitialized: */ + min_key.u64s = max_key.u64s = 0; + + t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), + bset_ro_tree_capacity(b, t)); +retry: + if (t->size < 2) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + eytzinger_for_each(j, t->size) { + while (bkey_to_cacheline(b, t, k) < cacheline) + prev = k, k = bkey_next(k); + + if (k >= btree_bkey_last(b, t)) { + t->size--; + goto retry; + } + + ro_aux_tree_prev(b, t)[j] = prev->u64s; + bkey_float(b, t, j)->key_offset = + bkey_to_cacheline_offset(b, t, cacheline++, k); + + BUG_ON(tree_to_prev_bkey(b, t, j) != prev); + BUG_ON(tree_to_bkey(b, t, j) != k); + } + + while (bkey_next(k) != btree_bkey_last(b, t)) + k = bkey_next(k); + + t->max_key = bkey_unpack_pos(b, k); + + /* Then we build the tree */ + eytzinger_for_each(j, t->size) + make_bfloat(b, t, j, &min_key, &max_key); +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +{ + struct bset_tree *i; + + for (i = b->set; i != t; i++) + BUG_ON(bset_has_rw_aux_tree(i)); + + bch2_bset_set_no_aux_tree(b, t); + + /* round up to next cacheline: */ + t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), + SMP_CACHE_BYTES / sizeof(u64)); + + bset_aux_tree_verify(b); +} + +void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, + bool writeable) +{ + if (writeable + ? 
bset_has_rw_aux_tree(t) + : bset_has_ro_aux_tree(t)) + return; + + bset_alloc_tree(b, t); + + if (!__bset_tree_capacity(b, t)) + return; + + if (writeable) + __build_rw_aux_tree(b, t); + else + __build_ro_aux_tree(b, t); + + bset_aux_tree_verify(b); +} + +void bch2_bset_init_first(struct btree *b, struct bset *i) +{ + struct bset_tree *t; + + BUG_ON(b->nsets); + + memset(i, 0, sizeof(*i)); + get_random_bytes(&i->seq, sizeof(i->seq)); + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +void bch2_bset_init_next(struct btree *b, struct bset *i) +{ + struct bset_tree *t; + + BUG_ON(b->nsets >= MAX_BSETS); + + memset(i, 0, sizeof(*i)); + i->seq = btree_bset_first(b)->seq; + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + struct bkey_packed *p; + unsigned offset; + int j; + + EBUG_ON(k < btree_bkey_first(b, t) || + k > btree_bkey_last(b, t)); + + if (k == btree_bkey_first(b, t)) + return NULL; + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + p = btree_bkey_first(b, t); + break; + case BSET_RO_AUX_TREE: + j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); + + do { + p = j ? tree_to_bkey(b, t, + __inorder_to_eytzinger(j--, + t->size, t->extra)) + : btree_bkey_first(b, t); + } while (p >= k); + break; + case BSET_RW_AUX_TREE: + offset = __btree_node_key_to_offset(b, k); + j = rw_aux_tree_bsearch(b, t, offset); + p = j ? rw_aux_to_bkey(b, t, j - 1) + : btree_bkey_first(b, t); + break; + } + + return p; +} + +struct bkey_packed *bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + struct bkey_packed *p; + + p = __bkey_prev(b, t, k); + if (!p) + return NULL; + + while (bkey_next(p) != k) + p = bkey_next(p); + + return p; +} + +struct bkey_packed *bch2_bkey_prev(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + while (1) { + struct bkey_packed *p, *i, *ret = NULL; + + p = __bkey_prev(b, t, k); + if (!p) + return NULL; + + for (i = p; i != k; i = bkey_next(i)) + if (!bkey_deleted(i)) + ret = i; + + if (ret) + return ret; + + k = p; + } +} + +/* Insert */ + +static void rw_aux_tree_fix_invalidated_key(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + unsigned offset = __btree_node_key_to_offset(b, k); + unsigned j = rw_aux_tree_bsearch(b, t, offset); + + if (j < t->size && + rw_aux_tree(b, t)[j].offset == offset) + rw_aux_tree_set(b, t, j, k); + + bch2_bset_verify_rw_aux_tree(b, t); +} + +static void ro_aux_tree_fix_invalidated_key(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + struct bkey_packed min_key, max_key; + unsigned inorder, j; + + BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + /* signal to make_bfloat() that they're uninitialized: */ + min_key.u64s = max_key.u64s = 0; + + if (bkey_next(k) == btree_bkey_last(b, t)) { + t->max_key = bkey_unpack_pos(b, k); + + for (j = 1; j < t->size; j = j * 2 + 1) + make_bfloat(b, t, j, &min_key, &max_key); + } + + inorder = bkey_to_cacheline(b, t, k); + + if (inorder && + inorder < t->size) { + j = __inorder_to_eytzinger(inorder, t->size, t->extra); + + if (k == tree_to_bkey(b, t, j)) { + /* Fix the node this key corresponds to */ + make_bfloat(b, t, j, &min_key, &max_key); + + /* Children for which this key is the right boundary */ + for (j = eytzinger_left_child(j); + j < t->size; + j = eytzinger_right_child(j)) + 
make_bfloat(b, t, j, &min_key, &max_key); + } + } + + if (inorder + 1 < t->size) { + j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra); + + if (k == tree_to_prev_bkey(b, t, j)) { + make_bfloat(b, t, j, &min_key, &max_key); + + /* Children for which this key is the left boundary */ + for (j = eytzinger_right_child(j); + j < t->size; + j = eytzinger_left_child(j)) + make_bfloat(b, t, j, &min_key, &max_key); + } + } +} + +/** + * bch2_bset_fix_invalidated_key() - given an existing key @k that has been + * modified, fix any auxiliary search tree by remaking all the nodes in the + * auxiliary search tree that @k corresponds to + */ +void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + break; + case BSET_RO_AUX_TREE: + ro_aux_tree_fix_invalidated_key(b, t, k); + break; + case BSET_RW_AUX_TREE: + rw_aux_tree_fix_invalidated_key(b, t, k); + break; + } +} + +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + int shift = new_u64s - clobber_u64s; + unsigned l, j, where = __btree_node_key_to_offset(b, _where); + + BUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + l = rw_aux_tree_bsearch(b, t, where); + + /* l is first >= than @where */ + + BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); + BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); + + if (!l) /* never delete first entry */ + l++; + else if (l < t->size && + where < t->end_offset && + rw_aux_tree(b, t)[l].offset == where) + rw_aux_tree_set(b, t, l++, _where); + + /* l now > where */ + + for (j = l; + j < t->size && + rw_aux_tree(b, t)[j].offset < where + clobber_u64s; + j++) + ; + + if (j < t->size && + rw_aux_tree(b, t)[j].offset + shift == + rw_aux_tree(b, t)[l - 1].offset) + j++; + + memmove(&rw_aux_tree(b, t)[l], + &rw_aux_tree(b, t)[j], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[j]); + t->size -= j - l; + + for (j = l; j < t->size; j++) + rw_aux_tree(b, t)[j].offset += shift; + + BUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset == + rw_aux_tree(b, t)[l - 1].offset); + + if (t->size < bset_rw_tree_capacity(b, t) && + (l < t->size + ? rw_aux_tree(b, t)[l].offset + : t->end_offset) - + rw_aux_tree(b, t)[l - 1].offset > + L1_CACHE_BYTES / sizeof(u64)) { + struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); + struct bkey_packed *end = l < t->size + ? 
rw_aux_to_bkey(b, t, l) + : btree_bkey_last(b, t); + struct bkey_packed *k = start; + + while (1) { + k = bkey_next(k); + if (k == end) + break; + + if ((void *) k - (void *) start >= L1_CACHE_BYTES) { + memmove(&rw_aux_tree(b, t)[l + 1], + &rw_aux_tree(b, t)[l], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[l]); + t->size++; + rw_aux_tree_set(b, t, l, k); + break; + } + } + } + + bch2_bset_verify_rw_aux_tree(b, t); + bset_aux_tree_verify(b); +} + +void bch2_bset_insert(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where, + struct bkey_i *insert, + unsigned clobber_u64s) +{ + struct bkey_format *f = &b->format; + struct bset_tree *t = bset_tree_last(b); + struct bkey_packed packed, *src = bkey_to_packed(insert); + + bch2_bset_verify_rw_aux_tree(b, t); + + if (bch2_bkey_pack_key(&packed, &insert->k, f)) + src = &packed; + + if (!bkey_whiteout(&insert->k)) + btree_keys_account_key_add(&b->nr, t - b->set, src); + + if (src->u64s != clobber_u64s) { + u64 *src_p = where->_data + clobber_u64s; + u64 *dst_p = where->_data + src->u64s; + + BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < + (int) clobber_u64s - src->u64s); + + memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); + set_btree_bset_end(b, t); + } + + memcpy_u64s(where, src, + bkeyp_key_u64s(f, src)); + memcpy_u64s(bkeyp_val(f, where), &insert->v, + bkeyp_val_u64s(f, src)); + + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); + + bch2_verify_key_order(b, iter, where); + bch2_verify_btree_nr_keys(b); +} + +void bch2_bset_delete(struct btree *b, + struct bkey_packed *where, + unsigned clobber_u64s) +{ + struct bset_tree *t = bset_tree_last(b); + u64 *src_p = where->_data + clobber_u64s; + u64 *dst_p = where->_data; + + bch2_bset_verify_rw_aux_tree(b, t); + + BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); + + memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); + set_btree_bset_end(b, t); + + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); +} + +/* Lookup */ + +__flatten +static struct bkey_packed *bset_search_write_set(const struct btree *b, + struct bset_tree *t, + struct bpos search, + const struct bkey_packed *packed_search) +{ + unsigned l = 0, r = t->size; + + while (l + 1 != r) { + unsigned m = (l + r) >> 1; + + if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0) + l = m; + else + r = m; + } + + return rw_aux_to_bkey(b, t, l); +} + +noinline +static int bset_search_tree_slowpath(const struct btree *b, + struct bset_tree *t, struct bpos *search, + const struct bkey_packed *packed_search, + unsigned n) +{ + return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), + packed_search, search) < 0; +} + +__flatten +static struct bkey_packed *bset_search_tree(const struct btree *b, + struct bset_tree *t, + struct bpos search, + const struct bkey_packed *packed_search) +{ + struct ro_aux_tree *base = ro_aux_tree_base(b, t); + struct bkey_float *f = bkey_float_get(base, 1); + void *p; + unsigned inorder, n = 1; + + while (1) { + if (likely(n << 4 < t->size)) { + p = bkey_float_get(base, n << 4); + prefetch(p); + } else if (n << 3 < t->size) { + inorder = __eytzinger_to_inorder(n, t->size, t->extra); + p = bset_cacheline(b, t, inorder); +#ifdef CONFIG_X86_64 + asm(".intel_syntax noprefix;" + "prefetcht0 [%0 - 127 + 64 * 0];" + "prefetcht0 [%0 - 127 + 64 * 1];" + "prefetcht0 [%0 - 127 + 64 * 2];" + "prefetcht0 [%0 - 127 + 64 * 3];" + 
".att_syntax prefix;" + : + : "r" (p + 127)); +#else + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + prefetch(p + L1_CACHE_BYTES * 3); +#endif + } else if (n >= t->size) + break; + + f = bkey_float_get(base, n); + + if (packed_search && + likely(f->exponent < BFLOAT_FAILED)) + n = n * 2 + (bfloat_mantissa(f, n) < + bkey_mantissa(packed_search, f, n)); + else + n = n * 2 + bset_search_tree_slowpath(b, t, + &search, packed_search, n); + } while (n < t->size); + + inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. + */ + if (n & 1) { + return cacheline_to_bkey(b, t, inorder, f->key_offset); + } else { + if (--inorder) { + n = eytzinger_prev(n >> 1, t->size); + f = bkey_float_get(base, n); + return cacheline_to_bkey(b, t, inorder, f->key_offset); + } else + return btree_bkey_first(b, t); + } +} + +/* + * Returns the first key greater than or equal to @search + */ +__always_inline __flatten +static struct bkey_packed *bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search, + bool strictly_greater) +{ + struct bkey_packed *m; + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). 
+ * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + m = btree_bkey_first(b, t); + break; + case BSET_RW_AUX_TREE: + m = bset_search_write_set(b, t, search, lossy_packed_search); + break; + case BSET_RO_AUX_TREE: + /* + * Each node in the auxiliary search tree covers a certain range + * of bits, and keys above and below the set it covers might + * differ outside those bits - so we have to special case the + * start and end - handle that here: + */ + + if (bkey_cmp(search, t->max_key) > 0) + return btree_bkey_last(b, t); + + m = bset_search_tree(b, t, search, lossy_packed_search); + break; + } + + if (lossy_packed_search) + while (m != btree_bkey_last(b, t) && + !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search, + m, strictly_greater)) + m = bkey_next(m); + + if (!packed_search) + while (m != btree_bkey_last(b, t) && + !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater)) + m = bkey_next(m); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && + btree_iter_pos_cmp_p_or_unp(b, search, packed_search, + prev, strictly_greater)); + } + + return m; +} + +/* Btree node iterator */ + +void bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set *pos, n = + ((struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }); + + btree_node_iter_for_each(iter, pos) + if (btree_node_iter_cmp(iter, b, n, *pos) <= 0) + break; + + memmove(pos + 1, pos, + (void *) (iter->data + iter->used) - (void *) pos); + iter->used++; + *pos = n; + } +} + +noinline __flatten __attribute__((cold)) +static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + struct btree *b, struct bpos search, + bool strictly_greater, bool is_extents) +{ + struct bset_tree *t; + + trace_bkey_pack_pos_fail(search); + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + bch2_bset_search(b, t, search, NULL, NULL, + strictly_greater), + btree_bkey_last(b, t)); + + bch2_btree_node_iter_sort(iter, b); +} + +/** + * bch_btree_node_iter_init - initialize a btree node iterator, starting from a + * given position + * + * Main entry point to the lookup code for individual btree nodes: + * + * NOTE: + * + * When you don't filter out deleted keys, btree nodes _do_ contain duplicate + * keys. This doesn't matter for most code, but it does matter for lookups. + * + * Some adjacent keys with a string of equal keys: + * i j k k k k l m + * + * If you search for k, the lookup code isn't guaranteed to return you any + * specific k. The lookup code is conceptually doing a binary search and + * iterating backwards is very expensive so if the pivot happens to land at the + * last k that's what you'll get. + * + * This works out ok, but it's something to be aware of: + * + * - For non extents, we guarantee that the live key comes last - see + * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't + * see will only be deleted keys you don't care about. + * + * - For extents, deleted keys sort last (see the comment at the top of this + * file). 
But when you're searching for extents, you actually want the first + * key strictly greater than your search key - an extent that compares equal + * to the search key is going to have 0 sectors after the search key. + * + * But this does mean that we can't just search for + * bkey_successor(start_of_range) to get the first extent that overlaps with + * the range we want - if we're unlucky and there's an extent that ends + * exactly where we searched, then there could be a deleted key at the same + * position and we'd get that when we search instead of the preceding extent + * we needed. + * + * So we've got to search for start_of_range, then after the lookup iterate + * past any extents that compare equal to the position we searched for. + */ +void bch2_btree_node_iter_init(struct btree_node_iter *iter, + struct btree *b, struct bpos search, + bool strictly_greater, bool is_extents) +{ + struct bset_tree *t; + struct bkey_packed p, *packed_search = NULL; + + EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); + bset_aux_tree_verify(b); + + __bch2_btree_node_iter_init(iter, is_extents); + + //if (bkey_cmp(search, b->curr_max_key) > 0) + // return; + + switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { + case BKEY_PACK_POS_EXACT: + packed_search = &p; + break; + case BKEY_PACK_POS_SMALLER: + packed_search = NULL; + break; + case BKEY_PACK_POS_FAIL: + btree_node_iter_init_pack_failed(iter, b, search, + strictly_greater, is_extents); + return; + } + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + bch2_bset_search(b, t, search, + packed_search, &p, + strictly_greater), + btree_bkey_last(b, t)); + + bch2_btree_node_iter_sort(iter, b); +} + +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, + struct btree *b, + bool is_extents) +{ + struct bset_tree *t; + + __bch2_btree_node_iter_init(iter, is_extents); + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + bch2_btree_node_iter_sort(iter, b); +} + +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t) +{ + struct btree_node_iter_set *set; + + BUG_ON(iter->used > MAX_BSETS); + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) + return __btree_node_offset_to_key(b, set->k); + + return btree_bkey_last(b, t); +} + +static inline void btree_node_iter_sift(struct btree_node_iter *iter, + struct btree *b, + unsigned start) +{ + unsigned i; + + EBUG_ON(iter->used > MAX_BSETS); + + for (i = start; + i + 1 < iter->used && + btree_node_iter_cmp(iter, b, iter->data[i], iter->data[i + 1]) > 0; + i++) + swap(iter->data[i], iter->data[i + 1]); +} + +static inline void btree_node_iter_sort_two(struct btree_node_iter *iter, + struct btree *b, + unsigned first) +{ + if (btree_node_iter_cmp(iter, b, + iter->data[first], + iter->data[first + 1]) > 0) + swap(iter->data[first], iter->data[first + 1]); +} + +void bch2_btree_node_iter_sort(struct btree_node_iter *iter, + struct btree *b) +{ + EBUG_ON(iter->used > 3); + + /* unrolled bubble sort: */ + + if (iter->used > 2) { + btree_node_iter_sort_two(iter, b, 0); + btree_node_iter_sort_two(iter, b, 1); + } + + if (iter->used > 1) + btree_node_iter_sort_two(iter, b, 0); +} + +/** + * bch_btree_node_iter_advance - advance @iter by one key + * + * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might + * momentarily have out of order extents. 
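+ *
+ * A typical usage sketch (illustrative, not part of the original patch; it is
+ * essentially what the for_each_btree_node_key() macro in bset.h does):
+ *
+ *	struct btree_node_iter iter;
+ *	struct bkey_packed *k;
+ *
+ *	bch2_btree_node_iter_init_from_start(&iter, b, false);
+ *	while ((k = bch2_btree_node_iter_peek(&iter, b)))
+ *		bch2_btree_node_iter_advance(&iter, b);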
+ */ +void bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ + struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b); + + iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; + + BUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) { + BUG_ON(iter->used == 0); + iter->data[0] = iter->data[--iter->used]; + } + + btree_node_iter_sift(iter, b, 0); + + bch2_btree_node_iter_next_check(iter, b, k); +} + +/* + * Expensive: + */ +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct btree *b) +{ + struct bkey_packed *k, *prev = NULL; + struct btree_node_iter_set *set; + struct bset_tree *t; + struct bset_tree *prev_t; + unsigned end; + + bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + if (k && + (!prev || __btree_node_iter_cmp(iter->is_extents, b, + k, prev) > 0)) { + prev = k; + prev_t = t; + } + } + + if (!prev) + return NULL; + + /* + * We're manually memmoving instead of just calling sort() to ensure the + * prev we picked ends up in slot 0 - sort won't necessarily put it + * there because of duplicate deleted keys: + */ + end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t)); + btree_node_iter_for_each(iter, set) + if (set->end == end) { + memmove(&iter->data[1], + &iter->data[0], + (void *) set - (void *) &iter->data[0]); + goto out; + } + + memmove(&iter->data[1], + &iter->data[0], + (void *) &iter->data[iter->used] - (void *) &iter->data[0]); + iter->used++; +out: + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; + return prev; +} + +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, + struct btree *b) +{ + struct bkey_packed *k; + + do { + k = bch2_btree_node_iter_prev_all(iter, b); + } while (k && bkey_deleted(k)); + + return k; +} + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, + struct btree *b, + struct bkey *u) +{ + struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); + + return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; +} + +/* Mergesort */ + +void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) +{ + struct bset_tree *t; + + for_each_bset(b, t) { + enum bset_aux_tree_type type = bset_aux_tree_type(t); + size_t j; + + stats->sets[type].nr++; + stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * + sizeof(u64); + + if (bset_has_ro_aux_tree(t)) { + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED_UNPACKED: + stats->failed_unpacked++; + break; + case BFLOAT_FAILED_PREV: + stats->failed_prev++; + break; + case BFLOAT_FAILED_OVERFLOW: + stats->failed_overflow++; + break; + } + } + } +} + +int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, + char *buf, size_t size) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey_packed *l, *r, *p; + struct bkey uk, up; + char buf1[200], buf2[200]; + unsigned j; + + if (!size) + return 0; + + if (!bset_has_ro_aux_tree(t)) + goto out; + + j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra); + if (j && + j < t->size && + k == tree_to_bkey(b, t, j)) + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED_UNPACKED: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed unpacked at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + case BFLOAT_FAILED_PREV: + p = tree_to_prev_bkey(b, t, j); + l = is_power_of_2(j) + ? btree_bkey_first(b, t) + : tree_to_prev_bkey(b, t, j >> ffs(j)); + r = is_power_of_2(j + 1) + ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + + up = bkey_unpack_key(b, p); + uk = bkey_unpack_key(b, k); + bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); + bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); + + return scnprintf(buf, size, + " failed prev at depth %u\n" + "\tkey starts at bit %u but first differing bit at %u\n" + "\t%llu:%llu\n" + "\t%llu:%llu\n" + "\t%s\n" + "\t%s\n", + ilog2(j), + bch2_bkey_greatest_differing_bit(b, l, r), + bch2_bkey_greatest_differing_bit(b, p, k), + uk.p.inode, uk.p.offset, + up.p.inode, up.p.offset, + buf1, buf2); + case BFLOAT_FAILED_OVERFLOW: + uk = bkey_unpack_key(b, k); + return scnprintf(buf, size, + " failed overflow at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + } +out: + *buf = '\0'; + return 0; +} diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h new file mode 100644 index 00000000..76a83fcb --- /dev/null +++ b/libbcachefs/bset.h @@ -0,0 +1,615 @@ +#ifndef _BCACHE_BSET_H +#define _BCACHE_BSET_H + +#include <linux/kernel.h> +#include <linux/types.h> + +#include "bcachefs_format.h" +#include "bkey.h" +#include "bkey_methods.h" +#include "btree_types.h" +#include "util.h" /* for time_stats */ +#include "vstructs.h" + +/* + * BKEYS: + * + * A bkey contains a key, a size field, a variable number of pointers, and some + * ancillary flag bits. + * + * We use two different functions for validating bkeys, bkey_invalid and + * bkey_deleted(). + * + * The one exception to the rule that ptr_invalid() filters out invalid keys is + * that it also filters out keys of size 0 - these are keys that have been + * completely overwritten. It'd be safe to delete these in memory while leaving + * them on disk, just unnecessary work - so we filter them out when resorting + * instead. 
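+ *
+ * (A concrete example, illustrative only: an extent that has been fully
+ * overwritten is left behind as a zero size key at the same position.
+ * Nothing will ever look it up again, so it's cheaper to drop it the next
+ * time the bsets are sorted together than to eagerly delete it from the
+ * bset it lives in.)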
*
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from
+ * the front or the back of a bkey - this is mainly used for fixing
+ * overlapping extents, by removing the overlapping sectors from the older
+ * key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted
+ * order, along with a header. A btree node is made up of a number of these,
+ * written at different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more
+ * than 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyway, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert
+ * keys into the last (unwritten) set, most of the keys within a given btree
+ * node are usually in sets that are mostly constant. We use two different
+ * types of lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear
+ * search is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance-wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in
+ * advance when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare
+ * against (we don't want to fetch the key from the set, that would defeat the
+ * purpose), and a pointer to the key. We use a few tricks to compress both of
+ * these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We
+ * have a function (to_inorder()) that takes the index of a node in a binary
+ * tree and returns what its index would be in an inorder traversal, so we
+ * only have to store the low bits of the offset.
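+ *
+ * (A hedged numeric example, not from the original text: with BSET_CACHELINE
+ * at 128 bytes and keys 8 byte aligned, there are only 16 possible key
+ * positions within a node's cacheline, so a small key_offset field - rather
+ * than a full pointer into the btree node - pins the key down exactly.)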
*
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true
+ * even at the root of the tree.)
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ higher than bit 50, we don't need to check anything higher
+ * than bit 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough
+ * bits to partition the key range we're currently checking. Consider key n -
+ * the key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do
+ * the comparison. But we'd really like our nodes in the auxiliary search tree
+ * to be of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fall back to comparing against
+ * the real key. As long as this doesn't happen too often (and it seems to
+ * reliably happen a bit less than 1% of the time), we win - even on failures,
+ * that key is then more likely to be in cache than if we were doing binary
+ * searches all the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just get us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4
+ * bytes. We need one node per 128 bytes in the btree node, which means the
+ * auxiliary search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a
+ * much simpler lookup table - it's just a flat array, so index i in the
+ * lookup table corresponds to the i-th range of BSET_CACHELINE bytes in the
+ * set. Indexing within each byte range works the same as with the auxiliary
+ * search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the
+ * pointer to it, only when it would overflow do we go to the trouble of
+ * finding the first key in that range of bytes again.
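+ *
+ * A sketch of that lazy update (illustrative; see
+ * bch2_bset_fix_lookup_table() in bset.c for the real logic): when an insert
+ * grows the set by `shift' u64s in front of a table entry, fixing that entry
+ * is just
+ *
+ *	rw_aux_tree(b, t)[j].offset += shift;
+ *
+ * and only entries whose range was directly clobbered by the insert get
+ * recomputed or dropped.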
+ */ + +struct btree_node_iter; +struct btree_node_iter_set; + +enum bset_aux_tree_type { + BSET_NO_AUX_TREE, + BSET_RO_AUX_TREE, + BSET_RW_AUX_TREE, +}; + +#define BSET_TREE_NR_TYPES 3 + +#define BSET_NO_AUX_TREE_VAL (U16_MAX) +#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) + +static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) +{ + switch (t->extra) { + case BSET_NO_AUX_TREE_VAL: + EBUG_ON(t->size); + return BSET_NO_AUX_TREE; + case BSET_RW_AUX_TREE_VAL: + EBUG_ON(!t->size); + return BSET_RW_AUX_TREE; + default: + EBUG_ON(!t->size); + return BSET_RO_AUX_TREE; + } +} + +typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); + +static inline struct bkey +bkey_unpack_key_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ + struct bkey dst; + +#ifdef HAVE_BCACHE_COMPILED_UNPACK + { + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(&dst, src); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(&dst, &dst2, sizeof(dst))); + } + } +#else + dst = __bch2_bkey_unpack_key(&b->format, src); +#endif + return dst; +} + +/** + * bkey_unpack_key -- unpack just the key, not the value + */ +static inline struct bkey bkey_unpack_key(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? bkey_unpack_key_format_checked(b, src) + : *packed_to_bkey_c(src); +} + +static inline struct bpos +bkey_unpack_pos_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ +#ifdef HAVE_BCACHE_COMPILED_UNPACK + return bkey_unpack_key_format_checked(b, src).p; +#else + return __bkey_unpack_pos(&b->format, src); +#endif +} + +static inline struct bpos bkey_unpack_pos(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? 
bkey_unpack_pos_format_checked(b, src) + : packed_to_bkey_c(src)->p; +} + +/* Disassembled bkeys */ + +static inline struct bkey_s_c bkey_disassemble(struct btree *b, + const struct bkey_packed *k, + struct bkey *u) +{ + *u = bkey_unpack_key(b, k); + + return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; +} + +/* non const version: */ +static inline struct bkey_s __bkey_disassemble(struct btree *b, + struct bkey_packed *k, + struct bkey *u) +{ + *u = bkey_unpack_key(b, k); + + return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; +} + +#define for_each_bset(_b, _t) \ + for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + +extern bool bch2_expensive_debug_checks; + +static inline bool btree_keys_expensive_checks(struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + return bch2_expensive_debug_checks || *b->expensive_debug_checks; +#else + return false; +#endif +} + +static inline bool bset_has_ro_aux_tree(struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; +} + +static inline bool bset_has_rw_aux_tree(struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; +} + +static inline void bch2_bset_set_no_aux_tree(struct btree *b, + struct bset_tree *t) +{ + BUG_ON(t < b->set); + + for (; t < b->set + ARRAY_SIZE(b->set); t++) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + t->aux_data_offset = U16_MAX; + } +} + +static inline void btree_node_set_format(struct btree *b, + struct bkey_format f) +{ + int len; + + b->format = f; + b->nr_key_bits = bkey_format_key_bits(&f); + + len = bch2_compile_bkey_format(&b->format, b->aux_data); + BUG_ON(len < 0 || len > U8_MAX); + + b->unpack_fn_len = len; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +static inline struct bset *bset_next_set(struct btree *b, + unsigned block_bytes) +{ + struct bset *i = btree_bset_last(b); + + EBUG_ON(!is_power_of_2(block_bytes)); + + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); +} + +void bch2_btree_keys_free(struct btree *); +int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); +void bch2_btree_keys_init(struct btree *, bool *); + +void bch2_bset_init_first(struct btree *, struct bset *); +void bch2_bset_init_next(struct btree *, struct bset *); +void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); +void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *, + struct bkey_packed *); + +void bch2_bset_insert(struct btree *, struct btree_node_iter *, + struct bkey_packed *, struct bkey_i *, unsigned); +void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); + +/* Bkey utility code */ + +/* packed or unpacked */ +static inline int bkey_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + struct bpos *r) +{ + EBUG_ON(r_packed && !bkey_packed(r_packed)); + + if (unlikely(!bkey_packed(l))) + return bkey_cmp(packed_to_bkey_c(l)->p, *r); + + if (likely(r_packed)) + return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); + + return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +/* Returns true if @k is after iterator position @pos */ +static inline bool btree_iter_pos_cmp(struct bpos pos, const struct bkey *k, + bool strictly_greater) +{ + int cmp = bkey_cmp(k->p, pos); + + return cmp > 0 || + (cmp == 0 && !strictly_greater && !bkey_deleted(k)); +} + +static inline bool btree_iter_pos_cmp_packed(const struct btree *b, + struct bpos *pos, + const struct bkey_packed *k, + bool strictly_greater) +{ + int cmp = 
bkey_cmp_left_packed(b, k, pos); + + return cmp > 0 || + (cmp == 0 && !strictly_greater && !bkey_deleted(k)); +} + +static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, + struct bpos pos, + const struct bkey_packed *pos_packed, + const struct bkey_packed *k, + bool strictly_greater) +{ + int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos); + + return cmp > 0 || + (cmp == 0 && !strictly_greater && !bkey_deleted(k)); +} + +struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); +struct bkey_packed *bch2_bkey_prev_all(struct btree *, struct bset_tree *, + struct bkey_packed *); +struct bkey_packed *bch2_bkey_prev(struct btree *, struct bset_tree *, + struct bkey_packed *); + +enum bch_extent_overlap { + BCH_EXTENT_OVERLAP_ALL = 0, + BCH_EXTENT_OVERLAP_BACK = 1, + BCH_EXTENT_OVERLAP_FRONT = 2, + BCH_EXTENT_OVERLAP_MIDDLE = 3, +}; + +/* Returns how k overlaps with m */ +static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, + const struct bkey *m) +{ + int cmp1 = bkey_cmp(k->p, m->p) < 0; + int cmp2 = bkey_cmp(bkey_start_pos(k), + bkey_start_pos(m)) > 0; + + return (cmp1 << 1) + cmp2; +} + +/* Btree key iteration */ + +struct btree_node_iter { + u8 is_extents; + u16 used; + + struct btree_node_iter_set { + u16 k, end; + } data[MAX_BSETS]; +}; + +static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter, + bool is_extents) +{ + iter->used = 0; + iter->is_extents = is_extents; +} + +void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); +void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, + struct bpos, bool, bool); +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, + struct btree *, bool); +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, + struct btree *, + struct bset_tree *); + +void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); +void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); + +#define btree_node_iter_for_each(_iter, _set) \ + for (_set = (_iter)->data; \ + _set < (_iter)->data + (_iter)->used; \ + _set++) + +static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) +{ + return !iter->used; +} + +static inline int __btree_node_iter_cmp(bool is_extents, + struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + /* + * For non extents, when keys compare equal the deleted keys have to + * come first - so that bch2_btree_node_iter_next_check() can detect + * duplicate nondeleted keys (and possibly other reasons?) + * + * For extents, bkey_deleted() is used as a proxy for k->size == 0, so + * deleted keys have to sort last. + */ + return bkey_cmp_packed(b, l, r) ?: is_extents + ? 
(int) bkey_deleted(l) - (int) bkey_deleted(r) + : (int) bkey_deleted(r) - (int) bkey_deleted(l); +} + +static inline int btree_node_iter_cmp(struct btree_node_iter *iter, + struct btree *b, + struct btree_node_iter_set l, + struct btree_node_iter_set r) +{ + return __btree_node_iter_cmp(iter->is_extents, b, + __btree_node_offset_to_key(b, l.k), + __btree_node_offset_to_key(b, r.k)); +} + +static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) + iter->data[iter->used++] = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; +} + +static inline struct bkey_packed * +__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, + struct btree *b) +{ + return __btree_node_offset_to_key(b, iter->data->k); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, + struct btree *b) +{ + return bch2_btree_node_iter_end(iter) + ? NULL + : __bch2_btree_node_iter_peek_all(iter, b); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) +{ + struct bkey_packed *ret; + + while ((ret = bch2_btree_node_iter_peek_all(iter, b)) && + bkey_deleted(ret)) + bch2_btree_node_iter_advance(iter, b); + + return ret; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) +{ + struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); + + if (ret) + bch2_btree_node_iter_advance(iter, b); + + return ret; +} + +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, + struct btree *); +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, + struct btree *); + +/* + * Iterates over all _live_ keys - skipping deleted (and potentially + * overlapping) keys + */ +#define for_each_btree_node_key(b, k, iter, _is_extents) \ + for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ + ((k) = bch2_btree_node_iter_peek(iter, b)); \ + bch2_btree_node_iter_advance(iter, b)) + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, + struct btree *, + struct bkey *); + +#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\ + for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ + (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ + bch2_btree_node_iter_advance(iter, b)) + +/* Accounting: */ + +static inline void btree_keys_account_key(struct btree_nr_keys *n, + unsigned bset, + struct bkey_packed *k, + int sign) +{ + n->live_u64s += k->u64s * sign; + n->bset_u64s[bset] += k->u64s * sign; + + if (bkey_packed(k)) + n->packed_keys += sign; + else + n->unpacked_keys += sign; +} + +#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, 1) +#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, -1) + +struct bset_stats { + struct { + size_t nr, bytes; + } sets[BSET_TREE_NR_TYPES]; + + size_t floats; + size_t failed_unpacked; + size_t failed_prev; + size_t failed_overflow; +}; + +void bch2_btree_keys_stats(struct btree *, struct bset_stats *); +int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *, + char *, size_t); + +/* Debug stuff */ + +void bch2_dump_bset(struct btree *, struct bset *, unsigned); +void bch2_dump_btree_node(struct btree *); +void 
bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *); +void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); +void bch2_verify_key_order(struct btree *, struct btree_node_iter *, + struct bkey_packed *); + +#else + +static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} +static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) {} +static inline void bch2_verify_key_order(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where) {} +#endif + +static inline void bch2_verify_btree_nr_keys(struct btree *b) +{ + if (btree_keys_expensive_checks(b)) + __bch2_verify_btree_nr_keys(b); +} + +#endif diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c new file mode 100644 index 00000000..c4cc26f9 --- /dev/null +++ b/libbcachefs/btree_cache.c @@ -0,0 +1,746 @@ + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "extents.h" + +#include <trace/events/bcachefs.h> + +#define DEF_BTREE_ID(kwd, val, name) name, + +const char * const bch2_btree_ids[] = { + DEFINE_BCH_BTREE_IDS() + NULL +}; + +#undef DEF_BTREE_ID + +void bch2_recalc_btree_reserve(struct bch_fs *c) +{ + unsigned i, reserve = 16; + + if (!c->btree_roots[0].b) + reserve += 8; + + for (i = 0; i < BTREE_ID_NR; i++) + if (c->btree_roots[i].b) + reserve += min_t(unsigned, 1, + c->btree_roots[i].b->level) * 8; + + c->btree_cache_reserve = reserve; +} + +#define mca_can_free(c) \ + max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve) + +static void __mca_data_free(struct bch_fs *c, struct btree *b) +{ + EBUG_ON(btree_node_write_in_flight(b)); + + free_pages((unsigned long) b->data, btree_page_order(c)); + b->data = NULL; + bch2_btree_keys_free(b); +} + +static void mca_data_free(struct bch_fs *c, struct btree *b) +{ + __mca_data_free(c, b); + c->btree_cache_used--; + list_move(&b->list, &c->btree_cache_freed); +} + +#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) + +static const struct rhashtable_params bch_btree_cache_params = { + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, key.v), + .key_len = sizeof(struct bch_extent_ptr), +}; + +static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +{ + unsigned order = ilog2(btree_pages(c)); + + b->data = (void *) __get_free_pages(gfp, order); + if (!b->data) + goto err; + + if (bch2_btree_keys_alloc(b, order, gfp)) + goto err; + + c->btree_cache_used++; + list_move(&b->list, &c->btree_cache_freeable); + return; +err: + free_pages((unsigned long) b->data, order); + b->data = NULL; + list_move(&b->list, &c->btree_cache_freed); +} + +static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp) +{ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + six_lock_init(&b->lock); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + + mca_data_alloc(c, b, gfp); + return b->data ? 
b : NULL; +} + +/* Btree in memory cache - hash table */ + +void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b) +{ + BUG_ON(btree_node_dirty(b)); + + b->nsets = 0; + + rhashtable_remove_fast(&c->btree_cache_table, &b->hash, + bch_btree_cache_params); + + /* Cause future lookups for this node to fail: */ + bkey_i_to_extent(&b->key)->v._data[0] = 0; +} + +int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b, + unsigned level, enum btree_id id) +{ + int ret; + b->level = level; + b->btree_id = id; + + ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash, + bch_btree_cache_params); + if (ret) + return ret; + + mutex_lock(&c->btree_cache_lock); + list_add(&b->list, &c->btree_cache); + mutex_unlock(&c->btree_cache_lock); + + return 0; +} + +__flatten +static inline struct btree *mca_find(struct bch_fs *c, + const struct bkey_i *k) +{ + return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k), + bch_btree_cache_params); +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush) +{ + lockdep_assert_held(&c->btree_cache_lock); + + if (!six_trylock_intent(&b->lock)) + return -ENOMEM; + + if (!six_trylock_write(&b->lock)) + goto out_unlock_intent; + + if (btree_node_write_error(b) || + btree_node_noevict(b)) + goto out_unlock; + + if (!list_empty(&b->write_blocked)) + goto out_unlock; + + if (!flush && + (btree_node_dirty(b) || + btree_node_write_in_flight(b))) + goto out_unlock; + + /* + * Using the underscore version because we don't want to compact bsets + * after the write, since this node is about to be evicted - unless + * btree verify mode is enabled, since it runs out of the post write + * cleanup: + */ + if (btree_node_dirty(b)) { + if (verify_btree_ondisk(c)) + bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1); + else + __bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1); + } + + /* wait for any in flight btree write */ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); + + return 0; +out_unlock: + six_unlock_write(&b->lock); +out_unlock_intent: + six_unlock_intent(&b->lock); + return -ENOMEM; +} + +static int mca_reap(struct bch_fs *c, struct btree *b, bool flush) +{ + int ret = mca_reap_notrace(c, b, flush); + + trace_btree_node_reap(c, b, ret); + return ret; +} + +static unsigned long bch2_mca_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache_shrink); + struct btree *b, *t; + unsigned long nr = sc->nr_to_scan; + unsigned long can_free; + unsigned long touched = 0; + unsigned long freed = 0; + unsigned i; + + if (btree_shrinker_disabled(c)) + return SHRINK_STOP; + + if (c->btree_cache_alloc_lock) + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_IO) + mutex_lock(&c->btree_cache_lock); + else if (!mutex_trylock(&c->btree_cache_lock)) + return -1; + + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. 
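(E.g., with every btree root present at level >= 1, + * bch2_recalc_btree_reserve() above computes reserve = 16 + 8 * BTREE_ID_NR + * nodes.)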
The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ + nr /= btree_pages(c); + can_free = mca_can_free(c); + nr = min_t(unsigned long, nr, can_free); + + i = 0; + list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + touched++; + + if (freed >= nr) + break; + + if (++i > 3 && + !mca_reap_notrace(c, b, false)) { + mca_data_free(c, b); + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + freed++; + } + } +restart: + list_for_each_entry_safe(b, t, &c->btree_cache, list) { + touched++; + + if (freed >= nr) { + /* Save position */ + if (&t->list != &c->btree_cache) + list_move_tail(&c->btree_cache, &t->list); + break; + } + + if (!btree_node_accessed(b) && + !mca_reap(c, b, false)) { + /* can't call bch2_btree_node_hash_remove under btree_cache_lock */ + freed++; + if (&t->list != &c->btree_cache) + list_move_tail(&c->btree_cache, &t->list); + + mca_data_free(c, b); + mutex_unlock(&c->btree_cache_lock); + + bch2_btree_node_hash_remove(c, b); + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + + if (freed >= nr) + goto out; + + if (sc->gfp_mask & __GFP_IO) + mutex_lock(&c->btree_cache_lock); + else if (!mutex_trylock(&c->btree_cache_lock)) + goto out; + goto restart; + } else + clear_btree_node_accessed(b); + } + + mutex_unlock(&c->btree_cache_lock); +out: + return (unsigned long) freed * btree_pages(c); +} + +static unsigned long bch2_mca_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache_shrink); + + if (btree_shrinker_disabled(c)) + return 0; + + if (c->btree_cache_alloc_lock) + return 0; + + return mca_can_free(c) * btree_pages(c); +} + +void bch2_fs_btree_exit(struct bch_fs *c) +{ + struct btree *b; + unsigned i; + + if (c->btree_cache_shrink.list.next) + unregister_shrinker(&c->btree_cache_shrink); + + mutex_lock(&c->btree_cache_lock); + +#ifdef CONFIG_BCACHEFS_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c))); +#endif + + for (i = 0; i < BTREE_ID_NR; i++) + if (c->btree_roots[i].b) + list_add(&c->btree_roots[i].b->list, &c->btree_cache); + + list_splice(&c->btree_cache_freeable, + &c->btree_cache); + + while (!list_empty(&c->btree_cache)) { + b = list_first_entry(&c->btree_cache, struct btree, list); + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); + clear_btree_node_dirty(b); + + mca_data_free(c, b); + } + + while (!list_empty(&c->btree_cache_freed)) { + b = list_first_entry(&c->btree_cache_freed, + struct btree, list); + list_del(&b->list); + kfree(b); + } + + mutex_unlock(&c->btree_cache_lock); + + if (c->btree_cache_table_init_done) + rhashtable_destroy(&c->btree_cache_table); +} + +int bch2_fs_btree_init(struct bch_fs *c) +{ + unsigned i; + int ret; + + ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params); + if (ret) + return ret; + + c->btree_cache_table_init_done = true; + + bch2_recalc_btree_reserve(c); + + for (i = 0; i < c->btree_cache_reserve; i++) + if (!mca_bucket_alloc(c, GFP_KERNEL)) + return -ENOMEM; + + list_splice_init(&c->btree_cache, + &c->btree_cache_freeable); + +#ifdef CONFIG_BCACHEFS_DEBUG + mutex_init(&c->verify_lock); + + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c))); + if (!c->verify_ondisk) + return 
-ENOMEM; + + c->verify_data = mca_bucket_alloc(c, GFP_KERNEL); + if (!c->verify_data) + return -ENOMEM; + + list_del_init(&c->verify_data->list); +#endif + + c->btree_cache_shrink.count_objects = bch2_mca_count; + c->btree_cache_shrink.scan_objects = bch2_mca_scan; + c->btree_cache_shrink.seeks = 4; + c->btree_cache_shrink.batch = btree_pages(c) * 2; + register_shrinker(&c->btree_cache_shrink); + + return 0; +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch2_btree_node_cannibalize_unlock(struct bch_fs *c) +{ + if (c->btree_cache_alloc_lock == current) { + trace_btree_node_cannibalize_unlock(c); + c->btree_cache_alloc_lock = NULL; + closure_wake_up(&c->mca_wait); + } +} + +int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl) +{ + struct task_struct *old; + + old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + if (old == NULL || old == current) + goto success; + + if (!cl) { + trace_btree_node_cannibalize_lock_fail(c); + return -ENOMEM; + } + + closure_wait(&c->mca_wait, cl); + + /* Try again, after adding ourselves to waitlist */ + old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + if (old == NULL || old == current) { + /* We raced */ + closure_wake_up(&c->mca_wait); + goto success; + } + + trace_btree_node_cannibalize_lock_fail(c); + return -EAGAIN; + +success: + trace_btree_node_cannibalize_lock(c); + return 0; +} + +static struct btree *mca_cannibalize(struct bch_fs *c) +{ + struct btree *b; + + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(c, b, false)) + return b; + + while (1) { + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(c, b, true)) + return b; + + /* + * Rare case: all nodes were intent-locked. + * Just busy-wait. + */ + WARN_ONCE(1, "btree cache cannibalize failed\n"); + cond_resched(); + } +} + +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree *b; + u64 start_time = local_clock(); + + mutex_lock(&c->btree_cache_lock); + + /* + * btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &c->btree_cache_freeable, list) + if (!mca_reap_notrace(c, b, false)) + goto out_unlock; + + /* + * We never free struct btree itself, just the memory that holds the on + * disk node. 
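(Nodes on btree_cache_freed kept their struct btree, but __mca_data_free() + * freed the data buffer and cleared b->data, so mca_data_alloc() is all + * that's needed to revive one.)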
Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &c->btree_cache_freed, list) + if (!mca_reap_notrace(c, b, false)) { + mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); + if (b->data) + goto out_unlock; + + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + goto err; + } + + b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!six_trylock_intent(&b->lock)); + BUG_ON(!six_trylock_write(&b->lock)); +out_unlock: + BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key)); + BUG_ON(btree_node_write_in_flight(b)); + + list_del_init(&b->list); + mutex_unlock(&c->btree_cache_lock); +out: + b->flags = 0; + b->written = 0; + b->nsets = 0; + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; + b->uncompacted_whiteout_u64s = 0; + bch2_btree_keys_init(b, &c->expensive_debug_checks); + + bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time); + + return b; +err: + /* Try to cannibalize another cached btree node: */ + if (c->btree_cache_alloc_lock == current) { + b = mca_cannibalize(c); + list_del_init(&b->list); + mutex_unlock(&c->btree_cache_lock); + + bch2_btree_node_hash_remove(c, b); + + trace_btree_node_cannibalize(c); + goto out; + } + + mutex_unlock(&c->btree_cache_lock); + return ERR_PTR(-ENOMEM); +} + +/* Slowpath, don't want it inlined into btree_iter_traverse() */ +static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter, + const struct bkey_i *k, + unsigned level, + enum six_lock_type lock_type) +{ + struct bch_fs *c = iter->c; + struct btree *b; + + b = bch2_btree_node_mem_alloc(c); + if (IS_ERR(b)) + return b; + + bkey_copy(&b->key, k); + if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) { + /* raced with another fill: */ + + /* mark as unhashed... */ + bkey_i_to_extent(&b->key)->v._data[0] = 0; + + mutex_lock(&c->btree_cache_lock); + list_add(&b->list, &c->btree_cache_freeable); + mutex_unlock(&c->btree_cache_lock); + + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); + return NULL; + } + + /* + * If the btree node wasn't cached, we can't drop our lock on + * the parent until after it's added to the cache - because + * otherwise we could race with a btree_split() freeing the node + * we're trying to lock. + * + * But the deadlock described below doesn't exist in this case, + * so it's safe to not drop the parent lock until here: + */ + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + + bch2_btree_node_read(c, b); + six_unlock_write(&b->lock); + + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->lock); + + return b; +} + +/** + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * Returns -EINTR if the node couldn't be locked without deadlocking (the + * caller should retry the traversal), or -EIO if reading the node failed. + * + * The btree node will have either a read or an intent lock held, depending on + * @lock_type.
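+ * + * A sketch of a typical lookup (hypothetical caller, error handling elided): + * + *	b = bch2_btree_node_get(iter, k, level, SIX_LOCK_read); + *	if (IS_ERR(b)) + *		return PTR_ERR(b); + *	... use b ... + *	six_unlock_read(&b->lock);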
+ */ +struct btree *bch2_btree_node_get(struct btree_iter *iter, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type) +{ + struct btree *b; + struct bset_tree *t; + + BUG_ON(level >= BTREE_MAX_DEPTH); +retry: + rcu_read_lock(); + b = mca_find(iter->c, k); + rcu_read_unlock(); + + if (unlikely(!b)) { + /* + * We must have the parent locked to call bch2_btree_node_fill(), + * else we could read in a btree node from disk that's been + * freed: + */ + b = bch2_btree_node_fill(iter, k, level, lock_type); + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b)) + return b; + } else { + /* + * There's a potential deadlock with splits and insertions into + * interior nodes we have to avoid: + * + * The other thread might be holding an intent lock on the node + * we want, and they want to update its parent node so they're + * going to upgrade their intent lock on the parent node to a + * write lock. + * + * But if we're holding a read lock on the parent, and we're + * trying to get the intent lock they're holding, we deadlock. + * + * So to avoid this we drop the read locks on parent nodes when + * we're starting to take intent locks - and handle the race. + * + * The race is that they might be about to free the node we + * want, and dropping our read lock on the parent node lets them + * update the parent marking the node we want as freed, and then + * free it: + * + * To guard against this, btree nodes are evicted from the cache + * when they're freed - and PTR_HASH() is zeroed out, which we + * check for after we lock the node. + * + * Then, bch2_btree_node_relock() on the parent will fail - because + * the parent was modified, when the pointer to the node we want + * was removed - and we'll bail out: + */ + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); + + if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) + return ERR_PTR(-EINTR); + + if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || + b->level != level || + race_fault())) { + six_unlock_type(&b->lock, lock_type); + if (bch2_btree_node_relock(iter, level + 1)) + goto retry; + + return ERR_PTR(-EINTR); + } + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(!b->written); + EBUG_ON(b->btree_id != iter->btree_id || + BTREE_NODE_LEVEL(b->data) != level || + bkey_cmp(b->data->max_key, k->k.p)); + + return b; +} + +int bch2_print_btree_node(struct bch_fs *c, struct btree *b, + char *buf, size_t len) +{ + const struct bkey_format *f = &b->format; + struct bset_stats stats; + char ptrs[100]; + + memset(&stats, 0, sizeof(stats)); + + bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs), + bkey_i_to_s_c(&b->key)); + bch2_btree_keys_stats(b, &stats); + + return scnprintf(buf, len, + "l %u %llu:%llu - %llu:%llu:\n" + " ptrs: %s\n" + " format: u64s %u fields %u %u %u %u %u\n" + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %zu)\n" + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n" + " failed prev %zu\n" + " failed overflow %zu\n", + b->level, + 
b->data->min_key.inode, + b->data->min_key.offset, + b->data->max_key.inode, + b->data->max_key.offset, + ptrs, + f->key_u64s, + f->bits_per_field[0], + f->bits_per_field[1], + f->bits_per_field[2], + f->bits_per_field[3], + f->bits_per_field[4], + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), + b->nr.live_u64s * 100 / btree_max_u64s(c), + b->sib_u64s[0], + b->sib_u64s[1], + BTREE_FOREGROUND_MERGE_THRESHOLD(c), + b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, + stats.failed_unpacked, + stats.failed_prev, + stats.failed_overflow); +} diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h new file mode 100644 index 00000000..23f637ab --- /dev/null +++ b/libbcachefs/btree_cache.h @@ -0,0 +1,71 @@ +#ifndef _BCACHE_BTREE_CACHE_H +#define _BCACHE_BTREE_CACHE_H + +#include "bcachefs.h" +#include "btree_types.h" + +struct btree_iter; + +extern const char * const bch2_btree_ids[]; + +void bch2_recalc_btree_reserve(struct bch_fs *); + +void bch2_btree_node_hash_remove(struct bch_fs *, struct btree *); +int bch2_btree_node_hash_insert(struct bch_fs *, struct btree *, + unsigned, enum btree_id); + +void bch2_btree_node_cannibalize_unlock(struct bch_fs *); +int bch2_btree_node_cannibalize_lock(struct bch_fs *, struct closure *); + +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); + +struct btree *bch2_btree_node_get(struct btree_iter *, const struct bkey_i *, + unsigned, enum six_lock_type); + +void bch2_fs_btree_exit(struct bch_fs *); +int bch2_fs_btree_init(struct bch_fs *); + +#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ + for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ + &(_c)->btree_cache_table), \ + _iter = 0; _iter < (_tbl)->size; _iter++) \ + rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) + +static inline size_t btree_bytes(struct bch_fs *c) +{ + return c->sb.btree_node_size << 9; +} + +static inline size_t btree_max_u64s(struct bch_fs *c) +{ + return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); +} + +static inline size_t btree_pages(struct bch_fs *c) +{ + return c->sb.btree_node_size >> (PAGE_SHIFT - 9); +} + +static inline size_t btree_page_order(struct bch_fs *c) +{ + return ilog2(btree_pages(c)); +} + +static inline unsigned btree_blocks(struct bch_fs *c) +{ + return c->sb.btree_node_size >> c->block_bits; +} + +#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) + +#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) +#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) + +#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) + +int bch2_print_btree_node(struct bch_fs *, struct btree *, + char *, size_t); + +#endif /* _BCACHE_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c new file mode 100644 index 00000000..0883b9b4 --- /dev/null +++ b/libbcachefs/btree_gc.c @@ -0,0 +1,954 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright (C) 2014 Datera Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc.h" +#include "bkey_methods.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_io.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "super-io.h" + +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/rcupdate.h> +#include <trace/events/bcachefs.h> + +struct range_checks { + struct range_level { + struct bpos min; + struct bpos max; + } l[BTREE_MAX_DEPTH]; + unsigned depth; +}; + +static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) +{ + unsigned i; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + r->l[i].min = r->l[i].max = POS_MIN; + r->depth = depth; +} + +static void btree_node_range_checks(struct bch_fs *c, struct btree *b, + struct range_checks *r) +{ + struct range_level *l = &r->l[b->level]; + + struct bpos expected_min = bkey_cmp(l->min, l->max) + ? btree_type_successor(b->btree_id, l->max) + : l->max; + + bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, + "btree node has incorrect min key: %llu:%llu != %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset, + expected_min.inode, + expected_min.offset); + + l->max = b->data->max_key; + + if (b->level > r->depth) { + l = &r->l[b->level - 1]; + + bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, + "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset, + l->min.inode, + l->min.offset); + + bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, + "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", + b->data->max_key.inode, + b->data->max_key.offset, + l->max.inode, + l->max.offset); + + if (bkey_cmp(b->data->max_key, POS_MAX)) + l->min = l->max = + btree_type_successor(b->btree_id, + b->data->max_key); + } +} + +u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_ptr *ptr; + u8 max_stale = 0; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; + size_t b = PTR_BUCKET_NR(ca, ptr); + + if (__gen_after(ca->oldest_gens[b], ptr->gen)) + ca->oldest_gens[b] = ptr->gen; + + max_stale = max(max_stale, ptr_stale(ca, ptr)); + } + } + + return max_stale; +} + +/* + * For runtime mark and sweep: + */ +static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + switch (type) { + case BKEY_TYPE_BTREE: + bch2_gc_mark_key(c, k, c->sb.btree_node_size, true); + return 0; + case BKEY_TYPE_EXTENTS: + bch2_gc_mark_key(c, k, k.k->size, false); + return bch2_btree_key_recalc_oldest_gen(c, k); + default: + BUG(); + } +} + +u8 bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k) +{ + atomic64_set(&c->key_version, + max_t(u64, k.k->version.lo, + atomic64_read(&c->key_version))); + + return bch2_btree_mark_key(c, type, k); +} + +static bool btree_gc_mark_node(struct bch_fs *c, struct btree *b) +{ + if (btree_node_has_ptrs(b)) { + struct btree_node_iter iter; + struct bkey unpacked; + struct bkey_s_c k; + u8 stale = 0; + + for_each_btree_node_key_unpack(b, k, &iter, + btree_node_is_extents(b), + &unpacked) { + bch2_bkey_debugcheck(c, b, k); + stale = max(stale, 
bch2_btree_mark_key(c, + btree_node_type(b), k)); + } + + if (btree_gc_rewrite_disabled(c)) + return false; + + if (stale > 10) + return true; + } + + if (btree_gc_always_rewrite(c)) + return true; + + return false; +} + +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); +} + +static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + __gc_pos_set(c, new_pos); +} + +static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) +{ + struct btree_iter iter; + struct btree *b; + bool should_rewrite; + struct range_checks r; + unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1; + int ret; + + /* + * if expensive_debug_checks is on, run range_checks on all leaf nodes: + */ + if (expensive_debug_checks(c)) + depth = 0; + + btree_node_range_checks_init(&r, depth); + + for_each_btree_node(&iter, c, btree_id, POS_MIN, depth, b) { + btree_node_range_checks(c, b, &r); + + bch2_verify_btree_nr_keys(b); + + should_rewrite = btree_gc_mark_node(c, b); + + gc_pos_set(c, gc_pos_btree_node(b)); + + if (should_rewrite) + bch2_btree_node_rewrite(&iter, b, NULL); + + bch2_btree_iter_cond_resched(&iter); + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + mutex_lock(&c->btree_root_lock); + + b = c->btree_roots[btree_id].b; + bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); + gc_pos_set(c, gc_pos_btree_root(b->btree_id)); + + mutex_unlock(&c->btree_root_lock); + return 0; +} + +static void bch2_mark_allocator_buckets(struct bch_fs *c) +{ + struct bch_dev *ca; + struct open_bucket *ob; + size_t i, j, iter; + unsigned ci; + + for_each_member_device(ca, c, ci) { + spin_lock(&ca->freelist_lock); + + fifo_for_each_entry(i, &ca->free_inc, iter) + bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); + + spin_unlock(&ca->freelist_lock); + } + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + const struct bch_extent_ptr *ptr; + + mutex_lock(&ob->lock); + open_bucket_for_each_ptr(ob, ptr) { + ca = c->devs[ptr->dev]; + bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true); + } + mutex_unlock(&ob->lock); + } +} + +static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end, + enum bucket_data_type type) +{ + u64 b = start >> ca->bucket_bits; + + do { + bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true); + b++; + } while (b < end >> ca->bucket_bits); +} + +static void bch2_dev_mark_superblocks(struct bch_dev *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + + for (i = 0; i < layout->nr_superblocks; i++) { + if (layout->sb_offset[i] == BCH_SB_SECTOR) + mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, + BUCKET_SB); + + mark_metadata_sectors(ca, + layout->sb_offset[i], + layout->sb_offset[i] + + (1 << layout->sb_max_size_bits), + BUCKET_SB); + } +} + +/* + * Mark non btree metadata - prios, journal + */ +void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + u64 b; + + lockdep_assert_held(&c->sb_lock); + + bch2_dev_mark_superblocks(ca); + + spin_lock(&c->journal.lock); + + for (i = 0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch2_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_JOURNAL, true); + } + + 
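/* the journal bucket array is walked under journal.lock so it can't + * change under us; the prio bucket array below has its own lock */ + 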
spin_unlock(&c->journal.lock); + + spin_lock(&ca->prio_buckets_lock); + + for (i = 0; i < prio_buckets(ca) * 2; i++) { + b = ca->prio_buckets[i]; + if (b) + bch2_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_PRIOS, true); + } + + spin_unlock(&ca->prio_buckets_lock); +} + +static void bch2_mark_metadata(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + mutex_lock(&c->sb_lock); + gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA)); + + for_each_online_member(ca, c, i) + bch2_mark_dev_metadata(c, ca); + mutex_unlock(&c->sb_lock); +} + +/* Also see bch2_pending_btree_node_free_insert_done() */ +static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) +{ + struct bch_fs_usage stats = { 0 }; + struct btree_interior_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&c->btree_interior_update_lock); + gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); + + for_each_pending_btree_node_free(c, as, d) + if (d->index_update_done) + __bch2_gc_mark_key(c, bkey_i_to_s_c(&d->key), + c->sb.btree_node_size, true, + &stats); + /* + * Don't apply stats - pending deletes aren't tracked in + * bch_alloc_stats: + */ + + mutex_unlock(&c->btree_interior_update_lock); +} + +/** + * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes + */ +void bch2_gc(struct bch_fs *c) +{ + struct bch_dev *ca; + struct bucket *g; + struct bucket_mark new; + u64 start_time = local_clock(); + unsigned i; + int cpu; + + /* + * Walk _all_ references to buckets, and recompute them: + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. 
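+ * + * (Concretely, the order walked below is: allocator buckets, then each + * btree in BTREE_ID order, then superblock/journal metadata, then pending + * btree node frees.)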
+ * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them + */ + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + return; + + trace_gc_start(c); + + /* + * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on + * gc_lock if sectors_available goes to 0: + */ + bch2_recalc_sectors_available(c); + + down_write(&c->gc_lock); + + lg_global_lock(&c->usage_lock); + + /* + * Indicates to buckets code that gc is now in progress - done under + * usage_lock to avoid racing with bch2_mark_key(): + */ + __gc_pos_set(c, GC_POS_MIN); + + /* Save a copy of the existing bucket stats while we recompute them: */ + for_each_member_device(ca, c, i) { + ca->usage_cached = __bch2_dev_usage_read(ca); + for_each_possible_cpu(cpu) { + struct bch_dev_usage *p = + per_cpu_ptr(ca->usage_percpu, cpu); + memset(p, 0, sizeof(*p)); + } + } + + c->usage_cached = __bch2_fs_usage_read(c); + for_each_possible_cpu(cpu) { + struct bch_fs_usage *p = + per_cpu_ptr(c->usage_percpu, cpu); + + memset(p->s, 0, sizeof(p->s)); + p->persistent_reserved = 0; + } + + lg_global_unlock(&c->usage_lock); + + /* Clear bucket marks: */ + for_each_member_device(ca, c, i) + for_each_bucket(g, ca) { + bucket_cmpxchg(g, new, ({ + new.owned_by_allocator = 0; + new.data_type = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + })); + ca->oldest_gens[g - ca->buckets] = new.gen; + } + + /* Walk allocator's references: */ + bch2_mark_allocator_buckets(c); + + /* Walk btree: */ + while (c->gc_pos.phase < (int) BTREE_ID_NR) { + int ret = c->btree_roots[c->gc_pos.phase].b + ? bch2_gc_btree(c, (int) c->gc_pos.phase) + : 0; + + if (ret) { + bch_err(c, "btree gc failed: %d", ret); + set_bit(BCH_FS_GC_FAILURE, &c->flags); + up_write(&c->gc_lock); + return; + } + + gc_pos_set(c, gc_phase(c->gc_pos.phase + 1)); + } + + bch2_mark_metadata(c); + bch2_mark_pending_btree_node_frees(c); + + for_each_member_device(ca, c, i) + atomic_long_set(&ca->saturated_count, 0); + + /* Indicates that gc is no longer in progress: */ + gc_pos_set(c, gc_phase(GC_PHASE_DONE)); + + up_write(&c->gc_lock); + trace_gc_end(c); + bch2_time_stats_update(&c->btree_gc_time, start_time); + + /* + * Wake up allocator in case it was waiting for buckets + * because of not being able to inc gens + */ + for_each_member_device(ca, c, i) + bch2_wake_allocator(ca); +} + +/* Btree coalescing */ + +static void recalc_packed_keys(struct btree *b) +{ + struct bkey_packed *k; + + memset(&b->nr, 0, sizeof(b->nr)); + + BUG_ON(b->nsets != 1); + + for (k = btree_bkey_first(b, b->set); + k != btree_bkey_last(b, b->set); + k = bkey_next(k)) + btree_keys_account_key_add(&b->nr, 0, k); +} + +static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], + struct btree_iter *iter) +{ + struct btree *parent = iter->nodes[old_nodes[0]->level + 1]; + struct bch_fs *c = iter->c; + unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; + unsigned blocks = btree_blocks(c) * 2 / 3; + struct btree *new_nodes[GC_MERGE_NODES]; + struct btree_interior_update *as; + struct btree_reserve *res; + struct keylist keylist; + struct bkey_format_state format_state; + struct bkey_format new_format; + + memset(new_nodes, 0, sizeof(new_nodes)); + bch2_keylist_init(&keylist, NULL, 0); + + /* Count keys that are not deleted */ + for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) + u64s += old_nodes[i]->nr.live_u64s; + + nr_old_nodes = nr_new_nodes = i; + + /* Check if all keys in @old_nodes could 
fit in one fewer node */ + if (nr_old_nodes <= 1 || + __vstruct_blocks(struct btree_node, c->block_bits, + DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) + return; + + res = bch2_btree_reserve_get(c, parent, nr_old_nodes, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + NULL); + if (IS_ERR(res)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_RESERVE_GET); + return; + } + + if (bch2_keylist_realloc(&keylist, NULL, 0, + (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); + goto out; + } + + /* Find a format that all keys in @old_nodes can pack into */ + bch2_bkey_format_init(&format_state); + + for (i = 0; i < nr_old_nodes; i++) + __bch2_btree_calc_format(&format_state, old_nodes[i]); + + new_format = bch2_bkey_format_done(&format_state); + + /* Check if repacking would make any nodes too big to fit */ + for (i = 0; i < nr_old_nodes; i++) + if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_FORMAT_FITS); + goto out; + } + + trace_btree_gc_coalesce(c, parent, nr_old_nodes); + + as = bch2_btree_interior_update_alloc(c); + + for (i = 0; i < nr_old_nodes; i++) + bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]); + + /* Repack everything with @new_format and sort down to one bset */ + for (i = 0; i < nr_old_nodes; i++) + new_nodes[i] = + __bch2_btree_node_alloc_replacement(c, old_nodes[i], + new_format, res); + + /* + * Conceptually we concatenate the nodes together and slice them + * up at different boundaries. + */ + for (i = nr_new_nodes - 1; i > 0; --i) { + struct btree *n1 = new_nodes[i]; + struct btree *n2 = new_nodes[i - 1]; + + struct bset *s1 = btree_bset_first(n1); + struct bset *s2 = btree_bset_first(n2); + struct bkey_packed *k, *last = NULL; + + /* Calculate how many keys from @n2 we could fit inside @n1 */ + u64s = 0; + + for (k = s2->start; + k < vstruct_last(s2) && + vstruct_blocks_plus(n1->data, c->block_bits, + u64s + k->u64s) <= blocks; + k = bkey_next(k)) { + last = k; + u64s += k->u64s; + } + + if (u64s == le16_to_cpu(s2->u64s)) { + /* n2 fits entirely in n1 */ + n1->key.k.p = n1->data->max_key = n2->data->max_key; + + memcpy_u64s(vstruct_last(s1), + s2->start, + le16_to_cpu(s2->u64s)); + le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); + + set_btree_bset_end(n1, n1->set); + + six_unlock_write(&n2->lock); + bch2_btree_node_free_never_inserted(c, n2); + six_unlock_intent(&n2->lock); + + memmove(new_nodes + i - 1, + new_nodes + i, + sizeof(new_nodes[0]) * (nr_new_nodes - i)); + new_nodes[--nr_new_nodes] = NULL; + } else if (u64s) { + /* move part of n2 into n1 */ + n1->key.k.p = n1->data->max_key = + bkey_unpack_pos(n1, last); + + n2->data->min_key = + btree_type_successor(iter->btree_id, + n1->data->max_key); + + memcpy_u64s(vstruct_last(s1), + s2->start, u64s); + le16_add_cpu(&s1->u64s, u64s); + + memmove(s2->start, + vstruct_idx(s2, u64s), + (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); + s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); + + set_btree_bset_end(n1, n1->set); + set_btree_bset_end(n2, n2->set); + } + } + + for (i = 0; i < nr_new_nodes; i++) { + struct btree *n = new_nodes[i]; + + recalc_packed_keys(n); + btree_node_reset_sib_u64s(n); + + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->lock); + + bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1); + } + + /* + * The keys for the old nodes get deleted. 
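(Each delete is a freshly initialized key with no value - see bkey_init() + * below - positioned at the old node's key.)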
We don't want to insert keys + * that compare equal to the keys for the new nodes we'll also be + * inserting - we can't because keys on a keylist must be strictly + * greater than the previous keys, and we also don't need to since the + * key for the new node will serve the same purpose (overwriting the key + * for the old node). + */ + for (i = 0; i < nr_old_nodes; i++) { + struct bkey_i delete; + unsigned j; + + for (j = 0; j < nr_new_nodes; j++) + if (!bkey_cmp(old_nodes[i]->key.k.p, + new_nodes[j]->key.k.p)) + goto next; + + bkey_init(&delete.k); + delete.k.p = old_nodes[i]->key.k.p; + bch2_keylist_add_in_order(&keylist, &delete); +next: + i = i; + } + + /* + * Keys for the new nodes get inserted: bch2_btree_insert_keys() only + * does the lookup once and thus expects the keys to be in sorted order + * so we have to make sure the new keys are correctly ordered with + * respect to the deleted keys added in the previous loop + */ + for (i = 0; i < nr_new_nodes; i++) + bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); + + /* Insert the newly coalesced nodes */ + bch2_btree_insert_node(parent, iter, &keylist, res, as); + + BUG_ON(!bch2_keylist_empty(&keylist)); + + BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]); + + BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0])); + + for (i = 0; i < nr_new_nodes; i++) + bch2_btree_open_bucket_put(c, new_nodes[i]); + + /* Free the old nodes and update our sliding window */ + for (i = 0; i < nr_old_nodes; i++) { + bch2_btree_node_free_inmem(iter, old_nodes[i]); + six_unlock_intent(&old_nodes[i]->lock); + + /* + * the index update might have triggered a split, in which case + * the nodes we coalesced - the new nodes we just created - + * might not be sibling nodes anymore - don't add them to the + * sliding window (except the first): + */ + if (!i) { + old_nodes[i] = new_nodes[i]; + } else { + old_nodes[i] = NULL; + if (new_nodes[i]) + six_unlock_intent(&new_nodes[i]->lock); + } + } +out: + bch2_keylist_free(&keylist, NULL); + bch2_btree_reserve_put(c, res); +} + +static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) +{ + struct btree_iter iter; + struct btree *b; + unsigned i; + + /* Sliding window of adjacent btree nodes */ + struct btree *merge[GC_MERGE_NODES]; + u32 lock_seq[GC_MERGE_NODES]; + + /* + * XXX: We don't have a good way of positively matching on sibling nodes + * that have the same parent - this code works by handling the cases + * where they might not have the same parent, and is thus fragile. Ugh. + * + * Perhaps redo this to use multiple linked iterators? 
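+ * + * Schematically, each iteration below does: + * + *	memmove(merge + 1, merge, ...);		window slides by one + *	merge[0] = b;				newest node at the front + *	bch2_coalesce_nodes(merge, &iter);	try to merge the window + * + * lock_seq[] remembers each node's lock sequence number so the next + * iteration can re-take intent locks with six_relock_intent(), or notice + * that a node was modified and drop it from the window.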
+ */ + memset(merge, 0, sizeof(merge)); + + __for_each_btree_node(&iter, c, btree_id, POS_MIN, 0, b, U8_MAX) { + memmove(merge + 1, merge, + sizeof(merge) - sizeof(merge[0])); + memmove(lock_seq + 1, lock_seq, + sizeof(lock_seq) - sizeof(lock_seq[0])); + + merge[0] = b; + + for (i = 1; i < GC_MERGE_NODES; i++) { + if (!merge[i] || + !six_relock_intent(&merge[i]->lock, lock_seq[i])) + break; + + if (merge[i]->level != merge[0]->level) { + six_unlock_intent(&merge[i]->lock); + break; + } + } + memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); + + bch2_coalesce_nodes(merge, &iter); + + for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { + lock_seq[i] = merge[i]->lock.state.seq; + six_unlock_intent(&merge[i]->lock); + } + + lock_seq[0] = merge[0]->lock.state.seq; + + if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) { + bch2_btree_iter_unlock(&iter); + return -ESHUTDOWN; + } + + bch2_btree_iter_cond_resched(&iter); + + /* + * If the parent node wasn't relocked, it might have been split + * and the nodes in our sliding window might not have the same + * parent anymore - blow away the sliding window: + */ + if (iter.nodes[iter.level + 1] && + !btree_node_intent_locked(&iter, iter.level + 1)) + memset(merge + 1, 0, + (GC_MERGE_NODES - 1) * sizeof(merge[0])); + } + return bch2_btree_iter_unlock(&iter); +} + +/** + * bch_coalesce - coalesce adjacent nodes with low occupancy + */ +void bch2_coalesce(struct bch_fs *c) +{ + u64 start_time; + enum btree_id id; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + return; + + down_read(&c->gc_lock); + trace_gc_coalesce_start(c); + start_time = local_clock(); + + for (id = 0; id < BTREE_ID_NR; id++) { + int ret = c->btree_roots[id].b + ? bch2_coalesce_btree(c, id) + : 0; + + if (ret) { + if (ret != -ESHUTDOWN) + bch_err(c, "btree coalescing failed: %d", ret); + set_bit(BCH_FS_GC_FAILURE, &c->flags); + return; + } + } + + bch2_time_stats_update(&c->btree_coalesce_time, start_time); + trace_gc_coalesce_end(c); + up_read(&c->gc_lock); +} + +static int bch2_gc_thread(void *arg) +{ + struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; + unsigned long last = atomic_long_read(&clock->now); + unsigned last_kick = atomic_read(&c->kick_gc); + + set_freezable(); + + while (1) { + unsigned long next = last + c->capacity / 16; + + while (atomic_long_read(&clock->now) < next) { + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + + if (atomic_read(&c->kick_gc) != last_kick) { + __set_current_state(TASK_RUNNING); + break; + } + + bch2_io_clock_schedule_timeout(clock, next); + try_to_freeze(); + } + + last = atomic_long_read(&clock->now); + last_kick = atomic_read(&c->kick_gc); + + bch2_gc(c); + if (!btree_gc_coalesce_disabled(c)) + bch2_coalesce(c); + + debug_check_no_locks_held(); + } + + return 0; +} + +void bch2_gc_thread_stop(struct bch_fs *c) +{ + set_bit(BCH_FS_GC_STOPPING, &c->flags); + + if (c->gc_thread) + kthread_stop(c->gc_thread); + + c->gc_thread = NULL; + clear_bit(BCH_FS_GC_STOPPING, &c->flags); +} + +int bch2_gc_thread_start(struct bch_fs *c) +{ + struct task_struct *p; + + BUG_ON(c->gc_thread); + + p = kthread_create(bch2_gc_thread, c, "bcache_gc"); + if (IS_ERR(p)) + return PTR_ERR(p); + + c->gc_thread = p; + wake_up_process(c->gc_thread); + return 0; +} + +/* Initial GC computes bucket marks during startup */ + +static void bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) +{ + struct btree_iter iter; + struct btree *b; + struct range_checks r; 
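+ + /* depth 0: the initial walk has to visit every node, leaves included */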
+ + btree_node_range_checks_init(&r, 0); + + if (!c->btree_roots[id].b) + return; + + /* + * We have to hit every btree node before starting journal replay, in + * order for the journal seq blacklist machinery to work: + */ + for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + btree_node_range_checks(c, b, &r); + + if (btree_node_has_ptrs(b)) { + struct btree_node_iter node_iter; + struct bkey unpacked; + struct bkey_s_c k; + + for_each_btree_node_key_unpack(b, k, &node_iter, + btree_node_is_extents(b), + &unpacked) + bch2_btree_mark_key_initial(c, btree_node_type(b), k); + } + + bch2_btree_iter_cond_resched(&iter); + } + + bch2_btree_iter_unlock(&iter); + + bch2_btree_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&c->btree_roots[id].b->key)); +} + +int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) +{ + enum btree_id id; + + for (id = 0; id < BTREE_ID_NR; id++) + bch2_initial_gc_btree(c, id); + + if (journal) + bch2_journal_mark(c, journal); + + bch2_mark_metadata(c); + + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type) + atomic64_add(1 << 16, &c->key_version); + + gc_pos_set(c, gc_phase(GC_PHASE_DONE)); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + + return 0; +} diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h new file mode 100644 index 00000000..07210d33 --- /dev/null +++ b/libbcachefs/btree_gc.h @@ -0,0 +1,104 @@ +#ifndef _BCACHE_GC_H +#define _BCACHE_GC_H + +#include "btree_types.h" + +enum bkey_type; + +void bch2_coalesce(struct bch_fs *); +void bch2_gc(struct bch_fs *); +void bch2_gc_thread_stop(struct bch_fs *); +int bch2_gc_thread_start(struct bch_fs *); +int bch2_initial_gc(struct bch_fs *, struct list_head *); +u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); +u8 bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type, + struct bkey_s_c); +void bch2_mark_dev_metadata(struct bch_fs *, struct bch_dev *); + +/* + * For concurrent mark and sweep (with other index updates), we define a total + * ordering of _all_ references GC walks: + * + * Note that some references will have the same GC position as others - e.g. + * everything within the same btree node; in those cases we're relying on + * whatever locking exists for where those references live, i.e. the write lock + * on a btree node. + * + * That locking is also required to ensure GC doesn't pass the updater in + * between the updater adding/removing the reference and updating the GC marks; + * without that, we would at best double count sometimes. + * + * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ + * be held that prevents GC from passing the position the updater is at. + * + * (What about the start of gc, when we're clearing all the marks? GC clears the + * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc + * position inside its cmpxchg loop, so crap magically works). + */ + +/* Position of (the start of) a gc phase: */ +static inline struct gc_pos gc_phase(enum gc_phase phase) +{ + return (struct gc_pos) { + .phase = phase, + .pos = POS_MIN, + .level = 0, + }; +} + +#define GC_POS_MIN gc_phase(0) + +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) +{ + if (l.phase != r.phase) + return l.phase < r.phase ? -1 : 1; + if (bkey_cmp(l.pos, r.pos)) + return bkey_cmp(l.pos, r.pos); + if (l.level != r.level) + return l.level < r.level ? 
-1 : 1; + return 0; +} + +/* + * GC position of the pointers within a btree node: note, _not_ for &b->key + * itself, that lives in the parent node: + */ +static inline struct gc_pos gc_pos_btree_node(struct btree *b) +{ + return (struct gc_pos) { + .phase = b->btree_id, + .pos = b->key.k.p, + .level = b->level, + }; +} + +/* + * GC position of the pointer to a btree root: we don't use + * gc_pos_pointer_to_btree_node() here to avoid a potential race with + * btree_split() increasing the tree depth - the new root will have level > the + * old root and thus have a greater gc position than the old root, but that + * would be incorrect since once gc has marked the root it's not coming back. + */ +static inline struct gc_pos gc_pos_btree_root(enum btree_id id) +{ + return (struct gc_pos) { + .phase = (int) id, + .pos = POS_MAX, + .level = U8_MAX, + }; +} + +static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) +{ + unsigned seq; + bool ret; + + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + ret = gc_pos_cmp(c->gc_pos, pos) < 0; + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + return ret; +} + +#endif diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c new file mode 100644 index 00000000..728cbcd9 --- /dev/null +++ b/libbcachefs/btree_io.c @@ -0,0 +1,1725 @@ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "checksum.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "journal.h" +#include "super-io.h" + +#include <trace/events/bcachefs.h> + +static void verify_no_dups(struct btree *b, + struct bkey_packed *start, + struct bkey_packed *end) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_packed *k; + + for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { + struct bkey l = bkey_unpack_key(b, k); + struct bkey r = bkey_unpack_key(b, bkey_next(k)); + + BUG_ON(btree_node_is_extents(b) + ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 + : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); + //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); + } +#endif +} + +static void clear_needs_whiteout(struct bset *i) +{ + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + k->needs_whiteout = false; +} + +static void set_needs_whiteout(struct bset *i) +{ + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + k->needs_whiteout = true; +} + +static void btree_bounce_free(struct bch_fs *c, unsigned order, + bool used_mempool, void *p) +{ + if (used_mempool) + mempool_free(virt_to_page(p), &c->btree_bounce_pool); + else + free_pages((unsigned long) p, order); +} + +static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, + bool *used_mempool) +{ + void *p; + + BUG_ON(1 << order > btree_pages(c)); + + *used_mempool = false; + p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); + if (p) + return p; + + *used_mempool = true; + return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO)); +} + +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); + +struct sort_iter { + struct btree *b; + unsigned used; + + struct sort_iter_set { + struct bkey_packed *k, *end; + } data[MAX_BSETS + 1]; +}; + +static void sort_iter_init(struct sort_iter *iter, struct btree *b) +{ + memset(iter, 0, sizeof(*iter)); + iter->b = b; +} + +static inline void __sort_iter_sift(struct sort_iter *iter, + unsigned from, + sort_cmp_fn cmp) +{ + unsigned i; + + for (i = from; + i + 1 < iter->used && + cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; + i++) + swap(iter->data[i], iter->data[i + 1]); +} + +static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) +{ + + __sort_iter_sift(iter, 0, cmp); +} + +static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) +{ + unsigned i = iter->used; + + while (i--) + __sort_iter_sift(iter, i, cmp); +} + +static void sort_iter_add(struct sort_iter *iter, + struct bkey_packed *k, + struct bkey_packed *end) +{ + BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); + + if (k != end) + iter->data[iter->used++] = (struct sort_iter_set) { k, end }; +} + +static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) +{ + return iter->used ? 
iter->data->k : NULL; +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + iter->data->k = bkey_next(iter->data->k); + + BUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) + memmove(&iter->data[0], + &iter->data[1], + sizeof(iter->data[0]) * --iter->used); + else + sort_iter_sift(iter, cmp); +} + +static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, + sort_cmp_fn cmp) +{ + struct bkey_packed *ret = sort_iter_peek(iter); + + if (ret) + sort_iter_advance(iter, cmp); + + return ret; +} + +static inline int sort_key_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r); +} + +static unsigned sort_key_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_key_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extent_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + struct bkey ul = bkey_unpack_key(b, l); + struct bkey ur = bkey_unpack_key(b, r); + + return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); +} + +static unsigned sort_extent_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *out = dst; + struct bkey_i l, r; + bool prev = false, l_packed = false; + u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); + u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); + u64 new_size; + + max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); + + sort_iter_sort(iter, sort_extent_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { + EBUG_ON(bkeyp_val_u64s(f, in)); + EBUG_ON(in->type != KEY_TYPE_DISCARD); + + r.k = bkey_unpack_key(iter->b, in); + + if (prev && + bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + new_size = l_packed + ? 
min(max_packed_size, max_packed_offset - + bkey_start_offset(&l.k)) + : KEY_SIZE_MAX; + + new_size = min(new_size, r.k.p.offset - + bkey_start_offset(&l.k)); + + BUG_ON(new_size < l.k.size); + + bch2_key_resize(&l.k, new_size); + + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + bch2_cut_front(l.k.p, &r); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + l = r; + prev = true; + l_packed = bkey_packed(in); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, + enum compact_mode mode) +{ + unsigned live_u64s = b->nr.bset_u64s[t - b->set]; + unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); + + if (live_u64s == bset_u64s) + return 0; + + if (mode == COMPACT_LAZY) { + if (live_u64s * 4 < bset_u64s * 3 || + (compacting && bset_unwritten(b, bset(b, t)))) + return bset_u64s - live_u64s; + } else { + if (bset_written(b, bset(b, t))) + return bset_u64s - live_u64s; + } + + return 0; +} + +bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) +{ + const struct bkey_format *f = &b->format; + struct bset_tree *t; + struct bkey_packed *whiteouts = NULL; + struct bkey_packed *u_start, *u_pos; + struct sort_iter sort_iter; + unsigned order, whiteout_u64s = 0, u64s; + bool used_mempool, compacting = false; + + for_each_bset(b, t) + whiteout_u64s += should_compact_bset(b, t, + whiteout_u64s != 0, mode); + + if (!whiteout_u64s) + return false; + + sort_iter_init(&sort_iter, b); + + whiteout_u64s += b->whiteout_u64s; + order = get_order(whiteout_u64s * sizeof(u64)); + + whiteouts = btree_bounce_alloc(c, order, &used_mempool); + u_start = u_pos = whiteouts; + + memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), + b->whiteout_u64s); + u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); + + sort_iter_add(&sort_iter, u_start, u_pos); + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k, *n, *out, *start, *end; + struct btree_node_entry *src = NULL, *dst = NULL; + + if (t != b->set && bset_unwritten(b, i)) { + src = container_of(i, struct btree_node_entry, keys); + dst = max(write_block(b), + (void *) btree_bkey_last(b, t -1)); + } + + if (!should_compact_bset(b, t, compacting, mode)) { + if (src != dst) { + memmove(dst, src, sizeof(*src) + + le16_to_cpu(src->keys.u64s) * + sizeof(u64)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + continue; + } + + compacting = true; + u_start = u_pos; + start = i->start; + end = vstruct_last(i); + + if (src != dst) { + memmove(dst, src, sizeof(*src)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + + out = i->start; + + for (k = start; k != end; k = n) { + n = bkey_next(k); + + if (bkey_deleted(k) && btree_node_is_extents(b)) + continue; + + if (bkey_whiteout(k) && !k->needs_whiteout) + continue; + + if (bkey_whiteout(k)) { + unreserve_whiteout(b, t, k); + memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); + set_bkeyp_val_u64s(f, u_pos, 0); + u_pos = bkey_next(u_pos); + } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { + bkey_copy(out, k); + out = bkey_next(out); + } + } + + sort_iter_add(&sort_iter, u_start, u_pos); + + if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { + i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + } + } + + b->whiteout_u64s = 
(u64 *) u_pos - (u64 *) whiteouts; + + BUG_ON((void *) unwritten_whiteouts_start(c, b) < + (void *) btree_bkey_last(b, bset_tree_last(b))); + + u64s = btree_node_is_extents(b) + ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b), + &sort_iter) + : sort_key_whiteouts(unwritten_whiteouts_start(c, b), + &sort_iter); + + BUG_ON(u64s > b->whiteout_u64s); + BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); + BUG_ON(u_pos != whiteouts && !u64s); + + if (u64s != b->whiteout_u64s) { + void *src = unwritten_whiteouts_start(c, b); + + b->whiteout_u64s = u64s; + memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); + } + + verify_no_dups(b, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + + btree_bounce_free(c, order, used_mempool, whiteouts); + + if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) + bch2_btree_build_aux_trees(b); + + bch_btree_keys_u64s_remaining(c, b); + bch2_verify_btree_nr_keys(b); + + return true; +} + +static bool bch2_drop_whiteouts(struct btree *b) +{ + struct bset_tree *t; + bool ret = false; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k, *n, *out, *start, *end; + + if (!should_compact_bset(b, t, true, true)) + continue; + + start = btree_bkey_first(b, t); + end = btree_bkey_last(b, t); + + if (bset_unwritten(b, i) && + t != b->set) { + struct bset *dst = + max_t(struct bset *, write_block(b), + (void *) btree_bkey_last(b, t -1)); + + memmove(dst, i, sizeof(struct bset)); + i = dst; + set_btree_bset(b, t, i); + } + + out = i->start; + + for (k = start; k != end; k = n) { + n = bkey_next(k); + + if (!bkey_whiteout(k)) { + bkey_copy(out, k); + out = bkey_next(out); + } + } + + i->u64s = cpu_to_le16((u64 *) out - i->_data); + bch2_bset_set_no_aux_tree(b, t); + ret = true; + } + + bch2_verify_btree_nr_keys(b); + + return ret; +} + +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} + +static unsigned sort_keys(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *next, *out = dst; + + sort_iter_sort(iter, sort_keys_cmp); + + while ((in = sort_iter_next(iter, sort_keys_cmp))) { + if (bkey_whiteout(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + if (bkey_whiteout(in) && + (next = sort_iter_peek(iter)) && + !bkey_cmp_packed(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + /* + * XXX racy, called with read lock from write path + * + * leads to spurious BUG_ON() in bkey_unpack_key() in + * debug mode + */ + next->needs_whiteout |= in->needs_whiteout; + continue; + } + + if (bkey_whiteout(in)) { + memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { + bkey_copy(out, in); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extents_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(l) - (int) bkey_deleted(r); +} + +static unsigned sort_extents(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_extents_cmp); + + while ((in = sort_iter_next(iter, sort_extents_cmp))) { + if (bkey_deleted(in)) + continue; + + if 
(bkey_whiteout(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static void btree_node_sort(struct bch_fs *c, struct btree *b, + struct btree_iter *iter, + unsigned start_idx, + unsigned end_idx, + bool filter_whiteouts) +{ + struct btree_node *out; + struct sort_iter sort_iter; + struct bset_tree *t; + struct bset *start_bset = bset(b, &b->set[start_idx]); + bool used_mempool = false; + u64 start_time; + unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; + bool sorting_entire_node = start_idx == 0 && + end_idx == b->nsets; + + sort_iter_init(&sort_iter, b); + + for (t = b->set + start_idx; + t < b->set + end_idx; + t++) { + u64s += le16_to_cpu(bset(b, t)->u64s); + sort_iter_add(&sort_iter, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + } + + order = sorting_entire_node + ? btree_page_order(c) + : get_order(__vstruct_bytes(struct btree_node, u64s)); + + out = btree_bounce_alloc(c, order, &used_mempool); + + start_time = local_clock(); + + if (btree_node_is_extents(b)) + filter_whiteouts = bset_written(b, start_bset); + + u64s = btree_node_is_extents(b) + ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts) + : sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + + out->keys.u64s = cpu_to_le16(u64s); + + BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); + + if (sorting_entire_node) + bch2_time_stats_update(&c->btree_sort_time, start_time); + + /* Make sure we preserve bset journal_seq: */ + for (t = b->set + start_idx + 1; + t < b->set + end_idx; + t++) + start_bset->journal_seq = + max(start_bset->journal_seq, + bset(b, t)->journal_seq); + + if (sorting_entire_node) { + unsigned u64s = le16_to_cpu(out->keys.u64s); + + BUG_ON(order != btree_page_order(c)); + + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + */ + *out = *b->data; + out->keys.u64s = cpu_to_le16(u64s); + swap(out, b->data); + set_btree_bset(b, b->set, &b->data->keys); + } else { + start_bset->u64s = out->keys.u64s; + memcpy_u64s(start_bset->start, + out->keys.start, + le16_to_cpu(out->keys.u64s)); + } + + for (i = start_idx + 1; i < end_idx; i++) + b->nr.bset_u64s[start_idx] += + b->nr.bset_u64s[i]; + + b->nsets -= shift; + + for (i = start_idx + 1; i < b->nsets; i++) { + b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; + b->set[i] = b->set[i + shift]; + } + + for (i = b->nsets; i < MAX_BSETS; i++) + b->nr.bset_u64s[i] = 0; + + set_btree_bset_end(b, &b->set[start_idx]); + bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); + + btree_bounce_free(c, order, used_mempool, out); + + bch2_verify_btree_nr_keys(b); +} + +/* Sort + repack in a new format: */ +static struct btree_nr_keys sort_repack(struct bset *dst, + struct btree *src, + struct btree_node_iter *src_iter, + struct bkey_format *out_f, + bool filter_whiteouts) +{ + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { + if (filter_whiteouts && bkey_whiteout(in)) + continue; + + if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? 
in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); + + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Sort, repack, and merge: */ +static struct btree_nr_keys sort_repack_merge(struct bch_fs *c, + struct bset *dst, + struct btree *src, + struct btree_node_iter *iter, + struct bkey_format *out_f, + bool filter_whiteouts, + key_filter_fn filter, + key_merge_fn merge) +{ + struct bkey_packed *k, *prev = NULL, *out; + struct btree_nr_keys nr; + BKEY_PADDED(k) tmp; + + memset(&nr, 0, sizeof(nr)); + + while ((k = bch2_btree_node_iter_next_all(iter, src))) { + if (filter_whiteouts && bkey_whiteout(k)) + continue; + + /* + * The filter might modify pointers, so we have to unpack the + * key and values to &tmp.k: + */ + bch2_bkey_unpack(src, &tmp.k, k); + + if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) + continue; + + /* prev is always unpacked, for key merging: */ + + if (prev && + merge && + merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) + continue; + + /* + * the current key becomes the new prev: advance prev, then + * copy the current key - but first pack prev (in place): + */ + if (prev) { + bch2_bkey_pack(prev, (void *) prev, out_f); + + btree_keys_account_key_add(&nr, 0, prev); + prev = bkey_next(prev); + } else { + prev = vstruct_last(dst); + } + + bkey_copy(prev, &tmp.k); + } + + if (prev) { + bch2_bkey_pack(prev, (void *) prev, out_f); + btree_keys_account_key_add(&nr, 0, prev); + out = bkey_next(prev); + } else { + out = vstruct_last(dst); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +void bch2_btree_sort_into(struct bch_fs *c, + struct btree *dst, + struct btree *src) +{ + struct btree_nr_keys nr; + struct btree_node_iter src_iter; + u64 start_time = local_clock(); + + BUG_ON(dst->nsets != 1); + + bch2_bset_set_no_aux_tree(dst, dst->set); + + bch2_btree_node_iter_init_from_start(&src_iter, src, + btree_node_is_extents(src)); + + if (btree_node_ops(src)->key_normalize || + btree_node_ops(src)->key_merge) + nr = sort_repack_merge(c, btree_bset_first(dst), + src, &src_iter, + &dst->format, + true, + btree_node_ops(src)->key_normalize, + btree_node_ops(src)->key_merge); + else + nr = sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); + + bch2_time_stats_update(&c->btree_sort_time, start_time); + + set_btree_bset_end(dst, dst->set); + + dst->nr.live_u64s += nr.live_u64s; + dst->nr.bset_u64s[0] += nr.bset_u64s[0]; + dst->nr.packed_keys += nr.packed_keys; + dst->nr.unpacked_keys += nr.unpacked_keys; + + bch2_verify_btree_nr_keys(dst); +} + +#define SORT_CRIT (4096 / sizeof(u64)) + +/* + * We're about to add another bset to the btree node, so if there's currently + * too many bsets - sort some of them together: + */ +static bool btree_node_compact(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + unsigned unwritten_idx; + bool ret = false; + + for (unwritten_idx = 0; + unwritten_idx < b->nsets; + unwritten_idx++) + if (bset_unwritten(b, bset(b, &b->set[unwritten_idx]))) + break; + + if (b->nsets - unwritten_idx > 1) { + btree_node_sort(c, b, iter, unwritten_idx, + b->nsets, false); + ret = true; + } + + if (unwritten_idx > 1) { + btree_node_sort(c, b, iter, 0, unwritten_idx, false); + ret = true; + } + + return ret; +} + +void bch2_btree_build_aux_trees(struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) + 
bch2_bset_build_aux_tree(b, t, + bset_unwritten(b, bset(b, t)) && + t == bset_tree_last(b)); +} + +/* + * bch2_btree_init_next - initialize a new (unwritten) bset that keys can then + * be inserted into + * + * Safe to call if there already is an unwritten bset - will only add a new bset + * if @b doesn't already have one. + * + * If we sorted (i.e. invalidated iterators), @iter is fixed up via + * bch2_btree_iter_reinit_node(). + */ +void bch2_btree_init_next(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + struct btree_node_entry *bne; + bool did_sort; + + EBUG_ON(!(b->lock.state.seq & 1)); + EBUG_ON(iter && iter->nodes[b->level] != b); + + did_sort = btree_node_compact(c, b, iter); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(b, &bne->keys); + + bch2_btree_build_aux_trees(b); + + if (iter && did_sort) + bch2_btree_iter_reinit_node(iter, b); +} + +static struct nonce btree_nonce(struct btree *b, + struct bset *i, + unsigned offset) +{ + return (struct nonce) {{ + [0] = cpu_to_le32(offset), + [1] = ((__le32 *) &i->seq)[0], + [2] = ((__le32 *) &i->seq)[1], + [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, + }}; +} + +static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce) +{ + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); +} + +#define btree_node_error(b, c, ptr, fmt, ...) \ + bch2_fs_inconsistent(c, \ + "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\ + (b)->btree_id, (b)->level, btree_node_root(c, b) \ + ? btree_node_root(c, b)->level : -1, \ + PTR_BUCKET_NR(ca, ptr), (b)->written, \ + le16_to_cpu((i)->u64s), ##__VA_ARGS__) + +static const char *validate_bset(struct bch_fs *c, struct btree *b, + struct bch_dev *ca, + const struct bch_extent_ptr *ptr, + struct bset *i, unsigned sectors, + unsigned *whiteout_u64s) +{ + struct bkey_packed *k, *prev = NULL; + struct bpos prev_pos = POS_MIN; + bool seen_non_whiteout = false; + + if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) + return "unsupported bset version"; + + if (b->written + sectors > c->sb.btree_node_size) + return "bset past end of btree node"; + + if (i != &b->data->keys && !i->u64s) + btree_node_error(b, c, ptr, "empty set"); + + if (!BSET_SEPARATE_WHITEOUTS(i)) { + seen_non_whiteout = true; + whiteout_u64s = 0; + } + + for (k = i->start; + k != vstruct_last(i);) { + struct bkey_s_c u; + struct bkey tmp; + const char *invalid; + + if (!k->u64s) { + btree_node_error(b, c, ptr, + "KEY_U64s 0: %zu bytes of metadata lost", + vstruct_end(i) - (void *) k); + + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (bkey_next(k) > vstruct_last(i)) { + btree_node_error(b, c, ptr, + "key extends past end of bset"); + + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (k->format > KEY_FORMAT_CURRENT) { + btree_node_error(b, c, ptr, + "invalid bkey format %u", k->format); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) + bch2_bkey_swab(btree_node_type(b), &b->format, k); + + u = bkey_disassemble(b, k, &tmp); + + invalid = bch2_btree_bkey_invalid(c, b, u); + if (invalid) { + char buf[160]; + + bch2_bkey_val_to_text(c, btree_node_type(b), + buf, sizeof(buf), u); + btree_node_error(b, c, ptr, + "invalid bkey %s: %s", buf, invalid); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + }
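+
+		/*
+		 * A rough picture of the layout the checks below expect,
+		 * purely for illustration (positions made up): with
+		 * BSET_SEPARATE_WHITEOUTS set, a bset on disk looks like
+		 *
+		 *	[ whiteouts: 2-4, 6-8, ... ][ live keys: 3-5, 10-12, ... ]
+		 *
+		 * so key positions may restart exactly once, at the
+		 * whiteout/live key boundary - *whiteout_u64s records where
+		 * that boundary was found.
+		 */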
+ + /* + * with the separate whiteouts thing (used for extents), the + * second set of keys actually can have whiteouts too, so we + * can't solely go off bkey_whiteout()... + */ + + if (!seen_non_whiteout && + (!bkey_whiteout(k) || + (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { + *whiteout_u64s = k->_data - i->_data; + seen_non_whiteout = true; + } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { + btree_node_error(b, c, ptr, + "keys out of order: %llu:%llu > %llu:%llu", + prev_pos.inode, + prev_pos.offset, + u.k->p.inode, + bkey_start_offset(u.k)); + /* XXX: repair this */ + } + + prev_pos = u.k->p; + prev = k; + k = bkey_next(k); + } + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + return NULL; +} + +static bool extent_contains_ptr(struct bkey_s_c_extent e, + struct bch_extent_ptr match) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (!memcmp(ptr, &match, sizeof(*ptr))) + return true; + + return false; +} + +void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, + struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + struct btree_node_entry *bne; + struct bset *i = &b->data->keys; + struct btree_node_iter *iter; + struct btree_node *sorted; + bool used_mempool; + unsigned u64s; + const char *err; + struct bch_csum csum; + struct nonce nonce; + int ret; + + iter = mempool_alloc(&c->fill_iter, GFP_NOIO); + __bch2_btree_node_iter_init(iter, btree_node_is_extents(b)); + + err = "dynamic fault"; + if (bch2_meta_read_fault("btree")) + goto err; + + while (b->written < c->sb.btree_node_size) { + unsigned sectors, whiteout_u64s = 0; + + if (!b->written) { + i = &b->data->keys; + + err = "bad magic"; + if (le64_to_cpu(b->data->magic) != bset_magic(c)) + goto err; + + err = "bad btree header"; + if (!b->data->keys.seq) + goto err; + + err = "unknown checksum type"; + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) + goto err; + + /* XXX: retry checksum errors */ + + nonce = btree_nonce(b, i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + + err = "bad checksum"; + if (bch2_crc_cmp(csum, b->data->csum)) + goto err; + + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &b->data->flags, + (void *) &b->data->keys - + (void *) &b->data->flags); + nonce = nonce_add(nonce, + round_up((void *) &b->data->keys - + (void *) &b->data->flags, + CHACHA20_BLOCK_SIZE)); + bset_encrypt(c, i, nonce); + + sectors = vstruct_sectors(b->data, c->block_bits); + + if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { + u64 *p = (u64 *) &b->data->ptr; + + *p = swab64(*p); + bch2_bpos_swab(&b->data->min_key); + bch2_bpos_swab(&b->data->max_key); + } + + err = "incorrect btree id"; + if (BTREE_NODE_ID(b->data) != b->btree_id) + goto err; + + err = "incorrect level"; + if (BTREE_NODE_LEVEL(b->data) != b->level) + goto err; + + err = "incorrect max key"; + if (bkey_cmp(b->data->max_key, b->key.k.p)) + goto err; + + err = "incorrect backpointer"; + if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), + b->data->ptr)) + goto err; + + err = bch2_bkey_format_validate(&b->data->format); + if (err) + goto err; + + set_btree_bset(b, b->set, &b->data->keys); + + btree_node_set_format(b, b->data->format); + } else { + bne = write_block(b); + i = &bne->keys; + + if (i->seq != b->data->keys.seq) + break; + + err = "unknown checksum type"; + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) + goto err; + + nonce = btree_nonce(b, i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + err = "bad checksum"; + if (memcmp(&csum, 
&bne->csum, sizeof(csum))) + goto err; + + bset_encrypt(c, i, nonce); + + sectors = vstruct_sectors(bne, c->block_bits); + } + + err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s); + if (err) + goto err; + + b->written += sectors; + + err = "insufficient memory"; + ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); + if (ret < 0) + goto err; + + if (ret) + continue; + + __bch2_btree_node_iter_push(iter, b, + i->start, + vstruct_idx(i, whiteout_u64s)); + + __bch2_btree_node_iter_push(iter, b, + vstruct_idx(i, whiteout_u64s), + vstruct_last(i)); + } + + err = "corrupted btree"; + for (bne = write_block(b); + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) + if (bne->keys.seq == b->data->keys.seq) + goto err; + + sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool); + sorted->keys.u64s = 0; + + b->nr = btree_node_is_extents(b) + ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) + : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter); + + u64s = le16_to_cpu(sorted->keys.u64s); + *sorted = *b->data; + sorted->keys.u64s = cpu_to_le16(u64s); + swap(sorted, b->data); + set_btree_bset(b, b->set, &b->data->keys); + b->nsets = 1; + + BUG_ON(b->nr.live_u64s != u64s); + + btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted); + + bch2_bset_build_aux_tree(b, b->set, false); + + set_needs_whiteout(btree_bset_first(b)); + + btree_node_reset_sib_u64s(b); +out: + mempool_free(iter, &c->fill_iter); + return; +err: + set_btree_node_read_error(b); + btree_node_error(b, c, ptr, "%s", err); + goto out; +} + +void bch2_btree_node_read(struct bch_fs *c, struct btree *b) +{ + uint64_t start_time = local_clock(); + struct bio *bio; + struct extent_pick_ptr pick; + + trace_btree_read(c, b); + + pick = bch2_btree_pick_ptr(c, b); + if (bch2_fs_fatal_err_on(!pick.ca, c, + "no cache device for btree node")) { + set_btree_node_read_error(b); + return; + } + + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); + bio->bi_bdev = pick.ca->disk_sb.bdev; + bio->bi_iter.bi_sector = pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bch2_bio_map(bio, b->data); + + submit_bio_wait(bio); + + if (bch2_dev_fatal_io_err_on(bio->bi_error, + pick.ca, "IO error reading bucket %zu", + PTR_BUCKET_NR(pick.ca, &pick.ptr)) || + bch2_meta_read_fault("btree")) { + set_btree_node_read_error(b); + goto out; + } + + bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr); + bch2_time_stats_update(&c->btree_read_time, start_time); +out: + bio_put(bio); + percpu_ref_put(&pick.ca->io_ref); +} + +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + struct closure cl; + struct btree *b; + int ret; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_node_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + + b = bch2_btree_node_mem_alloc(c); + bch2_btree_node_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); + + bkey_copy(&b->key, k); + BUG_ON(bch2_btree_node_hash_insert(c, b, level, id)); + + bch2_btree_node_read(c, b); + six_unlock_write(&b->lock); + + if (btree_node_read_error(b)) { + six_unlock_intent(&b->lock); + return -EIO; + } + + bch2_btree_set_root_initial(c, b, NULL); + six_unlock_intent(&b->lock); + + return 0; +} + +void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + struct btree_write *w) +{ + bch2_journal_pin_drop(&c->journal, &w->journal); + 
closure_wake_up(&w->wait); +} + +static void btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_write *w = btree_prev_write(b); + + /* + * Before calling bch2_btree_complete_write() - if the write errored, we + * have to halt new journal writes before they see this btree node + * write as completed: + */ + if (btree_node_write_error(b)) + bch2_journal_halt(&c->journal); + + bch2_btree_complete_write(c, b, w); + btree_node_io_unlock(b); +} + +static void btree_node_write_endio(struct bio *bio) +{ + struct btree *b = bio->bi_private; + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_fs *c = wbio->c; + struct bio *orig = wbio->split ? wbio->orig : NULL; + struct closure *cl = !wbio->split ? wbio->cl : NULL; + struct bch_dev *ca = wbio->ca; + + if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") || + bch2_meta_write_fault("btree")) + set_btree_node_write_error(b); + + if (wbio->bounce) + btree_bounce_free(c, + wbio->order, + wbio->used_mempool, + page_address(bio->bi_io_vec[0].bv_page)); + + if (wbio->put_bio) + bio_put(bio); + + if (orig) { + bio_endio(orig); + } else { + btree_node_write_done(c, b); + if (cl) + closure_put(cl); + } + + if (ca) + percpu_ref_put(&ca->io_ref); +} + +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, + struct closure *parent, + enum six_lock_type lock_type_held, + int idx_to_write) +{ + struct bio *bio; + struct bch_write_bio *wbio; + struct bset_tree *t; + struct bset *i; + struct btree_node *bn = NULL; + struct btree_node_entry *bne = NULL; + BKEY_PADDED(key) k; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct sort_iter sort_iter; + struct nonce nonce; + unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; + u64 seq = 0; + bool used_mempool; + unsigned long old, new; + void *data; + + /* + * We may only have a read lock on the btree node - the dirty bit is our + * "lock" against racing with other threads that may be trying to start + * a write, we do a write iff we clear the dirty bit. Since setting the + * dirty bit requires a write lock, we can't race with other threads + * redirtying it: + */ + do { + old = new = READ_ONCE(b->flags); + + if (!(old & (1 << BTREE_NODE_dirty))) + return; + + if (idx_to_write >= 0 && + idx_to_write != !!(old & (1 << BTREE_NODE_write_idx))) + return; + + if (old & (1 << BTREE_NODE_write_in_flight)) { + wait_on_bit_io(&b->flags, + BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); + continue; + } + + new &= ~(1 << BTREE_NODE_dirty); + new |= (1 << BTREE_NODE_write_in_flight); + new |= (1 << BTREE_NODE_just_written); + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); + + BUG_ON(!list_empty(&b->write_blocked)); + + BUG_ON(b->written >= c->sb.btree_node_size); + BUG_ON(bset_written(b, btree_bset_last(b))); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); + + if (lock_type_held == SIX_LOCK_intent) { + six_lock_write(&b->lock); + __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); + six_unlock_write(&b->lock); + } else { + __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); + } + + BUG_ON(b->uncompacted_whiteout_u64s); + + sort_iter_init(&sort_iter, b); + + bytes = !b->written + ? 
sizeof(struct btree_node) + : sizeof(struct btree_node_entry); + + bytes += b->whiteout_u64s * sizeof(u64); + + for_each_bset(b, t) { + i = bset(b, t); + + if (bset_written(b, i)) + continue; + + bytes += le16_to_cpu(i->u64s) * sizeof(u64); + sort_iter_add(&sort_iter, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + + order = get_order(bytes); + data = btree_bounce_alloc(c, order, &used_mempool); + + if (!b->written) { + bn = data; + *bn = *b->data; + i = &bn->keys; + } else { + bne = data; + bne->keys = b->data->keys; + i = &bne->keys; + } + + i->journal_seq = cpu_to_le64(seq); + i->u64s = 0; + + if (!btree_node_is_extents(b)) { + sort_iter_add(&sort_iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + SET_BSET_SEPARATE_WHITEOUTS(i, false); + } else { + memcpy_u64s(i->start, + unwritten_whiteouts_start(c, b), + b->whiteout_u64s); + i->u64s = cpu_to_le16(b->whiteout_u64s); + SET_BSET_SEPARATE_WHITEOUTS(i, true); + } + + b->whiteout_u64s = 0; + + u64s = btree_node_is_extents(b) + ? sort_extents(vstruct_last(i), &sort_iter, false) + : sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); + + clear_needs_whiteout(i); + + if (b->written && !i->u64s) { + /* Nothing to write: */ + btree_bounce_free(c, order, used_mempool, data); + btree_node_write_done(c, b); + return; + } + + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + + i->version = cpu_to_le16(BCACHE_BSET_VERSION); + SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); + + nonce = btree_nonce(b, i, b->written << 9); + + if (bn) { + bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &bn->flags, + (void *) &b->data->keys - + (void *) &b->data->flags); + nonce = nonce_add(nonce, + round_up((void *) &b->data->keys - + (void *) &b->data->flags, + CHACHA20_BLOCK_SIZE)); + bset_encrypt(c, i, nonce); + + nonce = btree_nonce(b, i, b->written << 9); + bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); + } else { + bset_encrypt(c, i, nonce); + + bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + } + + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + + BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size); + + trace_btree_write(b, bytes_to_write, sectors_to_write); + + /* + * We handle btree write errors by immediately halting the journal - + * after we've done that, we can't issue any subsequent btree writes + * because they might have pointers to new nodes that failed to write. 
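+	 * (i.e. a parent node whose write did make it to disk could be left
+	 * pointing at a child node that never did).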
+ * + * Furthermore, there's no point in doing any more btree writes because + * with the journal stopped, we're never going to update the journal to + * reflect that those writes were done and the data flushed from the + * journal: + * + * Make sure to update b->written so bch2_btree_init_next() doesn't + * break: + */ + if (bch2_journal_error(&c->journal) || + c->opts.nochanges) { + set_btree_node_noevict(b); + b->written += sectors_to_write; + + btree_bounce_free(c, order, used_mempool, data); + btree_node_write_done(c, b); + return; + } + + bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write); + + wbio = to_wbio(bio); + wbio->cl = parent; + wbio->bounce = true; + wbio->put_bio = true; + wbio->order = order; + wbio->used_mempool = used_mempool; + bio->bi_iter.bi_size = sectors_to_write << 9; + bio->bi_end_io = btree_node_write_endio; + bio->bi_private = b; + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); + + if (parent) + closure_get(parent); + + bch2_bio_map(bio, data); + + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + + bkey_copy(&k.key, &b->key); + e = bkey_i_to_s_extent(&k.key); + + extent_for_each_ptr(e, ptr) + ptr->offset += b->written; + + extent_for_each_ptr(e, ptr) + atomic64_add(sectors_to_write, + &c->devs[ptr->dev]->btree_sectors_written); + + b->written += sectors_to_write; + + bch2_submit_wbio_replicas(wbio, c, &k.key); +} + +/* + * Work that must be done with write lock held: + */ +bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) +{ + bool invalidated_iter = false; + struct btree_node_entry *bne; + struct bset_tree *t; + + if (!btree_node_just_written(b)) + return false; + + BUG_ON(b->whiteout_u64s); + BUG_ON(b->uncompacted_whiteout_u64s); + + clear_btree_node_just_written(b); + + /* + * Note: immediately after write, bset_unwritten()/bset_written() don't + * work - the amount of data we had to write after compaction might have + * been smaller than the offset of the last bset. 
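+	 * (For example - numbers purely illustrative: if the last bset began
+	 * at sector offset 12 within the node, but compaction shrank what we
+	 * just wrote to 10 sectors, b->written is now less than that bset's
+	 * offset even though its keys were included in the write.)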
+ + * However, we know that all bsets have been written here, as long as + * we're still holding the write lock: + */ + + /* + * XXX: decide if we really want to unconditionally sort down to a + * single bset: + */ + if (b->nsets > 1) { + btree_node_sort(c, b, NULL, 0, b->nsets, true); + invalidated_iter = true; + } else { + invalidated_iter = bch2_drop_whiteouts(b); + } + + for_each_bset(b, t) + set_needs_whiteout(bset(b, t)); + + bch2_btree_verify(c, b); + + /* + * If later we don't unconditionally sort down to a single bset, we have + * to ensure this is still true: + */ + BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(b, &bne->keys); + + bch2_btree_build_aux_trees(b); + + return invalidated_iter; +} + +/* + * Use this one if the node is intent locked: + */ +void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + struct closure *parent, + enum six_lock_type lock_type_held, + int idx_to_write) +{ + BUG_ON(lock_type_held == SIX_LOCK_write); + + if (lock_type_held == SIX_LOCK_intent || + six_trylock_convert(&b->lock, SIX_LOCK_read, + SIX_LOCK_intent)) { + __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write); + + six_lock_write(&b->lock); + bch2_btree_post_write_cleanup(c, b); + six_unlock_write(&b->lock); + + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->lock); + } else { + __bch2_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write); + } +} + +static void bch2_btree_node_write_dirty(struct bch_fs *c, struct btree *b, + struct closure *parent) +{ + six_lock_read(&b->lock); + BUG_ON(b->level); + + bch2_btree_node_write(c, b, parent, SIX_LOCK_read, -1); + six_unlock_read(&b->lock); +} + +/* + * Write all dirty btree nodes to disk, including roots + */ +void bch2_btree_flush(struct bch_fs *c) +{ + struct closure cl; + struct btree *b; + struct bucket_table *tbl; + struct rhash_head *pos; + bool dropped_lock; + unsigned i; + + closure_init_stack(&cl); + + rcu_read_lock(); + + do { + dropped_lock = false; + i = 0; +restart: + tbl = rht_dereference_rcu(c->btree_cache_table.tbl, + &c->btree_cache_table); + + for (; i < tbl->size; i++) + rht_for_each_entry_rcu(b, pos, tbl, i, hash) + /* + * XXX - locking for b->level, when called from + * bch2_journal_move() + */ + if (!b->level && btree_node_dirty(b)) { + rcu_read_unlock(); + bch2_btree_node_write_dirty(c, b, &cl); + dropped_lock = true; + rcu_read_lock(); + goto restart; + } + } while (dropped_lock); + + rcu_read_unlock(); + + closure_sync(&cl); +} + +/** + * bch2_btree_node_flush_journal_entries - flush any journal entries that + * contain keys from this node + * + * The bset's journal sequence number is used for preserving ordering of index + * updates across unclean shutdowns - it's used to ignore bsets newer than the + * most recent journal entry. + * + * But when rewriting btree nodes we compact all the bsets in a btree node - and + * if we compacted a bset that should have been ignored together with bsets we + * do need, that would be bad. So to avoid that, prior to making the new node + * visible, ensure that the journal has been flushed so that all the bsets we + * compacted are visible.
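+ *
+ * A typical call sequence, for illustration (the closure belongs to the
+ * caller, as in bch2_btree_flush() above):
+ *
+ *	struct closure cl;
+ *
+ *	closure_init_stack(&cl);
+ *	bch2_btree_node_flush_journal_entries(c, b, &cl);
+ *	closure_sync(&cl);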
+ */ +void bch2_btree_node_flush_journal_entries(struct bch_fs *c, + struct btree *b, + struct closure *cl) +{ + int i = b->nsets; + + /* + * Journal sequence numbers in the different bsets will always be in + * ascending order, we only need to flush the highest - except that the + * most recent bset might not have a journal sequence number yet, so we + * need to loop: + */ + while (i--) { + u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq); + + if (seq) { + bch2_journal_flush_seq_async(&c->journal, seq, cl); + break; + } + } +} diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h new file mode 100644 index 00000000..290fb5d7 --- /dev/null +++ b/libbcachefs/btree_io.h @@ -0,0 +1,73 @@ +#ifndef _BCACHE_BTREE_IO_H +#define _BCACHE_BTREE_IO_H + +struct bch_fs; +struct btree_write; +struct btree; +struct btree_iter; + +static inline void btree_node_io_unlock(struct btree *b) +{ + EBUG_ON(!btree_node_write_in_flight(b)); + clear_btree_node_write_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +static inline void btree_node_io_lock(struct btree *b) +{ + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +enum compact_mode { + COMPACT_LAZY, + COMPACT_WRITTEN, + COMPACT_WRITTEN_NO_WRITE_LOCK, +}; + +bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); + +static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) { + unsigned live_u64s = b->nr.bset_u64s[t - b->set]; + unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); + + if (live_u64s * 4 < bset_u64s * 3) + goto compact; + } + + return false; +compact: + return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); +} + +void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + +void bch2_btree_build_aux_trees(struct btree *); +void bch2_btree_init_next(struct bch_fs *, struct btree *, + struct btree_iter *); + +void bch2_btree_node_read_done(struct bch_fs *, struct btree *, + struct bch_dev *, const struct bch_extent_ptr *); +void bch2_btree_node_read(struct bch_fs *, struct btree *); +int bch2_btree_root_read(struct bch_fs *, enum btree_id, + const struct bkey_i *, unsigned); + +void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, + struct closure *, enum six_lock_type, int); +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + +void bch2_btree_node_write(struct bch_fs *, struct btree *, + struct closure *, enum six_lock_type, int); + +void bch2_btree_flush(struct bch_fs *); +void bch2_btree_node_flush_journal_entries(struct bch_fs *, struct btree *, + struct closure *); + +#endif /* _BCACHE_BTREE_IO_H */ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c new file mode 100644 index 00000000..fb5c507e --- /dev/null +++ b/libbcachefs/btree_iter.c @@ -0,0 +1,1133 @@ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "extents.h" + +#include <trace/events/bcachefs.h> + +#define BTREE_ITER_NOT_END ((struct btree *) 1) + +static inline bool is_btree_node(struct btree_iter *iter, unsigned l) +{ + return iter->nodes[l] && iter->nodes[l] != BTREE_ITER_NOT_END; +} + +/* Btree node locking: */ + +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +void 
bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) +{ + struct btree_iter *linked; + + EBUG_ON(iter->nodes[b->level] != b); + EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); + + for_each_linked_btree_node(iter, b, linked) + linked->lock_seq[b->level] += 2; + + iter->lock_seq[b->level] += 2; + + six_unlock_write(&b->lock); +} + +void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +{ + struct btree_iter *linked; + unsigned readers = 0; + + EBUG_ON(iter->nodes[b->level] != b); + EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); + + if (six_trylock_write(&b->lock)) + return; + + for_each_linked_btree_iter(iter, linked) + if (linked->nodes[b->level] == b && + btree_node_read_locked(linked, b->level)) + readers++; + + if (likely(!readers)) { + six_lock_write(&b->lock); + } else { + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + atomic64_sub(__SIX_VAL(read_lock, readers), + &b->lock.state.counter); + six_lock_write(&b->lock); + atomic64_add(__SIX_VAL(read_lock, readers), + &b->lock.state.counter); + } +} + +bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +{ + struct btree_iter *linked; + struct btree *b = iter->nodes[level]; + enum btree_node_locked_type want = btree_lock_want(iter, level); + enum btree_node_locked_type have = btree_node_locked_type(iter, level); + + if (want == have) + return true; + + if (!is_btree_node(iter, level)) + return false; + + if (race_fault()) + return false; + + if (have != BTREE_NODE_UNLOCKED + ? six_trylock_convert(&b->lock, have, want) + : six_relock_type(&b->lock, want, iter->lock_seq[level])) + goto success; + + for_each_linked_btree_iter(iter, linked) + if (linked->nodes[level] == b && + btree_node_locked_type(linked, level) == want && + iter->lock_seq[level] == b->lock.state.seq) { + btree_node_unlock(iter, level); + six_lock_increment(&b->lock, want); + goto success; + } + + return false; +success: + mark_btree_node_unlocked(iter, level); + mark_btree_node_locked(iter, level, want); + return true; +} + +/* Slowpath: */ +bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, + struct btree_iter *iter, + enum six_lock_type type) +{ + struct btree_iter *linked; + + /* Can't have children locked before ancestors: */ + EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); + + /* + * Can't hold any read locks while we block taking an intent lock - see + * below for reasoning, and we should have already dropped any read + * locks in the current iterator + */ + EBUG_ON(type == SIX_LOCK_intent && + iter->nodes_locked != iter->nodes_intent_locked); + + for_each_linked_btree_iter(iter, linked) + if (linked->nodes[level] == b && + btree_node_locked_type(linked, level) == type) { + six_lock_increment(&b->lock, type); + return true; + } + + /* + * Must lock btree nodes in key order - this case happens when locking + * the prev sibling in btree node merging: + */ + if (iter->nodes_locked && + __ffs(iter->nodes_locked) == level && + __btree_iter_cmp(iter->btree_id, pos, iter)) + return false; + + for_each_linked_btree_iter(iter, linked) { + if (!linked->nodes_locked) + continue; + + /* + * Can't block taking an intent lock if we have _any_ nodes read + * locked: + * + * - Our read lock blocks another thread with an intent lock on + * the same node from getting a write lock, and thus from + * dropping its
intent lock + * + * - And the other thread may have multiple nodes intent locked: + * both the node we want to intent lock, and the node we + * already have read locked - deadlock: + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { + linked->locks_want = max(linked->locks_want, + iter->locks_want); + return false; + } + + /* We have to lock btree nodes in key order: */ + if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) + return false; + + /* + * Interior nodes must be locked before their descendants: if + * another iterator has possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ + if (linked->btree_id == iter->btree_id && + level > __fls(linked->nodes_locked)) { + linked->locks_want = max(linked->locks_want, + iter->locks_want); + return false; + } + } + + six_lock_type(&b->lock, type); + return true; +} + +/* Btree iterator locking: */ + + +static void btree_iter_drop_extra_locks(struct btree_iter *iter) +{ + unsigned l; + + while (iter->nodes_locked && + (l = __fls(iter->nodes_locked)) > iter->locks_want) { + if (!btree_node_locked(iter, l)) + panic("l %u nodes_locked %u\n", l, iter->nodes_locked); + + if (l > iter->level) { + btree_node_unlock(iter, l); + } else if (btree_node_intent_locked(iter, l)) { + six_lock_downgrade(&iter->nodes[l]->lock); + iter->nodes_intent_locked ^= 1 << l; + } + } +} + +bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter, + unsigned new_locks_want) +{ + struct btree_iter *linked; + unsigned l; + + /* Drop locks we don't want anymore: */ + if (new_locks_want < iter->locks_want) + for_each_linked_btree_iter(iter, linked) + if (linked->locks_want > new_locks_want) { + linked->locks_want = max_t(unsigned, 1, + new_locks_want); + btree_iter_drop_extra_locks(linked); + } + + iter->locks_want = new_locks_want; + btree_iter_drop_extra_locks(iter); + + for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++) + if (!bch2_btree_node_relock(iter, l)) + goto fail; + + return true; +fail: + /* + * Just an optimization: ancestor nodes must be locked before child + * nodes, so set locks_want on iterators that might lock ancestors + * before us to avoid getting -EINTR later: + */ + for_each_linked_btree_iter(iter, linked) + if (linked->btree_id == iter->btree_id && + btree_iter_cmp(linked, iter) <= 0) + linked->locks_want = max_t(unsigned, linked->locks_want, + new_locks_want); + return false; +} + +static int __bch2_btree_iter_unlock(struct btree_iter *iter) +{ + BUG_ON(iter->error == -EINTR); + + while (iter->nodes_locked) + btree_node_unlock(iter, __ffs(iter->nodes_locked)); + + return iter->error; +} + +int bch2_btree_iter_unlock(struct btree_iter *iter) +{ + struct btree_iter *linked; + + for_each_linked_btree_iter(iter, linked) + __bch2_btree_iter_unlock(linked); + return __bch2_btree_iter_unlock(iter); +} + +/* Btree iterator: */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void __bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) +{ + struct btree_node_iter *node_iter = &iter->node_iters[b->level]; + struct btree_node_iter tmp = *node_iter; + struct bkey_packed *k; + + bch2_btree_node_iter_verify(node_iter, b); + + /* + * For interior nodes, the iterator will have skipped past + * deleted keys: + */ + k = b->level + ? 
bch2_btree_node_iter_prev(&tmp, b) + : bch2_btree_node_iter_prev_all(&tmp, b); + if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, + iter->is_extents)) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", + buf, iter->pos.inode, iter->pos.offset); + } + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k, + iter->is_extents)) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + panic("next key should be after iter pos:\n%llu:%llu\n%s\n", + iter->pos.inode, iter->pos.offset, buf); + } +} + +void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter *linked; + + if (iter->nodes[b->level] == b) + __bch2_btree_iter_verify(iter, b); + + for_each_linked_btree_node(iter, b, linked) + __bch2_btree_iter_verify(iter, b); +} + +#endif + +static void __bch2_btree_node_iter_fix(struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + const struct bkey_packed *end = btree_bkey_last(b, t); + struct btree_node_iter_set *set; + unsigned offset = __btree_node_key_to_offset(b, where); + int shift = new_u64s - clobber_u64s; + unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift; + + btree_node_iter_for_each(node_iter, set) + if (set->end == old_end) + goto found; + + /* didn't find the bset in the iterator - might have to readd it: */ + if (new_u64s && + btree_iter_pos_cmp_packed(b, &iter->pos, where, + iter->is_extents)) + bch2_btree_node_iter_push(node_iter, b, where, end); + return; +found: + set->end = (int) set->end + shift; + + /* Iterator hasn't gotten to the key that changed yet: */ + if (set->k < offset) + return; + + if (new_u64s && + btree_iter_pos_cmp_packed(b, &iter->pos, where, + iter->is_extents)) { + set->k = offset; + bch2_btree_node_iter_sort(node_iter, b); + } else if (set->k < offset + clobber_u64s) { + set->k = offset + new_u64s; + if (set->k == set->end) + *set = node_iter->data[--node_iter->used]; + bch2_btree_node_iter_sort(node_iter, b); + } else { + set->k = (int) set->k + shift; + } + + /* + * Interior nodes are special because iterators for interior nodes don't + * obey the usual invariants regarding the iterator position: + * + * We may have whiteouts that compare greater than the iterator + * position, and logically should be in the iterator, but that we + * skipped past to find the first live key greater than the iterator + * position. This becomes an issue when we insert a new key that is + * greater than the current iterator position, but smaller than the + * whiteouts we've already skipped past - this happens in the course of + * a btree split. + * + * We have to rewind the iterator back to before those whiteouts here, + * else bch2_btree_node_iter_prev() is not going to work and who knows what + * else would happen. And we have to do it manually, because here we've + * already done the insert and the iterator is currently inconsistent: + * + * We've got multiple competing invariants, here - we have to be careful + * about rewinding iterators for interior nodes, because they should + * always point to the key for the child node the btree iterator points + * to.
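+ *
+ * A concrete (made up) example: the iterator is at pos 5, and got there
+ * by skipping past whiteouts at 6 and 8 to the live key at 10. A split
+ * now inserts a new key at 7: the node iterator has to be rewound to
+ * point at that key - i.e. to before the whiteout at 8 - which is what
+ * the loop below does by hand.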
+ */ + if (b->level && new_u64s && !bkey_deleted(where) && + btree_iter_pos_cmp_packed(b, &iter->pos, where, + iter->is_extents)) { + struct bset_tree *t; + struct bkey_packed *k; + + for_each_bset(b, t) { + if (bch2_bkey_to_bset(b, where) == t) + continue; + + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(node_iter, b, t)); + if (k && + __btree_node_iter_cmp(node_iter, b, + k, where) > 0) { + struct btree_node_iter_set *set; + unsigned offset = + __btree_node_key_to_offset(b, bkey_next(k)); + + btree_node_iter_for_each(node_iter, set) + if (set->k == offset) { + set->k = __btree_node_key_to_offset(b, k); + bch2_btree_node_iter_sort(node_iter, b); + goto next_bset; + } + + bch2_btree_node_iter_push(node_iter, b, k, + btree_bkey_last(b, t)); + } +next_bset: + t = t; + } + } +} + +void bch2_btree_node_iter_fix(struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + struct btree_iter *linked; + + if (node_iter != &iter->node_iters[b->level]) + __bch2_btree_node_iter_fix(iter, b, node_iter, t, + where, clobber_u64s, new_u64s); + + if (iter->nodes[b->level] == b) + __bch2_btree_node_iter_fix(iter, b, + &iter->node_iters[b->level], t, + where, clobber_u64s, new_u64s); + + for_each_linked_btree_node(iter, b, linked) + __bch2_btree_node_iter_fix(linked, b, + &linked->node_iters[b->level], t, + where, clobber_u64s, new_u64s); + + /* interior node iterators are... special... */ + if (!b->level) + bch2_btree_iter_verify(iter, b); +} + +/* peek_all() doesn't skip deleted keys */ +static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter) +{ + struct btree *b = iter->nodes[iter->level]; + struct bkey_packed *k = + bch2_btree_node_iter_peek_all(&iter->node_iters[iter->level], b); + struct bkey_s_c ret; + + EBUG_ON(!btree_node_locked(iter, iter->level)); + + if (!k) + return bkey_s_c_null; + + ret = bkey_disassemble(b, k, &iter->k); + + if (debug_check_bkeys(iter->c)) + bch2_bkey_debugcheck(iter->c, b, ret); + + return ret; +} + +static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter) +{ + struct btree *b = iter->nodes[iter->level]; + struct bkey_packed *k = + bch2_btree_node_iter_peek(&iter->node_iters[iter->level], b); + struct bkey_s_c ret; + + EBUG_ON(!btree_node_locked(iter, iter->level)); + + if (!k) + return bkey_s_c_null; + + ret = bkey_disassemble(b, k, &iter->k); + + if (debug_check_bkeys(iter->c)) + bch2_bkey_debugcheck(iter->c, b, ret); + + return ret; +} + +static inline void __btree_iter_advance(struct btree_iter *iter) +{ + bch2_btree_node_iter_advance(&iter->node_iters[iter->level], + iter->nodes[iter->level]); +} + +/* + * Verify that iterator for parent node points to child node: + */ +static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) +{ + bool parent_locked; + struct bkey_packed *k; + + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || + !iter->nodes[b->level + 1]) + return; + + parent_locked = btree_node_locked(iter, b->level + 1); + + if (!bch2_btree_node_relock(iter, b->level + 1)) + return; + + k = bch2_btree_node_iter_peek_all(&iter->node_iters[b->level + 1], + iter->nodes[b->level + 1]); + if (!k || + bkey_deleted(k) || + bkey_cmp_left_packed(iter->nodes[b->level + 1], + k, &b->key.k.p)) { + char buf[100]; + struct bkey uk = bkey_unpack_key(b, k); + + bch2_bkey_to_text(buf, sizeof(buf), &uk); + panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", + buf, b->key.k.p.inode, 
b->key.k.p.offset); + } + + if (!parent_locked) + btree_node_unlock(iter, b->level + 1); +} + +static inline void __btree_iter_init(struct btree_iter *iter, + struct btree *b) +{ + bch2_btree_node_iter_init(&iter->node_iters[b->level], b, + iter->pos, iter->is_extents, + btree_node_is_extents(b)); + + /* Skip to first non whiteout: */ + if (b->level) + bch2_btree_node_iter_peek(&iter->node_iters[b->level], b); +} + +static inline bool btree_iter_pos_in_node(struct btree_iter *iter, + struct btree *b) +{ + return iter->btree_id == b->btree_id && + bkey_cmp(iter->pos, b->data->min_key) >= 0 && + btree_iter_pos_cmp(iter->pos, &b->key.k, iter->is_extents); +} + +static inline void btree_iter_node_set(struct btree_iter *iter, + struct btree *b) +{ + btree_iter_verify_new_node(iter, b); + + EBUG_ON(!btree_iter_pos_in_node(iter, b)); + EBUG_ON(b->lock.state.seq & 1); + + iter->lock_seq[b->level] = b->lock.state.seq; + iter->nodes[b->level] = b; + __btree_iter_init(iter, b); +} + +/* + * A btree node is being replaced - update the iterator to point to the new + * node: + */ +bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter *linked; + + for_each_linked_btree_iter(iter, linked) + if (btree_iter_pos_in_node(linked, b)) { + /* + * bch2_btree_iter_node_drop() has already been called - + * the old node we're replacing has already been + * unlocked and the pointer invalidated + */ + BUG_ON(btree_node_locked(linked, b->level)); + + /* + * If @linked wants this node read locked, we don't want + * to actually take the read lock now because it's not + * legal to hold read locks on other nodes while we take + * write locks, so the journal can make forward + * progress... + * + * Instead, btree_iter_node_set() sets things up so + * bch2_btree_node_relock() will succeed: + */ + + if (btree_want_intent(linked, b->level)) { + six_lock_increment(&b->lock, SIX_LOCK_intent); + mark_btree_node_intent_locked(linked, b->level); + } + + btree_iter_node_set(linked, b); + } + + if (!btree_iter_pos_in_node(iter, b)) { + six_unlock_intent(&b->lock); + return false; + } + + mark_btree_node_intent_locked(iter, b->level); + btree_iter_node_set(iter, b); + return true; +} + +void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter *linked; + unsigned level = b->level; + + for_each_linked_btree_iter(iter, linked) + if (linked->nodes[level] == b) { + btree_node_unlock(linked, level); + linked->nodes[level] = BTREE_ITER_NOT_END; + } +} + +void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) +{ + unsigned level = b->level; + + if (iter->nodes[level] == b) { + BUG_ON(b->lock.state.intent_lock != 1); + btree_node_unlock(iter, level); + iter->nodes[level] = BTREE_ITER_NOT_END; + } +} + +/* + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: + */ +void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) +{ + struct btree_iter *linked; + + for_each_linked_btree_node(iter, b, linked) + __btree_iter_init(linked, b); + __btree_iter_init(iter, b); +} + +static inline int btree_iter_lock_root(struct btree_iter *iter, + unsigned depth_want) +{ + struct bch_fs *c = iter->c; + struct btree *b; + enum six_lock_type lock_type; + unsigned i; + + EBUG_ON(iter->nodes_locked); + + while (1) { + b = READ_ONCE(c->btree_roots[iter->btree_id].b); + iter->level = READ_ONCE(b->level); + + if (unlikely(iter->level < depth_want)) { + /* + * the root is at a lower depth than the depth we 
want: + * got to the end of the btree, or we're walking nodes + * greater than some depth and there are no nodes >= + * that depth + */ + iter->level = depth_want; + iter->nodes[iter->level] = NULL; + return 0; + } + + lock_type = btree_lock_want(iter, iter->level); + if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, + iter, lock_type))) + return -EINTR; + + if (likely(b == c->btree_roots[iter->btree_id].b && + b->level == iter->level && + !race_fault())) { + for (i = 0; i < iter->level; i++) + iter->nodes[i] = BTREE_ITER_NOT_END; + iter->nodes[iter->level] = b; + + mark_btree_node_locked(iter, iter->level, lock_type); + btree_iter_node_set(iter, b); + return 0; + + } + + six_unlock_type(&b->lock, lock_type); + } +} + +static inline int btree_iter_down(struct btree_iter *iter) +{ + struct btree *b; + struct bkey_s_c k = __btree_iter_peek(iter); + unsigned level = iter->level - 1; + enum six_lock_type lock_type = btree_lock_want(iter, level); + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); + + b = bch2_btree_node_get(iter, &tmp.k, level, lock_type); + if (unlikely(IS_ERR(b))) + return PTR_ERR(b); + + iter->level = level; + mark_btree_node_locked(iter, level, lock_type); + btree_iter_node_set(iter, b); + return 0; +} + +static void btree_iter_up(struct btree_iter *iter) +{ + btree_node_unlock(iter, iter->level++); +} + +int __must_check __bch2_btree_iter_traverse(struct btree_iter *); + +static int btree_iter_traverse_error(struct btree_iter *iter, int ret) +{ + struct bch_fs *c = iter->c; + struct btree_iter *linked, *sorted_iters, **i; +retry_all: + bch2_btree_iter_unlock(iter); + + if (ret != -ENOMEM && ret != -EINTR) + goto io_error; + + if (ret == -ENOMEM) { + struct closure cl; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_node_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + } + + /* + * Linked iters are normally a circular singly linked list - break cycle + * while we sort them: + */ + linked = iter->next; + iter->next = NULL; + sorted_iters = NULL; + + while (linked) { + iter = linked; + linked = linked->next; + + i = &sorted_iters; + while (*i && btree_iter_cmp(iter, *i) > 0) + i = &(*i)->next; + + iter->next = *i; + *i = iter; + } + + /* Make list circular again: */ + iter = sorted_iters; + while (iter->next) + iter = iter->next; + iter->next = sorted_iters; + + /* Now, redo traversals in correct order: */ + + iter = sorted_iters; + do { +retry: + ret = __bch2_btree_iter_traverse(iter); + if (unlikely(ret)) { + if (ret == -EINTR) + goto retry; + goto retry_all; + } + + iter = iter->next; + } while (iter != sorted_iters); + + ret = btree_iter_linked(iter) ? -EINTR : 0; +out: + bch2_btree_node_cannibalize_unlock(c); + return ret; +io_error: + BUG_ON(ret != -EIO); + + iter->error = ret; + iter->nodes[iter->level] = NULL; + goto out; +} + +/* + * This is the main state machine for walking down the btree - walks down to a + * specified depth + * + * Returns 0 on success, -EIO on error (error reading in a btree node). + * + * On error, caller (peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_btree_iter_unlock(). 
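+ *
+ * Illustrative caller pattern (a sketch, using only functions declared in
+ * this file):
+ *
+ *	struct btree *b = bch2_btree_iter_peek_node(&iter);
+ *
+ *	if (!b) {
+ *		int ret = bch2_btree_iter_unlock(&iter);
+ *		// nonzero ret is the stashed error; 0 means end of btree
+ *	}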
+ */ +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +{ + unsigned depth_want = iter->level; + + /* make sure we have all the intent locks we need - ugh */ + if (unlikely(iter->nodes[iter->level] && + iter->level + 1 < iter->locks_want)) { + unsigned i; + + for (i = iter->level + 1; + i < iter->locks_want && iter->nodes[i]; + i++) + if (!bch2_btree_node_relock(iter, i)) { + while (iter->nodes[iter->level] && + iter->level + 1 < iter->locks_want) + btree_iter_up(iter); + break; + } + } + + /* + * If the current node isn't locked, go up until we have a locked node + * or run out of nodes: + */ + while (iter->nodes[iter->level] && + !(is_btree_node(iter, iter->level) && + bch2_btree_node_relock(iter, iter->level) && + btree_iter_pos_cmp(iter->pos, + &iter->nodes[iter->level]->key.k, + iter->is_extents))) + btree_iter_up(iter); + + /* + * If we've got a btree node locked (i.e. we aren't about to relock the + * root) - advance its node iterator if necessary: + */ + if (iter->nodes[iter->level]) { + struct bkey_s_c k; + + while ((k = __btree_iter_peek_all(iter)).k && + !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents)) + __btree_iter_advance(iter); + } + + /* + * Note: iter->nodes[iter->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, + * here it indicates that relocking the root failed - it's critical that + * btree_iter_lock_root() comes next and that it can't fail + */ + while (iter->level > depth_want) { + int ret = iter->nodes[iter->level] + ? btree_iter_down(iter) + : btree_iter_lock_root(iter, depth_want); + if (unlikely(ret)) { + iter->level = depth_want; + return ret; + } + } + + return 0; +} + +int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) +{ + int ret; + + if (unlikely(!iter->nodes[iter->level])) + return 0; + + iter->at_end_of_leaf = false; + + ret = __bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + ret = btree_iter_traverse_error(iter, ret); + + return ret; +} + +/* Iterate across nodes (leaf and interior nodes) */ + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +{ + struct btree *b; + int ret; + + EBUG_ON(iter->is_extents); + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; + + b = iter->nodes[iter->level]; + + if (b) { + EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + iter->pos = b->key.k.p; + } + + return b; +} + +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) +{ + struct btree *b; + int ret; + + EBUG_ON(iter->is_extents); + + btree_iter_up(iter); + + if (!iter->nodes[iter->level]) + return NULL; + + /* parent node usually won't be locked: redo traversal if necessary */ + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; + + b = iter->nodes[iter->level]; + if (!b) + return b; + + if (bkey_cmp(iter->pos, b->key.k.p) < 0) { + /* Haven't gotten to the end of the parent node: */ + + /* ick: */ + iter->pos = iter->btree_id == BTREE_ID_INODES + ? 
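+		/*
+		 * Worked example (illustrative values): in the inodes btree,
+		 * POS(2, 7) advances to POS(3, 0) - one key per inode - while
+		 * bkey_successor() would give POS(2, 8). See
+		 * btree_type_successor() in btree_iter.h.
+		 */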
btree_type_successor(iter->btree_id, iter->pos) + : bkey_successor(iter->pos); + iter->level = depth; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; + + b = iter->nodes[iter->level]; + } + + iter->pos = b->key.k.p; + + return b; +} + +/* Iterate across keys (in leaf nodes only) */ + +void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) +{ + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct bkey_packed *k; + + EBUG_ON(iter->level != 0); + EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); + EBUG_ON(!btree_node_locked(iter, 0)); + EBUG_ON(bkey_cmp(new_pos, b->key.k.p) > 0); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + !btree_iter_pos_cmp_packed(b, &new_pos, k, + iter->is_extents)) + bch2_btree_node_iter_advance(node_iter, b); + + if (!k && + !btree_iter_pos_cmp(new_pos, &b->key.k, iter->is_extents)) + iter->at_end_of_leaf = true; + + iter->pos = new_pos; +} + +void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */ + iter->pos = new_pos; +} + +void bch2_btree_iter_advance_pos(struct btree_iter *iter) +{ + /* + * We use iter->k instead of iter->pos for extents: iter->pos will be + * equal to the start of the extent we returned, but we need to advance + * to the end of the extent we returned. + */ + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); +} + +/* XXX: expensive */ +void bch2_btree_iter_rewind(struct btree_iter *iter, struct bpos pos) +{ + /* incapable of rewinding across nodes: */ + BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0); + + iter->pos = pos; + __btree_iter_init(iter, iter->nodes[iter->level]); +} + +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + struct bkey_s_c k; + int ret; + + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) { + iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); + return bkey_s_c_err(ret); + } + + k = __btree_iter_peek(iter); + if (likely(k.k)) { + /* + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ + if (!iter->is_extents || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + return k; + } + + iter->pos = iter->nodes[0]->key.k.p; + + if (!bkey_cmp(iter->pos, POS_MAX)) { + iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); + bch2_btree_iter_unlock(iter); + return bkey_s_c_null; + } + + iter->pos = btree_type_successor(iter->btree_id, iter->pos); + } +} + +struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey n; + int ret; + + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) { + iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); + return bkey_s_c_err(ret); + } + + k = __btree_iter_peek_all(iter); +recheck: + if (!k.k || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) { + /* hole */ + bkey_init(&n); + n.p = iter->pos; + + if (iter->is_extents) { + if (n.p.offset == KEY_OFFSET_MAX) { + iter->pos = bkey_successor(iter->pos); + goto recheck; + } + + if (!k.k) + k.k = &iter->nodes[0]->key.k; + + bch2_key_resize(&n, + min_t(u64, KEY_SIZE_MAX, + (k.k->p.inode == n.p.inode + ? 
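+				/*
+				 * Hole sizing, by example (illustrative
+				 * values): at pos 5:100 with the next key
+				 * starting at 5:200 in the same inode, the
+				 * synthesized hole has size 100; with no
+				 * next key in inode 5 it extends towards
+				 * KEY_OFFSET_MAX, capped at KEY_SIZE_MAX.
+				 */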
bkey_start_offset(k.k) + : KEY_OFFSET_MAX) - + n.p.offset)); + + EBUG_ON(!n.size); + } + + iter->k = n; + return (struct bkey_s_c) { &iter->k, NULL }; + } else if (!bkey_deleted(k.k)) { + return k; + } else { + __btree_iter_advance(iter); + } + } +} + +void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned depth) +{ + iter->level = depth; + /* bch2_bkey_ops isn't used much, this would be a cache miss */ + /* iter->is_extents = bch2_bkey_ops[btree_id]->is_extents; */ + iter->is_extents = btree_id == BTREE_ID_EXTENTS; + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; + iter->locks_want = min(locks_want, BTREE_MAX_DEPTH); + iter->btree_id = btree_id; + iter->at_end_of_leaf = 0; + iter->error = 0; + iter->c = c; + iter->pos = pos; + memset(iter->nodes, 0, sizeof(iter->nodes)); + iter->nodes[iter->level] = BTREE_ITER_NOT_END; + iter->next = iter; + + prefetch(c->btree_roots[btree_id].b); +} + +void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) +{ + BUG_ON(btree_iter_linked(new)); + + new->next = iter->next; + iter->next = new; + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + unsigned nr_iters = 1; + + for_each_linked_btree_iter(iter, new) + nr_iters++; + + BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE); + } +} + +void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) +{ + bch2_btree_iter_unlock(dst); + memcpy(dst, src, offsetof(struct btree_iter, next)); + dst->nodes_locked = dst->nodes_intent_locked = 0; +} diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h new file mode 100644 index 00000000..39731f0b --- /dev/null +++ b/libbcachefs/btree_iter.h @@ -0,0 +1,282 @@ +#ifndef _BCACHE_BTREE_ITER_H +#define _BCACHE_BTREE_ITER_H + +#include "btree_types.h" + +struct btree_iter { + /* Current btree depth */ + u8 level; + + /* + * Used in bch2_btree_iter_traverse(), to indicate whether we're + * searching for @pos or the first key strictly greater than @pos + */ + u8 is_extents; + + /* Bitmasks for read/intent locks held per level */ + u8 nodes_locked; + u8 nodes_intent_locked; + + /* Btree level below which we start taking intent locks */ + u8 locks_want; + + enum btree_id btree_id:8; + + /* + * indicates we need to call bch2_btree_iter_traverse() to revalidate + * iterator: + */ + u8 at_end_of_leaf; + + s8 error; + + struct bch_fs *c; + + /* Current position of the iterator */ + struct bpos pos; + + u32 lock_seq[BTREE_MAX_DEPTH]; + + /* + * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root(). + * + * This is because iter->nodes[iter->level] == NULL is how + * btree_iter_next_node() knows that it's finished with a depth first + * traversal. Just unlocking a node (with btree_node_unlock()) is fine, + * and if you really don't want that node used again (e.g. btree_split() + * freed it) decrementing lock_seq will cause bch2_btree_node_relock() to + * always fail (but since freeing a btree node takes a write lock on the + * node, which increments the node's lock seq, that's not actually + * necessary in that example). + * + * One extra slot for a sentinel NULL: + */ + struct btree *nodes[BTREE_MAX_DEPTH + 1]; + struct btree_node_iter node_iters[BTREE_MAX_DEPTH]; + + /* + * Current unpacked key - so that bch2_btree_iter_next()/ + * bch2_btree_iter_next_with_holes() can correctly advance pos. + */ + struct bkey k; + + /* + * Circular linked list of linked iterators: linked iterators share + * locks (e.g. 
two linked iterators may have the same node intent + * locked, or read and write locked, at the same time), and insertions + * through one iterator won't invalidate the other linked iterators. + */ + + /* Must come last: */ + struct btree_iter *next; +}; + +static inline bool btree_iter_linked(const struct btree_iter *iter) +{ + return iter->next != iter; +} + +/** + * for_each_linked_btree_iter - iterate over all iterators linked with @_iter + */ +#define for_each_linked_btree_iter(_iter, _linked) \ + for ((_linked) = (_iter)->next; \ + (_linked) != (_iter); \ + (_linked) = (_linked)->next) + +static inline struct btree_iter * +__next_linked_btree_node(struct btree_iter *iter, struct btree *b, + struct btree_iter *linked) +{ + do { + linked = linked->next; + + if (linked == iter) + return NULL; + + /* + * We don't compare the low bits of the lock sequence numbers + * because @iter might have taken a write lock on @b, and we + * don't want to skip the linked iterator if the sequence + * numbers were equal before taking that write lock. The lock + * sequence number is incremented by taking and releasing write + * locks and is even when unlocked: + */ + } while (linked->nodes[b->level] != b || + linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1); + + return linked; +} + +/** + * for_each_linked_btree_node - iterate over all iterators linked with @_iter + * that also point to @_b + * + * @_b is assumed to be locked by @_iter + * + * Filters out iterators that don't have a valid btree_node iterator for @_b - + * i.e. iterators for which bch2_btree_node_relock() would not succeed. + */ +#define for_each_linked_btree_node(_iter, _b, _linked) \ + for ((_linked) = (_iter); \ + ((_linked) = __next_linked_btree_node(_iter, _b, _linked));) + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_iter_verify(struct btree_iter *, struct btree *); +#else +static inline void bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) {} +#endif + +void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bset_tree *, + struct bkey_packed *, unsigned, unsigned); + +int bch2_btree_iter_unlock(struct btree_iter *); +bool __bch2_btree_iter_set_locks_want(struct btree_iter *, unsigned); + +static inline bool bch2_btree_iter_set_locks_want(struct btree_iter *iter, + unsigned new_locks_want) +{ + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + if (iter->locks_want == new_locks_want && + iter->nodes_intent_locked == (1 << new_locks_want) - 1) + return true; + + return __bch2_btree_iter_set_locks_want(iter, new_locks_want); +} + +bool bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); +void bch2_btree_iter_node_drop_linked(struct btree_iter *, struct btree *); +void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); + +void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); + +int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); + +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *); +void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); +void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); +void bch2_btree_iter_advance_pos(struct btree_iter *); +void bch2_btree_iter_rewind(struct btree_iter *, struct bpos); + +void __bch2_btree_iter_init(struct btree_iter 
*, struct bch_fs *, + enum btree_id, struct bpos, unsigned , unsigned); + +static inline void bch2_btree_iter_init(struct btree_iter *iter, + struct bch_fs *c, + enum btree_id btree_id, + struct bpos pos) +{ + __bch2_btree_iter_init(iter, c, btree_id, pos, 0, 0); +} + +static inline void bch2_btree_iter_init_intent(struct btree_iter *iter, + struct bch_fs *c, + enum btree_id btree_id, + struct bpos pos) +{ + __bch2_btree_iter_init(iter, c, btree_id, pos, 1, 0); +} + +void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *); +void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *); + +static inline struct bpos btree_type_successor(enum btree_id id, + struct bpos pos) +{ + if (id == BTREE_ID_INODES) { + pos.inode++; + pos.offset = 0; + } else if (id != BTREE_ID_EXTENTS) { + pos = bkey_successor(pos); + } + + return pos; +} + +static inline int __btree_iter_cmp(enum btree_id id, + struct bpos pos, + const struct btree_iter *r) +{ + if (id != r->btree_id) + return id < r->btree_id ? -1 : 1; + return bkey_cmp(pos, r->pos); +} + +static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) +{ + return __btree_iter_cmp(l->btree_id, l->pos, r); +} + +#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \ + _b, _locks_want) \ + for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \ + _start, _locks_want, _depth), \ + (_iter)->is_extents = false, \ + _b = bch2_btree_iter_peek_node(_iter); \ + (_b); \ + (_b) = bch2_btree_iter_next_node(_iter, _depth)) + +#define for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b) \ + __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b, 0) + +#define __for_each_btree_key(_iter, _c, _btree_id, _start, \ + _k, _locks_want) \ + for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \ + _start, _locks_want, 0); \ + !IS_ERR_OR_NULL(((_k) = bch2_btree_iter_peek(_iter)).k); \ + bch2_btree_iter_advance_pos(_iter)) + +#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \ + __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 0) + +#define for_each_btree_key_intent(_iter, _c, _btree_id, _start, _k) \ + __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 1) + +#define __for_each_btree_key_with_holes(_iter, _c, _btree_id, \ + _start, _k, _locks_want) \ + for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \ + _start, _locks_want, 0); \ + !IS_ERR_OR_NULL(((_k) = bch2_btree_iter_peek_with_holes(_iter)).k);\ + bch2_btree_iter_advance_pos(_iter)) + +#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \ + __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 0) + +#define for_each_btree_key_with_holes_intent(_iter, _c, _btree_id, \ + _start, _k) \ + __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1) + +static inline int btree_iter_err(struct bkey_s_c k) +{ + return IS_ERR(k.k) ? 
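+/*
+ * Example use of the iteration macros above (a sketch - error handling
+ * collapsed, btree id arbitrary):
+ *
+ *	struct btree_iter iter;
+ *	struct bkey_s_c k;
+ *	int ret;
+ *
+ *	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) {
+ *		// use k; the loop ends at the end of the btree or on error
+ *	}
+ *	ret = bch2_btree_iter_unlock(&iter);
+ */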
PTR_ERR(k.k) : 0; +} + +/* + * Unlocks before scheduling + * Note: does not revalidate iterator + */ +static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) +{ + struct btree_iter *linked; + + if (need_resched()) { + for_each_linked_btree_iter(iter, linked) + bch2_btree_iter_unlock(linked); + bch2_btree_iter_unlock(iter); + schedule(); + } else if (race_fault()) { + for_each_linked_btree_iter(iter, linked) + bch2_btree_iter_unlock(linked); + bch2_btree_iter_unlock(iter); + } +} + +#endif /* _BCACHE_BTREE_ITER_H */ diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h new file mode 100644 index 00000000..27709d1d --- /dev/null +++ b/libbcachefs/btree_locking.h @@ -0,0 +1,116 @@ +#ifndef _BCACHE_BTREE_LOCKING_H +#define _BCACHE_BTREE_LOCKING_H + +/* + * Only for internal btree use: + * + * The btree iterator tracks what locks it wants to take, and what locks it + * currently has - here we have wrappers for locking/unlocking btree nodes and + * updating the iterator state + */ + +#include "btree_iter.h" +#include "six.h" + +/* matches six lock types */ +enum btree_node_locked_type { + BTREE_NODE_UNLOCKED = -1, + BTREE_NODE_READ_LOCKED = SIX_LOCK_read, + BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, +}; + +static inline int btree_node_locked_type(struct btree_iter *iter, + unsigned level) +{ + /* + * We're relying on the fact that if nodes_intent_locked is set + * nodes_locked must be set as well, so that we can compute without + * branches: + */ + return BTREE_NODE_UNLOCKED + + ((iter->nodes_locked >> level) & 1) + + ((iter->nodes_intent_locked >> level) & 1); +} + +static inline bool btree_node_intent_locked(struct btree_iter *iter, + unsigned level) +{ + return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; +} + +static inline bool btree_node_read_locked(struct btree_iter *iter, + unsigned level) +{ + return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; +} + +static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) +{ + return iter->nodes_locked & (1 << level); +} + +static inline void mark_btree_node_unlocked(struct btree_iter *iter, + unsigned level) +{ + iter->nodes_locked &= ~(1 << level); + iter->nodes_intent_locked &= ~(1 << level); +} + +static inline void mark_btree_node_locked(struct btree_iter *iter, + unsigned level, + enum six_lock_type type) +{ + /* relying on this to avoid a branch */ + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + + iter->nodes_locked |= 1 << level; + iter->nodes_intent_locked |= type << level; +} + +static inline void mark_btree_node_intent_locked(struct btree_iter *iter, + unsigned level) +{ + mark_btree_node_locked(iter, level, SIX_LOCK_intent); +} + +static inline enum six_lock_type +btree_lock_want(struct btree_iter *iter, int level) +{ + return level < iter->locks_want + ? 
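+		/*
+		 * e.g. with locks_want == 1 (bch2_btree_iter_init_intent())
+		 * level 0 - the leaf - is taken with an intent lock and all
+		 * interior levels with read locks.
+		 */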
SIX_LOCK_intent + : SIX_LOCK_read; +} + +static inline bool btree_want_intent(struct btree_iter *iter, int level) +{ + return btree_lock_want(iter, level) == SIX_LOCK_intent; +} + +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +{ + int lock_type = btree_node_locked_type(iter, level); + + if (lock_type != BTREE_NODE_UNLOCKED) + six_unlock_type(&iter->nodes[level]->lock, lock_type); + mark_btree_node_unlocked(iter, level); +} + +bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, + struct btree_iter *, enum six_lock_type); + +static inline bool btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, + struct btree_iter *iter, + enum six_lock_type type) +{ + return likely(six_trylock_type(&b->lock, type)) || + __bch2_btree_node_lock(b, pos, level, iter, type); +} + +bool bch2_btree_node_relock(struct btree_iter *, unsigned); + +void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); +void bch2_btree_node_lock_write(struct btree *, struct btree_iter *); + +#endif /* _BCACHE_BTREE_LOCKING_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h new file mode 100644 index 00000000..915e42c2 --- /dev/null +++ b/libbcachefs/btree_types.h @@ -0,0 +1,309 @@ +#ifndef _BCACHE_BTREE_TYPES_H +#define _BCACHE_BTREE_TYPES_H + +#include <linux/list.h> +#include <linux/rhashtable.h> +#include <linux/semaphore.h> +#include <linux/workqueue.h> + +#include "bkey_methods.h" +#include "journal_types.h" +#include "six.h" + +struct open_bucket; +struct btree_interior_update; + +#define MAX_BSETS 3U + +struct btree_nr_keys { + + /* + * Amount of live metadata (i.e. size of node after a compaction) in + * units of u64s + */ + u16 live_u64s; + u16 bset_u64s[MAX_BSETS]; + + /* live keys only: */ + u16 packed_keys; + u16 unpacked_keys; +}; + +struct bset_tree { + /* + * We construct a binary tree in an array as if the array + * started at 1, so that things line up on the same cachelines + * better: see comments in bset.c at cacheline_to_bkey() for + * details + */ + + /* size of the binary tree and prev array */ + u16 size; + + /* function of size - precalculated for to_inorder() */ + u16 extra; + + u16 data_offset; + u16 aux_data_offset; + u16 end_offset; + + struct bpos max_key; +}; + +struct btree_write { + struct journal_entry_pin journal; + struct closure_waitlist wait; +}; + +struct btree { + /* Hottest entries first */ + struct rhash_head hash; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + + struct six_lock lock; + + unsigned long flags; + u16 written; + u8 level; + u8 btree_id; + u8 nsets; + u8 nr_key_bits; + + struct bkey_format format; + + struct btree_node *data; + void *aux_data; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; + + struct btree_nr_keys nr; + u16 sib_u64s[2]; + u16 whiteout_u64s; + u16 uncompacted_whiteout_u64s; + u8 page_order; + u8 unpack_fn_len; + + /* + * XXX: add a delete sequence number, so when bch2_btree_node_relock() + * fails because the lock sequence number has changed - i.e. 
the + * contents were modified - we can still relock the node if it's still + * the one we want, without redoing the traversal + */ + + /* + * For asynchronous splits/interior node updates: + * When we do a split, we allocate new child nodes and update the parent + * node to point to them: we update the parent in memory immediately, + * but then we must wait until the children have been written out before + * the update to the parent can be written - this is a list of the + * btree_interior_updates that are blocking this node from being + * written: + */ + struct list_head write_blocked; + + struct open_bucket *ob; + + /* lru list */ + struct list_head list; + + struct btree_write writes[2]; + +#ifdef CONFIG_BCACHEFS_DEBUG + bool *expensive_debug_checks; +#endif +}; + +#define BTREE_FLAG(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void clear_btree_node_ ## flag(struct btree *b) \ +{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } + +enum btree_flags { + BTREE_NODE_read_error, + BTREE_NODE_write_error, + BTREE_NODE_dirty, + BTREE_NODE_noevict, + BTREE_NODE_write_idx, + BTREE_NODE_accessed, + BTREE_NODE_write_in_flight, + BTREE_NODE_just_written, +}; + +BTREE_FLAG(read_error); +BTREE_FLAG(write_error); +BTREE_FLAG(dirty); +BTREE_FLAG(noevict); +BTREE_FLAG(write_idx); +BTREE_FLAG(accessed); +BTREE_FLAG(write_in_flight); +BTREE_FLAG(just_written); + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline struct bset_tree *bset_tree_last(struct btree *b) +{ + EBUG_ON(!b->nsets); + return b->set + b->nsets - 1; +} + +static inline struct bset *bset(const struct btree *b, + const struct bset_tree *t) +{ + return (void *) b->data + t->data_offset * sizeof(u64); +} + +static inline struct bset *btree_bset_first(struct btree *b) +{ + return bset(b, b->set); +} + +static inline struct bset *btree_bset_last(struct btree *b) +{ + return bset(b, bset_tree_last(b)); +} + +static inline u16 +__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) +{ + size_t ret = (u64 *) k - (u64 *) b->data - 1; + + EBUG_ON(ret > U16_MAX); + return ret; +} + +static inline struct bkey_packed * +__btree_node_offset_to_key(const struct btree *b, u16 k) +{ + return (void *) ((u64 *) b->data + k + 1); +} + +#define btree_bkey_first(_b, _t) (bset(_b, _t)->start) + +#define btree_bkey_last(_b, _t) \ +({ \ + EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ + vstruct_last(bset(_b, _t))); \ + \ + __btree_node_offset_to_key(_b, (_t)->end_offset); \ +}) + +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +{ + t->end_offset = + __btree_node_key_to_offset(b, vstruct_last(bset(b, t))); + btree_bkey_last(b, t); +} + +static inline void set_btree_bset(struct btree *b, struct bset_tree *t, + const struct bset *i) +{ + t->data_offset = (u64 *) i - (u64 *) b->data; + + EBUG_ON(bset(b, t) != i); + + set_btree_bset_end(b, t); +} + +static inline unsigned bset_byte_offset(struct btree *b, void *i) +{ + return i - (void *) b->data; +} + +/* Type of keys @b contains: */ +static inline enum bkey_type btree_node_type(struct btree *b) +{ + return b->level ? 
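+	/*
+	 * Interior nodes always hold btree pointers; leaves hold keys of
+	 * the btree's own type. The enum values are assumed to line up, so
+	 * e.g. a leaf in BTREE_ID_EXTENTS yields BKEY_TYPE_EXTENTS
+	 * (cf. btree_node_is_extents() below).
+	 */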
BKEY_TYPE_BTREE : b->btree_id; +} + +static inline const struct bkey_ops *btree_node_ops(struct btree *b) +{ + return bch2_bkey_ops[btree_node_type(b)]; +} + +static inline bool btree_node_has_ptrs(struct btree *b) +{ + return btree_type_has_ptrs(btree_node_type(b)); +} + +static inline bool btree_node_is_extents(struct btree *b) +{ + return btree_node_type(b) == BKEY_TYPE_EXTENTS; +} + +struct btree_root { + struct btree *b; + + struct btree_interior_update *as; + + /* On disk root - see async splits: */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u8 level; + u8 alive; +}; + +/* + * Optional hook that will be called just prior to a btree node update, when + * we're holding the write lock and we know what key is about to be overwritten: + */ + +struct btree_iter; +struct btree_node_iter; + +enum extent_insert_hook_ret { + BTREE_HOOK_DO_INSERT, + BTREE_HOOK_NO_INSERT, + BTREE_HOOK_RESTART_TRANS, +}; + +struct extent_insert_hook { + enum extent_insert_hook_ret + (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, + struct bkey_s_c, const struct bkey_i *); +}; + +enum btree_insert_ret { + BTREE_INSERT_OK, + /* extent spanned multiple leaf nodes: have to traverse to next node: */ + BTREE_INSERT_NEED_TRAVERSE, + /* write lock held for too long */ + BTREE_INSERT_NEED_RESCHED, + /* leaf node needs to be split */ + BTREE_INSERT_BTREE_NODE_FULL, + BTREE_INSERT_JOURNAL_RES_FULL, + BTREE_INSERT_ENOSPC, + BTREE_INSERT_NEED_GC_LOCK, +}; + +enum btree_gc_coalesce_fail_reason { + BTREE_GC_COALESCE_FAIL_RESERVE_GET, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, + BTREE_GC_COALESCE_FAIL_FORMAT_FITS, +}; + +typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, + struct btree *, + struct btree_node_iter *); + +#endif /* _BCACHE_BTREE_TYPES_H */ diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c new file mode 100644 index 00000000..51dff1b7 --- /dev/null +++ b/libbcachefs/btree_update.c @@ -0,0 +1,2344 @@ + +#include "bcachefs.h" +#include "alloc.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "extents.h" +#include "journal.h" +#include "keylist.h" +#include "super-io.h" + +#include <linux/random.h> +#include <linux/sort.h> +#include <trace/events/bcachefs.h> + +static void btree_interior_update_updated_root(struct bch_fs *, + struct btree_interior_update *, + enum btree_id); + +/* Calculate ideal packed bkey format for new btree nodes: */ + +void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) +{ + struct bkey_packed *k; + struct bset_tree *t; + struct bkey uk; + + bch2_bkey_format_add_pos(s, b->data->min_key); + + for_each_bset(b, t) + for (k = btree_bkey_first(b, t); + k != btree_bkey_last(b, t); + k = bkey_next(k)) + if (!bkey_whiteout(k)) { + uk = bkey_unpack_key(b, k); + bch2_bkey_format_add_key(s, &uk); + } +} + +static struct bkey_format bch2_btree_calc_format(struct btree *b) +{ + struct bkey_format_state s; + + bch2_bkey_format_init(&s); + __bch2_btree_calc_format(&s, b); + + return bch2_bkey_format_done(&s); +} + +static size_t btree_node_u64s_with_format(struct btree *b, + struct bkey_format *new_f) +{ + struct bkey_format *old_f = &b->format; + + /* stupid integer promotion rules */ + ssize_t delta = + (((int) new_f->key_u64s - old_f->key_u64s) * + (int) b->nr.packed_keys) + + (((int) new_f->key_u64s - BKEY_U64s) * + (int) b->nr.unpacked_keys); + + BUG_ON(delta + 
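+	/*
+	 * Worked example (hypothetical numbers): shrinking key_u64s from 3
+	 * to 2 with 100 packed and 10 unpacked keys, and BKEY_U64s == 5,
+	 * gives delta = (2 - 3) * 100 + (2 - 5) * 10 = -130 u64s.
+	 */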
b->nr.live_u64s < 0); + + return b->nr.live_u64s + delta; +} + +/** + * btree_node_format_fits - check if we could rewrite node with a new format + * + * This assumes all keys can pack with the new format -- it just checks if + * the re-packed keys would fit inside the node itself. + */ +bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, + struct bkey_format *new_f) +{ + size_t u64s = btree_node_u64s_with_format(b, new_f); + + return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); +} + +/* Btree node freeing/allocation: */ + +/* + * We're doing the index update that makes @b unreachable, update stuff to + * reflect that: + * + * Must be called _before_ btree_interior_update_updated_root() or + * btree_interior_update_updated_btree: + */ +static void bch2_btree_node_free_index(struct bch_fs *c, struct btree *b, + enum btree_id id, struct bkey_s_c k, + struct bch_fs_usage *stats) +{ + struct btree_interior_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&c->btree_interior_update_lock); + + for_each_pending_btree_node_free(c, as, d) + if (!bkey_cmp(k.k->p, d->key.k.p) && + bkey_val_bytes(k.k) == bkey_val_bytes(&d->key.k) && + !memcmp(k.v, &d->key.v, bkey_val_bytes(k.k))) + goto found; + + BUG(); +found: + d->index_update_done = true; + + /* + * Btree nodes are accounted as freed in bch_alloc_stats when they're + * freed from the index: + */ + stats->s[S_COMPRESSED][S_META] -= c->sb.btree_node_size; + stats->s[S_UNCOMPRESSED][S_META] -= c->sb.btree_node_size; + + /* + * We're dropping @k from the btree, but it's still live until the + * index update is persistent so we need to keep a reference around for + * mark and sweep to find - that's primarily what the + * btree_node_pending_free list is for. + * + * So here (when we set index_update_done = true), we're moving an + * existing reference to a different part of the larger "gc keyspace" - + * and the new position comes after the old position, since GC marks + * the pending free list after it walks the btree. + * + * If we move the reference while mark and sweep is _between_ the old + * and the new position, mark and sweep will see the reference twice + * and it'll get double accounted - so check for that here and subtract + * to cancel out one of mark and sweep's markings if necessary: + */ + + /* + * bch2_mark_key() compares the current gc pos to the pos we're + * moving this reference from, hence one comparison here: + */ + if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { + struct bch_fs_usage tmp = { 0 }; + + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + -c->sb.btree_node_size, true, b + ? 
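+			/*
+			 * the position we're moving the reference from:
+			 * the node @k lived in, or the btree root when
+			 * we're replacing the root itself:
+			 */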
gc_pos_btree_node(b) + : gc_pos_btree_root(id), + &tmp, 0); + /* + * Don't apply tmp - pending deletes aren't tracked in + * bch_alloc_stats: + */ + } + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void __btree_node_free(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + trace_btree_node_free(c, b); + + BUG_ON(b == btree_node_root(c, b)); + BUG_ON(b->ob); + BUG_ON(!list_empty(&b->write_blocked)); + + six_lock_write(&b->lock); + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); + clear_btree_node_dirty(b); + + bch2_btree_node_hash_remove(c, b); + + mutex_lock(&c->btree_cache_lock); + list_move(&b->list, &c->btree_cache_freeable); + mutex_unlock(&c->btree_cache_lock); + + /* + * By using six_unlock_write() directly instead of + * bch2_btree_node_unlock_write(), we don't update the iterator's + * sequence numbers and cause future bch2_btree_node_relock() calls to + * fail: + */ + six_unlock_write(&b->lock); +} + +void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) +{ + struct open_bucket *ob = b->ob; + + b->ob = NULL; + + __btree_node_free(c, b, NULL); + + bch2_open_bucket_put(c, ob); +} + +void bch2_btree_node_free_inmem(struct btree_iter *iter, struct btree *b) +{ + bch2_btree_iter_node_drop_linked(iter, b); + + __btree_node_free(iter->c, b, iter); + + bch2_btree_iter_node_drop(iter, b); +} + +static void bch2_btree_node_free_ondisk(struct bch_fs *c, + struct pending_btree_node_free *pending) +{ + struct bch_fs_usage stats = { 0 }; + + BUG_ON(!pending->index_update_done); + + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + -c->sb.btree_node_size, true, + gc_phase(GC_PHASE_PENDING_DELETE), + &stats, 0); + /* + * Don't apply stats - pending deletes aren't tracked in + * bch_alloc_stats: + */ +} + +void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) +{ + bch2_open_bucket_put(c, b->ob); + b->ob = NULL; +} + +static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + bool use_reserve, + struct disk_reservation *res, + struct closure *cl) +{ + BKEY_PADDED(k) tmp; + struct open_bucket *ob; + struct btree *b; + unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE; + + mutex_lock(&c->btree_reserve_cache_lock); + if (c->btree_reserve_cache_nr > reserve) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + ob = a->ob; + bkey_copy(&tmp.k, &a->k); + mutex_unlock(&c->btree_reserve_cache_lock); + goto mem_alloc; + } + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: + /* alloc_sectors is weird, I suppose */ + bkey_extent_init(&tmp.k); + tmp.k.k.size = c->sb.btree_node_size, + + ob = bch2_alloc_sectors(c, &c->btree_write_point, + bkey_i_to_extent(&tmp.k), + res->nr_replicas, + c->opts.metadata_replicas_required, + use_reserve ? 
RESERVE_BTREE : RESERVE_NONE, + cl); + if (IS_ERR(ob)) + return ERR_CAST(ob); + + if (tmp.k.k.size < c->sb.btree_node_size) { + bch2_open_bucket_put(c, ob); + goto retry; + } +mem_alloc: + b = bch2_btree_node_mem_alloc(c); + + /* we hold cannibalize_lock: */ + BUG_ON(IS_ERR(b)); + BUG_ON(b->ob); + + bkey_copy(&b->key, &tmp.k); + b->key.k.size = 0; + b->ob = ob; + + return b; +} + +static struct btree *bch2_btree_node_alloc(struct bch_fs *c, + unsigned level, enum btree_id id, + struct btree_reserve *reserve) +{ + struct btree *b; + + BUG_ON(!reserve->nr); + + b = reserve->b[--reserve->nr]; + + BUG_ON(bch2_btree_node_hash_insert(c, b, level, id)); + + set_btree_node_accessed(b); + set_btree_node_dirty(b); + + bch2_bset_init_first(b, &b->data->keys); + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, id); + SET_BTREE_NODE_LEVEL(b->data, level); + b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr; + + bch2_btree_build_aux_trees(b); + + bch2_check_mark_super(c, &b->key, true); + + trace_btree_node_alloc(c, b); + return b; +} + +struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c, + struct btree *b, + struct bkey_format format, + struct btree_reserve *reserve) +{ + struct btree *n; + + n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve); + + n->data->min_key = b->data->min_key; + n->data->max_key = b->data->max_key; + n->data->format = format; + + btree_node_set_format(n, format); + + bch2_btree_sort_into(c, n, b); + + btree_node_reset_sib_u64s(n); + + n->key.k.p = b->key.k.p; + return n; +} + +static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c, + struct btree *b, + struct btree_reserve *reserve) +{ + struct bkey_format new_f = bch2_btree_calc_format(b); + + /* + * The keys might expand with the new format - if they wouldn't fit in + * the btree node anymore, use the old format for now: + */ + if (!bch2_btree_node_format_fits(c, b, &new_f)) + new_f = b->format; + + return __bch2_btree_node_alloc_replacement(c, b, new_f, reserve); +} + +static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b, + struct btree_reserve *btree_reserve) +{ + struct btree *old = btree_node_root(c, b); + + /* Root nodes cannot be reaped */ + mutex_lock(&c->btree_cache_lock); + list_del_init(&b->list); + mutex_unlock(&c->btree_cache_lock); + + mutex_lock(&c->btree_root_lock); + btree_node_root(c, b) = b; + mutex_unlock(&c->btree_root_lock); + + if (btree_reserve) { + /* + * New allocation (we're not being called because we're in + * bch2_btree_root_read()) - do marking while holding + * btree_root_lock: + */ + struct bch_fs_usage stats = { 0 }; + + bch2_mark_key(c, bkey_i_to_s_c(&b->key), + c->sb.btree_node_size, true, + gc_pos_btree_root(b->btree_id), + &stats, 0); + + if (old) + bch2_btree_node_free_index(c, NULL, old->btree_id, + bkey_i_to_s_c(&old->key), + &stats); + bch2_fs_usage_apply(c, &stats, &btree_reserve->disk_res, + gc_pos_btree_root(b->btree_id)); + } + + bch2_recalc_btree_reserve(c); +} + +static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b) +{ + struct btree_root *r = &c->btree_roots[b->btree_id]; + + mutex_lock(&c->btree_root_lock); + + BUG_ON(b != r->b); + bkey_copy(&r->key, &b->key); + r->level = b->level; + r->alive = true; + + mutex_unlock(&c->btree_root_lock); +} + +/* + * Only for filesystem bringup, when first reading the btree roots or allocating + * btree roots when initializing a new filesystem: + */ +void 
bch2_btree_set_root_initial(struct bch_fs *c, struct btree *b, + struct btree_reserve *btree_reserve) +{ + BUG_ON(btree_node_root(c, b)); + + bch2_btree_set_root_inmem(c, b, btree_reserve); + bch2_btree_set_root_ondisk(c, b); +} + +/** + * bch_btree_set_root - update the root in memory and on disk + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. However, you must hold an intent lock on the + * old root. + * + * Note: This allocates a journal entry but doesn't add any keys to + * it. All the btree roots are part of every journal write, so there + * is nothing new to be done. This just guarantees that there is a + * journal write. + */ +static void bch2_btree_set_root(struct btree_iter *iter, struct btree *b, + struct btree_interior_update *as, + struct btree_reserve *btree_reserve) +{ + struct bch_fs *c = iter->c; + struct btree *old; + + trace_btree_set_root(c, b); + BUG_ON(!b->written); + + old = btree_node_root(c, b); + + /* + * Ensure no one is using the old root while we switch to the + * new root: + */ + bch2_btree_node_lock_write(old, iter); + + bch2_btree_set_root_inmem(c, b, btree_reserve); + + btree_interior_update_updated_root(c, as, iter->btree_id); + + /* + * Unlock old root after new root is visible: + * + * The new root isn't persistent, but that's ok: we still have + * an intent lock on the new root, and any updates that would + * depend on the new root would have to update the new root. + */ + bch2_btree_node_unlock_write(old, iter); +} + +static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level, + enum btree_id id, + struct btree_reserve *reserve) +{ + struct btree *b = bch2_btree_node_alloc(c, level, id, reserve); + + b->data->min_key = POS_MIN; + b->data->max_key = POS_MAX; + b->data->format = bch2_btree_calc_format(b); + b->key.k.p = POS_MAX; + + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); + + six_unlock_write(&b->lock); + + return b; +} + +void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) +{ + bch2_disk_reservation_put(c, &reserve->disk_res); + + mutex_lock(&c->btree_reserve_cache_lock); + + while (reserve->nr) { + struct btree *b = reserve->b[--reserve->nr]; + + six_unlock_write(&b->lock); + + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { + struct btree_alloc *a = + &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + + a->ob = b->ob; + b->ob = NULL; + bkey_copy(&a->k, &b->key); + } else { + bch2_open_bucket_put(c, b->ob); + b->ob = NULL; + } + + __btree_node_free(c, b, NULL); + + six_unlock_intent(&b->lock); + } + + mutex_unlock(&c->btree_reserve_cache_lock); + + mempool_free(reserve, &c->btree_reserve_pool); +} + +static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c, + unsigned nr_nodes, + unsigned flags, + struct closure *cl) +{ + struct btree_reserve *reserve; + struct btree *b; + struct disk_reservation disk_res = { 0, 0 }; + unsigned sectors = nr_nodes * c->sb.btree_node_size; + int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD| + BCH_DISK_RESERVATION_METADATA; + + if (flags & BTREE_INSERT_NOFAIL) + disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; + + /* + * This check isn't necessary for correctness - it's just to potentially + * prevent us from doing a lot of work that'll end up being wasted: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + return ERR_PTR(ret); + + if (bch2_disk_reservation_get(c, &disk_res, sectors, disk_res_flags)) + return ERR_PTR(-ENOSPC); + + 
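+	/*
+	 * Illustrative caller pattern (a sketch mirroring
+	 * bch2_btree_root_alloc() below) - blocking errors are retried after
+	 * waiting on @cl:
+	 *
+	 *	closure_init_stack(&cl);
+	 *	while (1) {
+	 *		reserve = __bch2_btree_reserve_get(c, nr_nodes, 0, &cl);
+	 *		if (!IS_ERR(reserve) || PTR_ERR(reserve) == -ENOSPC)
+	 *			break;
+	 *		closure_sync(&cl);
+	 *	}
+	 */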
BUG_ON(nr_nodes > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: + */ + ret = bch2_btree_node_cannibalize_lock(c, cl); + if (ret) { + bch2_disk_reservation_put(c, &disk_res); + return ERR_PTR(ret); + } + + reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO); + + reserve->disk_res = disk_res; + reserve->nr = 0; + + while (reserve->nr < nr_nodes) { + b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE, + &disk_res, cl); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err_free; + } + + reserve->b[reserve->nr++] = b; + } + + bch2_btree_node_cannibalize_unlock(c); + return reserve; +err_free: + bch2_btree_reserve_put(c, reserve); + bch2_btree_node_cannibalize_unlock(c); + trace_btree_reserve_get_fail(c, nr_nodes, cl); + return ERR_PTR(ret); +} + +struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, + struct btree *b, + unsigned extra_nodes, + unsigned flags, + struct closure *cl) +{ + unsigned depth = btree_node_root(c, b)->level - b->level; + unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes; + + return __bch2_btree_reserve_get(c, nr_nodes, flags, cl); + +} + +int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, + struct closure *writes) +{ + struct closure cl; + struct btree_reserve *reserve; + struct btree *b; + + closure_init_stack(&cl); + + while (1) { + /* XXX haven't calculated capacity yet :/ */ + reserve = __bch2_btree_reserve_get(c, 1, 0, &cl); + if (!IS_ERR(reserve)) + break; + + if (PTR_ERR(reserve) == -ENOSPC) + return PTR_ERR(reserve); + + closure_sync(&cl); + } + + b = __btree_root_alloc(c, 0, id, reserve); + + bch2_btree_node_write(c, b, writes, SIX_LOCK_intent, -1); + + bch2_btree_set_root_initial(c, b, reserve); + bch2_btree_open_bucket_put(c, b); + six_unlock_intent(&b->lock); + + bch2_btree_reserve_put(c, reserve); + + return 0; +} + +static void bch2_insert_fixup_btree_ptr(struct btree_iter *iter, + struct btree *b, + struct bkey_i *insert, + struct btree_node_iter *node_iter, + struct disk_reservation *disk_res) +{ + struct bch_fs *c = iter->c; + struct bch_fs_usage stats = { 0 }; + struct bkey_packed *k; + struct bkey tmp; + + if (bkey_extent_is_data(&insert->k)) + bch2_mark_key(c, bkey_i_to_s_c(insert), + c->sb.btree_node_size, true, + gc_pos_btree_node(b), &stats, 0); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) + bch2_btree_node_iter_advance(node_iter, b); + + /* + * If we're overwriting, look up pending delete and mark so that gc + * marks it on the pending delete list: + */ + if (k && !bkey_cmp_packed(b, k, &insert->k)) + bch2_btree_node_free_index(c, b, iter->btree_id, + bkey_disassemble(b, k, &tmp), + &stats); + + bch2_fs_usage_apply(c, &stats, disk_res, gc_pos_btree_node(b)); + + bch2_btree_bset_insert_key(iter, b, node_iter, insert); + set_btree_node_dirty(b); +} + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + const struct bkey_format *f = &b->format; + struct bkey_packed *k; + struct bset_tree *t; + unsigned clobber_u64s; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 
|| + bkey_cmp(insert->k.p, b->data->max_key) > 0); + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b)); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && !bkey_cmp_packed(b, k, &insert->k)) { + BUG_ON(bkey_whiteout(k)); + + t = bch2_bkey_to_bset(b, k); + + if (bset_unwritten(b, bset(b, t)) && + bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { + BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k)); + + k->type = insert->k.type; + memcpy_u64s(bkeyp_val(f, k), &insert->v, + bkey_val_u64s(&insert->k)); + return true; + } + + insert->k.needs_whiteout = k->needs_whiteout; + + btree_keys_account_key_drop(&b->nr, t - b->set, k); + + if (t == bset_tree_last(b)) { + clobber_u64s = k->u64s; + + /* + * If we're deleting, and the key we're deleting doesn't + * need a whiteout (it wasn't overwriting a key that had + * been written to disk) - just delete it: + */ + if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { + bch2_bset_delete(b, k, clobber_u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + k, clobber_u64s, 0); + return true; + } + + goto overwrite; + } + + k->type = KEY_TYPE_DELETED; + bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + k->u64s, k->u64s); + + if (bkey_whiteout(&insert->k)) { + reserve_whiteout(b, t, k); + return true; + } else { + k->needs_whiteout = false; + } + } else { + /* + * Deleting, but the key to delete wasn't found - nothing to do: + */ + if (bkey_whiteout(&insert->k)) + return false; + + insert->k.needs_whiteout = false; + } + + t = bset_tree_last(b); + k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + clobber_u64s = 0; +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) + bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + clobber_u64s, k->u64s); + return true; +} + +static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + + six_lock_read(&b->lock); + /* + * Reusing a btree node can race with the journal reclaim code calling + * the journal pin flush fn, and there's no good fix for this: we don't + * really want journal_pin_drop() to block until the flush fn is no + * longer running, because journal_pin_drop() is called from the btree + * node write endio function, and we can't wait on the flush fn to + * finish running in mca_reap() - where we make reused btree nodes ready + * to use again - because there, we're holding the lock this function + * needs - deadlock. 
+ * + * So, the b->level check is a hack so we don't try to write nodes we + * shouldn't: + */ + if (!b->level) + bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, i); + six_unlock_read(&b->lock); +} + +static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin) +{ + return __btree_node_flush(j, pin, 0); +} + +static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin) +{ + return __btree_node_flush(j, pin, 1); +} + +void bch2_btree_journal_key(struct btree_insert *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree *b = iter->nodes[0]; + struct btree_write *w = btree_current_write(b); + + EBUG_ON(iter->level || b->level); + EBUG_ON(!trans->journal_res.ref && + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + if (!journal_pin_active(&w->journal)) + bch2_journal_pin_add(j, &w->journal, + btree_node_write_idx(b) == 0 + ? btree_node_flush0 + : btree_node_flush1); + + if (trans->journal_res.ref) { + u64 seq = trans->journal_res.seq; + bool needs_whiteout = insert->k.needs_whiteout; + + /* + * have a bug where we're seeing an extent with an invalid crc + * entry in the journal, trying to track it down: + */ + BUG_ON(bch2_bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert))); + + /* ick */ + insert->k.needs_whiteout = false; + bch2_journal_add_keys(j, &trans->journal_res, + b->btree_id, insert); + insert->k.needs_whiteout = needs_whiteout; + + if (trans->journal_seq) + *trans->journal_seq = seq; + btree_bset_last(b)->journal_seq = cpu_to_le64(seq); + } + + if (!btree_node_dirty(b)) + set_btree_node_dirty(b); +} + +static enum btree_insert_ret +bch2_insert_fixup_key(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct btree_iter *iter = insert->iter; + + BUG_ON(iter->level); + + if (bch2_btree_bset_insert_key(iter, + iter->nodes[0], + &iter->node_iters[0], + insert->k)) + bch2_btree_journal_key(trans, iter, insert->k); + + trans->did_work = true; + return BTREE_INSERT_OK; +} + +static void verify_keys_sorted(struct keylist *l) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_i *k; + + for_each_keylist_key(l, k) + BUG_ON(bkey_next(k) != l->top && + bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); +#endif +} + +static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter) +{ + struct bch_fs *c = iter->c; + + bch2_btree_node_lock_write(b, iter); + + if (btree_node_just_written(b) && + bch2_btree_post_write_cleanup(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(c, b, iter); +} + +/* Asynchronous interior node update machinery */ + +struct btree_interior_update * +bch2_btree_interior_update_alloc(struct bch_fs *c) +{ + struct btree_interior_update *as; + + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, &c->cl); + as->c = c; + as->mode = BTREE_INTERIOR_NO_UPDATE; + + bch2_keylist_init(&as->parent_keys, as->inline_keys, + ARRAY_SIZE(as->inline_keys)); + + mutex_lock(&c->btree_interior_update_lock); + list_add(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + return as; +} + +static void btree_interior_update_free(struct closure *cl) +{ + struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl); + + mempool_free(as, 
&as->c->btree_interior_update_pool);
+}
+
+static void btree_interior_update_nodes_reachable(struct closure *cl)
+{
+	struct btree_interior_update *as =
+		container_of(cl, struct btree_interior_update, cl);
+	struct bch_fs *c = as->c;
+	unsigned i;
+
+	bch2_journal_pin_drop(&c->journal, &as->journal);
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	for (i = 0; i < as->nr_pending; i++)
+		bch2_btree_node_free_ondisk(c, &as->pending[i]);
+	as->nr_pending = 0;
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_del(&as->list);
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	closure_wake_up(&as->wait);
+
+	closure_return_with_destructor(cl, btree_interior_update_free);
+}
+
+static void btree_interior_update_nodes_written(struct closure *cl)
+{
+	struct btree_interior_update *as =
+		container_of(cl, struct btree_interior_update, cl);
+	struct bch_fs *c = as->c;
+	struct btree *b;
+
+	if (bch2_journal_error(&c->journal)) {
+		/* XXX what? */
+	}
+
+	/* XXX: missing error handling, damnit */
+
+	/* check for journal error, bail out if we flushed */
+
+	/*
+	 * We did an update to a parent node where the pointers we added pointed
+	 * to child nodes that weren't written yet: now, the child nodes have
+	 * been written so we can write out the update to the interior node.
+	 */
+retry:
+	mutex_lock(&c->btree_interior_update_lock);
+	switch (as->mode) {
+	case BTREE_INTERIOR_NO_UPDATE:
+		BUG();
+	case BTREE_INTERIOR_UPDATING_NODE:
+		/* The usual case: */
+		b = READ_ONCE(as->b);
+
+		if (!six_trylock_read(&b->lock)) {
+			mutex_unlock(&c->btree_interior_update_lock);
+			six_lock_read(&b->lock);
+			six_unlock_read(&b->lock);
+			goto retry;
+		}
+
+		BUG_ON(!btree_node_dirty(b));
+		closure_wait(&btree_current_write(b)->wait, cl);
+
+		list_del(&as->write_blocked_list);
+
+		if (list_empty(&b->write_blocked))
+			bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
+		six_unlock_read(&b->lock);
+		break;
+
+	case BTREE_INTERIOR_UPDATING_AS:
+		/*
+		 * The btree node we originally updated has been freed and is
+		 * being rewritten - so we don't need to write anything here,
+		 * we just need to signal to that btree_interior_update that
+		 * it's ok to make the new replacement node visible:
+		 */
+		closure_put(&as->parent_as->cl);
+
+		/*
+		 * and then we have to wait on that btree_interior_update to
+		 * finish:
+		 */
+		closure_wait(&as->parent_as->wait, cl);
+		break;
+
+	case BTREE_INTERIOR_UPDATING_ROOT:
+		/* b is the new btree root: */
+		b = READ_ONCE(as->b);
+
+		if (!six_trylock_read(&b->lock)) {
+			mutex_unlock(&c->btree_interior_update_lock);
+			six_lock_read(&b->lock);
+			six_unlock_read(&b->lock);
+			goto retry;
+		}
+
+		BUG_ON(c->btree_roots[b->btree_id].as != as);
+		c->btree_roots[b->btree_id].as = NULL;
+
+		bch2_btree_set_root_ondisk(c, b);
+
+		/*
+		 * We don't have to wait on anything here (before
+		 * btree_interior_update_nodes_reachable frees the old nodes
+		 * ondisk) - we've ensured that the very next journal write will
+		 * have the pointer to the new root, and before the allocator
+		 * can reuse the old nodes it'll have to do a journal commit:
+		 */
+		six_unlock_read(&b->lock);
+	}
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ */
+static void btree_interior_update_updated_btree(struct bch_fs *c,
+						struct btree_interior_update *as,
+						struct btree *b)
+{
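+	/*
+	 * Mode transition NO_UPDATE -> UPDATING_NODE: @b won't be written
+	 * out until this btree_interior_update completes - see the
+	 * write_blocked list handling in
+	 * btree_interior_update_nodes_written() above.
+	 */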
mutex_lock(&c->btree_interior_update_lock); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!btree_node_dirty(b)); + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); + + bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); + + continue_at(&as->cl, btree_interior_update_nodes_written, + system_freezable_wq); +} + +static void btree_interior_update_updated_root(struct bch_fs *c, + struct btree_interior_update *as, + enum btree_id btree_id) +{ + struct btree_root *r = &c->btree_roots[btree_id]; + + mutex_lock(&c->btree_interior_update_lock); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + + /* + * Old root might not be persistent yet - if so, redirect its + * btree_interior_update operation to point to us: + */ + if (r->as) { + BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT); + + r->as->b = NULL; + r->as->mode = BTREE_INTERIOR_UPDATING_AS; + r->as->parent_as = as; + closure_get(&as->cl); + } + + as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->b = r->b; + r->as = as; + + mutex_unlock(&c->btree_interior_update_lock); + + bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); + + continue_at(&as->cl, btree_interior_update_nodes_written, + system_freezable_wq); +} + +static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin) +{ + struct btree_interior_update *as = + container_of(pin, struct btree_interior_update, journal); + + bch2_journal_flush_seq_async(j, as->journal_seq, NULL); +} + +/* + * @b is being split/rewritten: it may have pointers to not-yet-written btree + * nodes and thus outstanding btree_interior_updates - redirect @b's + * btree_interior_updates to point to this btree_interior_update: + */ +void bch2_btree_interior_update_will_free_node(struct bch_fs *c, + struct btree_interior_update *as, + struct btree *b) +{ + struct btree_interior_update *p, *n; + struct pending_btree_node_free *d; + struct bset_tree *t; + + /* + * Does this node have data that hasn't been written in the journal? + * + * If so, we have to wait for the corresponding journal entry to be + * written before making the new nodes reachable - we can't just carry + * over the bset->journal_seq tracking, since we'll be mixing those keys + * in with keys that aren't in the journal anymore: + */ + for_each_bset(b, t) + as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq); + + /* + * Does this node have unwritten data that has a pin on the journal? + * + * If so, transfer that pin to the btree_interior_update operation - + * note that if we're freeing multiple nodes, we only need to keep the + * oldest pin of any of the nodes we're freeing. We'll release the pin + * when the new nodes are persistent and reachable on disk: + */ + bch2_journal_pin_add_if_older(&c->journal, + &b->writes[0].journal, + &as->journal, interior_update_flush); + bch2_journal_pin_add_if_older(&c->journal, + &b->writes[1].journal, + &as->journal, interior_update_flush); + + mutex_lock(&c->btree_interior_update_lock); + + /* + * Does this node have any btree_interior_update operations preventing + * it from being written? 
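	 * (i.e. earlier btree_interior_updates still on @b->write_blocked,
	 * each waiting for its own new nodes to finish writing?)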
+ * + * If so, redirect them to point to this btree_interior_update: we can + * write out our new nodes, but we won't make them visible until those + * operations complete + */ + list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { + BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE); + + p->mode = BTREE_INTERIOR_UPDATING_AS; + list_del(&p->write_blocked_list); + p->b = NULL; + p->parent_as = as; + closure_get(&as->cl); + } + + /* Add this node to the list of nodes being freed: */ + BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); + + d = &as->pending[as->nr_pending++]; + d->index_update_done = false; + d->seq = b->data->keys.seq; + d->btree_id = b->btree_id; + d->level = b->level; + bkey_copy(&d->key, &b->key); + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_node_interior_verify(struct btree *b) +{ + struct btree_node_iter iter; + struct bkey_packed *k; + + BUG_ON(!b->level); + + bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false); +#if 1 + BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || + bkey_cmp_left_packed(b, k, &b->key.k.p)); + + BUG_ON((bch2_btree_node_iter_advance(&iter, b), + !bch2_btree_node_iter_end(&iter))); +#else + const char *msg; + + msg = "not found"; + k = bch2_btree_node_iter_peek(&iter, b); + if (!k) + goto err; + + msg = "isn't what it should be"; + if (bkey_cmp_left_packed(b, k, &b->key.k.p)) + goto err; + + bch2_btree_node_iter_advance(&iter, b); + + msg = "isn't last key"; + if (!bch2_btree_node_iter_end(&iter)) + goto err; + return; +err: + bch2_dump_btree_node(b); + printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, + b->key.k.p.offset, msg); + BUG(); +#endif +} + +static enum btree_insert_ret +bch2_btree_insert_keys_interior(struct btree *b, + struct btree_iter *iter, + struct keylist *insert_keys, + struct btree_interior_update *as, + struct btree_reserve *res) +{ + struct bch_fs *c = iter->c; + struct btree_iter *linked; + struct btree_node_iter node_iter; + struct bkey_i *insert = bch2_keylist_front(insert_keys); + struct bkey_packed *k; + + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); + BUG_ON(!b->level); + BUG_ON(!as || as->b); + verify_keys_sorted(insert_keys); + + btree_node_lock_for_insert(b, iter); + + if (bch_keylist_u64s(insert_keys) > + bch_btree_keys_u64s_remaining(c, b)) { + bch2_btree_node_unlock_write(b, iter); + return BTREE_INSERT_BTREE_NODE_FULL; + } + + /* Don't screw up @iter's position: */ + node_iter = iter->node_iters[b->level]; + + /* + * btree_split(), btree_gc_coalesce() will insert keys before + * the iterator's current position - they know the keys go in + * the node the iterator points to: + */ + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && + (bkey_cmp_packed(b, k, &insert->k) >= 0)) + ; + + while (!bch2_keylist_empty(insert_keys)) { + insert = bch2_keylist_front(insert_keys); + + bch2_insert_fixup_btree_ptr(iter, b, insert, + &node_iter, &res->disk_res); + bch2_keylist_pop_front(insert_keys); + } + + btree_interior_update_updated_btree(c, as, b); + + for_each_linked_btree_node(iter, b, linked) + bch2_btree_node_iter_peek(&linked->node_iters[b->level], + b); + bch2_btree_node_iter_peek(&iter->node_iters[b->level], b); + + bch2_btree_iter_verify(iter, b); + + if (bch2_maybe_compact_whiteouts(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + bch2_btree_node_unlock_write(b, iter); + + btree_node_interior_verify(b); + return BTREE_INSERT_OK; +} + +/* + * Move keys from n1 (original replacement node, now lower node) to n2 
(higher + * node) + */ +static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1, + struct btree_reserve *reserve) +{ + size_t nr_packed = 0, nr_unpacked = 0; + struct btree *n2; + struct bset *set1, *set2; + struct bkey_packed *k, *prev = NULL; + + n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve); + n2->data->max_key = n1->data->max_key; + n2->data->format = n1->format; + n2->key.k.p = n1->key.k.p; + + btree_node_set_format(n2, n2->data->format); + + set1 = btree_bset_first(n1); + set2 = btree_bset_first(n2); + + /* + * Has to be a linear search because we don't have an auxiliary + * search tree yet + */ + k = set1->start; + while (1) { + if (bkey_next(k) == vstruct_last(set1)) + break; + if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) + break; + + if (bkey_packed(k)) + nr_packed++; + else + nr_unpacked++; + + prev = k; + k = bkey_next(k); + } + + BUG_ON(!prev); + + n1->key.k.p = bkey_unpack_pos(n1, prev); + n1->data->max_key = n1->key.k.p; + n2->data->min_key = + btree_type_successor(n1->btree_id, n1->key.k.p); + + set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); + set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); + + set_btree_bset_end(n1, n1->set); + set_btree_bset_end(n2, n2->set); + + n2->nr.live_u64s = le16_to_cpu(set2->u64s); + n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); + n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; + n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; + + n1->nr.live_u64s = le16_to_cpu(set1->u64s); + n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); + n1->nr.packed_keys = nr_packed; + n1->nr.unpacked_keys = nr_unpacked; + + BUG_ON(!set1->u64s); + BUG_ON(!set2->u64s); + + memcpy_u64s(set2->start, + vstruct_end(set1), + le16_to_cpu(set2->u64s)); + + btree_node_reset_sib_u64s(n1); + btree_node_reset_sib_u64s(n2); + + bch2_verify_btree_nr_keys(n1); + bch2_verify_btree_nr_keys(n2); + + if (n1->level) { + btree_node_interior_verify(n1); + btree_node_interior_verify(n2); + } + + return n2; +} + +/* + * For updates to interior nodes, we've got to do the insert before we split + * because the stuff we're inserting has to be inserted atomically. Post split, + * the keys might have to go in different nodes and the split would no longer be + * atomic. 
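 * (Concretely: if two keys in the keylist ended up straddling the pivot,
 * they'd have to go to different new nodes, and no single parent update
 * could make both visible at once.)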
+ * + * Worse, if the insert is from btree node coalescing, if we do the insert after + * we do the split (and pick the pivot) - the pivot we pick might be between + * nodes that were coalesced, and thus in the middle of a child node post + * coalescing: + */ +static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b, + struct keylist *keys, + struct btree_reserve *res) +{ + struct btree_node_iter node_iter; + struct bkey_i *k = bch2_keylist_front(keys); + struct bkey_packed *p; + struct bset *i; + + BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); + + bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false); + + while (!bch2_keylist_empty(keys)) { + k = bch2_keylist_front(keys); + + BUG_ON(bch_keylist_u64s(keys) > + bch_btree_keys_u64s_remaining(iter->c, b)); + BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); + BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); + + bch2_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res); + bch2_keylist_pop_front(keys); + } + + /* + * We can't tolerate whiteouts here - with whiteouts there can be + * duplicate keys, and it would be rather bad if we picked a duplicate + * for the pivot: + */ + i = btree_bset_first(b); + p = i->start; + while (p != vstruct_last(i)) + if (bkey_deleted(p)) { + le16_add_cpu(&i->u64s, -p->u64s); + set_btree_bset_end(b, b->set); + memmove_u64s_down(p, bkey_next(p), + (u64 *) vstruct_last(i) - + (u64 *) p); + } else + p = bkey_next(p); + + BUG_ON(b->nsets != 1 || + b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); + + btree_node_interior_verify(b); +} + +static void btree_split(struct btree *b, struct btree_iter *iter, + struct keylist *insert_keys, + struct btree_reserve *reserve, + struct btree_interior_update *as) +{ + struct bch_fs *c = iter->c; + struct btree *parent = iter->nodes[b->level + 1]; + struct btree *n1, *n2 = NULL, *n3 = NULL; + u64 start_time = local_clock(); + + BUG_ON(!parent && (b != btree_node_root(c, b))); + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); + + bch2_btree_interior_update_will_free_node(c, as, b); + + n1 = bch2_btree_node_alloc_replacement(c, b, reserve); + if (b->level) + btree_split_insert_keys(iter, n1, insert_keys, reserve); + + if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { + trace_btree_node_split(c, b, b->nr.live_u64s); + + n2 = __btree_split_node(iter, n1, reserve); + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n2->lock); + six_unlock_write(&n1->lock); + + bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1); + + /* + * Note that on recursive parent_keys == insert_keys, so we + * can't start adding new keys to parent_keys before emptying it + * out (which we did with btree_split_insert_keys() above) + */ + bch2_keylist_add(&as->parent_keys, &n1->key); + bch2_keylist_add(&as->parent_keys, &n2->key); + + if (!parent) { + /* Depth increases, make a new root */ + n3 = __btree_root_alloc(c, b->level + 1, + iter->btree_id, + reserve); + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; + + btree_split_insert_keys(iter, n3, &as->parent_keys, + reserve); + bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1); + } + } else { + trace_btree_node_compact(c, b, b->nr.live_u64s); + + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n1->lock); + + bch2_keylist_add(&as->parent_keys, &n1->key); + } + + bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1); + + /* New nodes all written, now make them visible: */ + + if (parent) { + /* Split a non 
root node */
+		bch2_btree_insert_node(parent, iter, &as->parent_keys,
+				       reserve, as);
+	} else if (n3) {
+		bch2_btree_set_root(iter, n3, as, reserve);
+	} else {
+		/* Root filled up but didn't need to be split */
+		bch2_btree_set_root(iter, n1, as, reserve);
+	}
+
+	bch2_btree_open_bucket_put(c, n1);
+	if (n2)
+		bch2_btree_open_bucket_put(c, n2);
+	if (n3)
+		bch2_btree_open_bucket_put(c, n3);
+
+	/*
+	 * Note - at this point other linked iterators could still have @b read
+	 * locked; we're depending on the bch2_btree_iter_node_replace() calls
+	 * below removing all references to @b so we don't return with other
+	 * iterators pointing to a node they have locked that's been freed.
+	 *
+	 * We have to free the node first because the bch2_iter_node_replace()
+	 * calls will drop _our_ iterator's reference - and intent lock - to @b.
+	 */
+	bch2_btree_node_free_inmem(iter, b);
+
+	/* Successful split, update the iterator to point to the new nodes: */
+
+	if (n3)
+		bch2_btree_iter_node_replace(iter, n3);
+	if (n2)
+		bch2_btree_iter_node_replace(iter, n2);
+	bch2_btree_iter_node_replace(iter, n1);
+
+	bch2_time_stats_update(&c->btree_split_time, start_time);
+}
+
+/**
+ * bch2_btree_insert_node - insert bkeys into a given btree node
+ *
+ * @b:			interior btree node to insert into
+ * @iter:		btree iterator
+ * @insert_keys:	list of keys to insert
+ * @reserve:		btree reserve for allocating new nodes
+ * @as:			interior node update tracking this insert
+ *
+ * Inserts as many keys as it can into a given btree node, splitting it if
+ * full. Inserts into interior nodes have to be atomic: either all the keys go
+ * in, or the node is split first.
+ */
+void bch2_btree_insert_node(struct btree *b,
+			    struct btree_iter *iter,
+			    struct keylist *insert_keys,
+			    struct btree_reserve *reserve,
+			    struct btree_interior_update *as)
+{
+	BUG_ON(!b->level);
+	BUG_ON(!reserve || !as);
+
+	switch (bch2_btree_insert_keys_interior(b, iter, insert_keys,
+						as, reserve)) {
+	case BTREE_INSERT_OK:
+		break;
+	case BTREE_INSERT_BTREE_NODE_FULL:
+		btree_split(b, iter, insert_keys, reserve, as);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags)
+{
+	struct bch_fs *c = iter->c;
+	struct btree *b = iter->nodes[0];
+	struct btree_reserve *reserve;
+	struct btree_interior_update *as;
+	struct closure cl;
+	int ret = 0;
+
+	closure_init_stack(&cl);
+
+	/* Hack, because gc and splitting nodes don't mix yet: */
+	if (!down_read_trylock(&c->gc_lock)) {
+		bch2_btree_iter_unlock(iter);
+		down_read(&c->gc_lock);
+	}
+
+	/*
+	 * XXX: figure out how far we might need to split,
+	 * instead of locking/reserving all the way to the root:
+	 */
+	if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
+		ret = -EINTR;
+		goto out;
+	}
+
+	reserve = bch2_btree_reserve_get(c, b, 0, flags, &cl);
+	if (IS_ERR(reserve)) {
+		ret = PTR_ERR(reserve);
+		if (ret == -EAGAIN) {
+			bch2_btree_iter_unlock(iter);
+			up_read(&c->gc_lock);
+			closure_sync(&cl);
+			return -EINTR;
+		}
+		goto out;
+	}
+
+	as = bch2_btree_interior_update_alloc(c);
+
+	btree_split(b, iter, NULL, reserve, as);
+	bch2_btree_reserve_put(c, reserve);
+
+	bch2_btree_iter_set_locks_want(iter, 1);
+out:
+	up_read(&c->gc_lock);
+	return ret;
+}
+
+enum btree_node_sibling {
+	btree_prev_sib,
+	btree_next_sib,
+};
+
+static struct btree *btree_node_get_sibling(struct btree_iter *iter,
+					    struct btree *b,
+					    enum btree_node_sibling sib)
+{
+	struct btree *parent;
+	struct btree_node_iter node_iter;
+	struct bkey_packed *k;
+	BKEY_PADDED(k) tmp;
+	struct btree 
*ret; + unsigned level = b->level; + + parent = iter->nodes[level + 1]; + if (!parent) + return NULL; + + if (!bch2_btree_node_relock(iter, level + 1)) { + bch2_btree_iter_set_locks_want(iter, level + 2); + return ERR_PTR(-EINTR); + } + + node_iter = iter->node_iters[parent->level]; + + k = bch2_btree_node_iter_peek_all(&node_iter, parent); + BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); + + do { + k = sib == btree_prev_sib + ? bch2_btree_node_iter_prev_all(&node_iter, parent) + : (bch2_btree_node_iter_advance(&node_iter, parent), + bch2_btree_node_iter_peek_all(&node_iter, parent)); + if (!k) + return NULL; + } while (bkey_deleted(k)); + + bch2_bkey_unpack(parent, &tmp.k, k); + + ret = bch2_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); + + if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) { + btree_node_unlock(iter, level); + ret = bch2_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); + } + + if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) { + six_unlock_intent(&ret->lock); + ret = ERR_PTR(-EINTR); + } + + return ret; +} + +static int __foreground_maybe_merge(struct btree_iter *iter, + enum btree_node_sibling sib) +{ + struct bch_fs *c = iter->c; + struct btree_reserve *reserve; + struct btree_interior_update *as; + struct bkey_format_state new_s; + struct bkey_format new_f; + struct bkey_i delete; + struct btree *b, *m, *n, *prev, *next, *parent; + struct closure cl; + size_t sib_u64s; + int ret = 0; + + closure_init_stack(&cl); +retry: + if (!bch2_btree_node_relock(iter, iter->level)) + return 0; + + b = iter->nodes[iter->level]; + + parent = iter->nodes[b->level + 1]; + if (!parent) + return 0; + + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) + return 0; + + /* XXX: can't be holding read locks */ + m = btree_node_get_sibling(iter, b, sib); + if (IS_ERR(m)) { + ret = PTR_ERR(m); + goto out; + } + + /* NULL means no sibling: */ + if (!m) { + b->sib_u64s[sib] = U16_MAX; + return 0; + } + + if (sib == btree_prev_sib) { + prev = m; + next = b; + } else { + prev = b; + next = m; + } + + bch2_bkey_format_init(&new_s); + __bch2_btree_calc_format(&new_s, b); + __bch2_btree_calc_format(&new_s, m); + new_f = bch2_bkey_format_done(&new_s); + + sib_u64s = btree_node_u64s_with_format(b, &new_f) + + btree_node_u64s_with_format(m, &new_f); + + if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { + sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + sib_u64s /= 2; + sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + } + + sib_u64s = min(sib_u64s, btree_max_u64s(c)); + b->sib_u64s[sib] = sib_u64s; + + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { + six_unlock_intent(&m->lock); + return 0; + } + + /* We're changing btree topology, doesn't mix with gc: */ + if (!down_read_trylock(&c->gc_lock)) { + six_unlock_intent(&m->lock); + bch2_btree_iter_unlock(iter); + + down_read(&c->gc_lock); + up_read(&c->gc_lock); + ret = -EINTR; + goto out; + } + + if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) { + ret = -EINTR; + goto out_unlock; + } + + reserve = bch2_btree_reserve_get(c, b, 0, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + &cl); + if (IS_ERR(reserve)) { + ret = PTR_ERR(reserve); + goto out_unlock; + } + + as = bch2_btree_interior_update_alloc(c); + + bch2_btree_interior_update_will_free_node(c, as, b); + bch2_btree_interior_update_will_free_node(c, as, m); + + n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve); + n->data->min_key = prev->data->min_key; + n->data->max_key = next->data->max_key; + n->data->format = new_f; + 
n->key.k.p = next->key.k.p;
+
+	btree_node_set_format(n, new_f);
+
+	bch2_btree_sort_into(c, n, prev);
+	bch2_btree_sort_into(c, n, next);
+
+	bch2_btree_build_aux_trees(n);
+	six_unlock_write(&n->lock);
+
+	bkey_init(&delete.k);
+	delete.k.p = prev->key.k.p;
+	bch2_keylist_add(&as->parent_keys, &delete);
+	bch2_keylist_add(&as->parent_keys, &n->key);
+
+	bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+
+	bch2_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
+
+	bch2_btree_open_bucket_put(c, n);
+	bch2_btree_node_free_inmem(iter, b);
+	bch2_btree_node_free_inmem(iter, m);
+	bch2_btree_iter_node_replace(iter, n);
+
+	bch2_btree_iter_verify(iter, n);
+
+	bch2_btree_reserve_put(c, reserve);
+out_unlock:
+	if (ret != -EINTR && ret != -EAGAIN)
+		bch2_btree_iter_set_locks_want(iter, 1);
+	six_unlock_intent(&m->lock);
+	up_read(&c->gc_lock);
+out:
+	if (ret == -EAGAIN || ret == -EINTR) {
+		bch2_btree_iter_unlock(iter);
+		ret = -EINTR;
+	}
+
+	closure_sync(&cl);
+
+	if (ret == -EINTR) {
+		ret = bch2_btree_iter_traverse(iter);
+		if (!ret)
+			goto retry;
+	}
+
+	return ret;
+}
+
+static inline int foreground_maybe_merge(struct btree_iter *iter,
+					 enum btree_node_sibling sib)
+{
+	struct bch_fs *c = iter->c;
+	struct btree *b;
+
+	if (!btree_node_locked(iter, iter->level))
+		return 0;
+
+	b = iter->nodes[iter->level];
+	if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
+		return 0;
+
+	return __foreground_maybe_merge(iter, sib);
+}
+
+/**
+ * btree_insert_key - insert a single key into a leaf node
+ */
+static enum btree_insert_ret
+btree_insert_key(struct btree_insert *trans,
+		 struct btree_insert_entry *insert)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter = insert->iter;
+	struct btree *b = iter->nodes[0];
+	enum btree_insert_ret ret;
+	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+	int old_live_u64s = b->nr.live_u64s;
+	int live_u64s_added, u64s_added;
+
+	ret = !btree_node_is_extents(b)
+		? 
bch2_insert_fixup_key(trans, insert)
+		: bch2_insert_fixup_extent(trans, insert);
+
+	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+	if (u64s_added > live_u64s_added &&
+	    bch2_maybe_compact_whiteouts(iter->c, b))
+		bch2_btree_iter_reinit_node(iter, b);
+
+	trace_btree_insert_key(c, b, insert->k);
+	return ret;
+}
+
+static bool same_leaf_as_prev(struct btree_insert *trans,
+			      struct btree_insert_entry *i)
+{
+	/*
+	 * Because we sorted the transaction entries, if multiple iterators
+	 * point to the same leaf node they'll always be adjacent now:
+	 */
+	return i != trans->entries &&
+		i[0].iter->nodes[0] == i[-1].iter->nodes[0];
+}
+
+#define trans_for_each_entry(trans, i)					\
+	for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+
+static void multi_lock_write(struct btree_insert *trans)
+{
+	struct btree_insert_entry *i;
+
+	trans_for_each_entry(trans, i)
+		if (!same_leaf_as_prev(trans, i))
+			btree_node_lock_for_insert(i->iter->nodes[0], i->iter);
+}
+
+static void multi_unlock_write(struct btree_insert *trans)
+{
+	struct btree_insert_entry *i;
+
+	trans_for_each_entry(trans, i)
+		if (!same_leaf_as_prev(trans, i))
+			bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter);
+}
+
+static int btree_trans_entry_cmp(const void *_l, const void *_r)
+{
+	const struct btree_insert_entry *l = _l;
+	const struct btree_insert_entry *r = _r;
+
+	return btree_iter_cmp(l->iter, r->iter);
+}
+
+/* Normal update interface: */
+
+/**
+ * __bch2_btree_insert_at - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+int __bch2_btree_insert_at(struct btree_insert *trans)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	struct btree_iter *split = NULL;
+	bool cycle_gc_lock = false;
+	unsigned u64s;
+	int ret;
+
+	trans_for_each_entry(trans, i) {
+		EBUG_ON(i->iter->level);
+		EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+	}
+
+	sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
+	     btree_trans_entry_cmp, NULL);
+
+	if (unlikely(!percpu_ref_tryget(&c->writes)))
+		return -EROFS;
+retry_locks:
+	ret = -EINTR;
+	trans_for_each_entry(trans, i)
+		if (!bch2_btree_iter_set_locks_want(i->iter, 1))
+			goto err;
+retry:
+	trans->did_work = false;
+	u64s = 0;
+	trans_for_each_entry(trans, i)
+		if (!i->done)
+			u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+
+	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+
+	ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
+		? 
bch2_journal_res_get(&c->journal,
+				      &trans->journal_res,
+				      u64s, u64s)
+		: 0;
+	if (ret)
+		goto err;
+
+	multi_lock_write(trans);
+
+	u64s = 0;
+	trans_for_each_entry(trans, i) {
+		/* Multiple inserts might go to same leaf: */
+		if (!same_leaf_as_prev(trans, i))
+			u64s = 0;
+
+		/*
+		 * bch2_btree_node_insert_fits() must be called under write lock:
+		 * with only an intent lock, another thread can still call
+		 * bch2_btree_node_write(), converting an unwritten bset to a
+		 * written one
+		 */
+		if (!i->done) {
+			u64s += i->k->k.u64s + i->extra_res;
+			if (!bch2_btree_node_insert_fits(c,
+					i->iter->nodes[0], u64s)) {
+				split = i->iter;
+				goto unlock;
+			}
+		}
+	}
+
+	ret = 0;
+	split = NULL;
+	cycle_gc_lock = false;
+
+	trans_for_each_entry(trans, i) {
+		if (i->done)
+			continue;
+
+		switch (btree_insert_key(trans, i)) {
+		case BTREE_INSERT_OK:
+			i->done = true;
+			break;
+		case BTREE_INSERT_JOURNAL_RES_FULL:
+		case BTREE_INSERT_NEED_TRAVERSE:
+			ret = -EINTR;
+			break;
+		case BTREE_INSERT_NEED_RESCHED:
+			ret = -EAGAIN;
+			break;
+		case BTREE_INSERT_BTREE_NODE_FULL:
+			split = i->iter;
+			break;
+		case BTREE_INSERT_ENOSPC:
+			ret = -ENOSPC;
+			break;
+		case BTREE_INSERT_NEED_GC_LOCK:
+			cycle_gc_lock = true;
+			ret = -EINTR;
+			break;
+		default:
+			BUG();
+		}
+
+		if (!trans->did_work && (ret || split))
+			break;
+	}
+unlock:
+	multi_unlock_write(trans);
+	bch2_journal_res_put(&c->journal, &trans->journal_res);
+
+	if (split)
+		goto split;
+	if (ret)
+		goto err;
+
+	/*
+	 * hack: iterators are inconsistent when they hit end of leaf, until
+	 * traversed again
+	 */
+	trans_for_each_entry(trans, i)
+		if (i->iter->at_end_of_leaf)
+			goto out;
+
+	trans_for_each_entry(trans, i)
+		if (!same_leaf_as_prev(trans, i)) {
+			foreground_maybe_merge(i->iter, btree_prev_sib);
+			foreground_maybe_merge(i->iter, btree_next_sib);
+		}
+out:
+	/* make sure we didn't lose an error: */
+	if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+		trans_for_each_entry(trans, i)
+			BUG_ON(!i->done);
+
+	percpu_ref_put(&c->writes);
+	return ret;
+split:
+	/*
+	 * have to drop journal res before splitting, because splitting means
+	 * allocating new btree nodes, and holding a journal reservation
+	 * potentially blocks the allocator:
+	 */
+	ret = bch2_btree_split_leaf(split, trans->flags);
+	if (ret)
+		goto err;
+	/*
+	 * if the split didn't have to drop locks the insert will still be
+	 * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
+	 * and is overwriting won't have changed)
+	 */
+	goto retry_locks;
+err:
+	if (cycle_gc_lock) {
+		down_read(&c->gc_lock);
+		up_read(&c->gc_lock);
+	}
+
+	if (ret == -EINTR) {
+		trans_for_each_entry(trans, i) {
+			int ret2 = bch2_btree_iter_traverse(i->iter);
+			if (ret2) {
+				ret = ret2;
+				goto out;
+			}
+		}
+
+		/*
+		 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
+		 * dropped locks:
+		 */
+		if (!(trans->flags & BTREE_INSERT_ATOMIC))
+			goto retry;
+	}
+
+	goto out;
+}
+
+int bch2_btree_insert_list_at(struct btree_iter *iter,
+			      struct keylist *keys,
+			      struct disk_reservation *disk_res,
+			      struct extent_insert_hook *hook,
+			      u64 *journal_seq, unsigned flags)
+{
+	BUG_ON(flags & BTREE_INSERT_ATOMIC);
+	BUG_ON(bch2_keylist_empty(keys));
+	verify_keys_sorted(keys);
+
+	while (!bch2_keylist_empty(keys)) {
+		/* need to traverse between each insert */
+		int ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			return ret;
+
+		ret = bch2_btree_insert_at(iter->c, disk_res, hook,
+				journal_seq, flags,
+				BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
+		if (ret)
+			return ret;
+
+		bch2_keylist_pop_front(keys);
+	}
+
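+	/*
+	 * The loop above only exits with the keylist fully consumed - partial
+	 * progress is reported via the early error returns:
+	 */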
+	return 0;
+}
+
+/**
+ * bch2_btree_insert_check_key - insert dummy key into btree
+ *
+ * We insert a random key on a cache miss, then compare exchange on it
+ * once the cache promotion or backing device read completes. This
+ * ensures that if this key is written to after the read, the read will
+ * lose and not overwrite the key with stale data.
+ *
+ * Return values:
+ * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
+ * -EINTR: btree node was changed while upgrading to write lock
+ */
+int bch2_btree_insert_check_key(struct btree_iter *iter,
+				struct bkey_i *check_key)
+{
+	struct bpos saved_pos = iter->pos;
+	struct bkey_i_cookie *cookie;
+	BKEY_PADDED(key) tmp;
+	int ret;
+
+	BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
+
+	check_key->k.type = KEY_TYPE_COOKIE;
+	set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
+
+	cookie = bkey_i_to_cookie(check_key);
+	get_random_bytes(&cookie->v, sizeof(cookie->v));
+
+	bkey_copy(&tmp.key, check_key);
+
+	ret = bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
+				   BTREE_INSERT_ATOMIC,
+				   BTREE_INSERT_ENTRY(iter, &tmp.key));
+
+	bch2_btree_iter_rewind(iter, saved_pos);
+
+	return ret;
+}
+
+/**
+ * bch2_btree_insert - insert a key into a given btree
+ * @c:			pointer to struct bch_fs
+ * @id:			btree to insert into
+ * @k:			key to insert
+ * @hook:		insert callback
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
+		      struct bkey_i *k,
+		      struct disk_reservation *disk_res,
+		      struct extent_insert_hook *hook,
+		      u64 *journal_seq, int flags)
+{
+	struct btree_iter iter;
+	int ret, ret2;
+
+	bch2_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k));
+
+	ret = bch2_btree_iter_traverse(&iter);
+	if (unlikely(ret))
+		goto out;
+
+	ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+				   BTREE_INSERT_ENTRY(&iter, k));
+out:	ret2 = bch2_btree_iter_unlock(&iter);
+
+	return ret ?: ret2;
+}
+
+/**
+ * bch2_btree_update - like bch2_btree_insert(), but asserts that we're
+ * overwriting an existing key
+ */
+int bch2_btree_update(struct bch_fs *c, enum btree_id id,
+		      struct bkey_i *k, u64 *journal_seq)
+{
+	struct btree_iter iter;
+	struct bkey_s_c u;
+	int ret;
+
+	EBUG_ON(id == BTREE_ID_EXTENTS);
+
+	bch2_btree_iter_init_intent(&iter, c, id, k->k.p);
+
+	u = bch2_btree_iter_peek_with_holes(&iter);
+	ret = btree_iter_err(u);
+	if (ret)
+		return ret;
+
+	if (bkey_deleted(u.k)) {
+		bch2_btree_iter_unlock(&iter);
+		return -ENOENT;
+	}
+
+	ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0,
+				   BTREE_INSERT_ENTRY(&iter, k));
+	bch2_btree_iter_unlock(&iter);
+	return ret;
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+			    struct bpos start,
+			    struct bpos end,
+			    struct bversion version,
+			    struct disk_reservation *disk_res,
+			    struct extent_insert_hook *hook,
+			    u64 *journal_seq)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	bch2_btree_iter_init_intent(&iter, c, id, start);
+
+	while ((k = bch2_btree_iter_peek(&iter)).k &&
+	       !(ret = btree_iter_err(k))) {
+		unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+		/* really shouldn't be using a bare, unpadded bkey_i */
+		struct bkey_i delete;
+
+		if (bkey_cmp(iter.pos, end) >= 0)
+			break;
+
+		bkey_init(&delete.k);
+
+		/*
+		 * For extents, iter.pos won't necessarily be the same as
+		 * bkey_start_pos(k.k) (for non extents they always will be the
+		 * 
same). It's important that we delete starting from iter.pos
+		 * because the range we want to delete could start in the middle
+		 * of k.
+		 *
+		 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+		 * bkey_start_pos(k.k)).
+		 */
+		delete.k.p = iter.pos;
+		delete.k.version = version;
+
+		if (iter.is_extents) {
+			/*
+			 * The extents btree is special - KEY_TYPE_DISCARD is
+			 * used for deletions, not KEY_TYPE_DELETED. This is an
+			 * internal implementation detail that probably
+			 * shouldn't be exposed (internally, KEY_TYPE_DELETED is
+			 * used as a proxy for k->size == 0):
+			 */
+			delete.k.type = KEY_TYPE_DISCARD;
+
+			/* create the biggest key we can */
+			bch2_key_resize(&delete.k, max_sectors);
+			bch2_cut_back(end, &delete.k);
+		}
+
+		ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
+					   BTREE_INSERT_NOFAIL,
+					   BTREE_INSERT_ENTRY(&iter, &delete));
+		if (ret)
+			break;
+
+		bch2_btree_iter_cond_resched(&iter);
+	}
+
+	bch2_btree_iter_unlock(&iter);
+	return ret;
+}
+
+/**
+ * bch2_btree_node_rewrite - Rewrite/move a btree node
+ *
+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
+ * bch2_btree_reserve_get() has to wait)
+ */
+int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
+			    struct closure *cl)
+{
+	struct bch_fs *c = iter->c;
+	struct btree *n, *parent = iter->nodes[b->level + 1];
+	struct btree_reserve *reserve;
+	struct btree_interior_update *as;
+	unsigned flags = BTREE_INSERT_NOFAIL;
+
+	/*
+	 * if caller is going to wait if allocating reserve fails, then this is
+	 * a rewrite that must succeed:
+	 */
+	if (cl)
+		flags |= BTREE_INSERT_USE_RESERVE;
+
+	if (!bch2_btree_iter_set_locks_want(iter, U8_MAX))
+		return -EINTR;
+
+	reserve = bch2_btree_reserve_get(c, b, 0, flags, cl);
+	if (IS_ERR(reserve)) {
+		trace_btree_gc_rewrite_node_fail(c, b);
+		return PTR_ERR(reserve);
+	}
+
+	as = bch2_btree_interior_update_alloc(c);
+
+	bch2_btree_interior_update_will_free_node(c, as, b);
+
+	n = bch2_btree_node_alloc_replacement(c, b, reserve);
+
+	bch2_btree_build_aux_trees(n);
+	six_unlock_write(&n->lock);
+
+	trace_btree_gc_rewrite_node(c, b);
+
+	bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+
+	if (parent) {
+		bch2_btree_insert_node(parent, iter,
+				       &keylist_single(&n->key),
+				       reserve, as);
+	} else {
+		bch2_btree_set_root(iter, n, as, reserve);
+	}
+
+	bch2_btree_open_bucket_put(c, n);
+
+	bch2_btree_node_free_inmem(iter, b);
+
+	BUG_ON(!bch2_btree_iter_node_replace(iter, n));
+
+	bch2_btree_reserve_put(c, reserve);
+	return 0;
+}
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
new file mode 100644
index 00000000..b18c44c7
--- /dev/null
+++ b/libbcachefs/btree_update.h
@@ -0,0 +1,421 @@
+#ifndef _BCACHE_BTREE_INSERT_H
+#define _BCACHE_BTREE_INSERT_H
+
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "journal.h"
+#include "vstructs.h"
+
+struct bch_fs;
+struct bkey_format_state;
+struct bkey_format;
+struct btree;
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+	b->sib_u64s[0] = b->nr.live_u64s;
+	b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+struct btree_reserve {
+	struct disk_reservation	disk_res;
+	unsigned		nr;
+	struct btree		*b[BTREE_RESERVE_MAX];
+};
+
+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
+				 struct bkey_format *);
+
+/* Btree node freeing/allocation: */
+
+/*
+ * Tracks a btree node that has been (or is about to be) freed in memory, but
+ * has _not_ yet been freed on disk 
(because the write that makes the new
+ * node(s) visible and frees the old hasn't completed yet)
+ */
+struct pending_btree_node_free {
+	bool			index_update_done;
+
+	__le64			seq;
+	enum btree_id		btree_id;
+	unsigned		level;
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ */
+struct btree_interior_update {
+	struct closure			cl;
+	struct bch_fs			*c;
+
+	struct list_head		list;
+
+	/* What kind of update are we doing? */
+	enum {
+		BTREE_INTERIOR_NO_UPDATE,
+		BTREE_INTERIOR_UPDATING_NODE,
+		BTREE_INTERIOR_UPDATING_ROOT,
+		BTREE_INTERIOR_UPDATING_AS,
+	} mode;
+
+	/*
+	 * BTREE_INTERIOR_UPDATING_NODE:
+	 * The update that made the new nodes visible was a regular update to an
+	 * existing interior node - @b. We can't write out the update to @b
+	 * until the new nodes we created are finished writing, so we block @b
+	 * from writing by putting this btree_interior_update on the
+	 * @b->write_blocked list with @write_blocked_list:
+	 */
+	struct btree		*b;
+	struct list_head	write_blocked_list;
+
+	/*
+	 * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so
+	 * we're now blocking another btree_interior_update
+	 * @parent_as - btree_interior_update that's waiting on our nodes to finish
+	 * writing, before it can make new nodes visible on disk
+	 * @wait - list of child btree_interior_updates that are waiting on this
+	 * btree_interior_update to make all the new nodes visible before they can free
+	 * their old btree nodes
+	 */
+	struct btree_interior_update	*parent_as;
+	struct closure_waitlist		wait;
+
+	/*
+	 * We may be freeing nodes that were dirty, and thus had journal entries
+	 * pinned: we need to transfer the oldest of those pins to the
+	 * btree_interior_update operation, and release it when the new node(s)
+	 * are all persistent and reachable:
+	 */
+	struct journal_entry_pin	journal;
+
+	u64				journal_seq;
+
+	/*
+	 * Nodes being freed:
+	 * Protected by c->btree_node_pending_free_lock
+	 */
+	struct pending_btree_node_free	pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
+	unsigned			nr_pending;
+
+	/* Only here to reduce stack usage on recursive splits: */
+	struct keylist			parent_keys;
+	/*
+	 * Enough room for btree_split's keys without realloc - btree node
+	 * pointers never have crc/compression info, so we only need to account
+	 * for the pointers for three keys
+	 */
+	u64				inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+#define for_each_pending_btree_node_free(c, as, p)			\
+	list_for_each_entry(as, &c->btree_interior_update_list, list)	\
+		for (p = as->pending; p < as->pending + as->nr_pending; p++)
+
+void bch2_btree_node_free_inmem(struct btree_iter *, struct btree *);
+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
+void 
bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *); + +struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *, + struct btree *, + struct bkey_format, + struct btree_reserve *); + +struct btree_interior_update * +bch2_btree_interior_update_alloc(struct bch_fs *); + +void bch2_btree_interior_update_will_free_node(struct bch_fs *, + struct btree_interior_update *, + struct btree *); + +void bch2_btree_set_root_initial(struct bch_fs *, struct btree *, + struct btree_reserve *); + +void bch2_btree_reserve_put(struct bch_fs *, struct btree_reserve *); +struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *, + struct btree *, unsigned, + unsigned, struct closure *); + +int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *); + +/* Inserting into a given leaf node (last stage of insert): */ + +bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, + struct btree_node_iter *, struct bkey_i *); +void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *, + struct bkey_i *); + +static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +{ + return (void *) b->data + btree_bytes(c); +} + +static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, + struct btree *b) +{ + return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); +} + +static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, + struct btree *b) +{ + return btree_data_end(c, b); +} + +static inline void *write_block(struct btree *b) +{ + return (void *) b->data + (b->written << 9); +} + +static inline bool bset_written(struct btree *b, struct bset *i) +{ + return (void *) i < write_block(b); +} + +static inline bool bset_unwritten(struct btree *b, struct bset *i) +{ + return (void *) i > write_block(b); +} + +static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b, + struct bset *i) +{ + return round_up(bset_byte_offset(b, vstruct_end(i)), + block_bytes(c)) >> 9; +} + +static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, + struct btree *b) +{ + struct bset *i = btree_bset_last(b); + unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) + + b->whiteout_u64s + + b->uncompacted_whiteout_u64s; + unsigned total = c->sb.btree_node_size << 6; + + EBUG_ON(used > total); + + if (bset_written(b, i)) + return 0; + + return total - used; +} + +static inline unsigned btree_write_set_buffer(struct btree *b) +{ + /* + * Could buffer up larger amounts of keys for btrees with larger keys, + * pending benchmarking: + */ + return 4 << 10; +} + +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, + struct btree *b) +{ + struct bset *i = btree_bset_last(b); + unsigned offset = max_t(unsigned, b->written << 9, + bset_byte_offset(b, vstruct_end(i))); + ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t) + (offset + sizeof(struct btree_node_entry) + + b->whiteout_u64s * sizeof(u64) + + b->uncompacted_whiteout_u64s * sizeof(u64)); + + EBUG_ON(offset > btree_bytes(c)); + + if ((unlikely(bset_written(b, i)) && n > 0) || + (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && + n > btree_write_set_buffer(b))) + return (void *) b->data + offset; + + return NULL; +} + +/* + * write lock must be held on @b (else the dirty bset that we were going to + * insert into could be written out from under us) + */ +static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, + struct btree *b, unsigned u64s) +{ + if (btree_node_is_extents(b)) { + /* The insert key might split 
an existing key
+		 * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
+		 */
+		u64s += BKEY_EXTENT_U64s_MAX;
+	}
+
+	return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
+				      struct bkey_packed *k)
+{
+	if (bset_written(b, bset(b, t))) {
+		EBUG_ON(b->uncompacted_whiteout_u64s <
+			bkeyp_key_u64s(&b->format, k));
+		b->uncompacted_whiteout_u64s -=
+			bkeyp_key_u64s(&b->format, k);
+	}
+}
+
+static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
+				    struct bkey_packed *k)
+{
+	if (bset_written(b, bset(b, t))) {
+		BUG_ON(!k->needs_whiteout);
+		b->uncompacted_whiteout_u64s +=
+			bkeyp_key_u64s(&b->format, k);
+	}
+}
+
+void bch2_btree_insert_node(struct btree *, struct btree_iter *,
+			    struct keylist *, struct btree_reserve *,
+			    struct btree_interior_update *as);
+
+/* Normal update interface: */
+
+struct btree_insert {
+	struct bch_fs		*c;
+	struct disk_reservation *disk_res;
+	struct journal_res	journal_res;
+	u64			*journal_seq;
+	struct extent_insert_hook *hook;
+	unsigned		flags;
+	bool			did_work;
+
+	unsigned short		nr;
+	struct btree_insert_entry {
+		struct btree_iter *iter;
+		struct bkey_i	*k;
+		unsigned	extra_res;
+		/*
+		 * true if entire key was inserted - can only be false for
+		 * extents
+		 */
+		bool		done;
+	} *entries;
+};
+
+int __bch2_btree_insert_at(struct btree_insert *);
+
+
+#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...)   N
+#define COUNT_ARGS(...)  _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define BTREE_INSERT_ENTRY(_iter, _k)					\
+	((struct btree_insert_entry) {					\
+		.iter		= (_iter),				\
+		.k		= (_k),					\
+		.done		= false,				\
+	})
+
+#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra)			\
+	((struct btree_insert_entry) {					\
+		.iter		= (_iter),				\
+		.k		= (_k),					\
+		.extra_res	= (_extra),				\
+		.done		= false,				\
+	})
+
+/**
+ * bch2_btree_insert_at - insert one or more keys at iterator positions
+ * @_c:			filesystem
+ * @_disk_res:		disk reservation
+ * @_hook:		extent insert callback
+ * @_journal_seq:	if non NULL, set to the journal sequence number
+ * @_flags:		BTREE_INSERT_* flags
+ *
+ * The remaining arguments are BTREE_INSERT_ENTRY()s: (iterator, key) pairs
+ * giving each key and the position to insert it at.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+#define bch2_btree_insert_at(_c, _disk_res, _hook,			\
+			     _journal_seq, _flags, ...)			
\ + __bch2_btree_insert_at(&(struct btree_insert) { \ + .c = (_c), \ + .disk_res = (_disk_res), \ + .journal_seq = (_journal_seq), \ + .hook = (_hook), \ + .flags = (_flags), \ + .nr = COUNT_ARGS(__VA_ARGS__), \ + .entries = (struct btree_insert_entry[]) { \ + __VA_ARGS__ \ + }}) + +/* + * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent + * locks, -EAGAIN if need to wait on btree reserve + */ +#define BTREE_INSERT_ATOMIC (1 << 0) + +/* Don't check for -ENOSPC: */ +#define BTREE_INSERT_NOFAIL (1 << 1) + +/* for copygc, or when merging btree nodes */ +#define BTREE_INSERT_USE_RESERVE (1 << 2) + +/* + * Insert is for journal replay: don't get journal reservations, or mark extents + * (bch_mark_key) + */ +#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3) + +int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, + struct disk_reservation *, + struct extent_insert_hook *, u64 *, unsigned); + +static inline bool journal_res_insert_fits(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + unsigned u64s = 0; + struct btree_insert_entry *i; + + /* + * If we didn't get a journal reservation, we're in journal replay and + * we're not journalling updates: + */ + if (!trans->journal_res.ref) + return true; + + for (i = insert; i < trans->entries + trans->nr; i++) + u64s += jset_u64s(i->k->k.u64s + i->extra_res); + + return u64s <= trans->journal_res.u64s; +} + +int bch2_btree_insert_check_key(struct btree_iter *, struct bkey_i *); +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, + struct extent_insert_hook *, u64 *, int flags); +int bch2_btree_update(struct bch_fs *, enum btree_id, + struct bkey_i *, u64 *); + +int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, struct bversion, + struct disk_reservation *, + struct extent_insert_hook *, u64 *); + +int bch2_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *); + +#endif /* _BCACHE_BTREE_INSERT_H */ + diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c new file mode 100644 index 00000000..396251d5 --- /dev/null +++ b/libbcachefs/buckets.c @@ -0,0 +1,750 @@ +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. + * + * Bucket states: + * - free bucket: mark == 0 + * The bucket contains no data and will not be read + * + * - allocator bucket: owned_by_allocator == 1 + * The bucket is on a free list, or it is an open bucket + * + * - cached bucket: owned_by_allocator == 0 && + * dirty_sectors == 0 && + * cached_sectors > 0 + * The bucket contains data but may be safely discarded as there are + * enough replicas of the data on other cache devices, or it has been + * written back to the backing device + * + * - dirty bucket: owned_by_allocator == 0 && + * dirty_sectors > 0 + * The bucket contains data that we must not discard (either only copy, + * or one of the 'main copies' for data requiring multiple replicas) + * + * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 + * This is a btree node, journal or gen/prio bucket + * + * Lifecycle: + * + * bucket invalidated => bucket on freelist => open bucket => + * [dirty bucket =>] cached bucket => bucket invalidated => ... + * + * Note that cache promotion can skip the dirty bucket step, as data + * is copied from a deeper tier to a shallower tier, onto a cached + * bucket. + * Note also that a cached bucket can spontaneously become dirty -- + * see below. 
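 * (A bucket's mark carries only sector counts - dirty_sectors and
 * cached_sectors - not which keys those sectors belong to.)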
+ * + * Only a traversal of the key space can determine whether a bucket is + * truly dirty or cached. + * + * Transitions: + * + * - free => allocator: bucket was invalidated + * - cached => allocator: bucket was invalidated + * + * - allocator => dirty: open bucket was filled up + * - allocator => cached: open bucket was filled up + * - allocator => metadata: metadata was allocated + * + * - dirty => cached: dirty sectors were copied to a deeper tier + * - dirty => free: dirty sectors were overwritten or moved (copy gc) + * - cached => free: cached sectors were overwritten + * + * - metadata => free: metadata was freed + * + * Oddities: + * - cached => dirty: a device was removed so formerly replicated data + * is no longer sufficiently replicated + * - free => cached: cannot happen + * - free => dirty: cannot happen + * - free => metadata: cannot happen + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_gc.h" +#include "buckets.h" +#include "error.h" + +#include <linux/preempt.h> +#include <trace/events/bcachefs.h> + +#ifdef DEBUG_BUCKETS + +#define lg_local_lock lg_global_lock +#define lg_local_unlock lg_global_unlock + +static void bch2_fs_stats_verify(struct bch_fs *c) +{ + struct bch_fs_usage stats = + __bch2_fs_usage_read(c); + + if ((s64) stats.sectors_dirty < 0) + panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty); + + if ((s64) stats.sectors_cached < 0) + panic("sectors_cached underflow: %lli\n", stats.sectors_cached); + + if ((s64) stats.sectors_meta < 0) + panic("sectors_meta underflow: %lli\n", stats.sectors_meta); + + if ((s64) stats.sectors_persistent_reserved < 0) + panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved); + + if ((s64) stats.sectors_online_reserved < 0) + panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved); +} + +#else + +static void bch2_fs_stats_verify(struct bch_fs *c) {} + +#endif + +/* + * Clear journal_seq_valid for buckets for which it's not needed, to prevent + * wraparound: + */ +void bch2_bucket_seq_cleanup(struct bch_fs *c) +{ + u16 last_seq_ondisk = c->journal.last_seq_ondisk; + struct bch_dev *ca; + struct bucket *g; + struct bucket_mark m; + unsigned i; + + for_each_member_device(ca, c, i) + for_each_bucket(g, ca) { + bucket_cmpxchg(g, m, ({ + if (!m.journal_seq_valid || + bucket_needs_journal_commit(m, last_seq_ondisk)) + break; + + m.journal_seq_valid = 0; + })); + } +} + +#define bch2_usage_add(_acc, _stats) \ +do { \ + typeof(_acc) _a = (_acc), _s = (_stats); \ + unsigned i; \ + \ + for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \ + ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \ +} while (0) + +#define bch2_usage_read_raw(_stats) \ +({ \ + typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \ + int cpu; \ + \ + for_each_possible_cpu(cpu) \ + bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \ + \ + _acc; \ +}) + +#define bch2_usage_read_cached(_c, _cached, _uncached) \ +({ \ + typeof(_cached) _ret; \ + unsigned _seq; \ + \ + do { \ + _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \ + _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \ + ? 
bch2_usage_read_raw(_uncached) \ + : (_cached); \ + } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \ + \ + _ret; \ +}) + +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) +{ + return bch2_usage_read_raw(ca->usage_percpu); +} + +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +{ + return bch2_usage_read_cached(ca->fs, + ca->usage_cached, + ca->usage_percpu); +} + +struct bch_fs_usage +__bch2_fs_usage_read(struct bch_fs *c) +{ + return bch2_usage_read_raw(c->usage_percpu); +} + +struct bch_fs_usage +bch2_fs_usage_read(struct bch_fs *c) +{ + return bch2_usage_read_cached(c, + c->usage_cached, + c->usage_percpu); +} + +static inline int is_meta_bucket(struct bucket_mark m) +{ + return m.data_type != BUCKET_DATA; +} + +static inline int is_dirty_bucket(struct bucket_mark m) +{ + return m.data_type == BUCKET_DATA && !!m.dirty_sectors; +} + +static inline int is_cached_bucket(struct bucket_mark m) +{ + return m.data_type == BUCKET_DATA && + !m.dirty_sectors && !!m.cached_sectors; +} + +static inline enum s_alloc bucket_type(struct bucket_mark m) +{ + return is_meta_bucket(m) ? S_META : S_DIRTY; +} + +static bool bucket_became_unavailable(struct bch_fs *c, + struct bucket_mark old, + struct bucket_mark new) +{ + return is_available_bucket(old) && + !is_available_bucket(new) && + c && c->gc_pos.phase == GC_PHASE_DONE; +} + +void bch2_fs_usage_apply(struct bch_fs *c, + struct bch_fs_usage *stats, + struct disk_reservation *disk_res, + struct gc_pos gc_pos) +{ + s64 added = + stats->s[S_COMPRESSED][S_META] + + stats->s[S_COMPRESSED][S_DIRTY] + + stats->persistent_reserved + + stats->online_reserved; + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + BUG_ON(added > (s64) (disk_res ? 
disk_res->sectors : 0)); + + if (added > 0) { + disk_res->sectors -= added; + stats->online_reserved -= added; + } + + lg_local_lock(&c->usage_lock); + /* online_reserved not subject to gc: */ + this_cpu_ptr(c->usage_percpu)->online_reserved += + stats->online_reserved; + stats->online_reserved = 0; + + if (!gc_will_visit(c, gc_pos)) + bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats); + + bch2_fs_stats_verify(c); + lg_local_unlock(&c->usage_lock); + + memset(stats, 0, sizeof(*stats)); +} + +static void bch2_fs_usage_update(struct bch_fs_usage *fs_usage, + struct bucket_mark old, struct bucket_mark new) +{ + fs_usage->s[S_COMPRESSED][S_CACHED] += + (int) new.cached_sectors - (int) old.cached_sectors; + fs_usage->s[S_COMPRESSED][bucket_type(old)] -= + old.dirty_sectors; + fs_usage->s[S_COMPRESSED][bucket_type(new)] += + new.dirty_sectors; +} + +static void bch2_dev_usage_update(struct bch_dev *ca, + struct bucket_mark old, struct bucket_mark new) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage *dev_usage; + + bch2_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of metadata in same bucket: %u, %u", + old.data_type, new.data_type); + + preempt_disable(); + dev_usage = this_cpu_ptr(ca->usage_percpu); + + dev_usage->sectors[S_CACHED] += + (int) new.cached_sectors - (int) old.cached_sectors; + + dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; + dev_usage->sectors[bucket_type(new)] += new.dirty_sectors; + + dev_usage->buckets_alloc += + (int) new.owned_by_allocator - (int) old.owned_by_allocator; + + dev_usage->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old); + dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old); + dev_usage->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old); + preempt_enable(); + + if (!is_available_bucket(old) && is_available_bucket(new)) + bch2_wake_allocator(ca); +} + +#define bucket_data_cmpxchg(ca, g, new, expr) \ +({ \ + struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ + \ + bch2_dev_usage_update(ca, _old, new); \ + _old; \ +}) + +void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g) +{ + struct bch_fs_usage stats = { 0 }; + struct bucket_mark old, new; + + old = bucket_data_cmpxchg(ca, g, new, ({ + new.owned_by_allocator = 1; + new.had_metadata = 0; + new.data_type = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + new.copygc = 0; + new.gen++; + })); + + /* XXX: we're not actually updating fs usage's cached sectors... 
*/ + bch2_fs_usage_update(&stats, old, new); + + if (!old.owned_by_allocator && old.cached_sectors) + trace_invalidate(ca, g - ca->buckets, + old.cached_sectors); +} + +void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g) +{ + struct bucket_mark old, new; + + old = bucket_data_cmpxchg(ca, g, new, ({ + new.owned_by_allocator = 0; + new.data_type = 0; + new.cached_sectors = 0; + new.dirty_sectors = 0; + })); + + BUG_ON(bucket_became_unavailable(ca->fs, old, new)); +} + +void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g, + bool owned_by_allocator) +{ + struct bucket_mark new; + + bucket_data_cmpxchg(ca, g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); +} + +void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g, + enum bucket_data_type type, + bool may_make_unavailable) +{ + struct bucket_mark old, new; + + BUG_ON(!type); + + old = bucket_data_cmpxchg(ca, g, new, ({ + new.data_type = type; + new.had_metadata = 1; + })); + + BUG_ON(old.cached_sectors); + BUG_ON(old.dirty_sectors); + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(ca->fs, old, new)); +} + +#define saturated_add(ca, dst, src, max) \ +do { \ + BUG_ON((int) (dst) + (src) < 0); \ + if ((dst) == (max)) \ + ; \ + else if ((dst) + (src) <= (max)) \ + dst += (src); \ + else { \ + dst = (max); \ + trace_sectors_saturated(ca); \ + } \ +} while (0) + +#if 0 +/* Reverting this until the copygc + compression issue is fixed: */ + +static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) +{ + return crc_compression_type(crc) + ? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc) + : sectors; +} + +static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors) +{ + return crc_compression_type(crc) + ? min_t(unsigned, crc_compressed_size(crc), sectors) + : sectors; +} +#else +static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) +{ + return sectors; +} + +static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors) +{ + return sectors; +} +#endif + +/* + * Checking against gc's position has to be done here, inside the cmpxchg() + * loop, to avoid racing with the start of gc clearing all the marks - GC does + * that with the gc pos seqlock held. 
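 *
 * (gc_pos readers pair with that seqlock - see bch2_usage_read_cached()
 * above, which retries if the seqcount changed under it.)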
+ */ +static void bch2_mark_pointer(struct bch_fs *c, + struct bkey_s_c_extent e, + const union bch_extent_crc *crc, + const struct bch_extent_ptr *ptr, + s64 sectors, enum s_alloc type, + bool may_make_unavailable, + struct bch_fs_usage *stats, + bool gc_will_visit, u64 journal_seq) +{ + struct bucket_mark old, new; + unsigned saturated; + struct bch_dev *ca = c->devs[ptr->dev]; + struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); + unsigned old_sectors, new_sectors; + int disk_sectors, compressed_sectors; + + if (sectors > 0) { + old_sectors = 0; + new_sectors = sectors; + } else { + old_sectors = e.k->size; + new_sectors = e.k->size + sectors; + } + + disk_sectors = -__disk_sectors(crc, old_sectors) + + __disk_sectors(crc, new_sectors); + compressed_sectors = -__compressed_sectors(crc, old_sectors) + + __compressed_sectors(crc, new_sectors); + + if (gc_will_visit) { + if (journal_seq) + bucket_cmpxchg(g, new, new.journal_seq = journal_seq); + + goto out; + } + + old = bucket_data_cmpxchg(ca, g, new, ({ + saturated = 0; + + /* + * Check this after reading bucket mark to guard against + * the allocator invalidating a bucket after we've already + * checked the gen + */ + if (gen_after(new.gen, ptr->gen)) { + EBUG_ON(type != S_CACHED && + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); + return; + } + + EBUG_ON(type != S_CACHED && + !may_make_unavailable && + is_available_bucket(new) && + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); + + if (type != S_CACHED && + new.dirty_sectors == GC_MAX_SECTORS_USED && + disk_sectors < 0) + saturated = -disk_sectors; + + if (type == S_CACHED) + saturated_add(ca, new.cached_sectors, disk_sectors, + GC_MAX_SECTORS_USED); + else + saturated_add(ca, new.dirty_sectors, disk_sectors, + GC_MAX_SECTORS_USED); + + if (!new.dirty_sectors && + !new.cached_sectors) { + new.data_type = 0; + + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + } else { + new.data_type = type == S_META + ? BUCKET_BTREE : BUCKET_DATA; + } + + new.had_metadata |= is_meta_bucket(new); + })); + + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(c, old, new)); + + if (saturated && + atomic_long_add_return(saturated, + &ca->saturated_count) >= + ca->free_inc.size << ca->bucket_bits) { + if (c->gc_thread) { + trace_gc_sectors_saturated(c); + wake_up_process(c->gc_thread); + } + } +out: + stats->s[S_COMPRESSED][type] += compressed_sectors; + stats->s[S_UNCOMPRESSED][type] += sectors; +} + +static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e, + s64 sectors, bool metadata, + bool may_make_unavailable, + struct bch_fs_usage *stats, + bool gc_will_visit, u64 journal_seq) +{ + const struct bch_extent_ptr *ptr; + const union bch_extent_crc *crc; + enum s_alloc type = metadata ? S_META : S_DIRTY; + + BUG_ON(metadata && bkey_extent_is_cached(e.k)); + BUG_ON(!sectors); + + extent_for_each_ptr_crc(e, ptr, crc) + bch2_mark_pointer(c, e, crc, ptr, sectors, + ptr->cached ? 
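/* cached replicas are always accounted as S_CACHED, never as dirty or metadata */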
S_CACHED : type, + may_make_unavailable, + stats, gc_will_visit, journal_seq); +} + +static void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, bool metadata, + bool may_make_unavailable, + struct bch_fs_usage *stats, + bool gc_will_visit, u64 journal_seq) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata, + may_make_unavailable, stats, + gc_will_visit, journal_seq); + break; + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + stats->persistent_reserved += r.v->nr_replicas * sectors; + break; + } + } +} + +void __bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, bool metadata, + struct bch_fs_usage *stats) +{ + __bch2_mark_key(c, k, sectors, metadata, true, stats, false, 0); +} + +void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, bool metadata) +{ + struct bch_fs_usage stats = { 0 }; + + __bch2_gc_mark_key(c, k, sectors, metadata, &stats); + + preempt_disable(); + bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats); + preempt_enable(); +} + +void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, bool metadata, struct gc_pos gc_pos, + struct bch_fs_usage *stats, u64 journal_seq) +{ + /* + * synchronization w.r.t. GC: + * + * Normally, bucket sector counts/marks are updated on the fly, as + * references are added/removed from the btree, the lists of buckets the + * allocator owns, other metadata buckets, etc. + * + * When GC is in progress and going to mark this reference, we do _not_ + * mark this reference here, to avoid double counting - GC will count it + * when it gets to it. + * + * To know whether we should mark a given reference (GC either isn't + * running, or has already marked references at this position) we + * construct a total order for everything GC walks. Then, we can simply + * compare the position of the reference we're marking - @gc_pos - with + * GC's current position. If GC is going to mark this reference, GC's + * current position will be less than @gc_pos; if GC's current position + * is greater than @gc_pos GC has either already walked this position, + * or isn't running. + * + * To avoid racing with GC's position changing, we have to deal with + * - GC's position being set to GC_POS_MIN when GC starts: + * usage_lock guards against this + * - GC's position overtaking @gc_pos: we guard against this with + * whatever lock protects the data structure the reference lives in + * (e.g. the btree node lock, or the relevant allocator lock). 
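+ * Concretely: gc can't advance past @gc_pos without taking the lock we're holding, so once we've compared positions the outcome is fixed - either gc already counted this reference, or it's still behind us and will count it when it gets here.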
+ */ + lg_local_lock(&c->usage_lock); + __bch2_mark_key(c, k, sectors, metadata, false, stats, + gc_will_visit(c, gc_pos), journal_seq); + + bch2_fs_stats_verify(c); + lg_local_unlock(&c->usage_lock); +} + +static u64 __recalc_sectors_available(struct bch_fs *c) +{ + return c->capacity - bch2_fs_sectors_used(c); +} + +/* Used by gc when it's starting: */ +void bch2_recalc_sectors_available(struct bch_fs *c) +{ + int cpu; + + lg_global_lock(&c->usage_lock); + + for_each_possible_cpu(cpu) + per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; + + atomic64_set(&c->sectors_available, + __recalc_sectors_available(c)); + + lg_global_unlock(&c->usage_lock); +} + +void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ + if (res->sectors) { + lg_local_lock(&c->usage_lock); + this_cpu_sub(c->usage_percpu->online_reserved, + res->sectors); + + bch2_fs_stats_verify(c); + lg_local_unlock(&c->usage_lock); + + res->sectors = 0; + } +} + +#define SECTORS_CACHE 1024 + +int bch2_disk_reservation_add(struct bch_fs *c, + struct disk_reservation *res, + unsigned sectors, int flags) +{ + struct bch_fs_usage *stats; + u64 old, new, v; + s64 sectors_available; + int ret; + + sectors *= res->nr_replicas; + + lg_local_lock(&c->usage_lock); + stats = this_cpu_ptr(c->usage_percpu); + + if (sectors <= stats->available_cache) + goto out; + + v = atomic64_read(&c->sectors_available); + do { + old = v; + if (old < sectors) { + lg_local_unlock(&c->usage_lock); + goto recalculate; + } + + new = max_t(s64, 0, old - sectors - SECTORS_CACHE); + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, new)) != old); + + stats->available_cache += old - new; +out: + stats->available_cache -= sectors; + stats->online_reserved += sectors; + res->sectors += sectors; + + bch2_fs_stats_verify(c); + lg_local_unlock(&c->usage_lock); + return 0; + +recalculate: + /* + * GC recalculates sectors_available when it starts, so that hopefully + * we don't normally end up blocking here: + */ + + /* + * Annoyingly, we can be called from extent_insert_fixup() with btree + * locks held: + */ + + if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) { + if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD)) + down_read(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) + return -EINTR; + } + lg_global_lock(&c->usage_lock); + + sectors_available = __recalc_sectors_available(c); + + if (sectors <= sectors_available || + (flags & BCH_DISK_RESERVATION_NOFAIL)) { + atomic64_set(&c->sectors_available, + max_t(s64, 0, sectors_available - sectors)); + stats->online_reserved += sectors; + res->sectors += sectors; + ret = 0; + } else { + atomic64_set(&c->sectors_available, sectors_available); + ret = -ENOSPC; + } + + bch2_fs_stats_verify(c); + lg_global_unlock(&c->usage_lock); + if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) + up_read(&c->gc_lock); + + return ret; +} + +int bch2_disk_reservation_get(struct bch_fs *c, + struct disk_reservation *res, + unsigned sectors, int flags) +{ + res->sectors = 0; + res->gen = c->capacity_gen; + res->nr_replicas = (flags & BCH_DISK_RESERVATION_METADATA) + ? c->opts.metadata_replicas + : c->opts.data_replicas; + + return bch2_disk_reservation_add(c, res, sectors, flags); +} diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h new file mode 100644 index 00000000..9c77304f --- /dev/null +++ b/libbcachefs/buckets.h @@ -0,0 +1,267 @@ +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. 
+ */ + +#ifndef _BUCKETS_H +#define _BUCKETS_H + +#include "buckets_types.h" +#include "super.h" + +#define for_each_bucket(b, ca) \ + for (b = (ca)->buckets + (ca)->mi.first_bucket; \ + b < (ca)->buckets + (ca)->mi.nbuckets; b++) + +#define bucket_cmpxchg(g, new, expr) \ +({ \ + u64 _v = READ_ONCE((g)->_mark.counter); \ + struct bucket_mark _old; \ + \ + do { \ + (new).counter = _old.counter = _v; \ + expr; \ + } while ((_v = cmpxchg(&(g)->_mark.counter, \ + _old.counter, \ + (new).counter)) != _old.counter);\ + _old; \ +}) + +/* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree. + */ + +static inline u8 bucket_gc_gen(struct bch_dev *ca, struct bucket *g) +{ + unsigned long r = g - ca->buckets; + return g->mark.gen - ca->oldest_gens[r]; +} + +static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return sector_to_bucket(ca, ptr->offset); +} + +/* + * Returns 0 if no pointers or device offline - only for tracepoints! + */ +static inline size_t PTR_BUCKET_NR_TRACE(const struct bch_fs *c, + const struct bkey_i *k, + unsigned ptr) +{ + size_t bucket = 0; +#if 0 + if (bkey_extent_is_data(&k->k)) { + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(bkey_i_to_s_c_extent(k), ptr) { + const struct bch_dev *ca = c->devs[ptr->dev]; + bucket = PTR_BUCKET_NR(ca, ptr); + break; + } + } +#endif + return bucket; +} + +static inline struct bucket *PTR_BUCKET(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return ca->buckets + PTR_BUCKET_NR(ca, ptr); +} + +static inline u8 __gen_after(u8 a, u8 b) +{ + u8 r = a - b; + + return r > 128U ? 0 : r; +} + +static inline u8 gen_after(u8 a, u8 b) +{ + u8 r = a - b; + + BUG_ON(r > 128U); + + return r; +} + +/** + * ptr_stale() - check if a pointer points into a bucket that has been + * invalidated. + */ +static inline u8 ptr_stale(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); +} + +/* bucket heaps */ + +static inline bool bucket_min_cmp(struct bucket_heap_entry l, + struct bucket_heap_entry r) +{ + return l.val < r.val; +} + +static inline bool bucket_max_cmp(struct bucket_heap_entry l, + struct bucket_heap_entry r) +{ + return l.val > r.val; +} + +static inline void bucket_heap_push(struct bch_dev *ca, struct bucket *g, + unsigned long val) +{ + struct bucket_heap_entry new = { g, val }; + + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, new, bucket_min_cmp); + else if (bucket_min_cmp(new, heap_peek(&ca->heap))) { + ca->heap.data[0] = new; + heap_sift(&ca->heap, 0, bucket_min_cmp); + } +} + +/* bucket gc marks */ + +/* The dirty and cached sector counts saturate. If this occurs, + * reference counting alone will not free the bucket, and a btree + * GC must be performed. 
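+ * (When enough saturated sectors accumulate, bch2_mark_pointer() wakes the gc thread so the true counts get recomputed from the btree.)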
*/ +#define GC_MAX_SECTORS_USED ((1U << 15) - 1) + +static inline bool bucket_unused(struct bucket *g) +{ + return !g->mark.counter; +} + +static inline unsigned bucket_sectors_used(struct bucket *g) +{ + return g->mark.dirty_sectors + g->mark.cached_sectors; +} + +/* Per device stats: */ + +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); + +static inline u64 __dev_buckets_available(struct bch_dev *ca, + struct bch_dev_usage stats) +{ + return max_t(s64, 0, + ca->mi.nbuckets - ca->mi.first_bucket - + stats.buckets_dirty - + stats.buckets_alloc - + stats.buckets_meta); +} + +/* + * Number of reclaimable buckets - only for use by the allocator thread: + */ +static inline u64 dev_buckets_available(struct bch_dev *ca) +{ + return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); +} + +static inline u64 __dev_buckets_free(struct bch_dev *ca, + struct bch_dev_usage stats) +{ + return __dev_buckets_available(ca, stats) + + fifo_used(&ca->free[RESERVE_NONE]) + + fifo_used(&ca->free_inc); +} + +static inline u64 dev_buckets_free(struct bch_dev *ca) +{ + return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); +} + +/* Cache set stats: */ + +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); +struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); +void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, struct gc_pos); + +static inline u64 __bch2_fs_sectors_used(struct bch_fs *c) +{ + struct bch_fs_usage stats = __bch2_fs_usage_read(c); + u64 reserved = stats.persistent_reserved + + stats.online_reserved; + + return stats.s[S_COMPRESSED][S_META] + + stats.s[S_COMPRESSED][S_DIRTY] + + reserved + + (reserved >> 7); +} + +static inline u64 bch2_fs_sectors_used(struct bch_fs *c) +{ + return min(c->capacity, __bch2_fs_sectors_used(c)); +} + +/* XXX: kill? 
*/ +static inline u64 sectors_available(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + u64 ret = 0; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i) + ret += dev_buckets_available(ca) << ca->bucket_bits; + rcu_read_unlock(); + + return ret; +} + +static inline bool is_available_bucket(struct bucket_mark mark) +{ + return (!mark.owned_by_allocator && + mark.data_type == BUCKET_DATA && + !mark.dirty_sectors && + !mark.nouse); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); +} + +void bch2_bucket_seq_cleanup(struct bch_fs *); + +void bch2_invalidate_bucket(struct bch_dev *, struct bucket *); +void bch2_mark_free_bucket(struct bch_dev *, struct bucket *); +void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool); +void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *, + enum bucket_data_type, bool); + +void __bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, + struct bch_fs_usage *); +void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool); +void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, + struct gc_pos, struct bch_fs_usage *, u64); + +void bch2_recalc_sectors_available(struct bch_fs *); + +void bch2_disk_reservation_put(struct bch_fs *, + struct disk_reservation *); + +#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) +#define BCH_DISK_RESERVATION_METADATA (1 << 1) +#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 2) +#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 3) + +int bch2_disk_reservation_add(struct bch_fs *, + struct disk_reservation *, + unsigned, int); +int bch2_disk_reservation_get(struct bch_fs *, + struct disk_reservation *, + unsigned, int); + +#endif /* _BUCKETS_H */ diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h new file mode 100644 index 00000000..ca187099 --- /dev/null +++ b/libbcachefs/buckets_types.h @@ -0,0 +1,112 @@ +#ifndef _BUCKETS_TYPES_H +#define _BUCKETS_TYPES_H + +enum bucket_data_type { + BUCKET_DATA = 0, + BUCKET_BTREE, + BUCKET_PRIOS, + BUCKET_JOURNAL, + BUCKET_SB, +}; + +struct bucket_mark { + union { + struct { + u64 counter; + }; + + struct { + u8 gen; + + /* generation copygc is going to move this bucket into */ + unsigned copygc:1; + + unsigned journal_seq_valid:1; + + /* + * If this bucket had metadata while at the current generation + * number, the allocator must increment its gen before we reuse + * it: + */ + unsigned had_metadata:1; + + unsigned owned_by_allocator:1; + + unsigned data_type:3; + + unsigned nouse:1; + + u16 dirty_sectors; + u16 cached_sectors; + + /* + * low bits of journal sequence number when this bucket was most + * recently modified: if journal_seq_valid is set, this bucket + * can't be reused until the journal sequence number written to + * disk is >= the bucket's journal sequence number: + */ + u16 journal_seq; + }; + }; +}; + +struct bucket { + union { + struct { + u16 read_prio; + u16 write_prio; + }; + u16 prio[2]; + }; + + union { + struct bucket_mark _mark; + const struct bucket_mark mark; + }; +}; + +enum s_compressed { + S_COMPRESSED, + S_UNCOMPRESSED, + S_COMPRESSED_NR, +}; + +enum s_alloc { + S_META, + S_DIRTY, + S_CACHED, + S_ALLOC_NR, +}; + +struct bch_dev_usage { + u64 buckets_dirty; + u64 buckets_cached; + u64 buckets_meta; + u64 buckets_alloc; + + u64 sectors[S_ALLOC_NR]; +}; + +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + u64 
s[S_COMPRESSED_NR][S_ALLOC_NR]; + u64 persistent_reserved; + u64 online_reserved; + u64 available_cache; +}; + +struct bucket_heap_entry { + struct bucket *g; + unsigned long val; +}; + +/* + * A reservation for space on disk: + */ +struct disk_reservation { + u64 sectors; + u32 gen; + unsigned nr_replicas; +}; + +#endif /* _BUCKETS_TYPES_H */ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c new file mode 100644 index 00000000..24b92a29 --- /dev/null +++ b/libbcachefs/chardev.c @@ -0,0 +1,407 @@ +#include "bcachefs.h" +#include "bcachefs_ioctl.h" +#include "super.h" +#include "super-io.h" + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/major.h> +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/ioctl.h> +#include <linux/uaccess.h> +#include <linux/slab.h> + +static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) +{ + struct bch_ioctl_assemble arg; + const char *err; + u64 *user_devs = NULL; + char **devs = NULL; + unsigned i; + int ret = -EFAULT; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); + if (!user_devs) + return -ENOMEM; + + devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); + if (!devs) { + ret = -ENOMEM; + goto err; + } + + if (copy_from_user(user_devs, user_arg->devs, + sizeof(u64) * arg.nr_devs)) + goto err; + + for (i = 0; i < arg.nr_devs; i++) { + devs[i] = strndup_user((const char __user *)(unsigned long) + user_devs[i], + PATH_MAX); + if (!devs[i]) { + ret = -ENOMEM; + goto err; + } + } + + err = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty(), NULL); + if (err) { + pr_err("Could not open filesystem: %s", err); + ret = -EINVAL; + goto err; + } + + ret = 0; +err: + if (devs) + for (i = 0; i < arg.nr_devs; i++) + kfree(devs[i]); + kfree(devs); + kfree(user_devs); + return ret; +} + +static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) +{ + struct bch_ioctl_incremental arg; + const char *err; + char *path; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + err = bch2_fs_open_incremental(path); + kfree(path); + + if (err) { + pr_err("Could not register bcachefs devices: %s", err); + return -EINVAL; + } + + return 0; +} + +static long bch2_global_ioctl(unsigned cmd, void __user *arg) +{ + switch (cmd) { + case BCH_IOCTL_ASSEMBLE: + return bch2_ioctl_assemble(arg); + case BCH_IOCTL_INCREMENTAL: + return bch2_ioctl_incremental(arg); + default: + return -ENOTTY; + } +} + +static long bch2_ioctl_query_uuid(struct bch_fs *c, + struct bch_ioctl_query_uuid __user *user_arg) +{ + return copy_to_user(&user_arg->uuid, + &c->sb.user_uuid, + sizeof(c->sb.user_uuid)) ? -EFAULT : 0; +} + +static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start __user *user_arg) +{ + struct bch_ioctl_start arg; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + return bch2_fs_start(c) ? 
-EIO : 0; +} + +static long bch2_ioctl_stop(struct bch_fs *c) +{ + bch2_fs_stop(c); + return 0; +} + +/* returns with ref on ca->ref */ +static struct bch_dev *bch2_device_lookup(struct bch_fs *c, + const char __user *dev) +{ + struct block_device *bdev; + struct bch_dev *ca; + char *path; + unsigned i; + + path = strndup_user(dev, PATH_MAX); + if (!path) + return ERR_PTR(-ENOMEM); + + bdev = lookup_bdev(strim(path)); + kfree(path); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + for_each_member_device(ca, c, i) + if (ca->disk_sb.bdev == bdev) + goto found; + + ca = NULL; +found: + bdput(bdev); + return ca; +} + +#if 0 +static struct bch_member *bch2_uuid_lookup(struct bch_fs *c, uuid_le uuid) +{ + struct bch_sb_field_members *mi = bch2_sb_get_members(c->disk_sb); + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + for (i = 0; i < c->disk_sb->nr_devices; i++) + if (!memcmp(&mi->members[i].uuid, &uuid, sizeof(uuid))) + return &mi->members[i]; + + return NULL; +} +#endif + +static long bch2_ioctl_disk_add(struct bch_fs *c, + struct bch_ioctl_disk __user *user_arg) +{ + struct bch_ioctl_disk arg; + char *path; + int ret; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + ret = bch2_dev_add(c, path); + kfree(path); + + return ret; +} + +static long bch2_ioctl_disk_remove(struct bch_fs *c, + struct bch_ioctl_disk __user *user_arg) +{ + struct bch_ioctl_disk arg; + struct bch_dev *ca; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + ca = bch2_device_lookup(c, (const char __user *)(unsigned long) arg.dev); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + return bch2_dev_remove(c, ca, arg.flags); +} + +static long bch2_ioctl_disk_online(struct bch_fs *c, + struct bch_ioctl_disk __user *user_arg) +{ + struct bch_ioctl_disk arg; + char *path; + int ret; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + ret = bch2_dev_online(c, path); + kfree(path); + return ret; +} + +static long bch2_ioctl_disk_offline(struct bch_fs *c, + struct bch_ioctl_disk __user *user_arg) +{ + struct bch_ioctl_disk arg; + struct bch_dev *ca; + int ret; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, (const char __user *)(unsigned long) arg.dev); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_offline(c, ca, arg.flags); + percpu_ref_put(&ca->ref); + return ret; +} + +static long bch2_ioctl_disk_set_state(struct bch_fs *c, + struct bch_ioctl_disk_set_state __user *user_arg) +{ + struct bch_ioctl_disk_set_state arg; + struct bch_dev *ca; + int ret; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + ca = bch2_device_lookup(c, (const char __user *)(unsigned long) arg.dev); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); + + percpu_ref_put(&ca->ref); + return ret; +} + +static long bch2_ioctl_disk_evacuate(struct bch_fs *c, + struct bch_ioctl_disk __user *user_arg) +{ + struct bch_ioctl_disk arg; + struct bch_dev *ca; + int ret; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + ca = bch2_device_lookup(c, (const char __user *)(unsigned long) 
arg.dev); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_evacuate(c, ca); + + percpu_ref_put(&ca->ref); + return ret; +} + +long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) +{ + /* ioctls that don't require admin cap: */ + switch (cmd) { + case BCH_IOCTL_QUERY_UUID: + return bch2_ioctl_query_uuid(c, arg); + } + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* ioctls that do require admin cap: */ + switch (cmd) { + case BCH_IOCTL_START: + return bch2_ioctl_start(c, arg); + case BCH_IOCTL_STOP: + return bch2_ioctl_stop(c); + + case BCH_IOCTL_DISK_ADD: + return bch2_ioctl_disk_add(c, arg); + case BCH_IOCTL_DISK_REMOVE: + return bch2_ioctl_disk_remove(c, arg); + case BCH_IOCTL_DISK_ONLINE: + return bch2_ioctl_disk_online(c, arg); + case BCH_IOCTL_DISK_OFFLINE: + return bch2_ioctl_disk_offline(c, arg); + case BCH_IOCTL_DISK_SET_STATE: + return bch2_ioctl_disk_set_state(c, arg); + case BCH_IOCTL_DISK_EVACUATE: + return bch2_ioctl_disk_evacuate(c, arg); + + default: + return -ENOTTY; + } +} + +static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) +{ + struct bch_fs *c = filp->private_data; + void __user *arg = (void __user *) v; + + return c + ? bch2_fs_ioctl(c, cmd, arg) + : bch2_global_ioctl(cmd, arg); +} + +static const struct file_operations bch_chardev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = bch2_chardev_ioctl, + .open = nonseekable_open, +}; + +static int bch_chardev_major; +static struct class *bch_chardev_class; +static struct device *bch_chardev; +static DEFINE_IDR(bch_chardev_minor); + +void bch2_fs_chardev_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->chardev)) + device_unregister(c->chardev); + if (c->minor >= 0) + idr_remove(&bch_chardev_minor, c->minor); +} + +int bch2_fs_chardev_init(struct bch_fs *c) +{ + c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); + if (c->minor < 0) + return c->minor; + + c->chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, c->minor), NULL, + "bcachefs%u-ctl", c->minor); + if (IS_ERR(c->chardev)) + return PTR_ERR(c->chardev); + + return 0; +} + +void bch2_chardev_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_chardev_class)) + device_destroy(bch_chardev_class, + MKDEV(bch_chardev_major, 255)); + if (!IS_ERR_OR_NULL(bch_chardev_class)) + class_destroy(bch_chardev_class); + if (bch_chardev_major > 0) + unregister_chrdev(bch_chardev_major, "bcachefs"); +} + +int __init bch2_chardev_init(void) +{ + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); + if (bch_chardev_major < 0) + return bch_chardev_major; + + bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); + if (IS_ERR(bch_chardev_class)) + return PTR_ERR(bch_chardev_class); + + bch_chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, 255), + NULL, "bcachefs-ctl"); + if (IS_ERR(bch_chardev)) + return PTR_ERR(bch_chardev); + + return 0; +} diff --git a/libbcachefs/chardev.h b/libbcachefs/chardev.h new file mode 100644 index 00000000..e0e34e24 --- /dev/null +++ b/libbcachefs/chardev.h @@ -0,0 +1,30 @@ +#ifndef _BCACHE_CHARDEV_H +#define _BCACHE_CHARDEV_H + +#ifndef NO_BCACHE_CHARDEV + +long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); + +void bch2_fs_chardev_exit(struct bch_fs *); +int bch2_fs_chardev_init(struct bch_fs *); + +void bch2_chardev_exit(void); +int __init bch2_chardev_init(void); + +#else + +static inline long bch2_fs_ioctl(struct bch_fs *c, + unsigned cmd, void __user * arg) +{ + return -ENOSYS; +} + +static inline 
void bch2_fs_chardev_exit(struct bch_fs *c) {} +static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } + +static inline void bch2_chardev_exit(void) {} +static inline int __init bch2_chardev_init(void) { return 0; } + +#endif + +#endif /* _BCACHE_CHARDEV_H */ diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c new file mode 100644 index 00000000..4545a499 --- /dev/null +++ b/libbcachefs/checksum.c @@ -0,0 +1,595 @@ +#include "bcachefs.h" +#include "checksum.h" +#include "super.h" +#include "super-io.h" + +#include <linux/crc32c.h> +#include <linux/crypto.h> +#include <linux/key.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <crypto/algapi.h> +#include <crypto/chacha20.h> +#include <crypto/hash.h> +#include <crypto/poly1305.h> +#include <keys/user-type.h> + +/* + * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any + * use permitted, subject to terms of PostgreSQL license; see.) + + * If we have a 64-bit integer type, then a 64-bit CRC looks just like the + * usual sort of implementation. (See Ross Williams' excellent introduction + * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from + * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) + * If we have no working 64-bit type, then fake it with two 32-bit registers. + * + * The present implementation is a normal (not "reflected", in Williams' + * terms) 64-bit CRC, using initial all-ones register contents and a final + * bit inversion. The chosen polynomial is borrowed from the DLT1 spec + * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 +*/ + +static const u64 crc_table[256] = { + 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, + 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, + 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, + 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, + 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, + 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, + 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, + 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, + 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, + 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, + 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, + 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, + 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, + 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, + 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, + 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, + 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, + 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, + 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, + 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, + 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, + 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, + 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 
0x93366450E42ECDF0ULL, + 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, + 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, + 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, + 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, + 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, + 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, + 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, + 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, + 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, + 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, + 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, + 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, + 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, + 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, + 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, + 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, + 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, + 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, + 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, + 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, + 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, + 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, + 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, + 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, + 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, + 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, + 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, + 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, + 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, + 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, + 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, + 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, + 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, + 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, + 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, + 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, + 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, + 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, + 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, + 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, + 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, + 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, + 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, + 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, + 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, + 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, + 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, + 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, + 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, + 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 
0x5D45907748EE6495ULL, + 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, + 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, + 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, + 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, + 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, + 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, + 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, + 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, + 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, + 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, + 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, + 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, + 0x9AFCE626CE85B507ULL, +}; + +u64 bch2_crc64_update(u64 crc, const void *_data, size_t len) +{ + const unsigned char *data = _data; + + while (len--) { + int i = ((int) (crc >> 56) ^ *data++) & 0xFF; + crc = crc_table[i] ^ (crc << 8); + } + + return crc; +} + +static u64 bch2_checksum_init(unsigned type) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case BCH_CSUM_CRC32C: + return U32_MAX; + case BCH_CSUM_CRC64: + return U64_MAX; + default: + BUG(); + } +} + +static u64 bch2_checksum_final(unsigned type, u64 crc) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case BCH_CSUM_CRC32C: + return crc ^ U32_MAX; + case BCH_CSUM_CRC64: + return crc ^ U64_MAX; + default: + BUG(); + } +} + +static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case BCH_CSUM_CRC32C: + return crc32c(crc, data, len); + case BCH_CSUM_CRC64: + return bch2_crc64_update(crc, data, len); + default: + BUG(); + } +} + +static inline void do_encrypt_sg(struct crypto_blkcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) +{ + struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d }; + int ret; + + ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); + BUG_ON(ret); +} + +static inline void do_encrypt(struct crypto_blkcipher *tfm, + struct nonce nonce, + void *buf, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, buf, len); + do_encrypt_sg(tfm, nonce, &sg, len); +} + +int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, + void *buf, size_t len) +{ + struct crypto_blkcipher *chacha20 = + crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC); + int ret; + + if (IS_ERR(chacha20)) + return PTR_ERR(chacha20); + + ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key)); + if (ret) + goto err; + + do_encrypt(chacha20, nonce, buf, len); +err: + crypto_free_blkcipher(chacha20); + return ret; +} + +static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, + struct nonce nonce) +{ + u8 key[POLY1305_KEY_SIZE]; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); + do_encrypt(c->chacha20, nonce, key, sizeof(key)); + + desc->tfm = c->poly1305; + desc->flags = 0; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); +} + +struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + struct nonce nonce, const void *data, size_t len) +{ + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { + u64 crc = bch2_checksum_init(type); + + crc = bch2_checksum_update(type, crc, data, len); + crc = bch2_checksum_final(type, crc); + + return (struct 
bch_csum) { .lo = crc }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + + crypto_shash_update(desc, data, len); + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +void bch2_encrypt(struct bch_fs *c, unsigned type, + struct nonce nonce, void *data, size_t len) +{ + if (!bch2_csum_type_is_encryption(type)) + return; + + do_encrypt(c->chacha20, nonce, data, len); +} + +struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + + switch (type) { + case BCH_CSUM_NONE: + return (struct bch_csum) { 0 }; + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { + u64 crc = bch2_checksum_init(type); + + bio_for_each_contig_segment(bv, bio, iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + crc = bch2_checksum_update(type, + crc, p, bv.bv_len); + kunmap_atomic(p); + } + + crc = bch2_checksum_final(type, crc); + return (struct bch_csum) { .lo = crc }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + + bio_for_each_contig_segment(bv, bio, iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + + crypto_shash_update(desc, p, bv.bv_len); + kunmap_atomic(p); + } + + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +void bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; + + if (!bch2_csum_type_is_encryption(type)) + return; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + + bio_for_each_contig_segment(bv, bio, iter) { + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); + do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + + le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE); + bytes = 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + sg = sgl; + } + + sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); + bytes += bv.bv_len; + + } + + sg_mark_end(sg - 1); + do_encrypt_sg(c->chacha20, nonce, sgl, bytes); +} + +#ifdef __KERNEL__ +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + char key_description[60]; + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + + snprintf(key_description, sizeof(key_description), + "bcachefs:%pUb", &sb->user_uuid); + + keyring_key = request_key(&key_type_logon, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen == sizeof(*key)) { + memcpy(key, ukp->data, ukp->datalen); + ret = 0; + } else { + ret = -EINVAL; + } + up_read(&keyring_key->sem); + key_put(keyring_key); + + return ret; +} +#else +#include <keyutils.h> +#include <uuid/uuid.h> + +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + key_serial_t key_id; + char key_description[60]; + char uuid[40]; + + uuid_unparse_lower(sb->user_uuid.b, uuid); + sprintf(key_description, "bcachefs:%s", uuid); + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); + if (key_id < 
0) + return -errno; + + if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) + return -1; + + return 0; +} +#endif + +static int bch2_decrypt_sb_key(struct bch_fs *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) +{ + struct bch_encrypted_key sb_key = crypt->key; + struct bch_key user_key; + int ret = 0; + + /* is key encrypted? */ + if (!bch2_key_is_encrypted(&sb_key)) + goto out; + + ret = bch2_request_key(c->disk_sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key"); + goto err; + } + + /* decrypt real key: */ + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &sb_key, sizeof(sb_key)); + if (ret) + goto err; + + if (bch2_key_is_encrypted(&sb_key)) { + bch_err(c, "incorrect encryption key"); + ret = -EINVAL; + goto err; + } +out: + *key = sb_key.key; +err: + memzero_explicit(&sb_key, sizeof(sb_key)); + memzero_explicit(&user_key, sizeof(user_key)); + return ret; +} + +static int bch2_alloc_ciphers(struct bch_fs *c) +{ + if (!c->chacha20) + c->chacha20 = crypto_alloc_blkcipher("chacha20", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(c->chacha20)) + return PTR_ERR(c->chacha20); + + if (!c->poly1305) + c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); + if (IS_ERR(c->poly1305)) + return PTR_ERR(c->poly1305); + + return 0; +} + +int bch2_disable_encryption(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + crypt = bch2_sb_get_crypt(c->disk_sb); + if (!crypt) + goto out; + + /* is key encrypted? */ + ret = 0; + if (bch2_key_is_encrypted(&crypt->key)) + goto out; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + crypt->key.magic = BCH_KEY_MAGIC; + crypt->key.key = key; + + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0); + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_enable_encryption(struct bch_fs *c, bool keyed) +{ + struct bch_encrypted_key key; + struct bch_key user_key; + struct bch_sb_field_crypt *crypt; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + /* Do we already have an encryption key? 
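If so, bail out rather than overwrite it - generating a new key would orphan everything encrypted with the old one.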
*/ + if (bch2_sb_get_crypt(c->disk_sb)) + goto err; + + ret = bch2_alloc_ciphers(c); + if (ret) + goto err; + + key.magic = BCH_KEY_MAGIC; + get_random_bytes(&key.key, sizeof(key.key)); + + if (keyed) { + ret = bch2_request_key(c->disk_sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key"); + goto err; + } + + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &key, sizeof(key)); + if (ret) + goto err; + } + + ret = crypto_blkcipher_setkey(c->chacha20, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto err; + + crypt = bch2_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64)); + if (!crypt) { + ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ + goto err; + } + + crypt->key = key; + + /* write superblock */ + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1); + bch2_write_super(c); +err: + mutex_unlock(&c->sb_lock); + memzero_explicit(&user_key, sizeof(user_key)); + memzero_explicit(&key, sizeof(key)); + return ret; +} + +void bch2_fs_encryption_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->poly1305)) + crypto_free_shash(c->poly1305); + if (!IS_ERR_OR_NULL(c->chacha20)) + crypto_free_blkcipher(c->chacha20); + if (!IS_ERR_OR_NULL(c->sha256)) + crypto_free_shash(c->sha256); +} + +int bch2_fs_encryption_init(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret; + + c->sha256 = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(c->sha256)) + return PTR_ERR(c->sha256); + + crypt = bch2_sb_get_crypt(c->disk_sb); + if (!crypt) + return 0; + + ret = bch2_alloc_ciphers(c); + if (ret) + return ret; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto err; + + ret = crypto_blkcipher_setkey(c->chacha20, + (void *) &key.key, sizeof(key.key)); +err: + memzero_explicit(&key, sizeof(key)); + return ret; +} diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h new file mode 100644 index 00000000..f540e305 --- /dev/null +++ b/libbcachefs/checksum.h @@ -0,0 +1,133 @@ +#ifndef _BCACHE_CHECKSUM_H +#define _BCACHE_CHECKSUM_H + +#include "bcachefs.h" +#include "super-io.h" + +#include <crypto/chacha20.h> + +u64 bch2_crc64_update(u64, const void *, size_t); + +#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) +#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) +#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) +#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) +#define BCH_NONCE_POLY cpu_to_le32(1 << 31) + +struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, + const void *, size_t); + +/* + * This is used for various on disk data structures - bch_sb, prio_set, bset, + * jset: The checksum is _always_ the first field of these structs + */ +#define csum_vstruct(_c, _type, _nonce, _i) \ +({ \ + const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ + const void *end = vstruct_end(_i); \ + \ + bch2_checksum(_c, _type, _nonce, start, end - start); \ +}) + +int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +int bch2_request_key(struct bch_sb *, struct bch_key *); + +void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, + void *data, size_t); + +struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); +void bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +int bch2_disable_encryption(struct bch_fs *); +int bch2_enable_encryption(struct bch_fs *, bool); + +void bch2_fs_encryption_exit(struct bch_fs *); +int bch2_fs_encryption_init(struct bch_fs *); + +static inline unsigned bch2_data_checksum_type(struct bch_fs *c) +{ 
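+ /* encryption implies authentication: the poly1305 MAC doubles as the data checksum, and wide_macs opts into the full 128 bit tag instead of the truncated 80 bit one */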
+ if (c->sb.encryption_type) + return c->opts.wide_macs + ? BCH_CSUM_CHACHA20_POLY1305_128 + : BCH_CSUM_CHACHA20_POLY1305_80; + + return c->opts.data_checksum; +} + +static inline unsigned bch2_meta_checksum_type(struct bch_fs *c) +{ + return c->sb.encryption_type + ? BCH_CSUM_CHACHA20_POLY1305_128 + : c->opts.metadata_checksum; +} + +static inline bool bch2_checksum_type_valid(const struct bch_fs *c, + unsigned type) +{ + if (type >= BCH_CSUM_NR) + return false; + + if (bch2_csum_type_is_encryption(type) && !c->chacha20) + return false; + + return true; +} + +static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_NONE] = 0, + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64] = 8, + [BCH_CSUM_CHACHA20_POLY1305_80] = 10, + [BCH_CSUM_CHACHA20_POLY1305_128] = 16, +}; + +static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) +{ + /* + * XXX: need some way of preventing the compiler from optimizing this + * into a form that isn't constant time.. + */ + return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; +} + +/* for skipping ahead and encrypting/decrypting at an offset: */ +static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) +{ + EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + + le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + return nonce; +} + +static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) +{ + return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; +} + +static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) +{ + __le64 magic = __bch2_sb_magic(sb); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) +{ + __le64 magic = bch2_sb_magic(c); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +#endif /* _BCACHE_CHECKSUM_H */ diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c new file mode 100644 index 00000000..3c3649f0 --- /dev/null +++ b/libbcachefs/clock.c @@ -0,0 +1,161 @@ +#include "bcachefs.h" +#include "clock.h" + +#include <linux/freezer.h> +#include <linux/kthread.h> + +static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r) +{ + return time_after(l->expire, r->expire); +} + +void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) + goto out; + + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp)); +out: + spin_unlock(&clock->timer_lock); +} + +void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) { + heap_del(&clock->timers, i, io_timer_cmp); + break; + } + + spin_unlock(&clock->timer_lock); +} + +struct io_clock_wait { + struct io_timer timer; + struct task_struct *task; + int expired; +}; + +static void io_clock_wait_fn(struct io_timer *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) +{ + struct io_clock_wait wait; + + /* XXX: calculate sleep time rigorously */ + wait.timer.expire = until; + wait.timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.timer); + + schedule(); 
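+ /* the timer may not have fired yet - bch2_io_timer_del() is a no-op if it already expired and was popped off the heap, so deleting unconditionally is safe */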
+ + bch2_io_timer_del(clock, &wait.timer); +} + +/* + * _only_ to be used from a kthread + */ +void bch2_kthread_io_clock_wait(struct io_clock *clock, + unsigned long until) +{ + struct io_clock_wait wait; + + /* XXX: calculate sleep time rigorously */ + wait.timer.expire = until; + wait.timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.timer); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + + if (wait.expired) + break; + + schedule(); + try_to_freeze(); + } + + __set_current_state(TASK_RUNNING); + bch2_io_timer_del(clock, &wait.timer); +} + +static struct io_timer *get_expired_timer(struct io_clock *clock, + unsigned long now) +{ + struct io_timer *ret = NULL; + + spin_lock(&clock->timer_lock); + + if (clock->timers.used && + time_after_eq(now, clock->timers.data[0]->expire)) + heap_pop(&clock->timers, ret, io_timer_cmp); + + spin_unlock(&clock->timer_lock); + + return ret; +} + +void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + struct io_timer *timer; + unsigned long now; + + /* Buffer up one megabyte worth of IO in the percpu counter */ + preempt_disable(); + + if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < + IO_CLOCK_PCPU_SECTORS)) { + preempt_enable(); + return; + } + + sectors = this_cpu_xchg(*clock->pcpu_buf, 0); + preempt_enable(); + now = atomic_long_add_return(sectors, &clock->now); + + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +} + +void bch2_io_clock_exit(struct io_clock *clock) +{ + free_heap(&clock->timers); + free_percpu(clock->pcpu_buf); +} + +int bch2_io_clock_init(struct io_clock *clock) +{ + atomic_long_set(&clock->now, 0); + spin_lock_init(&clock->timer_lock); + + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); + if (!clock->pcpu_buf) + return -ENOMEM; + + if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) + return -ENOMEM; + + return 0; +} diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h new file mode 100644 index 00000000..061bf04a --- /dev/null +++ b/libbcachefs/clock.h @@ -0,0 +1,23 @@ +#ifndef _BCACHE_CLOCK_H +#define _BCACHE_CLOCK_H + +void bch2_io_timer_add(struct io_clock *, struct io_timer *); +void bch2_io_timer_del(struct io_clock *, struct io_timer *); +void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long); +void bch2_increment_clock(struct bch_fs *, unsigned, int); + +void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); + +#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_timeout(wq, condition, timeout); \ + __ret; \ +}) + +void bch2_io_clock_exit(struct io_clock *); +int bch2_io_clock_init(struct io_clock *); + +#endif /* _BCACHE_CLOCK_H */ diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h new file mode 100644 index 00000000..4a02f467 --- /dev/null +++ b/libbcachefs/clock_types.h @@ -0,0 +1,34 @@ +#ifndef _BCACHE_CLOCK_TYPES_H +#define _BCACHE_CLOCK_TYPES_H + +#include "util.h" + +#define NR_IO_TIMERS 8 + +/* + * Clocks/timers in units of sectors of IO: + * + * Note - they use percpu batching, so they're only approximate. 
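+ * Each cpu buffers up to IO_CLOCK_PCPU_SECTORS of IO before folding it into the shared clock, so a timer can fire up to roughly nr_cpu_ids * IO_CLOCK_PCPU_SECTORS sectors late.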
+ */ + +struct io_timer; +typedef void (*io_timer_fn)(struct io_timer *); + +struct io_timer { + io_timer_fn fn; + unsigned long expire; +}; + +/* Amount to buffer up on a percpu counter */ +#define IO_CLOCK_PCPU_SECTORS 128 + +struct io_clock { + atomic_long_t now; + u16 __percpu *pcpu_buf; + + spinlock_t timer_lock; + DECLARE_HEAP(struct io_timer *, timers); +}; + +#endif /* _BCACHE_CLOCK_TYPES_H */ + diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c new file mode 100644 index 00000000..547ea732 --- /dev/null +++ b/libbcachefs/compress.c @@ -0,0 +1,500 @@ +#include "bcachefs.h" +#include "compress.h" +#include "extents.h" +#include "io.h" +#include "super-io.h" + +#include <linux/lz4.h> +#include <linux/zlib.h> + +enum bounced { + BOUNCED_CONTIG, + BOUNCED_MAPPED, + BOUNCED_KMALLOCED, + BOUNCED_VMALLOCED, + BOUNCED_MEMPOOLED, +}; + +static void *__bounce_alloc(struct bch_fs *c, unsigned size, + unsigned *bounced, int direction) +{ + void *data; + + *bounced = BOUNCED_KMALLOCED; + data = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + if (data) + return data; + + *bounced = BOUNCED_MEMPOOLED; + data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT); + if (data) + return page_address(data); + + *bounced = BOUNCED_VMALLOCED; + data = vmalloc(size); + if (data) + return data; + + *bounced = BOUNCED_MEMPOOLED; + data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO); + return page_address(data); +} + +static void *__bio_map_or_bounce(struct bch_fs *c, + struct bio *bio, struct bvec_iter start, + unsigned *bounced, int direction) +{ + struct bio_vec bv; + struct bvec_iter iter; + unsigned nr_pages = 0; + struct page *stack_pages[16]; + struct page **pages = NULL; + bool first = true; + unsigned prev_end = PAGE_SIZE; + void *data; + + BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX); + +#ifndef CONFIG_HIGHMEM + *bounced = BOUNCED_CONTIG; + + __bio_for_each_contig_segment(bv, bio, iter, start) { + if (bv.bv_len == start.bi_size) + return page_address(bv.bv_page) + bv.bv_offset; + } +#endif + *bounced = BOUNCED_MAPPED; + + __bio_for_each_segment(bv, bio, iter, start) { + if ((!first && bv.bv_offset) || + prev_end != PAGE_SIZE) + goto bounce; + + prev_end = bv.bv_offset + bv.bv_len; + nr_pages++; + } + + BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); + + pages = nr_pages > ARRAY_SIZE(stack_pages) + ? 
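/* too many segments for the on-stack array - fall back to a heap allocation */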
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) + : stack_pages; + if (!pages) + goto bounce; + + nr_pages = 0; + __bio_for_each_segment(bv, bio, iter, start) + pages[nr_pages++] = bv.bv_page; + + data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (pages != stack_pages) + kfree(pages); + + return data + bio_iter_offset(bio, start); +bounce: + data = __bounce_alloc(c, start.bi_size, bounced, direction); + + if (direction == READ) + memcpy_from_bio(data, bio, start); + + return data; +} + +static void *bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + unsigned *bounced, int direction) +{ + return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction); +} + +static void bio_unmap_or_unbounce(struct bch_fs *c, void *data, + unsigned bounced, int direction) +{ + if (!data) + return; + + switch (bounced) { + case BOUNCED_MAPPED: + vunmap((void *) ((unsigned long) data & PAGE_MASK)); + return; + case BOUNCED_KMALLOCED: + kfree(data); + return; + case BOUNCED_VMALLOCED: + vfree(data); + return; + case BOUNCED_MEMPOOLED: + mempool_free(virt_to_page(data), &c->compression_bounce[direction]); + return; + } +} + +static inline void zlib_set_workspace(z_stream *strm, void *workspace) +{ +#ifdef __KERNEL__ + strm->workspace = workspace; +#endif +} + +static int __bio_uncompress(struct bch_fs *c, struct bio *src, + void *dst_data, struct bch_extent_crc128 crc) +{ + void *src_data = NULL; + unsigned src_bounced; + size_t src_len = src->bi_iter.bi_size; + size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; + int ret; + + src_data = bio_map_or_bounce(c, src, &src_bounced, READ); + + switch (crc.compression_type) { + case BCH_COMPRESSION_LZ4: + ret = lz4_decompress(src_data, &src_len, + dst_data, dst_len); + if (ret) { + ret = -EIO; + goto err; + } + break; + case BCH_COMPRESSION_GZIP: { + void *workspace; + z_stream strm; + + workspace = kmalloc(zlib_inflate_workspacesize(), + GFP_NOIO|__GFP_NOWARN); + if (!workspace) { + mutex_lock(&c->zlib_workspace_lock); + workspace = c->zlib_workspace; + } + + strm.next_in = src_data; + strm.avail_in = src_len; + strm.next_out = dst_data; + strm.avail_out = dst_len; + zlib_set_workspace(&strm, workspace); + zlib_inflateInit2(&strm, -MAX_WBITS); + + ret = zlib_inflate(&strm, Z_FINISH); + + if (workspace == c->zlib_workspace) + mutex_unlock(&c->zlib_workspace_lock); + else + kfree(workspace); + + if (ret != Z_STREAM_END) { + ret = -EIO; + goto err; + } + break; + } + default: + BUG(); + } + ret = 0; +err: + bio_unmap_or_unbounce(c, src_data, src_bounced, READ); + return ret; +} + +int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + unsigned live_data_sectors, + struct bch_extent_crc128 crc) +{ + void *dst_data = NULL; + size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; + int ret = -ENOMEM; + + BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs); + + /* XXX mempoolify */ + dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN); + if (!dst_data) { + dst_data = vmalloc(dst_len); + if (!dst_data) + goto err; + } + + ret = __bio_uncompress(c, bio, dst_data, crc); + if (ret) + goto err; + + while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; + + bv->bv_page = alloc_page(GFP_NOIO); + if (!bv->bv_page) + goto use_mempool; + + bv->bv_len = PAGE_SIZE; + bv->bv_offset = 0; + bio->bi_vcnt++; + } + + bio->bi_iter.bi_size = live_data_sectors << 9; +copy_data: + memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9)); +err: + 
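/* dst_data may be from kmalloc() or vmalloc() (or NULL on early failure) - kvfree() handles all three */ +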
kvfree(dst_data); + return ret; +use_mempool: + /* + * We already allocated from mempool, we can't allocate from it again + * without freeing the pages we already allocated or else we could + * deadlock: + */ + + bch2_bio_free_pages_pool(c, bio); + bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9); + goto copy_data; +} + +int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, + struct bio *dst, struct bvec_iter dst_iter, + struct bch_extent_crc128 crc) +{ + void *dst_data = NULL; + unsigned dst_bounced; + size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; + int ret = -ENOMEM; + + dst_data = dst_len == dst_iter.bi_size + ? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE) + : __bounce_alloc(c, dst_len, &dst_bounced, WRITE); + + ret = __bio_uncompress(c, src, dst_data, crc); + if (ret) + goto err; + + if (dst_bounced) + memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9)); +err: + bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE); + return ret; +} + +static int __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_type) +{ + void *src_data = NULL, *dst_data = NULL; + unsigned src_bounced, dst_bounced, pad; + int ret = -1; + + dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE); + src_data = bio_map_or_bounce(c, src, &src_bounced, READ); + + switch (compression_type) { + case BCH_COMPRESSION_LZ4: { + void *workspace; + + *dst_len = dst->bi_iter.bi_size; + *src_len = src->bi_iter.bi_size; + + workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO); + + while (*src_len > block_bytes(c) && + (ret = lz4_compress(src_data, *src_len, + dst_data, dst_len, + workspace))) { + /* + * On error, the compressed data was bigger than + * dst_len, and -ret is the amount of data we were able + * to compress - round down to nearest block and try + * again: + */ + BUG_ON(ret > 0); + BUG_ON(-ret >= *src_len); + + *src_len = round_down(-ret, block_bytes(c)); + } + + mempool_free(workspace, &c->lz4_workspace_pool); + + if (ret) + goto err; + break; + } + case BCH_COMPRESSION_GZIP: { + void *workspace; + z_stream strm; + + workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS, + DEF_MEM_LEVEL), + GFP_NOIO|__GFP_NOWARN); + if (!workspace) { + mutex_lock(&c->zlib_workspace_lock); + workspace = c->zlib_workspace; + } + + strm.next_in = src_data; + strm.avail_in = min(src->bi_iter.bi_size, + dst->bi_iter.bi_size); + strm.next_out = dst_data; + strm.avail_out = dst->bi_iter.bi_size; + zlib_set_workspace(&strm, workspace); + zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, + Z_DEFAULT_STRATEGY); + + ret = zlib_deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END) { + ret = -EIO; + goto zlib_err; + } + + ret = zlib_deflateEnd(&strm); + if (ret != Z_OK) { + ret = -EIO; + goto zlib_err; + } + + ret = 0; +zlib_err: + if (workspace == c->zlib_workspace) + mutex_unlock(&c->zlib_workspace_lock); + else + kfree(workspace); + + if (ret) + goto err; + + *dst_len = strm.total_out; + *src_len = strm.total_in; + break; + } + default: + BUG(); + } + + BUG_ON(!*dst_len); + BUG_ON(*dst_len > dst->bi_iter.bi_size); + + BUG_ON(*src_len & (block_bytes(c) - 1)); + BUG_ON(*src_len > src->bi_iter.bi_size); + + /* Didn't get smaller: */ + if (round_up(*dst_len, block_bytes(c)) >= *src_len) { + ret = -1; + goto err; + } + + pad = round_up(*dst_len, block_bytes(c)) - *dst_len; + + memset(dst_data + *dst_len, 0, pad); + *dst_len += pad; + + if (dst_bounced) + memcpy_to_bio(dst, 
dst->bi_iter, dst_data); +err: + bio_unmap_or_unbounce(c, src_data, src_bounced, READ); + bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE); + return ret; +} + +void bch2_bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned *compression_type) +{ + unsigned orig_dst = dst->bi_iter.bi_size; + unsigned orig_src = src->bi_iter.bi_size; + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = + min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9); + + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = + min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + + /* If it's only one block, don't bother trying to compress: */ + if (*compression_type != BCH_COMPRESSION_NONE && + bio_sectors(src) > c->sb.block_size && + !__bio_compress(c, dst, dst_len, src, src_len, *compression_type)) + goto out; + + /* If compressing failed (didn't get smaller), just copy: */ + *compression_type = BCH_COMPRESSION_NONE; + *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + bio_copy_data(dst, src); +out: + dst->bi_iter.bi_size = orig_dst; + src->bi_iter.bi_size = orig_src; +} + +/* doesn't write superblock: */ +int bch2_check_set_has_compressed_data(struct bch_fs *c, + unsigned compression_type) +{ + switch (compression_type) { + case BCH_COMPRESSION_NONE: + return 0; + case BCH_COMPRESSION_LZ4: + if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) + return 0; + + bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4); + break; + case BCH_COMPRESSION_GZIP: + if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; + + bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP); + break; + } + + return bch2_fs_compress_init(c); +} + +void bch2_fs_compress_exit(struct bch_fs *c) +{ + vfree(c->zlib_workspace); + mempool_exit(&c->lz4_workspace_pool); + mempool_exit(&c->compression_bounce[WRITE]); + mempool_exit(&c->compression_bounce[READ]); +} + +#define COMPRESSION_WORKSPACE_SIZE \ + max_t(size_t, zlib_inflate_workspacesize(), \ + zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)) + +int bch2_fs_compress_init(struct bch_fs *c) +{ + unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); + int ret; + + if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && + !bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; + + if (!mempool_initialized(&c->compression_bounce[READ])) { + ret = mempool_init_page_pool(&c->compression_bounce[READ], + 1, order); + if (ret) + return ret; + } + + if (!mempool_initialized(&c->compression_bounce[WRITE])) { + ret = mempool_init_page_pool(&c->compression_bounce[WRITE], + 1, order); + if (ret) + return ret; + } + + if (!mempool_initialized(&c->lz4_workspace_pool) && + bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) { + ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, + 1, LZ4_MEM_COMPRESS); + if (ret) + return ret; + } + + if (!c->zlib_workspace && + bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) { + c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE); + if (!c->zlib_workspace) + return -ENOMEM; + } + + return 0; +} diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h new file mode 100644 index 00000000..05804f55 --- /dev/null +++ b/libbcachefs/compress.h @@ -0,0 +1,15 @@ +#ifndef _BCACHE_COMPRESS_H +#define _BCACHE_COMPRESS_H + +int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, + unsigned, struct bch_extent_crc128); +int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, + struct 
bvec_iter, struct bch_extent_crc128); +void bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, + struct bio *, size_t *, unsigned *); + +int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); +void bch2_fs_compress_exit(struct bch_fs *); +int bch2_fs_compress_init(struct bch_fs *); + +#endif /* _BCACHE_COMPRESS_H */ diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c new file mode 100644 index 00000000..248bc7a1 --- /dev/null +++ b/libbcachefs/debug.c @@ -0,0 +1,417 @@ +/* + * Assorted bcachefs debug code + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "buckets.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "fs-gc.h" +#include "inode.h" +#include "io.h" +#include "super.h" + +#include <linux/console.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/seq_file.h> + +static struct dentry *bch_debug; + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + struct btree *v = c->verify_data; + struct btree_node *n_ondisk, *n_sorted, *n_inmemory; + struct bset *sorted, *inmemory; + struct extent_pick_ptr pick; + struct bio *bio; + + if (c->opts.nochanges) + return; + + btree_node_io_lock(b); + mutex_lock(&c->verify_lock); + + n_ondisk = c->verify_ondisk; + n_sorted = c->verify_data->data; + n_inmemory = b->data; + + bkey_copy(&v->key, &b->key); + v->written = 0; + v->level = b->level; + v->btree_id = b->btree_id; + bch2_btree_keys_init(v, &c->expensive_debug_checks); + + pick = bch2_btree_pick_ptr(c, b); + if (IS_ERR_OR_NULL(pick.ca)) + return; + + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); + bio->bi_bdev = pick.ca->disk_sb.bdev; + bio->bi_iter.bi_sector = pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bch2_bio_map(bio, n_sorted); + + submit_bio_wait(bio); + + bio_put(bio); + + memcpy(n_ondisk, n_sorted, btree_bytes(c)); + + bch2_btree_node_read_done(c, v, pick.ca, &pick.ptr); + n_sorted = c->verify_data->data; + + percpu_ref_put(&pick.ca->io_ref); + + sorted = &n_sorted->keys; + inmemory = &n_inmemory->keys; + + if (inmemory->u64s != sorted->u64s || + memcmp(inmemory->start, + sorted->start, + vstruct_end(inmemory) - (void *) inmemory->start)) { + unsigned offset = 0, sectors; + struct bset *i; + unsigned j; + + console_lock(); + + printk(KERN_ERR "*** in memory:\n"); + bch2_dump_bset(b, inmemory, 0); + + printk(KERN_ERR "*** read back in:\n"); + bch2_dump_bset(v, sorted, 0); + + while (offset < b->written) { + if (!offset ) { + i = &n_ondisk->keys; + sectors = vstruct_blocks(n_ondisk, c->block_bits) << + c->block_bits; + } else { + struct btree_node_entry *bne = + (void *) n_ondisk + (offset << 9); + i = &bne->keys; + + sectors = vstruct_blocks(bne, c->block_bits) << + c->block_bits; + } + + printk(KERN_ERR "*** on disk block %u:\n", offset); + bch2_dump_bset(b, i, offset); + + offset += sectors; + } + + printk(KERN_ERR "*** block %u/%u not written\n", + offset >> c->block_bits, btree_blocks(c)); + + for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) + if (inmemory->_data[j] != sorted->_data[j]) + break; + + printk(KERN_ERR "b->written %u\n", b->written); + + console_unlock(); + panic("verify failed at %u\n", j); + } + + 
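+ /*
+ * If we get here, the in memory and on disk versions of the node
+ * agreed - drop our locks, in reverse order of acquisition:
+ */
+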
mutex_unlock(&c->verify_lock); + btree_node_io_unlock(b); +} + +#endif + +#ifdef CONFIG_DEBUG_FS + +/* XXX: bch_fs refcounting */ + +struct dump_iter { + struct bpos from; + struct bch_fs *c; + enum btree_id id; + + char buf[PAGE_SIZE]; + size_t bytes; /* what's currently in buf */ + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +static int flush_buf(struct dump_iter *i) +{ + if (i->bytes) { + size_t bytes = min(i->bytes, i->size); + int err = copy_to_user(i->ubuf, i->buf, bytes); + + if (err) + return err; + + i->ret += bytes; + i->ubuf += bytes; + i->size -= bytes; + i->bytes -= bytes; + memmove(i->buf, i->buf + bytes, i->bytes); + } + + return 0; +} + +static int bch2_dump_open(struct inode *inode, struct file *file) +{ + struct btree_debug *bd = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->from = POS_MIN; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; + + return 0; +} + +static int bch2_dump_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t bch2_read_btree(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_iter iter; + struct bkey_s_c k; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + return i->ret; + + bch2_btree_iter_init(&iter, i->c, i->id, i->from); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(err = btree_iter_err(k))) { + bch2_bkey_val_to_text(i->c, bkey_type(0, i->id), + i->buf, sizeof(i->buf), k); + i->bytes = strlen(i->buf); + BUG_ON(i->bytes >= PAGE_SIZE); + i->buf[i->bytes] = '\n'; + i->bytes++; + + bch2_btree_iter_advance_pos(&iter); + i->from = iter.pos; + + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; + } + bch2_btree_iter_unlock(&iter); + + return err < 0 ? err : i->ret; +} + +static const struct file_operations btree_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree, +}; + +static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_iter iter; + struct btree *b; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size || !bkey_cmp(POS_MAX, i->from)) + return i->ret; + + for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) { + i->bytes = bch2_print_btree_node(i->c, b, i->buf, + sizeof(i->buf)); + err = flush_buf(i); + if (err) + break; + + /* + * can't easily correctly restart a btree node traversal across + * all nodes, meh + */ + i->from = bkey_cmp(POS_MAX, b->key.k.p) + ? bkey_successor(b->key.k.p) + : b->key.k.p; + + if (!i->size) + break; + } + bch2_btree_iter_unlock(&iter); + + return err < 0 ? 
err : i->ret; +} + +static const struct file_operations btree_format_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree_formats, +}; + +static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_iter iter; + struct bkey_s_c k; + struct btree *prev_node = NULL; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + return i->ret; + + bch2_btree_iter_init(&iter, i->c, i->id, i->from); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(err = btree_iter_err(k))) { + struct btree *b = iter.nodes[0]; + struct btree_node_iter *node_iter = &iter.node_iters[0]; + struct bkey_packed *_k = bch2_btree_node_iter_peek(node_iter, b); + + if (iter.nodes[0] != prev_node) { + i->bytes = bch2_print_btree_node(i->c, b, i->buf, + sizeof(i->buf)); + err = flush_buf(i); + if (err) + break; + } + prev_node = iter.nodes[0]; + + i->bytes = bch2_bkey_print_bfloat(b, _k, i->buf, sizeof(i->buf)); + + err = flush_buf(i); + if (err) + break; + + bch2_btree_iter_advance_pos(&iter); + i->from = iter.pos; + + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; + } + bch2_btree_iter_unlock(&iter); + + return err < 0 ? err : i->ret; +} + +static const struct file_operations bfloat_failed_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_bfloat_failed, +}; + +void bch2_fs_debug_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->debug)) + debugfs_remove_recursive(c->debug); +} + +void bch2_fs_debug_init(struct bch_fs *c) +{ + struct btree_debug *bd; + char name[100]; + + if (IS_ERR_OR_NULL(bch_debug)) + return; + + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + c->debug = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->debug)) + return; + + for (bd = c->btree_debug; + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; + bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], + 0400, c->debug, bd, + &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", + bch2_btree_ids[bd->id]); + + bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, + &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", + bch2_btree_ids[bd->id]); + + bd->failed = debugfs_create_file(name, 0400, c->debug, bd, + &bfloat_failed_debug_ops); + } +} + +#endif + +void bch2_debug_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_debug)) + debugfs_remove_recursive(bch_debug); +} + +int __init bch2_debug_init(void) +{ + int ret = 0; + + bch_debug = debugfs_create_dir("bcachefs", NULL); + return ret; +} diff --git a/libbcachefs/debug.h b/libbcachefs/debug.h new file mode 100644 index 00000000..77245045 --- /dev/null +++ b/libbcachefs/debug.h @@ -0,0 +1,62 @@ +#ifndef _BCACHE_DEBUG_H +#define _BCACHE_DEBUG_H + +#include "bcachefs.h" + +struct bio; +struct btree; +struct bch_fs; + +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#define BCH_DEBUG_PARAM(name, description) \ + static inline bool name(struct bch_fs *c) \ + { return bch2_##name || c->name; } +BCH_DEBUG_PARAMS_ALWAYS() +#undef BCH_DEBUG_PARAM + +#ifdef CONFIG_BCACHEFS_DEBUG + +#define BCH_DEBUG_PARAM(name, description) \ + static inline bool name(struct bch_fs *c) \ + { return bch2_##name || c->name; } 
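+/*
+ * As with BCH_DEBUG_PARAMS_ALWAYS() above, each expansion here generates a
+ * per parameter predicate - e.g. for a parameter named
+ * expensive_debug_checks this expands to:
+ *
+ * static inline bool expensive_debug_checks(struct bch_fs *c)
+ * { return bch2_expensive_debug_checks || c->expensive_debug_checks; }
+ *
+ * i.e. the check is enabled either module wide or on this particular
+ * filesystem.
+ */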
+BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM + +void __bch2_btree_verify(struct bch_fs *, struct btree *); + +#define bypass_torture_test(d) ((d)->bypass_torture_test) + +#else /* DEBUG */ + +#define BCH_DEBUG_PARAM(name, description) \ + static inline bool name(struct bch_fs *c) { return false; } +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM + +static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} + +#define bypass_torture_test(d) 0 + +#endif + +static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + if (verify_btree_ondisk(c)) + __bch2_btree_verify(c, b); +} + +#ifdef CONFIG_DEBUG_FS +void bch2_fs_debug_exit(struct bch_fs *); +void bch2_fs_debug_init(struct bch_fs *); +#else +static inline void bch2_fs_debug_exit(struct bch_fs *c) {} +static inline void bch2_fs_debug_init(struct bch_fs *c) {} +#endif + +void bch2_debug_exit(void); +int bch2_debug_init(void); + +#endif diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c new file mode 100644 index 00000000..503f0dc4 --- /dev/null +++ b/libbcachefs/dirent.c @@ -0,0 +1,427 @@ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "dirent.h" +#include "fs.h" +#include "keylist.h" +#include "str_hash.h" + +#include <linux/dcache.h> + +unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +{ + unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent); + + while (len && !d.v->d_name[len - 1]) + --len; + + return len; +} + +static u64 bch2_dirent_hash(const struct bch_hash_info *info, + const struct qstr *name) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, name->name, name->len); + + /* [0,2) reserved for dots */ + return max_t(u64, bch2_str_hash_end(&ctx, info), 2); +} + +static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_dirent_hash(info, key); +} + +static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + + return bch2_dirent_hash(info, &name); +} + +static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + int len = bch2_dirent_name_bytes(l); + const struct qstr *r = _r; + + return len - r->len ?: memcmp(l.v->d_name, r->name, len); +} + +static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); + int l_len = bch2_dirent_name_bytes(l); + int r_len = bch2_dirent_name_bytes(r); + + return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); +} + +static const struct bch_hash_desc dirent_hash_desc = { + .btree_id = BTREE_ID_DIRENTS, + .key_type = BCH_DIRENT, + .whiteout_type = BCH_DIRENT_WHITEOUT, + .hash_key = dirent_hash_key, + .hash_bkey = dirent_hash_bkey, + .cmp_key = dirent_cmp_key, + .cmp_bkey = dirent_cmp_bkey, +}; + +static const char *bch2_dirent_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_DIRENT: + return bkey_val_bytes(k.k) < sizeof(struct bch_dirent) + ? "value too small" + : NULL; + + case BCH_DIRENT_WHITEOUT: + return bkey_val_bytes(k.k) != 0 + ? 
"value size should be zero" + : NULL; + + default: + return "invalid type"; + } +} + +static void bch2_dirent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d; + + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + + if (size) { + unsigned n = min_t(unsigned, size, + bch2_dirent_name_bytes(d)); + memcpy(buf, d.v->d_name, n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + scnprintf(buf, size, " -> %llu", d.v->d_inum); + break; + case BCH_DIRENT_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +const struct bkey_ops bch2_bkey_dirent_ops = { + .key_invalid = bch2_dirent_invalid, + .val_to_text = bch2_dirent_to_text, +}; + +static struct bkey_i_dirent *dirent_create_key(u8 type, + const struct qstr *name, u64 dst) +{ + struct bkey_i_dirent *dirent; + unsigned u64s = BKEY_U64s + + DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len, + sizeof(u64)); + + dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS); + if (!dirent) + return NULL; + + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = u64s; + dirent->v.d_inum = cpu_to_le64(dst); + dirent->v.d_type = type; + + memcpy(dirent->v.d_name, name->name, name->len); + memset(dirent->v.d_name + name->len, 0, + bkey_val_bytes(&dirent->k) - + (sizeof(struct bch_dirent) + name->len)); + + EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); + + return dirent; +} + +int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *journal_seq, int flags) +{ + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(type, name, dst_inum); + if (!dirent) + return -ENOMEM; + + ret = bch2_hash_set(dirent_hash_desc, hash_info, c, dir_inum, + journal_seq, &dirent->k_i, flags); + kfree(dirent); + + return ret; +} + +static void dirent_copy_target(struct bkey_i_dirent *dst, + struct bkey_s_c_dirent src) +{ + dst->v.d_inum = src.v->d_inum; + dst->v.d_type = src.v->d_type; +} + +static struct bpos bch2_dirent_pos(struct bch_inode_info *ei, + const struct qstr *name) +{ + return POS(ei->vfs_inode.i_ino, bch2_dirent_hash(&ei->str_hash, name)); +} + +int bch2_dirent_rename(struct bch_fs *c, + struct inode *src_dir, const struct qstr *src_name, + struct inode *dst_dir, const struct qstr *dst_name, + u64 *journal_seq, enum bch_rename_mode mode) +{ + struct bch_inode_info *src_ei = to_bch_ei(src_dir); + struct bch_inode_info *dst_ei = to_bch_ei(dst_dir); + struct btree_iter src_iter, dst_iter, whiteout_iter; + struct bkey_s_c old_src, old_dst; + struct bkey delete; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos src_pos = bch2_dirent_pos(src_ei, src_name); + struct bpos dst_pos = bch2_dirent_pos(dst_ei, dst_name); + bool need_whiteout; + int ret = -ENOMEM; + + bch2_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos); + bch2_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos); + bch2_btree_iter_link(&src_iter, &dst_iter); + + bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos); + bch2_btree_iter_link(&src_iter, &whiteout_iter); + + if (mode == BCH_RENAME_EXCHANGE) { + new_src = dirent_create_key(0, src_name, 0); + if (!new_src) + goto err; + } else { + new_src = (void *) &delete; + } + + new_dst = dirent_create_key(0, dst_name, 0); + if (!new_dst) + goto err; +retry: + /* + * Note that on -EINTR/dropped locks we're not restarting the lookup + * from the original hashed position (like we do when creating 
dirents, + * in bch_hash_set) - we never move existing dirents to different slot: + */ + old_src = bch2_hash_lookup_at(dirent_hash_desc, + &src_ei->str_hash, + &src_iter, src_name); + if ((ret = btree_iter_err(old_src))) + goto err; + + ret = bch2_hash_needs_whiteout(dirent_hash_desc, + &src_ei->str_hash, + &whiteout_iter, &src_iter); + if (ret < 0) + goto err; + need_whiteout = ret; + + /* + * Note that in BCH_RENAME mode, we're _not_ checking if + * the target already exists - we're relying on the VFS + * to do that check for us for correctness: + */ + old_dst = mode == BCH_RENAME + ? bch2_hash_hole_at(dirent_hash_desc, &dst_iter) + : bch2_hash_lookup_at(dirent_hash_desc, + &dst_ei->str_hash, + &dst_iter, dst_name); + if ((ret = btree_iter_err(old_dst))) + goto err; + + switch (mode) { + case BCH_RENAME: + bkey_init(&new_src->k); + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + + if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && + bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { + /* + * If we couldn't insert new_dst at its hashed + * position (dst_pos) due to a hash collision, + * and we're going to be deleting in + * between the hashed position and first empty + * slot we found - just overwrite the pos we + * were going to delete: + * + * Note: this is a correctness issue, in this + * situation bch2_hash_needs_whiteout() could + * return false when the whiteout would have + * been needed if we inserted at the pos + * __dirent_find_hole() found + */ + new_dst->k.p = src_iter.pos; + ret = bch2_btree_insert_at(c, NULL, NULL, + journal_seq, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&src_iter, + &new_dst->k_i)); + goto err; + } + + if (need_whiteout) + new_src->k.type = BCH_DIRENT_WHITEOUT; + break; + case BCH_RENAME_OVERWRITE: + bkey_init(&new_src->k); + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + + if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && + bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { + /* + * Same case described above - + * bch_hash_needs_whiteout could spuriously + * return false, but we have to insert at + * dst_iter.pos because we're overwriting + * another dirent: + */ + new_src->k.type = BCH_DIRENT_WHITEOUT; + } else if (need_whiteout) + new_src->k.type = BCH_DIRENT_WHITEOUT; + break; + case BCH_RENAME_EXCHANGE: + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + break; + } + + new_src->k.p = src_iter.pos; + new_dst->k.p = dst_iter.pos; + ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i), + BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i)); +err: + if (ret == -EINTR) + goto retry; + + bch2_btree_iter_unlock(&whiteout_iter); + bch2_btree_iter_unlock(&dst_iter); + bch2_btree_iter_unlock(&src_iter); + + if (new_src != (void *) &delete) + kfree(new_src); + kfree(new_dst); + return ret; +} + +int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name, + u64 *journal_seq) +{ + return bch2_hash_delete(dirent_hash_desc, hash_info, + c, dir_inum, journal_seq, name); +} + +u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 inum; + + k = bch2_hash_lookup(dirent_hash_desc, hash_info, c, + dir_inum, &iter, name); + if (IS_ERR(k.k)) { + bch2_btree_iter_unlock(&iter); + return 0; + } + + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + 
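+ /*
+ * k points into the btree node - copy out the inode number before
+ * dropping our locks:
+ */
+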
bch2_btree_iter_unlock(&iter); + + return inum; +} + +int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) { + if (k.k->p.inode > dir_inum) + break; + + if (k.k->type == BCH_DIRENT) { + ret = -ENOTEMPTY; + break; + } + } + bch2_btree_iter_unlock(&iter); + + return ret; +} + +int bch2_readdir(struct bch_fs *c, struct file *file, + struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + unsigned len; + + if (!dir_emit_dots(file, ctx)) + return 0; + + pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(inode->i_ino, ctx->pos), k) { + if (k.k->type != BCH_DIRENT) + continue; + + dirent = bkey_s_c_to_dirent(k); + + pr_debug("saw %llu:%llu (%s) -> %llu", + k.k->p.inode, k.k->p.offset, + dirent.v->d_name, dirent.v->d_inum); + + if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0) + continue; + + if (k.k->p.inode > inode->i_ino) + break; + + len = bch2_dirent_name_bytes(dirent); + + pr_debug("emitting %s", dirent.v->d_name); + + /* + * XXX: dir_emit() can fault and block, while we're holding + * locks + */ + if (!dir_emit(ctx, dirent.v->d_name, len, + le64_to_cpu(dirent.v->d_inum), + dirent.v->d_type)) + break; + + ctx->pos = k.k->p.offset + 1; + } + bch2_btree_iter_unlock(&iter); + + return 0; +} diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h new file mode 100644 index 00000000..b1a30bda --- /dev/null +++ b/libbcachefs/dirent.h @@ -0,0 +1,36 @@ +#ifndef _BCACHE_DIRENT_H +#define _BCACHE_DIRENT_H + +extern const struct bkey_ops bch2_bkey_dirent_ops; + +struct qstr; +struct file; +struct dir_context; +struct bch_fs; +struct bch_hash_info; + +unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); +int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, + u8, const struct qstr *, u64, u64 *, int); +int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, + const struct qstr *, u64 *); + +enum bch_rename_mode { + BCH_RENAME, + BCH_RENAME_OVERWRITE, + BCH_RENAME_EXCHANGE, +}; + +int bch2_dirent_rename(struct bch_fs *, + struct inode *, const struct qstr *, + struct inode *, const struct qstr *, + u64 *, enum bch_rename_mode); + +u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, + const struct qstr *); + +int bch2_empty_dir(struct bch_fs *, u64); +int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); + +#endif /* _BCACHE_DIRENT_H */ + diff --git a/libbcachefs/error.c b/libbcachefs/error.c new file mode 100644 index 00000000..8babf196 --- /dev/null +++ b/libbcachefs/error.c @@ -0,0 +1,51 @@ +#include "bcachefs.h" +#include "error.h" +#include "io.h" +#include "super.h" + +void bch2_inconsistent_error(struct bch_fs *c) +{ + set_bit(BCH_FS_ERROR, &c->flags); + + switch (c->opts.errors) { + case BCH_ON_ERROR_CONTINUE: + break; + case BCH_ON_ERROR_RO: + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only"); + break; + case BCH_ON_ERROR_PANIC: + panic(bch2_fmt(c, "panic after error")); + break; + } +} + +void bch2_fatal_error(struct bch_fs *c) +{ + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only"); +} + +void bch2_nonfatal_io_error_work(struct work_struct *work) +{ + struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); + struct bch_fs *c = ca->fs; + bool dev; + + 
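+ /*
+ * If this device can legally be set read only, do just that - otherwise
+ * the whole filesystem has to go emergency read only:
+ */
+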
mutex_lock(&c->state_lock); + dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, + BCH_FORCE_IF_DEGRADED); + if (dev + ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, + BCH_FORCE_IF_DEGRADED) + : bch2_fs_emergency_read_only(c)) + bch_err(ca, + "too many IO errors, setting %s RO", + dev ? "device" : "filesystem"); + mutex_unlock(&c->state_lock); +} + +void bch2_nonfatal_io_error(struct bch_dev *ca) +{ + queue_work(system_long_wq, &ca->io_error_work); +} diff --git a/libbcachefs/error.h b/libbcachefs/error.h new file mode 100644 index 00000000..83d3a627 --- /dev/null +++ b/libbcachefs/error.h @@ -0,0 +1,237 @@ +#ifndef _BCACHE_ERROR_H +#define _BCACHE_ERROR_H + +#include <linux/printk.h> + +struct bch_dev; +struct bch_fs; + +/* + * XXX: separate out errors that indicate on disk data is inconsistent, and flag + * superblock as such + */ + +/* Error messages: */ + +/* + * Very fatal logic/inconsistency errors: these indicate that we've majorly + * screwed up at runtime, i.e. it's not likely that it was just caused by the + * data on disk being inconsistent. These BUG(): + * + * XXX: audit and convert to inconsistent() checks + */ + +#define bch2_fs_bug(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + BUG(); \ +} while (0) + +#define bch2_fs_bug_on(cond, c, ...) \ +do { \ + if (cond) \ + bch2_fs_bug(c, __VA_ARGS__); \ +} while (0) + +/* + * Inconsistency errors: The on disk data is inconsistent. If these occur during + * initial recovery, they don't indicate a bug in the running code - we walk all + * the metadata before modifying anything. If they occur at runtime, they + * indicate either a bug in the running code or (less likely) data is being + * silently corrupted under us. + * + * XXX: audit all inconsistent errors and make sure they're all recoverable, in + * BCH_ON_ERROR_CONTINUE mode + */ + +void bch2_inconsistent_error(struct bch_fs *); + +#define bch2_fs_inconsistent(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + bch2_inconsistent_error(c); \ +} while (0) + +#define bch2_fs_inconsistent_on(cond, c, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + bch2_fs_inconsistent(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Later we might want to mark only the particular device inconsistent, not the + * entire filesystem: + */ + +#define bch2_dev_inconsistent(ca, ...) \ +do { \ + bch_err(ca, __VA_ARGS__); \ + bch2_inconsistent_error((ca)->fs); \ +} while (0) + +#define bch2_dev_inconsistent_on(cond, ca, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + bch2_dev_inconsistent(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Fsck errors: inconsistency errors we detect at mount time, and should ideally + * be able to repair: + */ + +enum { + BCH_FSCK_OK = 0, + BCH_FSCK_ERRORS_NOT_FIXED = 1, + BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, + BCH_FSCK_REPAIR_IMPOSSIBLE = 3, + BCH_FSCK_UNKNOWN_VERSION = 4, +}; + +/* These macros return true if error should be fixed: */ + +/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + +#ifndef __fsck_err +#define __fsck_err(c, _can_fix, _can_ignore, _nofix_msg, msg, ...) 
\ +({ \ + bool _fix = false; \ + \ + if (_can_fix && (c)->opts.fix_errors) { \ + bch_err(c, msg ", fixing", ##__VA_ARGS__); \ + set_bit(BCH_FS_FSCK_FIXED_ERRORS, &(c)->flags); \ + _fix = true; \ + } else if (_can_ignore && \ + (c)->opts.errors == BCH_ON_ERROR_CONTINUE) { \ + bch_err(c, msg " (ignoring)", ##__VA_ARGS__); \ + } else { \ + bch_err(c, msg " ("_nofix_msg")", ##__VA_ARGS__); \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + \ + BUG_ON(!_fix && !_can_ignore); \ + _fix; \ +}) +#endif + +#define __fsck_err_on(cond, c, _can_fix, _can_ignore, _nofix_msg, ...) \ + ((cond) ? __fsck_err(c, _can_fix, _can_ignore, \ + _nofix_msg, ##__VA_ARGS__) : false) + +#define unfixable_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, false, true, "repair unimplemented", ##__VA_ARGS__) + +#define need_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, false, true, "run fsck to correct", ##__VA_ARGS__) + +#define mustfix_fsck_err(c, ...) \ + __fsck_err(c, true, false, "not fixing", ##__VA_ARGS__) + +#define mustfix_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, true, false, "not fixing", ##__VA_ARGS__) + +#define fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, true, true, "not fixing", ##__VA_ARGS__) + +/* + * Fatal errors: these don't indicate a bug, but we can't continue running in RW + * mode - pretty much just due to metadata IO errors: + */ + +void bch2_fatal_error(struct bch_fs *); + +#define bch2_fs_fatal_error(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + bch2_fatal_error(c); \ +} while (0) + +#define bch2_fs_fatal_err_on(cond, c, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + bch2_fs_fatal_error(c, __VA_ARGS__); \ + _ret; \ +}) + +#define bch2_dev_fatal_error(ca, ...) \ +do { \ + bch_err(ca, __VA_ARGS__); \ + bch2_fatal_error(c); \ +} while (0) + +#define bch2_dev_fatal_io_error(ca, fmt, ...) \ +do { \ + printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ + "fatal IO error on %s for " fmt), \ + (ca)->name, ##__VA_ARGS__); \ + bch2_fatal_error((ca)->fs); \ +} while (0) + +#define bch2_dev_fatal_io_err_on(cond, ca, ...) \ +({ \ + int _ret = !!(cond); \ + \ + if (_ret) \ + bch2_dev_fatal_io_error(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Nonfatal IO errors: either recoverable metadata IO (because we have + * replicas), or data IO - we need to log it and print out a message, but we + * don't (necessarily) want to shut down the fs: + */ + +void bch2_nonfatal_io_error_work(struct work_struct *); + +/* Does the error handling without logging a message */ +void bch2_nonfatal_io_error(struct bch_dev *); + +#if 0 +#define bch2_fs_nonfatal_io_error(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + bch2_nonfatal_io_error(c); \ +} while (0) +#endif + +/* Logs message and handles the error: */ +#define bch2_dev_nonfatal_io_error(ca, fmt, ...) \ +do { \ + printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ + "IO error on %s for " fmt), \ + (ca)->name, ##__VA_ARGS__); \ + bch2_nonfatal_io_error(ca); \ +} while (0) + +#define bch2_dev_nonfatal_io_err_on(cond, ca, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) \ + bch2_dev_nonfatal_io_error(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* kill? */ + +#define __bcache_io_error(c, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt(c, \ + "IO error: " fmt), ##__VA_ARGS__) + +#define bcache_io_error(c, bio, fmt, ...) 
\ +do { \ + __bcache_io_error(c, fmt, ##__VA_ARGS__); \ + (bio)->bi_error = -EIO; \ +} while (0) + +#endif /* _BCACHE_ERROR_H */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c new file mode 100644 index 00000000..26f9352a --- /dev/null +++ b/libbcachefs/extents.c @@ -0,0 +1,2381 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Code for managing the extent btree and dynamically updating the writeback + * dirty sector count. + */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "checksum.h" +#include "debug.h" +#include "dirent.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "journal.h" +#include "super-io.h" +#include "xattr.h" + +#include <trace/events/bcachefs.h> + +static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, + struct bkey_i *, struct bkey_i *); + +static void sort_key_next(struct btree_node_iter *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + i->k += __btree_node_offset_to_key(b, i->k)->u64s; + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +#define key_sort_cmp(l, r) \ +({ \ + int _c = bkey_cmp_packed(b, \ + __btree_node_offset_to_key(b, (l).k), \ + __btree_node_offset_to_key(b, (r).k)); \ + \ + _c ? _c > 0 : (l).k > (r).k; \ +}) + +static inline bool should_drop_next_key(struct btree_node_iter *iter, + struct btree *b) +{ + struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; + struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); + + if (bkey_whiteout(k)) + return true; + + if (iter->used < 2) + return false; + + if (iter->used > 2 && + key_sort_cmp(r[0], r[1])) + r++; + + /* + * key_sort_cmp() ensures that when keys compare equal the older key + * comes first; so if l->k compares equal to r->k then l->k is older and + * should be dropped. 
+ */ + return !bkey_cmp_packed(b, + __btree_node_offset_to_key(b, l->k), + __btree_node_offset_to_key(b, r->k)); +} + +struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, + struct btree *b, + struct btree_node_iter *iter) +{ + struct bkey_packed *out = dst->start; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, key_sort_cmp); + + while (!bch2_btree_node_iter_end(iter)) { + if (!should_drop_next_key(iter, b)) { + struct bkey_packed *k = + __btree_node_offset_to_key(b, iter->data->k); + + bkey_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + sort_key_next(iter, b, iter->data); + heap_sift(iter, 0, key_sort_cmp); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Common among btree and extent ptrs */ + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e) +{ + const struct bch_extent_ptr *ptr; + unsigned nr_ptrs = 0; + + extent_for_each_ptr(e, ptr) + nr_ptrs++; + + return nr_ptrs; +} + +unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + unsigned nr_ptrs = 0; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) + nr_ptrs += !ptr->cached; + break; + + case BCH_RESERVATION: + nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } + + return nr_ptrs; +} + +/* returns true if equal */ +static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r) +{ + return extent_crc_type(l) == extent_crc_type(r) && + !memcmp(l, r, extent_entry_bytes(to_entry(l))); +} + +/* Increment pointers after @crc by crc's offset until the next crc entry: */ +void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc) +{ + union bch_extent_entry *entry; + + extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) { + if (!extent_entry_is_ptr(entry)) + return; + + entry->ptr.offset += crc_offset(crc); + } +} + +/* + * We're writing another replica for this extent, so while we've got the data in + * memory we'll be computing a new checksum for the currently live data. 
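+ * (That is, a checksum covering exactly the sectors this key still
+ * references, rather than the whole extent as originally written.)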
+ * + * If there are other replicas we aren't moving, and they are checksummed but + * not compressed, we can modify them to point to only the data that is + * currently live (so that readers won't have to bounce) while we've got the + * checksum we need: + * + * XXX: to guard against data being corrupted while in memory, instead of + * recomputing the checksum here, it would be better in the read path to instead + * of computing the checksum of the entire extent: + * + * | extent | + * + * compute the checksums of the live and dead data separately + * | dead data || live data || dead data | + * + * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then + * use crc_live here (that we verified was correct earlier) + * + * note: doesn't work with encryption + */ +void bch2_extent_narrow_crcs(struct bkey_s_extent e) +{ + union bch_extent_crc *crc; + bool have_wide = false, have_narrow = false; + struct bch_csum csum = { 0 }; + unsigned csum_type = 0; + + extent_for_each_crc(e, crc) { + if (crc_compression_type(crc) || + bch2_csum_type_is_encryption(crc_csum_type(crc))) + continue; + + if (crc_uncompressed_size(e.k, crc) != e.k->size) { + have_wide = true; + } else { + have_narrow = true; + csum = crc_csum(crc); + csum_type = crc_csum_type(crc); + } + } + + if (!have_wide || !have_narrow) + return; + + extent_for_each_crc(e, crc) { + if (crc_compression_type(crc)) + continue; + + if (crc_uncompressed_size(e.k, crc) != e.k->size) { + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + BUG(); + case BCH_EXTENT_CRC32: + if (bch_crc_bytes[csum_type] > 4) + continue; + + bch2_extent_crc_narrow_pointers(e, crc); + crc->crc32._compressed_size = e.k->size - 1; + crc->crc32._uncompressed_size = e.k->size - 1; + crc->crc32.offset = 0; + crc->crc32.csum_type = csum_type; + crc->crc32.csum = csum.lo; + break; + case BCH_EXTENT_CRC64: + if (bch_crc_bytes[csum_type] > 10) + continue; + + bch2_extent_crc_narrow_pointers(e, crc); + crc->crc64._compressed_size = e.k->size - 1; + crc->crc64._uncompressed_size = e.k->size - 1; + crc->crc64.offset = 0; + crc->crc64.csum_type = csum_type; + crc->crc64.csum_lo = csum.lo; + crc->crc64.csum_hi = csum.hi; + break; + case BCH_EXTENT_CRC128: + if (bch_crc_bytes[csum_type] > 16) + continue; + + bch2_extent_crc_narrow_pointers(e, crc); + crc->crc128._compressed_size = e.k->size - 1; + crc->crc128._uncompressed_size = e.k->size - 1; + crc->crc128.offset = 0; + crc->crc128.csum_type = csum_type; + crc->crc128.csum = csum; + break; + } + } + } +} + +void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) +{ + union bch_extent_entry *entry = e.v->start; + union bch_extent_crc *crc, *prev = NULL; + + while (entry != extent_entry_last(e)) { + union bch_extent_entry *next = extent_entry_next(entry); + size_t crc_u64s = extent_entry_u64s(entry); + + if (!extent_entry_is_crc(entry)) + goto next; + + crc = entry_to_crc(entry); + + if (next == extent_entry_last(e)) { + /* crc entry with no pointers after it: */ + goto drop; + } + + if (extent_entry_is_crc(next)) { + /* no pointers before next crc entry: */ + goto drop; + } + + if (prev && crc_cmp(crc, prev)) { + /* identical to previous crc entry: */ + goto drop; + } + + if (!prev && + !crc_csum_type(crc) && + !crc_compression_type(crc)) { + /* null crc entry: */ + bch2_extent_crc_narrow_pointers(e, crc); + goto drop; + } + + prev = crc; +next: + entry = next; + continue; +drop: + memmove_u64s_down(crc, next, + (u64 *) extent_entry_last(e) - (u64 *) next); + e.k->u64s -= crc_u64s; + } + + 
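+ /* a non empty extent value must still have at least one pointer: */
+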
EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c));
+}
+
+static bool should_drop_ptr(const struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr)
+{
+ return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+}
+
+static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
+{
+ struct bch_extent_ptr *ptr = &e.v->start->ptr;
+ bool dropped = false;
+
+ while ((ptr = extent_ptr_next(e, ptr)))
+ if (should_drop_ptr(c, e.c, ptr)) {
+ __bch2_extent_drop_ptr(e, ptr);
+ dropped = true;
+ } else
+ ptr++;
+
+ if (dropped)
+ bch2_extent_drop_redundant_crcs(e);
+}
+
+static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
+ struct bkey_s k)
+{
+ return bch2_extent_normalize(c, k);
+}
+
+static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+{
+ switch (k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED: {
+ union bch_extent_entry *entry;
+ u64 *d = (u64 *) bkeyp_val(f, k);
+ unsigned i;
+
+ for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+ d[i] = swab64(d[i]);
+
+ for (entry = (union bch_extent_entry *) d;
+ entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+ entry = extent_entry_next(entry)) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.csum = swab32(entry->crc32.csum);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.csum.hi = swab64(entry->crc128.csum.hi);
+ entry->crc128.csum.lo = swab64(entry->crc128.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ break;
+ }
+ }
+ break;
+ }
+ }
+}
+
+static const char *extent_ptr_invalid(const struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata)
+{
+ const struct bch_extent_ptr *ptr2;
+ struct bch_dev *ca;
+
+ if (ptr->dev >= c->sb.nr_devices)
+ return "pointer to invalid device";
+
+ ca = c->devs[ptr->dev];
+ if (!ca)
+ return "pointer to invalid device";
+
+ extent_for_each_ptr(e, ptr2)
+ if (ptr != ptr2 && ptr->dev == ptr2->dev)
+ return "multiple pointers to same device";
+
+ if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
+ return "offset past end of device";
+
+ if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
+ return "offset before first bucket";
+
+ if ((ptr->offset & (ca->mi.bucket_size - 1)) +
+ size_ondisk > ca->mi.bucket_size)
+ return "spans multiple buckets";
+
+ if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
+ return "device not marked as containing data";
+
+ return NULL;
+}
+
+static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c_extent e)
+{
+ char *out = buf, *end = buf + size;
+ const union bch_extent_entry *entry;
+ const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
+ struct bch_dev *ca;
+ bool first = true;
+
+#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + + extent_for_each_entry(e, entry) { + if (!first) + p(" "); + + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = entry_to_crc(entry); + + p("crc: c_size %u size %u offset %u csum %u compress %u", + crc_compressed_size(e.k, crc), + crc_uncompressed_size(e.k, crc), + crc_offset(crc), crc_csum_type(crc), + crc_compression_type(crc)); + break; + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = c->devs[ptr->dev]; + + p("ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ca && ptr_stale(ca, ptr) + ? " stale" : ""); + break; + default: + p("(invalid extent entry %.16llx)", *((u64 *) entry)); + goto out; + } + + first = false; + } +out: + if (bkey_extent_is_cached(e.k)) + p(" cached"); +#undef p + return out - buf; +} + +/* Btree ptrs */ + +static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + if (bkey_extent_is_cached(k.k)) + return "cached"; + + if (k.k->size) + return "nonzero key size"; + + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + + switch (k.k->type) { + case BCH_EXTENT: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + const struct bch_extent_ptr *ptr; + const union bch_extent_crc *crc; + const char *reason; + + extent_for_each_entry(e, entry) + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; + + extent_for_each_ptr_crc(e, ptr, crc) { + reason = extent_ptr_invalid(c, e, ptr, + c->sb.btree_node_size, + true); + if (reason) + return reason; + } + + if (crc) + return "has crc field"; + + return NULL; + } + + default: + return "invalid value type"; + } +} + +static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) +{ + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + unsigned seq; + const char *err; + char buf[160]; + struct bucket *g; + struct bch_dev *ca; + unsigned replicas = 0; + bool bad; + + extent_for_each_ptr(e, ptr) { + ca = c->devs[ptr->dev]; + g = PTR_BUCKET(ca, ptr); + replicas++; + + err = "stale"; + if (ptr_stale(ca, ptr)) + goto err; + + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && + g->mark.data_type != BUCKET_BTREE; + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + err = "inconsistent"; + if (bad) + goto err; + } + + if (replicas < c->sb.meta_replicas_have) { + bch2_bkey_val_to_text(c, btree_node_type(b), + buf, sizeof(buf), k); + bch2_fs_bug(c, + "btree key bad (too few replicas, %u < %u): %s", + replicas, c->sb.meta_replicas_have, buf); + return; + } + + return; +err: + bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i " + "gen %i last_gc %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, + ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], + (unsigned) g->mark.counter); +} + +static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = buf + size; + const char *invalid; + +#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + + if (bkey_extent_is_data(k.k)) + out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + + invalid = bch2_btree_ptr_invalid(c, k); + if (invalid) + p(" invalid: %s", invalid); +#undef p +} + +struct extent_pick_ptr +bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b) +{ + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); + const union bch_extent_crc *crc; + const struct bch_extent_ptr *ptr; + struct extent_pick_ptr pick = { .ca = NULL }; + + extent_for_each_ptr_crc(e, ptr, crc) { + struct bch_dev *ca = c->devs[ptr->dev]; + struct btree *root = btree_node_root(c, b); + + if (bch2_fs_inconsistent_on(crc, c, + "btree node pointer with crc at btree %u level %u/%u bucket %zu", + b->btree_id, b->level, root ? root->level : -1, + PTR_BUCKET_NR(ca, ptr))) + break; + + if (bch2_dev_inconsistent_on(ptr_stale(ca, ptr), ca, + "stale btree node pointer at btree %u level %u/%u bucket %zu", + b->btree_id, b->level, root ? root->level : -1, + PTR_BUCKET_NR(ca, ptr))) + continue; + + if (ca->mi.state == BCH_MEMBER_STATE_FAILED) + continue; + + if (pick.ca && pick.ca->mi.tier < ca->mi.tier) + continue; + + if (!percpu_ref_tryget(&ca->io_ref)) + continue; + + if (pick.ca) + percpu_ref_put(&pick.ca->io_ref); + + pick.ca = ca; + pick.ptr = *ptr; + } + + return pick; +} + +const struct bkey_ops bch2_bkey_btree_ops = { + .key_invalid = bch2_btree_ptr_invalid, + .key_debugcheck = btree_ptr_debugcheck, + .val_to_text = bch2_btree_ptr_to_text, + .swab = bch2_ptr_swab, +}; + +/* Extents */ + +static bool __bch2_cut_front(struct bpos where, struct bkey_s k) +{ + u64 len = 0; + + if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) + return false; + + EBUG_ON(bkey_cmp(where, k.k->p) > 0); + + len = k.k->p.offset - where.offset; + + BUG_ON(len > k.k->size); + + /* + * Don't readjust offset if the key size is now 0, because that could + * cause offset to point to the next bucket: + */ + if (!len) + __set_bkey_deleted(k.k); + else if (bkey_extent_is_data(k.k)) { + struct bkey_s_extent e = bkey_s_to_extent(k); + struct bch_extent_ptr *ptr; + union bch_extent_crc *crc, *prev_crc = NULL; + + extent_for_each_ptr_crc(e, ptr, crc) { + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + ptr->offset += e.k->size - len; + break; + case BCH_EXTENT_CRC32: + if (prev_crc != crc) + crc->crc32.offset += e.k->size - len; + break; + case BCH_EXTENT_CRC64: + if (prev_crc != crc) + crc->crc64.offset += e.k->size - len; + break; + case BCH_EXTENT_CRC128: + if (prev_crc != crc) + crc->crc128.offset += e.k->size - len; + break; + } + prev_crc = crc; + } + } + + k.k->size = len; + + return true; +} + +bool bch2_cut_front(struct bpos where, struct bkey_i *k) +{ + return __bch2_cut_front(where, bkey_i_to_s(k)); +} + +bool bch2_cut_back(struct bpos where, struct bkey *k) +{ + u64 len = 0; + + if (bkey_cmp(where, k->p) >= 0) + return false; + + EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); + + len = where.offset - bkey_start_offset(k); + + BUG_ON(len > k->size); + + k->p = where; + k->size = len; + + if (!len) + __set_bkey_deleted(k); + + return true; +} + +/** + * bch_key_resize - adjust size of @k + * + * bkey_start_offset(k) will be preserved, modifies where the extent ends + */ +void bch2_key_resize(struct bkey *k, + unsigned new_size) +{ + k->p.offset -= k->size; + k->p.offset += new_size; + k->size = new_size; +} + +/* + * In extent_sort_fix_overlapping(), insert_fixup_extent(), + * extent_merge_inline() - we're modifying keys in place that are packed. 
To do + * that we have to unpack the key, modify the unpacked key - then this + * copies/repacks the unpacked to the original as necessary. + */ +static bool __extent_save(struct btree *b, struct btree_node_iter *iter, + struct bkey_packed *dst, struct bkey *src) +{ + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + bool ret; + + if ((dst_unpacked = packed_to_bkey(dst))) { + dst_unpacked->k = *src; + ret = true; + } else { + ret = bch2_bkey_pack_key(dst, src, f); + } + + if (ret && iter) + bch2_verify_key_order(b, iter, dst); + + return ret; +} + +static void extent_save(struct btree *b, struct btree_node_iter *iter, + struct bkey_packed *dst, struct bkey *src) +{ + BUG_ON(!__extent_save(b, iter, dst, src)); +} + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for sort_fix_overlapping() - if there are multiple keys that + * compare equal in different sets, we have to process them newest to oldest. + */ +#define extent_sort_cmp(l, r) \ +({ \ + struct bkey _ul = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (l).k)); \ + struct bkey _ur = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (r).k)); \ + \ + int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \ + _c ? _c > 0 : (l).k < (r).k; \ +}) + +static inline void extent_sort_sift(struct btree_node_iter *iter, + struct btree *b, size_t i) +{ + heap_sift(iter, i, extent_sort_cmp); +} + +static inline void extent_sort_next(struct btree_node_iter *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + sort_key_next(iter, b, i); + heap_sift(iter, i - iter->data, extent_sort_cmp); +} + +static void extent_sort_append(struct bch_fs *c, + struct btree *b, + struct btree_nr_keys *nr, + struct bkey_packed *start, + struct bkey_packed **prev, + struct bkey_packed *k) +{ + struct bkey_format *f = &b->format; + BKEY_PADDED(k) tmp; + + if (bkey_whiteout(k)) + return; + + bch2_bkey_unpack(b, &tmp.k, k); + + if (*prev && + bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) + return; + + if (*prev) { + bch2_bkey_pack(*prev, (void *) *prev, f); + + btree_keys_account_key_add(nr, 0, *prev); + *prev = bkey_next(*prev); + } else { + *prev = start; + } + + bkey_copy(*prev, &tmp.k); +} + +struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + struct bset *dst, + struct btree *b, + struct btree_node_iter *iter) +{ + struct bkey_format *f = &b->format; + struct btree_node_iter_set *_l = iter->data, *_r; + struct bkey_packed *prev = NULL, *out, *lk, *rk; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, extent_sort_cmp); + + while (!bch2_btree_node_iter_end(iter)) { + lk = __btree_node_offset_to_key(b, _l->k); + + if (iter->used == 1) { + extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_next(iter, b, _l); + continue; + } + + _r = iter->data + 1; + if (iter->used > 2 && + extent_sort_cmp(_r[0], _r[1])) + _r++; + + rk = __btree_node_offset_to_key(b, _r->k); + + l = __bkey_disassemble(b, lk, &l_unpacked); + r = __bkey_disassemble(b, rk, &r_unpacked); + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, b, &nr, dst->start, &prev, lk); + extent_sort_next(iter, b, _l); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { + extent_sort_next(iter, b, _r); + continue; + } + + /* + * overlap: keep the newer key and trim the older key so 
they + * don't overlap. comparing pointers tells us which one is + * newer, since the bsets are appended one after the other. + */ + + /* can't happen because of comparison func */ + BUG_ON(_l->k < _r->k && + !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); + + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { + sort_key_next(iter, b, _r); + } else { + __bch2_cut_front(l.k->p, r); + extent_save(b, NULL, rk, r.k); + } + + extent_sort_sift(iter, b, _r - iter->data); + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { + BKEY_PADDED(k) tmp; + + /* + * r wins, but it overlaps in the middle of l - split l: + */ + bkey_reassemble(&tmp.k, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); + + __bch2_cut_front(r.k->p, l); + extent_save(b, NULL, lk, l.k); + + extent_sort_sift(iter, b, 0); + + extent_sort_append(c, b, &nr, dst->start, &prev, + bkey_to_packed(&tmp.k)); + } else { + bch2_cut_back(bkey_start_pos(r.k), l.k); + extent_save(b, NULL, lk, l.k); + } + } + + if (prev) { + bch2_bkey_pack(prev, (void *) prev, f); + btree_keys_account_key_add(&nr, 0, prev); + out = bkey_next(prev); + } else { + out = dst->start; + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +struct extent_insert_state { + struct btree_insert *trans; + struct btree_insert_entry *insert; + struct bpos committed; + struct bch_fs_usage stats; + + /* for deleting: */ + struct bkey_i whiteout; + bool do_journal; + bool deleting; +}; + +static void bch2_add_sectors(struct extent_insert_state *s, + struct bkey_s_c k, u64 offset, s64 sectors) +{ + struct bch_fs *c = s->trans->c; + struct btree *b = s->insert->iter->nodes[0]; + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); + + if (!sectors) + return; + + bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), + &s->stats, s->trans->journal_res.seq); +} + +static void bch2_subtract_sectors(struct extent_insert_state *s, + struct bkey_s_c k, u64 offset, s64 sectors) +{ + bch2_add_sectors(s, k, offset, -sectors); +} + +/* These wrappers subtract exactly the sectors that we're removing from @k */ +static void bch2_cut_subtract_back(struct extent_insert_state *s, + struct bpos where, struct bkey_s k) +{ + bch2_subtract_sectors(s, k.s_c, where.offset, + k.k->p.offset - where.offset); + bch2_cut_back(where, k.k); +} + +static void bch2_cut_subtract_front(struct extent_insert_state *s, + struct bpos where, struct bkey_s k) +{ + bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), + where.offset - bkey_start_offset(k.k)); + __bch2_cut_front(where, k); +} + +static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k) +{ + if (k.k->size) + bch2_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); + k.k->size = 0; + __set_bkey_deleted(k.k); +} + +static bool bch2_extent_merge_inline(struct bch_fs *, + struct btree_iter *, + struct bkey_packed *, + struct bkey_packed *, + bool); + +#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) + +static enum btree_insert_ret +extent_insert_should_stop(struct extent_insert_state *s) +{ + struct btree *b = s->insert->iter->nodes[0]; + + /* + * Check if we have sufficient space in both the btree node and the + * journal reservation: + * + * Each insert checks for room in the journal entry, but we check for + * room in the btree node up-front. In the worst case, bkey_cmpxchg() + * will insert two keys, and one iteration of this room will insert one + * key, so we need room for three keys. 
+ */ + if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) + return BTREE_INSERT_BTREE_NODE_FULL; + else if (!journal_res_insert_fits(s->trans, s->insert)) + return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ + else + return BTREE_INSERT_OK; +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct bset_tree *t = bset_tree_last(b); + struct bkey_packed *where = + bch2_btree_node_iter_bset_pos(node_iter, b, t); + struct bkey_packed *prev = bch2_bkey_prev(b, t, where); + struct bkey_packed *next_live_key = where; + unsigned clobber_u64s; + + if (prev) + where = bkey_next(prev); + + while (next_live_key != btree_bkey_last(b, t) && + bkey_deleted(next_live_key)) + next_live_key = bkey_next(next_live_key); + + /* + * Everything between where and next_live_key is now deleted keys, and + * is overwritten: + */ + clobber_u64s = (u64 *) next_live_key - (u64 *) where; + + if (prev && + bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) + goto drop_deleted_keys; + + if (next_live_key != btree_bkey_last(b, t) && + bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), + next_live_key, false)) + goto drop_deleted_keys; + + bch2_bset_insert(b, node_iter, where, insert, clobber_u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, t, where, + clobber_u64s, where->u64s); + return; +drop_deleted_keys: + bch2_bset_delete(b, where, clobber_u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0); +} + +static void extent_insert_committed(struct extent_insert_state *s) +{ + struct bch_fs *c = s->trans->c; + struct btree_iter *iter = s->insert->iter; + struct bkey_i *insert = !s->deleting + ? s->insert->k + : &s->whiteout; + BKEY_PADDED(k) split; + + EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); + EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); + + if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) + return; + + if (s->deleting && !s->do_journal) { + bch2_cut_front(s->committed, insert); + goto done; + } + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + + bkey_copy(&split.k, insert); + + if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && + bkey_cmp(s->committed, insert->k.p) && + bkey_extent_is_compressed(bkey_i_to_s_c(insert))) { + /* XXX: possibly need to increase our reservation? */ + bch2_cut_subtract_back(s, s->committed, + bkey_i_to_s(&split.k)); + bch2_cut_front(s->committed, insert); + bch2_add_sectors(s, bkey_i_to_s_c(insert), + bkey_start_offset(&insert->k), + insert->k.size); + } else { + bch2_cut_back(s->committed, &split.k.k); + bch2_cut_front(s->committed, insert); + } + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, iter->nodes[iter->level], + bkey_i_to_s_c(&split.k)); + + bch2_btree_journal_key(s->trans, iter, &split.k); + + if (!s->deleting) + extent_bset_insert(c, iter, &split.k); +done: + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + + insert->k.needs_whiteout = false; + s->do_journal = false; + s->trans->did_work = true; +} + +static enum extent_insert_hook_ret +__extent_insert_advance_pos(struct extent_insert_state *s, + struct bpos next_pos, + struct bkey_s_c k) +{ + struct extent_insert_hook *hook = s->trans->hook; + enum extent_insert_hook_ret ret; +#if 0 + /* + * Currently disabled for encryption - broken with fcollapse. 
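* (Hence the #if 0 around this version check.)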
Will have + * to reenable when versions are exposed for send/receive - versions + * will have to be monotonic then: + */ + if (k.k && k.k->size && + !bversion_zero(s->insert->k->k.version) && + bversion_cmp(k.k->version, s->insert->k->k.version) > 0) { + ret = BTREE_HOOK_NO_INSERT; + } else +#endif + if (hook) + ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); + else + ret = BTREE_HOOK_DO_INSERT; + + EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); + + switch (ret) { + case BTREE_HOOK_DO_INSERT: + break; + case BTREE_HOOK_NO_INSERT: + extent_insert_committed(s); + bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k)); + + bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos); + break; + case BTREE_HOOK_RESTART_TRANS: + return ret; + } + + s->committed = next_pos; + return ret; +} + +/* + * Update iter->pos, marking how much of @insert we've processed, and call hook + * fn: + */ +static enum extent_insert_hook_ret +extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) +{ + struct btree *b = s->insert->iter->nodes[0]; + struct bpos next_pos = bpos_min(s->insert->k->k.p, + k.k ? k.k->p : b->key.k.p); + + /* hole? */ + if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { + bool have_uncommitted = bkey_cmp(s->committed, + bkey_start_pos(&s->insert->k->k)) > 0; + + switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k), + bkey_s_c_null)) { + case BTREE_HOOK_DO_INSERT: + break; + case BTREE_HOOK_NO_INSERT: + /* + * we had to split @insert and insert the committed + * part - need to bail out and recheck journal + * reservation/btree node before we advance pos past @k: + */ + if (have_uncommitted) + return BTREE_HOOK_NO_INSERT; + break; + case BTREE_HOOK_RESTART_TRANS: + return BTREE_HOOK_RESTART_TRANS; + } + } + + /* avoid redundant calls to hook fn: */ + if (!bkey_cmp(s->committed, next_pos)) + return BTREE_HOOK_DO_INSERT; + + return __extent_insert_advance_pos(s, next_pos, k); +} + +static enum btree_insert_ret +extent_insert_check_split_compressed(struct extent_insert_state *s, + struct bkey_s_c k, + enum bch_extent_overlap overlap) +{ + struct bch_fs *c = s->trans->c; + unsigned sectors; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && + (sectors = bkey_extent_is_compressed(k))) { + int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; + + if (s->trans->flags & BTREE_INSERT_NOFAIL) + flags |= BCH_DISK_RESERVATION_NOFAIL; + + switch (bch2_disk_reservation_add(c, + s->trans->disk_res, + sectors, flags)) { + case 0: + break; + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + case -EINTR: + return BTREE_INSERT_NEED_GC_LOCK; + default: + BUG(); + } + } + + return BTREE_INSERT_OK; +} + +static enum btree_insert_ret +extent_squash(struct extent_insert_state *s, struct bkey_i *insert, + struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, + enum bch_extent_overlap overlap) +{ + struct bch_fs *c = s->trans->c; + struct btree_iter *iter = s->insert->iter; + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + /* insert overlaps with start of k: */ + bch2_cut_subtract_front(s, insert->k.p, k); + BUG_ON(bkey_deleted(k.k)); + extent_save(b, node_iter, _k, k.k); + break; + + case BCH_EXTENT_OVERLAP_BACK: + /* insert overlaps with end of k: */ + bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); + BUG_ON(bkey_deleted(k.k)); + extent_save(b, node_iter, _k, k.k); + + /* + * As the auxiliary tree is indexed by the end of 
the + * key and we've just changed the end, update the + * auxiliary tree. + */ + bch2_bset_fix_invalidated_key(b, t, _k); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + _k, _k->u64s, _k->u64s); + break; + + case BCH_EXTENT_OVERLAP_ALL: { + struct bpos orig_pos = k.k->p; + + /* The insert key completely covers k, invalidate k */ + if (!bkey_whiteout(k.k)) + btree_keys_account_key_drop(&b->nr, + t - b->set, _k); + + bch2_drop_subtract(s, k); + k.k->p = bkey_start_pos(&insert->k); + if (!__extent_save(b, node_iter, _k, k.k)) { + /* + * Couldn't repack: we aren't necessarily able + * to repack if the new key is outside the range + * of the old extent, so we have to split + * @insert: + */ + k.k->p = orig_pos; + extent_save(b, node_iter, _k, k.k); + + if (extent_insert_advance_pos(s, k.s_c) == + BTREE_HOOK_RESTART_TRANS) + return BTREE_INSERT_NEED_TRAVERSE; + + extent_insert_committed(s); + /* + * We split and inserted upto at k.k->p - that + * has to coincide with iter->pos, so that we + * don't have anything more we have to insert + * until we recheck our journal reservation: + */ + EBUG_ON(bkey_cmp(s->committed, k.k->p)); + } else { + bch2_bset_fix_invalidated_key(b, t, _k); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + _k, _k->u64s, _k->u64s); + } + + break; + } + case BCH_EXTENT_OVERLAP_MIDDLE: { + BKEY_PADDED(k) split; + /* + * The insert key falls 'in the middle' of k + * The insert key splits k in 3: + * - start only in k, preserve + * - middle common section, invalidate in k + * - end only in k, preserve + * + * We update the old key to preserve the start, + * insert will be the new common section, + * we manually insert the end that we are preserving. + * + * modify k _before_ doing the insert (which will move + * what k points to) + */ + bkey_reassemble(&split.k, k.s_c); + split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); + + bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); + BUG_ON(bkey_deleted(&split.k.k)); + + bch2_cut_subtract_front(s, insert->k.p, k); + BUG_ON(bkey_deleted(k.k)); + extent_save(b, node_iter, _k, k.k); + + bch2_add_sectors(s, bkey_i_to_s_c(&split.k), + bkey_start_offset(&split.k.k), + split.k.k.size); + extent_bset_insert(c, iter, &split.k); + break; + } + } + + return BTREE_INSERT_OK; +} + +static enum btree_insert_ret +bch2_delete_fixup_extent(struct extent_insert_state *s) +{ + struct bch_fs *c = s->trans->c; + struct btree_iter *iter = s->insert->iter; + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_i *insert = s->insert->k; + enum btree_insert_ret ret = BTREE_INSERT_OK; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + + s->whiteout = *insert; + s->do_journal = false; + + while (bkey_cmp(s->committed, insert->k.p) < 0 && + (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && + (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { + struct bset_tree *t = bch2_bkey_to_bset(b, _k); + struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); + enum bch_extent_overlap overlap; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + + if (bkey_whiteout(k.k)) { + s->committed = bpos_min(insert->k.p, k.k->p); + goto next; + } + + overlap = bch2_extent_overlap(&insert->k, k.k); + + ret = extent_insert_check_split_compressed(s, k.s_c, overlap); + if (ret != BTREE_INSERT_OK) + goto stop; + + switch 
(extent_insert_advance_pos(s, k.s_c)) { + case BTREE_HOOK_DO_INSERT: + break; + case BTREE_HOOK_NO_INSERT: + continue; + case BTREE_HOOK_RESTART_TRANS: + ret = BTREE_INSERT_NEED_TRAVERSE; + goto stop; + } + + s->do_journal = true; + + if (overlap == BCH_EXTENT_OVERLAP_ALL) { + btree_keys_account_key_drop(&b->nr, + t - b->set, _k); + bch2_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); + _k->type = KEY_TYPE_DISCARD; + reserve_whiteout(b, t, _k); + } else if (k.k->needs_whiteout || + bset_written(b, bset(b, t))) { + struct bkey_i discard = *insert; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + bch2_cut_front(bkey_start_pos(k.k), &discard); + break; + case BCH_EXTENT_OVERLAP_BACK: + bch2_cut_back(k.k->p, &discard.k); + break; + default: + break; + } + + discard.k.needs_whiteout = true; + + ret = extent_squash(s, insert, t, _k, k, overlap); + BUG_ON(ret != BTREE_INSERT_OK); + + extent_bset_insert(c, iter, &discard); + } else { + ret = extent_squash(s, insert, t, _k, k, overlap); + BUG_ON(ret != BTREE_INSERT_OK); + } +next: + bch2_cut_front(s->committed, insert); + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + } + + if (bkey_cmp(s->committed, insert->k.p) < 0 && + ret == BTREE_INSERT_OK && + extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) + ret = BTREE_INSERT_NEED_TRAVERSE; +stop: + extent_insert_committed(s); + + bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res, + gc_pos_btree_node(b)); + + EBUG_ON(bkey_cmp(iter->pos, s->committed)); + EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); + + bch2_cut_front(iter->pos, insert); + + if (insert->k.size && iter->at_end_of_leaf) + ret = BTREE_INSERT_NEED_TRAVERSE; + + EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK); + + return ret; +} + +/** + * bch_extent_insert_fixup - insert a new extent and deal with overlaps + * + * this may result in not actually doing the insert, or inserting some subset + * of the insert key. For cmpxchg operations this is where that logic lives. + * + * All subsets of @insert that need to be inserted are inserted using + * bch2_btree_insert_and_journal(). If @b or @res fills up, this function + * returns false, setting @iter->pos for the prefix of @insert that actually got + * inserted. + * + * BSET INVARIANTS: this function is responsible for maintaining all the + * invariants for bsets of extents in memory. things get really hairy with 0 + * size extents + * + * within one bset: + * + * bkey_start_pos(bkey_next(k)) >= k + * or bkey_start_offset(bkey_next(k)) >= k->offset + * + * i.e. strict ordering, no overlapping extents. + * + * multiple bsets (i.e. full btree node): + * + * ∀ k, j + * k.size != 0 ∧ j.size != 0 → + * ¬ (k > bkey_start_pos(j) ∧ k < j) + * + * i.e. no two overlapping keys _of nonzero size_ + * + * We can't realistically maintain this invariant for zero size keys because of + * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j + * there may be another 0 size key between them in another bset, and it will + * thus overlap with the merged key. + * + * In addition, the end of iter->pos indicates how much has been processed. + * If the end of iter->pos is not the same as the end of insert, then + * key insertion needs to continue/be retried. 
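* E.g. if @insert covers sectors [0,128) but only [0,64) could be committed, iter->pos is left ending at offset 64 and the caller must retry with the remaining [64,128).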
+ */ +enum btree_insert_ret +bch2_insert_fixup_extent(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct bkey_packed *_k; + struct bkey unpacked; + enum btree_insert_ret ret = BTREE_INSERT_OK; + + struct extent_insert_state s = { + .trans = trans, + .insert = insert, + .committed = insert->iter->pos, + .deleting = bkey_whiteout(&insert->k->k), + }; + + EBUG_ON(iter->level); + EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size); + + if (s.deleting) + return bch2_delete_fixup_extent(&s); + + /* + * As we process overlapping extents, we advance @iter->pos both to + * signal to our caller (btree_insert_key()) how much of @insert->k has + * been inserted, and also to keep @iter->pos consistent with + * @insert->k and the node iterator that we're advancing: + */ + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), + bkey_start_offset(&insert->k->k), + insert->k->k.size); + + while (bkey_cmp(s.committed, insert->k->k.p) < 0 && + (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK && + (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { + struct bset_tree *t = bch2_bkey_to_bset(b, _k); + struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); + enum bch_extent_overlap overlap; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) + break; + + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + ret = extent_insert_check_split_compressed(&s, k.s_c, overlap); + if (ret != BTREE_INSERT_OK) + goto stop; + + if (!k.k->size) + goto squash; + + /* + * Only call advance pos & call hook for nonzero size extents: + * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer + * overlaps with @k: + */ + switch (extent_insert_advance_pos(&s, k.s_c)) { + case BTREE_HOOK_DO_INSERT: + break; + case BTREE_HOOK_NO_INSERT: + continue; + case BTREE_HOOK_RESTART_TRANS: + ret = BTREE_INSERT_NEED_TRAVERSE; + goto stop; + } + + if (k.k->size && + (k.k->needs_whiteout || bset_written(b, bset(b, t)))) + insert->k->k.needs_whiteout = true; + + if (overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(b, t, _k); + _k->needs_whiteout = false; + } +squash: + ret = extent_squash(&s, insert->k, t, _k, k, overlap); + if (ret != BTREE_INSERT_OK) + goto stop; + } + + if (bkey_cmp(s.committed, insert->k->k.p) < 0 && + ret == BTREE_INSERT_OK && + extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) + ret = BTREE_INSERT_NEED_TRAVERSE; +stop: + extent_insert_committed(&s); + /* + * Subtract any remaining sectors from @insert, if we bailed out early + * and didn't fully insert @insert: + */ + if (insert->k->k.size && + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), + bkey_start_offset(&insert->k->k), + insert->k->k.size); + + bch2_fs_usage_apply(c, &s.stats, trans->disk_res, + gc_pos_btree_node(b)); + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + EBUG_ON(bkey_cmp(iter->pos, s.committed)); + EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); + + if (insert->k->k.size && iter->at_end_of_leaf) + ret = BTREE_INSERT_NEED_TRAVERSE; + + 
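/* if any of @insert remains un-inserted, ret must indicate a retry - asserted below: */ +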
EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK); + + return ret; +} + +static const char *bch2_extent_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + return "value too big"; + + if (!k.k->size) + return "zero key size"; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + const union bch_extent_crc *crc; + const struct bch_extent_ptr *ptr; + unsigned size_ondisk = e.k->size; + const char *reason; + + extent_for_each_entry(e, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; + + if (extent_entry_is_crc(entry)) { + crc = entry_to_crc(entry); + + if (crc_offset(crc) + e.k->size > + crc_uncompressed_size(e.k, crc)) + return "checksum offset + key size > uncompressed size"; + + size_ondisk = crc_compressed_size(e.k, crc); + + if (!bch2_checksum_type_valid(c, crc_csum_type(crc))) + return "invalid checksum type"; + + if (crc_compression_type(crc) >= BCH_COMPRESSION_NR) + return "invalid compression type"; + } else { + ptr = entry_to_ptr(entry); + + reason = extent_ptr_invalid(c, e, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + } + } + + return NULL; + } + + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; + } + + default: + return "invalid value type"; + } +} + +static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, + struct bkey_s_c_extent e) +{ + const struct bch_extent_ptr *ptr; + struct bch_dev *ca; + struct bucket *g; + unsigned seq, stale; + char buf[160]; + bool bad; + unsigned ptrs_per_tier[BCH_TIER_MAX]; + unsigned replicas = 0; + + /* + * XXX: we should be doing most/all of these checks at startup time, + * where we check bch2_bkey_invalid() in btree_node_read_done() + * + * But note that we can't check for stale pointers or incorrect gc marks + * until after journal replay is done (it might be an extent that's + * going to get overwritten during replay) + */ + + memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); + + extent_for_each_ptr(e, ptr) { + ca = c->devs[ptr->dev]; + g = PTR_BUCKET(ca, ptr); + replicas++; + ptrs_per_tier[ca->mi.tier]++; + + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + continue; + + stale = 0; + + do { + struct bucket_mark mark; + + seq = read_seqcount_begin(&c->gc_pos_lock); + mark = READ_ONCE(g->mark); + + /* between mark and bucket gen */ + smp_rmb(); + + stale = ptr_stale(ca, ptr); + + bch2_fs_bug_on(stale && !ptr->cached, c, + "stale dirty pointer"); + + bch2_fs_bug_on(stale > 96, c, + "key too stale: %i", + stale); + + if (stale) + break; + + bad = (mark.data_type != BUCKET_DATA || + (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && + !mark.owned_by_allocator && + !(ptr->cached + ? 
mark.cached_sectors + : mark.dirty_sectors))); + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + if (bad) + goto bad_ptr; + } + + if (replicas > BCH_REPLICAS_MAX) { + bch2_bkey_val_to_text(c, btree_node_type(b), buf, + sizeof(buf), e.s_c); + bch2_fs_bug(c, + "extent key bad (too many replicas: %u): %s", + replicas, buf); + return; + } + + if (!bkey_extent_is_cached(e.k) && + replicas < c->sb.data_replicas_have) { + bch2_bkey_val_to_text(c, btree_node_type(b), buf, + sizeof(buf), e.s_c); + bch2_fs_bug(c, + "extent key bad (too few replicas, %u < %u): %s", + replicas, c->sb.data_replicas_have, buf); + return; + } + + return; + +bad_ptr: + bch2_bkey_val_to_text(c, btree_node_type(b), buf, + sizeof(buf), e.s_c); + bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i " + "gen %i last_gc %i mark 0x%08x", + buf, PTR_BUCKET_NR(ca, ptr), + g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, + ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], + (unsigned) g->mark.counter); + return; +} + +static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); + break; + case BCH_RESERVATION: + break; + default: + BUG(); + } +} + +static void bch2_extent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end = buf + size; + const char *invalid; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + if (bkey_extent_is_data(k.k)) + out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); + + invalid = bch2_extent_invalid(c, k); + if (invalid) + p(" invalid: %s", invalid); +#undef p +} + +static unsigned PTR_TIER(struct bch_fs *c, + const struct bch_extent_ptr *ptr) +{ + return c->devs[ptr->dev]->mi.tier; +} + +static void bch2_extent_crc_init(union bch_extent_crc *crc, + unsigned compressed_size, + unsigned uncompressed_size, + unsigned compression_type, + unsigned nonce, + struct bch_csum csum, unsigned csum_type) +{ + if (bch_crc_bytes[csum_type] <= 4 && + uncompressed_size <= CRC32_SIZE_MAX && + nonce <= CRC32_NONCE_MAX) { + crc->crc32 = (struct bch_extent_crc32) { + .type = 1 << BCH_EXTENT_ENTRY_crc32, + ._compressed_size = compressed_size - 1, + ._uncompressed_size = uncompressed_size - 1, + .offset = 0, + .compression_type = compression_type, + .csum_type = csum_type, + .csum = *((__le32 *) &csum.lo), + }; + return; + } + + if (bch_crc_bytes[csum_type] <= 10 && + uncompressed_size <= CRC64_SIZE_MAX && + nonce <= CRC64_NONCE_MAX) { + crc->crc64 = (struct bch_extent_crc64) { + .type = 1 << BCH_EXTENT_ENTRY_crc64, + ._compressed_size = compressed_size - 1, + ._uncompressed_size = uncompressed_size - 1, + .offset = 0, + .nonce = nonce, + .compression_type = compression_type, + .csum_type = csum_type, + .csum_lo = csum.lo, + .csum_hi = *((__le16 *) &csum.hi), + }; + return; + } + + if (bch_crc_bytes[csum_type] <= 16 && + uncompressed_size <= CRC128_SIZE_MAX && + nonce <= CRC128_NONCE_MAX) { + crc->crc128 = (struct bch_extent_crc128) { + .type = 1 << BCH_EXTENT_ENTRY_crc128, + ._compressed_size = compressed_size - 1, + ._uncompressed_size = uncompressed_size - 1, + .offset = 0, + .nonce = nonce, + .compression_type = compression_type, + .csum_type = csum_type, + .csum = csum, + }; + return; + } + + BUG(); +} + +void bch2_extent_crc_append(struct bkey_i_extent *e, + unsigned compressed_size, + unsigned uncompressed_size, + unsigned compression_type, + unsigned nonce, + struct 
bch_csum csum, unsigned csum_type) +{ + union bch_extent_crc *crc; + + BUG_ON(compressed_size > uncompressed_size); + BUG_ON(uncompressed_size != e->k.size); + BUG_ON(!compressed_size || !uncompressed_size); + + /* + * Look up the last crc entry, so we can check if we need to add + * another: + */ + extent_for_each_crc(extent_i_to_s(e), crc) + ; + + if (!crc && !csum_type && !compression_type) + return; + + if (crc && + crc_compressed_size(&e->k, crc) == compressed_size && + crc_uncompressed_size(&e->k, crc) == uncompressed_size && + crc_offset(crc) == 0 && + crc_nonce(crc) == nonce && + crc_csum_type(crc) == csum_type && + crc_compression_type(crc) == compression_type && + crc_csum(crc).lo == csum.lo && + crc_csum(crc).hi == csum.hi) + return; + + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), + compressed_size, + uncompressed_size, + compression_type, + nonce, csum, csum_type); + __extent_entry_push(e); +} + +/* + * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * + * Returns true if @k should be dropped entirely + * + * For existing keys, only called when btree nodes are being rewritten, not when + * they're merely being compacted/resorted in memory. + */ +bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) +{ + struct bkey_s_extent e; + + switch (k.k->type) { + case KEY_TYPE_ERROR: + return false; + + case KEY_TYPE_DELETED: + case KEY_TYPE_COOKIE: + return true; + + case KEY_TYPE_DISCARD: + return bversion_zero(k.k->version); + + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_to_extent(k); + + bch2_extent_drop_stale(c, e); + + if (!bkey_val_u64s(e.k)) { + if (bkey_extent_is_cached(e.k)) { + k.k->type = KEY_TYPE_DISCARD; + if (bversion_zero(k.k->version)) + return true; + } else { + k.k->type = KEY_TYPE_ERROR; + } + } + + return false; + case BCH_RESERVATION: + return false; + default: + BUG(); + } +} + +void bch2_extent_mark_replicas_cached(struct bch_fs *c, + struct bkey_s_extent e, + unsigned nr_cached) +{ + struct bch_extent_ptr *ptr; + bool have_higher_tier; + unsigned tier = 0; + + if (!nr_cached) + return; + + do { + have_higher_tier = false; + + extent_for_each_ptr(e, ptr) { + if (!ptr->cached && + PTR_TIER(c, ptr) == tier) { + ptr->cached = true; + nr_cached--; + if (!nr_cached) + return; + } + + if (PTR_TIER(c, ptr) > tier) + have_higher_tier = true; + } + + tier++; + } while (have_higher_tier); +} + +/* + * This picks a non-stale pointer, preferably from a device other than + * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale + * pointers to other devices, it will still pick a pointer from avoid. + * Note that it prefers lower-numbered pointers to higher-numbered pointers + * as the pointers are sorted by tier, hence preferring pointers to tier 0 + * rather than pointers to tier 1.
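* E.g. with one tier 0 and one tier 1 replica, reads are steered to the tier 0 device unless it is the @avoid device or its pointer is stale.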
+ */ +void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, + struct bch_dev *avoid, + struct extent_pick_ptr *ret) +{ + struct bkey_s_c_extent e; + const union bch_extent_crc *crc; + const struct bch_extent_ptr *ptr; + + switch (k.k->type) { + case KEY_TYPE_DELETED: + case KEY_TYPE_DISCARD: + case KEY_TYPE_COOKIE: + ret->ca = NULL; + return; + + case KEY_TYPE_ERROR: + ret->ca = ERR_PTR(-EIO); + return; + + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + ret->ca = NULL; + + extent_for_each_ptr_crc(e, ptr, crc) { + struct bch_dev *ca = c->devs[ptr->dev]; + + if (ptr->cached && ptr_stale(ca, ptr)) + continue; + + if (ca->mi.state == BCH_MEMBER_STATE_FAILED) + continue; + + if (ret->ca && + (ca == avoid || + ret->ca->mi.tier < ca->mi.tier)) + continue; + + if (!percpu_ref_tryget(&ca->io_ref)) + continue; + + if (ret->ca) + percpu_ref_put(&ret->ca->io_ref); + + *ret = (struct extent_pick_ptr) { + .crc = crc_to_128(e.k, crc), + .ptr = *ptr, + .ca = ca, + }; + } + + if (!ret->ca && !bkey_extent_is_cached(e.k)) + ret->ca = ERR_PTR(-EIO); + return; + + case BCH_RESERVATION: + ret->ca = NULL; + return; + + default: + BUG(); + } +} + +static enum merge_result bch2_extent_merge(struct bch_fs *c, + struct btree *bk, + struct bkey_i *l, struct bkey_i *r) +{ + struct bkey_s_extent el, er; + union bch_extent_entry *en_l, *en_r; + + if (key_merging_disabled(c)) + return BCH_MERGE_NOMERGE; + + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + + if (l->k.u64s != r->k.u64s || + l->k.type != r->k.type || + bversion_cmp(l->k.version, r->k.version) || + bkey_cmp(l->k.p, bkey_start_pos(&r->k))) + return BCH_MERGE_NOMERGE; + + switch (l->k.type) { + case KEY_TYPE_DELETED: + case KEY_TYPE_DISCARD: + case KEY_TYPE_ERROR: + /* These types are mergeable, and no val to check */ + break; + + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + el = bkey_i_to_s_extent(l); + er = bkey_i_to_s_extent(r); + + extent_for_each_entry(el, en_l) { + struct bch_extent_ptr *lp, *rp; + unsigned bucket_size; + + en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + + if ((extent_entry_type(en_l) != + extent_entry_type(en_r)) || + extent_entry_is_crc(en_l)) + return BCH_MERGE_NOMERGE; + + lp = &en_l->ptr; + rp = &en_r->ptr; + + if (lp->offset + el.k->size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; + + /* We don't allow extents to straddle buckets: */ + bucket_size = c->devs[lp->dev]->mi.bucket_size; + + if ((lp->offset & ~((u64) bucket_size - 1)) != + (rp->offset & ~((u64) bucket_size - 1))) + return BCH_MERGE_NOMERGE; + } + + break; + case BCH_RESERVATION: { + struct bkey_i_reservation *li = bkey_i_to_reservation(l); + struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + + if (li->v.generation != ri->v.generation || + li->v.nr_replicas != ri->v.nr_replicas) + return BCH_MERGE_NOMERGE; + break; + } + default: + return BCH_MERGE_NOMERGE; + } + + l->k.needs_whiteout |= r->k.needs_whiteout; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { + bch2_key_resize(&l->k, KEY_SIZE_MAX); + bch2_cut_front(l->k.p, r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(&l->k, l->k.size + r->k.size); + + return BCH_MERGE_MERGE; +} + +static void extent_i_save(struct btree *b, struct bkey_packed *dst, + struct bkey_i *src) +{ + struct bkey_format *f = &b->format; + struct bkey_i 
*dst_unpacked; + + BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); + + /* + * We don't want the bch2_verify_key_order() call in extent_save(), + * because we may be out of order with deleted keys that are about to be + * removed by extent_bset_insert() + */ + + if ((dst_unpacked = packed_to_bkey(dst))) + bkey_copy(dst_unpacked, src); + else + BUG_ON(!bch2_bkey_pack(dst, src, f)); +} + +static bool extent_merge_one_overlapping(struct btree_iter *iter, + struct bpos new_pos, + struct bset_tree *t, + struct bkey_packed *k, struct bkey uk, + bool check, bool could_pack) +{ + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + + BUG_ON(!bkey_deleted(k)); + + if (check) { + return !bkey_packed(k) || could_pack; + } else { + uk.p = new_pos; + extent_save(b, node_iter, k, &uk); + bch2_bset_fix_invalidated_key(b, t, k); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + k, k->u64s, k->u64s); + return true; + } +} + +static bool extent_merge_do_overlapping(struct btree_iter *iter, + struct bkey *m, bool back_merge) +{ + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct bset_tree *t; + struct bkey_packed *k; + struct bkey uk; + struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m); + bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); + bool check = true; + + /* + * @m is the new merged extent: + * + * The merge took place in the last bset; we know there can't be any 0 + * size extents overlapping with m there because if so they would have + * been between the two extents we merged. + * + * But in the other bsets, we have to check for and fix such extents: + */ +do_fixup: + for_each_bset(b, t) { + if (t == bset_tree_last(b)) + break; + + /* + * if we don't find this bset in the iterator we already got to + * the end of that bset, so start searching from the end. + */ + k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + + if (k == btree_bkey_last(b, t)) + k = bch2_bkey_prev_all(b, t, k); + if (!k) + continue; + + if (back_merge) { + /* + * Back merge: 0 size extents will be before the key + * that was just inserted (and thus the iterator + * position) - walk backwards to find them + */ + for (; + k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(m)) > 0); + k = bch2_bkey_prev_all(b, t, k)) { + if (bkey_cmp(uk.p, m->p) >= 0) + continue; + + if (!extent_merge_one_overlapping(iter, new_pos, + t, k, uk, check, could_pack)) + return false; + } + } else { + /* Front merge - walk forwards */ + for (; + k != btree_bkey_last(b, t) && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, m->p) < 0); + k = bkey_next(k)) { + if (bkey_cmp(uk.p, + bkey_start_pos(m)) <= 0) + continue; + + if (!extent_merge_one_overlapping(iter, new_pos, + t, k, uk, check, could_pack)) + return false; + } + } + } + + if (check) { + check = false; + goto do_fixup; + } + + return true; +} + +/* + * When merging an extent that we're inserting into a btree node, the new merged + * extent could overlap with an existing 0 size extent - if we don't fix that, + * it'll break the btree node iterator so this code finds those 0 size extents + * and shifts them out of the way. + * + * Also unpacks and repacks. 
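* (The save path uses extent_i_save() above, which deliberately skips bch2_verify_key_order() - the displaced 0 size keys may still be out of order at that point.)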
+ */ +static bool bch2_extent_merge_inline(struct bch_fs *c, + struct btree_iter *iter, + struct bkey_packed *l, + struct bkey_packed *r, + bool back_merge) +{ + struct btree *b = iter->nodes[0]; + struct btree_node_iter *node_iter = &iter->node_iters[0]; + const struct bkey_format *f = &b->format; + struct bset_tree *t = bset_tree_last(b); + struct bkey_packed *m; + BKEY_PADDED(k) li; + BKEY_PADDED(k) ri; + struct bkey_i *mi; + struct bkey tmp; + + /* + * We need to save copies of both l and r, because we might get a + * partial merge (which modifies both) and then fails to repack + */ + bch2_bkey_unpack(b, &li.k, l); + bch2_bkey_unpack(b, &ri.k, r); + + m = back_merge ? l : r; + mi = back_merge ? &li.k : &ri.k; + + /* l & r should be in last bset: */ + EBUG_ON(bch2_bkey_to_bset(b, m) != t); + + switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { + case BCH_MERGE_NOMERGE: + return false; + case BCH_MERGE_PARTIAL: + if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) + return false; + + if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + return false; + + extent_i_save(b, m, mi); + bch2_bset_fix_invalidated_key(b, t, m); + + /* + * Update iterator to reflect what we just inserted - otherwise, + * the iter_fix() call is going to put us _before_ the key we + * just partially merged with: + */ + if (back_merge) + bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); + + bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter, + t, m, m->u64s, m->u64s); + + if (!back_merge) + bkey_copy(packed_to_bkey(l), &li.k); + else + bkey_copy(packed_to_bkey(r), &ri.k); + return false; + case BCH_MERGE_MERGE: + if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) + return false; + + if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + return false; + + extent_i_save(b, m, &li.k); + bch2_bset_fix_invalidated_key(b, t, m); + + bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter, + t, m, m->u64s, m->u64s); + return true; + default: + BUG(); + } +} + +const struct bkey_ops bch2_bkey_extent_ops = { + .key_invalid = bch2_extent_invalid, + .key_debugcheck = bch2_extent_debugcheck, + .val_to_text = bch2_extent_to_text, + .swab = bch2_ptr_swab, + .key_normalize = bch2_ptr_normalize, + .key_merge = bch2_extent_merge, + .is_extents = true, +}; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h new file mode 100644 index 00000000..3a952484 --- /dev/null +++ b/libbcachefs/extents.h @@ -0,0 +1,581 @@ +#ifndef _BCACHE_EXTENTS_H +#define _BCACHE_EXTENTS_H + +#include "bcachefs.h" +#include "bkey.h" + +struct btree_node_iter; +struct btree_insert; +struct btree_insert_entry; +struct extent_insert_hook; + +struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *, + struct btree *, + struct btree_node_iter *); +struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + struct bset *, + struct btree *, + struct btree_node_iter *); + +extern const struct bkey_ops bch2_bkey_btree_ops; +extern const struct bkey_ops bch2_bkey_extent_ops; + +struct bch_fs; +struct journal_res; + +struct extent_pick_ptr { + struct bch_extent_crc128 crc; + struct bch_extent_ptr ptr; + struct bch_dev *ca; +}; + +struct extent_pick_ptr +bch2_btree_pick_ptr(struct bch_fs *, const struct btree *); + +void bch2_extent_pick_ptr_avoiding(struct bch_fs *, struct bkey_s_c, + struct bch_dev *, struct extent_pick_ptr *); + +static inline void +bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, + struct extent_pick_ptr *ret) +{ + bch2_extent_pick_ptr_avoiding(c, k, 
NULL, ret); +} + +enum btree_insert_ret +bch2_insert_fixup_extent(struct btree_insert *, + struct btree_insert_entry *); + +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_extent_mark_replicas_cached(struct bch_fs *, + struct bkey_s_extent, unsigned); + +unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); +unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); + +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + switch (k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_allocation(const struct bkey *k) +{ + switch (k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + case BCH_RESERVATION: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_cached(const struct bkey *k) +{ + return k->type == BCH_EXTENT_CACHED; +} + +static inline void bkey_extent_set_cached(struct bkey *k, bool cached) +{ + EBUG_ON(k->type != BCH_EXTENT && + k->type != BCH_EXTENT_CACHED); + + k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT; +} + +static inline unsigned +__extent_entry_type(const union bch_extent_entry *e) +{ + return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; +} + +static inline enum bch_extent_entry_type +extent_entry_type(const union bch_extent_entry *e) +{ + int ret = __ffs(e->type); + + EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); + + return ret; +} + +static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) +{ + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + return sizeof(struct bch_extent_crc32); + case BCH_EXTENT_ENTRY_crc64: + return sizeof(struct bch_extent_crc64); + case BCH_EXTENT_ENTRY_crc128: + return sizeof(struct bch_extent_crc128); + case BCH_EXTENT_ENTRY_ptr: + return sizeof(struct bch_extent_ptr); + default: + BUG(); + } +} + +static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) +{ + return extent_entry_bytes(entry) / sizeof(u64); +} + +static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} + +static inline bool extent_entry_is_crc(const union bch_extent_entry *e) +{ + return !extent_entry_is_ptr(e); +} + +union bch_extent_crc { + u8 type; + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; +}; + +/* downcast, preserves const */ +#define to_entry(_entry) \ +({ \ + BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ + !type_is(_entry, struct bch_extent_ptr *)); \ + \ + __builtin_choose_expr( \ + (type_is_exact(_entry, const union bch_extent_crc *) || \ + type_is_exact(_entry, const struct bch_extent_ptr *)), \ + (const union bch_extent_entry *) (_entry), \ + (union bch_extent_entry *) (_entry)); \ +}) + +#define __entry_to_crc(_entry) \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const union bch_extent_crc *) (_entry), \ + (union bch_extent_crc *) (_entry)) + +#define entry_to_crc(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ + \ + __entry_to_crc(_entry); \ +}) + +#define entry_to_ptr(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ + \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const struct bch_extent_ptr *) (_entry), \ + (struct bch_extent_ptr *) (_entry)); \ +}) + +enum bch_extent_crc_type { + BCH_EXTENT_CRC_NONE, + BCH_EXTENT_CRC32, + BCH_EXTENT_CRC64, + BCH_EXTENT_CRC128, +}; + 
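/* Editor's illustration - not part of this commit: the three crc entry sizes above exist so that the smallest entry which can hold a given checksum, size and nonce is used; bch2_extent_crc_init() in extents.c implements the selection. Below is a standalone user-space sketch of that cascade, with made-up EX_* limits standing in for CRC32_SIZE_MAX and friends: */

#include <stdio.h>

enum crc_variant { CRC_ENTRY_32, CRC_ENTRY_64, CRC_ENTRY_128 };

/* illustrative limits only - the real ones derive from the on-disk bitfield widths */
#define EX_CRC32_SIZE_MAX	(1U << 7)
#define EX_CRC64_SIZE_MAX	(1U << 9)
#define EX_CRC64_NONCE_MAX	((1U << 10) - 1)

static enum crc_variant pick_crc_variant(unsigned csum_bytes,
					 unsigned uncompressed_size,
					 unsigned nonce)
{
	/* same idea as bch2_extent_crc_init(): smallest entry that fits wins */
	if (csum_bytes <= 4 && uncompressed_size <= EX_CRC32_SIZE_MAX && !nonce)
		return CRC_ENTRY_32;
	if (csum_bytes <= 10 && uncompressed_size <= EX_CRC64_SIZE_MAX &&
	    nonce <= EX_CRC64_NONCE_MAX)
		return CRC_ENTRY_64;
	return CRC_ENTRY_128;
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_crc_variant(4, 64, 0),	/* 32 bit csum -> CRC_ENTRY_32 */
	       pick_crc_variant(8, 64, 0),	/* 64 bit csum -> CRC_ENTRY_64 */
	       pick_crc_variant(16, 64, 99));	/* 128 bit csum + nonce -> CRC_ENTRY_128 */
	return 0;
}

/* End of editor's illustration. */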
+static inline enum bch_extent_crc_type +__extent_crc_type(const union bch_extent_crc *crc) +{ + if (!crc) + return BCH_EXTENT_CRC_NONE; + + switch (extent_entry_type(to_entry(crc))) { + case BCH_EXTENT_ENTRY_crc32: + return BCH_EXTENT_CRC32; + case BCH_EXTENT_ENTRY_crc64: + return BCH_EXTENT_CRC64; + case BCH_EXTENT_ENTRY_crc128: + return BCH_EXTENT_CRC128; + default: + BUG(); + } +} + +#define extent_crc_type(_crc) \ +({ \ + BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \ + !type_is(_crc, struct bch_extent_crc64 *) && \ + !type_is(_crc, struct bch_extent_crc128 *) && \ + !type_is(_crc, union bch_extent_crc *)); \ + \ + type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \ + : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \ + : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \ + : __extent_crc_type((union bch_extent_crc *) _crc); \ +}) + +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + +#define extent_entry_last(_e) \ + vstruct_idx((_e).v, bkey_val_u64s((_e).k)) + +/* Iterate over all entries: */ + +#define extent_for_each_entry_from(_e, _entry, _start) \ + for ((_entry) = _start; \ + (_entry) < extent_entry_last(_e); \ + (_entry) = extent_entry_next(_entry)) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) + +/* Iterate over crcs only: */ + +#define extent_crc_next(_e, _p) \ +({ \ + typeof(&(_e).v->start[0]) _entry = _p; \ + \ + while ((_entry) < extent_entry_last(_e) && \ + !extent_entry_is_crc(_entry)) \ + (_entry) = extent_entry_next(_entry); \ + \ + entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \ +}) + +#define extent_for_each_crc(_e, _crc) \ + for ((_crc) = extent_crc_next(_e, (_e).v->start); \ + (_crc); \ + (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) + +/* Iterate over pointers, with crcs: */ + +#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \ +({ \ + __label__ out; \ + typeof(&(_e).v->start[0]) _entry; \ + \ + extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ + if (extent_entry_is_crc(_entry)) { \ + (_crc) = entry_to_crc(_entry); \ + } else { \ + _ptr = entry_to_ptr(_entry); \ + if (_filter) \ + goto out; \ + } \ + \ + _ptr = NULL; \ +out: \ + _ptr; \ +}) + +#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \ + for ((_crc) = NULL, \ + (_ptr) = &(_e).v->start->ptr; \ + ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\ + (_ptr)++) + +#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ + extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true) + +/* Iterate over pointers only, and from a given position: */ + +#define extent_ptr_next_filter(_e, _ptr, _filter) \ +({ \ + typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \ + \ + extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \ +}) + +#define extent_ptr_next(_e, _ptr) \ + extent_ptr_next_filter(_e, _ptr, true) + +#define extent_for_each_ptr_filter(_e, _ptr, _filter) \ + for ((_ptr) = &(_e).v->start->ptr; \ + ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \ + (_ptr)++) + +#define extent_for_each_ptr(_e, _ptr) \ + extent_for_each_ptr_filter(_e, _ptr, true) + +#define extent_ptr_prev(_e, _ptr) \ +({ \ + typeof(&(_e).v->start->ptr) _p; \ + typeof(&(_e).v->start->ptr) _prev = NULL; \ + \ + extent_for_each_ptr(_e, _p) { \ + if (_p == (_ptr)) \ + break; \ + _prev = _p; \ + } \ + \ + _prev; \ +}) + +/* + * Use this when you'll be dropping pointers as you iterate. 
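* (Each backwards step rescans the pointer list from the start via extent_ptr_prev().)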
Quadratic, + * unfortunately: + */ +#define extent_for_each_ptr_backwards(_e, _ptr) \ + for ((_ptr) = extent_ptr_prev(_e, NULL); \ + (_ptr); \ + (_ptr) = extent_ptr_prev(_e, _ptr)) + +void bch2_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned, + unsigned, unsigned, struct bch_csum, unsigned); + +static inline void __extent_entry_push(struct bkey_i_extent *e) +{ + union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); + + EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > + BKEY_EXTENT_VAL_U64s_MAX); + + e->k.u64s += extent_entry_u64s(entry); +} + +static inline void extent_ptr_append(struct bkey_i_extent *e, + struct bch_extent_ptr ptr) +{ + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + extent_entry_last(extent_i_to_s(e))->ptr = ptr; + __extent_entry_push(e); +} + +static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k, + const union bch_extent_crc *crc) +{ + EBUG_ON(!k->size); + + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return (struct bch_extent_crc128) { + ._compressed_size = k->size - 1, + ._uncompressed_size = k->size - 1, + }; + case BCH_EXTENT_CRC32: + return (struct bch_extent_crc128) { + .type = 1 << BCH_EXTENT_ENTRY_crc128, + ._compressed_size = crc->crc32._compressed_size, + ._uncompressed_size = crc->crc32._uncompressed_size, + .offset = crc->crc32.offset, + .csum_type = crc->crc32.csum_type, + .compression_type = crc->crc32.compression_type, + .csum.lo = crc->crc32.csum, + }; + case BCH_EXTENT_CRC64: + return (struct bch_extent_crc128) { + .type = 1 << BCH_EXTENT_ENTRY_crc128, + ._compressed_size = crc->crc64._compressed_size, + ._uncompressed_size = crc->crc64._uncompressed_size, + .offset = crc->crc64.offset, + .nonce = crc->crc64.nonce, + .csum_type = crc->crc64.csum_type, + .compression_type = crc->crc64.compression_type, + .csum.lo = crc->crc64.csum_lo, + .csum.hi = crc->crc64.csum_hi, + }; + case BCH_EXTENT_CRC128: + return crc->crc128; + default: + BUG(); + } +} + +#define crc_compressed_size(_k, _crc) \ +({ \ + unsigned _size = 0; \ + \ + switch (extent_crc_type(_crc)) { \ + case BCH_EXTENT_CRC_NONE: \ + _size = ((const struct bkey *) (_k))->size; \ + break; \ + case BCH_EXTENT_CRC32: \ + _size = ((struct bch_extent_crc32 *) _crc) \ + ->_compressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC64: \ + _size = ((struct bch_extent_crc64 *) _crc) \ + ->_compressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC128: \ + _size = ((struct bch_extent_crc128 *) _crc) \ + ->_compressed_size + 1; \ + break; \ + } \ + _size; \ +}) + +#define crc_uncompressed_size(_k, _crc) \ +({ \ + unsigned _size = 0; \ + \ + switch (extent_crc_type(_crc)) { \ + case BCH_EXTENT_CRC_NONE: \ + _size = ((const struct bkey *) (_k))->size; \ + break; \ + case BCH_EXTENT_CRC32: \ + _size = ((struct bch_extent_crc32 *) _crc) \ + ->_uncompressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC64: \ + _size = ((struct bch_extent_crc64 *) _crc) \ + ->_uncompressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC128: \ + _size = ((struct bch_extent_crc128 *) _crc) \ + ->_uncompressed_size + 1; \ + break; \ + } \ + _size; \ +}) + +static inline unsigned crc_offset(const union bch_extent_crc *crc) +{ + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return 0; + case BCH_EXTENT_CRC32: + return crc->crc32.offset; + case BCH_EXTENT_CRC64: + return crc->crc64.offset; + case BCH_EXTENT_CRC128: + return crc->crc128.offset; + default: + BUG(); + } +} + +static inline unsigned crc_nonce(const union bch_extent_crc *crc) +{ + switch 
(extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + case BCH_EXTENT_CRC32: + return 0; + case BCH_EXTENT_CRC64: + return crc->crc64.nonce; + case BCH_EXTENT_CRC128: + return crc->crc128.nonce; + default: + BUG(); + } +} + +static inline unsigned crc_csum_type(const union bch_extent_crc *crc) +{ + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return 0; + case BCH_EXTENT_CRC32: + return crc->crc32.csum_type; + case BCH_EXTENT_CRC64: + return crc->crc64.csum_type; + case BCH_EXTENT_CRC128: + return crc->crc128.csum_type; + default: + BUG(); + } +} + +static inline unsigned crc_compression_type(const union bch_extent_crc *crc) +{ + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return 0; + case BCH_EXTENT_CRC32: + return crc->crc32.compression_type; + case BCH_EXTENT_CRC64: + return crc->crc64.compression_type; + case BCH_EXTENT_CRC128: + return crc->crc128.compression_type; + default: + BUG(); + } +} + +static inline struct bch_csum crc_csum(const union bch_extent_crc *crc) +{ + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return (struct bch_csum) { 0 }; + case BCH_EXTENT_CRC32: + return (struct bch_csum) { .lo = crc->crc32.csum }; + case BCH_EXTENT_CRC64: + return (struct bch_csum) { + .lo = crc->crc64.csum_lo, + .hi = crc->crc64.csum_hi, + }; + case BCH_EXTENT_CRC128: + return crc->crc128.csum; + default: + BUG(); + } +} + +static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + const union bch_extent_crc *crc; + unsigned ret = 0; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr_crc(e, ptr, crc) + if (!ptr->cached && + crc_compression_type(crc) != BCH_COMPRESSION_NONE && + crc_compressed_size(e.k, crc) < k.k->size) + ret = max_t(unsigned, ret, + crc_compressed_size(e.k, crc)); + } + + return ret; +} + +static inline unsigned extent_current_nonce(struct bkey_s_c_extent e) +{ + const union bch_extent_crc *crc; + + extent_for_each_crc(e, crc) + if (bch2_csum_type_is_encryption(crc_csum_type(crc))) + return crc_offset(crc) + crc_nonce(crc); + + return 0; +} + +void bch2_extent_narrow_crcs(struct bkey_s_extent); +void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); + +/* Doesn't clean up redundant crcs */ +static inline void __bch2_extent_drop_ptr(struct bkey_s_extent e, + struct bch_extent_ptr *ptr) +{ + EBUG_ON(ptr < &e.v->start->ptr || + ptr >= &extent_entry_last(e)->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + memmove_u64s_down(ptr, ptr + 1, + (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); + e.k->u64s -= sizeof(*ptr) / sizeof(u64); +} + +static inline void bch2_extent_drop_ptr(struct bkey_s_extent e, + struct bch_extent_ptr *ptr) +{ + __bch2_extent_drop_ptr(e, ptr); + bch2_extent_drop_redundant_crcs(e); +} + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent, unsigned); + +bool bch2_cut_front(struct bpos, struct bkey_i *); +bool bch2_cut_back(struct bpos, struct bkey *); +void bch2_key_resize(struct bkey *, unsigned); + +#endif /* _BCACHE_EXTENTS_H */ diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h new file mode 100644 index 00000000..13d54e5e --- /dev/null +++ b/libbcachefs/eytzinger.h @@ -0,0 +1,196 @@ +#ifndef _EYTZINGER_H +#define _EYTZINGER_H + +#include <linux/bitops.h> +#include <linux/log2.h> + +#include "util.h" + +/* + * Traversal for trees in eytzinger layout - a full binary tree laid out in an + array + + * 
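(Children of node j live at 2j and 2j+1 with the root at 1, so traversal needs no child pointers - see eytzinger_child() below.)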
We used one based indexing, not zero based: with one based indexing, each + * level of the tree starts at a power of two - leading to better alignment - + * and it's what you want for implementing next/prev and to/from inorder. + * + * To/from inorder also uses 1 based indexing. + * + * Size parameter is treated as if we were using 0 based indexing, however: + * valid nodes, and inorder indices, are in the range [1..size) + */ + +static inline unsigned eytzinger_child(unsigned j, unsigned child) +{ + EBUG_ON(child > 1); + + return (j << 1) + child; +} + +static inline unsigned eytzinger_left_child(unsigned j) +{ + return eytzinger_child(j, 0); +} + +static inline unsigned eytzinger_right_child(unsigned j) +{ + return eytzinger_child(j, 1); +} + +static inline unsigned eytzinger_first(unsigned size) +{ + return rounddown_pow_of_two(size - 1); +} + +static inline unsigned eytzinger_last(unsigned size) +{ + return rounddown_pow_of_two(size) - 1; +} + +/* + * eytzinger_next() and eytzinger_prev() have the nice properties that + * + * eytzinger_next(0) == eytzinger_first()) + * eytzinger_prev(0) == eytzinger_last()) + * + * eytzinger_prev(eytzinger_first()) == 0 + * eytzinger_next(eytzinger_last()) == 0 + */ + +static inline unsigned eytzinger_next(unsigned j, unsigned size) +{ + EBUG_ON(j >= size); + + if (eytzinger_right_child(j) < size) { + j = eytzinger_right_child(j); + + j <<= __fls(size) - __fls(j); + j >>= j >= size; + } else { + j >>= ffz(j) + 1; + } + + return j; +} + +static inline unsigned eytzinger_prev(unsigned j, unsigned size) +{ + EBUG_ON(j >= size); + + if (eytzinger_left_child(j) < size) { + j = eytzinger_left_child(j); + + j <<= __fls(size) - __fls(j); + j -= 1; + j >>= j >= size; + } else { + j >>= __ffs(j) + 1; + } + + return j; +} + +static inline unsigned eytzinger_extra(unsigned size) +{ + return (size - rounddown_pow_of_two(size - 1)) << 1; +} + +static inline unsigned __eytzinger_to_inorder(unsigned j, unsigned size, + unsigned extra) +{ + unsigned b = __fls(j); + unsigned shift = __fls(size - 1) - b; + int s; + + EBUG_ON(!j || j >= size); + + j ^= 1U << b; + j <<= 1; + j |= 1; + j <<= shift; + + /* + * sign bit trick: + * + * if (j > extra) + * j -= (j - extra) >> 1; + */ + s = extra - j; + j += (s >> 1) & (s >> 31); + + return j; +} + +static inline unsigned __inorder_to_eytzinger(unsigned j, unsigned size, + unsigned extra) +{ + unsigned shift; + int s; + + EBUG_ON(!j || j >= size); + + /* + * sign bit trick: + * + * if (j > extra) + * j += j - extra; + */ + s = extra - j; + j -= s & (s >> 31); + + shift = __ffs(j); + + j >>= shift + 1; + j |= 1U << (__fls(size - 1) - shift); + + return j; +} + +static inline unsigned eytzinger_to_inorder(unsigned j, unsigned size) +{ + return __eytzinger_to_inorder(j, size, eytzinger_extra(size)); +} + +static inline unsigned inorder_to_eytzinger(unsigned j, unsigned size) +{ + return __inorder_to_eytzinger(j, size, eytzinger_extra(size)); +} + +#define eytzinger_for_each(_i, _size) \ + for ((_i) = eytzinger_first((_size)); \ + (_i) != 0; \ + (_i) = eytzinger_next((_i), (_size))) + +#if 0 +void eytzinger_test(void) +{ + unsigned i, j, size; + + for (size = 2; + size < 65536000; + size++) { + if (!(size % 4096)) + printk(KERN_INFO "tree size %u\n", size); + + assert(eytzinger_prev(0, size) == eytzinger_last(size)); + assert(eytzinger_next(0, size) == eytzinger_first(size)); + + assert(eytzinger_prev(eytzinger_first(size), size) == 0); + assert(eytzinger_next(eytzinger_last(size), size) == 0); + + eytzinger_for_each(j, size) { + 
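/* note: as written this compiled-out test never advances i, and from_inorder()/to_inorder() don't match the helpers defined above - it would need fixing before being enabled: */ +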
 + i = eytzinger_to_inorder(j, size); + + assert(inorder_to_eytzinger(i, size) == j); + assert(eytzinger_to_inorder(j, size) == i); + + if (j != eytzinger_last(size)) { + unsigned next = eytzinger_next(j, size); + + assert(eytzinger_prev(next, size) == j); + } + } + } + +} +#endif + +#endif /* _EYTZINGER_H */ diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h new file mode 100644 index 00000000..2908ca23 --- /dev/null +++ b/libbcachefs/fifo.h @@ -0,0 +1,123 @@ +#ifndef _BCACHE_FIFO_H +#define _BCACHE_FIFO_H + +#define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ + type *data; \ + } name + +#define init_fifo(fifo, _size, _gfp) \ +({ \ + bool _ret = true; \ + gfp_t gfp_flags = (_gfp); \ + \ + if (gfp_flags & GFP_KERNEL) \ + gfp_flags |= __GFP_NOWARN; \ + \ + (fifo)->size = (_size); \ + (fifo)->front = (fifo)->back = 0; \ + (fifo)->data = NULL; \ + \ + if ((fifo)->size) { \ + size_t _allocated_size, _bytes; \ + \ + _allocated_size = roundup_pow_of_two((fifo)->size); \ + _bytes = _allocated_size * sizeof(*(fifo)->data); \ + \ + (fifo)->mask = _allocated_size - 1; \ + \ + if (_bytes < KMALLOC_MAX_SIZE) \ + (fifo)->data = kmalloc(_bytes, gfp_flags); \ + if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL)) \ + (fifo)->data = vmalloc(_bytes); \ + if ((!(fifo)->data)) \ + _ret = false; \ + } \ + _ret; \ +}) + +#define free_fifo(fifo) \ +do { \ + kvfree((fifo)->data); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) +#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) + +#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) +#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) + +#define fifo_push_back(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) \ + (fifo)->data[(fifo)->back++ & (fifo)->mask] = (i); \ + _r; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ + _r; \ +}) + +#define fifo_push_front(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) \ + (fifo)->data[--(fifo)->front & (fifo)->mask] = (i); \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ + _r; \ +}) + +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) +#define fifo_peek(fifo) fifo_peek_front(fifo) + +#define fifo_for_each_entry(_entry, _fifo, _iter) \ + for (_iter = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + _iter++) + +#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ + for (_iter = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + _iter++) + +#endif /* _BCACHE_FIFO_H */ + diff --git a/libbcachefs/fs-gc.c b/libbcachefs/fs-gc.c new file mode 100644 index 00000000..20f552d2
--- /dev/null +++ b/libbcachefs/fs-gc.c @@ -0,0 +1,924 @@ + +#include "bcachefs.h" +#include "btree_update.h" +#include "dirent.h" +#include "error.h" +#include "fs.h" +#include "fs-gc.h" +#include "inode.h" +#include "keylist.h" +#include "super.h" + +#include <linux/generic-radix-tree.h> + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, + struct bkey_s_c_dirent dirent) +{ + struct qstr name; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + u64 dir_inum = dirent.k->p.inode; + int ret; + char *buf; + + name.len = bch2_dirent_name_bytes(dirent); + buf = kmalloc(name.len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memcpy(buf, dirent.v->d_name, name.len); + buf[name.len] = '\0'; + name.name = buf; + + /* Unlock iter so we don't deadlock, after copying name: */ + bch2_btree_iter_unlock(iter); + + ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); +err: + kfree(buf); + return ret; +} + +static int reattach_inode(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + u64 inum) +{ + struct bch_hash_info lostfound_hash_info = + bch2_hash_info_init(c, lostfound_inode); + struct bkey_inode_buf packed; + char name_buf[20]; + struct qstr name; + int ret; + + snprintf(name_buf, sizeof(name_buf), "%llu", inum); + name = (struct qstr) QSTR(name_buf); + + lostfound_inode->i_nlink++; + + bch2_inode_pack(&packed, lostfound_inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, NULL, 0); + if (ret) + return ret; + + return bch2_dirent_create(c, lostfound_inode->inum, + &lostfound_hash_info, + DT_DIR, &name, inum, NULL, 0); +} + +struct inode_walker { + bool first_this_inode; + bool have_inode; + u64 cur_inum; + struct bch_inode_unpacked inode; +}; + +static struct inode_walker inode_walker_init(void) +{ + return (struct inode_walker) { + .cur_inum = -1, + .have_inode = false, + }; +} + +static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) +{ + w->first_this_inode = inum != w->cur_inum; + w->cur_inum = inum; + + if (w->first_this_inode) { + int ret = bch2_inode_find_by_inum(c, inum, &w->inode); + + if (ret && ret != -ENOENT) + return ret; + + w->have_inode = !ret; + } + + return 0; +} + +/* + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size and i_sectors are consistent + */ +noinline_for_stack +static int check_extents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct btree_iter iter; + struct bkey_s_c k; + u64 i_sectors; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(BCACHE_ROOT_INO, 0), k) { + if (k.k->type == KEY_TYPE_DISCARD) + continue; + + ret = walk_inode(c, &w, k.k->p.inode); + if (ret) + break; + + unfixable_fsck_err_on(!w.have_inode, c, + "extent type %u for missing inode %llu", + k.k->type, k.k->p.inode); + + unfixable_fsck_err_on(w.first_this_inode && w.have_inode && + w.inode.i_sectors != + (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)), + c, "i_sectors wrong: got %llu, should be %llu", + w.inode.i_sectors, i_sectors); + + unfixable_fsck_err_on(w.have_inode && + !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c, + "extent type %u for non regular file, inode %llu mode %o", + k.k->type, k.k->p.inode, w.inode.i_mode); + + unfixable_fsck_err_on(k.k->type != 
BCH_RESERVATION && + k.k->p.offset > round_up(w.inode.i_size, PAGE_SIZE) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size); + } +fsck_err: + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * Walk dirents: verify that they all have a corresponding S_ISDIR inode, + * validate d_type + */ +noinline_for_stack +static int check_dirents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(BCACHE_ROOT_INO, 0), k) { + struct bkey_s_c_dirent d; + struct bch_inode_unpacked target; + bool have_target; + u64 d_inum; + + ret = walk_inode(c, &w, k.k->p.inode); + if (ret) + break; + + unfixable_fsck_err_on(!w.have_inode, c, + "dirent in nonexisting directory %llu", + k.k->p.inode); + + unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c, + "dirent in non directory inode %llu, type %u", + k.k->p.inode, mode_to_type(w.inode.i_mode)); + + if (k.k->type != BCH_DIRENT) + continue; + + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + + if (fsck_err_on(d_inum == d.k->p.inode, c, + "dirent points to own directory")) { + ret = remove_dirent(c, &iter, d); + if (ret) + goto err; + continue; + } + + ret = bch2_inode_find_by_inum(c, d_inum, &target); + if (ret && ret != -ENOENT) + break; + + have_target = !ret; + ret = 0; + + if (fsck_err_on(!have_target, c, + "dirent points to missing inode %llu, type %u filename %s", + d_inum, d.v->d_type, d.v->d_name)) { + ret = remove_dirent(c, &iter, d); + if (ret) + goto err; + continue; + } + + if (fsck_err_on(have_target && + d.v->d_type != + mode_to_type(le16_to_cpu(target.i_mode)), c, + "incorrect d_type: got %u should be %u, filename %s", + d.v->d_type, + mode_to_type(le16_to_cpu(target.i_mode)), + d.v->d_name)) { + struct bkey_i_dirent *n; + + n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto err; + } + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = mode_to_type(le16_to_cpu(target.i_mode)); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &n->k_i)); + kfree(n); + if (ret) + goto err; + + } + } +err: +fsck_err: + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * Walk xattrs: verify that they all have a corresponding inode + */ +noinline_for_stack +static int check_xattrs(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, + POS(BCACHE_ROOT_INO, 0), k) { + ret = walk_inode(c, &w, k.k->p.inode); + if (ret) + break; + + unfixable_fsck_err_on(!w.have_inode, c, + "xattr for missing inode %llu", + k.k->p.inode); + } +fsck_err: + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* Get root directory, create if it doesn't exist: */ +static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) +{ + struct bkey_inode_buf packed; + int ret; + + ret = bch2_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode); + if (ret && ret != -ENOENT) + return ret; + + if (fsck_err_on(ret, c, "root directory missing")) + goto create_root; + + if (fsck_err_on(!S_ISDIR(root_inode->i_mode), c, + "root inode not a directory")) + goto create_root; + + return 0; +fsck_err: + return ret; +create_root: + bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); + root_inode->inum = BCACHE_ROOT_INO; + + 
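/* pack the newly initialized root inode and write it to the inodes btree: */ +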
bch2_inode_pack(&packed, root_inode); + + return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, NULL, 0); +} + +/* Get lost+found, create if it doesn't exist: */ +static int check_lostfound(struct bch_fs *c, + struct bch_inode_unpacked *root_inode, + struct bch_inode_unpacked *lostfound_inode) +{ + struct qstr lostfound = QSTR("lost+found"); + struct bch_hash_info root_hash_info = + bch2_hash_info_init(c, root_inode); + struct bkey_inode_buf packed; + u64 inum; + int ret; + + inum = bch2_dirent_lookup(c, BCACHE_ROOT_INO, &root_hash_info, + &lostfound); + if (!inum) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + + ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); + if (ret && ret != -ENOENT) + return ret; + + if (fsck_err_on(ret, c, "lost+found missing")) + goto create_lostfound; + + if (fsck_err_on(!S_ISDIR(lostfound_inode->i_mode), c, + "lost+found inode not a directory")) + goto create_lostfound; + + return 0; +fsck_err: + return ret; +create_lostfound: + root_inode->i_nlink++; + + bch2_inode_pack(&packed, root_inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, NULL, 0); + if (ret) + return ret; + + bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); + bch2_inode_pack(&packed, lostfound_inode); + + ret = bch2_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (ret) + return ret; + + lostfound_inode->inum = packed.inode.k.p.inode; + + ret = bch2_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR, + &lostfound, lostfound_inode->inum, NULL, 0); + if (ret) + return ret; + + return 0; +} + +struct inode_bitmap { + unsigned long *bits; + size_t size; +}; + +static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) +{ + return nr < b->size ? 
test_bit(nr, b->bits) : false; +} + +static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) +{ + if (nr >= b->size) { + size_t new_size = max(max(PAGE_SIZE * 8, + b->size * 2), + nr + 1); + void *n; + + new_size = roundup_pow_of_two(new_size); + n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); + if (!n) + return -ENOMEM; + + b->bits = n; + b->size = new_size; + } + + __set_bit(nr, b->bits); + return 0; +} + +struct pathbuf { + size_t nr; + size_t size; + + struct pathbuf_entry { + u64 inum; + u64 offset; + } *entries; +}; + +static int path_down(struct pathbuf *p, u64 inum) +{ + if (p->nr == p->size) { + size_t new_size = max(256UL, p->size * 2); + void *n = krealloc(p->entries, + new_size * sizeof(p->entries[0]), + GFP_KERNEL); + if (!n) + return -ENOMEM; + + p->entries = n; + p->size = new_size; + }; + + p->entries[p->nr++] = (struct pathbuf_entry) { + .inum = inum, + .offset = 0, + }; + return 0; +} + +noinline_for_stack +static int check_directory_structure(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode) +{ + struct inode_bitmap dirs_done = { NULL, 0 }; + struct pathbuf path = { 0, 0, NULL }; + struct pathbuf_entry *e; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + bool had_unreachable; + u64 d_inum; + int ret = 0; + + /* DFS: */ +restart_dfs: + ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO); + if (ret) + goto err; + + ret = path_down(&path, BCACHE_ROOT_INO); + if (ret) + return ret; + + while (path.nr) { +next: + e = &path.entries[path.nr - 1]; + + if (e->offset == U64_MAX) + goto up; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(e->inum, e->offset + 1), k) { + if (k.k->p.inode != e->inum) + break; + + e->offset = k.k->p.offset; + + if (k.k->type != BCH_DIRENT) + continue; + + dirent = bkey_s_c_to_dirent(k); + + if (dirent.v->d_type != DT_DIR) + continue; + + d_inum = le64_to_cpu(dirent.v->d_inum); + + if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, + "directory with multiple hardlinks")) { + ret = remove_dirent(c, &iter, dirent); + if (ret) + goto err; + continue; + } + + ret = inode_bitmap_set(&dirs_done, d_inum); + if (ret) + goto err; + + ret = path_down(&path, d_inum); + if (ret) + goto err; + + bch2_btree_iter_unlock(&iter); + goto next; + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + goto err; +up: + path.nr--; + } + + had_unreachable = false; + + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { + if (k.k->type != BCH_INODE_FS || + !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode))) + continue; + + if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, + "unreachable directory found (inum %llu)", + k.k->p.inode)) { + bch2_btree_iter_unlock(&iter); + + ret = reattach_inode(c, lostfound_inode, k.k->p.inode); + if (ret) + goto err; + + had_unreachable = true; + } + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + goto err; + + if (had_unreachable) { + bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); + kfree(dirs_done.bits); + kfree(path.entries); + memset(&dirs_done, 0, sizeof(dirs_done)); + memset(&path, 0, sizeof(path)); + goto restart_dfs; + } + +out: + kfree(dirs_done.bits); + kfree(path.entries); + return ret; +err: +fsck_err: + ret = bch2_btree_iter_unlock(&iter) ?: ret; + goto out; +} + +struct nlink { + u32 count; + u32 dir_count; +}; + +typedef GENRADIX(struct nlink) nlink_table; + +static void inc_link(struct bch_fs *c, nlink_table *links, + u64 range_start, u64 *range_end, + u64 inum, bool dir) 
+{ + struct nlink *link; + + if (inum < range_start || inum >= *range_end) + return; + + link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); + if (!link) { + bch_verbose(c, "allocation failed during fs gc - will need another pass"); + *range_end = inum; + return; + } + + if (dir) + link->dir_count++; + else + link->count++; +} + +noinline_for_stack +static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, + u64 range_start, u64 *range_end) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 d_inum; + int ret; + + inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) { + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + + if (d.v->d_type == DT_DIR) + inc_link(c, links, range_start, range_end, + d.k->p.inode, true); + + inc_link(c, links, range_start, range_end, + d_inum, false); + + break; + } + + bch2_btree_iter_cond_resched(&iter); + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); + + return ret; +} + +s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 sectors = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) { + if (k.k->p.inode != inum) + break; + + if (bkey_extent_is_allocation(k.k)) + sectors += k.k->size; + } + + return bch2_btree_iter_unlock(&iter) ?: sectors; +} + +static int bch2_gc_do_inode(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + struct btree_iter *iter, + struct bkey_s_c_inode inode, struct nlink link) +{ + struct bch_inode_unpacked u; + int ret = 0; + u32 i_nlink, real_i_nlink; + bool do_update = false; + + ret = bch2_inode_unpack(inode, &u); + if (bch2_fs_inconsistent_on(ret, c, + "error unpacking inode %llu in fs-gc", + inode.k->p.inode)) + return ret; + + i_nlink = u.i_nlink + nlink_bias(u.i_mode); + + fsck_err_on(i_nlink < link.count, c, + "inode %llu i_link too small (%u < %u, type %i)", + inode.k->p.inode, i_nlink, + link.count, mode_to_type(u.i_mode)); + + /* These should have been caught/fixed by earlier passes: */ + if (S_ISDIR(u.i_mode)) { + need_fsck_err_on(link.count > 1, c, + "directory %llu with multiple hardlinks: %u", + inode.k->p.inode, link.count); + + real_i_nlink = link.count * 2 + link.dir_count; + } else { + need_fsck_err_on(link.dir_count, c, + "found dirents for non directory %llu", + inode.k->p.inode); + + real_i_nlink = link.count + link.dir_count; + } + + if (!link.count) { + fsck_err_on(c->sb.clean, c, + "filesystem marked clean, " + "but found orphaned inode %llu", + inode.k->p.inode); + + if (fsck_err_on(S_ISDIR(u.i_mode) && + bch2_empty_dir(c, inode.k->p.inode), c, + "non empty directory with link count 0, " + "inode nlink %u, dir links found %u", + i_nlink, link.dir_count)) { + ret = reattach_inode(c, lostfound_inode, + inode.k->p.inode); + if (ret) + return ret; + } + + bch_verbose(c, "deleting inode %llu", inode.k->p.inode); + + ret = bch2_inode_rm(c, inode.k->p.inode); + if (ret) + bch_err(c, "error in fs gc: error %i " + "while deleting inode", ret); + return ret; + } + + if (u.i_flags & BCH_INODE_I_SIZE_DIRTY) { + fsck_err_on(c->sb.clean, c, + "filesystem marked clean, " + "but inode %llu has i_size dirty", + inode.k->p.inode); + + bch_verbose(c, "truncating inode %llu", inode.k->p.inode); + + /* + * XXX: need to truncate partial blocks too here - or ideally + * 
just switch units to bytes and that issue goes away + */ + + ret = bch2_inode_truncate(c, inode.k->p.inode, + round_up(u.i_size, PAGE_SIZE) >> 9, + NULL, NULL); + if (ret) { + bch_err(c, "error in fs gc: error %i " + "truncating inode", ret); + return ret; + } + + /* + * We truncated without our normal sector accounting hook, just + * make sure we recalculate it: + */ + u.i_flags |= BCH_INODE_I_SECTORS_DIRTY; + + u.i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + do_update = true; + } + + if (u.i_flags & BCH_INODE_I_SECTORS_DIRTY) { + s64 sectors; + + fsck_err_on(c->sb.clean, c, + "filesystem marked clean, " + "but inode %llu has i_sectors dirty", + inode.k->p.inode); + + bch_verbose(c, "recounting sectors for inode %llu", + inode.k->p.inode); + + sectors = bch2_count_inode_sectors(c, inode.k->p.inode); + if (sectors < 0) { + bch_err(c, "error in fs gc: error %i " + "recounting inode sectors", + (int) sectors); + return sectors; + } + + u.i_sectors = sectors; + u.i_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + do_update = true; + } + + if (i_nlink != real_i_nlink) { + fsck_err_on(c->sb.clean, c, + "filesystem marked clean, " + "but inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + inode.k->p.inode, mode_to_type(u.i_mode), + i_nlink, real_i_nlink); + + bch_verbose(c, "setting inode %llu nlinks from %u to %u", + inode.k->p.inode, i_nlink, real_i_nlink); + u.i_nlink = real_i_nlink - nlink_bias(u.i_mode); + do_update = true; + } + + if (do_update) { + struct bkey_inode_buf p; + + bch2_inode_pack(&p, &u); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); + if (ret && ret != -EINTR) + bch_err(c, "error in fs gc: error %i " + "updating inode", ret); + } +fsck_err: + return ret; +} + +noinline_for_stack +static int bch2_gc_walk_inodes(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + nlink_table *links, + u64 range_start, u64 range_end) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct nlink *link, zero_links = { 0, 0 }; + struct genradix_iter nlinks_iter; + int ret = 0, ret2 = 0; + u64 nlinks_pos; + + bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0)); + genradix_iter_init(&nlinks_iter); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !btree_iter_err(k)) { +peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); + + if (!link && (!k.k || iter.pos.inode >= range_end)) + break; + + nlinks_pos = range_start + nlinks_iter.pos; + if (iter.pos.inode > nlinks_pos) { + /* Should have been caught by dirents pass: */ + need_fsck_err_on(link && link->count, c, + "missing inode %llu (nlink %u)", + nlinks_pos, link->count); + genradix_iter_advance(&nlinks_iter, links); + goto peek_nlinks; + } + + if (iter.pos.inode < nlinks_pos || !link) + link = &zero_links; + + if (k.k && k.k->type == BCH_INODE_FS) { + /* + * Avoid potential deadlocks with iter for + * truncate/rm/etc.: + */ + bch2_btree_iter_unlock(&iter); + + ret = bch2_gc_do_inode(c, lostfound_inode, &iter, + bkey_s_c_to_inode(k), *link); + if (ret == -EINTR) + continue; + if (ret) + break; + + if (link->count) + atomic_long_inc(&c->nr_inodes); + } else { + /* Should have been caught by dirents pass: */ + need_fsck_err_on(link->count, c, + "missing inode %llu (nlink %u)", + nlinks_pos, link->count); + } + + if (nlinks_pos == iter.pos.inode) + genradix_iter_advance(&nlinks_iter, links); + + bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_cond_resched(&iter); + } +fsck_err: + ret2 = bch2_btree_iter_unlock(&iter); + if (ret2) + 
bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2); + + return ret ?: ret2; +} + +noinline_for_stack +static int check_inode_nlinks(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode) +{ + nlink_table links; + u64 this_iter_range_start, next_iter_range_start = 0; + int ret = 0; + + genradix_init(&links); + + do { + this_iter_range_start = next_iter_range_start; + next_iter_range_start = U64_MAX; + + ret = bch2_gc_walk_dirents(c, &links, + this_iter_range_start, + &next_iter_range_start); + if (ret) + break; + + ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) + break; + + genradix_free(&links); + } while (next_iter_range_start != U64_MAX); + + genradix_free(&links); + + return ret; +} + +/* + * Checks for inconsistencies that shouldn't happen, unless we have a bug. + * Doesn't fix them yet, mainly because they haven't yet been observed: + */ +int bch2_fsck(struct bch_fs *c, bool full_fsck) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + int ret; + + ret = check_root(c, &root_inode); + if (ret) + return ret; + + ret = check_lostfound(c, &root_inode, &lostfound_inode); + if (ret) + return ret; + + if (!full_fsck) + goto check_nlinks; + + ret = check_extents(c); + if (ret) + return ret; + + ret = check_dirents(c); + if (ret) + return ret; + + ret = check_xattrs(c); + if (ret) + return ret; + + ret = check_directory_structure(c, &lostfound_inode); + if (ret) + return ret; +check_nlinks: + ret = check_inode_nlinks(c, &lostfound_inode); + if (ret) + return ret; + + return 0; +} diff --git a/libbcachefs/fs-gc.h b/libbcachefs/fs-gc.h new file mode 100644 index 00000000..4bde1bda --- /dev/null +++ b/libbcachefs/fs-gc.h @@ -0,0 +1,7 @@ +#ifndef _BCACHE_FS_GC_H +#define _BCACHE_FS_GC_H + +s64 bch2_count_inode_sectors(struct bch_fs *, u64); +int bch2_fsck(struct bch_fs *, bool); + +#endif /* _BCACHE_FS_GC_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c new file mode 100644 index 00000000..8ad192c4 --- /dev/null +++ b/libbcachefs/fs-io.c @@ -0,0 +1,2572 @@ + +#include "bcachefs.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "fs.h" +#include "fs-gc.h" +#include "fs-io.h" +#include "inode.h" +#include "journal.h" +#include "io.h" +#include "keylist.h" + +#include <linux/aio.h> +#include <linux/backing-dev.h> +#include <linux/falloc.h> +#include <linux/migrate.h> +#include <linux/mmu_context.h> +#include <linux/pagevec.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/uio.h> +#include <linux/writeback.h> +#include <trace/events/writeback.h> + +struct bio_set *bch2_writepage_bioset; +struct bio_set *bch2_dio_read_bioset; +struct bio_set *bch2_dio_write_bioset; + +/* pagecache_block must be held */ +static int write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; + + /* + * XXX: the way this is currently implemented, we can spin if a process + * is continually redirtying a specific page + */ + do { + if (!mapping->nrpages && + !mapping->nrexceptional) + return 0; + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + break; + + if (!mapping->nrpages) + return 0; + + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } while (ret == -EBUSY); + + return ret; +} + +/* i_size updates: */ + +static int inode_set_size(struct bch_inode_info *ei, + struct bch_inode_unpacked *bi, + void *p) +{ + loff_t *new_i_size 
= p; + + lockdep_assert_held(&ei->update_lock); + + bi->i_size = *new_i_size; + + if (atomic_long_read(&ei->i_size_dirty_count)) + bi->i_flags |= BCH_INODE_I_SIZE_DIRTY; + else + bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + + return 0; +} + +static int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *ei, + loff_t new_size) +{ + return __bch2_write_inode(c, ei, inode_set_size, &new_size); +} + +static inline void i_size_dirty_put(struct bch_inode_info *ei) +{ + atomic_long_dec_bug(&ei->i_size_dirty_count); +} + +static inline void i_size_dirty_get(struct bch_inode_info *ei) +{ + lockdep_assert_held(&ei->vfs_inode.i_rwsem); + + atomic_long_inc(&ei->i_size_dirty_count); +} + +/* i_sectors accounting: */ + +static enum extent_insert_hook_ret +i_sectors_hook_fn(struct extent_insert_hook *hook, + struct bpos committed_pos, + struct bpos next_pos, + struct bkey_s_c k, + const struct bkey_i *insert) +{ + struct i_sectors_hook *h = container_of(hook, + struct i_sectors_hook, hook); + s64 sectors = next_pos.offset - committed_pos.offset; + int sign = bkey_extent_is_allocation(&insert->k) - + (k.k && bkey_extent_is_allocation(k.k)); + + EBUG_ON(!(h->ei->i_flags & BCH_INODE_I_SECTORS_DIRTY)); + EBUG_ON(!atomic_long_read(&h->ei->i_sectors_dirty_count)); + + h->sectors += sectors * sign; + + return BTREE_HOOK_DO_INSERT; +} + +static int inode_set_i_sectors_dirty(struct bch_inode_info *ei, + struct bch_inode_unpacked *bi, void *p) +{ + BUG_ON(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY); + + bi->i_flags |= BCH_INODE_I_SECTORS_DIRTY; + return 0; +} + +static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei, + struct bch_inode_unpacked *bi, + void *p) +{ + BUG_ON(!(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY)); + + bi->i_sectors = atomic64_read(&ei->i_sectors); + bi->i_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + return 0; +} + +static void i_sectors_dirty_put(struct bch_inode_info *ei, + struct i_sectors_hook *h) +{ + struct inode *inode = &ei->vfs_inode; + + if (h->sectors) { + spin_lock(&inode->i_lock); + inode->i_blocks += h->sectors; + spin_unlock(&inode->i_lock); + + atomic64_add(h->sectors, &ei->i_sectors); + EBUG_ON(atomic64_read(&ei->i_sectors) < 0); + } + + EBUG_ON(atomic_long_read(&ei->i_sectors_dirty_count) <= 0); + + mutex_lock(&ei->update_lock); + + if (atomic_long_dec_and_test(&ei->i_sectors_dirty_count)) { + struct bch_fs *c = ei->vfs_inode.i_sb->s_fs_info; + int ret = __bch2_write_inode(c, ei, inode_clear_i_sectors_dirty, NULL); + + ret = ret; + } + + mutex_unlock(&ei->update_lock); +} + +static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei, + struct i_sectors_hook *h) +{ + int ret = 0; + + h->hook.fn = i_sectors_hook_fn; + h->sectors = 0; +#ifdef CONFIG_BCACHEFS_DEBUG + h->ei = ei; +#endif + + if (atomic_long_inc_not_zero(&ei->i_sectors_dirty_count)) + return 0; + + mutex_lock(&ei->update_lock); + + if (!(ei->i_flags & BCH_INODE_I_SECTORS_DIRTY)) { + struct bch_fs *c = ei->vfs_inode.i_sb->s_fs_info; + + ret = __bch2_write_inode(c, ei, inode_set_i_sectors_dirty, NULL); + } + + if (!ret) + atomic_long_inc(&ei->i_sectors_dirty_count); + + mutex_unlock(&ei->update_lock); + + return ret; +} + +struct bchfs_extent_trans_hook { + struct bchfs_write_op *op; + struct extent_insert_hook hook; + + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + + bool need_inode_update; +}; + +static enum extent_insert_hook_ret +bchfs_extent_update_hook(struct extent_insert_hook *hook, + struct bpos committed_pos, + struct bpos next_pos, + struct bkey_s_c 
k, + const struct bkey_i *insert) +{ + struct bchfs_extent_trans_hook *h = container_of(hook, + struct bchfs_extent_trans_hook, hook); + struct bch_inode_info *ei = h->op->ei; + struct inode *inode = &ei->vfs_inode; + int sign = bkey_extent_is_allocation(&insert->k) - + (k.k && bkey_extent_is_allocation(k.k)); + s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign; + u64 offset = min(next_pos.offset << 9, h->op->new_i_size); + bool do_pack = false; + + BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); + + /* XXX: ei->i_size locking */ + if (offset > ei->i_size) { + BUG_ON(ei->i_flags & BCH_INODE_I_SIZE_DIRTY); + + if (!h->need_inode_update) { + h->need_inode_update = true; + return BTREE_HOOK_RESTART_TRANS; + } + + h->inode_u.i_size = offset; + do_pack = true; + + ei->i_size = offset; + + if (h->op->is_dio) + i_size_write(inode, offset); + } + + if (sectors) { + if (!h->need_inode_update) { + h->need_inode_update = true; + return BTREE_HOOK_RESTART_TRANS; + } + + h->inode_u.i_sectors += sectors; + do_pack = true; + + atomic64_add(sectors, &ei->i_sectors); + + h->op->sectors_added += sectors; + + if (h->op->is_dio) { + spin_lock(&inode->i_lock); + inode->i_blocks += sectors; + spin_unlock(&inode->i_lock); + } + } + + if (do_pack) + bch2_inode_pack(&h->inode_p, &h->inode_u); + + return BTREE_HOOK_DO_INSERT; +} + +static int bchfs_write_index_update(struct bch_write_op *wop) +{ + struct bchfs_write_op *op = container_of(wop, + struct bchfs_write_op, op); + struct keylist *keys = &op->op.insert_keys; + struct btree_iter extent_iter, inode_iter; + struct bchfs_extent_trans_hook hook; + struct bkey_i *k = bch2_keylist_front(keys); + int ret; + + BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino); + + bch2_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k)); + bch2_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES, + POS(extent_iter.pos.inode, 0)); + + hook.op = op; + hook.hook.fn = bchfs_extent_update_hook; + hook.need_inode_update = false; + + do { + ret = bch2_btree_iter_traverse(&extent_iter); + if (ret) + goto err; + + /* XXX: ei->i_size locking */ + k = bch2_keylist_front(keys); + if (min(k->k.p.offset << 9, op->new_i_size) > op->ei->i_size) + hook.need_inode_update = true; + + if (hook.need_inode_update) { + struct bkey_s_c inode; + + if (!btree_iter_linked(&inode_iter)) + bch2_btree_iter_link(&extent_iter, &inode_iter); + + inode = bch2_btree_iter_peek_with_holes(&inode_iter); + if ((ret = btree_iter_err(inode))) + goto err; + + if (WARN_ONCE(inode.k->type != BCH_INODE_FS, + "inode %llu not found when updating", + extent_iter.pos.inode)) { + ret = -ENOENT; + break; + } + + if (WARN_ONCE(bkey_bytes(inode.k) > + sizeof(hook.inode_p), + "inode %llu too big (%zu bytes, buf %zu)", + extent_iter.pos.inode, + bkey_bytes(inode.k), + sizeof(hook.inode_p))) { + ret = -ENOENT; + break; + } + + bkey_reassemble(&hook.inode_p.inode.k_i, inode); + ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), + &hook.inode_u); + if (WARN_ONCE(ret, + "error %i unpacking inode %llu", + ret, extent_iter.pos.inode)) { + ret = -ENOENT; + break; + } + + ret = bch2_btree_insert_at(wop->c, &wop->res, + &hook.hook, op_journal_seq(wop), + BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&extent_iter, k), + BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter, + &hook.inode_p.inode.k_i, 2)); + } else { + ret = bch2_btree_insert_at(wop->c, &wop->res, + &hook.hook, op_journal_seq(wop), + BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, + 
BTREE_INSERT_ENTRY(&extent_iter, k)); + } +err: + if (ret == -EINTR) + continue; + if (ret) + break; + + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_btree_iter_unlock(&extent_iter); + bch2_btree_iter_unlock(&inode_iter); + + return ret; +} + +/* page state: */ + +/* stored in page->private: */ + +/* + * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could + * almost protect it with the page lock, except that bch2_writepage_io_done has + * to update the sector counts (and from interrupt/bottom half context). + */ +struct bch_page_state { +union { struct { + /* + * page is _fully_ written on disk, and not compressed - which means to + * write this page we don't have to reserve space (the new write will + * never take up more space on disk than what it's overwriting) + */ + unsigned allocated:1; + + /* Owns PAGE_SECTORS sized reservation: */ + unsigned reserved:1; + unsigned nr_replicas:4; + + /* + * Number of sectors on disk - for i_blocks + * Uncompressed size, not compressed size: + */ + u8 sectors; + u8 dirty_sectors; +}; + /* for cmpxchg: */ + unsigned long v; +}; +}; + +#define page_state_cmpxchg(_ptr, _new, _expr) \ +({ \ + unsigned long _v = READ_ONCE((_ptr)->v); \ + struct bch_page_state _old; \ + \ + do { \ + _old.v = _new.v = _v; \ + _expr; \ + \ + EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\ + } while (_old.v != _new.v && \ + (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \ + \ + _old; \ +})
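 + +/* + * Example usage (a sketch - cf. bch2_writepage_io_done() below): move a + * page's dirty sectors to clean, atomically: + * + * struct bch_page_state old, new; + * + * old = page_state_cmpxchg(page_state(page), new, { + * new.sectors += new.dirty_sectors; + * new.dirty_sectors = 0; + * }); + * + * _expr is re-evaluated on a fresh copy of the state if the cmpxchg fails. + */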
 + +static inline struct bch_page_state *page_state(struct page *page) +{ + struct bch_page_state *s = (void *) &page->private; + + BUILD_BUG_ON(sizeof(*s) > sizeof(page->private)); + + if (!PagePrivate(page)) + SetPagePrivate(page); + + return s; +} + +static void bch2_put_page_reservation(struct bch_fs *c, struct page *page) +{ + struct disk_reservation res = { .sectors = PAGE_SECTORS }; + struct bch_page_state s; + + s = page_state_cmpxchg(page_state(page), s, { + if (!s.reserved) + return; + s.reserved = 0; + }); + + bch2_disk_reservation_put(c, &res); +} + +static int bch2_get_page_reservation(struct bch_fs *c, struct page *page, + bool check_enospc) +{ + struct bch_page_state *s = page_state(page), new; + struct disk_reservation res; + int ret = 0; + + BUG_ON(s->allocated && s->sectors != PAGE_SECTORS); + + if (s->allocated || s->reserved) + return 0; + + ret = bch2_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + + page_state_cmpxchg(s, new, { + if (new.reserved) { + bch2_disk_reservation_put(c, &res); + return 0; + } + new.reserved = 1; + new.nr_replicas = res.nr_replicas; + }); + + return 0; +} + +static void bch2_clear_page_bits(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + struct disk_reservation res = { .sectors = PAGE_SECTORS }; + struct bch_page_state s; + + if (!PagePrivate(page)) + return; + + s = xchg(page_state(page), (struct bch_page_state) { .v = 0 }); + ClearPagePrivate(page); + + if (s.dirty_sectors) { + spin_lock(&inode->i_lock); + inode->i_blocks -= s.dirty_sectors; + spin_unlock(&inode->i_lock); + } + + if (s.reserved) + bch2_disk_reservation_put(c, &res); +} + +int bch2_set_page_dirty(struct page *page) +{ + struct bch_page_state old, new; + + old = page_state_cmpxchg(page_state(page), new, + new.dirty_sectors = PAGE_SECTORS - new.sectors; + ); + + if (old.dirty_sectors != new.dirty_sectors) { + struct inode *inode = page->mapping->host; + + spin_lock(&inode->i_lock); + inode->i_blocks += new.dirty_sectors - old.dirty_sectors; + spin_unlock(&inode->i_lock); + } + + return __set_page_dirty_nobuffers(page); +} + +/* readpages/writepages: */ + +static bool bio_can_add_page_contig(struct bio *bio, struct page *page) +{ + sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); + + return bio->bi_vcnt < bio->bi_max_vecs && + bio_end_sector(bio) == offset; +} + +static void __bio_add_page(struct bio *bio, struct page *page) +{ + bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0, + }; + + bio->bi_iter.bi_size += PAGE_SIZE; +} + +static int bio_add_page_contig(struct bio *bio, struct page *page) +{ + sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); + + BUG_ON(!bio->bi_max_vecs); + + if (!bio->bi_vcnt) + bio->bi_iter.bi_sector = offset; + else if (!bio_can_add_page_contig(bio, page)) + return -1; + + __bio_add_page(bio, page); + return 0; +} + +static void bch2_readpages_end_io(struct bio *bio) +{ + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!bio->bi_error) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +struct readpages_iter { + struct address_space *mapping; + struct list_head pages; + unsigned nr_pages; +}; + +static int readpage_add_page(struct readpages_iter *iter, struct page *page) +{ + struct bch_page_state *s = page_state(page); + int ret; + + BUG_ON(s->reserved); + s->allocated = 1; + s->sectors = 0; + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, iter->mapping, + page->index, GFP_NOFS); + put_page(page); + return ret; +} + +static inline struct page *readpage_iter_next(struct readpages_iter *iter) +{ + while (iter->nr_pages) { + struct page *page = + list_last_entry(&iter->pages, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + iter->nr_pages--; + + if (!readpage_add_page(iter, page)) + return page; + } + + return NULL; +} + +#define for_each_readpage_page(_iter, _page) \ + for (; \ + ((_page) = readpage_iter_next(&(_iter)));) + +static void bch2_mark_pages_unalloc(struct bio *bio) +{ + struct bvec_iter iter; + struct bio_vec bv; + + bio_for_each_segment(bv, bio, iter) + page_state(bv.bv_page)->allocated = 0; +} + 
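+/* + * Update the cached per-page sector counts for the portion of @k overlapping + * each page in @bio - this is what later determines whether a page is fully + * backed by allocated, uncompressed data (and so needs no new disk + * reservation to be rewritten): + */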
static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct bio_vec bv; + + bio_for_each_segment(bv, bio, iter) { + struct bch_page_state *s = page_state(bv.bv_page); + + /* + * sectors in @k from the start of this page (note that k.k->p + * is the position of the end of the extent): + */ + unsigned k_sectors = k.k->p.offset - iter.bi_sector; + + unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); + + if (!s->sectors) + s->nr_replicas = bch2_extent_nr_dirty_ptrs(k); + else + s->nr_replicas = min_t(unsigned, s->nr_replicas, + bch2_extent_nr_dirty_ptrs(k)); + + BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); + s->sectors += page_sectors; + } +} + +static void readpage_bio_extend(struct readpages_iter *iter, + struct bio *bio, u64 offset, + bool get_more) +{ + struct page *page; + pgoff_t page_offset; + int ret; + + while (bio_end_sector(bio) < offset && + bio->bi_vcnt < bio->bi_max_vecs) { + page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + + if (iter->nr_pages) { + page = list_last_entry(&iter->pages, struct page, lru); + if (page->index != page_offset) + break; + + list_del(&page->lru); + iter->nr_pages--; + } else if (get_more) { + rcu_read_lock(); + page = radix_tree_lookup(&iter->mapping->page_tree, page_offset); + rcu_read_unlock(); + + if (page && !radix_tree_exceptional_entry(page)) + break; + + page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); + if (!page) + break; + + page->index = page_offset; + ClearPageReadahead(bio->bi_io_vec[bio->bi_vcnt - 1].bv_page); + } else { + break; + } + + ret = readpage_add_page(iter, page); + if (ret) + break; + + __bio_add_page(bio, page); + } + + if (!iter->nr_pages) + SetPageReadahead(bio->bi_io_vec[bio->bi_vcnt - 1].bv_page); +} + +static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, + struct bch_read_bio *rbio, u64 inode, + struct readpages_iter *readpages_iter) +{ + struct bio *bio = &rbio->bio; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_PROMOTE| + BCH_READ_MAY_REUSE_BIO; + + while (1) { + struct extent_pick_ptr pick; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + unsigned bytes; + bool is_last; + + bch2_btree_iter_set_pos(iter, POS(inode, bio->bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_with_holes(iter); + BUG_ON(!k.k); + + if (IS_ERR(k.k)) { + int ret = bch2_btree_iter_unlock(iter); + BUG_ON(!ret); + bcache_io_error(c, bio, "btree IO error %i", ret); + bio_endio(bio); + return; + } + + bkey_reassemble(&tmp.k, k); + bch2_btree_iter_unlock(iter); + k = bkey_i_to_s_c(&tmp.k); + + bch2_extent_pick_ptr(c, k, &pick); + if (IS_ERR(pick.ca)) { + bcache_io_error(c, bio, "no device to read from"); + bio_endio(bio); + return; + } + + if (readpages_iter) + readpage_bio_extend(readpages_iter, + bio, k.k->p.offset, + pick.ca && + (pick.crc.csum_type || + pick.crc.compression_type)); + + bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - + bio->bi_iter.bi_sector) << 9; + is_last = bytes == bio->bi_iter.bi_size; + swap(bio->bi_iter.bi_size, bytes); + + if (bkey_extent_is_allocation(k.k)) + bch2_add_page_sectors(bio, k); + + if (!bkey_extent_is_allocation(k.k) || + bkey_extent_is_compressed(k)) + bch2_mark_pages_unalloc(bio); + + if (is_last) + flags |= BCH_READ_IS_LAST; + + if (pick.ca) { + PTR_BUCKET(pick.ca, &pick.ptr)->read_prio = + c->prio_clock[READ].hand; + + bch2_read_extent(c, rbio, k, &pick, flags); + flags &= ~BCH_READ_MAY_REUSE_BIO; + } else { + zero_fill_bio(bio); + + if (is_last) + bio_endio(bio); + } + + if (is_last) + return; + + swap(bio->bi_iter.bi_size, bytes); + bio_advance(bio, bytes); + } +} + 
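+/* + * The ->readpages() address_space op: the readahead entry point, called with + * a batch of not-yet-cached pages to read: + */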
+int bch2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct page *page; + struct readpages_iter readpages_iter = { + .mapping = mapping, .nr_pages = nr_pages + }; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + INIT_LIST_HEAD(&readpages_iter.pages); + list_add(&readpages_iter.pages, pages); + list_del_init(pages); + + if (current->pagecache_lock != &mapping->add_lock) + pagecache_add_get(&mapping->add_lock); + + while ((page = readpage_iter_next(&readpages_iter))) { + unsigned n = max(min_t(unsigned, readpages_iter.nr_pages + 1, + BIO_MAX_PAGES), + BCH_ENCODED_EXTENT_MAX >> PAGE_SECTOR_SHIFT); + + struct bch_read_bio *rbio = + container_of(bio_alloc_bioset(GFP_NOFS, n, + &c->bio_read), + struct bch_read_bio, bio); + + rbio->bio.bi_end_io = bch2_readpages_end_io; + bio_add_page_contig(&rbio->bio, page); + bchfs_read(c, &iter, rbio, inode->i_ino, &readpages_iter); + } + + if (current->pagecache_lock != &mapping->add_lock) + pagecache_add_put(&mapping->add_lock); + + return 0; +} + +static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, + u64 inode, struct page *page) +{ + struct btree_iter iter; + + /* + * Initialize page state: + * If a page is partly allocated and partly a hole, we want it to be + * marked BCH_PAGE_UNALLOCATED - so we initially mark all pages + * allocated and then mark them unallocated as we find holes: + * + * Note that the bio hasn't been split yet - it's the only bio that + * points to these pages. As we walk extents and split @bio, that will + * no longer necessarily be true - the splits won't necessarily be on + * page boundaries: + */ + struct bch_page_state *s = page_state(page); + + EBUG_ON(s->reserved); + s->allocated = 1; + s->sectors = 0; + + bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); + bio_add_page_contig(&rbio->bio, page); + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + bchfs_read(c, &iter, rbio, inode, NULL); +} + +int bch2_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + struct bch_read_bio *rbio; + + rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, + &c->bio_read), + struct bch_read_bio, bio); + rbio->bio.bi_end_io = bch2_readpages_end_io; + + __bchfs_readpage(c, rbio, inode->i_ino, page); + return 0; +} + +struct bch_writepage_state { + struct bch_writepage_io *io; +}; + +static void bch2_writepage_io_free(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + struct bio *bio = &io->bio.bio; + + bio_put(bio); +} + +static void bch2_writepage_io_done(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + struct bch_fs *c = io->op.op.c; + struct bio *bio = &io->bio.bio; + struct bio_vec *bvec; + unsigned i; + + atomic_sub(bio->bi_vcnt, &c->writeback_pages); + wake_up(&c->writeback_wait); + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (io->op.op.error) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + + if (io->op.op.written >= PAGE_SECTORS) { + struct bch_page_state old, new; + + old = page_state_cmpxchg(page_state(page), new, { + new.sectors = PAGE_SECTORS; + new.dirty_sectors = 0; + }); + + io->op.sectors_added -= old.dirty_sectors; + io->op.op.written -= 
PAGE_SECTORS; + } + } + + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + * + * (error (due to going RO) halfway through a page can screw that up + * slightly) + */ + BUG_ON(io->op.sectors_added >= (s64) PAGE_SECTORS); + + /* + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ + if (io->op.sectors_added) { + struct inode *inode = &io->op.ei->vfs_inode; + + spin_lock(&inode->i_lock); + inode->i_blocks += io->op.sectors_added; + spin_unlock(&inode->i_lock); + } + + bio_for_each_segment_all(bvec, bio, i) + end_page_writeback(bvec->bv_page); + + closure_return_with_destructor(&io->cl, bch2_writepage_io_free); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) +{ + struct bch_writepage_io *io = w->io; + + w->io = NULL; + atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages); + + io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector; + + closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl); + continue_at(&io->cl, bch2_writepage_io_done, NULL); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch2_writepage_io_alloc(struct bch_fs *c, + struct bch_writepage_state *w, + struct bch_inode_info *ei, + struct page *page) +{ + u64 inum = ei->vfs_inode.i_ino; + unsigned nr_replicas = page_state(page)->nr_replicas; + + EBUG_ON(!nr_replicas); + /* XXX: disk_reservation->gen isn't plumbed through */ + + if (!w->io) { +alloc_io: + w->io = container_of(bio_alloc_bioset(GFP_NOFS, + BIO_MAX_PAGES, + bch2_writepage_bioset), + struct bch_writepage_io, bio.bio); + + closure_init(&w->io->cl, NULL); + w->io->op.ei = ei; + w->io->op.sectors_added = 0; + w->io->op.is_dio = false; + bch2_write_op_init(&w->io->op.op, c, &w->io->bio, + (struct disk_reservation) { + .nr_replicas = c->opts.data_replicas, + }, + foreground_write_point(c, inum), + POS(inum, 0), + &ei->journal_seq, 0); + w->io->op.op.index_update_fn = bchfs_write_index_update; + } + + if (w->io->op.op.res.nr_replicas != nr_replicas || + bio_add_page_contig(&w->io->bio.bio, page)) { + bch2_writepage_do_io(w); + goto alloc_io; + } + + /* + * We shouldn't ever be handed pages for multiple inodes in a single + * pass - right? + */ + BUG_ON(ei != w->io->op.ei); +} + +static int __bch2_writepage(struct bch_fs *c, struct page *page, + struct writeback_control *wbc, + struct bch_writepage_state *w) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_page_state new, old; + unsigned offset; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_SHIFT; + + EBUG_ON(!PageUptodate(page)); + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto do_io; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_SIZE - 1); + if (page->index > end_index || !offset) { + unlock_page(page); + return 0; + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + zero_user_segment(page, offset, PAGE_SIZE); +do_io: + bch2_writepage_io_alloc(c, w, ei, page); + + /* while page is locked: */ + w->io->op.new_i_size = i_size; + + if (wbc->sync_mode == WB_SYNC_ALL) + w->io->bio.bio.bi_opf |= WRITE_SYNC; + + /* Before unlocking the page, transfer reservation to w->io: */ + old = page_state_cmpxchg(page_state(page), new, { + EBUG_ON(!new.reserved && + (new.sectors != PAGE_SECTORS || + !new.allocated)); + + if (new.allocated && + w->io->op.op.compression_type != BCH_COMPRESSION_NONE) + new.allocated = 0; + else if (!new.reserved) + goto out; + new.reserved = 0; + }); + + w->io->op.op.res.sectors += PAGE_SECTORS * + (old.reserved - new.reserved) * + old.nr_replicas; +out: + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + return 0; +} + +int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct bch_fs *c = mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = { NULL }; + struct pagecache_iter iter; + struct page *page; + int ret = 0; + int done = 0; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int cycled; + int range_whole = 0; + int tag; + + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + + done_index = index; +get_pages: + for_each_pagecache_tag(&iter, mapping, tag, index, end, page) { + done_index = page->index; + + if (w.io && + !bio_can_add_page_contig(&w.io->bio.bio, page)) + bch2_writepage_do_io(&w); + + if (!w.io && + atomic_read(&c->writeback_pages) >= + c->writeback_pages_max) { + /* don't sleep with pages pinned: */ + pagecache_iter_release(&iter); + + __wait_event(c->writeback_wait, + atomic_read(&c->writeback_pages) < + c->writeback_pages_max); + goto get_pages; + } + + lock_page(page); + + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data integrity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); + ret = __bch2_writepage(c, page, wbc, &w); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. 
it may + * not be suitable for data integrity + * writeout). + */ + done_index = page->index + 1; + done = 1; + break; + } + } + + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + } + pagecache_iter_release(&iter); + + if (w.io) + bch2_writepage_do_io(&w); + + if (!cycled && !done) { + /* + * range_cyclic: + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + return ret; +} + +int bch2_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = { NULL }; + int ret; + + ret = __bch2_writepage(c, page, wbc, &w); + if (w.io) + bch2_writepage_do_io(&w); + + return ret; +} + +static void bch2_read_single_page_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int bch2_read_single_page(struct page *page, + struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + struct bch_read_bio *rbio; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + + rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, + &c->bio_read), + struct bch_read_bio, bio); + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_page_end_io; + + __bchfs_readpage(c, rbio, inode->i_ino, page); + wait_for_completion(&done); + + ret = rbio->bio.bi_error; + bio_put(&rbio->bio); + + if (ret < 0) + return ret; + + SetPageUptodate(page); + return 0; +} + +int bch2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_SHIFT; + unsigned offset = pos & (PAGE_SIZE - 1); + struct page *page; + int ret = -ENOMEM; + + BUG_ON(inode_unhashed(mapping->host)); + + /* Not strictly necessary - same reason as mkwrite(): */ + pagecache_add_get(&mapping->add_lock); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + goto err_unlock; + + if (PageUptodate(page)) + goto out; + + /* If we're writing entire page, don't need to read it in first: */ + if (len == PAGE_SIZE) + goto out; + + if (!offset && pos + len >= inode->i_size) { + zero_user_segment(page, len, PAGE_SIZE); + flush_dcache_page(page); + goto out; + } + + if (index > inode->i_size >> PAGE_SHIFT) { + zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); + flush_dcache_page(page); + goto out; + } +readpage: + ret = bch2_read_single_page(page, mapping); + if (ret) + goto err; +out: + ret = bch2_get_page_reservation(c, page, true); + if (ret) { + if (!PageUptodate(page)) { + /* + * If the page hasn't been read in, we won't know if we + * actually need a reservation - we don't actually need + * to read here, we just need to check if the page is + * fully backed by uncompressed data: + */ + goto readpage; + } + + goto err; + } + + *pagep = page; + return 0; +err: + unlock_page(page); + put_page(page); + *pagep = NULL; +err_unlock: + pagecache_add_put(&mapping->add_lock); + return ret; 
+} + +int bch2_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + + lockdep_assert_held(&inode->i_rwsem); + + if (unlikely(copied < len && !PageUptodate(page))) { + /* + * The page needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + */ + zero_user(page, 0, PAGE_SIZE); + flush_dcache_page(page); + copied = 0; + } + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + + if (copied) { + if (!PageUptodate(page)) + SetPageUptodate(page); + if (!PageDirty(page)) + set_page_dirty(page); + } else { + bch2_put_page_reservation(c, page); + } + + unlock_page(page); + put_page(page); + pagecache_add_put(&mapping->add_lock); + + return copied; +} + +/* O_DIRECT */ + +static void bch2_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret, 0); + bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ +} + +static void bch2_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_error) + dio->ret = bio->bi_error; + + closure_put(&dio->cl); +} + +static void bch2_direct_IO_read_split_endio(struct bio *bio) +{ + bch2_direct_IO_read_endio(bio); + bio_check_pages_dirty(bio); /* transfers ownership */ +} + +static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct dio_read *dio; + struct bio *bio; + bool sync = is_sync_kiocb(req); + ssize_t ret; + + if ((offset|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + ret = min_t(loff_t, iter->count, + max_t(loff_t, 0, i_size_read(inode) - offset)); + iov_iter_truncate(iter, round_up(ret, block_bytes(c))); + + if (!ret) + return ret; + + bio = bio_alloc_bioset(GFP_KERNEL, + iov_iter_npages(iter, BIO_MAX_PAGES), + bch2_dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; + + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + } + + dio->req = req; + dio->ret = ret; + + goto start; + while (iter->count) { + bio = bio_alloc_bioset(GFP_KERNEL, + iov_iter_npages(iter, BIO_MAX_PAGES), + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: + bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_private = dio; + + ret = bio_get_user_pages(bio, iter, 1); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_error = ret; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch2_read(c, container_of(bio, + struct bch_read_bio, bio), + inode->i_ino); + } + + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + return ret; + } else { + return -EIOCBQUEUED; + } +} + +static long 
__bch2_dio_write_complete(struct dio_write *dio) +{ + struct file *file = dio->req->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file->f_inode; + long ret = dio->error ?: dio->written; + + bch2_disk_reservation_put(dio->c, &dio->res); + + __pagecache_block_put(&mapping->add_lock); + inode_dio_end(inode); + + if (dio->iovec && dio->iovec != dio->inline_vecs) + kfree(dio->iovec); + + bio_put(&dio->bio.bio); + return ret; +} + +static void bch2_dio_write_complete(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, cl); + struct kiocb *req = dio->req; + + req->ki_complete(req, __bch2_dio_write_complete(dio), 0); +} + +static void bch2_dio_write_done(struct dio_write *dio) +{ + struct bio_vec *bv; + int i; + + dio->written += dio->iop.op.written << 9; + + if (dio->iop.op.error) + dio->error = dio->iop.op.error; + + bio_for_each_segment_all(bv, &dio->bio.bio, i) + put_page(bv->bv_page); + + if (dio->iter.count) + bio_reset(&dio->bio.bio); +} + +static void bch2_do_direct_IO_write(struct dio_write *dio) +{ + struct file *file = dio->req->ki_filp; + struct inode *inode = file->f_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bio *bio = &dio->bio.bio; + unsigned flags = 0; + int ret; + + if ((dio->req->ki_flags & IOCB_DSYNC) && + !dio->c->opts.journal_flush_disabled) + flags |= BCH_WRITE_FLUSH; + + bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9; + + ret = bio_get_user_pages(bio, &dio->iter, 0); + if (ret < 0) { + /* + * these didn't get initialized, but bch2_dio_write_done() will + * look at them: + */ + dio->iop.op.error = 0; + dio->iop.op.written = 0; + dio->error = ret; + return; + } + + dio->iop.ei = ei; + dio->iop.sectors_added = 0; + dio->iop.is_dio = true; + dio->iop.new_i_size = U64_MAX; + bch2_write_op_init(&dio->iop.op, dio->c, &dio->bio, + dio->res, + foreground_write_point(dio->c, inode->i_ino), + POS(inode->i_ino, bio->bi_iter.bi_sector), + &ei->journal_seq, flags); + dio->iop.op.index_update_fn = bchfs_write_index_update; + + dio->res.sectors -= bio_sectors(bio); + dio->iop.op.res.sectors = bio_sectors(bio); + + task_io_account_write(bio->bi_iter.bi_size); + + closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); +} + +static void bch2_dio_write_loop_async(struct closure *cl) +{ + struct dio_write *dio = + container_of(cl, struct dio_write, cl); + struct address_space *mapping = dio->req->ki_filp->f_mapping; + + bch2_dio_write_done(dio); + + if (dio->iter.count && !dio->error) { + use_mm(dio->mm); + pagecache_block_get(&mapping->add_lock); + + bch2_do_direct_IO_write(dio); + + pagecache_block_put(&mapping->add_lock); + unuse_mm(dio->mm); + + continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); + } else { +#if 0 + closure_return_with_destructor(cl, bch2_dio_write_complete); +#else + closure_debug_destroy(cl); + bch2_dio_write_complete(cl); +#endif + } +} + +static int bch2_direct_IO_write(struct bch_fs *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct address_space *mapping = file->f_mapping; + struct dio_write *dio; + struct bio *bio; + ssize_t ret; + bool sync = is_sync_kiocb(req); + + lockdep_assert_held(&inode->i_rwsem); + + if (unlikely(!iter->count)) + return 0; + + if (unlikely((offset|iter->count) & (block_bytes(c) - 1))) + return -EINVAL; + + bio = bio_alloc_bioset(GFP_KERNEL, + iov_iter_npages(iter, BIO_MAX_PAGES), + bch2_dio_write_bioset); + dio = container_of(bio, struct dio_write, bio.bio); + dio->req 
= req;
+	dio->c		= c;
+	dio->written	= 0;
+	dio->error	= 0;
+	dio->offset	= offset;
+	dio->iovec	= NULL;
+	dio->iter	= *iter;
+	dio->mm		= current->mm;
+	closure_init(&dio->cl, NULL);
+
+	if (offset + iter->count > inode->i_size)
+		sync = true;
+
+	/*
+	 * XXX: we shouldn't return -ENOSPC if we're overwriting existing data -
+	 * if getting a reservation fails we should check if we are doing an
+	 * overwrite.
+	 *
+	 * Have to then guard against racing with truncate (deleting data that
+	 * we would have been overwriting)
+	 */
+	ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
+	if (unlikely(ret)) {
+		closure_debug_destroy(&dio->cl);
+		bio_put(bio);
+		return ret;
+	}
+
+	inode_dio_begin(inode);
+	__pagecache_block_get(&mapping->add_lock);
+
+	if (sync) {
+		do {
+			bch2_do_direct_IO_write(dio);
+
+			closure_sync(&dio->cl);
+			bch2_dio_write_done(dio);
+		} while (dio->iter.count && !dio->error);
+
+		closure_debug_destroy(&dio->cl);
+		return __bch2_dio_write_complete(dio);
+	} else {
+		bch2_do_direct_IO_write(dio);
+
+		if (dio->iter.count && !dio->error) {
+			if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+				dio->iovec = kmalloc(dio->iter.nr_segs *
+						     sizeof(struct iovec),
+						     GFP_KERNEL);
+				if (!dio->iovec)
+					dio->error = -ENOMEM;
+			} else {
+				dio->iovec = dio->inline_vecs;
+			}
+
+			/* don't copy the iovecs if the allocation failed: */
+			if (dio->iovec) {
+				memcpy(dio->iovec,
+				       dio->iter.iov,
+				       dio->iter.nr_segs * sizeof(struct iovec));
+				dio->iter.iov = dio->iovec;
+			}
+		}
+
+		continue_at_noreturn(&dio->cl, bch2_dio_write_loop_async, NULL);
+		return -EIOCBQUEUED;
+	}
+}
+
+ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter)
+{
+	struct file *file = req->ki_filp;
+	struct inode *inode = file->f_inode;
+	struct bch_fs *c = inode->i_sb->s_fs_info;
+	struct blk_plug plug;
+	ssize_t ret;
+
+	blk_start_plug(&plug);
+	ret = ((iov_iter_rw(iter) == WRITE)
+		? bch2_direct_IO_write
+		: bch2_direct_IO_read)(c, req, file, inode, iter, req->ki_pos);
+	blk_finish_plug(&plug);
+
+	return ret;
+}
+
+static ssize_t
+bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_inode;
+	struct bch_fs *c = inode->i_sb->s_fs_info;
+	struct address_space *mapping = file->f_mapping;
+	loff_t pos = iocb->ki_pos;
+	ssize_t ret;
+
+	pagecache_block_get(&mapping->add_lock);
+
+	/* Write and invalidate pagecache range that we're writing to: */
+	ret = write_invalidate_inode_pages_range(file->f_mapping, pos,
+					pos + iov_iter_count(iter) - 1);
+	if (unlikely(ret))
+		goto err;
+
+	ret = bch2_direct_IO_write(c, iocb, file, inode, iter, pos);
+err:
+	pagecache_block_put(&mapping->add_lock);
+
+	return ret;
+}
+
+static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(inode);
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out;
+
+	ret = file_update_time(file);
+	if (ret)
+		goto out;
+
+	ret = iocb->ki_flags & IOCB_DIRECT
+		? 
bch2_direct_write(iocb, from) + : generic_perform_write(file, from, iocb->ki_pos); + + if (likely(ret > 0)) + iocb->ki_pos += ret; +out: + current->backing_dev_info = NULL; + return ret; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + bool direct = iocb->ki_flags & IOCB_DIRECT; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __bch2_write_iter(iocb, from); + inode_unlock(inode); + + if (ret > 0 && !direct) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +int bch2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + struct bch_fs *c = inode->i_sb->s_fs_info; + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + + /* + * Not strictly necessary, but helps avoid dio writes livelocking in + * write_invalidate_inode_pages_range() - can drop this if/when we get + * a write_invalidate_inode_pages_range() that works without dropping + * page lock before invalidating page + */ + if (current->pagecache_lock != &mapping->add_lock) + pagecache_add_get(&mapping->add_lock); + + lock_page(page); + if (page->mapping != mapping || + page_offset(page) > i_size_read(inode)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + if (bch2_get_page_reservation(c, page, true)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + + if (!PageDirty(page)) + set_page_dirty(page); + wait_for_stable_page(page); +out: + if (current->pagecache_lock != &mapping->add_lock) + pagecache_add_put(&mapping->add_lock); + sb_end_pagefault(inode->i_sb); + return ret; +} + +void bch2_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + EBUG_ON(!PageLocked(page)); + EBUG_ON(PageWriteback(page)); + + if (offset || length < PAGE_SIZE) + return; + + bch2_clear_page_bits(page); +} + +int bch2_releasepage(struct page *page, gfp_t gfp_mask) +{ + EBUG_ON(!PageLocked(page)); + EBUG_ON(PageWriteback(page)); + + if (PageDirty(page)) + return 0; + + bch2_clear_page_bits(page); + return 1; +} + +#ifdef CONFIG_MIGRATION +int bch2_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (PagePrivate(page)) { + *page_state(newpage) = *page_state(page); + ClearPagePrivate(page); + } + + migrate_page_copy(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_fs *c = inode->i_sb->s_fs_info; + int ret; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + if (c->opts.journal_flush_disabled) + return 0; + + return bch2_journal_flush_seq(&c->journal, ei->journal_seq); +} + +static int __bch2_truncate_page(struct address_space *mapping, + pgoff_t index, loff_t start, loff_t end) +{ + struct inode *inode = mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + unsigned start_offset = start & (PAGE_SIZE - 1); + unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + struct page *page; + int ret 
= 0;
+
+	/* Page boundary? Nothing to do */
+	if (!((index == start >> PAGE_SHIFT && start_offset) ||
+	      (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
+		return 0;
+
+	/* Above i_size? */
+	if (index << PAGE_SHIFT >= inode->i_size)
+		return 0;
+
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		struct btree_iter iter;
+		struct bkey_s_c k = bkey_s_c_null;
+
+		/*
+		 * XXX: we're doing two index lookups when we end up reading the
+		 * page
+		 */
+		for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+				   POS(inode->i_ino,
+				       index << (PAGE_SHIFT - 9)), k) {
+			if (bkey_cmp(bkey_start_pos(k.k),
+				     POS(inode->i_ino,
+					 (index + 1) << (PAGE_SHIFT - 9))) >= 0)
+				break;
+
+			if (k.k->type != KEY_TYPE_DISCARD &&
+			    k.k->type != BCH_RESERVATION) {
+				bch2_btree_iter_unlock(&iter);
+				goto create;
+			}
+		}
+		bch2_btree_iter_unlock(&iter);
+		return 0;
+create:
+		page = find_or_create_page(mapping, index, GFP_KERNEL);
+		if (unlikely(!page)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	if (!PageUptodate(page)) {
+		ret = bch2_read_single_page(page, mapping);
+		if (ret)
+			goto unlock;
+	}
+
+	/*
+	 * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+	 *
+	 * XXX: because we aren't currently tracking whether the page has actual
+	 * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+	 */
+	ret = bch2_get_page_reservation(c, page, false);
+	BUG_ON(ret);
+
+	if (index == start >> PAGE_SHIFT &&
+	    index == end >> PAGE_SHIFT)
+		zero_user_segment(page, start_offset, end_offset);
+	else if (index == start >> PAGE_SHIFT)
+		zero_user_segment(page, start_offset, PAGE_SIZE);
+	else if (index == end >> PAGE_SHIFT)
+		zero_user_segment(page, 0, end_offset);
+
+	if (!PageDirty(page))
+		set_page_dirty(page);
+unlock:
+	unlock_page(page);
+	put_page(page);
+out:
+	return ret;
+}
+
+static int bch2_truncate_page(struct address_space *mapping, loff_t from)
+{
+	return __bch2_truncate_page(mapping, from >> PAGE_SHIFT,
+				    from, from + PAGE_SIZE);
+}
+
+int bch2_truncate(struct inode *inode, struct iattr *iattr)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct bch_inode_info *ei = to_bch_ei(inode);
+	struct bch_fs *c = inode->i_sb->s_fs_info;
+	bool shrink = iattr->ia_size <= inode->i_size;
+	int ret = 0;
+
+	inode_dio_wait(inode);
+	pagecache_block_get(&mapping->add_lock);
+
+	truncate_setsize(inode, iattr->ia_size);
+
+	/* sync appends.. */
+	/* XXX what protects ei->i_size? 
*/ + if (iattr->ia_size > ei->i_size) + ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX); + if (ret) + goto err_put_pagecache; + + mutex_lock(&ei->update_lock); + i_size_dirty_get(ei); + ret = bch2_write_inode_size(c, ei, inode->i_size); + mutex_unlock(&ei->update_lock); + + if (unlikely(ret)) + goto err; + + /* + * There might be persistent reservations (from fallocate()) + * above i_size, which bch2_inode_truncate() will discard - we're + * only supposed to discard them if we're doing a real truncate + * here (new i_size < current i_size): + */ + if (shrink) { + struct i_sectors_hook i_sectors_hook; + int ret; + + ret = i_sectors_dirty_get(ei, &i_sectors_hook); + if (unlikely(ret)) + goto err; + + ret = bch2_truncate_page(inode->i_mapping, iattr->ia_size); + if (unlikely(ret)) { + i_sectors_dirty_put(ei, &i_sectors_hook); + goto err; + } + + ret = bch2_inode_truncate(c, inode->i_ino, + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + &i_sectors_hook.hook, + &ei->journal_seq); + + i_sectors_dirty_put(ei, &i_sectors_hook); + + if (unlikely(ret)) + goto err; + } + + mutex_lock(&ei->update_lock); + setattr_copy(inode, iattr); + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); + + /* clear I_SIZE_DIRTY: */ + i_size_dirty_put(ei); + ret = bch2_write_inode_size(c, ei, inode->i_size); + mutex_unlock(&ei->update_lock); + + pagecache_block_put(&mapping->add_lock); + + return 0; +err: + i_size_dirty_put(ei); +err_put_pagecache: + pagecache_block_put(&mapping->add_lock); + return ret; +} + +static long bch2_fpunch(struct inode *inode, loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_fs *c = inode->i_sb->s_fs_info; + u64 ino = inode->i_ino; + u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; + u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; + int ret = 0; + + inode_lock(inode); + inode_dio_wait(inode); + pagecache_block_get(&mapping->add_lock); + + ret = __bch2_truncate_page(inode->i_mapping, + offset >> PAGE_SHIFT, + offset, offset + len); + if (unlikely(ret)) + goto out; + + if (offset >> PAGE_SHIFT != + (offset + len) >> PAGE_SHIFT) { + ret = __bch2_truncate_page(inode->i_mapping, + (offset + len) >> PAGE_SHIFT, + offset, offset + len); + if (unlikely(ret)) + goto out; + } + + truncate_pagecache_range(inode, offset, offset + len - 1); + + if (discard_start < discard_end) { + struct disk_reservation disk_res; + struct i_sectors_hook i_sectors_hook; + int ret; + + BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0)); + + ret = i_sectors_dirty_get(ei, &i_sectors_hook); + if (unlikely(ret)) + goto out; + + ret = bch2_discard(c, + POS(ino, discard_start), + POS(ino, discard_end), + ZERO_VERSION, + &disk_res, + &i_sectors_hook.hook, + &ei->journal_seq); + + i_sectors_dirty_put(ei, &i_sectors_hook); + bch2_disk_reservation_put(c, &disk_res); + } +out: + pagecache_block_put(&mapping->add_lock); + inode_unlock(inode); + + return ret; +} + +static long bch2_fcollapse(struct inode *inode, loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_fs *c = inode->i_sb->s_fs_info; + struct btree_iter src; + struct btree_iter dst; + BKEY_PADDED(k) copy; + struct bkey_s_c k; + struct i_sectors_hook i_sectors_hook; + loff_t new_size; + int ret; + + if ((offset | len) & (PAGE_SIZE - 1)) + return -EINVAL; + + bch2_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, offset >> 9)); + 
/* position will be set from dst iter's position: */ + bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_link(&src, &dst); + + /* + * We need i_mutex to keep the page cache consistent with the extents + * btree, and the btree consistent with i_size - we don't need outside + * locking for the extents btree itself, because we're using linked + * iterators + */ + inode_lock(inode); + inode_dio_wait(inode); + pagecache_block_get(&mapping->add_lock); + + ret = -EINVAL; + if (offset + len >= inode->i_size) + goto err; + + if (inode->i_size < len) + goto err; + + new_size = inode->i_size - len; + + ret = write_invalidate_inode_pages_range(inode->i_mapping, + offset, LLONG_MAX); + if (ret) + goto err; + + ret = i_sectors_dirty_get(ei, &i_sectors_hook); + if (ret) + goto err; + + while (bkey_cmp(dst.pos, + POS(inode->i_ino, + round_up(new_size, PAGE_SIZE) >> 9)) < 0) { + struct disk_reservation disk_res; + + bch2_btree_iter_set_pos(&src, + POS(dst.pos.inode, dst.pos.offset + (len >> 9))); + + ret = bch2_btree_iter_traverse(&dst); + if (ret) + goto btree_iter_err; + + k = bch2_btree_iter_peek_with_holes(&src); + if ((ret = btree_iter_err(k))) + goto btree_iter_err; + + bkey_reassemble(©.k, k); + + if (bkey_deleted(©.k.k)) + copy.k.k.type = KEY_TYPE_DISCARD; + + bch2_cut_front(src.pos, ©.k); + copy.k.k.p.offset -= len >> 9; + + BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); + + ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + + ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, + &ei->journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&dst, ©.k)); + bch2_disk_reservation_put(c, &disk_res); +btree_iter_err: + if (ret < 0 && ret != -EINTR) + goto err_unwind; + + bch2_btree_iter_cond_resched(&src); + } + + bch2_btree_iter_unlock(&src); + bch2_btree_iter_unlock(&dst); + + ret = bch2_inode_truncate(c, inode->i_ino, + round_up(new_size, PAGE_SIZE) >> 9, + &i_sectors_hook.hook, + &ei->journal_seq); + if (ret) + goto err_unwind; + + i_sectors_dirty_put(ei, &i_sectors_hook); + + mutex_lock(&ei->update_lock); + i_size_write(inode, new_size); + ret = bch2_write_inode_size(c, ei, inode->i_size); + mutex_unlock(&ei->update_lock); + + pagecache_block_put(&mapping->add_lock); + inode_unlock(inode); + + return ret; +err_unwind: + /* + * XXX: we've left data with multiple pointers... which isn't a _super_ + * serious problem... 
+ */ + i_sectors_dirty_put(ei, &i_sectors_hook); +err: + bch2_btree_iter_unlock(&src); + bch2_btree_iter_unlock(&dst); + pagecache_block_put(&mapping->add_lock); + inode_unlock(inode); + return ret; +} + +static long bch2_fallocate(struct inode *inode, int mode, + loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_fs *c = inode->i_sb->s_fs_info; + struct i_sectors_hook i_sectors_hook; + struct btree_iter iter; + struct bpos end; + loff_t block_start, block_end; + loff_t new_size = offset + len; + unsigned sectors; + unsigned replicas = READ_ONCE(c->opts.data_replicas); + int ret; + + bch2_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + inode_lock(inode); + inode_dio_wait(inode); + pagecache_block_get(&mapping->add_lock); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + new_size > inode->i_size) { + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto err; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = __bch2_truncate_page(inode->i_mapping, + offset >> PAGE_SHIFT, + offset, offset + len); + + if (!ret && + offset >> PAGE_SHIFT != + (offset + len) >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode->i_mapping, + (offset + len) >> PAGE_SHIFT, + offset, offset + len); + + if (unlikely(ret)) + goto err; + + truncate_pagecache_range(inode, offset, offset + len - 1); + + block_start = round_up(offset, PAGE_SIZE); + block_end = round_down(offset + len, PAGE_SIZE); + } else { + block_start = round_down(offset, PAGE_SIZE); + block_end = round_up(offset + len, PAGE_SIZE); + } + + bch2_btree_iter_set_pos(&iter, POS(inode->i_ino, block_start >> 9)); + end = POS(inode->i_ino, block_end >> 9); + + ret = i_sectors_dirty_get(ei, &i_sectors_hook); + if (unlikely(ret)) + goto err; + + while (bkey_cmp(iter.pos, end) < 0) { + struct disk_reservation disk_res = { 0 }; + struct bkey_i_reservation reservation; + struct bkey_s_c k; + + k = bch2_btree_iter_peek_with_holes(&iter); + if ((ret = btree_iter_err(k))) + goto btree_iter_err; + + /* already reserved */ + if (k.k->type == BCH_RESERVATION && + bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { + bch2_btree_iter_advance_pos(&iter); + continue; + } + + if (bkey_extent_is_data(k.k)) { + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + bch2_btree_iter_advance_pos(&iter); + continue; + } + } + + bkey_reservation_init(&reservation.k_i); + reservation.k.type = BCH_RESERVATION; + reservation.k.p = k.k->p; + reservation.k.size = k.k->size; + + bch2_cut_front(iter.pos, &reservation.k_i); + bch2_cut_back(end, &reservation.k); + + sectors = reservation.k.size; + reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k); + + if (reservation.v.nr_replicas < replicas || + bkey_extent_is_compressed(k)) { + ret = bch2_disk_reservation_get(c, &disk_res, + sectors, 0); + if (ret) + goto err_put_sectors_dirty; + + reservation.v.nr_replicas = disk_res.nr_replicas; + } + + ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, + &ei->journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &reservation.k_i)); + bch2_disk_reservation_put(c, &disk_res); +btree_iter_err: + if (ret < 0 && ret != -EINTR) + goto err_put_sectors_dirty; + + } + bch2_btree_iter_unlock(&iter); + + i_sectors_dirty_put(ei, &i_sectors_hook); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + new_size > inode->i_size) { + i_size_write(inode, new_size); + + mutex_lock(&ei->update_lock); + ret = bch2_write_inode_size(c, ei, inode->i_size); + mutex_unlock(&ei->update_lock); + } 
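+
+	/*
+	 * ei->i_size is the i_size last written to the btree, which trails
+	 * inode->i_size while an append is in flight (see the I_SIZE_DIRTY
+	 * comment in fs.c); presumably the sync below is there to flush any
+	 * in flight appends so that the on disk i_size can safely be updated
+	 * to match inode->i_size:
+	 */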
+ + /* blech */ + if ((mode & FALLOC_FL_KEEP_SIZE) && + (mode & FALLOC_FL_ZERO_RANGE) && + ei->i_size != inode->i_size) { + /* sync appends.. */ + ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX); + if (ret) + goto err; + + if (ei->i_size != inode->i_size) { + mutex_lock(&ei->update_lock); + ret = bch2_write_inode_size(c, ei, inode->i_size); + mutex_unlock(&ei->update_lock); + } + } + + pagecache_block_put(&mapping->add_lock); + inode_unlock(inode); + + return 0; +err_put_sectors_dirty: + i_sectors_dirty_put(ei, &i_sectors_hook); +err: + bch2_btree_iter_unlock(&iter); + pagecache_block_put(&mapping->add_lock); + inode_unlock(inode); + return ret; +} + +long bch2_fallocate_dispatch(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + return bch2_fallocate(inode, mode, offset, len); + + if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + return bch2_fpunch(inode, offset, len); + + if (mode == FALLOC_FL_COLLAPSE_RANGE) + return bch2_fcollapse(inode, offset, len); + + return -EOPNOTSUPP; +} + +static bool page_is_data(struct page *page) +{ + /* XXX: should only have to check PageDirty */ + return PagePrivate(page) && + (page_state(page)->sectors || + page_state(page)->dirty_sectors); +} + +static loff_t bch2_next_pagecache_data(struct inode *inode, + loff_t start_offset, + loff_t end_offset) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + pgoff_t index; + + for (index = start_offset >> PAGE_SHIFT; + index < end_offset >> PAGE_SHIFT; + index++) { + if (find_get_pages(mapping, index, 1, &page)) { + lock_page(page); + index = page->index; + + if (page_is_data(page)) + end_offset = + min(end_offset, + max(start_offset, + ((loff_t) index) << PAGE_SHIFT)); + unlock_page(page); + put_page(page); + } else { + break; + } + } + + return end_offset; +} + +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct inode *inode = file->f_mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + u64 isize, next_data = MAX_LFS_FILESIZE; + int ret; + + isize = i_size_read(inode); + if (offset >= isize) + return -ENXIO; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, offset >> 9), k) { + if (k.k->p.inode != inode->i_ino) { + break; + } else if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + } + + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + if (next_data > offset) + next_data = bch2_next_pagecache_data(inode, offset, next_data); + + if (next_data > isize) + return -ENXIO; + + return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); +} + +static bool page_slot_is_data(struct address_space *mapping, pgoff_t index) +{ + struct page *page; + bool ret; + + page = find_lock_entry(mapping, index); + if (!page || radix_tree_exception(page)) + return false; + + ret = page_is_data(page); + unlock_page(page); + + return ret; +} + +static loff_t bch2_next_pagecache_hole(struct inode *inode, + loff_t start_offset, + loff_t end_offset) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + + for (index = start_offset >> PAGE_SHIFT; + index < end_offset >> PAGE_SHIFT; + index++) + if (!page_slot_is_data(mapping, index)) + end_offset = max(start_offset, + ((loff_t) index) << PAGE_SHIFT); + + return end_offset; +} + +static loff_t 
bch2_seek_hole(struct file *file, u64 offset) +{ + struct inode *inode = file->f_mapping->host; + struct bch_fs *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + u64 isize, next_hole = MAX_LFS_FILESIZE; + int ret; + + isize = i_size_read(inode); + if (offset >= isize) + return -ENXIO; + + for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, offset >> 9), k) { + if (k.k->p.inode != inode->i_ino) { + next_hole = bch2_next_pagecache_hole(inode, + offset, MAX_LFS_FILESIZE); + break; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_next_pagecache_hole(inode, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9); + + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + } + + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + if (next_hole > isize) + next_hole = isize; + + return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); +} + +loff_t bch2_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek(file, offset, whence); + case SEEK_DATA: + return bch2_seek_data(file, offset); + case SEEK_HOLE: + return bch2_seek_hole(file, offset); + } + + return -EINVAL; +} diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h new file mode 100644 index 00000000..f3fcf947 --- /dev/null +++ b/libbcachefs/fs-io.h @@ -0,0 +1,96 @@ +#ifndef _BCACHE_FS_IO_H +#define _BCACHE_FS_IO_H + +#include "buckets.h" +#include <linux/uio.h> + +int bch2_set_page_dirty(struct page *); + +int bch2_writepage(struct page *, struct writeback_control *); +int bch2_readpage(struct file *, struct page *); + +int bch2_writepages(struct address_space *, struct writeback_control *); +int bch2_readpages(struct file *, struct address_space *, + struct list_head *, unsigned); + +int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **); +int bch2_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch2_direct_IO(struct kiocb *, struct iov_iter *); + +ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + +int bch2_fsync(struct file *, loff_t, loff_t, int); + +int bch2_truncate(struct inode *, struct iattr *); +long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); + +loff_t bch2_llseek(struct file *, loff_t, int); + +int bch2_page_mkwrite(struct vm_area_struct *, struct vm_fault *); +void bch2_invalidatepage(struct page *, unsigned int, unsigned int); +int bch2_releasepage(struct page *, gfp_t); +int bch2_migrate_page(struct address_space *, struct page *, + struct page *, enum migrate_mode); + +struct i_sectors_hook { + struct extent_insert_hook hook; + s64 sectors; + struct bch_inode_info *ei; +}; + +struct bchfs_write_op { + struct bch_inode_info *ei; + s64 sectors_added; + bool is_dio; + u64 new_i_size; + struct bch_write_op op; +}; + +struct bch_writepage_io { + struct closure cl; + + struct bchfs_write_op op; + + /* must come last: */ + struct bch_write_bio bio; +}; + +extern struct bio_set *bch2_writepage_bioset; + +struct dio_write { + struct closure cl; + struct kiocb *req; + struct bch_fs *c; + long written; + long error; + loff_t offset; + + struct disk_reservation res; + + struct iovec *iovec; + struct iovec inline_vecs[UIO_FASTIOV]; + struct iov_iter iter; + + struct mm_struct *mm; + + struct bchfs_write_op iop; + + /* must be last: */ + struct 
bch_write_bio bio; +}; + +extern struct bio_set *bch2_dio_write_bioset; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + struct bch_read_bio rbio; +}; + +extern struct bio_set *bch2_dio_read_bioset; + +#endif /* _BCACHE_FS_IO_H */ diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c new file mode 100644 index 00000000..94c5a9e6 --- /dev/null +++ b/libbcachefs/fs.c @@ -0,0 +1,1481 @@ + +#include "bcachefs.h" +#include "acl.h" +#include "btree_update.h" +#include "buckets.h" +#include "chardev.h" +#include "dirent.h" +#include "extents.h" +#include "fs.h" +#include "fs-gc.h" +#include "fs-io.h" +#include "inode.h" +#include "journal.h" +#include "keylist.h" +#include "super.h" +#include "xattr.h" + +#include <linux/aio.h> +#include <linux/backing-dev.h> +#include <linux/compat.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/random.h> +#include <linux/statfs.h> +#include <linux/xattr.h> + +static struct kmem_cache *bch2_inode_cache; + +static void bch2_vfs_inode_init(struct bch_fs *, + struct bch_inode_info *, + struct bch_inode_unpacked *); + +/* + * I_SIZE_DIRTY requires special handling: + * + * To the recovery code, the flag means that there is stale data past i_size + * that needs to be deleted; it's used for implementing atomic appends and + * truncates. + * + * On append, we set I_SIZE_DIRTY before doing the write, then after the write + * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size + * that exposes the data we just wrote. + * + * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting + * i_size to the new smaller size, then we delete the data that we just made + * invisible, and then we clear I_SIZE_DIRTY. + * + * Because there can be multiple appends in flight at a time, we need a refcount + * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero + * refcount means I_SIZE_DIRTY is set, zero means it's cleared. + * + * Because write_inode() can be called at any time, i_size_dirty_count means + * something different to the runtime code - it means to write_inode() "don't + * update i_size yet". + * + * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when + * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must + * be set explicitly. + */ + +int __must_check __bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *ei, + inode_set_fn set, + void *p) +{ + struct btree_iter iter; + struct inode *inode = &ei->vfs_inode; + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + u64 inum = inode->i_ino; + unsigned i_nlink = READ_ONCE(inode->i_nlink); + int ret; + + /* + * We can't write an inode with i_nlink == 0 because it's stored biased; + * however, we don't need to because if i_nlink is 0 the inode is + * getting deleted when it's evicted. 
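+	 *
+	 * ("Stored biased": the on disk i_nlink is i_nlink minus
+	 * nlink_bias(i_mode) - see below - so a link count of zero has no on
+	 * disk representation.)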
+ */ + if (!i_nlink) + return 0; + + lockdep_assert_held(&ei->update_lock); + + bch2_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0)); + + do { + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + + if ((ret = btree_iter_err(k))) + goto out; + + if (WARN_ONCE(k.k->type != BCH_INODE_FS, + "inode %llu not found when updating", inum)) { + bch2_btree_iter_unlock(&iter); + return -ENOENT; + } + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + if (WARN_ONCE(ret, + "error %i unpacking inode %llu", ret, inum)) { + ret = -ENOENT; + break; + } + + if (set) { + ret = set(ei, &inode_u, p); + if (ret) + goto out; + } + + BUG_ON(i_nlink < nlink_bias(inode->i_mode)); + + inode_u.i_mode = inode->i_mode; + inode_u.i_uid = i_uid_read(inode); + inode_u.i_gid = i_gid_read(inode); + inode_u.i_nlink = i_nlink - nlink_bias(inode->i_mode); + inode_u.i_dev = inode->i_rdev; + inode_u.i_atime = timespec_to_bch2_time(c, inode->i_atime); + inode_u.i_mtime = timespec_to_bch2_time(c, inode->i_mtime); + inode_u.i_ctime = timespec_to_bch2_time(c, inode->i_ctime); + + bch2_inode_pack(&inode_p, &inode_u); + + ret = bch2_btree_insert_at(c, NULL, NULL, &ei->journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); + } while (ret == -EINTR); + + if (!ret) { + ei->i_size = inode_u.i_size; + ei->i_flags = inode_u.i_flags; + } +out: + bch2_btree_iter_unlock(&iter); + + return ret < 0 ? ret : 0; +} + +int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *ei) +{ + return __bch2_write_inode(c, ei, NULL, NULL); +} + +int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *ei) +{ + int ret; + + mutex_lock(&ei->update_lock); + inc_nlink(&ei->vfs_inode); + ret = bch2_write_inode(c, ei); + mutex_unlock(&ei->update_lock); + + return ret; +} + +int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *ei) +{ + int ret = 0; + + mutex_lock(&ei->update_lock); + drop_nlink(&ei->vfs_inode); + ret = bch2_write_inode(c, ei); + mutex_unlock(&ei->update_lock); + + return ret; +} + +static struct inode *bch2_vfs_inode_get(struct super_block *sb, u64 inum) +{ + struct bch_fs *c = sb->s_fs_info; + struct inode *inode; + struct bch_inode_unpacked inode_u; + struct bch_inode_info *ei; + int ret; + + pr_debug("inum %llu", inum); + + inode = iget_locked(sb, inum); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ret = bch2_inode_find_by_inum(c, inum, &inode_u); + if (ret) { + iget_failed(inode); + return ERR_PTR(ret); + } + + ei = to_bch_ei(inode); + bch2_vfs_inode_init(c, ei, &inode_u); + + ei->journal_seq = bch2_inode_journal_seq(&c->journal, inum); + + unlock_new_inode(inode); + + return inode; +} + +static struct inode *bch2_vfs_inode_create(struct bch_fs *c, + struct inode *parent, + umode_t mode, dev_t rdev) +{ + struct inode *inode; + struct posix_acl *default_acl = NULL, *acl = NULL; + struct bch_inode_info *ei; + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + int ret; + + inode = new_inode(parent->i_sb); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + inode_init_owner(inode, parent, mode); + + ret = posix_acl_create(parent, &inode->i_mode, &default_acl, &acl); + if (ret) { + make_bad_inode(inode); + goto err; + } + + ei = to_bch_ei(inode); + + bch2_inode_init(c, &inode_u, i_uid_read(inode), + i_gid_read(inode), inode->i_mode, rdev); + bch2_inode_pack(&inode_p, &inode_u); + + ret = bch2_inode_create(c, &inode_p.inode.k_i, + BLOCKDEV_INODE_MAX, 0, + 
&c->unused_inode_hint); + if (unlikely(ret)) { + /* + * indicate to bch_evict_inode that the inode was never actually + * created: + */ + make_bad_inode(inode); + goto err; + } + + inode_u.inum = inode_p.inode.k.p.inode; + bch2_vfs_inode_init(c, ei, &inode_u); + + if (default_acl) { + ret = bch2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (unlikely(ret)) + goto err; + } + + if (acl) { + ret = bch2_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (unlikely(ret)) + goto err; + } + + insert_inode_hash(inode); + atomic_long_inc(&c->nr_inodes); +out: + posix_acl_release(default_acl); + posix_acl_release(acl); + return inode; +err: + clear_nlink(inode); + iput(inode); + inode = ERR_PTR(ret); + goto out; +} + +static int bch2_vfs_dirent_create(struct bch_fs *c, struct inode *dir, + u8 type, const struct qstr *name, + struct inode *dst) +{ + struct bch_inode_info *dir_ei = to_bch_ei(dir); + int ret; + + ret = bch2_dirent_create(c, dir->i_ino, &dir_ei->str_hash, + type, name, dst->i_ino, + &dir_ei->journal_seq, + BCH_HASH_SET_MUST_CREATE); + if (unlikely(ret)) + return ret; + + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + mark_inode_dirty_sync(dir); + return 0; +} + +static int __bch2_create(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct bch_inode_info *dir_ei = to_bch_ei(dir); + struct bch_fs *c = dir->i_sb->s_fs_info; + struct inode *inode; + struct bch_inode_info *ei; + int ret; + + inode = bch2_vfs_inode_create(c, dir, mode, rdev); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + ei = to_bch_ei(inode); + + ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode), + &dentry->d_name, inode); + if (unlikely(ret)) { + clear_nlink(inode); + iput(inode); + return ret; + } + + if (dir_ei->journal_seq > ei->journal_seq) + ei->journal_seq = dir_ei->journal_seq; + + d_instantiate(dentry, inode); + return 0; +} + +/* methods */ + +static struct dentry *bch2_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct bch_fs *c = dir->i_sb->s_fs_info; + struct bch_inode_info *dir_ei = to_bch_ei(dir); + struct inode *inode = NULL; + u64 inum; + + inum = bch2_dirent_lookup(c, dir->i_ino, + &dir_ei->str_hash, + &dentry->d_name); + + if (inum) + inode = bch2_vfs_inode_get(dir->i_sb, inum); + + return d_splice_alias(inode, dentry); +} + +static int bch2_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return __bch2_create(dir, dentry, mode|S_IFREG, 0); +} + +static int bch2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct bch_fs *c = dir->i_sb->s_fs_info; + struct inode *inode = old_dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + int ret; + + lockdep_assert_held(&inode->i_rwsem); + + inode->i_ctime = current_fs_time(dir->i_sb); + + ret = bch2_inc_nlink(c, ei); + if (ret) + return ret; + + ihold(inode); + + ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode), + &dentry->d_name, inode); + if (unlikely(ret)) { + bch2_dec_nlink(c, ei); + iput(inode); + return ret; + } + + d_instantiate(dentry, inode); + return 0; +} + +static int bch2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct bch_fs *c = dir->i_sb->s_fs_info; + struct bch_inode_info *dir_ei = to_bch_ei(dir); + struct inode *inode = dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + int ret; + + lockdep_assert_held(&inode->i_rwsem); + + ret = bch2_dirent_delete(c, dir->i_ino, &dir_ei->str_hash, + &dentry->d_name, &dir_ei->journal_seq); + if (ret) + 
return ret; + + if (dir_ei->journal_seq > ei->journal_seq) + ei->journal_seq = dir_ei->journal_seq; + + inode->i_ctime = dir->i_ctime; + + if (S_ISDIR(inode->i_mode)) { + bch2_dec_nlink(c, dir_ei); + drop_nlink(inode); + } + + bch2_dec_nlink(c, ei); + + return 0; +} + +static int bch2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct bch_fs *c = dir->i_sb->s_fs_info; + struct inode *inode; + struct bch_inode_info *ei, *dir_ei = to_bch_ei(dir); + int ret; + + inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + ei = to_bch_ei(inode); + + inode_lock(inode); + ret = page_symlink(inode, symname, strlen(symname) + 1); + inode_unlock(inode); + + if (unlikely(ret)) + goto err; + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (unlikely(ret)) + goto err; + + /* XXX: racy */ + if (dir_ei->journal_seq < ei->journal_seq) + dir_ei->journal_seq = ei->journal_seq; + + ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode); + if (unlikely(ret)) + goto err; + + d_instantiate(dentry, inode); + return 0; +err: + clear_nlink(inode); + iput(inode); + return ret; +} + +static int bch2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct bch_fs *c = dir->i_sb->s_fs_info; + int ret; + + lockdep_assert_held(&dir->i_rwsem); + + ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0); + if (unlikely(ret)) + return ret; + + bch2_inc_nlink(c, to_bch_ei(dir)); + + return 0; +} + +static int bch2_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct bch_fs *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + + if (bch2_empty_dir(c, inode->i_ino)) + return -ENOTEMPTY; + + return bch2_unlink(dir, dentry); +} + +static int bch2_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + return __bch2_create(dir, dentry, mode, rdev); +} + +static int bch2_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct bch_fs *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(old_inode); + struct inode *new_inode = new_dentry->d_inode; + struct timespec now = current_fs_time(old_dir->i_sb); + int ret; + + lockdep_assert_held(&old_dir->i_rwsem); + lockdep_assert_held(&new_dir->i_rwsem); + + if (new_inode) + filemap_write_and_wait_range(old_inode->i_mapping, + 0, LLONG_MAX); + + if (new_inode && S_ISDIR(old_inode->i_mode)) { + lockdep_assert_held(&new_inode->i_rwsem); + + if (!S_ISDIR(new_inode->i_mode)) + return -ENOTDIR; + + if (bch2_empty_dir(c, new_inode->i_ino)) + return -ENOTEMPTY; + + ret = bch2_dirent_rename(c, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name, + &ei->journal_seq, BCH_RENAME_OVERWRITE); + if (unlikely(ret)) + return ret; + + clear_nlink(new_inode); + bch2_dec_nlink(c, to_bch_ei(old_dir)); + } else if (new_inode) { + lockdep_assert_held(&new_inode->i_rwsem); + + ret = bch2_dirent_rename(c, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name, + &ei->journal_seq, BCH_RENAME_OVERWRITE); + if (unlikely(ret)) + return ret; + + new_inode->i_ctime = now; + bch2_dec_nlink(c, to_bch_ei(new_inode)); + } else if (S_ISDIR(old_inode->i_mode)) { + ret = bch2_dirent_rename(c, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name, + &ei->journal_seq, BCH_RENAME); + if (unlikely(ret)) + return ret; + + bch2_inc_nlink(c, to_bch_ei(new_dir)); + bch2_dec_nlink(c, 
to_bch_ei(old_dir)); + } else { + ret = bch2_dirent_rename(c, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name, + &ei->journal_seq, BCH_RENAME); + if (unlikely(ret)) + return ret; + } + + old_dir->i_ctime = old_dir->i_mtime = now; + new_dir->i_ctime = new_dir->i_mtime = now; + mark_inode_dirty_sync(old_dir); + mark_inode_dirty_sync(new_dir); + + old_inode->i_ctime = now; + mark_inode_dirty_sync(old_inode); + + return 0; +} + +static int bch2_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct bch_fs *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(old_inode); + struct timespec now = current_fs_time(old_dir->i_sb); + int ret; + + ret = bch2_dirent_rename(c, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name, + &ei->journal_seq, BCH_RENAME_EXCHANGE); + if (unlikely(ret)) + return ret; + + if (S_ISDIR(old_inode->i_mode) != + S_ISDIR(new_inode->i_mode)) { + if (S_ISDIR(old_inode->i_mode)) { + bch2_inc_nlink(c, to_bch_ei(new_dir)); + bch2_dec_nlink(c, to_bch_ei(old_dir)); + } else { + bch2_dec_nlink(c, to_bch_ei(new_dir)); + bch2_inc_nlink(c, to_bch_ei(old_dir)); + } + } + + old_dir->i_ctime = old_dir->i_mtime = now; + new_dir->i_ctime = new_dir->i_mtime = now; + mark_inode_dirty_sync(old_dir); + mark_inode_dirty_sync(new_dir); + + old_inode->i_ctime = now; + new_inode->i_ctime = now; + mark_inode_dirty_sync(old_inode); + mark_inode_dirty_sync(new_inode); + + return 0; +} + +static int bch2_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned flags) +{ + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return bch2_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); + + return bch2_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_fs *c = inode->i_sb->s_fs_info; + int ret = 0; + + lockdep_assert_held(&inode->i_rwsem); + + pr_debug("i_size was %llu update has %llu", + inode->i_size, iattr->ia_size); + + ret = setattr_prepare(dentry, iattr); + if (ret) + return ret; + + if (iattr->ia_valid & ATTR_SIZE) { + ret = bch2_truncate(inode, iattr); + } else { + mutex_lock(&ei->update_lock); + setattr_copy(inode, iattr); + ret = bch2_write_inode(c, ei); + mutex_unlock(&ei->update_lock); + } + + if (unlikely(ret)) + return ret; + + if (iattr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(inode, inode->i_mode); + + return ret; +} + +static int bch2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct bch_fs *c = dir->i_sb->s_fs_info; + struct inode *inode; + + /* XXX: i_nlink should be 0? 
*/ + inode = bch2_vfs_inode_create(c, dir, mode, 0); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + d_tmpfile(dentry, inode); + return 0; +} + +static int bch2_fill_extent(struct fiemap_extent_info *info, + const struct bkey_i *k, unsigned flags) +{ + if (bkey_extent_is_data(&k->k)) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + const union bch_extent_crc *crc; + int ret; + + extent_for_each_ptr_crc(e, ptr, crc) { + int flags2 = 0; + u64 offset = ptr->offset; + + if (crc_compression_type(crc)) + flags2 |= FIEMAP_EXTENT_ENCODED; + else + offset += crc_offset(crc); + + if ((offset & (PAGE_SECTORS - 1)) || + (e.k->size & (PAGE_SECTORS - 1))) + flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; + + ret = fiemap_fill_next_extent(info, + bkey_start_offset(e.k) << 9, + offset << 9, + e.k->size << 9, flags|flags2); + if (ret) + return ret; + } + + return 0; + } else if (k->k.type == BCH_RESERVATION) { + return fiemap_fill_next_extent(info, + bkey_start_offset(&k->k) << 9, + 0, k->k.size << 9, + flags| + FIEMAP_EXTENT_DELALLOC| + FIEMAP_EXTENT_UNWRITTEN); + } else { + BUG(); + } +} + +static int bch2_fiemap(struct inode *inode, struct fiemap_extent_info *info, + u64 start, u64 len) +{ + struct bch_fs *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + BKEY_PADDED(k) tmp; + bool have_extent = false; + int ret = 0; + + if (start + len < start) + return -EINVAL; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, start >> 9), k) + if (bkey_extent_is_data(k.k) || + k.k->type == BCH_RESERVATION) { + if (bkey_cmp(bkey_start_pos(k.k), + POS(inode->i_ino, (start + len) >> 9)) >= 0) + break; + + if (have_extent) { + ret = bch2_fill_extent(info, &tmp.k, 0); + if (ret) + goto out; + } + + bkey_reassemble(&tmp.k, k); + have_extent = true; + } + + if (have_extent) + ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); +out: + bch2_btree_iter_unlock(&iter); + return ret < 0 ? 
ret : 0; +} + +static const struct vm_operations_struct bch_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = bch2_page_mkwrite, +}; + +static int bch2_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + + vma->vm_ops = &bch_vm_ops; + return 0; +} + +/* Inode flags: */ + +static const unsigned bch_inode_flags_to_vfs_flags_map[] = { + [__BCH_INODE_SYNC] = S_SYNC, + [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, + [__BCH_INODE_APPEND] = S_APPEND, + [__BCH_INODE_NOATIME] = S_NOATIME, +}; + +static const unsigned bch_inode_flags_to_user_flags_map[] = { + [__BCH_INODE_SYNC] = FS_SYNC_FL, + [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, + [__BCH_INODE_APPEND] = FS_APPEND_FL, + [__BCH_INODE_NODUMP] = FS_NODUMP_FL, + [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +}; + +/* Set VFS inode flags from bcachefs inode: */ +static void bch2_inode_flags_to_vfs(struct inode *inode) +{ + unsigned i, flags = to_bch_ei(inode)->i_flags; + + for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_vfs_flags_map); i++) + if (flags & (1 << i)) + inode->i_flags |= bch_inode_flags_to_vfs_flags_map[i]; + else + inode->i_flags &= ~bch_inode_flags_to_vfs_flags_map[i]; +} + +/* Get FS_IOC_GETFLAGS flags from bcachefs inode: */ +static unsigned bch2_inode_flags_to_user_flags(unsigned flags) +{ + unsigned i, ret = 0; + + for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++) + if (flags & (1 << i)) + ret |= bch_inode_flags_to_user_flags_map[i]; + + return ret; +} + +static int bch2_inode_user_flags_set(struct bch_inode_info *ei, + struct bch_inode_unpacked *bi, + void *p) +{ + /* + * We're relying on btree locking here for exclusion with other ioctl + * calls - use the flags in the btree (@bi), not ei->i_flags: + */ + unsigned bch_flags = bi->i_flags; + unsigned oldflags = bch2_inode_flags_to_user_flags(bch_flags); + unsigned newflags = *((unsigned *) p); + unsigned i; + + if (((newflags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++) { + if (newflags & bch_inode_flags_to_user_flags_map[i]) + bch_flags |= (1 << i); + else + bch_flags &= ~(1 << i); + + newflags &= ~bch_inode_flags_to_user_flags_map[i]; + oldflags &= ~bch_inode_flags_to_user_flags_map[i]; + } + + if (oldflags != newflags) + return -EOPNOTSUPP; + + bi->i_flags = bch_flags; + ei->vfs_inode.i_ctime = current_fs_time(ei->vfs_inode.i_sb); + + return 0; +} + +#define FS_IOC_GOINGDOWN _IOR ('X', 125, __u32) + +static long bch2_fs_file_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct bch_fs *c = sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + unsigned flags; + int ret; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return put_user(bch2_inode_flags_to_user_flags(ei->i_flags), + (int __user *) arg); + + case FS_IOC_SETFLAGS: { + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto setflags_out; + } + + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } + + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + (flags & (FS_NODUMP_FL|FS_NOATIME_FL)) != flags) { + ret = -EINVAL; + goto setflags_out; + } + + inode_lock(inode); + + mutex_lock(&ei->update_lock); + ret = __bch2_write_inode(c, ei, bch2_inode_user_flags_set, &flags); + mutex_unlock(&ei->update_lock); + + if 
(!ret)
+			bch2_inode_flags_to_vfs(inode);
+
+		inode_unlock(inode);
+setflags_out:
+		mnt_drop_write_file(filp);
+		return ret;
+	}
+
+	case FS_IOC_GETVERSION:
+		return -ENOTTY;
+	case FS_IOC_SETVERSION:
+		return -ENOTTY;
+
+	case FS_IOC_GOINGDOWN:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		down_write(&sb->s_umount);
+		sb->s_flags |= MS_RDONLY;
+		bch2_fs_emergency_read_only(c);
+		up_write(&sb->s_umount);
+		return 0;
+
+	default:
+		return bch2_fs_ioctl(c, cmd, (void __user *) arg);
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long bch2_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+/* Directories: */
+
+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	return generic_file_llseek_size(file, offset, whence,
+					S64_MAX, S64_MAX);
+}
+
+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct bch_fs *c = inode->i_sb->s_fs_info;
+
+	return bch2_readdir(c, file, ctx);
+}
+
+static const struct file_operations bch_file_operations = {
+	.llseek		= bch2_llseek,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= bch2_write_iter,
+	.mmap		= bch2_mmap,
+	.open		= generic_file_open,
+	.fsync		= bch2_fsync,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= bch2_fallocate_dispatch,
+	.unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= bch2_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_file_inode_operations = {
+	.setattr	= bch2_setattr,
+	.fiemap		= bch2_fiemap,
+	.listxattr	= bch2_xattr_list,
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+};
+
+static const struct inode_operations bch_dir_inode_operations = {
+	.lookup		= bch2_lookup,
+	.create		= bch2_create,
+	.link		= bch2_link,
+	.unlink		= bch2_unlink,
+	.symlink	= bch2_symlink,
+	.mkdir		= bch2_mkdir,
+	.rmdir		= bch2_rmdir,
+	.mknod		= bch2_mknod,
+	.rename		= bch2_rename2,
+	.setattr	= bch2_setattr,
+	.tmpfile	= bch2_tmpfile,
+	.listxattr	= bch2_xattr_list,
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+};
+
+static const struct file_operations bch_dir_file_operations = {
+	.llseek		= bch2_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate	= bch2_vfs_readdir,
+	.fsync		= bch2_fsync,
+	.unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= bch2_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.get_link	= page_get_link,
+	.setattr	= bch2_setattr,
+	.listxattr	= bch2_xattr_list,
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+};
+
+static const struct inode_operations bch_special_inode_operations = {
+	.setattr	= bch2_setattr,
+	.listxattr	= bch2_xattr_list,
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+};
+
+static const struct address_space_operations bch_address_space_operations = {
+	.writepage	= bch2_writepage,
+	.readpage	= bch2_readpage,
+	.writepages	= bch2_writepages,
+	.readpages	= bch2_readpages,
+	.set_page_dirty	= bch2_set_page_dirty,
+	.write_begin	= bch2_write_begin,
+	.write_end	= bch2_write_end,
+	.invalidatepage	= bch2_invalidatepage,
+	.releasepage	
= bch2_releasepage, + .direct_IO = bch2_direct_IO, +#ifdef CONFIG_MIGRATION + .migratepage = bch2_migrate_page, +#endif + .error_remove_page = generic_error_remove_page, +}; + +static void bch2_vfs_inode_init(struct bch_fs *c, + struct bch_inode_info *ei, + struct bch_inode_unpacked *bi) +{ + struct inode *inode = &ei->vfs_inode; + + pr_debug("init inode %llu with mode %o", + bi->inum, bi->i_mode); + + ei->i_flags = bi->i_flags; + ei->i_size = bi->i_size; + + inode->i_mode = bi->i_mode; + i_uid_write(inode, bi->i_uid); + i_gid_write(inode, bi->i_gid); + + atomic64_set(&ei->i_sectors, bi->i_sectors); + inode->i_blocks = bi->i_sectors; + + inode->i_ino = bi->inum; + set_nlink(inode, bi->i_nlink + nlink_bias(inode->i_mode)); + inode->i_rdev = bi->i_dev; + inode->i_generation = bi->i_generation; + inode->i_size = bi->i_size; + inode->i_atime = bch2_time_to_timespec(c, bi->i_atime); + inode->i_mtime = bch2_time_to_timespec(c, bi->i_mtime); + inode->i_ctime = bch2_time_to_timespec(c, bi->i_ctime); + bch2_inode_flags_to_vfs(inode); + + ei->str_hash = bch2_hash_info_init(c, bi); + + inode->i_mapping->a_ops = &bch_address_space_operations; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &bch_file_inode_operations; + inode->i_fop = &bch_file_operations; + break; + case S_IFDIR: + inode->i_op = &bch_dir_inode_operations; + inode->i_fop = &bch_dir_file_operations; + break; + case S_IFLNK: + inode_nohighmem(inode); + inode->i_op = &bch_symlink_inode_operations; + break; + default: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &bch_special_inode_operations; + break; + } +} + +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + struct bch_inode_info *ei; + + ei = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!ei) + return NULL; + + pr_debug("allocated %p", &ei->vfs_inode); + + inode_init_once(&ei->vfs_inode); + mutex_init(&ei->update_lock); + ei->journal_seq = 0; + atomic_long_set(&ei->i_size_dirty_count, 0); + atomic_long_set(&ei->i_sectors_dirty_count, 0); + + return &ei->vfs_inode; +} + +static void bch2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(bch2_inode_cache, to_bch_ei(inode)); +} + +static void bch2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bch2_i_callback); +} + +static int bch2_vfs_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + struct bch_fs *c = inode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + int ret; + + mutex_lock(&ei->update_lock); + ret = bch2_write_inode(c, ei); + mutex_unlock(&ei->update_lock); + + if (c->opts.journal_flush_disabled) + return ret; + + if (!ret && wbc->sync_mode == WB_SYNC_ALL) + ret = bch2_journal_flush_seq(&c->journal, ei->journal_seq); + + return ret; +} + +static void bch2_evict_inode(struct inode *inode) +{ + struct bch_fs *c = inode->i_sb->s_fs_info; + + truncate_inode_pages_final(&inode->i_data); + + if (!bch2_journal_error(&c->journal) && !is_bad_inode(inode)) { + struct bch_inode_info *ei = to_bch_ei(inode); + + /* XXX - we want to check this stuff iff there weren't IO errors: */ + BUG_ON(atomic_long_read(&ei->i_sectors_dirty_count)); + BUG_ON(atomic64_read(&ei->i_sectors) != inode->i_blocks); + } + + clear_inode(inode); + + if (!inode->i_nlink && !is_bad_inode(inode)) { + bch2_inode_rm(c, inode->i_ino); + atomic_long_dec(&c->nr_inodes); + } +} + +static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = 
dentry->d_sb; + struct bch_fs *c = sb->s_fs_info; + u64 fsid; + + buf->f_type = BCACHE_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; + buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c)) >> PAGE_SECTOR_SHIFT; + buf->f_bavail = buf->f_bfree; + buf->f_files = atomic_long_read(&c->nr_inodes); + buf->f_ffree = U64_MAX; + + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = NAME_MAX; + + return 0; +} + +static int bch2_sync_fs(struct super_block *sb, int wait) +{ + struct bch_fs *c = sb->s_fs_info; + + if (!wait) { + bch2_journal_flush_async(&c->journal, NULL); + return 0; + } + + return bch2_journal_flush(&c->journal); +} + +static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, + struct bch_opts opts) +{ + size_t nr_devs = 0, i = 0; + char *dev_name, *s, **devs; + struct bch_fs *c = NULL; + const char *err = "cannot allocate memory"; + + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return NULL; + + for (s = dev_name; s; s = strchr(s + 1, ':')) + nr_devs++; + + devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); + if (!devs) + goto err; + + for (i = 0, s = dev_name; + s; + (s = strchr(s, ':')) && (*s++ = '\0')) + devs[i++] = s; + + err = bch2_fs_open(devs, nr_devs, opts, &c); + if (err) { + /* + * Already open? + * Look up each block device, make sure they all belong to a + * filesystem and they all belong to the _same_ filesystem + */ + + for (i = 0; i < nr_devs; i++) { + struct block_device *bdev = lookup_bdev(devs[i]); + struct bch_fs *c2; + + if (IS_ERR(bdev)) + goto err; + + c2 = bch2_bdev_to_fs(bdev); + bdput(bdev); + + if (!c) + c = c2; + else if (c2) + closure_put(&c2->cl); + + if (!c) + goto err; + if (c != c2) { + closure_put(&c->cl); + goto err; + } + } + + mutex_lock(&c->state_lock); + + if (!bch2_fs_running(c)) { + mutex_unlock(&c->state_lock); + closure_put(&c->cl); + err = "incomplete filesystem"; + c = NULL; + goto err; + } + + mutex_unlock(&c->state_lock); + } + + set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); +err: + kfree(devs); + kfree(dev_name); + + if (!c) + pr_err("bch_fs_open err %s", err); + return c; +} + +static int bch2_remount(struct super_block *sb, int *flags, char *data) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_opts opts = bch2_opts_empty(); + int ret; + + opts.read_only = (*flags & MS_RDONLY) != 0; + + ret = bch2_parse_mount_opts(&opts, data); + if (ret) + return ret; + + if (opts.read_only >= 0 && + opts.read_only != c->opts.read_only) { + const char *err = NULL; + + if (opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= MS_RDONLY; + } else { + err = bch2_fs_read_write(c); + if (err) { + bch_err(c, "error going rw: %s", err); + return -EINVAL; + } + + sb->s_flags &= ~MS_RDONLY; + } + + c->opts.read_only = opts.read_only; + } + + if (opts.errors >= 0) + c->opts.errors = opts.errors; + + return ret; +} + +static const struct super_operations bch_super_operations = { + .alloc_inode = bch2_alloc_inode, + .destroy_inode = bch2_destroy_inode, + .write_inode = bch2_vfs_write_inode, + .evict_inode = bch2_evict_inode, + .sync_fs = bch2_sync_fs, + .statfs = bch2_statfs, + .show_options = generic_show_options, + .remount_fs = bch2_remount, +#if 0 + .put_super = bch2_put_super, + .freeze_fs = bch2_freeze, + .unfreeze_fs = bch2_unfreeze, +#endif +}; + +static int bch2_test_super(struct super_block 
*s, void *data) +{ + return s->s_fs_info == data; +} + +static int bch2_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return 0; +} + +static struct dentry *bch2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct bch_fs *c; + struct bch_dev *ca; + struct super_block *sb; + struct inode *inode; + struct bch_opts opts = bch2_opts_empty(); + unsigned i; + int ret; + + opts.read_only = (flags & MS_RDONLY) != 0; + + ret = bch2_parse_mount_opts(&opts, data); + if (ret) + return ERR_PTR(ret); + + c = bch2_open_as_blockdevs(dev_name, opts); + if (!c) + return ERR_PTR(-ENOENT); + + sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c); + if (IS_ERR(sb)) { + closure_put(&c->cl); + return ERR_CAST(sb); + } + + BUG_ON(sb->s_fs_info != c); + + if (sb->s_root) { + closure_put(&c->cl); + + if ((flags ^ sb->s_flags) & MS_RDONLY) { + ret = -EBUSY; + goto err_put_super; + } + goto out; + } + + /* XXX: blocksize */ + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &bch_super_operations; + sb->s_xattr = bch2_xattr_handlers; + sb->s_magic = BCACHE_STATFS_MAGIC; + sb->s_time_gran = c->sb.time_precision; + c->vfs_sb = sb; + sb->s_bdi = &c->bdi; + strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); + + for_each_online_member(ca, c, i) { + struct block_device *bdev = ca->disk_sb.bdev; + + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + percpu_ref_put(&ca->io_ref); + break; + } + + if (opts.posix_acl < 0) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags |= opts.posix_acl ? MS_POSIXACL : 0; + + inode = bch2_vfs_inode_get(sb, BCACHE_ROOT_INO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto err_put_super; + } + + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + ret = -ENOMEM; + goto err_put_super; + } + + sb->s_flags |= MS_ACTIVE; +out: + return dget(sb->s_root); + +err_put_super: + deactivate_locked_super(sb); + return ERR_PTR(ret); +} + +static void bch2_kill_sb(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + generic_shutdown_super(sb); + + if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) + bch2_fs_stop(c); + else + closure_put(&c->cl); +} + +static struct file_system_type bcache_fs_type = { + .owner = THIS_MODULE, + .name = "bcachefs", + .mount = bch2_mount, + .kill_sb = bch2_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +MODULE_ALIAS_FS("bcachefs"); + +void bch2_vfs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); + if (bch2_dio_write_bioset) + bioset_free(bch2_dio_write_bioset); + if (bch2_dio_read_bioset) + bioset_free(bch2_dio_read_bioset); + if (bch2_writepage_bioset) + bioset_free(bch2_writepage_bioset); + if (bch2_inode_cache) + kmem_cache_destroy(bch2_inode_cache); +} + +int __init bch2_vfs_init(void) +{ + int ret = -ENOMEM; + + bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); + if (!bch2_inode_cache) + goto err; + + bch2_writepage_bioset = + bioset_create(4, offsetof(struct bch_writepage_io, bio.bio)); + if (!bch2_writepage_bioset) + goto err; + + bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio)); + if (!bch2_dio_read_bioset) + goto err; + + bch2_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio)); + if (!bch2_dio_write_bioset) + goto err; + + ret = register_filesystem(&bcache_fs_type); + if (ret) + goto err; + + return 0; +err: + bch2_vfs_exit(); + return ret; +} diff --git a/libbcachefs/fs.h 
b/libbcachefs/fs.h new file mode 100644 index 00000000..f7cad296 --- /dev/null +++ b/libbcachefs/fs.h @@ -0,0 +1,65 @@ +#ifndef _BCACHE_FS_H +#define _BCACHE_FS_H + +#include "str_hash.h" + +#include <linux/seqlock.h> + +struct bch_inode_info { + struct inode vfs_inode; + + struct mutex update_lock; + u64 journal_seq; + + atomic_long_t i_size_dirty_count; + + /* + * these are updated whenever we update the inode in the btree - for + * e.g. fsync + */ + u64 i_size; + u32 i_flags; + + atomic_long_t i_sectors_dirty_count; + atomic64_t i_sectors; + + struct bch_hash_info str_hash; +}; + +#define to_bch_ei(_inode) \ + container_of(_inode, struct bch_inode_info, vfs_inode) + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +struct bch_inode_unpacked; + +#ifndef NO_BCACHE_FS + +/* returns 0 if we want to do the update, or error is passed up */ +typedef int (*inode_set_fn)(struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *); +int __must_check bch2_write_inode(struct bch_fs *, + struct bch_inode_info *); + +void bch2_vfs_exit(void); +int bch2_vfs_init(void); + +#else + +static inline void bch2_vfs_exit(void) {} +static inline int bch2_vfs_init(void) { return 0; } + +#endif + +#endif /* _BCACHE_FS_H */ diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c new file mode 100644 index 00000000..7a8467c4 --- /dev/null +++ b/libbcachefs/inode.c @@ -0,0 +1,451 @@ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "inode.h" +#include "io.h" +#include "keylist.h" + +#include <linux/random.h> + +#include <asm/unaligned.h> + +#define FIELD_BYTES() \ + +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; +static const u8 bits_table[8] = { + 1 * 8 - 1, + 2 * 8 - 2, + 3 * 8 - 3, + 4 * 8 - 4, + 6 * 8 - 5, + 8 * 8 - 6, + 10 * 8 - 7, + 13 * 8 - 8, +}; + +static int inode_encode_field(u8 *out, u8 *end, const u64 in[2]) +{ + unsigned bytes, bits, shift; + + if (likely(!in[1])) + bits = fls64(in[0]); + else + bits = fls64(in[1]) + 64; + + for (shift = 1; shift <= 8; shift++) + if (bits < bits_table[shift - 1]) + goto got_shift; + + BUG(); +got_shift: + bytes = byte_table[shift - 1]; + + BUG_ON(out + bytes > end); + + if (likely(bytes <= 8)) { + u64 b = cpu_to_be64(in[0]); + + memcpy(out, (void *) &b + 8 - bytes, bytes); + } else { + u64 b = cpu_to_be64(in[1]); + + memcpy(out, (void *) &b + 16 - bytes, bytes); + put_unaligned_be64(in[0], out + bytes - 8); + } + + *out |= (1 << 8) >> shift; + + return bytes; +} + +static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) +{ + unsigned bytes, bits, shift; + + if (in >= end) + return -1; + + if (!*in) + return -1; + + /* + * position of highest set bit indicates number of bytes: + * shift = number of bits to remove in high byte: + */ + shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ + bytes = byte_table[shift - 1]; + bits = bytes * 8 - shift; + + if (in + bytes > end) + return -1; + + /* + * we're assuming it's safe to deref up to 7 bytes < in; this will work + * because keys always start quite a bit more than 7 bytes after the + * start of the btree node header: + */ + if (likely(bytes <= 8)) { + out[0] = get_unaligned_be64(in + bytes - 8); + out[0] <<= 64 - bits; + out[0] >>= 64 - bits; + out[1] = 0; + } else { + out[0] = 
get_unaligned_be64(in + bytes - 8); + out[1] = get_unaligned_be64(in + bytes - 16); + out[1] <<= 128 - bits; + out[1] >>= 128 - bits; + } + + *out_bits = out[1] ? 64 + fls64(out[1]) : fls64(out[0]); + return bytes; +} + +void bch2_inode_pack(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + u8 *out = packed->inode.v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + u64 field[2]; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + + bkey_inode_init(&packed->inode.k_i); + packed->inode.k.p.inode = inode->inum; + packed->inode.v.i_hash_seed = inode->i_hash_seed; + packed->inode.v.i_flags = cpu_to_le32(inode->i_flags); + packed->inode.v.i_mode = cpu_to_le16(inode->i_mode); + +#define BCH_INODE_FIELD(_name, _bits) \ + field[0] = inode->_name; \ + field[1] = 0; \ + out += inode_encode_field(out, end, field); \ + nr_fields++; \ + \ + if (field[0] | field[1]) { \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } + + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v); + memset(out, 0, + (u8 *) &packed->inode.v + + bkey_val_bytes(&packed->inode.k) - out); + + SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; + + int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), + &unpacked); + BUG_ON(ret); + BUG_ON(unpacked.inum != inode->inum); + BUG_ON(unpacked.i_hash_seed != inode->i_hash_seed); + BUG_ON(unpacked.i_mode != inode->i_mode); + +#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name); + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + } +} + +int bch2_inode_unpack(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); + u64 field[2]; + unsigned fieldnr = 0, field_bits; + int ret; + + unpacked->inum = inode.k->p.inode; + unpacked->i_hash_seed = inode.v->i_hash_seed; + unpacked->i_flags = le32_to_cpu(inode.v->i_flags); + unpacked->i_mode = le16_to_cpu(inode.v->i_mode); + +#define BCH_INODE_FIELD(_name, _bits) \ + if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ + memset(&unpacked->_name, 0, \ + sizeof(*unpacked) - \ + offsetof(struct bch_inode_unpacked, _name)); \ + return 0; \ + } \ + \ + ret = inode_decode_field(in, end, field, &field_bits); \ + if (ret < 0) \ + return ret; \ + \ + if (field_bits > sizeof(unpacked->_name) * 8) \ + return -1; \ + \ + unpacked->_name = field[0]; \ + in += ret; + + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + + /* XXX: signal if there were more fields than expected? 
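+	 *
+	 * (Field encoding recap, as a worked example: each field's first
+	 * byte encodes its length. The value 300 needs fls64(300) == 9
+	 * bits; 9 >= 7 but 9 < 14, so shift == 2 and bytes == 2, and it
+	 * is stored big-endian as 0x41 0x2c - i.e. 0x012c with bit 6 of
+	 * the first byte, (1 << 8) >> 2, set as the length marker.)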
*/ + + return 0; +} + +static const char *bch2_inode_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + if (k.k->p.offset) + return "nonzero offset"; + + switch (k.k->type) { + case BCH_INODE_FS: { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; + + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; + + if (k.k->p.inode < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; + + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; + + if (bch2_inode_unpack(inode, &unpacked)) + return "invalid variable length fields"; + + return NULL; + } + case BCH_INODE_BLOCKDEV: + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev)) + return "incorrect value size"; + + if (k.k->p.inode >= BLOCKDEV_INODE_MAX) + return "blockdev inode in fs range"; + + return NULL; + default: + return "invalid type"; + } +} + +static void bch2_inode_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_inode inode; + struct bch_inode_unpacked unpacked; + + switch (k.k->type) { + case BCH_INODE_FS: + inode = bkey_s_c_to_inode(k); + if (bch2_inode_unpack(inode, &unpacked)) { + scnprintf(buf, size, "(unpack error)"); + break; + } + + scnprintf(buf, size, "i_size %llu", unpacked.i_size); + break; + } +} + +const struct bkey_ops bch2_bkey_inode_ops = { + .key_invalid = bch2_inode_invalid, + .val_to_text = bch2_inode_to_text, +}; + +void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev) +{ + s64 now = timespec_to_bch2_time(c, CURRENT_TIME); + + memset(inode_u, 0, sizeof(*inode_u)); + + /* ick */ + inode_u->i_flags |= c->sb.str_hash_type << INODE_STR_HASH_OFFSET; + get_random_bytes(&inode_u->i_hash_seed, sizeof(inode_u->i_hash_seed)); + + inode_u->i_mode = mode; + inode_u->i_uid = uid; + inode_u->i_gid = gid; + inode_u->i_dev = rdev; + inode_u->i_atime = now; + inode_u->i_mtime = now; + inode_u->i_ctime = now; + inode_u->i_otime = now; +} + +int bch2_inode_create(struct bch_fs *c, struct bkey_i *inode, + u64 min, u64 max, u64 *hint) +{ + struct btree_iter iter; + bool searched_from_start = false; + int ret; + + if (!max) + max = ULLONG_MAX; + + if (c->opts.inodes_32bit) + max = min_t(u64, max, U32_MAX); + + if (*hint >= max || *hint < min) + *hint = min; + + if (*hint == min) + searched_from_start = true; +again: + bch2_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0)); + + while (1) { + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + + ret = btree_iter_err(k); + if (ret) { + bch2_btree_iter_unlock(&iter); + return ret; + } + + if (k.k->type < BCH_INODE_FS) { + inode->k.p = k.k->p; + + pr_debug("inserting inode %llu (size %u)", + inode->k.p.inode, inode->k.u64s); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&iter, inode)); + + if (ret == -EINTR) + continue; + + bch2_btree_iter_unlock(&iter); + if (!ret) + *hint = k.k->p.inode + 1; + + return ret; + } else { + if (iter.pos.inode == max) + break; + /* slot used */ + bch2_btree_iter_advance_pos(&iter); + } + } + bch2_btree_iter_unlock(&iter); + + if (!searched_from_start) { + /* Retry from start */ + *hint = min; + searched_from_start = true; + goto again; + } + + return -ENOSPC; +} + +int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, + struct extent_insert_hook *hook, u64 *journal_seq) +{ + return bch2_discard(c, POS(inode_nr, new_size), POS(inode_nr + 
1, 0),
+			    ZERO_VERSION, NULL, hook, journal_seq);
+}
+
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+{
+	struct bkey_i delete;
+	int ret;
+
+	ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+				      POS(inode_nr, 0),
+				      POS(inode_nr + 1, 0),
+				      ZERO_VERSION, NULL, NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * If this was a directory, there shouldn't be any real dirents left -
+	 * but there could be whiteouts (from hash collisions) that we should
+	 * delete:
+	 *
+	 * XXX: ideally the dirent code would delete whiteouts when they're no
+	 * longer needed
+	 */
+	ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+				      POS(inode_nr, 0),
+				      POS(inode_nr + 1, 0),
+				      ZERO_VERSION, NULL, NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	bkey_init(&delete.k);
+	delete.k.p.inode = inode_nr;
+
+	return bch2_btree_insert(c, BTREE_ID_INODES, &delete, NULL,
+				 NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+			    struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = -ENOENT;
+
+	for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
+				      POS(inode_nr, 0), k) {
+		switch (k.k->type) {
+		case BCH_INODE_FS:
+			ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+			break;
+		default:
+			/* hole, not found */
+			break;
+		}
+
+		break;
+	}
+
+	return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid,
+				       struct bkey_i_inode_blockdev *ret)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+
+	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), k) {
+		if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
+			break;
+
+		if (k.k->type == BCH_INODE_BLOCKDEV) {
+			struct bkey_s_c_inode_blockdev inode =
+				bkey_s_c_to_inode_blockdev(k);
+
+			pr_debug("found inode %llu: %pU (u64s %u)",
+				 inode.k->p.inode, inode.v->i_uuid.b,
+				 inode.k->u64s);
+
+			if (CACHED_DEV(inode.v) &&
+			    !memcmp(uuid, &inode.v->i_uuid, 16)) {
+				bkey_reassemble(&ret->k_i, k);
+				bch2_btree_iter_unlock(&iter);
+				return 0;
+			}
+		}
+
+		bch2_btree_iter_cond_resched(&iter);
+	}
+	bch2_btree_iter_unlock(&iter);
+	return -ENOENT;
+}
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
new file mode 100644
index 00000000..277d4e42
--- /dev/null
+++ b/libbcachefs/inode.h
@@ -0,0 +1,57 @@
+#ifndef _BCACHE_INODE_H
+#define _BCACHE_INODE_H
+
+#include <linux/math64.h>
+
+extern const struct bkey_ops bch2_bkey_inode_ops;
+
+struct bch_inode_unpacked {
+	u64		inum;
+	__le64		i_hash_seed;
+	u32		i_flags;
+	u16		i_mode;
+
+#define BCH_INODE_FIELD(_name, _bits)	u##_bits _name;
+	BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
+};
+
+struct bkey_inode_buf {
+	struct bkey_i_inode	inode;
+
+#define BCH_INODE_FIELD(_name, _bits)	+ 8 + _bits / 8
+	u8		_pad[0 + BCH_INODE_FIELDS()];
+#undef BCH_INODE_FIELD
+} __packed;
+
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+
+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
+		     uid_t, gid_t, umode_t, dev_t);
+int bch2_inode_create(struct bch_fs *, struct bkey_i *, u64, u64, u64 *);
+int bch2_inode_truncate(struct bch_fs *, u64, u64,
+			struct extent_insert_hook *, u64 *);
+int bch2_inode_rm(struct bch_fs *, u64);
+
+int bch2_inode_find_by_inum(struct bch_fs *, u64,
+			    struct bch_inode_unpacked *);
+int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *, uuid_le *,
+				       struct bkey_i_inode_blockdev *);
+
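+/*
+ * Typical create path, as an illustrative sketch only (the real caller
+ * lives in fs.c; uid/gid/mode/rdev and the unused_inode_hint field are as
+ * used there): initialize an unpacked inode, pack it, then pick an inode
+ * number in the fs range (max == 0 means no upper limit):
+ *
+ *	struct bch_inode_unpacked inode_u;
+ *	struct bkey_inode_buf inode_p;
+ *	int ret;
+ *
+ *	bch2_inode_init(c, &inode_u, uid, gid, mode, rdev);
+ *	bch2_inode_pack(&inode_p, &inode_u);
+ *	ret = bch2_inode_create(c, &inode_p.inode.k_i,
+ *				BLOCKDEV_INODE_MAX, 0,
+ *				&c->unused_inode_hint);
+ */
+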
+static inline struct timespec bch2_time_to_timespec(struct bch_fs *c, u64 time) +{ + return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo); +} + +static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts) +{ + s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo; + + if (c->sb.time_precision == 1) + return ns; + + return div_s64(ns, c->sb.time_precision); +} + +#endif diff --git a/libbcachefs/io.c b/libbcachefs/io.c new file mode 100644 index 00000000..212a5a65 --- /dev/null +++ b/libbcachefs/io.c @@ -0,0 +1,1387 @@ +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "bset.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "compress.h" +#include "clock.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "super-io.h" + +#include <linux/blkdev.h> +#include <linux/random.h> + +#include <trace/events/bcachefs.h> + +static inline void __bio_inc_remaining(struct bio *bio) +{ + bio_set_flag(bio, BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + +/* Allocate, free from mempool: */ + +void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + bio_for_each_segment_all(bv, bio, i) + if (bv->bv_page != ZERO_PAGE(0)) + mempool_free(bv->bv_page, &c->bio_bounce_pages); + bio->bi_vcnt = 0; +} + +static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, + bool *using_mempool) +{ + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++]; + + if (likely(!*using_mempool)) { + bv->bv_page = alloc_page(GFP_NOIO); + if (unlikely(!bv->bv_page)) { + mutex_lock(&c->bio_bounce_pages_lock); + *using_mempool = true; + goto pool_alloc; + + } + } else { +pool_alloc: + bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + } + + bv->bv_len = PAGE_SIZE; + bv->bv_offset = 0; +} + +void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + size_t bytes) +{ + bool using_mempool = false; + + bio->bi_iter.bi_size = bytes; + + while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) + bch2_bio_alloc_page_pool(c, bio, &using_mempool); + + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); +} + +/* Bios with headers */ + +static void bch2_submit_wbio(struct bch_fs *c, struct bch_write_bio *wbio, + struct bch_dev *ca, const struct bch_extent_ptr *ptr) +{ + wbio->ca = ca; + wbio->submit_time_us = local_clock_us(); + wbio->bio.bi_iter.bi_sector = ptr->offset; + wbio->bio.bi_bdev = ca ? 
ca->disk_sb.bdev : NULL; + + if (!ca) + bcache_io_error(c, &wbio->bio, "device has been removed"); + else + generic_make_request(&wbio->bio); +} + +void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + const struct bkey_i *k) +{ + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_write_bio *n; + struct bch_dev *ca; + + BUG_ON(c->opts.nochanges); + + wbio->split = false; + wbio->c = c; + + extent_for_each_ptr(e, ptr) { + ca = c->devs[ptr->dev]; + if (!percpu_ref_tryget(&ca->io_ref)) { + bch2_submit_wbio(c, wbio, NULL, ptr); + break; + } + + if (ptr + 1 < &extent_entry_last(e)->ptr) { + n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, + &ca->replica_set)); + + n->bio.bi_end_io = wbio->bio.bi_end_io; + n->bio.bi_private = wbio->bio.bi_private; + n->c = c; + n->orig = &wbio->bio; + n->bounce = false; + n->split = true; + n->put_bio = true; + n->bio.bi_opf = wbio->bio.bi_opf; + __bio_inc_remaining(n->orig); + } else { + n = wbio; + } + + if (!journal_flushes_device(ca)) + n->bio.bi_opf |= REQ_FUA; + + bch2_submit_wbio(c, n, ca, ptr); + } +} + +/* IO errors */ + +/* Writes */ + +static struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->alloc_reserve == RESERVE_MOVINGGC + ? op->c->copygc_wq + : op->c->wq; +} + +static void __bch2_write(struct closure *); + +static void bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + BUG_ON(!(op->flags & BCH_WRITE_DONE)); + + if (!op->error && (op->flags & BCH_WRITE_FLUSH)) + op->error = bch2_journal_error(&op->c->journal); + + bch2_disk_reservation_put(op->c, &op->res); + percpu_ref_put(&op->c->writes); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + closure_return(cl); +} + +static u64 keylist_sectors(struct keylist *keys) +{ + struct bkey_i *k; + u64 ret = 0; + + for_each_keylist_key(keys, k) + ret += k->k.size; + + return ret; +} + +static int bch2_write_index_default(struct bch_write_op *op) +{ + struct keylist *keys = &op->insert_keys; + struct btree_iter iter; + int ret; + + bch2_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k)); + + ret = bch2_btree_insert_list_at(&iter, keys, &op->res, + NULL, op_journal_seq(op), + BTREE_INSERT_NOFAIL); + bch2_btree_iter_unlock(&iter); + + return ret; +} + +/** + * bch_write_index - after a write, update index to point to new data + */ +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + unsigned i; + + op->flags |= BCH_WRITE_LOOPED; + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + int ret = op->index_update_fn(op); + + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); + + if (ret) { + __bcache_io_error(c, "btree IO error %i", ret); + op->error = ret; + } + } + + for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++) + if (op->open_buckets[i]) { + bch2_open_bucket_put(c, + c->open_buckets + + op->open_buckets[i]); + op->open_buckets[i] = 0; + } + + if (!(op->flags & BCH_WRITE_DONE)) + continue_at(cl, __bch2_write, op->io_wq); + + if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { + bch2_journal_flush_seq_async(&c->journal, + *op_journal_seq(op), + cl); + continue_at(cl, bch2_write_done, index_update_wq(op)); + } else { + continue_at_nobarrier(cl, bch2_write_done, NULL); + 
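+		/* no journal flush to wait on (or the write already errored) */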
} +} + +/** + * bch_write_discard - discard range of keys + * + * Used to implement discard, and to handle when writethrough write hits + * a write error on the cache device. + */ +static void bch2_write_discard(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->bio->bio; + struct bpos end = op->pos; + + end.offset += bio_sectors(bio); + + op->error = bch2_discard(op->c, op->pos, end, op->version, + &op->res, NULL, NULL); +} + +/* + * Convert extents to be inserted to discards after an error: + */ +static void bch2_write_io_error(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { + struct bkey_i *src = bch2_keylist_front(&op->insert_keys); + struct bkey_i *dst = bch2_keylist_front(&op->insert_keys); + + /* + * Our data write just errored, which means we've got a bunch + * of keys to insert that point to data that wasn't + * successfully written. + * + * We don't have to insert those keys but we still have to + * invalidate that region of the cache - so, if we just strip + * off all the pointers from the keys we'll accomplish just + * that. + */ + + while (src != op->insert_keys.top) { + struct bkey_i *n = bkey_next(src); + + set_bkey_val_u64s(&src->k, 0); + src->k.type = KEY_TYPE_DISCARD; + bkey_copy(dst, src); + + dst = bkey_next(dst); + src = n; + } + + op->insert_keys.top = dst; + op->flags |= BCH_WRITE_DISCARD; + } else { + /* TODO: We could try to recover from this. */ + while (!bch2_keylist_empty(&op->insert_keys)) + bch2_keylist_pop_front(&op->insert_keys); + + op->error = -EIO; + op->flags |= BCH_WRITE_DONE; + } + + bch2_write_index(cl); +} + +static void bch2_write_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_fs *c = wbio->c; + struct bio *orig = wbio->orig; + struct bch_dev *ca = wbio->ca; + + if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, + "data write")) { + set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); + } + + if (ca) + percpu_ref_put(&ca->io_ref); + + if (bio->bi_error && orig) + orig->bi_error = bio->bi_error; + + if (wbio->bounce) + bch2_bio_free_pages_pool(c, bio); + + if (wbio->put_bio) + bio_put(bio); + + if (orig) + bio_endio(orig); + else + closure_put(cl); +} + +static struct nonce extent_nonce(struct bversion version, + unsigned nonce, + unsigned uncompressed_size, + unsigned compression_type) +{ + return (struct nonce) {{ + [0] = cpu_to_le32((nonce << 12) | + (uncompressed_size << 22)), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| + (compression_type << 24))^BCH_NONCE_EXTENT, + }}; +} + +static void init_append_extent(struct bch_write_op *op, + unsigned compressed_size, + unsigned uncompressed_size, + unsigned compression_type, + unsigned nonce, + struct bch_csum csum, unsigned csum_type, + struct open_bucket *ob) +{ + struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + + op->pos.offset += uncompressed_size; + e->k.p = op->pos; + e->k.size = uncompressed_size; + e->k.version = op->version; + bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); + + bch2_extent_crc_append(e, compressed_size, + uncompressed_size, + compression_type, + nonce, csum, csum_type); + + bch2_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas, + ob, compressed_size); + + 
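+	/*
+	 * e now carries one crc entry describing the on disk data (checksum,
+	 * compression type, compressed/uncompressed sizes) followed by the
+	 * pointers the allocator just handed us:
+	 */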
bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED)); + bch2_keylist_push(&op->insert_keys); +} + +static int bch2_write_extent(struct bch_write_op *op, + struct open_bucket *ob, + struct bio *orig) +{ + struct bch_fs *c = op->c; + struct bio *bio; + struct bch_write_bio *wbio; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + struct bkey_i *key_to_write; + unsigned csum_type = op->csum_type; + unsigned compression_type = op->compression_type; + int ret; + + /* don't refetch csum type/compression type */ + barrier(); + + /* Need to decompress data? */ + if ((op->flags & BCH_WRITE_DATA_COMPRESSED) && + (crc_uncompressed_size(NULL, &op->crc) != op->size || + crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) { + int ret; + + ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc); + if (ret) + return ret; + + op->flags &= ~BCH_WRITE_DATA_COMPRESSED; + } + + if (op->flags & BCH_WRITE_DATA_COMPRESSED) { + init_append_extent(op, + crc_compressed_size(NULL, &op->crc), + crc_uncompressed_size(NULL, &op->crc), + op->crc.compression_type, + op->crc.nonce, + op->crc.csum, + op->crc.csum_type, + ob); + + bio = orig; + wbio = to_wbio(bio); + wbio->orig = NULL; + wbio->bounce = false; + wbio->put_bio = false; + ret = 0; + } else if (csum_type != BCH_CSUM_NONE || + compression_type != BCH_COMPRESSION_NONE) { + /* all units here in bytes */ + unsigned total_output = 0, output_available = + min(ob->sectors_free << 9, orig->bi_iter.bi_size); + unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type) + ? op->nonce : 0; + struct bch_csum csum; + struct nonce nonce; + + bio = bio_alloc_bioset(GFP_NOIO, + DIV_ROUND_UP(output_available, PAGE_SIZE), + &c->bio_write); + /* + * XXX: can't use mempool for more than + * BCH_COMPRESSED_EXTENT_MAX worth of pages + */ + bch2_bio_alloc_pages_pool(c, bio, output_available); + + /* copy WRITE_SYNC flag */ + bio->bi_opf = orig->bi_opf; + wbio = to_wbio(bio); + wbio->orig = NULL; + wbio->bounce = true; + wbio->put_bio = true; + + do { + unsigned fragment_compression_type = compression_type; + size_t dst_len, src_len; + + bch2_bio_compress(c, bio, &dst_len, + orig, &src_len, + &fragment_compression_type); + + BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size); + BUG_ON(!src_len || src_len > orig->bi_iter.bi_size); + BUG_ON(dst_len & (block_bytes(c) - 1)); + BUG_ON(src_len & (block_bytes(c) - 1)); + + swap(bio->bi_iter.bi_size, dst_len); + nonce = extent_nonce(op->version, + crc_nonce, + src_len >> 9, + compression_type), + + bch2_encrypt_bio(c, csum_type, nonce, bio); + + csum = bch2_checksum_bio(c, csum_type, nonce, bio); + swap(bio->bi_iter.bi_size, dst_len); + + init_append_extent(op, + dst_len >> 9, src_len >> 9, + fragment_compression_type, + crc_nonce, csum, csum_type, ob); + + total_output += dst_len; + bio_advance(bio, dst_len); + bio_advance(orig, src_len); + } while (bio->bi_iter.bi_size && + orig->bi_iter.bi_size && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + BUG_ON(total_output > output_available); + + memset(&bio->bi_iter, 0, sizeof(bio->bi_iter)); + bio->bi_iter.bi_size = total_output; + + /* + * Free unneeded pages after compressing: + */ + while (bio->bi_vcnt * PAGE_SIZE > + round_up(bio->bi_iter.bi_size, PAGE_SIZE)) + mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page, + &c->bio_bounce_pages); + + ret = orig->bi_iter.bi_size != 0; + } else { + bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO, + &c->bio_write); + + 
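+		/*
+		 * No checksum and no compression: write the data directly
+		 * from the original bio, splitting off only what doesn't fit
+		 * in the open bucket (bio == orig if it all fit):
+		 */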
wbio = to_wbio(bio); + wbio->orig = NULL; + wbio->bounce = false; + wbio->put_bio = bio != orig; + + init_append_extent(op, bio_sectors(bio), bio_sectors(bio), + compression_type, 0, + (struct bch_csum) { 0 }, csum_type, ob); + + ret = bio != orig; + } + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + closure_get(bio->bi_private); + + /* might have done a realloc... */ + + key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + + bch2_check_mark_super(c, key_to_write, false); + + bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write); + return ret; +} + +static void __bch2_write(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + struct bio *bio = &op->bio->bio; + unsigned open_bucket_nr = 0; + struct open_bucket *b; + int ret; + + memset(op->open_buckets, 0, sizeof(op->open_buckets)); + + if (op->flags & BCH_WRITE_DISCARD) { + op->flags |= BCH_WRITE_DONE; + bch2_write_discard(cl); + bio_put(bio); + continue_at(cl, bch2_write_done, index_update_wq(op)); + } + + /* + * Journal writes are marked REQ_PREFLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); + + do { + EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset); + EBUG_ON(!bio_sectors(bio)); + + if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) + continue_at(cl, bch2_write_index, index_update_wq(op)); + + /* for the device pointers and 1 for the chksum */ + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) + continue_at(cl, bch2_write_index, index_update_wq(op)); + + b = bch2_alloc_sectors_start(c, op->wp, + op->nr_replicas, + c->opts.data_replicas_required, + op->alloc_reserve, + (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); + EBUG_ON(!b); + + if (unlikely(IS_ERR(b))) { + if (unlikely(PTR_ERR(b) != -EAGAIN)) { + ret = PTR_ERR(b); + goto err; + } + + /* + * If we already have some keys, must insert them first + * before allocating another open bucket. We only hit + * this case if open_bucket_nr > 1. + */ + if (!bch2_keylist_empty(&op->insert_keys)) + continue_at(cl, bch2_write_index, + index_update_wq(op)); + + /* + * If we've looped, we're running out of a workqueue - + * not the bch2_write() caller's context - and we don't + * want to block the workqueue: + */ + if (op->flags & BCH_WRITE_LOOPED) + continue_at(cl, __bch2_write, op->io_wq); + + /* + * Otherwise, we do want to block the caller on alloc + * failure instead of letting it queue up more and more + * writes: + * XXX: this technically needs a try_to_freeze() - + * except that that's not safe because caller may have + * issued other IO... hmm.. + */ + closure_sync(cl); + continue; + } + + BUG_ON(b - c->open_buckets == 0 || + b - c->open_buckets > U8_MAX); + op->open_buckets[open_bucket_nr++] = b - c->open_buckets; + + ret = bch2_write_extent(op, b, bio); + + bch2_alloc_sectors_done(c, op->wp, b); + + if (ret < 0) + goto err; + } while (ret); + + op->flags |= BCH_WRITE_DONE; + continue_at(cl, bch2_write_index, index_update_wq(op)); +err: + if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { + /* + * If we were writing cached data, not doing the write is fine + * so long as we discard whatever would have been overwritten - + * then it's equivalent to doing the write and immediately + * reclaiming it. 
+		 */
+
+		bch2_write_discard(cl);
+	} else {
+		/*
+		 * Right now we can only error here if we went RO - the
+		 * allocation failed, but we already checked for -ENOSPC when
+		 * we got our reservation.
+		 *
+		 * XXX capacity might have changed, but we don't check for
+		 * that yet:
+		 */
+		op->error = ret;
+	}
+
+	op->flags |= BCH_WRITE_DONE;
+
+	/*
+	 * No reason not to insert keys for whatever data was successfully
+	 * written (especially for a cmpxchg operation that's moving data
+	 * around)
+	 */
+	continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
+		    ? bch2_write_index
+		    : bch2_write_done, index_update_wq(op));
+}
+
+void bch2_wake_delayed_writes(unsigned long data)
+{
+	struct bch_fs *c = (void *) data;
+	struct bch_write_op *op;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
+
+	while ((op = c->write_wait_head)) {
+		if (time_after(op->expires, jiffies)) {
+			mod_timer(&c->foreground_write_wakeup, op->expires);
+			break;
+		}
+
+		c->write_wait_head = op->next;
+		if (!c->write_wait_head)
+			c->write_wait_tail = NULL;
+
+		closure_put(&op->cl);
+	}
+
+	spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
+}
+
+/**
+ * bch2_write - handle a write to a cache device or flash only volume
+ *
+ * This is the starting point for any data to end up in a cache device; it
+ * could be from a normal write, or a writeback write, or a write to a flash
+ * only volume - it's also used by the moving garbage collector to compact
+ * data in mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be
+ * inserted (if the data won't fit in a single open bucket, there will be
+ * multiple keys); after the data is written it calls bch2_journal, and after
+ * the keys have been added to the next journal write they're inserted into
+ * the btree.
+ *
+ * It inserts the data in op->bio; bi_sector is used for the key offset, and
+ * op->pos.inode is used for the key inode.
+ *
+ * If the BCH_WRITE_DISCARD flag is set, instead of inserting the data it
+ * invalidates the region of the cache represented by op->bio and
+ * op->pos.inode.
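+ *
+ * bch2_write() is a closure function: callers invoke it via closure_call()
+ * or continue_at(), and completion is signalled through op->cl rather than
+ * a return value.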
+ */ +void bch2_write(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->bio->bio; + struct bch_fs *c = op->c; + u64 inode = op->pos.inode; + + if (c->opts.nochanges || + !percpu_ref_tryget(&c->writes)) { + __bcache_io_error(c, "read only"); + op->error = -EROFS; + bch2_disk_reservation_put(c, &op->res); + closure_return(cl); + } + + if (bversion_zero(op->version) && + bch2_csum_type_is_encryption(op->csum_type)) + op->version.lo = + atomic64_inc_return(&c->key_version) + 1; + + if (!(op->flags & BCH_WRITE_DISCARD)) + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */ + + if (c->foreground_write_ratelimit_enabled && + c->foreground_write_pd.rate.rate < (1 << 30) && + !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) { + unsigned long flags; + u64 delay; + + spin_lock_irqsave(&c->foreground_write_pd_lock, flags); + bch2_ratelimit_increment(&c->foreground_write_pd.rate, + bio->bi_iter.bi_size); + + delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate); + + if (delay >= HZ / 100) { + trace_write_throttle(c, inode, bio, delay); + + closure_get(&op->cl); /* list takes a ref */ + + op->expires = jiffies + delay; + op->next = NULL; + + if (c->write_wait_tail) + c->write_wait_tail->next = op; + else + c->write_wait_head = op; + c->write_wait_tail = op; + + if (!timer_pending(&c->foreground_write_wakeup)) + mod_timer(&c->foreground_write_wakeup, + op->expires); + + spin_unlock_irqrestore(&c->foreground_write_pd_lock, + flags); + continue_at(cl, __bch2_write, index_update_wq(op)); + } + + spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags); + } + + continue_at_nobarrier(cl, __bch2_write, NULL); +} + +void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_write_bio *bio, struct disk_reservation res, + struct write_point *wp, struct bpos pos, + u64 *journal_seq, unsigned flags) +{ + EBUG_ON(res.sectors && !res.nr_replicas); + + op->c = c; + op->io_wq = index_update_wq(op); + op->bio = bio; + op->written = 0; + op->error = 0; + op->flags = flags; + op->csum_type = bch2_data_checksum_type(c); + op->compression_type = c->opts.compression; + op->nr_replicas = res.nr_replicas; + op->alloc_reserve = RESERVE_NONE; + op->nonce = 0; + op->pos = pos; + op->version = ZERO_VERSION; + op->res = res; + op->wp = wp; + + if (journal_seq) { + op->journal_seq_p = journal_seq; + op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; + } else { + op->journal_seq = 0; + } + + op->index_update_fn = bch2_write_index_default; + + bch2_keylist_init(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys)); + + if (version_stress_test(c)) + get_random_bytes(&op->version, sizeof(op->version)); +} + +/* Discard */ + +/* bch_discard - discard a range of keys from start_key to end_key. 
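+ * (Currently a thin wrapper around bch2_btree_delete_range() on the
+ * extents btree.)
+ *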
+ * @c filesystem + * @start_key pointer to start location + * NOTE: discard starts at bkey_start_offset(start_key) + * @end_key pointer to end location + * NOTE: discard ends at KEY_OFFSET(end_key) + * @version version of discard (0ULL if none) + * + * Returns: + * 0 on success + * <0 on error + * + * XXX: this needs to be refactored with inode_truncate, or more + * appropriately inode_truncate should call this + */ +int bch2_discard(struct bch_fs *c, struct bpos start, + struct bpos end, struct bversion version, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq) +{ + return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version, + disk_res, hook, journal_seq); +} + +/* Cache promotion on read */ + +struct cache_promote_op { + struct closure cl; + struct migrate_write write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +/* Read */ + +static int bio_checksum_uncompress(struct bch_fs *c, + struct bch_read_bio *rbio) +{ + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->parent_iter; + struct nonce nonce = extent_nonce(rbio->version, + rbio->crc.nonce, + crc_uncompressed_size(NULL, &rbio->crc), + rbio->crc.compression_type); + struct bch_csum csum; + int ret = 0; + + /* + * reset iterator for checksumming and copying bounced data: here we've + * set rbio->compressed_size to the amount of data we actually read, + * which was not necessarily the full extent if we were only bouncing + * in order to promote + */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } else { + src->bi_iter = rbio->parent_iter; + } + + csum = bch2_checksum_bio(c, rbio->crc.csum_type, nonce, src); + if (bch2_dev_nonfatal_io_err_on(bch2_crc_cmp(rbio->crc.csum, csum), + rbio->ca, + "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", + rbio->inode, (u64) rbio->parent_iter.bi_sector << 9, + rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo, + rbio->crc.csum_type)) + ret = -EIO; + + /* + * If there was a checksum error, still copy the data back - unless it + * was compressed, we don't want to decompress bad data: + */ + if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) { + if (!ret) { + bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src); + ret = bch2_bio_uncompress(c, src, dst, + dst_iter, rbio->crc); + if (ret) + __bcache_io_error(c, "decompression error"); + } + } else if (rbio->bounce) { + bio_advance(src, rbio->crc.offset << 9); + + /* don't need to decrypt the entire bio: */ + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + nonce = nonce_add(nonce, rbio->crc.offset << 9); + + bch2_encrypt_bio(c, rbio->crc.csum_type, + nonce, src); + + bio_copy_data_iter(dst, dst_iter, + src, src->bi_iter); + } else { + bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src); + } + + return ret; +} + +static void bch2_rbio_free(struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct bio *bio = &rbio->bio; + + BUG_ON(rbio->ca); + BUG_ON(!rbio->split); + + if (rbio->promote) + kfree(rbio->promote); + if (rbio->bounce) + bch2_bio_free_pages_pool(c, bio); + + bio_put(bio); +} + +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + struct bio *orig = &bch2_rbio_parent(rbio)->bio; + + percpu_ref_put(&rbio->ca->io_ref); + rbio->ca = NULL; + + if (rbio->split) { + if (rbio->bio.bi_error) + 
orig->bi_error = rbio->bio.bi_error; + + bio_endio(orig); + bch2_rbio_free(rbio); + } else { + if (rbio->promote) + kfree(rbio->promote); + + orig->bi_end_io = rbio->orig_bi_end_io; + bio_endio_nodec(orig); + } +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int error) +{ + bch2_rbio_parent(rbio)->bio.bi_error = error; + bch2_rbio_done(rbio); +} + +static void bch2_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio) +{ + unsigned long flags; + + percpu_ref_put(&rbio->ca->io_ref); + rbio->ca = NULL; + + spin_lock_irqsave(&c->read_retry_lock, flags); + bio_list_add(&c->read_retry_list, &rbio->bio); + spin_unlock_irqrestore(&c->read_retry_lock, flags); + queue_work(c->wq, &c->read_retry_work); +} + +static void cache_promote_done(struct closure *cl) +{ + struct cache_promote_op *op = + container_of(cl, struct cache_promote_op, cl); + + bch2_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio); + kfree(op); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + int ret; + + ret = bio_checksum_uncompress(c, rbio); + if (ret) { + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_FORCE_BOUNCE; + bch2_rbio_retry(c, rbio); + } else { + bch2_rbio_error(rbio, -EIO); + } + return; + } + + if (rbio->promote) { + struct cache_promote_op *promote = rbio->promote; + struct closure *cl = &promote->cl; + + BUG_ON(!rbio->split || !rbio->bounce); + + trace_promote(&rbio->bio); + + /* we now own pages: */ + swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt); + rbio->promote = NULL; + + bch2_rbio_done(rbio); + + closure_init(cl, &c->cl); + closure_call(&promote->write.op.cl, bch2_write, c->wq, cl); + closure_return_with_destructor(cl, cache_promote_done); + } else { + bch2_rbio_done(rbio); + } +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + + if (bch2_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) { + /* XXX: retry IO errors when we have another replica */ + bch2_rbio_error(rbio, bio->bi_error); + return; + } + + if (rbio->ptr.cached && + (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(rbio->ca, &rbio->ptr))) { + atomic_long_inc(&c->cache_read_races); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_retry(c, rbio); + else + bch2_rbio_error(rbio, -EINTR); + return; + } + + if (rbio->crc.compression_type || + bch2_csum_type_is_encryption(rbio->crc.csum_type)) + queue_work(system_unbound_wq, &rbio->work); + else if (rbio->crc.csum_type) + queue_work(system_highpri_wq, &rbio->work); + else + __bch2_read_endio(&rbio->work); +} + +static bool should_promote(struct bch_fs *c, + struct extent_pick_ptr *pick, unsigned flags) +{ + if (!(flags & BCH_READ_PROMOTE)) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + return c->fastest_tier && + c->fastest_tier < c->tiers + pick->ca->mi.tier; +} + +void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, + struct bvec_iter iter, struct bkey_s_c k, + struct extent_pick_ptr *pick, unsigned flags) +{ + struct bch_read_bio *rbio; + struct 
cache_promote_op *promote_op = NULL; + unsigned skip = iter.bi_sector - bkey_start_offset(k.k); + bool bounce = false, split, read_full = false; + + bch2_increment_clock(c, bio_sectors(&orig->bio), READ); + + EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || + k.k->p.offset < bvec_iter_end_sector(iter)); + + /* only promote if we're not reading from the fastest tier: */ + + /* + * XXX: multiple promotes can race with each other, wastefully. Keep a + * list of outstanding promotes? + */ + if (should_promote(c, pick, flags)) { + /* + * biovec needs to be big enough to hold decompressed data, if + * the bch2_write_extent() has to decompress/recompress it: + */ + unsigned sectors = + max_t(unsigned, k.k->size, + crc_uncompressed_size(NULL, &pick->crc)); + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + + promote_op = kmalloc(sizeof(*promote_op) + + sizeof(struct bio_vec) * pages, GFP_NOIO); + if (promote_op) { + struct bio *promote_bio = &promote_op->write.wbio.bio; + + bio_init(promote_bio); + promote_bio->bi_max_vecs = pages; + promote_bio->bi_io_vec = promote_bio->bi_inline_vecs; + bounce = true; + /* could also set read_full */ + } + } + + /* + * note: if compression_type and crc_type both == none, then + * compressed/uncompressed size is zero + */ + if (pick->crc.compression_type != BCH_COMPRESSION_NONE || + (pick->crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) || + (bch2_csum_type_is_encryption(pick->crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_FORCE_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (bounce) { + unsigned sectors = read_full + ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size) + : bvec_iter_sectors(iter); + + rbio = container_of(bio_alloc_bioset(GFP_NOIO, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + &c->bio_read_split), + struct bch_read_bio, bio); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + split = true; + } else if (!(flags & BCH_READ_MAY_REUSE_BIO) || + !(flags & BCH_READ_IS_LAST)) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = container_of(bio_clone_fast(&orig->bio, + GFP_NOIO, &c->bio_read_split), + struct bch_read_bio, bio); + rbio->bio.bi_iter = iter; + split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; + split = false; + BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + if (!(flags & BCH_READ_IS_LAST)) + __bio_inc_remaining(&orig->bio); + + if (split) + rbio->parent = orig; + else + rbio->orig_bi_end_io = orig->bio.bi_end_io; + rbio->parent_iter = iter; + + rbio->flags = flags; + rbio->bounce = bounce; + rbio->split = split; + rbio->c = c; + rbio->ca = pick->ca; + rbio->ptr = pick->ptr; + rbio->crc = pick->crc; + /* + * crc.compressed_size will be 0 if there wasn't any checksum + * information, also we need to stash the original size of the bio if we + * bounced (which isn't necessarily the original key size, if we bounced + * only for promoting) + */ + rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1; + rbio->version = k.k->version; + rbio->promote = promote_op; + rbio->inode = k.k->p.inode; + INIT_WORK(&rbio->work, __bch2_read_endio); + + rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; + rbio->bio.bi_opf = 
orig->bio.bi_opf;
+	rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
+	rbio->bio.bi_end_io	= bch2_read_endio;
+
+	if (promote_op) {
+		struct bio *promote_bio = &promote_op->write.wbio.bio;
+
+		promote_bio->bi_iter = rbio->bio.bi_iter;
+		memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
+		       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+
+		bch2_migrate_write_init(c, &promote_op->write,
+					&c->promote_write_point,
+					k, NULL,
+					BCH_WRITE_ALLOC_NOWAIT|
+					BCH_WRITE_CACHED);
+		promote_op->write.promote = true;
+
+		if (rbio->crc.compression_type) {
+			promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
+			promote_op->write.op.crc = rbio->crc;
+			promote_op->write.op.size = k.k->size;
+		} else if (read_full) {
+			/*
+			 * Adjust bio to correspond to _live_ portion of @k -
+			 * which might be less than what we're actually reading:
+			 */
+			bio_advance(promote_bio, rbio->crc.offset << 9);
+			BUG_ON(bio_sectors(promote_bio) < k.k->size);
+			promote_bio->bi_iter.bi_size = k.k->size << 9;
+		} else {
+			/*
+			 * Set insert pos to correspond to what we're actually
+			 * reading:
+			 */
+			promote_op->write.op.pos.offset = iter.bi_sector;
+		}
+
+		promote_bio->bi_iter.bi_sector =
+			promote_op->write.op.pos.offset;
+	}
+
+	/* _after_ promote stuff has looked at rbio->crc.offset */
+	if (read_full)
+		rbio->crc.offset += skip;
+	else
+		rbio->bio.bi_iter.bi_sector += skip;
+
+	rbio->submit_time_us = local_clock_us();
+
+	if (bounce)
+		trace_read_bounce(&rbio->bio);
+
+	if (!(flags & BCH_READ_IS_LAST))
+		trace_read_split(&rbio->bio);
+
+	generic_make_request(&rbio->bio);
+}
+
+static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
+			   struct bvec_iter bvec_iter, u64 inode,
+			   unsigned flags)
+{
+	struct bio *bio = &rbio->bio;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
+				      POS(inode, bvec_iter.bi_sector), k) {
+		BKEY_PADDED(k) tmp;
+		struct extent_pick_ptr pick;
+		unsigned bytes, sectors;
+		bool is_last;
+
+		/*
+		 * Unlock the iterator while the btree node's lock is still in
+		 * cache, before doing the IO:
+		 */
+		bkey_reassemble(&tmp.k, k);
+		k = bkey_i_to_s_c(&tmp.k);
+		bch2_btree_iter_unlock(&iter);
+
+		bch2_extent_pick_ptr(c, k, &pick);
+		if (IS_ERR(pick.ca)) {
+			bcache_io_error(c, bio, "no device to read from");
+			bio_endio(bio);
+			return;
+		}
+
+		sectors = min_t(u64, k.k->p.offset,
+				bvec_iter_end_sector(bvec_iter)) -
+			bvec_iter.bi_sector;
+		bytes = sectors << 9;
+		is_last = bytes == bvec_iter.bi_size;
+		swap(bvec_iter.bi_size, bytes);
+
+		if (is_last)
+			flags |= BCH_READ_IS_LAST;
+
+		if (pick.ca) {
+			PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+				c->prio_clock[READ].hand;
+
+			bch2_read_extent_iter(c, rbio, bvec_iter,
+					      k, &pick, flags);
+
+			flags &= ~BCH_READ_MAY_REUSE_BIO;
+		} else {
+			zero_fill_bio_iter(bio, bvec_iter);
+
+			if (is_last)
+				bio_endio(bio);
+		}
+
+		if (is_last)
+			return;
+
+		swap(bvec_iter.bi_size, bytes);
+		bio_advance_iter(bio, &bvec_iter, bytes);
+	}
+
+	/*
+	 * If we get here, it better have been because there was an error
+	 * reading a btree node
+	 */
+	ret = bch2_btree_iter_unlock(&iter);
+	BUG_ON(!ret);
+	bcache_io_error(c, bio, "btree IO error %i", ret);
+	bio_endio(bio);
+}
+
+void bch2_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode)
+{
+	bch2_read_iter(c, bio, bio->bio.bi_iter, inode,
+		       BCH_READ_RETRY_IF_STALE|
+		       BCH_READ_PROMOTE|
+		       BCH_READ_MAY_REUSE_BIO|
+		       BCH_READ_USER_MAPPED);
+}
+
+/**
+ * bch2_read_retry - re-submit a bio originally from bch2_read()
+ */
+static void bch2_read_retry(struct
bch_fs *c, struct bch_read_bio *rbio) +{ + struct bch_read_bio *parent = bch2_rbio_parent(rbio); + struct bvec_iter iter = rbio->parent_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->inode; + + trace_read_retry(&rbio->bio); + + if (rbio->split) + bch2_rbio_free(rbio); + else + rbio->bio.bi_end_io = rbio->orig_bi_end_io; + + bch2_read_iter(c, parent, iter, inode, flags); +} + +void bch2_read_retry_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + read_retry_work); + struct bch_read_bio *rbio; + struct bio *bio; + unsigned long flags; + + while (1) { + spin_lock_irqsave(&c->read_retry_lock, flags); + bio = bio_list_pop(&c->read_retry_list); + spin_unlock_irqrestore(&c->read_retry_lock, flags); + + if (!bio) + break; + + rbio = container_of(bio, struct bch_read_bio, bio); + bch2_read_retry(c, rbio); + } +} diff --git a/libbcachefs/io.h b/libbcachefs/io.h new file mode 100644 index 00000000..253316a4 --- /dev/null +++ b/libbcachefs/io.h @@ -0,0 +1,87 @@ +#ifndef _BCACHE_IO_H +#define _BCACHE_IO_H + +#include "io_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +#define to_rbio(_bio) \ + container_of((_bio), struct bch_read_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); + +enum bch_write_flags { + BCH_WRITE_ALLOC_NOWAIT = (1 << 0), + BCH_WRITE_DISCARD = (1 << 1), + BCH_WRITE_CACHED = (1 << 2), + BCH_WRITE_FLUSH = (1 << 3), + BCH_WRITE_DISCARD_ON_ERROR = (1 << 4), + BCH_WRITE_DATA_COMPRESSED = (1 << 5), + + /* Internal: */ + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6), + BCH_WRITE_DONE = (1 << 7), + BCH_WRITE_LOOPED = (1 << 8), +}; + +static inline u64 *op_journal_seq(struct bch_write_op *op) +{ + return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) + ? 
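+ /* journal_seq_p and journal_seq share a union in bch_write_op; the flag says which member is live: */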
op->journal_seq_p : &op->journal_seq; +} + +static inline struct write_point *foreground_write_point(struct bch_fs *c, + unsigned long v) +{ + return c->write_points + + hash_long(v, ilog2(ARRAY_SIZE(c->write_points))); +} + +void bch2_write_op_init(struct bch_write_op *, struct bch_fs *, + struct bch_write_bio *, + struct disk_reservation, struct write_point *, + struct bpos, u64 *, unsigned); +void bch2_write(struct closure *); + +struct cache_promote_op; + +struct extent_pick_ptr; + +void bch2_read_extent_iter(struct bch_fs *, struct bch_read_bio *, + struct bvec_iter, struct bkey_s_c k, + struct extent_pick_ptr *, unsigned); + +static inline void bch2_read_extent(struct bch_fs *c, + struct bch_read_bio *orig, + struct bkey_s_c k, + struct extent_pick_ptr *pick, + unsigned flags) +{ + bch2_read_extent_iter(c, orig, orig->bio.bi_iter, + k, pick, flags); +} + +enum bch_read_flags { + BCH_READ_FORCE_BOUNCE = 1 << 0, + BCH_READ_RETRY_IF_STALE = 1 << 1, + BCH_READ_PROMOTE = 1 << 2, + BCH_READ_IS_LAST = 1 << 3, + BCH_READ_MAY_REUSE_BIO = 1 << 4, + BCH_READ_USER_MAPPED = 1 << 5, +}; + +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + const struct bkey_i *); + +int bch2_discard(struct bch_fs *, struct bpos, struct bpos, + struct bversion, struct disk_reservation *, + struct extent_insert_hook *, u64 *); + +void bch2_read_retry_work(struct work_struct *); +void bch2_wake_delayed_writes(unsigned long data); + +#endif /* _BCACHE_IO_H */ diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h new file mode 100644 index 00000000..07ea67c6 --- /dev/null +++ b/libbcachefs/io_types.h @@ -0,0 +1,145 @@ +#ifndef _BCACHE_IO_TYPES_H +#define _BCACHE_IO_TYPES_H + +#include "btree_types.h" +#include "buckets_types.h" +#include "keylist_types.h" + +#include <linux/llist.h> +#include <linux/workqueue.h> + +struct bch_read_bio { + /* + * Reads will often have to be split, and if the extent being read from + * was checksummed or compressed we'll also have to allocate bounce + * buffers and copy the data back into the original bio. + * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *orig_bi_end_io; + }; + + /* + * Saved copy of parent->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter parent_iter; + + unsigned submit_time_us; + u16 flags; + u8 bounce:1, + split:1; + + struct bch_fs *c; + struct bch_dev *ca; + struct bch_extent_ptr ptr; + struct bch_extent_crc128 crc; + struct bversion version; + + struct cache_promote_op *promote; + + /* + * If we have to retry the read (IO error, checksum failure, read stale + * data (raced with allocator), we retry the portion of the parent bio + * that failed (i.e. this bio's portion, parent_iter). + * + * But we need to stash the inode somewhere: + */ + u64 inode; + + struct work_struct work; + + struct bio bio; +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? 
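+ /* a split read has a distinct parent rbio; otherwise the rbio is its own parent: */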
rbio->parent : rbio; +} + +struct bch_write_bio { + struct bch_fs *c; + struct bch_dev *ca; + union { + struct bio *orig; + struct closure *cl; + }; + + unsigned submit_time_us; + unsigned split:1, + bounce:1, + put_bio:1; + + /* Only for btree writes: */ + unsigned used_mempool:1; + u8 order; + + struct bio bio; +}; + +struct bch_replace_info { + struct extent_insert_hook hook; + /* How many insertions succeeded */ + unsigned successes; + /* How many insertions failed */ + unsigned failures; + BKEY_PADDED(key); +}; + +struct bch_write_op { + struct closure cl; + struct bch_fs *c; + struct workqueue_struct *io_wq; + struct bch_write_bio *bio; + + unsigned written; /* sectors */ + + short error; + + u16 flags; + unsigned csum_type:4; + unsigned compression_type:4; + unsigned nr_replicas:4; + unsigned alloc_reserve:4; + unsigned nonce:14; + + struct bpos pos; + struct bversion version; + + /* For BCH_WRITE_DATA_COMPRESSED: */ + struct bch_extent_crc128 crc; + unsigned size; + + struct disk_reservation res; + + struct write_point *wp; + + union { + u8 open_buckets[16]; + struct { + struct bch_write_op *next; + unsigned long expires; + }; + }; + + /* + * If caller wants to flush but hasn't passed us a journal_seq ptr, we + * still need to stash the journal_seq somewhere: + */ + union { + u64 *journal_seq_p; + u64 journal_seq; + }; + + int (*index_update_fn)(struct bch_write_op *); + + struct keylist insert_keys; + u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; +}; + +#endif /* _BCACHE_IO_TYPES_H */ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c new file mode 100644 index 00000000..60c5c9b0 --- /dev/null +++ b/libbcachefs/journal.c @@ -0,0 +1,2831 @@ +/* + * bcachefs journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc.h" +#include "bkey_methods.h" +#include "buckets.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_io.h" +#include "checksum.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "keylist.h" +#include "journal.h" +#include "super-io.h" +#include "vstructs.h" + +#include <trace/events/bcachefs.h> + +static void journal_write(struct closure *); +static void journal_reclaim_fast(struct journal *); +static void journal_pin_add_entry(struct journal *, + struct journal_entry_pin_list *, + struct journal_entry_pin *, + journal_pin_flush_fn); + +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + return j->buf + j->reservations.idx; +} + +static inline struct journal_buf *journal_prev_buf(struct journal *j) +{ + return j->buf + !j->reservations.idx; +} + +/* Sequence number of oldest dirty journal entry */ + +static inline u64 last_seq(struct journal *j) +{ + return atomic64_read(&j->seq) - fifo_used(&j->pin) + 1; +} + +static inline u64 journal_pin_seq(struct journal *j, + struct journal_entry_pin_list *pin_list) +{ + return last_seq(j) + fifo_entry_idx(&j->pin, pin_list); +} + +static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + struct jset_entry *entry, unsigned type) +{ + while (entry < vstruct_last(jset)) { + if (JOURNAL_ENTRY_TYPE(entry) == type) + return entry; + + entry = vstruct_next(entry); + } + + return NULL; +} + +#define for_each_jset_entry_type(entry, jset, type) \ + for (entry = (jset)->start; \ + (entry = __jset_entry_type_next(jset, entry, type)); \ + entry = vstruct_next(entry)) + +#define for_each_jset_key(k, _n, entry, jset) \ + for_each_jset_entry_type(entry, jset, 
JOURNAL_ENTRY_BTREE_KEYS) \ + vstruct_for_each_safe(entry, k, _n) + +static inline void bch2_journal_add_entry(struct journal_buf *buf, + const void *data, size_t u64s, + unsigned type, enum btree_id id, + unsigned level) +{ + struct jset *jset = buf->data; + + bch2_journal_add_entry_at(buf, data, u64s, type, id, level, + le32_to_cpu(jset->u64s)); + le32_add_cpu(&jset->u64s, jset_u64s(u64s)); +} + +static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, + enum btree_id id) +{ + struct jset_entry *entry; + + for_each_jset_entry_type(entry, j, type) + if (entry->btree_id == id) + return entry; + + return NULL; +} + +struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry = + bch2_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id); + + if (!entry) + return NULL; + + k = entry->start; + *level = entry->level; + return k; +} + +static void bch2_journal_add_btree_root(struct journal_buf *buf, + enum btree_id id, struct bkey_i *k, + unsigned level) +{ + bch2_journal_add_entry(buf, k, k->k.u64s, + JOURNAL_ENTRY_BTREE_ROOT, id, level); +} + +static inline void bch2_journal_add_prios(struct journal *j, + struct journal_buf *buf) +{ + /* + * no prio bucket ptrs yet... XXX should change the allocator so this + * can't happen: + */ + if (!buf->nr_prio_buckets) + return; + + bch2_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets, + JOURNAL_ENTRY_PRIO_PTRS, 0, 0); +} + +static void journal_seq_blacklist_flush(struct journal *j, + struct journal_entry_pin *pin) +{ + struct bch_fs *c = + container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl = + container_of(pin, struct journal_seq_blacklist, pin); + struct blacklisted_node n; + struct closure cl; + unsigned i; + int ret; + + closure_init_stack(&cl); + + for (i = 0;; i++) { + struct btree_iter iter; + struct btree *b; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); + + bch2_btree_iter_init(&iter, c, n.btree_id, n.pos); + iter.is_extents = false; +redo_peek: + b = bch2_btree_iter_peek_node(&iter); + + /* The node might have already been rewritten: */ + + if (b->data->keys.seq == n.seq && + !bkey_cmp(b->key.k.p, n.pos)) { + ret = bch2_btree_node_rewrite(&iter, b, &cl); + if (ret) { + bch2_btree_iter_unlock(&iter); + closure_sync(&cl); + + if (ret == -EAGAIN || + ret == -EINTR) + goto redo_peek; + + /* -EROFS or perhaps -ENOSPC - bail out: */ + /* XXX warn here */ + return; + } + } + + bch2_btree_iter_unlock(&iter); + } + + closure_sync(&cl); + + for (i = 0;; i++) { + struct btree_interior_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); +redo_wait: + mutex_lock(&c->btree_interior_update_lock); + + /* + * Is the node on the list of pending interior node updates - + * being freed? 
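+ * (i.e. still referenced by a btree_interior_update that hasn't completed)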
If so, wait for that to finish: + */ + for_each_pending_btree_node_free(c, as, d) + if (n.seq == d->seq && + n.btree_id == d->btree_id && + !d->level && + !bkey_cmp(n.pos, d->key.k.p)) { + closure_wait(&as->wait, &cl); + mutex_unlock(&c->btree_interior_update_lock); + closure_sync(&cl); + goto redo_wait; + } + + mutex_unlock(&c->btree_interior_update_lock); + } + + mutex_lock(&j->blacklist_lock); + + bch2_journal_pin_drop(j, &bl->pin); + list_del(&bl->list); + kfree(bl->entries); + kfree(bl); + + mutex_unlock(&j->blacklist_lock); +} + +static struct journal_seq_blacklist * +journal_seq_blacklist_find(struct journal *j, u64 seq) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + list_for_each_entry(bl, &j->seq_blacklist, list) + if (seq == bl->seq) + return bl; + + return NULL; +} + +static struct journal_seq_blacklist * +bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return NULL; + + bl->seq = seq; + list_add_tail(&bl->list, &j->seq_blacklist); + return bl; +} + +/* + * Returns true if @seq is newer than the most recent journal entry that got + * written, and data corresponding to @seq should be ignored - also marks @seq + * as blacklisted so that on future restarts the corresponding data will still + * be ignored: + */ +int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +{ + struct journal *j = &c->journal; + struct journal_seq_blacklist *bl = NULL; + struct blacklisted_node *n; + u64 journal_seq, i; + int ret = 0; + + if (!seq) + return 0; + + journal_seq = atomic64_read(&j->seq); + + /* Interior updates aren't journalled: */ + BUG_ON(b->level); + BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + + if (seq <= journal_seq) { + if (list_empty_careful(&j->seq_blacklist)) + return 0; + + mutex_lock(&j->blacklist_lock); + ret = journal_seq_blacklist_find(j, seq) != NULL; + mutex_unlock(&j->blacklist_lock); + return ret; + } + + /* + * Decrease this back to j->seq + 2 when we next rev the on disk format: + * increasing it temporarily to work around a bug in old kernels + */ + bch2_fs_inconsistent_on(seq > journal_seq + 4, c, + "bset journal seq too far in the future: %llu > %llu", + seq, journal_seq); + + bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", + b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); + + /* + * When we start the journal, bch2_journal_start() will skip over @seq: + */ + + mutex_lock(&j->blacklist_lock); + + for (i = journal_seq + 1; i <= seq; i++) { + bl = journal_seq_blacklist_find(j, i) ?: + bch2_journal_seq_blacklisted_new(j, i); + + if (!bl) { + ret = -ENOMEM; + goto out; + } + } + + for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) + if (b->data->keys.seq == n->seq && + b->btree_id == n->btree_id && + !bkey_cmp(b->key.k.p, n->pos)) + goto found_entry; + + if (!bl->nr_entries || + is_power_of_2(bl->nr_entries)) { + n = krealloc(bl->entries, + max(bl->nr_entries * 2, 8UL) * sizeof(*n), + GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + bl->entries = n; + } + + bl->entries[bl->nr_entries++] = (struct blacklisted_node) { + .seq = b->data->keys.seq, + .btree_id = b->btree_id, + .pos = b->key.k.p, + }; +found_entry: + ret = 1; +out: + mutex_unlock(&j->blacklist_lock); + return ret; +} + +/* + * Journal replay/recovery: + * + * This code is all driven from 
bch2_fs_start(); we first read the journal + * entries, do some other stuff, then we mark all the keys in the journal + * entries (same as garbage collection would), then we replay them - reinserting + * them into the cache in precisely the same order as they appear in the + * journal. + * + * We only journal keys that go in leaf nodes, which simplifies things quite a + * bit. + */ + +struct journal_list { + struct closure cl; + struct mutex lock; + struct list_head *head; + int ret; +}; + +#define JOURNAL_ENTRY_ADD_OK 0 +#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 + +/* + * Given a journal entry we just read, add it to the list of journal entries to + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist, + struct jset *j) +{ + struct journal_replay *i, *pos; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + __le64 last_seq; + int ret; + + mutex_lock(&jlist->lock); + + last_seq = !list_empty(jlist->head) + ? list_last_entry(jlist->head, struct journal_replay, + list)->j.last_seq + : 0; + + /* Is this entry older than the range we need? */ + if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } + + /* Drop entries we don't need anymore */ + list_for_each_entry_safe(i, pos, jlist->head, list) { + if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + break; + list_del(&i->list); + kfree(i); + } + + list_for_each_entry_reverse(i, jlist->head, list) { + /* Duplicate? */ + if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + fsck_err_on(bytes != vstruct_bytes(&i->j) || + memcmp(j, &i->j, bytes), c, + "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + + ret = JOURNAL_ENTRY_ADD_OK; + goto out; + } + + if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { + where = &i->list; + goto add; + } + } + + where = jlist->head; +add: + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) { + ret = -ENOMEM; + goto out; + } + + memcpy(&i->j, j, bytes); + list_add(&i->list, where); + ret = JOURNAL_ENTRY_ADD_OK; +out: +fsck_err: + mutex_unlock(&jlist->lock); + return ret; +} + +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +static void journal_entry_null_range(void *start, void *end) +{ + struct jset_entry *entry; + + for (entry = start; entry != end; entry = vstruct_next(entry)) { + entry->u64s = 0; + entry->btree_id = 0; + entry->level = 0; + entry->flags = 0; + SET_JOURNAL_ENTRY_TYPE(entry, 0); + } +} + +static int journal_validate_key(struct bch_fs *c, struct jset *j, + struct jset_entry *entry, + struct bkey_i *k, enum bkey_type key_type, + const char *type) +{ + void *next = vstruct_next(entry); + const char *invalid; + char buf[160]; + int ret = 0; + + if (mustfix_fsck_err_on(!k->k.u64s, c, + "invalid %s in journal: k->u64s 0", type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (mustfix_fsck_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, + "invalid %s in journal: extends past end of journal entry", + type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (mustfix_fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c, + "invalid %s in journal: bad format %u", + type, k->k.format)) { + 
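+ /* drop just this key: shrink the entry, slide the remaining keys down, and null out the tail: */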
le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN) + bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); + + invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); + if (invalid) { + bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), + bkey_i_to_s_c(k)); + mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf); + + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } +fsck_err: + return ret; +} + +#define JOURNAL_ENTRY_REREAD 5 +#define JOURNAL_ENTRY_NONE 6 +#define JOURNAL_ENTRY_BAD 7 + +static int journal_entry_validate(struct bch_fs *c, + struct jset *j, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read) +{ + struct jset_entry *entry; + size_t bytes = vstruct_bytes(j); + struct bch_csum csum; + int ret = 0; + + if (le64_to_cpu(j->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) { + bch_err(c, "unknown journal entry version %u", + le32_to_cpu(j->version)); + return BCH_FSCK_UNKNOWN_VERSION; + } + + if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c, + "journal entry too big (%zu bytes), sector %llu", + bytes, sector)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + if (bytes > sectors_read << 9) + return JOURNAL_ENTRY_REREAD; + + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c, + "journal entry with unknown csum type %llu sector %llu", + JSET_CSUM_TYPE(j), sector)) + return JOURNAL_ENTRY_BAD; + + csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); + if (mustfix_fsck_err_on(bch2_crc_cmp(csum, j->csum), c, + "journal checksum bad, sector %llu", sector)) { + /* XXX: retry IO, when we start retrying checksum errors */ + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), + j->encrypted_start, + vstruct_end(j) - (void *) j->encrypted_start); + + if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c, + "invalid journal entry: last_seq > seq")) + j->last_seq = j->seq; + + vstruct_for_each(j, entry) { + struct bkey_i *k; + + if (mustfix_fsck_err_on(vstruct_next(entry) > + vstruct_last(j), c, + "journal entry extends past end of jset")) { + j->u64s = cpu_to_le32((u64 *) entry - j->_data); + break; + } + + switch (JOURNAL_ENTRY_TYPE(entry)) { + case JOURNAL_ENTRY_BTREE_KEYS: + vstruct_for_each(entry, k) { + ret = journal_validate_key(c, j, entry, k, + bkey_type(entry->level, + entry->btree_id), + "key"); + if (ret) + goto fsck_err; + } + break; + + case JOURNAL_ENTRY_BTREE_ROOT: + k = entry->start; + + if (mustfix_fsck_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, c, + "invalid btree root journal entry: wrong number of keys")) { + journal_entry_null_range(entry, + vstruct_next(entry)); + continue; + } + + ret = journal_validate_key(c, j, entry, k, + BKEY_TYPE_BTREE, "btree root"); + if (ret) + goto fsck_err; + break; + + case JOURNAL_ENTRY_PRIO_PTRS: + break; + + case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED: + if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, + vstruct_next(entry)); + } + + break; + default: + mustfix_fsck_err(c, 
"invalid journal entry type %llu", + JOURNAL_ENTRY_TYPE(entry)); + journal_entry_null_range(entry, vstruct_next(entry)); + break; + } + } + +fsck_err: + return ret; +} + +struct journal_read_buf { + void *data; + size_t size; +}; + +static int journal_read_buf_realloc(struct journal_read_buf *b, + size_t new_size) +{ + void *n; + + new_size = roundup_pow_of_two(new_size); + n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size)); + if (!n) + return -ENOMEM; + + free_pages((unsigned long) b->data, get_order(b->size)); + b->data = n; + b->size = new_size; + return 0; +} + +static int journal_read_bucket(struct bch_dev *ca, + struct journal_read_buf *buf, + struct journal_list *jlist, + unsigned bucket, u64 *seq, bool *entries_found) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct bio *bio = ja->bio; + struct jset *j = NULL; + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; + bool saw_bad = false; + int ret = 0; + + pr_debug("reading %u", bucket); + + while (offset < end) { + if (!sectors_read) { +reread: sectors_read = min_t(unsigned, + end - offset, buf->size >> 9); + + bio_reset(bio); + bio->bi_bdev = ca->disk_sb.bdev; + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = sectors_read << 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bch2_bio_map(bio, buf->data); + + ret = submit_bio_wait(bio); + + if (bch2_dev_fatal_io_err_on(ret, ca, + "journal read from sector %llu", + offset) || + bch2_meta_read_fault("journal")) + return -EIO; + + j = buf->data; + } + + ret = journal_entry_validate(c, j, offset, + end - offset, sectors_read); + switch (ret) { + case BCH_FSCK_OK: + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { + ret = journal_read_buf_realloc(buf, + vstruct_bytes(j)); + if (ret) + return ret; + } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; + sectors = c->sb.block_size; + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; + sectors = c->sb.block_size; + goto next_block; + default: + return ret; + } + + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. 
We don't need the rest of the + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) + return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + ret = journal_entry_add(c, jlist, j); + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: + *entries_found = true; + break; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: + return ret; + } + + if (le64_to_cpu(j->seq) > *seq) + *seq = le64_to_cpu(j->seq); + + sectors = vstruct_sectors(j, c->block_bits); +next_block: + pr_debug("next"); + offset += sectors; + sectors_read -= sectors; + j = ((void *) j) + (sectors << 9); + } + + return 0; +} + +static void bch2_journal_read_device(struct closure *cl) +{ +#define read_bucket(b) \ + ({ \ + bool entries_found = false; \ + ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ + &entries_found); \ + if (ret) \ + goto err; \ + __set_bit(b, bitmap); \ + entries_found; \ + }) + + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); + struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); + struct journal_read_buf buf = { NULL, 0 }; + + DECLARE_BITMAP(bitmap, ja->nr); + unsigned i, l, r; + u64 seq = 0; + int ret; + + if (!ja->nr) + goto out; + + bitmap_zero(bitmap, ja->nr); + ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + if (ret) + goto err; + + pr_debug("%u journal buckets", ja->nr); + + /* + * If the device supports discard but not secure discard, we can't do + * the fancy fibonacci hash/binary search because the live journal + * entries might not form a contiguous range: + */ + for (i = 0; i < ja->nr; i++) + read_bucket(i); + goto search_done; + + if (!blk_queue_nonrot(q)) + goto linear_scan; + + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ja->nr; i++) { + l = (i * 2654435769U) % ja->nr; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search"); +linear_scan: + for (l = find_first_zero_bit(bitmap, ja->nr); + l < ja->nr; + l = find_next_zero_bit(bitmap, ja->nr, l + 1)) + if (read_bucket(l)) + goto bsearch; + + /* no journal entries on this device? */ + if (l == ja->nr) + goto out; +bsearch: + /* Binary search */ + r = find_next_bit(bitmap, ja->nr, l + 1); + pr_debug("starting binary search, l %u r %u", l, r); + + while (l + 1 < r) { + unsigned m = (l + r) >> 1; + u64 cur_seq = seq; + + read_bucket(m); + + if (cur_seq != seq) + l = m; + else + r = m; + } + +search_done: + /* + * Find the journal bucket with the highest sequence number: + * + * If there's duplicate journal entries in multiple buckets (which + * definitely isn't supposed to happen, but...) 
- make sure to start + * cur_idx at the last of those buckets, so we don't deadlock trying to + * allocate + */ + seq = 0; + + for (i = 0; i < ja->nr; i++) + if (ja->bucket_seq[i] >= seq && + ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { + /* + * When journal_next_bucket() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + seq = ja->bucket_seq[i]; + } + + /* + * Set last_idx to indicate the entire journal is full and needs to be + * reclaimed - journal reclaim will immediately reclaim whatever isn't + * pinned when it first runs: + */ + ja->last_idx = (ja->cur_idx + 1) % ja->nr; + + /* + * Read buckets in reverse order until we stop finding more journal + * entries: + */ + for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; + i != ja->cur_idx; + i = (i + ja->nr - 1) % ja->nr) + if (!test_bit(i, bitmap) && + !read_bucket(i)) + break; +out: + free_pages((unsigned long) buf.data, get_order(buf.size)); + percpu_ref_put(&ca->io_ref); + closure_return(cl); +err: + mutex_lock(&jlist->lock); + jlist->ret = ret; + mutex_unlock(&jlist->lock); + goto out; +#undef read_bucket +} + +void bch2_journal_entries_free(struct list_head *list) +{ + + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvfree(i); + } +} + +static int journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i, + struct journal_entry_pin_list *p) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset_entry *entry; + struct journal_seq_blacklist *bl; + u64 seq; + + for_each_jset_entry_type(entry, &i->j, + JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) { + seq = le64_to_cpu(entry->_data[0]); + + bch_verbose(c, "blacklisting existing journal seq %llu", seq); + + bl = bch2_journal_seq_blacklisted_new(j, seq); + if (!bl) + return -ENOMEM; + + journal_pin_add_entry(j, p, &bl->pin, + journal_seq_blacklist_flush); + bl->written = true; + } + + return 0; +} + +static inline bool journal_has_keys(struct list_head *list) +{ + struct journal_replay *i; + struct jset_entry *entry; + struct bkey_i *k, *_n; + + list_for_each_entry(i, list, list) + for_each_jset_key(k, _n, entry, &i->j) + return true; + + return false; +} + +int bch2_journal_read(struct bch_fs *c, struct list_head *list) +{ + struct jset_entry *prio_ptrs; + struct journal_list jlist; + struct journal_replay *i; + struct jset *j; + struct journal_entry_pin_list *p; + struct bch_dev *ca; + u64 cur_seq, end_seq; + unsigned iter; + int ret = 0; + + closure_init_stack(&jlist.cl); + mutex_init(&jlist.lock); + jlist.head = list; + jlist.ret = 0; + + for_each_readable_member(ca, c, iter) { + percpu_ref_get(&ca->io_ref); + closure_call(&ca->journal.read, + bch2_journal_read_device, + system_unbound_wq, + &jlist.cl); + } + + closure_sync(&jlist.cl); + + if (jlist.ret) + return jlist.ret; + + if (list_empty(list)){ + bch_err(c, "no journal entries found"); + return BCH_FSCK_REPAIR_IMPOSSIBLE; + } + + fsck_err_on(c->sb.clean && journal_has_keys(list), c, + "filesystem marked clean but journal has keys to replay"); + + j = &list_entry(list->prev, struct journal_replay, list)->j; + + unfixable_fsck_err_on(le64_to_cpu(j->seq) - + le64_to_cpu(j->last_seq) + 1 > + c->journal.pin.size, c, + "too many journal entries open for refcount fifo"); + + c->journal.pin.back = le64_to_cpu(j->seq) - + le64_to_cpu(j->last_seq) + 1; + + atomic64_set(&c->journal.seq, le64_to_cpu(j->seq)); + c->journal.last_seq_ondisk = 
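+ /* the newest entry's last_seq gives the oldest entry that still needs replay: */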
le64_to_cpu(j->last_seq); + + BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq)); + + i = list_first_entry(list, struct journal_replay, list); + + mutex_lock(&c->journal.blacklist_lock); + + fifo_for_each_entry_ptr(p, &c->journal.pin, iter) { + u64 seq = journal_pin_seq(&c->journal, p); + + INIT_LIST_HEAD(&p->list); + + if (i && le64_to_cpu(i->j.seq) == seq) { + atomic_set(&p->count, 1); + + if (journal_seq_blacklist_read(&c->journal, i, p)) { + mutex_unlock(&c->journal.blacklist_lock); + return -ENOMEM; + } + + i = list_is_last(&i->list, list) + ? NULL + : list_next_entry(i, list); + } else { + atomic_set(&p->count, 0); + } + } + + mutex_unlock(&c->journal.blacklist_lock); + + cur_seq = last_seq(&c->journal); + end_seq = le64_to_cpu(list_last_entry(list, + struct journal_replay, list)->j.seq); + + list_for_each_entry(i, list, list) { + bool blacklisted; + + mutex_lock(&c->journal.blacklist_lock); + while (cur_seq < le64_to_cpu(i->j.seq) && + journal_seq_blacklist_find(&c->journal, cur_seq)) + cur_seq++; + + blacklisted = journal_seq_blacklist_find(&c->journal, + le64_to_cpu(i->j.seq)); + mutex_unlock(&c->journal.blacklist_lock); + + fsck_err_on(blacklisted, c, + "found blacklisted journal entry %llu", + le64_to_cpu(i->j.seq)); + + fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, + "journal entries %llu-%llu missing! (replaying %llu-%llu)", + cur_seq, le64_to_cpu(i->j.seq) - 1, + last_seq(&c->journal), end_seq); + + cur_seq = le64_to_cpu(i->j.seq) + 1; + } + + prio_ptrs = bch2_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0); + if (prio_ptrs) { + memcpy_u64s(c->journal.prio_buckets, + prio_ptrs->_data, + le16_to_cpu(prio_ptrs->u64s)); + c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s); + } +fsck_err: + return ret; +} + +void bch2_journal_mark(struct bch_fs *c, struct list_head *list) +{ + struct bkey_i *k, *n; + struct jset_entry *j; + struct journal_replay *r; + + list_for_each_entry(r, list, list) + for_each_jset_key(k, n, j, &r->j) { + enum bkey_type type = bkey_type(j->level, j->btree_id); + struct bkey_s_c k_s_c = bkey_i_to_s_c(k); + + if (btree_type_has_ptrs(type)) + bch2_btree_mark_key_initial(c, type, k_s_c); + } +} + +static bool journal_entry_is_open(struct journal *j) +{ + return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + if (!need_write_just_set && + test_bit(JOURNAL_NEED_WRITE, &j->flags)) + __bch2_time_stats_update(j->delay_time, + j->need_write_time); +#if 0 + closure_call(&j->io, journal_write, NULL, &c->cl); +#else + /* Shut sparse up: */ + closure_init(&j->io, &c->cl); + set_closure_fn(&j->io, journal_write, NULL); + journal_write(&j->io); +#endif +} + +static void __bch2_journal_next_entry(struct journal *j) +{ + struct journal_entry_pin_list pin_list, *p; + struct journal_buf *buf; + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + BUG_ON(!fifo_push(&j->pin, pin_list)); + p = &fifo_peek_back(&j->pin); + + INIT_LIST_HEAD(&p->list); + atomic_set(&p->count, 1); + + if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) { + smp_wmb(); + j->cur_pin_list = p; + } + + buf = journal_cur_buf(j); + memset(buf->has_inode, 0, sizeof(buf->has_inode)); + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(atomic64_read(&j->seq)); + buf->data->u64s = 0; + + 
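+ /* e.g. with seq 10 and 3 entries in the pin fifo, last_seq() is 8 and the back of the fifo maps to seq 8 + 2 == 10: */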
BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq)); +} + +static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf) +{ + unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); + + if (buf->nr_prio_buckets) + ret += JSET_KEYS_U64s + buf->nr_prio_buckets; + + return ret; +} + +static enum { + JOURNAL_ENTRY_ERROR, + JOURNAL_ENTRY_INUSE, + JOURNAL_ENTRY_CLOSED, + JOURNAL_UNLOCKED, +} journal_buf_switch(struct journal *j, bool need_write_just_set) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) + return JOURNAL_ENTRY_CLOSED; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return JOURNAL_ENTRY_ERROR; + + if (new.prev_buf_unwritten) + return JOURNAL_ENTRY_INUSE; + + /* + * avoid race between setting buf->data->u64s and + * journal_res_put starting write: + */ + journal_state_inc(&new); + + new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; + new.idx++; + new.prev_buf_unwritten = 1; + + BUG_ON(journal_state_count(new, new.idx)); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + journal_reclaim_fast(j); + + clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + buf = &j->buf[old.idx]; + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + buf->data->last_seq = cpu_to_le64(last_seq(j)); + + j->prev_buf_sectors = + vstruct_blocks_plus(buf->data, c->block_bits, + journal_entry_u64s_reserve(buf)) * + c->sb.block_size; + + BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + + atomic_dec_bug(&fifo_peek_back(&j->pin).count); + __bch2_journal_next_entry(j); + + cancel_delayed_work(&j->write_work); + spin_unlock(&j->lock); + + if (c->bucket_journal_seq > 1 << 14) { + c->bucket_journal_seq = 0; + bch2_bucket_seq_cleanup(c); + } + + /* ugh - might be called from __journal_res_get() under wait_event() */ + __set_current_state(TASK_RUNNING); + bch2_journal_buf_put(j, old.idx, need_write_just_set); + + return JOURNAL_UNLOCKED; +} + +void bch2_journal_halt(struct journal *j) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return; + + new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + wake_up(&j->wait); + closure_wake_up(&journal_cur_buf(j)->wait); + closure_wake_up(&journal_prev_buf(j)->wait); +} + +static unsigned journal_dev_buckets_available(struct journal *j, + struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + + /* + * Hack to avoid a deadlock during journal replay: + * journal replay might require setting a new btree + * root, which requires writing another journal entry - + * thus, if the journal is full (and this happens when + * replaying the first journal bucket's entries) we're + * screwed. 
+ * + * So don't let the journal fill up unless we're in + * replay: + */ + if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) + available = max((int) available - 2, 0); + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (ja->bucket_seq[ja->last_idx] >= last_seq(j)) + available = max((int) available - 1, 0); + + return available; +} + +/* returns number of sectors available for next journal entry: */ +static int journal_entry_sectors(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); + unsigned sectors_available = j->entry_size_max >> 9; + unsigned i, nr_online = 0, nr_devs = 0; + + lockdep_assert_held(&j->lock); + + spin_lock(&j->devs.lock); + group_for_each_dev(ca, &j->devs, i) { + unsigned buckets_required = 0; + + sectors_available = min_t(unsigned, sectors_available, + ca->mi.bucket_size); + + /* + * Note that we don't allocate the space for a journal entry + * until we write it out - thus, if we haven't started the write + * for the previous entry we have to make sure we have space for + * it too: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx)) { + if (j->prev_buf_sectors > ca->journal.sectors_free) + buckets_required++; + + if (j->prev_buf_sectors + sectors_available > + ca->journal.sectors_free) + buckets_required++; + } else { + if (j->prev_buf_sectors + sectors_available > + ca->mi.bucket_size) + buckets_required++; + + buckets_required++; + } + + if (journal_dev_buckets_available(j, ca) >= buckets_required) + nr_devs++; + nr_online++; + } + spin_unlock(&j->devs.lock); + + if (nr_online < c->opts.metadata_replicas_required) + return -EROFS; + + if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) + return 0; + + return sectors_available; +} + +/* + * should _only_ called from journal_res_get() - when we actually want a + * journal reservation - journal entry is open means journal is dirty: + */ +static int journal_entry_open(struct journal *j) +{ + struct journal_buf *buf = journal_cur_buf(j); + ssize_t u64s; + int ret = 0, sectors; + + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); + + if (!fifo_free(&j->pin)) + return 0; + + sectors = journal_entry_sectors(j); + if (sectors <= 0) + return sectors; + + j->cur_buf_sectors = sectors; + buf->nr_prio_buckets = j->nr_prio_buckets; + + u64s = (sectors << 9) / sizeof(u64); + + /* Subtract the journal header */ + u64s -= sizeof(struct jset) / sizeof(u64); + /* + * Btree roots, prio pointers don't get added until right before we do + * the write: + */ + u64s -= journal_entry_u64s_reserve(buf); + u64s = max_t(ssize_t, 0L, u64s); + + BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); + + if (u64s > le32_to_cpu(buf->data->u64s)) { + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + /* + * Must be set before marking the journal entry as open: + */ + j->cur_entry_u64s = u64s; + + do { + old.v = new.v = v; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return false; + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ret = 1; + + wake_up(&j->wait); + + if (j->res_get_blocked_start) { + __bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); + j->res_get_blocked_start = 0; + } + + mod_delayed_work(system_freezable_wq, + &j->write_work, + 
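+ /* (re)arm the delayed flush: */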
msecs_to_jiffies(j->write_delay_ms)); + } + + return ret; +} + +void bch2_journal_start(struct bch_fs *c) +{ + struct journal *j = &c->journal; + struct journal_seq_blacklist *bl; + u64 new_seq = 0; + + list_for_each_entry(bl, &j->seq_blacklist, list) + new_seq = max(new_seq, bl->seq); + + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + + while (atomic64_read(&j->seq) < new_seq) { + struct journal_entry_pin_list pin_list, *p; + + BUG_ON(!fifo_push(&j->pin, pin_list)); + p = &fifo_peek_back(&j->pin); + + INIT_LIST_HEAD(&p->list); + atomic_set(&p->count, 0); + atomic64_inc(&j->seq); + } + + /* + * journal_buf_switch() only inits the next journal entry when it + * closes an open journal entry - the very first journal entry gets + * initialized here: + */ + __bch2_journal_next_entry(j); + + /* + * Adding entries to the next journal entry before allocating space on + * disk for the next journal entry - this is ok, because these entries + * only have to go down with the next journal entry we write: + */ + list_for_each_entry(bl, &j->seq_blacklist, list) + if (!bl->written) { + bch2_journal_add_entry(journal_cur_buf(j), &bl->seq, 1, + JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED, + 0, 0); + + journal_pin_add_entry(j, + &fifo_peek_back(&j->pin), + &bl->pin, + journal_seq_blacklist_flush); + bl->written = true; + } + + spin_unlock(&j->lock); + + queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); +} + +int bch2_journal_replay(struct bch_fs *c, struct list_head *list) +{ + int ret = 0, keys = 0, entries = 0; + struct journal *j = &c->journal; + struct bkey_i *k, *_n; + struct jset_entry *entry; + struct journal_replay *i, *n; + + list_for_each_entry_safe(i, n, list, list) { + j->cur_pin_list = + &j->pin.data[((j->pin.back - 1 - + (atomic64_read(&j->seq) - + le64_to_cpu(i->j.seq))) & + j->pin.mask)]; + + for_each_jset_key(k, _n, entry, &i->j) { + struct disk_reservation disk_res; + + /* + * We might cause compressed extents to be split, so we + * need to pass in a disk_reservation: + */ + BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0)); + + ret = bch2_btree_insert(c, entry->btree_id, k, + &disk_res, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); + bch2_disk_reservation_put(c, &disk_res); + + if (ret) + goto err; + + cond_resched(); + keys++; + } + + if (atomic_dec_and_test(&j->cur_pin_list->count)) + wake_up(&j->wait); + + entries++; + } + + if (keys) { + bch2_btree_flush(c); + + /* + * Write a new journal entry _before_ we start journalling new data - + * otherwise, we could end up with btree node bsets with journal seqs + * arbitrarily far in the future vs. 
the most recently written journal + * entry on disk, if we crash before writing the next journal entry: + */ + ret = bch2_journal_meta(&c->journal); + if (ret) + goto err; + } + + bch_info(c, "journal replay done, %i keys in %i entries, seq %llu", + keys, entries, (u64) atomic64_read(&j->seq)); + + bch2_journal_set_replay_done(&c->journal); +err: + if (ret) + bch_err(c, "journal replay error: %d", ret); + + bch2_journal_entries_free(list); + + return ret; +} + +#if 0 +/* + * Allocate more journal space at runtime - not currently making use of it, but + * the code works: + */ +static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + struct journal *j = &c->journal; + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets; + struct disk_reservation disk_res = { 0, 0 }; + struct closure cl; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + int ret = 0; + + closure_init_stack(&cl); + + /* don't handle reducing nr of buckets yet: */ + if (nr <= ja->nr) + return 0; + + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + */ + + if (bch2_disk_reservation_get(c, &disk_res, + (nr - ja->nr) << ca->bucket_bits, 0)) + return -ENOSPC; + + mutex_lock(&c->sb_lock); + + ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); + if (!new_buckets || !new_bucket_seq) + goto err; + + journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + goto err; + + spin_lock(&j->lock); + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + + while (ja->nr < nr) { + /* must happen under journal lock, to avoid racing with gc: */ + u64 b = bch2_bucket_alloc(ca, RESERVE_NONE); + if (!b) { + if (!closure_wait(&c->freelist_wait, &cl)) { + spin_unlock(&j->lock); + closure_sync(&cl); + spin_lock(&j->lock); + } + continue; + } + + bch2_mark_metadata_bucket(ca, &ca->buckets[b], + BUCKET_JOURNAL, false); + bch2_mark_alloc_bucket(ca, &ca->buckets[b], false); + + memmove(ja->buckets + ja->last_idx + 1, + ja->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(ja->bucket_seq + ja->last_idx + 1, + ja->bucket_seq + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(journal_buckets->buckets + ja->last_idx + 1, + journal_buckets->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + + ja->buckets[ja->last_idx] = b; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + + if (ja->last_idx < ja->nr) { + if (ja->cur_idx >= ja->last_idx) + ja->cur_idx++; + ja->last_idx++; + } + ja->nr++; + + } + spin_unlock(&j->lock); + + BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi)); + + bch2_write_super(c); + + ret = 0; +err: + mutex_unlock(&c->sb_lock); + + kfree(new_bucket_seq); + kfree(new_buckets); + bch2_disk_reservation_put(c, &disk_res); + + return ret; +} +#endif + +int bch2_dev_journal_alloc(struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets; + unsigned i, nr; + u64 b, *p; + + if (dynamic_fault("bcachefs:add:journal_alloc")) + 
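+ /* fault injection point: behave as if the allocation failed */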
return -ENOMEM; + + /* + * clamp journal size to 1024 buckets or 512MB (in sectors), whichever + * is smaller: + */ + nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, + BCH_JOURNAL_BUCKETS_MIN, + min(1 << 10, + (1 << 20) / ca->mi.bucket_size)); + + p = krealloc(ja->bucket_seq, nr * sizeof(u64), + GFP_KERNEL|__GFP_ZERO); + if (!p) + return -ENOMEM; + + ja->bucket_seq = p; + + p = krealloc(ja->buckets, nr * sizeof(u64), + GFP_KERNEL|__GFP_ZERO); + if (!p) + return -ENOMEM; + + ja->buckets = p; + + journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + return -ENOMEM; + + for (i = 0, b = ca->mi.first_bucket; + i < nr && b < ca->mi.nbuckets; b++) { + if (!is_available_bucket(ca->buckets[b].mark)) + continue; + + bch2_mark_metadata_bucket(ca, &ca->buckets[b], + BUCKET_JOURNAL, true); + ja->buckets[i] = b; + journal_buckets->buckets[i] = cpu_to_le64(b); + i++; + } + + if (i < nr) + return -ENOSPC; + + BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi)); + + ja->nr = nr; + + return 0; +} + +/* Journalling */ + +/** + * journal_reclaim_fast - do the fast part of journal reclaim + * + * Called from IO submission context, does not block. Cleans up after btree + * write completions by advancing the journal pin and each cache's last_idx, + * kicking off discards and background reclaim as necessary. + */ +static void journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + wake_up(&j->wait); +} + +/* + * Journal entry pinning - machinery for holding a reference on a given journal + * entry, marking it as dirty: + */ + +static inline void __journal_pin_add(struct journal *j, + struct journal_entry_pin_list *pin_list, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + BUG_ON(journal_pin_active(pin)); + + atomic_inc(&pin_list->count); + pin->pin_list = pin_list; + pin->flush = flush_fn; + + if (flush_fn) + list_add(&pin->list, &pin_list->list); + else + INIT_LIST_HEAD(&pin->list); +} + +static void journal_pin_add_entry(struct journal *j, + struct journal_entry_pin_list *pin_list, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock_irq(&j->pin_lock); + __journal_pin_add(j, pin_list, pin, flush_fn); + spin_unlock_irq(&j->pin_lock); +} + +void bch2_journal_pin_add(struct journal *j, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock_irq(&j->pin_lock); + __journal_pin_add(j, j->cur_pin_list, pin, flush_fn); + spin_unlock_irq(&j->pin_lock); +} + +static inline bool __journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + struct journal_entry_pin_list *pin_list = pin->pin_list; + + pin->pin_list = NULL; + + /* journal_reclaim_work() might have already taken us off the list */ + if (!list_empty_careful(&pin->list)) + list_del_init(&pin->list); + + return atomic_dec_and_test(&pin_list->count); +} + +void bch2_journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + unsigned long flags; + bool wakeup; + + if (!journal_pin_active(pin)) + return; + + spin_lock_irqsave(&j->pin_lock, flags); + wakeup = __journal_pin_drop(j, pin); + 
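+ /* true if we dropped the last pin keeping that journal entry dirty */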
spin_unlock_irqrestore(&j->pin_lock, flags); + + /* + * Unpinning a journal entry may make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + * + * Nested irqsave is expensive, don't do the wakeup with lock held: + */ + if (wakeup) + wake_up(&j->wait); +} + +void bch2_journal_pin_add_if_older(struct journal *j, + struct journal_entry_pin *src_pin, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock_irq(&j->pin_lock); + + if (journal_pin_active(src_pin) && + (!journal_pin_active(pin) || + fifo_entry_idx(&j->pin, src_pin->pin_list) < + fifo_entry_idx(&j->pin, pin->pin_list))) { + if (journal_pin_active(pin)) + __journal_pin_drop(j, pin); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); + } + + spin_unlock_irq(&j->pin_lock); +} + +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, u64 seq_to_flush) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; + unsigned iter; + + /* so we don't iterate over empty fifo entries below: */ + if (!atomic_read(&fifo_peek_front(&j->pin).count)) { + spin_lock(&j->lock); + journal_reclaim_fast(j); + spin_unlock(&j->lock); + } + + spin_lock_irq(&j->pin_lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { + if (journal_pin_seq(j, pin_list) > seq_to_flush) + break; + + ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list); + if (ret) { + /* must be list_del_init(), see bch2_journal_pin_drop() */ + list_del_init(&ret->list); + break; + } + } + spin_unlock_irq(&j->pin_lock); + + return ret; +} + +static bool journal_has_pins(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + journal_reclaim_fast(j); + ret = fifo_used(&j->pin) > 1 || + atomic_read(&fifo_peek_front(&j->pin).count) > 1; + spin_unlock(&j->lock); + + return ret; +} + +void bch2_journal_flush_pins(struct journal *j) +{ + struct journal_entry_pin *pin; + + while ((pin = journal_get_next_pin(j, U64_MAX))) + pin->flush(j, pin); + + wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j)); +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->nr && + (ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); + spin_unlock(&j->lock); + + return ret; +} + +/** + * journal_reclaim_work - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
+ */ +static void journal_reclaim_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, journal.reclaim_work); + struct journal *j = &c->journal; + struct bch_dev *ca; + struct journal_entry_pin *pin; + u64 seq_to_flush = 0; + unsigned iter, bucket_to_flush; + unsigned long next_flush; + bool reclaim_lock_held = false, need_flush; + + /* + * Advance last_idx to point to the oldest journal entry containing + * btree node updates that have not yet been written out + */ + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (should_discard_bucket(j, ja)) { + if (!reclaim_lock_held) { + /* + * ugh: + * might be called from __journal_res_get() + * under wait_event() - have to go back to + * TASK_RUNNING before doing something that + * would block, but only if we're doing work: + */ + __set_current_state(TASK_RUNNING); + + mutex_lock(&j->reclaim_lock); + reclaim_lock_held = true; + /* recheck under reclaim_lock: */ + continue; + } + + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->last_idx]), + ca->mi.bucket_size, GFP_NOIO, 0); + + spin_lock(&j->lock); + ja->last_idx = (ja->last_idx + 1) % ja->nr; + spin_unlock(&j->lock); + + wake_up(&j->wait); + } + + /* + * Write out enough btree nodes to free up 50% journal + * buckets + */ + spin_lock(&j->lock); + bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; + seq_to_flush = max_t(u64, seq_to_flush, + ja->bucket_seq[bucket_to_flush]); + spin_unlock(&j->lock); + } + + if (reclaim_lock_held) + mutex_unlock(&j->reclaim_lock); + + /* Also flush if the pin fifo is more than half full */ + seq_to_flush = max_t(s64, seq_to_flush, + (s64) atomic64_read(&j->seq) - + (j->pin.size >> 1)); + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + need_flush = time_after(jiffies, next_flush); + + while ((pin = journal_get_next_pin(j, need_flush + ? U64_MAX + : seq_to_flush))) { + __set_current_state(TASK_RUNNING); + pin->flush(j, pin); + need_flush = false; + + j->last_flushed = jiffies; + } + + if (!test_bit(BCH_FS_RO, &c->flags)) + queue_delayed_work(system_freezable_wq, &j->reclaim_work, + msecs_to_jiffies(j->reclaim_delay_ms)); +} + +/** + * journal_next_bucket - move on to the next journal bucket if possible + */ +static int journal_write_alloc(struct journal *j, unsigned sectors) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); + struct bch_extent_ptr *ptr; + struct journal_device *ja; + struct bch_dev *ca; + bool swapped; + unsigned i, replicas, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + + spin_lock(&j->lock); + + /* + * Drop any pointers to devices that have been removed, are no longer + * empty, or filled up their current journal bucket: + * + * Note that a device may have had a small amount of free space (perhaps + * one sector) that wasn't enough for the smallest possible journal + * entry - that's why we drop pointers to devices <= current free space, + * i.e. whichever device was limiting the current journal entry size. 
+ */ + extent_for_each_ptr_backwards(e, ptr) { + ca = c->devs[ptr->dev]; + + if (ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors) + __bch2_extent_drop_ptr(e, ptr); + else + ca->journal.sectors_free -= sectors; + } + + replicas = bch2_extent_nr_ptrs(e.c); + + spin_lock(&j->devs.lock); + + /* Sort by tier: */ + do { + swapped = false; + + for (i = 0; i + 1 < j->devs.nr; i++) + if (j->devs.d[i + 0].dev->mi.tier > + j->devs.d[i + 1].dev->mi.tier) { + swap(j->devs.d[i], j->devs.d[i + 1]); + swapped = true; + } + } while (swapped); + + /* + * Pick devices for next journal write: + * XXX: sort devices by free journal space? + */ + group_for_each_dev(ca, &j->devs, i) { + ja = &ca->journal; + + if (replicas >= replicas_want) + break; + + /* + * Check that we can use this device, and aren't already using + * it: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx) || + !journal_dev_buckets_available(j, ca) || + sectors > ca->mi.bucket_size) + continue; + + ja->sectors_free = ca->mi.bucket_size - sectors; + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq); + + extent_ptr_append(bkey_i_to_extent(&j->key), + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]), + .dev = ca->dev_idx, + }); + replicas++; + } + spin_unlock(&j->devs.lock); + + j->prev_buf_sectors = 0; + spin_unlock(&j->lock); + + if (replicas < c->opts.metadata_replicas_required) + return -EROFS; + + BUG_ON(!replicas); + + return 0; +} + +static void journal_write_compact(struct jset *jset) +{ + struct jset_entry *i, *next, *prev = NULL; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) && + JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? 
vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +} + +static void journal_write_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + + if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "journal write") || + bch2_meta_write_fault("journal")) + bch2_journal_halt(j); + + closure_put(&j->io); + percpu_ref_put(&ca->io_ref); +} + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct journal_buf *w = journal_prev_buf(j); + + j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); + + __bch2_time_stats_update(j->write_time, j->write_start_time); + + BUG_ON(!j->reservations.prev_buf_unwritten); + atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, + &j->reservations.counter); + + /* + * XXX: this is racy, we could technically end up doing the wake up + * after the journal_buf struct has been reused for the next write + * (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that + * are waiting on the _next_ write, not this one. + * + * The wake up can't come before, because journal_flush_seq_async() is + * looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal + * write that was already in flight. + * + * The right fix is to use a lock here, but using j.lock here means it + * has to be a spin_lock_irqsave() lock which then requires propagating + * the irq()ness to other locks and it's all kinds of nastiness. + */ + + closure_wake_up(&w->wait); + wake_up(&j->wait); + + /* + * Updating last_seq_ondisk may let journal_reclaim_work() discard more + * buckets: + */ + mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); +} + +static void journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_prev_buf(j); + struct jset *jset = w->data; + struct bio *bio; + struct bch_extent_ptr *ptr; + unsigned i, sectors, bytes; + + j->write_start_time = local_clock(); + + bch2_journal_add_prios(j, w); + + mutex_lock(&c->btree_root_lock); + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = &c->btree_roots[i]; + + if (r->alive) + bch2_journal_add_btree_root(w, i, &r->key, r->level); + } + mutex_unlock(&c->btree_root_lock); + + journal_write_compact(jset); + + jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand); + jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand); + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(BCACHE_JSET_VERSION); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + sectors = vstruct_sectors(jset, c->block_bits); + BUG_ON(sectors > j->prev_buf_sectors); + + bytes = vstruct_bytes(w->data); + memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + + if (journal_write_alloc(j, sectors)) { + bch2_journal_halt(j); + bch_err(c, "Unable to allocate journal write"); + bch2_fatal_error(c); + closure_return_with_destructor(cl, journal_write_done); + } + + bch2_check_mark_super(c, &j->key, true); + + /* + * XXX: we really should just disable the entire journal in nochanges + * mode + */ + if (c->opts.nochanges) + goto 
no_io; + + extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) { + ca = c->devs[ptr->dev]; + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + atomic64_add(sectors, &ca->meta_sectors_written); + + bio = ca->journal.bio; + bio_reset(bio); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_bdev = ca->disk_sb.bdev; + bio->bi_iter.bi_size = sectors << 9; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bio_set_op_attrs(bio, REQ_OP_WRITE, + REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); + bch2_bio_map(bio, jset); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + } + + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && + !bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio); + bio->bi_bdev = ca->disk_sb.bdev; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + closure_bio_submit(bio, cl); + } + +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) + ptr->offset += sectors; + + closure_return_with_destructor(cl, journal_write_done); +} + +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(to_delayed_work(work), + struct journal, write_work); + spin_lock(&j->lock); + set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED) + spin_unlock(&j->lock); +} + +/* + * Given an inode number, if that inode number has data in the journal that + * hasn't yet been flushed, return the journal sequence number that needs to be + * flushed: + */ +u64 bch2_inode_journal_seq(struct journal *j, u64 inode) +{ + size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); + u64 seq = 0; + + if (!test_bit(h, j->buf[0].has_inode) && + !test_bit(h, j->buf[1].has_inode)) + return 0; + + spin_lock(&j->lock); + if (test_bit(h, journal_cur_buf(j)->has_inode)) + seq = atomic64_read(&j->seq); + else if (test_bit(h, journal_prev_buf(j)->has_inode)) + seq = atomic64_read(&j->seq) - 1; + spin_unlock(&j->lock); + + return seq; +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int ret; +retry: + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) + return ret; + + spin_lock(&j->lock); + /* + * Recheck after taking the lock, so we don't race with another thread + * that just did journal_entry_open() and call journal_entry_close() + * unnecessarily + */ + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) { + spin_unlock(&j->lock); + return 1; + } + + /* + * Ok, no more room in the current journal entry - try to start a new + * one: + */ + switch (journal_buf_switch(j, false)) { + case JOURNAL_ENTRY_ERROR: + spin_unlock(&j->lock); + return -EIO; + case JOURNAL_ENTRY_INUSE: + /* haven't finished writing out the previous one: */ + spin_unlock(&j->lock); + trace_journal_entry_full(c); + goto blocked; + case JOURNAL_ENTRY_CLOSED: + break; + case JOURNAL_UNLOCKED: + goto retry; + } + + /* We now have a new, closed journal buf - see if we can open it: */ + ret = journal_entry_open(j); + spin_unlock(&j->lock); + + if (ret < 0) + return ret; + if (ret) + goto retry; + + /* Journal's full, we have to wait */ + + /* + * Direct reclaim - 
can't rely on reclaim from work item
+	 * due to freezing..
+	 */
+	journal_reclaim_work(&j->reclaim_work.work);
+
+	trace_journal_full(c);
+blocked:
+	if (!j->res_get_blocked_start)
+		j->res_get_blocked_start = local_clock() ?: 1;
+	return 0;
+}
+
+/*
+ * Essentially the entry function to the journaling code. When bcachefs is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * Journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the next
+ * write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ */
+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+				  unsigned u64s_min, unsigned u64s_max)
+{
+	int ret;
+
+	wait_event(j->wait,
+		   (ret = __journal_res_get(j, res, u64s_min,
+					    u64s_max)));
+	return ret < 0 ? ret : 0;
+}
+
+void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
+{
+	spin_lock(&j->lock);
+
+	BUG_ON(seq > atomic64_read(&j->seq));
+
+	if (bch2_journal_error(j)) {
+		spin_unlock(&j->lock);
+		return;
+	}
+
+	if (seq == atomic64_read(&j->seq)) {
+		if (!closure_wait(&journal_cur_buf(j)->wait, parent))
+			BUG();
+	} else if (seq + 1 == atomic64_read(&j->seq) &&
+		   j->reservations.prev_buf_unwritten) {
+		if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+			BUG();
+
+		smp_mb();
+
+		/* check if raced with write completion (or failure) */
+		if (!j->reservations.prev_buf_unwritten ||
+		    bch2_journal_error(j))
+			closure_wake_up(&journal_prev_buf(j)->wait);
+	}
+
+	spin_unlock(&j->lock);
+}
+
+void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
+{
+	spin_lock(&j->lock);
+
+	BUG_ON(seq > atomic64_read(&j->seq));
+
+	if (bch2_journal_error(j)) {
+		spin_unlock(&j->lock);
+		return;
+	}
+
+	if (seq == atomic64_read(&j->seq)) {
+		bool set_need_write = false;
+
+		if (parent &&
+		    !closure_wait(&journal_cur_buf(j)->wait, parent))
+			BUG();
+
+		if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+			j->need_write_time = local_clock();
+			set_need_write = true;
+		}
+
+		switch (journal_buf_switch(j, set_need_write)) {
+		case JOURNAL_ENTRY_ERROR:
+			if (parent)
+				closure_wake_up(&journal_cur_buf(j)->wait);
+			break;
+		case JOURNAL_ENTRY_CLOSED:
+			/*
+			 * Journal entry hasn't been opened yet, but caller
+			 * claims it has something (seq == j->seq):
+			 */
+			BUG();
+		case JOURNAL_ENTRY_INUSE:
+			break;
+		case JOURNAL_UNLOCKED:
+			return;
+		}
+	} else if (parent &&
+		   seq + 1 == atomic64_read(&j->seq) &&
+		   j->reservations.prev_buf_unwritten) {
+		if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+			BUG();
+
+		smp_mb();
+
+		/* check if raced with write completion (or failure) */
+		if (!j->reservations.prev_buf_unwritten ||
+		    bch2_journal_error(j))
+			closure_wake_up(&journal_prev_buf(j)->wait);
+	}
+
+	spin_unlock(&j->lock);
+}
+
+int bch2_journal_flush_seq(struct journal *j, u64 seq)
+{
+	struct closure cl;
+	u64 start_time = local_clock();
+
+	closure_init_stack(&cl);
+	bch2_journal_flush_seq_async(j, seq, &cl);
+	closure_sync(&cl);
+
+	bch2_time_stats_update(j->flush_seq_time, start_time);
+
+	return bch2_journal_error(j);
+}
+
+void bch2_journal_meta_async(struct journal *j, struct closure *parent)
+{
+	struct journal_res res;
+	unsigned u64s = jset_u64s(0);
+
+	memset(&res, 0, sizeof(res));
+
+	bch2_journal_res_get(j, &res, u64s, u64s);
+	bch2_journal_res_put(j, &res);
+
+	bch2_journal_flush_seq_async(j, res.seq, parent);
+}
+
+int
bch2_journal_meta(struct journal *j)
+{
+	struct journal_res res;
+	unsigned u64s = jset_u64s(0);
+	int ret;
+
+	memset(&res, 0, sizeof(res));
+
+	ret = bch2_journal_res_get(j, &res, u64s, u64s);
+	if (ret)
+		return ret;
+
+	bch2_journal_res_put(j, &res);
+
+	return bch2_journal_flush_seq(j, res.seq);
+}
+
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+	u64 seq, journal_seq;
+
+	spin_lock(&j->lock);
+	journal_seq = atomic64_read(&j->seq);
+
+	if (journal_entry_is_open(j)) {
+		seq = journal_seq;
+	} else if (journal_seq) {
+		seq = journal_seq - 1;
+	} else {
+		spin_unlock(&j->lock);
+		return;
+	}
+	spin_unlock(&j->lock);
+
+	bch2_journal_flush_seq_async(j, seq, parent);
+}
+
+int bch2_journal_flush(struct journal *j)
+{
+	u64 seq, journal_seq;
+
+	spin_lock(&j->lock);
+	journal_seq = atomic64_read(&j->seq);
+
+	if (journal_entry_is_open(j)) {
+		seq = journal_seq;
+	} else if (journal_seq) {
+		seq = journal_seq - 1;
+	} else {
+		spin_unlock(&j->lock);
+		return 0;
+	}
+	spin_unlock(&j->lock);
+
+	return bch2_journal_flush_seq(j, seq);
+}
+
+ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
+{
+	union journal_res_state *s = &j->reservations;
+	struct bch_dev *ca;
+	unsigned iter;
+	ssize_t ret = 0;
+
+	rcu_read_lock();
+	spin_lock(&j->lock);
+
+	ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+			 "active journal entries:\t%zu\n"
+			 "seq:\t\t\t%llu\n"
+			 "last_seq:\t\t%llu\n"
+			 "last_seq_ondisk:\t%llu\n"
+			 "reservation count:\t%u\n"
+			 "reservation offset:\t%u\n"
+			 "current entry u64s:\t%u\n"
+			 "io in flight:\t\t%i\n"
+			 "need write:\t\t%i\n"
+			 "dirty:\t\t\t%i\n"
+			 "replay done:\t\t%i\n",
+			 fifo_used(&j->pin),
+			 (u64) atomic64_read(&j->seq),
+			 last_seq(j),
+			 j->last_seq_ondisk,
+			 journal_state_count(*s, s->idx),
+			 s->cur_entry_offset,
+			 j->cur_entry_u64s,
+			 s->prev_buf_unwritten,
+			 test_bit(JOURNAL_NEED_WRITE, &j->flags),
+			 journal_entry_is_open(j),
+			 test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+	spin_lock(&j->devs.lock);
+	group_for_each_dev(ca, &j->devs, iter) {
+		struct journal_device *ja = &ca->journal;
+
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+				 "dev %u:\n"
+				 "\tnr\t\t%u\n"
+				 "\tcur_idx\t\t%u (seq %llu)\n"
+				 "\tlast_idx\t%u (seq %llu)\n",
+				 iter, ja->nr,
+				 ja->cur_idx, ja->bucket_seq[ja->cur_idx],
+				 ja->last_idx, ja->bucket_seq[ja->last_idx]);
+	}
+	spin_unlock(&j->devs.lock);
+
+	spin_unlock(&j->lock);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static bool bch2_journal_writing_to_device(struct bch_dev *ca)
+{
+	struct journal *j = &ca->fs->journal;
+	bool ret;
+
+	spin_lock(&j->lock);
+	ret = bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key),
+				     ca->dev_idx);
+	spin_unlock(&j->lock);
+
+	return ret;
+}
+
+/*
+ * This assumes that ca has already been marked read-only so that
+ * journal_next_bucket won't pick buckets out of ca any more.
+ * Hence, if the journal is not currently pointing to ca, there
+ * will be no new writes to journal entries in ca after all the
+ * pending ones have been flushed to disk.
+ *
+ * If the journal is being written to ca, write a new record, and
+ * journal_next_bucket will notice that the device is no longer
+ * writeable and pick a new set of devices to write to.
+ */ + +int bch2_journal_move(struct bch_dev *ca) +{ + u64 last_flushed_seq; + struct journal_device *ja = &ca->journal; + struct bch_fs *c = ca->fs; + struct journal *j = &c->journal; + unsigned i; + int ret = 0; /* Success */ + + if (bch2_journal_writing_to_device(ca)) { + /* + * bch_journal_meta will write a record and we'll wait + * for the write to complete. + * Actually writing the journal (journal_write_locked) + * will call journal_next_bucket which notices that the + * device is no longer writeable, and picks a new one. + */ + bch2_journal_meta(j); + BUG_ON(bch2_journal_writing_to_device(ca)); + } + + /* + * Flush all btree updates to backing store so that any + * journal entries written to ca become stale and are no + * longer needed. + */ + + /* + * XXX: switch to normal journal reclaim machinery + */ + bch2_btree_flush(c); + + /* + * Force a meta-data journal entry to be written so that + * we have newer journal entries in devices other than ca, + * and wait for the meta data write to complete. + */ + bch2_journal_meta(j); + + /* + * Verify that we no longer need any of the journal entries in + * the device + */ + spin_lock(&j->lock); + last_flushed_seq = last_seq(j); + spin_unlock(&j->lock); + + for (i = 0; i < ja->nr; i += 1) + BUG_ON(ja->bucket_seq[i] > last_flushed_seq); + + return ret; +} + +void bch2_fs_journal_stop(struct journal *j) +{ + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return; + + /* + * Empty out the journal by first flushing everything pinning existing + * journal entries, then force a brand new empty journal entry to be + * written: + */ + bch2_journal_flush_pins(j); + bch2_journal_flush_async(j, NULL); + bch2_journal_meta(j); + + cancel_delayed_work_sync(&j->write_work); + cancel_delayed_work_sync(&j->reclaim_work); +} + +void bch2_dev_journal_exit(struct bch_dev *ca) +{ + kfree(ca->journal.bio); + kfree(ca->journal.buckets); + kfree(ca->journal.bucket_seq); + + ca->journal.bio = NULL; + ca->journal.buckets = NULL; + ca->journal.bucket_seq = NULL; +} + +int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(sb); + unsigned i, journal_entry_pages; + + journal_entry_pages = + DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), + PAGE_SECTORS); + + ja->nr = bch2_nr_journal_buckets(journal_buckets); + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) + return -ENOMEM; + + ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages); + if (!ca->journal.bio) + return -ENOMEM; + + ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->buckets) + return -ENOMEM; + + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + + return 0; +} + +void bch2_fs_journal_exit(struct journal *j) +{ + unsigned order = get_order(j->entry_size_max); + + free_pages((unsigned long) j->buf[1].data, order); + free_pages((unsigned long) j->buf[0].data, order); + free_fifo(&j->pin); +} + +int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max) +{ + static struct lock_class_key res_key; + unsigned order = get_order(entry_size_max); + + spin_lock_init(&j->lock); + spin_lock_init(&j->pin_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); + mutex_init(&j->blacklist_lock); + INIT_LIST_HEAD(&j->seq_blacklist); + spin_lock_init(&j->devs.lock); + 
mutex_init(&j->reclaim_lock);
+
+	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+	j->entry_size_max	= entry_size_max;
+	j->write_delay_ms	= 100;
+	j->reclaim_delay_ms	= 100;
+
+	bkey_extent_init(&j->key);
+
+	atomic64_set(&j->reservations.counter,
+		((union journal_res_state)
+		 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	    !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+	    !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
new file mode 100644
index 00000000..f5fc465a
--- /dev/null
+++ b/libbcachefs/journal.h
@@ -0,0 +1,373 @@
+#ifndef _BCACHE_JOURNAL_H
+#define _BCACHE_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in the same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal
+ * write to complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after 100 ms have elapsed, by default (the write_delay_ms
+ * field in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
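+ *
+ * As an illustration (a sketch, not the literal on-disk layout), one
+ * journal entry might carry keys for several btrees plus the roots:
+ *
+ *	struct jset { .seq = 1234, .last_seq = 1230, ... }
+ *	  jset_entry: JOURNAL_ENTRY_BTREE_KEYS, btree_id = BTREE_ID_EXTENTS
+ *	  jset_entry: JOURNAL_ENTRY_BTREE_KEYS, btree_id = BTREE_ID_INODES
+ *	  jset_entry: one btree root entry per live btree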
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->bucket_seq) of, for each journal bucket, the highest sequence
+ * number of any journal entry it contains. Then, by comparing that against
+ * last_seq we can determine whether that journal bucket contains dirty journal
+ * entries or not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle both
+ * cases the same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
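+ *
+ * A sketch of how the pin API declared below ties into this (b and
+ * btree_node_flush() are hypothetical stand-ins for the real btree
+ * update path, which writes out the node holding the pin):
+ *
+ *	bch2_journal_pin_add(j, &b->journal_pin, btree_node_flush);
+ *	...
+ *	bch2_journal_pin_drop(j, &b->journal_pin);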
+ */ + +#include <linux/hash.h> + +#include "journal_types.h" + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + struct jset j; +}; + +#define JOURNAL_PIN ((32 * 1024) - 1) + +static inline bool journal_pin_active(struct journal_entry_pin *pin) +{ + return pin->pin_list != NULL; +} + +void bch2_journal_pin_add(struct journal *, struct journal_entry_pin *, + journal_pin_flush_fn); +void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); +void bch2_journal_pin_add_if_older(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); +void bch2_journal_flush_pins(struct journal *); + +struct closure; +struct bch_fs; +struct keylist; + +struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, + enum btree_id, unsigned *); + +int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); + +u64 bch2_inode_journal_seq(struct journal *, u64); + +static inline int journal_state_count(union journal_res_state s, int idx) +{ + return idx == 0 ? s.buf0_count : s.buf1_count; +} + +static inline void journal_state_inc(union journal_res_state *s) +{ + s->buf0_count += s->idx == 0; + s->buf1_count += s->idx == 1; +} + +static inline void bch2_journal_set_has_inode(struct journal_buf *buf, u64 inum) +{ + set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode); +} + +/* + * Amount of space that will be taken up by some keys in the journal (i.e. + * including the jset header) + */ +static inline unsigned jset_u64s(unsigned u64s) +{ + return u64s + sizeof(struct jset_entry) / sizeof(u64); +} + +static inline void bch2_journal_add_entry_at(struct journal_buf *buf, + const void *data, size_t u64s, + unsigned type, enum btree_id id, + unsigned level, unsigned offset) +{ + struct jset_entry *entry = vstruct_idx(buf->data, offset); + + entry->u64s = cpu_to_le16(u64s); + entry->btree_id = id; + entry->level = level; + entry->flags = 0; + SET_JOURNAL_ENTRY_TYPE(entry, type); + + memcpy_u64s(entry->_data, data, u64s); +} + +static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, + enum btree_id id, const struct bkey_i *k) +{ + struct journal_buf *buf = &j->buf[res->idx]; + unsigned actual = jset_u64s(k->k.u64s); + + EBUG_ON(!res->ref); + BUG_ON(actual > res->u64s); + + bch2_journal_set_has_inode(buf, k->k.p.inode); + + bch2_journal_add_entry_at(buf, k, k->k.u64s, + JOURNAL_ENTRY_BTREE_KEYS, id, + 0, res->offset); + + res->offset += actual; + res->u64s -= actual; +} + +void bch2_journal_buf_put_slowpath(struct journal *, bool); + +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, + bool need_write_just_set) +{ + union journal_res_state s; + + s.v = atomic64_sub_return(((union journal_res_state) { + .buf0_count = idx == 0, + .buf1_count = idx == 1, + }).v, &j->reservations.counter); + + EBUG_ON(s.idx != idx && !s.prev_buf_unwritten); + + /* + * Do not initiate a journal write if the journal is in an error state + * (previous journal entry write may have failed) + */ + if (s.idx != idx && + !journal_state_count(s, idx) && + s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL) + bch2_journal_buf_put_slowpath(j, need_write_just_set); +} + +/* + * This function releases the journal write structure so other threads can + * then proceed to add their keys as well. 
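+ *
+ * The full reservation lifecycle, sketched (error handling elided; this
+ * mirrors what bch2_journal_meta_async() does with an empty reservation):
+ *
+ *	struct journal_res res;
+ *	unsigned u64s = jset_u64s(k->k.u64s);
+ *
+ *	memset(&res, 0, sizeof(res));
+ *	bch2_journal_res_get(j, &res, u64s, u64s);
+ *	bch2_journal_add_keys(j, &res, btree_id, k);
+ *	bch2_journal_res_put(j, &res);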
+ */ +static inline void bch2_journal_res_put(struct journal *j, + struct journal_res *res) +{ + if (!res->ref) + return; + + lock_release(&j->res_map, 0, _RET_IP_); + + while (res->u64s) { + bch2_journal_add_entry_at(&j->buf[res->idx], NULL, 0, + JOURNAL_ENTRY_BTREE_KEYS, + 0, 0, res->offset); + res->offset += jset_u64s(0); + res->u64s -= jset_u64s(0); + } + + bch2_journal_buf_put(j, res->idx, false); + + res->ref = 0; +} + +int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + unsigned, unsigned); + +static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, + unsigned u64s_min, + unsigned u64s_max) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + + /* + * Check if there is still room in the current journal + * entry: + */ + if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s) + return 0; + + res->offset = old.cur_entry_offset; + res->u64s = min(u64s_max, j->cur_entry_u64s - + old.cur_entry_offset); + + journal_state_inc(&new); + new.cur_entry_offset += res->u64s; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + res->ref = true; + res->idx = new.idx; + res->seq = le64_to_cpu(j->buf[res->idx].data->seq); + return 1; +} + +static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + int ret; + + EBUG_ON(res->ref); + EBUG_ON(u64s_max < u64s_min); + + if (journal_res_get_fast(j, res, u64s_min, u64s_max)) + goto out; + + ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max); + if (ret) + return ret; +out: + lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); + EBUG_ON(!res->ref); + return 0; +} + +void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); +void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +void bch2_journal_flush_async(struct journal *, struct closure *); +void bch2_journal_meta_async(struct journal *, struct closure *); + +int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush(struct journal *); +int bch2_journal_meta(struct journal *); + +void bch2_journal_halt(struct journal *); + +static inline int bch2_journal_error(struct journal *j) +{ + return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL + ? -EIO : 0; +} + +static inline bool journal_flushes_device(struct bch_dev *ca) +{ + return true; +} + +void bch2_journal_start(struct bch_fs *); +void bch2_journal_mark(struct bch_fs *, struct list_head *); +void bch2_journal_entries_free(struct list_head *); +int bch2_journal_read(struct bch_fs *, struct list_head *); +int bch2_journal_replay(struct bch_fs *, struct list_head *); + +static inline void bch2_journal_set_replay_done(struct journal *j) +{ + spin_lock(&j->lock); + BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + + set_bit(JOURNAL_REPLAY_DONE, &j->flags); + j->cur_pin_list = &fifo_peek_back(&j->pin); + spin_unlock(&j->lock); +} + +ssize_t bch2_journal_print_debug(struct journal *, char *); + +int bch2_dev_journal_alloc(struct bch_dev *); + +static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +{ + return j + ? 
(__le64 *) vstruct_end(&j->field) - j->buckets
+		: 0;
+}
+
+int bch2_journal_move(struct bch_dev *);
+
+void bch2_fs_journal_stop(struct journal *);
+void bch2_dev_journal_exit(struct bch_dev *);
+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
+void bch2_fs_journal_exit(struct journal *);
+int bch2_fs_journal_init(struct journal *, unsigned);
+
+#endif /* _BCACHE_JOURNAL_H */
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
new file mode 100644
index 00000000..ebc340ad
--- /dev/null
+++ b/libbcachefs/journal_types.h
@@ -0,0 +1,242 @@
+#ifndef _BCACHE_JOURNAL_TYPES_H
+#define _BCACHE_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "fifo.h"
+
+struct journal_res;
+
+/*
+ * We put two of these in struct journal; we use them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_buf {
+	struct jset		*data;
+	struct closure_waitlist	wait;
+
+	/*
+	 * ugh, prio_buckets are stupid - need to convert them to new
+	 * transaction machinery when it arrives
+	 */
+	unsigned		nr_prio_buckets;
+
+	/* bloom filter: */
+	unsigned long		has_inode[1024 / sizeof(unsigned long)];
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+struct journal_entry_pin_list {
+	struct list_head	list;
+	atomic_t		count;
+};
+
+struct journal;
+struct journal_entry_pin;
+typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *);
+
+struct journal_entry_pin {
+	struct list_head	list;
+	journal_pin_flush_fn	flush;
+	struct journal_entry_pin_list *pin_list;
+};
+
+/* corresponds to a btree node with a blacklisted bset: */
+struct blacklisted_node {
+	__le64			seq;
+	enum btree_id		btree_id;
+	struct bpos		pos;
+};
+
+struct journal_seq_blacklist {
+	struct list_head	list;
+	u64			seq;
+	bool			written;
+	struct journal_entry_pin pin;
+
+	struct blacklisted_node	*entries;
+	size_t			nr_entries;
+};
+
+struct journal_res {
+	bool			ref;
+	u8			idx;
+	u16			u64s;
+	u32			offset;
+	u64			seq;
+};
+
+union journal_res_state {
+	struct {
+		atomic64_t	counter;
+	};
+
+	struct {
+		u64		v;
+	};
+
+	struct {
+		u64		cur_entry_offset:20,
+				idx:1,
+				prev_buf_unwritten:1,
+				buf0_count:21,
+				buf1_count:21;
+	};
+};
+
+/* 4 mb, in bytes: */
+#define JOURNAL_ENTRY_SIZE_MAX		(4U << 20)
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)
+
+/*
+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
+ * either because something's waiting on the write to complete or because it's
+ * been dirty too long and the timer's expired.
+ */
+
+enum {
+	JOURNAL_REPLAY_DONE,
+	JOURNAL_STARTED,
+	JOURNAL_NEED_WRITE,
+};
+
+/* Embedded in struct bch_fs */
+struct journal {
+	/* Fastpath stuff up front: */
+
+	unsigned long	flags;
+
+	union journal_res_state reservations;
+	unsigned	cur_entry_u64s;
+	unsigned	prev_buf_sectors;
+	unsigned	cur_buf_sectors;
+	unsigned	entry_size_max; /* bytes */
+
+	/*
+	 * Two journal entries -- one is currently open for new entries, the
+	 * other is possibly being written out.
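+	 *
+	 * A sketch of the steady-state cycle (driven by the
+	 * journal_res_state bitfields above):
+	 *
+	 *	idx == 0: buf[0] open and taking reservations
+	 *	entry closed, write issued:  idx = 1, prev_buf_unwritten = 1
+	 *	write completes:             prev_buf_unwritten = 0
+	 *	...and the same again with the buffers swapped.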
+ */ + struct journal_buf buf[2]; + + spinlock_t lock; + + /* Used when waiting because the journal was full */ + wait_queue_head_t wait; + + struct closure io; + struct delayed_work write_work; + + /* Sequence number of most recent journal entry (last entry in @pin) */ + atomic64_t seq; + + /* last_seq from the most recent journal entry written */ + u64 last_seq_ondisk; + + /* + * FIFO of journal entries whose btree updates have not yet been + * written out. + * + * Each entry is a reference count. The position in the FIFO is the + * entry's sequence number relative to @seq. + * + * The journal entry itself holds a reference count, put when the + * journal entry is written out. Each btree node modified by the journal + * entry also holds a reference count, put when the btree node is + * written. + * + * When a reference count reaches zero, the journal entry is no longer + * needed. When all journal entries in the oldest journal bucket are no + * longer needed, the bucket can be discarded and reused. + */ + DECLARE_FIFO(struct journal_entry_pin_list, pin); + struct journal_entry_pin_list *cur_pin_list; + + /* + * Protects the pin lists - the fifo itself is still protected by + * j->lock though: + */ + spinlock_t pin_lock; + + struct mutex blacklist_lock; + struct list_head seq_blacklist; + + BKEY_PADDED(key); + struct dev_group devs; + + struct delayed_work reclaim_work; + unsigned long last_flushed; + + /* protects advancing ja->last_idx: */ + struct mutex reclaim_lock; + + /* + * ugh: need to get prio_buckets converted over to the eventual new + * transaction machinery + */ + __le64 prio_buckets[BCH_SB_MEMBERS_MAX]; + unsigned nr_prio_buckets; + + unsigned write_delay_ms; + unsigned reclaim_delay_ms; + + u64 res_get_blocked_start; + u64 need_write_time; + u64 write_start_time; + + struct time_stats *write_time; + struct time_stats *delay_time; + struct time_stats *blocked_time; + struct time_stats *flush_seq_time; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map res_map; +#endif +}; + +/* + * Embedded in struct bch_dev. First three fields refer to the array of journal + * buckets, in bch_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. + */ + u64 *bucket_seq; + + unsigned sectors_free; + + /* Journal bucket we're currently writing to */ + unsigned cur_idx; + + /* Last journal bucket that still contains an open journal entry */ + + /* + * j->lock and j->reclaim_lock must both be held to modify, j->lock + * sufficient to read: + */ + unsigned last_idx; + unsigned nr; + u64 *buckets; + + /* Bio for journal reads/writes to this device */ + struct bio *bio; + + /* for bch_journal_read_device */ + struct closure read; +}; + +#endif /* _BCACHE_JOURNAL_TYPES_H */ diff --git a/libbcachefs/keylist.c b/libbcachefs/keylist.c new file mode 100644 index 00000000..51dd7edc --- /dev/null +++ b/libbcachefs/keylist.c @@ -0,0 +1,55 @@ + +#include "bcachefs.h" +#include "keylist.h" + +int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, + size_t nr_inline_u64s, size_t new_u64s) +{ + size_t oldsize = bch_keylist_u64s(l); + size_t newsize = oldsize + new_u64s; + u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; + u64 *new_keys; + + newsize = roundup_pow_of_two(newsize); + + if (newsize <= nr_inline_u64s || + (old_buf && roundup_pow_of_two(oldsize) == newsize)) + return 0; + + new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); + if (!new_keys) + return -ENOMEM; + + if (!old_buf) + memcpy_u64s(new_keys, inline_u64s, oldsize); + + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; + + return 0; +} + +void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) +{ + struct bkey_i *where; + + for_each_keylist_key(l, where) + if (bkey_cmp(insert->k.p, where->k.p) < 0) + break; + + memmove_u64s_up((u64 *) where + insert->k.u64s, + where, + ((u64 *) l->top) - ((u64 *) where)); + + l->top_p += insert->k.u64s; + bkey_copy(where, insert); +} + +void bch2_keylist_pop_front(struct keylist *l) +{ + l->top_p -= bch2_keylist_front(l)->k.u64s; + + memmove_u64s_down(l->keys, + bkey_next(l->keys), + bch_keylist_u64s(l)); +} diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h new file mode 100644 index 00000000..66628058 --- /dev/null +++ b/libbcachefs/keylist.h @@ -0,0 +1,62 @@ +#ifndef _BCACHE_KEYLIST_H +#define _BCACHE_KEYLIST_H + +#include "keylist_types.h" + +int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); +void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); +void bch2_keylist_pop_front(struct keylist *); + +static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys, + size_t nr_inline_u64s) +{ + l->top_p = l->keys_p = inline_keys; +} + +static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) +{ + if (l->keys_p != inline_keys) + kfree(l->keys_p); + memset(l, 0, sizeof(*l)); +} + +static inline void bch2_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) +{ + bkey_copy(l->top, k); + bch2_keylist_push(l); +} + +static inline bool bch2_keylist_empty(struct keylist *l) +{ + return l->top == l->keys; +} + +static inline size_t bch_keylist_u64s(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch2_keylist_bytes(struct keylist *l) +{ + return bch_keylist_u64s(l) * sizeof(u64); +} + +static inline struct bkey_i *bch2_keylist_front(struct keylist *l) +{ + return l->keys; +} + +#define for_each_keylist_key(_keylist, _k) \ + for (_k = (_keylist)->keys; \ + _k != (_keylist)->top; \ + _k = bkey_next(_k)) + +#define keylist_single(k) \ + ((struct keylist) { .keys = k, .top = bkey_next(k) }) + +#endif /* _BCACHE_KEYLIST_H */ diff --git a/libbcachefs/keylist_types.h b/libbcachefs/keylist_types.h new file mode 100644 index 00000000..195785bf --- /dev/null +++ b/libbcachefs/keylist_types.h @@ -0,0 +1,15 @@ +#ifndef _BCACHE_KEYLIST_TYPES_H +#define _BCACHE_KEYLIST_TYPES_H + +struct keylist { + union { + struct bkey_i *keys; + u64 *keys_p; + }; + union { + struct bkey_i *top; + u64 *top_p; + }; +}; + +#endif /* _BCACHE_KEYLIST_TYPES_H */ diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c new file mode 100644 index 00000000..f79b624d --- /dev/null +++ b/libbcachefs/migrate.c @@ -0,0 +1,395 @@ +/* + * Code for moving data off a device. 
+ */
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "extents.h"
+#include "io.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+#include "super-io.h"
+
+static int issue_migration_move(struct bch_dev *ca,
+				struct moving_context *ctxt,
+				struct bkey_s_c k)
+{
+	struct bch_fs *c = ca->fs;
+	struct disk_reservation res;
+	const struct bch_extent_ptr *ptr;
+	int ret;
+
+	if (bch2_disk_reservation_get(c, &res, k.k->size, 0))
+		return -ENOSPC;
+
+	extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
+		if (ptr->dev == ca->dev_idx)
+			goto found;
+
+	BUG();
+found:
+	/* XXX: we need to be doing something with the disk reservation */
+
+	ret = bch2_data_move(c, ctxt, &c->migration_write_point, k, ptr);
+	if (ret)
+		bch2_disk_reservation_put(c, &res);
+	return ret;
+}
+
+#define MAX_DATA_OFF_ITER	10
+
+/*
+ * This moves only the data off, leaving the meta-data (if any) in place.
+ * It walks the key space, and for any key with a valid pointer to the
+ * relevant device, it copies it elsewhere, updating the key to point to
+ * the copy.
+ * The meta-data is moved off by bch_move_meta_data_off_device.
+ *
+ * Note: If the number of data replicas desired is > 1, ideally, any
+ * new copies would not be made on a device that already has a
+ * copy (if there are enough devices).
+ * This is _not_ currently implemented. The multiple replicas can
+ * land in the same device even if there are others available.
+ */
+
+int bch2_move_data_off_device(struct bch_dev *ca)
+{
+	struct moving_context ctxt;
+	struct bch_fs *c = ca->fs;
+	struct bch_sb_field_members *mi;
+	unsigned pass = 0;
+	u64 seen_key_count;
+	int ret = 0;
+
+	BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
+
+	if (!ca->mi.has_data)
+		return 0;
+
+	bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
+	ctxt.avoid = ca;
+
+	/*
+	 * In theory, only one pass should be necessary as we've
+	 * quiesced all writes before calling this.
+	 *
+	 * However, in practice, more than one pass may be necessary:
+	 * - Some move fails due to an error. We can find this out
+	 *   from the moving_context.
+	 * - Some key swap failed because some of the pointers in the
+	 *   key in the tree changed due to caching behavior, btree gc
+	 *   pruning stale pointers, or tiering (if the device being
+	 *   removed is in tier 0). A smarter bkey_cmpxchg would
+	 *   handle these cases.
+	 *
+	 * Thus this scans the tree one more time than strictly necessary,
+	 * but that can be viewed as a verification pass.
+ */ + + do { + struct btree_iter iter; + struct bkey_s_c k; + + seen_key_count = 0; + atomic_set(&ctxt.error_count, 0); + atomic_set(&ctxt.error_flags, 0); + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + while (!bch2_move_ctxt_wait(&ctxt) && + (k = bch2_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { + if (!bkey_extent_is_data(k.k) || + !bch2_extent_has_device(bkey_s_c_to_extent(k), + ca->dev_idx)) + goto next; + + ret = issue_migration_move(ca, &ctxt, k); + if (ret == -ENOMEM) { + bch2_btree_iter_unlock(&iter); + + /* + * memory allocation failure, wait for some IO + * to finish + */ + bch2_move_ctxt_wait_for_io(&ctxt); + continue; + } + if (ret == -ENOSPC) + break; + BUG_ON(ret); + + seen_key_count++; +next: + bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_cond_resched(&iter); + + } + bch2_btree_iter_unlock(&iter); + bch2_move_ctxt_exit(&ctxt); + + if (ret) + return ret; + } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER); + + if (seen_key_count) { + pr_err("Unable to migrate all data in %d iterations.", + MAX_DATA_OFF_ITER); + return -1; + } + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* + * This walks the btree, and for any node on the relevant device it moves the + * node elsewhere. + */ +static int bch2_move_btree_off(struct bch_dev *ca, enum btree_id id) +{ + struct bch_fs *c = ca->fs; + struct btree_iter iter; + struct closure cl; + struct btree *b; + int ret; + + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); + + closure_init_stack(&cl); + + for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); +retry: + if (!bch2_extent_has_device(e, ca->dev_idx)) + continue; + + ret = bch2_btree_node_rewrite(&iter, b, &cl); + if (ret == -EINTR || ret == -ENOSPC) { + /* + * Drop locks to upgrade locks or wait on + * reserve: after retaking, recheck in case we + * raced. + */ + bch2_btree_iter_unlock(&iter); + closure_sync(&cl); + b = bch2_btree_iter_peek_node(&iter); + goto retry; + } + if (ret) { + bch2_btree_iter_unlock(&iter); + return ret; + } + + bch2_btree_iter_set_locks_want(&iter, 0); + } + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; /* btree IO error */ + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); + + BUG_ON(bch2_extent_has_device(e, ca->dev_idx)); + } + bch2_btree_iter_unlock(&iter); + } + + return 0; +} + +/* + * This moves only the meta-data off, leaving the data (if any) in place. + * The data is moved off by bch_move_data_off_device, if desired, and + * called first. + * + * Before calling this, allocation of buckets to the device must have + * been disabled, as else we'll continue to write meta-data to the device + * when new buckets are picked for meta-data writes. + * In addition, the copying gc and allocator threads for the device + * must have been stopped. The allocator thread is the only thread + * that writes prio/gen information. + * + * Meta-data consists of: + * - Btree nodes + * - Prio/gen information + * - Journal entries + * - Superblock + * + * This has to move the btree nodes and the journal only: + * - prio/gen information is not written once the allocator thread is stopped. + * also, as the prio/gen information is per-device it is not moved. 
+ * - the superblock will be written by the caller once after everything + * is stopped. + * + * Note that currently there is no way to stop btree node and journal + * meta-data writes to a device without moving the meta-data because + * once a bucket is open for a btree node, unless a replacement btree + * node is allocated (and the tree updated), the bucket will continue + * to be written with updates. Similarly for the journal (it gets + * written until filled). + * + * This routine leaves the data (if any) in place. Whether the data + * should be moved off is a decision independent of whether the meta + * data should be moved off and stopped: + * + * - For device removal, both data and meta-data are moved off, in + * that order. + * + * - However, for turning a device read-only without removing it, only + * meta-data is moved off since that's the only way to prevent it + * from being written. Data is left in the device, but no new data + * is written. + */ + +int bch2_move_metadata_off_device(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_sb_field_members *mi; + unsigned i; + int ret; + + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); + + if (!ca->mi.has_metadata) + return 0; + + /* 1st, Move the btree nodes off the device */ + + for (i = 0; i < BTREE_ID_NR; i++) { + ret = bch2_move_btree_off(ca, i); + if (ret) + return ret; + } + + /* There are no prios/gens to move -- they are already in the device. */ + + /* 2nd. Move the journal off the device */ + + ret = bch2_journal_move(ca); + if (ret) + return ret; + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* + * Flagging data bad when forcibly removing a device after failing to + * migrate the data off the device. + */ + +static int bch2_flag_key_bad(struct btree_iter *iter, + struct bch_dev *ca, + struct bkey_s_c_extent orig) +{ + BKEY_PADDED(key) tmp; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct bch_fs *c = ca->fs; + + bkey_reassemble(&tmp.key, orig.s_c); + e = bkey_i_to_s_extent(&tmp.key); + + extent_for_each_ptr_backwards(e, ptr) + if (ptr->dev == ca->dev_idx) + bch2_extent_drop_ptr(e, ptr); + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_ERROR key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, e.s); + + return bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(iter, &tmp.key)); +} + +/* + * This doesn't actually move any data -- it marks the keys as bad + * if they contain a pointer to a device that is forcibly removed + * and don't have other valid pointers. If there are valid pointers, + * the necessary pointers to the removed device are replaced with + * bad pointers instead. + * + * This is only called if bch_move_data_off_device above failed, meaning + * that we've already tried to move the data MAX_DATA_OFF_ITER times and + * are not likely to succeed if we try again. 
+ */ +int bch2_flag_data_bad(struct bch_dev *ca) +{ + int ret = 0; + struct bkey_s_c k; + struct bkey_s_c_extent e; + struct btree_iter iter; + + bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS, POS_MIN); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { + if (!bkey_extent_is_data(k.k)) + goto advance; + + e = bkey_s_c_to_extent(k); + if (!bch2_extent_has_device(e, ca->dev_idx)) + goto advance; + + ret = bch2_flag_key_bad(&iter, ca, e); + + /* + * don't want to leave ret == -EINTR, since if we raced and + * something else overwrote the key we could spuriously return + * -EINTR below: + */ + if (ret == -EINTR) + ret = 0; + if (ret) + break; + + /* + * If the replica we're dropping was dirty and there is an + * additional cached replica, the cached replica will now be + * considered dirty - upon inserting the new version of the key, + * the bucket accounting will be updated to reflect the fact + * that the cached data is now dirty and everything works out as + * if by magic without us having to do anything. + * + * The one thing we need to be concerned with here is there's a + * race between when we drop any stale pointers from the key + * we're about to insert, and when the key actually gets + * inserted and the cached data is marked as dirty - we could + * end up trying to insert a key with a pointer that should be + * dirty, but points to stale data. + * + * If that happens the insert code just bails out and doesn't do + * the insert - however, it doesn't return an error. Hence we + * need to always recheck the current key before advancing to + * the next: + */ + continue; +advance: + bch2_btree_iter_advance_pos(&iter); + } + + bch2_btree_iter_unlock(&iter); + + return ret; +} diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h new file mode 100644 index 00000000..81776bdc --- /dev/null +++ b/libbcachefs/migrate.h @@ -0,0 +1,8 @@ +#ifndef _BCACHE_MIGRATE_H +#define _BCACHE_MIGRATE_H + +int bch2_move_data_off_device(struct bch_dev *); +int bch2_move_metadata_off_device(struct bch_dev *); +int bch2_flag_data_bad(struct bch_dev *); + +#endif /* _BCACHE_MIGRATE_H */ diff --git a/libbcachefs/move.c b/libbcachefs/move.c new file mode 100644 index 00000000..f718f42a --- /dev/null +++ b/libbcachefs/move.c @@ -0,0 +1,392 @@ + +#include "bcachefs.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "io.h" +#include "move.h" +#include "super-io.h" +#include "keylist.h" + +#include <linux/ioprio.h> + +#include <trace/events/bcachefs.h> + +static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c, + struct bkey_s_extent e, + struct bch_extent_ptr ptr) +{ + struct bch_extent_ptr *ptr2; + unsigned bucket_bits = c->devs[ptr.dev]->bucket_bits; + + extent_for_each_ptr(e, ptr2) + if (ptr2->dev == ptr.dev && + ptr2->gen == ptr.gen && + (ptr2->offset >> bucket_bits) == + (ptr.offset >> bucket_bits)) + return ptr2; + + return NULL; +} + +static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m, + struct bkey_s_extent e) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_ptr *ret; + + if (m->move) + ret = bkey_find_ptr(m->op.c, e, m->move_ptr); + else + extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr) + if ((ret = bkey_find_ptr(m->op.c, e, *ptr))) + break; + + return ret; +} + +static int bch2_migrate_index_update(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct migrate_write *m = + container_of(op, struct migrate_write, op); + struct keylist *keys = &op->insert_keys; + struct 
btree_iter iter; + int ret = 0; + + bch2_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k)); + + while (1) { + struct bkey_s_extent insert = + bkey_i_to_s_extent(bch2_keylist_front(keys)); + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + struct bch_extent_ptr *ptr; + struct bkey_s_extent e; + BKEY_PADDED(k) new; + + if (!k.k) { + ret = bch2_btree_iter_unlock(&iter); + break; + } + + if (!bkey_extent_is_data(k.k)) + goto nomatch; + + bkey_reassemble(&new.k, k); + bch2_cut_front(iter.pos, &new.k); + bch2_cut_back(insert.k->p, &new.k.k); + e = bkey_i_to_s_extent(&new.k); + + /* hack - promotes can race: */ + if (m->promote) + extent_for_each_ptr(insert, ptr) + if (bch2_extent_has_device(e.c, ptr->dev)) + goto nomatch; + + ptr = bch2_migrate_matching_ptr(m, e); + if (ptr) { + int nr_new_dirty = bch2_extent_nr_dirty_ptrs(insert.s_c); + unsigned insert_flags = + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL; + + /* copygc uses btree node reserve: */ + if (m->move) + insert_flags |= BTREE_INSERT_USE_RESERVE; + + if (m->move) { + nr_new_dirty -= !ptr->cached; + __bch2_extent_drop_ptr(e, ptr); + } + + BUG_ON(nr_new_dirty < 0); + + memcpy_u64s(extent_entry_last(e), + insert.v, + bkey_val_u64s(insert.k)); + e.k->u64s += bkey_val_u64s(insert.k); + + bch2_extent_narrow_crcs(e); + bch2_extent_drop_redundant_crcs(e); + bch2_extent_normalize(c, e.s); + bch2_extent_mark_replicas_cached(c, e, nr_new_dirty); + + ret = bch2_btree_insert_at(c, &op->res, + NULL, op_journal_seq(op), + insert_flags, + BTREE_INSERT_ENTRY(&iter, &new.k)); + if (ret && ret != -EINTR) + break; + } else { +nomatch: + bch2_btree_iter_advance_pos(&iter); + } + + while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + bch2_keylist_pop_front(keys); + if (bch2_keylist_empty(keys)) + goto out; + } + + bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + } +out: + bch2_btree_iter_unlock(&iter); + return ret; +} + +void bch2_migrate_write_init(struct bch_fs *c, + struct migrate_write *m, + struct write_point *wp, + struct bkey_s_c k, + const struct bch_extent_ptr *move_ptr, + unsigned flags) +{ + bkey_reassemble(&m->key, k); + + m->promote = false; + m->move = move_ptr != NULL; + if (move_ptr) + m->move_ptr = *move_ptr; + + if (bkey_extent_is_cached(k.k) || + (move_ptr && move_ptr->cached)) + flags |= BCH_WRITE_CACHED; + + bch2_write_op_init(&m->op, c, &m->wbio, + (struct disk_reservation) { 0 }, + wp, + bkey_start_pos(k.k), + NULL, flags); + + if (m->move) + m->op.alloc_reserve = RESERVE_MOVINGGC; + + m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k)); + m->op.nr_replicas = 1; + m->op.index_update_fn = bch2_migrate_index_update; +} + +static void migrate_bio_init(struct moving_io *io, struct bio *bio, + unsigned sectors) +{ + bio_init(bio); + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_iter.bi_size = sectors << 9; + bio->bi_max_vecs = DIV_ROUND_UP(sectors, PAGE_SECTORS); + bio->bi_private = &io->cl; + bio->bi_io_vec = io->bi_inline_vecs; + bch2_bio_map(bio, NULL); +} + +static void moving_io_destructor(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->ctxt; + struct bio_vec *bv; + int i; + + //if (io->replace.failures) + // trace_copy_collision(q, &io->key.k); + + atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); + wake_up(&ctxt->wait); + + bio_for_each_segment_all(bv, &io->write.wbio.bio, i) + if (bv->bv_page) + __free_page(bv->bv_page); + + kfree(io); 
+} + +static void moving_error(struct moving_context *ctxt, unsigned flag) +{ + atomic_inc(&ctxt->error_count); + //atomic_or(flag, &ctxt->error_flags); +} + +static void moving_io_after_write(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->ctxt; + + if (io->write.op.error) + moving_error(ctxt, MOVING_FLAG_WRITE); + + moving_io_destructor(cl); +} + +static void write_moving(struct moving_io *io) +{ + struct bch_write_op *op = &io->write.op; + + if (op->error) { + closure_return_with_destructor(&io->cl, moving_io_destructor); + } else { + closure_call(&op->cl, bch2_write, NULL, &io->cl); + closure_return_with_destructor(&io->cl, moving_io_after_write); + } +} + +static inline struct moving_io *next_pending_write(struct moving_context *ctxt) +{ + struct moving_io *io = + list_first_entry_or_null(&ctxt->reads, struct moving_io, list); + + return io && io->read_completed ? io : NULL; +} + +static void read_moving_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->ctxt; + + trace_move_read_done(&io->write.key.k); + + if (bio->bi_error) { + io->write.op.error = bio->bi_error; + moving_error(io->ctxt, MOVING_FLAG_READ); + } + + io->read_completed = true; + if (next_pending_write(ctxt)) + wake_up(&ctxt->wait); + + closure_put(&ctxt->cl); +} + +static void __bch2_data_move(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct bch_fs *c = io->write.op.c; + struct extent_pick_ptr pick; + + bch2_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key), + io->ctxt->avoid, &pick); + if (IS_ERR_OR_NULL(pick.ca)) + closure_return_with_destructor(cl, moving_io_destructor); + + bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k); + io->rbio.bio.bi_end_io = read_moving_endio; + + /* + * dropped by read_moving_endio() - guards against use after free of + * ctxt when doing wakeup + */ + closure_get(&io->ctxt->cl); + + bch2_read_extent(c, &io->rbio, + bkey_i_to_s_c(&io->write.key), + &pick, BCH_READ_IS_LAST); +} + +int bch2_data_move(struct bch_fs *c, + struct moving_context *ctxt, + struct write_point *wp, + struct bkey_s_c k, + const struct bch_extent_ptr *move_ptr) +{ + struct moving_io *io; + + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(k.k->size, PAGE_SECTORS), + GFP_KERNEL); + if (!io) + return -ENOMEM; + + io->ctxt = ctxt; + + migrate_bio_init(io, &io->rbio.bio, k.k->size); + + if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { + kfree(io); + return -ENOMEM; + } + + migrate_bio_init(io, &io->write.wbio.bio, k.k->size); + bio_get(&io->write.wbio.bio); + io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + + bch2_migrate_write_init(c, &io->write, wp, k, move_ptr, 0); + + trace_move_read(&io->write.key.k); + + ctxt->keys_moved++; + ctxt->sectors_moved += k.k->size; + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); + + atomic_add(k.k->size, &ctxt->sectors_in_flight); + list_add_tail(&io->list, &ctxt->reads); + + closure_call(&io->cl, __bch2_data_move, NULL, &ctxt->cl); + return 0; +} + +static void do_pending_writes(struct moving_context *ctxt) +{ + struct moving_io *io; + + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); + trace_move_write(&io->write.key.k); + write_moving(io); + } +} + +#define move_ctxt_wait_event(_ctxt, 
_cond) \ +do { \ + do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + next_pending_write(_ctxt) || (_cond)); \ +} while (1) + +int bch2_move_ctxt_wait(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->sectors_in_flight) < + ctxt->max_sectors_in_flight); + + return ctxt->rate + ? bch2_ratelimit_wait_freezable_stoppable(ctxt->rate) + : 0; +} + +void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +{ + unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight); + + move_ctxt_wait_event(ctxt, + !atomic_read(&ctxt->sectors_in_flight) || + atomic_read(&ctxt->sectors_in_flight) != sectors_pending); +} + +void bch2_move_ctxt_exit(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight)); + closure_sync(&ctxt->cl); + + EBUG_ON(!list_empty(&ctxt->reads)); + EBUG_ON(atomic_read(&ctxt->sectors_in_flight)); +} + +void bch2_move_ctxt_init(struct moving_context *ctxt, + struct bch_ratelimit *rate, + unsigned max_sectors_in_flight) +{ + memset(ctxt, 0, sizeof(*ctxt)); + closure_init_stack(&ctxt->cl); + + ctxt->rate = rate; + ctxt->max_sectors_in_flight = max_sectors_in_flight; + + INIT_LIST_HEAD(&ctxt->reads); + init_waitqueue_head(&ctxt->wait); +} diff --git a/libbcachefs/move.h b/libbcachefs/move.h new file mode 100644 index 00000000..548f0f0a --- /dev/null +++ b/libbcachefs/move.h @@ -0,0 +1,87 @@ +#ifndef _BCACHE_MOVE_H +#define _BCACHE_MOVE_H + +#include "buckets.h" +#include "io_types.h" +#include "move_types.h" + +enum moving_flag_bitnos { + MOVING_FLAG_BITNO_READ = 0, + MOVING_FLAG_BITNO_WRITE, +}; + +#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ) +#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE) + +struct migrate_write { + BKEY_PADDED(key); + bool promote; + bool move; + struct bch_extent_ptr move_ptr; + struct bch_write_op op; + struct bch_write_bio wbio; +}; + +void bch2_migrate_write_init(struct bch_fs *, + struct migrate_write *, + struct write_point *, + struct bkey_s_c, + const struct bch_extent_ptr *, + unsigned); + +#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 + +struct moving_context { + /* Closure for waiting on all reads and writes to complete */ + struct closure cl; + + /* Number and types of errors reported */ + atomic_t error_count; + atomic_t error_flags; + + /* Key and sector moves issued, updated from submission context */ + u64 keys_moved; + u64 sectors_moved; + + /* Rate-limiter counting submitted reads */ + struct bch_ratelimit *rate; + + /* Try to avoid reading the following device */ + struct bch_dev *avoid; + + struct list_head reads; + + /* Configuration */ + unsigned max_sectors_in_flight; + atomic_t sectors_in_flight; + + wait_queue_head_t wait; +}; + +struct moving_io { + struct list_head list; + struct rb_node node; + struct closure cl; + struct moving_context *ctxt; + struct migrate_write write; + bool read_completed; + + struct bch_read_bio rbio; + /* Must be last since it is variable size */ + struct bio_vec bi_inline_vecs[0]; +}; + +int bch2_data_move(struct bch_fs *, + struct moving_context *, + struct write_point *, + struct bkey_s_c, + const struct bch_extent_ptr *); + +int bch2_move_ctxt_wait(struct moving_context *); +void bch2_move_ctxt_wait_for_io(struct moving_context *); + +void bch2_move_ctxt_exit(struct moving_context *); +void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *, + unsigned); + +#endif /* _BCACHE_MOVE_H */ diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h new file 
mode 100644 index 00000000..0e2275e2 --- /dev/null +++ b/libbcachefs/move_types.h @@ -0,0 +1,4 @@ +#ifndef _BCACHE_MOVE_TYPES_H +#define _BCACHE_MOVE_TYPES_H + +#endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c new file mode 100644 index 00000000..8804dbb3 --- /dev/null +++ b/libbcachefs/movinggc.c @@ -0,0 +1,297 @@ +/* + * Moving/copying garbage collector + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "btree_iter.h" +#include "buckets.h" +#include "clock.h" +#include "extents.h" +#include "io.h" +#include "keylist.h" +#include "move.h" +#include "movinggc.h" + +#include <trace/events/bcachefs.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/wait.h> + +/* Moving GC - IO loop */ + +static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca, + struct bkey_s_c k) +{ + const struct bch_extent_ptr *ptr; + + if (bkey_extent_is_data(k.k) && + (ptr = bch2_extent_has_device(bkey_s_c_to_extent(k), + ca->dev_idx)) && + PTR_BUCKET(ca, ptr)->mark.copygc) + return ptr; + + return NULL; +} + +static int issue_moving_gc_move(struct bch_dev *ca, + struct moving_context *ctxt, + struct bkey_s_c k) +{ + struct bch_fs *c = ca->fs; + const struct bch_extent_ptr *ptr; + int ret; + + ptr = moving_pred(ca, k); + if (!ptr) /* We raced - bucket's been reused */ + return 0; + + ret = bch2_data_move(c, ctxt, &ca->copygc_write_point, k, ptr); + if (!ret) + trace_gc_copy(k.k); + else + trace_moving_gc_alloc_fail(c, k.k->size); + return ret; +} + +static void read_moving(struct bch_dev *ca, size_t buckets_to_move, + u64 sectors_to_move) +{ + struct bch_fs *c = ca->fs; + struct bucket *g; + struct moving_context ctxt; + struct btree_iter iter; + struct bkey_s_c k; + u64 sectors_not_moved = 0; + size_t buckets_not_moved = 0; + + bch2_ratelimit_reset(&ca->moving_gc_pd.rate); + bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate, + SECTORS_IN_FLIGHT_PER_DEVICE); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + while (1) { + if (kthread_should_stop()) + goto out; + if (bch2_move_ctxt_wait(&ctxt)) + goto out; + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + if (btree_iter_err(k)) + goto out; + + if (!moving_pred(ca, k)) + goto next; + + if (issue_moving_gc_move(ca, &ctxt, k)) { + bch2_btree_iter_unlock(&iter); + + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(&ctxt); + continue; + } +next: + bch2_btree_iter_advance_pos(&iter); + //bch2_btree_iter_cond_resched(&iter); + + /* unlock before calling moving_context_wait() */ + bch2_btree_iter_unlock(&iter); + cond_resched(); + } + + bch2_btree_iter_unlock(&iter); + bch2_move_ctxt_exit(&ctxt); + trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, + buckets_to_move); + + /* don't check this if we bailed out early: */ + for_each_bucket(g, ca) + if (g->mark.copygc && bucket_sectors_used(g)) { + sectors_not_moved += bucket_sectors_used(g); + buckets_not_moved++; + } + + if (sectors_not_moved) + bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved", + sectors_not_moved, sectors_to_move, + buckets_not_moved, buckets_to_move); + return; +out: + bch2_btree_iter_unlock(&iter); + bch2_move_ctxt_exit(&ctxt); + trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, + buckets_to_move); +} + +static bool have_copygc_reserve(struct bch_dev *ca) +{ + bool ret; + + spin_lock(&ca->freelist_lock); + ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >= + COPYGC_BUCKETS_PER_ITER(ca); + 
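+	/*
+	 * Note: this is only half the reserve - see the comment on
+	 * COPYGC_BUCKETS_PER_ITER in movinggc.h: buckets freed by copygc may
+	 * have to go back into the reserve before they become available for
+	 * reallocation.
+	 */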
spin_unlock(&ca->freelist_lock); + + return ret; +} + +static void bch2_moving_gc(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bucket *g; + struct bucket_mark new; + u64 sectors_to_move; + size_t buckets_to_move, buckets_unused = 0; + struct bucket_heap_entry e; + unsigned sectors_used, i; + int reserve_sectors; + + if (!have_copygc_reserve(ca)) { + struct closure cl; + + closure_init_stack(&cl); + while (1) { + closure_wait(&c->freelist_wait, &cl); + if (have_copygc_reserve(ca)) + break; + closure_sync(&cl); + } + closure_wake_up(&c->freelist_wait); + } + + reserve_sectors = COPYGC_SECTORS_PER_ITER(ca); + + trace_moving_gc_start(ca); + + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap sorted by sector count, + * and repeatedly replacing the maximum element until all + * buckets have been visited. + */ + + /* + * We need bucket marks to be up to date, so gc can't be recalculating + * them, and we don't want the allocator invalidating a bucket after + * we've decided to evacuate it but before we set copygc: + */ + down_read(&c->gc_lock); + mutex_lock(&ca->heap_lock); + mutex_lock(&ca->fs->bucket_lock); + + ca->heap.used = 0; + for_each_bucket(g, ca) { + bucket_cmpxchg(g, new, new.copygc = 0); + + if (bucket_unused(g)) { + buckets_unused++; + continue; + } + + if (g->mark.owned_by_allocator || + g->mark.data_type != BUCKET_DATA) + continue; + + sectors_used = bucket_sectors_used(g); + + if (sectors_used >= ca->mi.bucket_size) + continue; + + bucket_heap_push(ca, g, sectors_used); + } + + sectors_to_move = 0; + for (i = 0; i < ca->heap.used; i++) + sectors_to_move += ca->heap.data[i].val; + + while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { + BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp)); + sectors_to_move -= e.val; + } + + for (i = 0; i < ca->heap.used; i++) + bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1); + + buckets_to_move = ca->heap.used; + + mutex_unlock(&ca->fs->bucket_lock); + mutex_unlock(&ca->heap_lock); + up_read(&c->gc_lock); + + read_moving(ca, buckets_to_move, sectors_to_move); +} + +static int bch2_moving_gc_thread(void *arg) +{ + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; + struct io_clock *clock = &c->io_clock[WRITE]; + unsigned long last; + u64 available, want, next; + + set_freezable(); + + while (!kthread_should_stop()) { + if (kthread_wait_freezable(c->copy_gc_enabled)) + break; + + last = atomic_long_read(&clock->now); + /* + * don't start copygc until less than half the gc reserve is + * available: + */ + available = dev_buckets_available(ca); + want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * + c->opts.gc_reserve_percent, 200); + if (available > want) { + next = last + (available - want) * + ca->mi.bucket_size; + bch2_kthread_io_clock_wait(clock, next); + continue; + } + + bch2_moving_gc(ca); + } + + return 0; +} + +void bch2_moving_gc_stop(struct bch_dev *ca) +{ + ca->moving_gc_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&ca->moving_gc_pd.rate); + + if (ca->moving_gc_read) + kthread_stop(ca->moving_gc_read); + ca->moving_gc_read = NULL; +} + +int bch2_moving_gc_start(struct bch_dev *ca) +{ + struct task_struct *t; + + BUG_ON(ca->moving_gc_read); + + if (ca->fs->opts.nochanges) + return 0; + + if (bch2_fs_init_fault("moving_gc_start")) + return -ENOMEM; + + t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read"); + if (IS_ERR(t)) + return PTR_ERR(t); + + ca->moving_gc_read = t; + wake_up_process(ca->moving_gc_read); + + return 0; +} + +void 
bch2_dev_moving_gc_init(struct bch_dev *ca) +{ + bch2_pd_controller_init(&ca->moving_gc_pd); + ca->moving_gc_pd.d_term = 0; +} diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h new file mode 100644 index 00000000..e27ccc35 --- /dev/null +++ b/libbcachefs/movinggc.h @@ -0,0 +1,30 @@ +#ifndef _BCACHE_MOVINGGC_H +#define _BCACHE_MOVINGGC_H + +/* + * We can't use the entire copygc reserve in one iteration of copygc: we may + * need the buckets we're freeing up to go back into the copygc reserve to make + * forward progress, but if the copygc reserve is full they'll be available for + * any allocation - and it's possible that in a given iteration, we free up most + * of the buckets we're going to free before we allocate most of the buckets + * we're going to allocate. + * + * If we only use half of the reserve per iteration, then in steady state we'll + * always have room in the reserve for the buckets we're going to need in the + * next iteration: + */ +#define COPYGC_BUCKETS_PER_ITER(ca) \ + ((ca)->free[RESERVE_MOVINGGC].size / 2) + +/* + * Max sectors to move per iteration: Have to take into account internal + * fragmentation from the multiple write points for each generation: + */ +#define COPYGC_SECTORS_PER_ITER(ca) \ + ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) + +void bch2_moving_gc_stop(struct bch_dev *); +int bch2_moving_gc_start(struct bch_dev *); +void bch2_dev_moving_gc_init(struct bch_dev *); + +#endif diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c new file mode 100644 index 00000000..7c4cf804 --- /dev/null +++ b/libbcachefs/opts.c @@ -0,0 +1,241 @@ + +#include <linux/kernel.h> + +#include "opts.h" +#include "util.h" + +const char * const bch2_error_actions[] = { + "continue", + "remount-ro", + "panic", + NULL +}; + +const char * const bch2_csum_types[] = { + "none", + "crc32c", + "crc64", + NULL +}; + +const char * const bch2_compression_types[] = { + "none", + "lz4", + "gzip", + NULL +}; + +const char * const bch2_str_hash_types[] = { + "crc32c", + "crc64", + "siphash", + NULL +}; + +const char * const bch2_cache_replacement_policies[] = { + "lru", + "fifo", + "random", + NULL +}; + +/* Default is -1; we skip past it for struct cached_dev's cache mode */ +const char * const bch2_cache_modes[] = { + "default", + "writethrough", + "writeback", + "writearound", + "none", + NULL +}; + +const char * const bch2_dev_state[] = { + "readwrite", + "readonly", + "failed", + "spare", + NULL +}; + +const struct bch_option bch2_opt_table[] = { +#define OPT_BOOL() .type = BCH_OPT_BOOL +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices + +#define BCH_OPT(_name, _mode, _sb_opt, _bits, _type) \ + [Opt_##_name] = { \ + .name = #_name, \ + .set_sb = SET_##_sb_opt, \ + _type \ + }, + BCH_VISIBLE_OPTS() +#undef BCH_OPT +}; + +static enum bch_opt_id bch2_opt_lookup(const char *name) +{ + const struct bch_option *i; + + for (i = bch2_opt_table; + i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); + i++) + if (!strcmp(name, i->name)) + return i - bch2_opt_table; + + return -1; +} + +static u64 bch2_opt_get(struct bch_opts *opts, enum bch_opt_id id) +{ + switch (id) { +#define BCH_OPT(_name, ...) \ + case Opt_##_name: \ + return opts->_name; \ + + BCH_VISIBLE_OPTS() +#undef BCH_OPT + + default: + BUG(); + } +} + +void bch2_opt_set(struct bch_opts *opts, enum bch_opt_id id, u64 v) +{ + switch (id) { +#define BCH_OPT(_name, ...) 
\
+	case Opt_##_name:						\
+		opts->_name = v;					\
+		break;
+
+	BCH_VISIBLE_OPTS()
+#undef BCH_OPT
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+struct bch_opts bch2_sb_opts(struct bch_sb *sb)
+{
+	struct bch_opts opts = bch2_opts_empty();
+
+#define BCH_OPT(_name, _mode, _sb_opt, ...)				\
+	if (_sb_opt != NO_SB_OPT)					\
+		opts._name = _sb_opt(sb);
+
+	BCH_OPTS()
+#undef BCH_OPT
+
+	return opts;
+}
+
+static int parse_one_opt(enum bch_opt_id id, const char *val, u64 *res)
+{
+	const struct bch_option *opt = &bch2_opt_table[id];
+	ssize_t ret;
+
+	switch (opt->type) {
+	case BCH_OPT_BOOL:
+		ret = kstrtou64(val, 10, res);
+		if (ret < 0)
+			return ret;
+
+		if (*res > 1)
+			return -ERANGE;
+		break;
+	case BCH_OPT_UINT:
+		ret = kstrtou64(val, 10, res);
+		if (ret < 0)
+			return ret;
+
+		if (*res < opt->min || *res >= opt->max)
+			return -ERANGE;
+		break;
+	case BCH_OPT_STR:
+		ret = bch2_read_string_list(val, opt->choices);
+		if (ret < 0)
+			return ret;
+
+		*res = ret;
+		break;
+	}
+
+	return 0;
+}
+
+int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
+{
+	char *opt, *name, *val;
+	int ret, id;
+	u64 v;
+
+	while ((opt = strsep(&options, ",")) != NULL) {
+		name	= strsep(&opt, "=");
+		val	= opt;
+
+		if (val) {
+			id = bch2_opt_lookup(name);
+			if (id < 0)
+				return -EINVAL;
+
+			ret = parse_one_opt(id, val, &v);
+			if (ret < 0)
+				return ret;
+		} else {
+			id = bch2_opt_lookup(name);
+			v = 1;
+
+			if (id < 0 &&
+			    !strncmp("no", name, 2)) {
+				id = bch2_opt_lookup(name + 2);
+				v = 0;
+			}
+
+			/* unknown option: don't index bch2_opt_table with id < 0 */
+			if (id < 0)
+				return -EINVAL;
+
+			if (bch2_opt_table[id].type != BCH_OPT_BOOL)
+				return -EINVAL;
+		}
+
+		bch2_opt_set(opts, id, v);
+	}
+
+	return 0;
+}
+
+enum bch_opt_id bch2_parse_sysfs_opt(const char *name, const char *val,
+				     u64 *res)
+{
+	enum bch_opt_id id = bch2_opt_lookup(name);
+	int ret;
+
+	if (id < 0)
+		return -EINVAL;
+
+	ret = parse_one_opt(id, val, res);
+	if (ret < 0)
+		return ret;
+
+	return id;
+}
+
+ssize_t bch2_opt_show(struct bch_opts *opts, const char *name,
+		      char *buf, size_t size)
+{
+	enum bch_opt_id id = bch2_opt_lookup(name);
+	const struct bch_option *opt;
+	u64 v;
+
+	if (id < 0)
+		return -EINVAL;
+
+	v = bch2_opt_get(opts, id);
+	opt = &bch2_opt_table[id];
+
+	return opt->type == BCH_OPT_STR
+		? bch2_snprint_string_list(buf, size, opt->choices, v)
+		: snprintf(buf, size, "%lli\n", v);
+}
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
new file mode 100644
index 00000000..6fa707db
--- /dev/null
+++ b/libbcachefs/opts.h
@@ -0,0 +1,168 @@
+#ifndef _BCACHE_OPTS_H
+#define _BCACHE_OPTS_H
+
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+#include "bcachefs_format.h"
+
+extern const char * const bch2_error_actions[];
+extern const char * const bch2_csum_types[];
+extern const char * const bch2_compression_types[];
+extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_cache_replacement_policies[];
+extern const char * const bch2_cache_modes[];
+extern const char * const bch2_dev_state[];
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * We store options as signed integers, where -1 means undefined.
This means we + * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only + * apply the options from that struct that are defined. + */ + +/* dummy option, for options that aren't stored in the superblock */ +LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); + +/** + * BCH_OPT(name, mode, sb_opt, type, ...) + * + * @name - name of mount option, sysfs attribute, and struct bch_opts + * member + * + * @mode - sysfs attr permissions + * + * @sb_option - name of corresponding superblock option + * + * @type - one of OPT_BOOL, OPT_UINT, OPT_STR + */ + +enum opt_type { + BCH_OPT_BOOL, + BCH_OPT_UINT, + BCH_OPT_STR, +}; + +#define BCH_VISIBLE_OPTS() \ + BCH_OPT(errors, 0644, BCH_SB_ERROR_ACTION, \ + s8, OPT_STR(bch2_error_actions)) \ + BCH_OPT(metadata_replicas, 0444, BCH_SB_META_REPLICAS_WANT,\ + s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ + BCH_OPT(data_replicas, 0444, BCH_SB_DATA_REPLICAS_WANT,\ + s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ + BCH_OPT(metadata_replicas_required, 0444, BCH_SB_META_REPLICAS_REQ,\ + s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ + BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\ + s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ + BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE, \ + s8, OPT_STR(bch2_csum_types)) \ + BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE, \ + s8, OPT_STR(bch2_csum_types)) \ + BCH_OPT(compression, 0644, BCH_SB_COMPRESSION_TYPE,\ + s8, OPT_STR(bch2_compression_types)) \ + BCH_OPT(str_hash, 0644, BCH_SB_STR_HASH_TYPE, \ + s8, OPT_STR(bch2_str_hash_types)) \ + BCH_OPT(inodes_32bit, 0644, BCH_SB_INODE_32BIT, \ + s8, OPT_BOOL()) \ + BCH_OPT(gc_reserve_percent, 0444, BCH_SB_GC_RESERVE, \ + s8, OPT_UINT(5, 21)) \ + BCH_OPT(root_reserve_percent, 0444, BCH_SB_ROOT_RESERVE, \ + s8, OPT_UINT(0, 100)) \ + BCH_OPT(wide_macs, 0644, BCH_SB_128_BIT_MACS, \ + s8, OPT_BOOL()) \ + BCH_OPT(verbose_recovery, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(posix_acl, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(journal_flush_disabled, 0644, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(nofsck, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(fix_errors, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(nochanges, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(noreplay, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(norecovery, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(noexcl, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(sb, 0444, NO_SB_OPT, \ + s64, OPT_UINT(0, S64_MAX)) \ + +#define BCH_OPTS() \ + BCH_OPT(read_only, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(nostart, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_VISIBLE_OPTS() + +struct bch_opts { +#define BCH_OPT(_name, _mode, _sb_opt, _bits, ...) \ + _bits _name; + + BCH_OPTS() +#undef BCH_OPT +}; + +enum bch_opt_id { +#define BCH_OPT(_name, ...) \ + Opt_##_name, + + BCH_VISIBLE_OPTS() +#undef BCH_OPT +}; + +struct bch_option { + const char *name; + void (*set_sb)(struct bch_sb *, u64); + enum opt_type type; + + union { + struct { + u64 min, max; + }; + struct { + const char * const *choices; + }; + }; + +}; + +extern const struct bch_option bch2_opt_table[]; + +static inline struct bch_opts bch2_opts_empty(void) +{ + struct bch_opts ret; + + memset(&ret, 255, sizeof(ret)); + return ret; +} + +static inline void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) +{ +#define BCH_OPT(_name, ...) 
\ + if (src._name >= 0) \ + dst->_name = src._name; + + BCH_OPTS() +#undef BCH_OPT +} + +#define opt_defined(_opt) ((_opt) >= 0) + +void bch2_opt_set(struct bch_opts *, enum bch_opt_id, u64); +struct bch_opts bch2_sb_opts(struct bch_sb *); + +int bch2_parse_mount_opts(struct bch_opts *, char *); +enum bch_opt_id bch2_parse_sysfs_opt(const char *, const char *, u64 *); + +ssize_t bch2_opt_show(struct bch_opts *, const char *, char *, size_t); + +#endif /* _BCACHE_OPTS_H */ diff --git a/libbcachefs/siphash.c b/libbcachefs/siphash.c new file mode 100644 index 00000000..3a6c9c82 --- /dev/null +++ b/libbcachefs/siphash.c @@ -0,0 +1,172 @@ +/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ + +/*- + * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d + * are the number of compression rounds and the number of finalization rounds. + * A compression round is identical to a finalization round and this round + * function is called SipRound. Given a 128-bit key k and a (possibly empty) + * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). + * + * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, + * by Jean-Philippe Aumasson and Daniel J. 
Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+	while (rounds--) {
+		ctx->v[0] += ctx->v[1];
+		ctx->v[2] += ctx->v[3];
+		ctx->v[1] = rol64(ctx->v[1], 13);
+		ctx->v[3] = rol64(ctx->v[3], 16);
+
+		ctx->v[1] ^= ctx->v[0];
+		ctx->v[3] ^= ctx->v[2];
+		ctx->v[0] = rol64(ctx->v[0], 32);
+
+		ctx->v[2] += ctx->v[1];
+		ctx->v[0] += ctx->v[3];
+		ctx->v[1] = rol64(ctx->v[1], 17);
+		ctx->v[3] = rol64(ctx->v[3], 21);
+
+		ctx->v[1] ^= ctx->v[2];
+		ctx->v[3] ^= ctx->v[0];
+		ctx->v[2] = rol64(ctx->v[2], 32);
+	}
+}
+
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+	u64 m = get_unaligned_le64(ptr);
+
+	ctx->v[3] ^= m;
+	SipHash_Rounds(ctx, rounds);
+	ctx->v[0] ^= m;
+}
+
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+	u64 k0, k1;
+
+	k0 = le64_to_cpu(key->k0);
+	k1 = le64_to_cpu(key->k1);
+
+	ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+	ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+	ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+	ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+	memset(ctx->buf, 0, sizeof(ctx->buf));
+	ctx->bytes = 0;
+}
+
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+		    const void *src, size_t len)
+{
+	const u8 *ptr = src;
+	size_t left, used;
+
+	if (len == 0)
+		return;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	ctx->bytes += len;
+
+	if (used > 0) {
+		left = sizeof(ctx->buf) - used;
+
+		if (len >= left) {
+			memcpy(&ctx->buf[used], ptr, left);
+			SipHash_CRounds(ctx, ctx->buf, rc);
+			len -= left;
+			ptr += left;
+		} else {
+			memcpy(&ctx->buf[used], ptr, len);
+			return;
+		}
+	}
+
+	while (len >= sizeof(ctx->buf)) {
+		SipHash_CRounds(ctx, ptr, rc);
+		len -= sizeof(ctx->buf);
+		ptr += sizeof(ctx->buf);
+	}
+
+	/*
+	 * Remaining bytes always belong at the start of the buffer here:
+	 * either used was 0, or the partial block was flushed above.
+	 * (Copying to &ctx->buf[used] would misplace the data and could
+	 * overrun the 8 byte buffer.)
+	 */
+	if (len > 0)
+		memcpy(ctx->buf, ptr, len);
+}
+
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+	u64 r;
+
+	r = SipHash_End(ctx, rc, rf);
+
+	*((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+	u64 r;
+	size_t left, used;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	left = sizeof(ctx->buf) - used;
+	memset(&ctx->buf[used], 0, left - 1);
+	ctx->buf[7] = ctx->bytes;
+
+	SipHash_CRounds(ctx, ctx->buf, rc);
+	ctx->v[2] ^= 0xff;
+	SipHash_Rounds(ctx, rf);
+
+	r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+	memset(ctx, 0, sizeof(*ctx));
+	return (r);
+}
+
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+	SIPHASH_CTX ctx;
+
+	SipHash_Init(&ctx, key);
+	SipHash_Update(&ctx, rc, rf, src, len);
+	return SipHash_End(&ctx, rc, rf);
}
diff --git a/libbcachefs/siphash.h b/libbcachefs/siphash.h
new file mode 100644
index 00000000..7a4b2241
--- /dev/null
+++ b/libbcachefs/siphash.h
@@ -0,0 +1,86 @@
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is chosen at initialization:
+ *  SipHash24_Init() for the fast and reasonably strong version
+ *  SipHash48_Init() for the stronger version (half as fast)
+ *
+ * Usage (matching the declarations below):
+ *
+ *	SIPHASH_CTX ctx;
+ *	SipHash24_Init(&ctx, &key);
+ *	SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ *	SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH	 8
+#define SIPHASH_KEY_LENGTH	16
+#define SIPHASH_DIGEST_LENGTH	 8
+
+typedef struct _SIPHASH_CTX {
+	u64		v[4];
+	u8		buf[SIPHASH_BLOCK_LENGTH];
+	u32		bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+	__le64		k0;
+	__le64		k1;
+} SIPHASH_KEY;
+
+void	SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void	SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64	SipHash_End(SIPHASH_CTX *, int, int);
+void	SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64	SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l)	SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d)		SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c)		SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l)		SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l)	SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d)		SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c)		SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l)		SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
new file mode 100644
index 00000000..1bb8bfcc
--- /dev/null
+++ b/libbcachefs/six.c
@@ -0,0 +1,396 @@
+
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include "six.h"
+
+#define six_acquire(l, t)	lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
+#define six_release(l)		lock_release(l, 0, _RET_IP_)
+
+#define __SIX_LOCK_HELD_read	__SIX_VAL(read_lock, ~0)
+#define __SIX_LOCK_HELD_intent	__SIX_VAL(intent_lock, ~0)
+#define __SIX_LOCK_HELD_write	__SIX_VAL(seq, 1)
+
+struct
six_lock_vals { + /* Value we add to the lock in order to take the lock: */ + u64 lock_val; + + /* If the lock has this value (used as a mask), taking the lock fails: */ + u64 lock_fail; + + /* Value we add to the lock in order to release the lock: */ + u64 unlock_val; + + /* Mask that indicates lock is held for this type: */ + u64 held_mask; + + /* Waitlist we wakeup when releasing the lock: */ + enum six_lock_type unlock_wakeup; +}; + +#define LOCK_VALS { \ + [SIX_LOCK_read] = { \ + .lock_val = __SIX_VAL(read_lock, 1), \ + .lock_fail = __SIX_LOCK_HELD_write, \ + .unlock_val = -__SIX_VAL(read_lock, 1), \ + .held_mask = __SIX_LOCK_HELD_read, \ + .unlock_wakeup = SIX_LOCK_write, \ + }, \ + [SIX_LOCK_intent] = { \ + .lock_val = __SIX_VAL(intent_lock, 1), \ + .lock_fail = __SIX_LOCK_HELD_intent, \ + .unlock_val = -__SIX_VAL(intent_lock, 1), \ + .held_mask = __SIX_LOCK_HELD_intent, \ + .unlock_wakeup = SIX_LOCK_intent, \ + }, \ + [SIX_LOCK_write] = { \ + .lock_val = __SIX_VAL(seq, 1), \ + .lock_fail = __SIX_LOCK_HELD_read, \ + .unlock_val = __SIX_VAL(seq, 1), \ + .held_mask = __SIX_LOCK_HELD_write, \ + .unlock_wakeup = SIX_LOCK_read, \ + }, \ +} + +static void six_set_owner(struct six_lock *lock, enum six_lock_type type) +{ + if (type == SIX_LOCK_intent) + lock->owner = current; +} + +static void six_clear_owner(struct six_lock *lock, enum six_lock_type type) +{ + if (type == SIX_LOCK_intent) + lock->owner = NULL; +} + +static inline bool __six_trylock_type(struct six_lock *lock, + enum six_lock_type type) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state old; + u64 v = READ_ONCE(lock->state.v); + + do { + old.v = v; + + EBUG_ON(type == SIX_LOCK_write && + ((old.v & __SIX_LOCK_HELD_write) || + !(old.v & __SIX_LOCK_HELD_intent))); + + if (old.v & l[type].lock_fail) + return false; + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, + old.v + l[type].lock_val)) != old.v); + return true; +} + +bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + bool ret = __six_trylock_type(lock, type); + + if (ret) { + six_acquire(&lock->dep_map, 1); + six_set_owner(lock, type); + } + + return ret; +} + +bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state old; + u64 v = READ_ONCE(lock->state.v); + + do { + old.v = v; + + if (old.seq != seq || old.v & l[type].lock_fail) + return false; + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, + old.v + l[type].lock_val)) != old.v); + + six_acquire(&lock->dep_map, 1); + six_set_owner(lock, type); + return true; +} + +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; +}; + +/* This is probably up there with the more evil things I've done */ +#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) + +static inline int six_can_spin_on_owner(struct six_lock *lock) +{ + struct task_struct *owner; + int retval = 1; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = READ_ONCE(lock->owner); + if (owner) + retval = owner->on_cpu; + rcu_read_unlock(); + /* + * if lock->owner is not set, the mutex owner may have just acquired + * it and not set the owner yet or the mutex has been released. 
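+	 *
+	 * Either way, spinning is only worthwhile while the owning task is
+	 * actually running on a cpu - which is what the owner->on_cpu check
+	 * above reports.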
+ */ + return retval; +} + +static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) +{ + bool ret = true; + + rcu_read_lock(); + while (lock->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner->on_cpu || need_resched()) { + ret = false; + break; + } + + cpu_relax_lowlatency(); + } + rcu_read_unlock(); + + return ret; +} + +static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + struct task_struct *task = current; + + if (type == SIX_LOCK_write) + return false; + + preempt_disable(); + if (!six_can_spin_on_owner(lock)) + goto fail; + + if (!osq_lock(&lock->osq)) + goto fail; + + while (1) { + struct task_struct *owner; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = READ_ONCE(lock->owner); + if (owner && !six_spin_on_owner(lock, owner)) + break; + + if (__six_trylock_type(lock, type)) { + osq_unlock(&lock->osq); + preempt_enable(); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax_lowlatency(); + } + + osq_unlock(&lock->osq); +fail: + preempt_enable(); + + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock again. This avoids getting + * scheduled out right after we obtained the lock. 
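+	 *
+	 * (On failure the caller, six_lock_type(), falls back to the
+	 * wait-list slowpath.)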
+ */ + if (need_resched()) + schedule(); + + return false; +} + +void six_lock_type(struct six_lock *lock, enum six_lock_type type) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state old, new; + struct six_lock_waiter wait; + u64 v; + + six_acquire(&lock->dep_map, 0); + + if (__six_trylock_type(lock, type)) + goto done; + + if (six_optimistic_spin(lock, type)) + goto done; + + lock_contended(&lock->dep_map, _RET_IP_); + + INIT_LIST_HEAD(&wait.list); + wait.task = current; + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (list_empty_careful(&wait.list)) { + raw_spin_lock(&lock->wait_lock); + list_add_tail(&wait.list, &lock->wait_list[type]); + raw_spin_unlock(&lock->wait_lock); + } + + v = READ_ONCE(lock->state.v); + do { + new.v = old.v = v; + + if (!(old.v & l[type].lock_fail)) + new.v += l[type].lock_val; + else if (!(new.waiters & (1 << type))) + new.waiters |= 1 << type; + else + break; /* waiting bit already set */ + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, new.v)) != old.v); + + if (!(old.v & l[type].lock_fail)) + break; + + schedule(); + } + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait.list)) { + raw_spin_lock(&lock->wait_lock); + list_del_init(&wait.list); + raw_spin_unlock(&lock->wait_lock); + } +done: + lock_acquired(&lock->dep_map, _RET_IP_); + six_set_owner(lock, type); +} + +static inline void six_lock_wakeup(struct six_lock *lock, + union six_lock_state state, + unsigned waitlist_id) +{ + struct list_head *wait_list = &lock->wait_list[waitlist_id]; + struct six_lock_waiter *w, *next; + + if (waitlist_id == SIX_LOCK_write && state.read_lock) + return; + + if (!(state.waiters & (1 << waitlist_id))) + return; + + clear_bit(waitlist_bitnr(waitlist_id), + (unsigned long *) &lock->state.v); + + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, wait_list, list) { + list_del_init(&w->list); + + if (wake_up_process(w->task) && + waitlist_id != SIX_LOCK_read) { + if (!list_empty(wait_list)) + set_bit(waitlist_bitnr(waitlist_id), + (unsigned long *) &lock->state.v); + break; + } + } + + raw_spin_unlock(&lock->wait_lock); +} + +void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state state; + + six_clear_owner(lock, type); + + EBUG_ON(!(lock->state.v & l[type].held_mask)); + EBUG_ON(type == SIX_LOCK_write && + !(lock->state.v & __SIX_LOCK_HELD_intent)); + + state.v = atomic64_add_return_release(l[type].unlock_val, + &lock->state.counter); + six_release(&lock->dep_map); + six_lock_wakeup(lock, state, l[type].unlock_wakeup); +} + +bool six_trylock_convert(struct six_lock *lock, + enum six_lock_type from, + enum six_lock_type to) +{ + const struct six_lock_vals l[] = LOCK_VALS; + union six_lock_state old, new; + u64 v = READ_ONCE(lock->state.v); + + do { + new.v = old.v = v; + new.v += l[from].unlock_val; + + if (new.v & l[to].lock_fail) + return false; + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, + new.v + l[to].lock_val)) != old.v); + + six_clear_owner(lock, from); + six_set_owner(lock, to); + + six_lock_wakeup(lock, new, l[from].unlock_wakeup); + + return true; +} + +/* + * Increment read/intent lock count, assuming we already have it read or intent + * locked: + */ +void six_lock_increment(struct six_lock *lock, enum six_lock_type type) +{ + const struct six_lock_vals l[] = LOCK_VALS; + + EBUG_ON(type == SIX_LOCK_write); + six_acquire(&lock->dep_map, 0); + + /* XXX: 
assert already locked, and that we don't overflow: */ + + atomic64_add(l[type].lock_val, &lock->state.counter); +} + +/* Convert from intent to read: */ +void six_lock_downgrade(struct six_lock *lock) +{ + six_lock_increment(lock, SIX_LOCK_read); + six_unlock_intent(lock); +} diff --git a/libbcachefs/six.h b/libbcachefs/six.h new file mode 100644 index 00000000..01ed3385 --- /dev/null +++ b/libbcachefs/six.h @@ -0,0 +1,136 @@ + +#ifndef _BCACHE_SIX_H +#define _BCACHE_SIX_H + +#include <linux/lockdep.h> +#include <linux/osq_lock.h> +#include <linux/sched.h> +#include <linux/types.h> + +#include "util.h" + +/* + * LOCK STATES: + * + * read, intent, write (i.e. shared/intent/exclusive, hence the name) + * + * read and write work as with normal read/write locks - a lock can have + * multiple readers, but write excludes reads and other write locks. + * + * Intent does not block read, but it does block other intent locks. The idea is + * by taking an intent lock, you can then later upgrade to a write lock without + * dropping your read lock and without deadlocking - because no other thread has + * the intent lock and thus no other thread could be trying to take the write + * lock. + */ + +union six_lock_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + /* for waitlist_bitnr() */ + unsigned long l; + }; + + struct { + unsigned read_lock:26; + unsigned intent_lock:3; + unsigned waiters:3; + /* + * seq works much like in seqlocks: it's incremented every time + * we lock and unlock for write. + * + * If it's odd write lock is held, even unlocked. + * + * Thus readers can unlock, and then lock again later iff it + * hasn't been modified in the meantime. + */ + u32 seq; + }; +}; + +#define SIX_LOCK_MAX_RECURSE ((1 << 3) - 1) + +enum six_lock_type { + SIX_LOCK_read, + SIX_LOCK_intent, + SIX_LOCK_write, +}; + +struct six_lock { + union six_lock_state state; + struct task_struct *owner; + struct optimistic_spin_queue osq; + + raw_spinlock_t wait_lock; + struct list_head wait_list[3]; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +static __always_inline void __six_lock_init(struct six_lock *lock, + const char *name, + struct lock_class_key *key) +{ + atomic64_set(&lock->state.counter, 0); + raw_spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); + INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); + INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_write]); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *) lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +} + +#define six_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __six_lock_init((lock), #lock, &__key); \ +} while (0) + +bool six_trylock_type(struct six_lock *, enum six_lock_type); +bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned); +void six_lock_type(struct six_lock *, enum six_lock_type); +void six_unlock_type(struct six_lock *, enum six_lock_type); +bool six_trylock_convert(struct six_lock *, enum six_lock_type, + enum six_lock_type); +void six_lock_increment(struct six_lock *, enum six_lock_type); +void six_lock_downgrade(struct six_lock *); + +#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) + +#define __SIX_LOCK(type) \ +static __always_inline bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return six_trylock_type(lock, SIX_LOCK_##type); \ +} \ + \ +static __always_inline bool six_relock_##type(struct six_lock *lock, u32 seq)\ 
+{ \ + return six_relock_type(lock, SIX_LOCK_##type, seq); \ +} \ + \ +static __always_inline void six_lock_##type(struct six_lock *lock) \ +{ \ + six_lock_type(lock, SIX_LOCK_##type); \ +} \ + \ +static __always_inline void six_unlock_##type(struct six_lock *lock) \ +{ \ + six_unlock_type(lock, SIX_LOCK_##type); \ +} + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) + +#endif /* _BCACHE_SIX_H */ diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h new file mode 100644 index 00000000..f70fc1a9 --- /dev/null +++ b/libbcachefs/str_hash.h @@ -0,0 +1,385 @@ +#ifndef _BCACHE_STR_HASH_H +#define _BCACHE_STR_HASH_H + +#include "btree_iter.h" +#include "checksum.h" +#include "inode.h" +#include "siphash.h" +#include "super.h" + +#include <linux/crc32c.h> +#include <crypto/hash.h> + +struct bch_hash_info { + u8 type; + union { + __le64 crc_key; + SIPHASH_KEY siphash_key; + }; +}; + +static inline struct bch_hash_info +bch2_hash_info_init(struct bch_fs *c, + const struct bch_inode_unpacked *bi) +{ + /* XXX ick */ + struct bch_hash_info info = { + .type = (bi->i_flags >> INODE_STR_HASH_OFFSET) & + ~(~0 << INODE_STR_HASH_BITS) + }; + + switch (info.type) { + case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_CRC64: + info.crc_key = bi->i_hash_seed; + break; + case BCH_STR_HASH_SIPHASH: { + SHASH_DESC_ON_STACK(desc, c->sha256); + u8 digest[crypto_shash_digestsize(c->sha256)]; + + desc->tfm = c->sha256; + desc->flags = 0; + + crypto_shash_digest(desc, (void *) &bi->i_hash_seed, + sizeof(bi->i_hash_seed), digest); + memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); + break; + } + default: + BUG(); + } + + return info; +} + +struct bch_str_hash_ctx { + union { + u32 crc32c; + u64 crc64; + SIPHASH_CTX siphash; + }; +}; + +static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_CRC32C: + ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); + break; + case BCH_STR_HASH_CRC64: + ctx->crc64 = bch2_crc64_update(~0, &info->crc_key, sizeof(info->crc_key)); + break; + case BCH_STR_HASH_SIPHASH: + SipHash24_Init(&ctx->siphash, &info->siphash_key); + break; + default: + BUG(); + } +} + +static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info, + const void *data, size_t len) +{ + switch (info->type) { + case BCH_STR_HASH_CRC32C: + ctx->crc32c = crc32c(ctx->crc32c, data, len); + break; + case BCH_STR_HASH_CRC64: + ctx->crc64 = bch2_crc64_update(ctx->crc64, data, len); + break; + case BCH_STR_HASH_SIPHASH: + SipHash24_Update(&ctx->siphash, data, len); + break; + default: + BUG(); + } +} + +static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_CRC32C: + return ctx->crc32c; + case BCH_STR_HASH_CRC64: + return ctx->crc64 >> 1; + case BCH_STR_HASH_SIPHASH: + return SipHash24_End(&ctx->siphash) >> 1; + default: + BUG(); + } +} + +struct bch_hash_desc { + enum btree_id btree_id; + u8 key_type; + u8 whiteout_type; + + u64 (*hash_key)(const struct bch_hash_info *, const void *); + u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); + bool (*cmp_key)(struct bkey_s_c, const void *); + bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); +}; + +static inline struct bkey_s_c +bch2_hash_lookup_at(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter, const void *search) +{ + u64 inode = iter->pos.inode; + + do 
{ + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); + + if (btree_iter_err(k)) + return k; + + if (k.k->type == desc.key_type) { + if (!desc.cmp_key(k, search)) + return k; + } else if (k.k->type == desc.whiteout_type) { + ; + } else { + /* hole, not found */ + break; + } + + bch2_btree_iter_advance_pos(iter); + } while (iter->pos.inode == inode); + + return bkey_s_c_err(-ENOENT); +} + +static inline struct bkey_s_c +bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter, struct bkey_s_c search) +{ + u64 inode = iter->pos.inode; + + do { + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); + + if (btree_iter_err(k)) + return k; + + if (k.k->type == desc.key_type) { + if (!desc.cmp_bkey(k, search)) + return k; + } else if (k.k->type == desc.whiteout_type) { + ; + } else { + /* hole, not found */ + break; + } + + bch2_btree_iter_advance_pos(iter); + } while (iter->pos.inode == inode); + + return bkey_s_c_err(-ENOENT); +} + +static inline struct bkey_s_c +bch2_hash_lookup(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct bch_fs *c, u64 inode, + struct btree_iter *iter, const void *key) +{ + bch2_btree_iter_init(iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key))); + + return bch2_hash_lookup_at(desc, info, iter, key); +} + +static inline struct bkey_s_c +bch2_hash_lookup_intent(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct bch_fs *c, u64 inode, + struct btree_iter *iter, const void *key) +{ + bch2_btree_iter_init_intent(iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key))); + + return bch2_hash_lookup_at(desc, info, iter, key); +} + +static inline struct bkey_s_c +bch2_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter) +{ + while (1) { + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); + + if (btree_iter_err(k)) + return k; + + if (k.k->type != desc.key_type) + return k; + + /* hash collision, keep going */ + bch2_btree_iter_advance_pos(iter); + if (iter->pos.inode != k.k->p.inode) + return bkey_s_c_err(-ENOENT); + } +} + +static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct bch_fs *c, u64 inode, + struct btree_iter *iter, + const void *key) +{ + bch2_btree_iter_init_intent(iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key))); + + return bch2_hash_hole_at(desc, iter); +} + +static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter, + struct btree_iter *start) +{ + bch2_btree_iter_set_pos(iter, + btree_type_successor(start->btree_id, start->pos)); + + while (1) { + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); + int ret = btree_iter_err(k); + + if (ret) + return ret; + + if (k.k->type != desc.key_type && + k.k->type != desc.whiteout_type) + return false; + + if (k.k->type == desc.key_type && + desc.hash_bkey(info, k) <= start->pos.offset) + return true; + + bch2_btree_iter_advance_pos(iter); + } +} + +#define BCH_HASH_SET_MUST_CREATE 1 +#define BCH_HASH_SET_MUST_REPLACE 2 + +static inline int bch2_hash_set(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct bch_fs *c, u64 inode, + u64 *journal_seq, + struct bkey_i *insert, int flags) +{ + struct btree_iter iter, hashed_slot; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_init_intent(&hashed_slot, c, desc.btree_id, + POS(inode, desc.hash_bkey(info, 
bkey_i_to_s_c(insert)))); + bch2_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos); + bch2_btree_iter_link(&hashed_slot, &iter); +retry: + /* + * On hash collision, we have to keep the slot we hashed to locked while + * we do the insert - to avoid racing with another thread deleting + * whatever's in the slot we hashed to: + */ + ret = bch2_btree_iter_traverse(&hashed_slot); + if (ret) + goto err; + + /* + * On -EINTR/retry, we dropped locks - always restart from the slot we + * hashed to: + */ + bch2_btree_iter_copy(&iter, &hashed_slot); + + k = bch2_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert)); + + ret = btree_iter_err(k); + if (ret == -ENOENT) { + if (flags & BCH_HASH_SET_MUST_REPLACE) { + ret = -ENOENT; + goto err; + } + + /* + * Not found, so we're now looking for any open + * slot - we might have skipped over a whiteout + * that we could have used, so restart from the + * slot we hashed to: + */ + bch2_btree_iter_copy(&iter, &hashed_slot); + k = bch2_hash_hole_at(desc, &iter); + if ((ret = btree_iter_err(k))) + goto err; + } else if (!ret) { + if (flags & BCH_HASH_SET_MUST_CREATE) { + ret = -EEXIST; + goto err; + } + } else { + goto err; + } + + insert->k.p = iter.pos; + ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&iter, insert)); +err: + if (ret == -EINTR) + goto retry; + + /* + * On successful insert, we don't want to clobber ret with error from + * iter: + */ + bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(&hashed_slot); + return ret; +} + +static inline int bch2_hash_delete(const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct bch_fs *c, u64 inode, + u64 *journal_seq, const void *key) +{ + struct btree_iter iter, whiteout_iter; + struct bkey_s_c k; + struct bkey_i delete; + int ret = -ENOENT; + + bch2_btree_iter_init_intent(&iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key))); + bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key))); + bch2_btree_iter_link(&iter, &whiteout_iter); +retry: + k = bch2_hash_lookup_at(desc, info, &iter, key); + if ((ret = btree_iter_err(k))) + goto err; + + ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, &iter); + if (ret < 0) + goto err; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + delete.k.type = ret ? 
desc.whiteout_type : KEY_TYPE_DELETED; + + ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC, + BTREE_INSERT_ENTRY(&iter, &delete)); +err: + if (ret == -EINTR) + goto retry; + + bch2_btree_iter_unlock(&whiteout_iter); + bch2_btree_iter_unlock(&iter); + return ret; +} + +#endif /* _BCACHE_STR_HASH_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c new file mode 100644 index 00000000..9f41d71d --- /dev/null +++ b/libbcachefs/super-io.c @@ -0,0 +1,817 @@ + +#include "bcachefs.h" +#include "checksum.h" +#include "error.h" +#include "io.h" +#include "journal.h" +#include "super-io.h" +#include "super.h" +#include "vstructs.h" + +#include <linux/backing-dev.h> +#include <linux/sort.h> + +static inline void __bch2_sb_layout_size_assert(void) +{ + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); +} + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f; + + /* XXX: need locking around superblock to access optional fields */ + + vstruct_for_each(sb, f) + if (le32_to_cpu(f->type) == type) + return f; + return NULL; +} + +void bch2_free_super(struct bcache_superblock *sb) +{ + if (sb->bio) + bio_put(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, sb->mode); + + free_pages((unsigned long) sb->sb, sb->page_order); + memset(sb, 0, sizeof(*sb)); +} + +static int __bch2_super_realloc(struct bcache_superblock *sb, unsigned order) +{ + struct bch_sb *new_sb; + struct bio *bio; + + if (sb->page_order >= order && sb->sb) + return 0; + + if (dynamic_fault("bcachefs:add:super_realloc")) + return -ENOMEM; + + bio = bio_kmalloc(GFP_KERNEL, 1 << order); + if (!bio) + return -ENOMEM; + + if (sb->bio) + bio_put(sb->bio); + sb->bio = bio; + + new_sb = (void *) __get_free_pages(GFP_KERNEL, order); + if (!new_sb) + return -ENOMEM; + + if (sb->sb) + memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); + + free_pages((unsigned long) sb->sb, sb->page_order); + sb->sb = new_sb; + + sb->page_order = order; + + return 0; +} + +static int bch2_sb_realloc(struct bcache_superblock *sb, unsigned u64s) +{ + u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); + u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; + + if (new_bytes > max_bytes) { + char buf[BDEVNAME_SIZE]; + + pr_err("%s: superblock too big: want %llu but have %llu", + bdevname(sb->bdev, buf), new_bytes, max_bytes); + return -ENOSPC; + } + + return __bch2_super_realloc(sb, get_order(new_bytes)); +} + +static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s) +{ + u64 bytes = __vstruct_bytes(struct bch_sb, u64s); + struct bch_sb *sb; + unsigned order = get_order(bytes); + + if (c->disk_sb && order <= c->disk_sb_order) + return 0; + + sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (!sb) + return -ENOMEM; + + if (c->disk_sb) + memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order); + + free_pages((unsigned long) c->disk_sb, c->disk_sb_order); + + c->disk_sb = sb; + c->disk_sb_order = order; + return 0; +} + +static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb, + struct bch_sb_field *f, + unsigned u64s) +{ + unsigned old_u64s = f ? 
le32_to_cpu(f->u64s) : 0;
+
+	if (!f) {
+		f = vstruct_last(sb);
+		memset(f, 0, sizeof(u64) * u64s);
+		f->u64s = cpu_to_le32(u64s);
+		f->type = 0;
+	} else {
+		void *src, *dst;
+
+		src = vstruct_end(f);
+		f->u64s = cpu_to_le32(u64s);
+		dst = vstruct_end(f);
+
+		memmove(dst, src, vstruct_end(sb) - src);
+
+		if (dst > src)
+			memset(src, 0, dst - src);
+	}
+
+	le32_add_cpu(&sb->u64s, u64s - old_u64s);
+
+	return f;
+}
+
+struct bch_sb_field *bch2_sb_field_resize(struct bcache_superblock *sb,
+					  enum bch_sb_field_type type,
+					  unsigned u64s)
+{
+	struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	ssize_t d = -old_u64s + u64s;
+
+	if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+		return NULL;
+
+	f = __bch2_sb_field_resize(sb->sb, f, u64s);
+	f->type = type;
+	return f;
+}
+
+struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
+					     enum bch_sb_field_type type,
+					     unsigned u64s)
+{
+	struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
+	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	ssize_t d = -old_u64s + u64s;
+	struct bch_dev *ca;
+	unsigned i;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
+		return NULL;
+
+	/* XXX: we're not checking that offline devices have enough space */
+
+	for_each_online_member(ca, c, i) {
+		struct bcache_superblock *sb = &ca->disk_sb;
+
+		if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+			percpu_ref_put(&ca->ref);
+			return NULL;
+		}
+	}
+
+	f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
+	f->type = type;
+	return f;
+}
+
+static const char *validate_sb_layout(struct bch_sb_layout *layout)
+{
+	u64 offset, prev_offset, max_sectors;
+	unsigned i;
+
+	if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
+		return "Not a bcachefs superblock layout";
+
+	if (layout->layout_type != 0)
+		return "Invalid superblock layout type";
+
+	if (!layout->nr_superblocks)
+		return "Invalid superblock layout: no superblocks";
+
+	if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
+		return "Invalid superblock layout: too many superblocks";
+
+	max_sectors = 1 << layout->sb_max_size_bits;
+
+	prev_offset = le64_to_cpu(layout->sb_offset[0]);
+
+	for (i = 1; i < layout->nr_superblocks; i++) {
+		offset = le64_to_cpu(layout->sb_offset[i]);
+
+		if (offset < prev_offset + max_sectors)
+			return "Invalid superblock layout: superblocks overlap";
+		prev_offset = offset;
+	}
+
+	return NULL;
+}
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+	u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
+
+	return l < r ? -1 : l > r ?
1 : 0;
+}
+
+const char *bch2_validate_journal_layout(struct bch_sb *sb,
+					 struct bch_member_cpu mi)
+{
+	struct bch_sb_field_journal *journal;
+	const char *err;
+	unsigned nr;
+	unsigned i;
+	u64 *b;
+
+	journal = bch2_sb_get_journal(sb);
+	if (!journal)
+		return NULL;
+
+	nr = bch2_nr_journal_buckets(journal);
+	if (!nr)
+		return NULL;
+
+	b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+	if (!b)
+		return "cannot allocate memory";
+
+	for (i = 0; i < nr; i++)
+		b[i] = le64_to_cpu(journal->buckets[i]);
+
+	sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+	err = "journal bucket at sector 0";
+	if (!b[0])
+		goto err;
+
+	err = "journal bucket before first bucket";
+	if (b[0] < mi.first_bucket)
+		goto err;
+
+	err = "journal bucket past end of device";
+	if (b[nr - 1] >= mi.nbuckets)
+		goto err;
+
+	err = "duplicate journal buckets";
+	for (i = 0; i + 1 < nr; i++)
+		if (b[i] == b[i + 1])
+			goto err;
+
+	err = NULL;
+err:
+	kfree(b);
+	return err;
+}
+
+static const char *bch2_sb_validate_members(struct bch_sb *sb)
+{
+	struct bch_sb_field_members *mi;
+	unsigned i;
+
+	mi = bch2_sb_get_members(sb);
+	if (!mi)
+		return "Invalid superblock: member info area missing";
+
+	if ((void *) (mi->members + sb->nr_devices) >
+	    vstruct_end(&mi->field))
+		return "Invalid superblock: bad member info";
+
+	for (i = 0; i < sb->nr_devices; i++) {
+		if (bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
+			continue;
+
+		if (le16_to_cpu(mi->members[i].bucket_size) <
+		    BCH_SB_BTREE_NODE_SIZE(sb))
+			return "bucket size smaller than btree node size";
+	}
+
+	return NULL;
+}
+
+const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
+{
+	struct bch_sb *sb = disk_sb->sb;
+	struct bch_sb_field *f;
+	struct bch_sb_field_members *sb_mi;
+	struct bch_member_cpu mi;
+	const char *err;
+	u16 block_size;
+
+	switch (le64_to_cpu(sb->version)) {
+	case BCACHE_SB_VERSION_CDEV_V4:
+		break;
+	default:
+		return "Unsupported superblock version";
+	}
+
+	if (BCH_SB_INITIALIZED(sb) &&
+	    le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4)
+		return "Unsupported superblock version";
+
+	block_size = le16_to_cpu(sb->block_size);
+
+	if (!is_power_of_2(block_size) ||
+	    block_size > PAGE_SECTORS)
+		return "Bad block size";
+
+	if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
+		return "Bad user UUID";
+
+	if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
+		return "Bad internal UUID";
+
+	if (!sb->nr_devices ||
+	    sb->nr_devices <= sb->dev_idx ||
+	    sb->nr_devices > BCH_SB_MEMBERS_MAX)
+		return "Bad cache device number in set";
+
+	if (!BCH_SB_META_REPLICAS_WANT(sb) ||
+	    BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of metadata replicas";
+
+	if (!BCH_SB_META_REPLICAS_REQ(sb) ||
+	    BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of metadata replicas";
+
+	if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
+	    BCH_SB_META_REPLICAS_HAVE(sb) >
+	    BCH_SB_META_REPLICAS_WANT(sb))
+		return "Invalid number of metadata replicas";
+
+	if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
+	    BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of data replicas";
+
+	if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
+	    BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+		return "Invalid number of data replicas";
+
+	if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
+	    BCH_SB_DATA_REPLICAS_HAVE(sb) >
+	    BCH_SB_DATA_REPLICAS_WANT(sb))
+		return "Invalid number of data replicas";
+
+	if (!BCH_SB_BTREE_NODE_SIZE(sb))
+		return "Btree node size not set";
+
+	if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
+		return "Btree 
node size not a power of two"; + + if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX) + return "Btree node size too large"; + + if (BCH_SB_GC_RESERVE(sb) < 5) + return "gc reserve percentage too small"; + + if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size) + return "max journal entry size too small"; + + /* 4 mb max: */ + if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX) + return "max journal entry size too big"; + + if (!sb->time_precision || + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) + return "invalid time precision"; + + /* validate layout */ + err = validate_sb_layout(&sb->layout); + if (err) + return err; + + vstruct_for_each(sb, f) { + if (!f->u64s) + return "Invalid superblock: invalid optional field"; + + if (vstruct_next(f) > vstruct_last(sb)) + return "Invalid superblock: invalid optional field"; + + if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR) + return "Invalid superblock: unknown optional field type"; + } + + err = bch2_sb_validate_members(sb); + if (err) + return err; + + sb_mi = bch2_sb_get_members(sb); + mi = bch2_mi_to_cpu(sb_mi->members + sb->dev_idx); + + if (mi.nbuckets > LONG_MAX) + return "Too many buckets"; + + if (mi.nbuckets - mi.first_bucket < 1 << 10) + return "Not enough buckets"; + + if (!is_power_of_2(mi.bucket_size) || + mi.bucket_size < PAGE_SECTORS || + mi.bucket_size < block_size) + return "Bad bucket size"; + + if (get_capacity(disk_sb->bdev->bd_disk) < + mi.bucket_size * mi.nbuckets) + return "Invalid superblock: device too small"; + + err = bch2_validate_journal_layout(sb, mi); + if (err) + return err; + + return NULL; +} + +/* device open: */ + +static const char *bch2_blkdev_open(const char *path, fmode_t mode, + void *holder, struct block_device **ret) +{ + struct block_device *bdev; + + *ret = NULL; + bdev = blkdev_get_by_path(path, mode, holder); + if (bdev == ERR_PTR(-EBUSY)) + return "device busy"; + + if (IS_ERR(bdev)) + return "failed to open device"; + + if (mode & FMODE_WRITE) + bdev_get_queue(bdev)->backing_dev_info.capabilities + |= BDI_CAP_STABLE_WRITES; + + *ret = bdev; + return NULL; +} + +static void bch2_sb_update(struct bch_fs *c) +{ + struct bch_sb *src = c->disk_sb; + struct bch_sb_field_members *mi = bch2_sb_get_members(src); + struct bch_dev *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + c->sb.uuid = src->uuid; + c->sb.user_uuid = src->user_uuid; + c->sb.block_size = le16_to_cpu(src->block_size); + c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src); + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src); + c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src); + c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); + c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); + c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); + c->sb.time_precision = le32_to_cpu(src->time_precision); + + for_each_member_device(ca, c, i) + ca->mi = bch2_mi_to_cpu(mi->members + i); +} + +/* doesn't copy member info */ +static void __copy_super(struct bch_sb *dst, struct bch_sb *src) +{ + struct bch_sb_field *src_f, *dst_f; + + dst->version = src->version; + dst->seq = src->seq; + dst->uuid = src->uuid; + dst->user_uuid = src->user_uuid; + memcpy(dst->label, src->label, sizeof(dst->label)); + + dst->block_size = src->block_size; + dst->nr_devices = src->nr_devices; + + dst->time_base_lo = src->time_base_lo; + dst->time_base_hi = src->time_base_hi; + dst->time_precision = 
src->time_precision; + + memcpy(dst->flags, src->flags, sizeof(dst->flags)); + memcpy(dst->features, src->features, sizeof(dst->features)); + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + vstruct_for_each(src, src_f) { + if (src_f->type == BCH_SB_FIELD_journal) + continue; + + dst_f = bch2_sb_field_get(dst, src_f->type); + dst_f = __bch2_sb_field_resize(dst, dst_f, + le32_to_cpu(src_f->u64s)); + + memcpy(dst_f, src_f, vstruct_bytes(src_f)); + } +} + +int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) +{ + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(src); + unsigned journal_u64s = journal_buckets + ? le32_to_cpu(journal_buckets->field.u64s) + : 0; + + lockdep_assert_held(&c->sb_lock); + + if (bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s)) + return -ENOMEM; + + __copy_super(c->disk_sb, src); + bch2_sb_update(c); + + return 0; +} + +int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(dst); + unsigned journal_u64s = journal_buckets + ? le32_to_cpu(journal_buckets->field.u64s) + : 0; + unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; + int ret; + + ret = bch2_sb_realloc(&ca->disk_sb, u64s); + if (ret) + return ret; + + __copy_super(dst, src); + + return 0; +} + +/* read superblock: */ + +static const char *read_one_super(struct bcache_superblock *sb, u64 offset) +{ + struct bch_csum csum; + size_t bytes; + unsigned order; +reread: + bio_reset(sb->bio); + sb->bio->bi_bdev = sb->bdev; + sb->bio->bi_iter.bi_sector = offset; + sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bch2_bio_map(sb->bio, sb->sb); + + if (submit_bio_wait(sb->bio)) + return "IO error"; + + if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) + return "Not a bcachefs superblock"; + + if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4) + return "Unsupported superblock version"; + + bytes = vstruct_bytes(sb->sb); + + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) + return "Bad superblock: too big"; + + order = get_order(bytes); + if (order > sb->page_order) { + if (__bch2_super_realloc(sb, order)) + return "cannot allocate memory"; + goto reread; + } + + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) + return "unknown csum type"; + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + (struct nonce) { 0 }, sb->sb); + + if (bch2_crc_cmp(csum, sb->sb->csum)) + return "bad checksum reading superblock"; + + return NULL; +} + +const char *bch2_read_super(struct bcache_superblock *sb, + struct bch_opts opts, + const char *path) +{ + u64 offset = opt_defined(opts.sb) ? 
opts.sb : BCH_SB_SECTOR; + struct bch_sb_layout layout; + const char *err; + unsigned i; + + memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; + + if (!(opt_defined(opts.noexcl) && opts.noexcl)) + sb->mode |= FMODE_EXCL; + + if (!(opt_defined(opts.nochanges) && opts.nochanges)) + sb->mode |= FMODE_WRITE; + + err = bch2_blkdev_open(path, sb->mode, sb, &sb->bdev); + if (err) + return err; + + err = "cannot allocate memory"; + if (__bch2_super_realloc(sb, 0)) + goto err; + + err = "dynamic fault"; + if (bch2_fs_init_fault("read_super")) + goto err; + + err = read_one_super(sb, offset); + if (!err) + goto got_super; + + if (offset != BCH_SB_SECTOR) { + pr_err("error reading superblock: %s", err); + goto err; + } + + pr_err("error reading default superblock: %s", err); + + /* + * Error reading primary superblock - read location of backup + * superblocks: + */ + bio_reset(sb->bio); + sb->bio->bi_bdev = sb->bdev; + sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; + sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + /* + * use sb buffer to read layout, since sb buffer is page aligned but + * layout won't be: + */ + bch2_bio_map(sb->bio, sb->sb); + + err = "IO error"; + if (submit_bio_wait(sb->bio)) + goto err; + + memcpy(&layout, sb->sb, sizeof(layout)); + err = validate_sb_layout(&layout); + if (err) + goto err; + + for (i = 0; i < layout.nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout.sb_offset[i]); + + if (offset == BCH_SB_SECTOR) + continue; + + err = read_one_super(sb, offset); + if (!err) + goto got_super; + } + goto err; +got_super: + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", + le64_to_cpu(sb->sb->version), + le64_to_cpu(sb->sb->flags), + le64_to_cpu(sb->sb->seq), + le16_to_cpu(sb->sb->u64s)); + + err = "Superblock block size smaller than device block size"; + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev)) + goto err; + + return NULL; +err: + bch2_free_super(sb); + return err; +} + +/* write superblock: */ + +static void write_super_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + + /* XXX: return errors directly */ + + bch2_dev_fatal_io_err_on(bio->bi_error, ca, "superblock write"); + + closure_put(&ca->fs->sb_write); + percpu_ref_put(&ca->io_ref); +} + +static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + if (idx >= sb->layout.nr_superblocks) + return false; + + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + sb->offset = sb->layout.sb_offset[idx]; + + SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); + sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), + (struct nonce) { 0 }, sb); + + bio_reset(bio); + bio->bi_bdev = ca->disk_sb.bdev; + bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); + bio->bi_iter.bi_size = + roundup(vstruct_bytes(sb), + bdev_logical_block_size(ca->disk_sb.bdev)); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); + bch2_bio_map(bio, sb); + + closure_bio_submit(bio, &c->sb_write); + return true; +} + +void bch2_write_super(struct bch_fs *c) +{ + struct closure *cl = &c->sb_write; + struct bch_dev *ca; + unsigned i, super_idx = 0; + bool wrote; + + lockdep_assert_held(&c->sb_lock); + + closure_init_stack(cl); + + le64_add_cpu(&c->disk_sb->seq, 1); + + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + + if 
(c->opts.nochanges) + goto out; + + do { + wrote = false; + for_each_online_member(ca, c, i) + if (write_one_super(c, ca, super_idx)) + wrote = true; + + closure_sync(cl); + super_idx++; + } while (wrote); +out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); +} + +void bch2_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k, + bool meta) +{ + struct bch_member *mi; + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + unsigned nr_replicas = 0; + + mutex_lock(&c->sb_lock); + + /* recheck, might have raced */ + if (bch2_check_super_marked(c, k, meta)) { + mutex_unlock(&c->sb_lock); + return; + } + + mi = bch2_sb_get_members(c->disk_sb)->members; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) { + (meta + ? SET_BCH_MEMBER_HAS_METADATA + : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true); + nr_replicas++; + } + + nr_replicas = min_t(unsigned, nr_replicas, + (meta + ? BCH_SB_META_REPLICAS_HAVE + : BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb)); + (meta + ? SET_BCH_SB_META_REPLICAS_HAVE + : SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h new file mode 100644 index 00000000..8f0d82db --- /dev/null +++ b/libbcachefs/super-io.h @@ -0,0 +1,159 @@ +#ifndef _BCACHE_SUPER_IO_H +#define _BCACHE_SUPER_IO_H + +#include "extents.h" +#include "super_types.h" + +#include <asm/byteorder.h> + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch2_sb_field_resize(struct bcache_superblock *, + enum bch_sb_field_type, unsigned); +struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *, + enum bch_sb_field_type, unsigned); + +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +#define BCH_SB_FIELD_TYPE(_name) \ +static inline struct bch_sb_field_##_name * \ +bch2_sb_get_##_name(struct bch_sb *sb) \ +{ \ + return field_to_type(bch2_sb_field_get(sb, \ + BCH_SB_FIELD_##_name), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch2_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \ +{ \ + return field_to_type(bch2_sb_field_resize(sb, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \ +{ \ + return field_to_type(bch2_fs_sb_field_resize(c, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} + +BCH_SB_FIELD_TYPE(journal); +BCH_SB_FIELD_TYPE(members); +BCH_SB_FIELD_TYPE(crypt); + +static inline bool bch2_sb_test_feature(struct bch_sb *sb, + enum bch_sb_features f) +{ + unsigned w = f / 64; + unsigned b = f % 64; + + return le64_to_cpu(sb->features[w]) & (1ULL << b); +} + +static inline void bch2_sb_set_feature(struct bch_sb *sb, + enum bch_sb_features f) +{ + if (!bch2_sb_test_feature(sb, f)) { + unsigned w = f / 64; + unsigned b = f % 64; + + le64_add_cpu(&sb->features[w], 1ULL << b); + } +} + +static inline __le64 bch2_sb_magic(struct bch_fs *c) +{ + __le64 ret; + memcpy(&ret, &c->sb.uuid, sizeof(ret)); + return ret; +} + +static inline __u64 jset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); +} + +static inline __u64 pset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ PSET_MAGIC); +} + +static inline __u64 bset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); +} + +static inline struct 
bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) +{ + return (struct bch_member_cpu) { + .nbuckets = le64_to_cpu(mi->nbuckets), + .first_bucket = le16_to_cpu(mi->first_bucket), + .bucket_size = le16_to_cpu(mi->bucket_size), + .state = BCH_MEMBER_STATE(mi), + .tier = BCH_MEMBER_TIER(mi), + .has_metadata = BCH_MEMBER_HAS_METADATA(mi), + .has_data = BCH_MEMBER_HAS_DATA(mi), + .replacement = BCH_MEMBER_REPLACEMENT(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), + }; +} + +int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); +int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); + +void bch2_free_super(struct bcache_superblock *); +int bch2_super_realloc(struct bcache_superblock *, unsigned); + +const char *bch2_validate_journal_layout(struct bch_sb *, + struct bch_member_cpu); +const char *bch2_validate_cache_super(struct bcache_superblock *); + +const char *bch2_read_super(struct bcache_superblock *, + struct bch_opts, const char *); +void bch2_write_super(struct bch_fs *); + +void bch2_check_mark_super_slowpath(struct bch_fs *, + const struct bkey_i *, bool); + +static inline bool bch2_check_super_marked(struct bch_fs *c, + const struct bkey_i *k, bool meta) +{ + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + unsigned nr_replicas = 0; + bool ret = true; + + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = c->devs[ptr->dev]; + + if (ptr->cached) + continue; + + if (!(meta + ? ca->mi.has_metadata + : ca->mi.has_data)) { + ret = false; + break; + } + + nr_replicas++; + } + + if (nr_replicas < + (meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have)) + ret = false; + + return ret; +} + +static inline void bch2_check_mark_super(struct bch_fs *c, + const struct bkey_i *k, bool meta) +{ + if (bch2_check_super_marked(c, k, meta)) + return; + + bch2_check_mark_super_slowpath(c, k, meta); +} + +#endif /* _BCACHE_SUPER_IO_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c new file mode 100644 index 00000000..8aa5cc00 --- /dev/null +++ b/libbcachefs/super.c @@ -0,0 +1,1832 @@ +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_io.h" +#include "chardev.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "debug.h" +#include "error.h" +#include "fs.h" +#include "fs-gc.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "migrate.h" +#include "movinggc.h" +#include "super.h" +#include "super-io.h" +#include "tier.h" + +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/random.h> +#include <linux/sysfs.h> +#include <crypto/hash.h> + +#include <trace/events/bcachefs.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); + +static const uuid_le invalid_uuid = { + .b = { + 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78, + 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99 + } +}; + +static struct kset *bcachefs_kset; +static LIST_HEAD(bch_fs_list); +static DEFINE_MUTEX(bch_fs_list_lock); + +static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); + +static void bch2_dev_free(struct bch_dev *); +static int bch2_dev_alloc(struct bch_fs *, unsigned); +static int bch2_dev_sysfs_online(struct bch_dev *); +static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); + +struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) +{ + struct bch_fs *c; + struct bch_dev *ca; + unsigned i; + + mutex_lock(&bch_fs_list_lock); + rcu_read_lock(); + + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(ca, c, i) + if (ca->disk_sb.bdev == bdev) { + closure_get(&c->cl); + goto found; + } + c = NULL; +found: + rcu_read_unlock(); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) +{ + struct bch_fs *c; + + lockdep_assert_held(&bch_fs_list_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + return c; + + return NULL; +} + +struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) +{ + struct bch_fs *c; + + mutex_lock(&bch_fs_list_lock); + c = __bch2_uuid_to_fs(uuid); + if (c) + closure_get(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +int bch2_congested(struct bch_fs *c, int bdi_bits) +{ + struct backing_dev_info *bdi; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + if (bdi_bits & (1 << WB_sync_congested)) { + /* Reads - check all devices: */ + for_each_readable_member(ca, c, i) { + bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); + + if (bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + } else { + /* Writes prefer fastest tier: */ + struct bch_tier *tier = READ_ONCE(c->fastest_tier); + struct dev_group *grp = tier ? 
&tier->devs : &c->all_devs; + + rcu_read_lock(); + group_for_each_dev(ca, grp, i) { + bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); + + if (bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + } + + return ret; +} + +static int bch2_congested_fn(void *data, int bdi_bits) +{ + struct bch_fs *c = data; + + return bch2_congested(c, bdi_bits); +} + +/* Filesystem RO/RW: */ + +/* + * For startup/shutdown of RW stuff, the dependencies are: + * + * - foreground writes depend on copygc and tiering (to free up space) + * + * - copygc and tiering depend on mark and sweep gc (they actually probably + * don't because they either reserve ahead of time or don't block if + * allocations fail, but allocations can require mark and sweep gc to run + * because of generation number wraparound) + * + * - all of the above depends on the allocator threads + * + * - allocator depends on the journal (when it rewrites prios and gens) + */ + +static void __bch2_fs_read_only(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + bch2_tiering_stop(c); + + for_each_member_device(ca, c, i) + bch2_moving_gc_stop(ca); + + bch2_gc_thread_stop(c); + + bch2_btree_flush(c); + + for_each_member_device(ca, c, i) + bch2_dev_allocator_stop(ca); + + bch2_fs_journal_stop(&c->journal); +} + +static void bch2_writes_disabled(struct percpu_ref *writes) +{ + struct bch_fs *c = container_of(writes, struct bch_fs, writes); + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch_read_only_wait); +} + +void bch2_fs_read_only(struct bch_fs *c) +{ + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RW) + goto out; + + if (test_bit(BCH_FS_ERROR, &c->flags)) + goto out; + + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: + * + * (This is really blocking new _allocations_, writes to previously + * allocated space can still happen until stopping the allocator in + * bch2_dev_allocator_stop()). 
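+	 *
+	 * (Editorial addition, a hedged sketch rather than the author's
+	 * text: write paths are expected to take a ref on c->writes before
+	 * allocating, roughly
+	 *
+	 *	if (!percpu_ref_tryget(&c->writes))
+	 *		return -EROFS;
+	 *	...do the write...
+	 *	percpu_ref_put(&c->writes);
+	 *
+	 * so the percpu_ref_kill() below both fails new writers with -EROFS
+	 * and lets bch2_writes_disabled() fire once existing writers drain.)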
+ */ + percpu_ref_kill(&c->writes); + + del_timer(&c->foreground_write_wakeup); + cancel_delayed_work(&c->pd_controllers_update); + + c->foreground_write_pd.rate.rate = UINT_MAX; + bch2_wake_delayed_writes((unsigned long) c); + + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + __bch2_fs_read_only(c); + + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb, true); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + c->state = BCH_FS_RO; +out: + mutex_unlock(&c->state_lock); +} + +static void bch2_fs_read_only_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, read_only_work); + + bch2_fs_read_only(c); +} + +static void bch2_fs_read_only_async(struct bch_fs *c) +{ + queue_work(system_long_wq, &c->read_only_work); +} + +bool bch2_fs_emergency_read_only(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + + bch2_fs_read_only_async(c); + bch2_journal_halt(&c->journal); + + wake_up(&bch_read_only_wait); + return ret; +} + +const char *bch2_fs_read_write(struct bch_fs *c) +{ + struct bch_dev *ca; + const char *err = NULL; + unsigned i; + + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RO) + goto out; + + err = "error starting allocator thread"; + for_each_rw_member(ca, c, i) + if (bch2_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); + goto err; + } + + err = "error starting btree GC thread"; + if (bch2_gc_thread_start(c)) + goto err; + + err = "error starting moving GC thread"; + for_each_rw_member(ca, c, i) + if (bch2_moving_gc_start(ca)) { + percpu_ref_put(&ca->io_ref); + goto err; + } + + err = "error starting tiering thread"; + if (bch2_tiering_start(c)) + goto err; + + schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); + + if (c->state != BCH_FS_STARTING) + percpu_ref_reinit(&c->writes); + + c->state = BCH_FS_RW; + err = NULL; +out: + mutex_unlock(&c->state_lock); + return err; +err: + __bch2_fs_read_only(c); + goto out; +} + +/* Filesystem startup/shutdown: */ + +static void bch2_fs_free(struct bch_fs *c) +{ + bch2_fs_encryption_exit(c); + bch2_fs_btree_exit(c); + bch2_fs_journal_exit(&c->journal); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); + bch2_fs_compress_exit(c); + bdi_destroy(&c->bdi); + lg_lock_free(&c->usage_lock); + free_percpu(c->usage_percpu); + mempool_exit(&c->btree_bounce_pool); + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); + bioset_exit(&c->btree_read_bio); + mempool_exit(&c->btree_interior_update_pool); + mempool_exit(&c->btree_reserve_pool); + mempool_exit(&c->fill_iter); + percpu_ref_exit(&c->writes); + + if (c->copygc_wq) + 
destroy_workqueue(c->copygc_wq); + if (c->wq) + destroy_workqueue(c->wq); + + free_pages((unsigned long) c->disk_sb, c->disk_sb_order); + kfree(c); + module_put(THIS_MODULE); +} + +static void bch2_fs_exit(struct bch_fs *c) +{ + unsigned i; + + del_timer_sync(&c->foreground_write_wakeup); + cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + cancel_work_sync(&c->read_retry_work); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->devs[i]) + bch2_dev_free(c->devs[i]); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void bch2_fs_offline(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); + + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, + "bcachefs"); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + bch2_fs_debug_exit(c); + bch2_fs_chardev_exit(c); + + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + __bch2_fs_read_only(c); +} + +void bch2_fs_release(struct kobject *kobj) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + bch2_fs_free(c); +} + +void bch2_fs_stop(struct bch_fs *c) +{ + mutex_lock(&c->state_lock); + BUG_ON(c->state == BCH_FS_STOPPING); + c->state = BCH_FS_STOPPING; + mutex_unlock(&c->state_lock); + + bch2_fs_offline(c); + + closure_sync(&c->cl); + + bch2_fs_exit(c); +} + +#define alloc_bucket_pages(gfp, ca) \ + ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca)))) + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +{ + struct bch_sb_field_members *mi; + struct bch_fs *c; + unsigned i, iter_size, journal_entry_bytes; + + c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL); + if (!c) + return NULL; + + __module_get(THIS_MODULE); + + c->minor = -1; + + mutex_init(&c->state_lock); + mutex_init(&c->sb_lock); + mutex_init(&c->btree_cache_lock); + mutex_init(&c->bucket_lock); + mutex_init(&c->btree_root_lock); + INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + + init_rwsem(&c->gc_lock); + +#define BCH_TIME_STAT(name, frequency_units, duration_units) \ + spin_lock_init(&c->name##_time.lock); + BCH_TIME_STATS() +#undef BCH_TIME_STAT + + bch2_fs_allocator_init(c); + bch2_fs_tiering_init(c); + + INIT_LIST_HEAD(&c->list); + INIT_LIST_HEAD(&c->btree_cache); + INIT_LIST_HEAD(&c->btree_cache_freeable); + INIT_LIST_HEAD(&c->btree_cache_freed); + + INIT_LIST_HEAD(&c->btree_interior_update_list); + mutex_init(&c->btree_reserve_cache_lock); + mutex_init(&c->btree_interior_update_lock); + + mutex_init(&c->bio_bounce_pages_lock); + bio_list_init(&c->read_retry_list); + spin_lock_init(&c->read_retry_lock); + INIT_WORK(&c->read_retry_work, bch2_read_retry_work); + mutex_init(&c->zlib_workspace_lock); + + seqcount_init(&c->gc_pos_lock); + + c->prio_clock[READ].hand = 1; + c->prio_clock[READ].min_prio = 0; + c->prio_clock[WRITE].hand = 1; + c->prio_clock[WRITE].min_prio = 0; + + init_waitqueue_head(&c->writeback_wait); + c->writeback_pages_max = (256 << 10) / PAGE_SIZE; + + c->copy_gc_enabled = 1; + c->tiering_enabled = 1; + c->tiering_percent = 10; + + c->foreground_target_percent = 20; + + c->journal.write_time = &c->journal_write_time; + c->journal.delay_time = &c->journal_delay_time; + c->journal.blocked_time = &c->journal_blocked_time; + c->journal.flush_seq_time = &c->journal_flush_seq_time; + + 
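+	/*
+	 * Editorial note (added commentary, not from the original source):
+	 * bch2_sb_to_fs() below copies the caller's superblock into
+	 * c->disk_sb and refreshes the c->sb summary via bch2_sb_update();
+	 * both lockdep-assert that c->sb_lock is held, hence the
+	 * lock/unlock pair around the call.
+	 */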
mutex_lock(&c->sb_lock); + + if (bch2_sb_to_fs(c, sb)) { + mutex_unlock(&c->sb_lock); + goto err; + } + + mutex_unlock(&c->sb_lock); + + scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + + bch2_opts_apply(&c->opts, bch2_sb_opts(sb)); + bch2_opts_apply(&c->opts, opts); + + c->opts.nochanges |= c->opts.noreplay; + c->opts.read_only |= c->opts.nochanges; + + c->block_bits = ilog2(c->sb.block_size); + + if (bch2_fs_init_fault("fs_alloc")) + goto err; + + iter_size = (btree_blocks(c) + 1) * 2 * + sizeof(struct btree_node_iter_set); + + journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb); + + if (!(c->wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + !(c->copygc_wq = alloc_workqueue("bcache_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) || + mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, + sizeof(struct btree_reserve)) || + mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_interior_update)) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || + bioset_init(&c->btree_read_bio, 1, 0) || + bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) || + bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) || + bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->sb.btree_node_size, + BCH_ENCODED_EXTENT_MAX) / + PAGE_SECTORS, 0) || + !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || + lg_lock_init(&c->usage_lock) || + mempool_init_page_pool(&c->btree_bounce_pool, 1, + ilog2(btree_pages(c))) || + bdi_setup_and_register(&c->bdi, "bcachefs") || + bch2_io_clock_init(&c->io_clock[READ]) || + bch2_io_clock_init(&c->io_clock[WRITE]) || + bch2_fs_journal_init(&c->journal, journal_entry_bytes) || + bch2_fs_btree_init(c) || + bch2_fs_encryption_init(c) || + bch2_fs_compress_init(c) || + bch2_check_set_has_compressed_data(c, c->opts.compression)) + goto err; + + c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + c->bdi.congested_fn = bch2_congested_fn; + c->bdi.congested_data = c; + + mi = bch2_sb_get_members(c->disk_sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) && + bch2_dev_alloc(c, i)) + goto err; + + /* + * Now that all allocations have succeeded, init various refcounty + * things that let us shutdown: + */ + closure_init(&c->cl, NULL); + + c->kobj.kset = bcachefs_kset; + kobject_init(&c->kobj, &bch2_fs_ktype); + kobject_init(&c->internal, &bch2_fs_internal_ktype); + kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); + kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + return c; +err: + bch2_fs_free(c); + return NULL; +} + +static const char *__bch2_fs_online(struct bch_fs *c) +{ + struct bch_dev *ca; + const char *err = NULL; + unsigned i; + int ret; + + lockdep_assert_held(&bch_fs_list_lock); + + if (!list_empty(&c->list)) + return NULL; + + if (__bch2_uuid_to_fs(c->sb.uuid)) + return "filesystem UUID already open"; + + ret = bch2_fs_chardev_init(c); + if (ret) + return "error creating character device"; + + bch2_fs_debug_init(c); + + if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || + kobject_add(&c->internal, &c->kobj, "internal") || + kobject_add(&c->opts_dir, &c->kobj, "options") || + kobject_add(&c->time_stats, &c->kobj, "time_stats")) + return "error creating sysfs objects"; + + mutex_lock(&c->state_lock); + + err = 
"error creating sysfs objects"; + __for_each_member_device(ca, c, i) + if (bch2_dev_sysfs_online(ca)) + goto err; + + list_add(&c->list, &bch_fs_list); + err = NULL; +err: + mutex_unlock(&c->state_lock); + return err; +} + +static const char *bch2_fs_online(struct bch_fs *c) +{ + const char *err; + + mutex_lock(&bch_fs_list_lock); + err = __bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); + + return err; +} + +static const char *__bch2_fs_start(struct bch_fs *c) +{ + const char *err = "cannot allocate memory"; + struct bch_sb_field_members *mi; + struct bch_dev *ca; + unsigned i, id; + time64_t now; + LIST_HEAD(journal); + struct jset *j; + int ret = -EINVAL; + + BUG_ON(c->state != BCH_FS_STARTING); + + mutex_lock(&c->sb_lock); + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + mutex_unlock(&c->sb_lock); + + if (BCH_SB_INITIALIZED(c->disk_sb)) { + ret = bch2_journal_read(c, &journal); + if (ret) + goto err; + + j = &list_entry(journal.prev, struct journal_replay, list)->j; + + c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); + c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); + + err = "error reading priorities"; + for_each_readable_member(ca, c, i) { + ret = bch2_prio_read(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } + } + + for (id = 0; id < BTREE_ID_NR; id++) { + unsigned level; + struct bkey_i *k; + + err = "bad btree root"; + k = bch2_journal_find_btree_root(c, j, id, &level); + if (!k && id == BTREE_ID_EXTENTS) + goto err; + if (!k) { + pr_debug("missing btree root: %d", id); + continue; + } + + err = "error reading btree root"; + if (bch2_btree_root_read(c, id, k, level)) + goto err; + } + + bch_verbose(c, "starting mark and sweep:"); + + err = "error in recovery"; + if (bch2_initial_gc(c, &journal)) + goto err; + + if (c->opts.noreplay) + goto recovery_done; + + bch_verbose(c, "mark and sweep done"); + + /* + * bch2_journal_start() can't happen sooner, or btree_gc_finish() + * will give spurious errors about oldest_gen > bucket_gen - + * this is a hack but oh well. 
+	 */
+	bch2_journal_start(c);
+
+	err = "error starting allocator thread";
+	for_each_rw_member(ca, c, i)
+		if (bch2_dev_allocator_start(ca)) {
+			percpu_ref_put(&ca->io_ref);
+			goto err;
+		}
+
+	bch_verbose(c, "starting journal replay:");
+
+	err = "journal replay failed";
+	ret = bch2_journal_replay(c, &journal);
+	if (ret)
+		goto err;
+
+	bch_verbose(c, "journal replay done");
+
+	if (c->opts.norecovery)
+		goto recovery_done;
+
+	bch_verbose(c, "starting fsck:");
+	err = "error in fsck";
+	ret = bch2_fsck(c, !c->opts.nofsck);
+	if (ret)
+		goto err;
+
+	bch_verbose(c, "fsck done");
+	} else {
+		struct bch_inode_unpacked inode;
+		struct bkey_inode_buf packed_inode;
+		struct closure cl;
+
+		closure_init_stack(&cl);
+
+		bch_notice(c, "initializing new filesystem");
+
+		bch2_initial_gc(c, NULL);
+
+		err = "unable to allocate journal buckets";
+		for_each_rw_member(ca, c, i)
+			if (bch2_dev_journal_alloc(ca)) {
+				percpu_ref_put(&ca->io_ref);
+				goto err;
+			}
+
+		/*
+		 * journal_res_get() will crash if called before this has
+		 * set up the journal.pin FIFO and journal.cur pointer:
+		 */
+		bch2_journal_start(c);
+		bch2_journal_set_replay_done(&c->journal);
+
+		err = "error starting allocator thread";
+		for_each_rw_member(ca, c, i)
+			if (bch2_dev_allocator_start(ca)) {
+				percpu_ref_put(&ca->io_ref);
+				goto err;
+			}
+
+		err = "cannot allocate new btree root";
+		for (id = 0; id < BTREE_ID_NR; id++)
+			if (bch2_btree_root_alloc(c, id, &cl)) {
+				closure_sync(&cl);
+				goto err;
+			}
+
+		/* Wait for new btree roots to be written: */
+		closure_sync(&cl);
+
+		bch2_inode_init(c, &inode, 0, 0,
+				S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+		inode.inum = BCACHE_ROOT_INO;
+
+		bch2_inode_pack(&packed_inode, &inode);
+
+		err = "error creating root directory";
+		if (bch2_btree_insert(c, BTREE_ID_INODES,
+				      &packed_inode.inode.k_i,
+				      NULL, NULL, NULL, 0))
+			goto err;
+
+		err = "error writing first journal entry";
+		if (bch2_journal_meta(&c->journal))
+			goto err;
+	}
+recovery_done:
+	err = "dynamic fault";
+	if (bch2_fs_init_fault("fs_start"))
+		goto err;
+
+	if (c->opts.read_only) {
+		bch2_fs_read_only(c);
+	} else {
+		err = bch2_fs_read_write(c);
+		if (err)
+			goto err;
+	}
+
+	mutex_lock(&c->sb_lock);
+	mi = bch2_sb_get_members(c->disk_sb);
+	now = ktime_get_seconds();
+
+	for_each_member_device(ca, c, i)
+		mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+
+	SET_BCH_SB_INITIALIZED(c->disk_sb, true);
+	SET_BCH_SB_CLEAN(c->disk_sb, false);
+	c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	err = NULL;
+out:
+	bch2_journal_entries_free(&journal);
+	return err;
+err:
+	switch (ret) {
+	case BCH_FSCK_ERRORS_NOT_FIXED:
+		bch_err(c, "filesystem contains errors: please report this to the developers");
+		pr_cont("mount with -o fix_errors to repair");
+		err = "fsck error";
+		break;
+	case BCH_FSCK_REPAIR_UNIMPLEMENTED:
+		bch_err(c, "filesystem contains errors: please report this to the developers");
+		pr_cont("repair unimplemented: inform the developers so that it can be added");
+		err = "fsck error";
+		break;
+	case BCH_FSCK_REPAIR_IMPOSSIBLE:
+		bch_err(c, "filesystem contains errors, but repair impossible");
+		err = "fsck error";
+		break;
+	case BCH_FSCK_UNKNOWN_VERSION:
+		err = "unknown metadata version";
+		break;
+	case -ENOMEM:
+		err = "cannot allocate memory";
+		break;
+	case -EIO:
+		err = "IO error";
+		break;
+	}
+
+	BUG_ON(!err);
+	set_bit(BCH_FS_ERROR, &c->flags);
+	goto out;
+}
+
+const char *bch2_fs_start(struct bch_fs *c)
+{
+	return __bch2_fs_start(c) ?: 
bch2_fs_online(c); +} + +static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) +{ + struct bch_sb_field_members *sb_mi; + + sb_mi = bch2_sb_get_members(sb); + if (!sb_mi) + return "Invalid superblock: member info area missing"; + + if (le16_to_cpu(sb->block_size) != c->sb.block_size) + return "mismatched block size"; + + if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < + BCH_SB_BTREE_NODE_SIZE(c->disk_sb)) + return "new cache bucket size is too small"; + + return NULL; +} + +static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +{ + struct bch_sb *newest = + le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; + struct bch_sb_field_members *mi = bch2_sb_get_members(newest); + + if (uuid_le_cmp(fs->uuid, sb->uuid)) + return "device not a member of filesystem"; + + if (sb->dev_idx >= newest->nr_devices) + return "device has invalid dev_idx"; + + if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le))) + return "device has been removed"; + + if (fs->block_size != sb->block_size) + return "mismatched block size"; + + return NULL; +} + +/* Device startup/shutdown: */ + +void bch2_dev_release(struct kobject *kobj) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + + kfree(ca); +} + +static void bch2_dev_free(struct bch_dev *ca) +{ + unsigned i; + + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, + "bcachefs"); + + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); + + free_percpu(ca->sectors_written); + bioset_exit(&ca->replica_set); + free_percpu(ca->usage_percpu); + free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + kfree(ca->prio_buckets); + kfree(ca->bio_prio); + vfree(ca->buckets); + vfree(ca->oldest_gens); + free_heap(&ca->heap); + free_fifo(&ca->free_inc); + + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); + + percpu_ref_exit(&ca->io_ref); + percpu_ref_exit(&ca->ref); + kobject_put(&ca->kobj); +} + +static void bch2_dev_io_ref_release(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); + + complete(&ca->offline_complete); +} + +static void __bch2_dev_offline(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + lockdep_assert_held(&c->state_lock); + + __bch2_dev_read_only(ca->fs, ca); + + reinit_completion(&ca->offline_complete); + percpu_ref_kill(&ca->io_ref); + wait_for_completion(&ca->offline_complete); + + if (ca->kobj.state_in_sysfs) { + struct kobject *block = + &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; + + sysfs_remove_link(block, "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); +} + +static void bch2_dev_ref_release(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, ref); + + complete(&ca->stop_complete); +} + +static void bch2_dev_stop(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + lockdep_assert_held(&c->state_lock); + + BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca); + rcu_assign_pointer(c->devs[ca->dev_idx], NULL); + + synchronize_rcu(); + + reinit_completion(&ca->stop_complete); + percpu_ref_kill(&ca->ref); + wait_for_completion(&ca->stop_complete); +} + +static int bch2_dev_sysfs_online(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + int ret; + + if (!c->kobj.state_in_sysfs) + return 0; + + if 
(!ca->kobj.state_in_sysfs) { + ret = kobject_add(&ca->kobj, &ca->fs->kobj, + "dev-%u", ca->dev_idx); + if (ret) + return ret; + } + + if (ca->disk_sb.bdev) { + struct kobject *block = + &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; + + ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); + if (ret) + return ret; + ret = sysfs_create_link(&ca->kobj, block, "block"); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) +{ + struct bch_member *member; + size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve; + size_t heap_size; + unsigned i; + struct bch_dev *ca; + + if (bch2_fs_init_fault("dev_alloc")) + return -ENOMEM; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return -ENOMEM; + + kobject_init(&ca->kobj, &bch2_dev_ktype); + init_completion(&ca->stop_complete); + init_completion(&ca->offline_complete); + + spin_lock_init(&ca->self.lock); + ca->self.nr = 1; + rcu_assign_pointer(ca->self.d[0].dev, ca); + ca->dev_idx = dev_idx; + + spin_lock_init(&ca->freelist_lock); + spin_lock_init(&ca->prio_buckets_lock); + mutex_init(&ca->heap_lock); + bch2_dev_moving_gc_init(ca); + + INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work); + + if (bch2_fs_init_fault("dev_alloc")) + goto err; + + member = bch2_sb_get_members(c->disk_sb)->members + dev_idx; + + ca->mi = bch2_mi_to_cpu(member); + ca->uuid = member->uuid; + ca->bucket_bits = ilog2(ca->mi.bucket_size); + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + /* XXX: tune these */ + movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); + reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); + /* + * free_inc must be smaller than the copygc reserve: if it was bigger, + * one copygc iteration might not make enough buckets available to fill + * up free_inc and allow the allocator to make forward progress + */ + free_inc_reserve = movinggc_reserve / 2; + heap_size = movinggc_reserve * 8; + + if (percpu_ref_init(&ca->ref, bch2_dev_ref_release, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_MOVINGGC], + movinggc_reserve, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) || + !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) || + !init_heap(&ca->heap, heap_size, GFP_KERNEL) || + !(ca->oldest_gens = vzalloc(sizeof(u8) * + ca->mi.nbuckets)) || + !(ca->buckets = vzalloc(sizeof(struct bucket) * + ca->mi.nbuckets)) || + !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) * + 2, GFP_KERNEL)) || + !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || + !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || + !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) || + bioset_init(&ca->replica_set, 4, + offsetof(struct bch_write_bio, bio)) || + !(ca->sectors_written = alloc_percpu(*ca->sectors_written))) + goto err; + + ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); + + total_reserve = ca->free_inc.size; + for (i = 0; i < RESERVE_NR; i++) + total_reserve += ca->free[i].size; + + ca->copygc_write_point.group = &ca->self; + ca->tiering_write_point.group = &ca->self; + + ca->fs = c; + rcu_assign_pointer(c->devs[ca->dev_idx], ca); + + if (bch2_dev_sysfs_online(ca)) + pr_warn("error creating sysfs objects"); + + return 0; +err: + bch2_dev_free(ca); + 
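+	/*
+	 * Editorial note (added commentary): bch2_dev_free() tolerates a
+	 * partially constructed bch_dev - everything it tears down was
+	 * either zeroed by the kzalloc() above or fully initialized before
+	 * the first failure point - so this single label can unwind any of
+	 * the allocations above.
+	 */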
return -ENOMEM; +} + +static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb) +{ + struct bch_dev *ca; + int ret; + + lockdep_assert_held(&c->sb_lock); + + if (le64_to_cpu(sb->sb->seq) > + le64_to_cpu(c->disk_sb->seq)) + bch2_sb_to_fs(c, sb->sb); + + BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || + !c->devs[sb->sb->dev_idx]); + + ca = c->devs[sb->sb->dev_idx]; + if (ca->disk_sb.bdev) { + bch_err(c, "already have device online in slot %u", + sb->sb->dev_idx); + return -EINVAL; + } + + ret = bch2_dev_journal_init(ca, sb->sb); + if (ret) + return ret; + + /* + * Increase journal write timeout if flushes to this device are + * expensive: + */ + if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) && + journal_flushes_device(ca)) + c->journal.write_delay_ms = + max(c->journal.write_delay_ms, 1000U); + + /* Commit: */ + ca->disk_sb = *sb; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); + + if (c->sb.nr_devices == 1) + bdevname(ca->disk_sb.bdev, c->name); + bdevname(ca->disk_sb.bdev, ca->name); + + if (bch2_dev_sysfs_online(ca)) + pr_warn("error creating sysfs objects"); + + lg_local_lock(&c->usage_lock); + if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA))) + bch2_mark_dev_metadata(ca->fs, ca); + lg_local_unlock(&c->usage_lock); + + percpu_ref_reinit(&ca->io_ref); + return 0; +} + +/* Device management: */ + +bool bch2_fs_may_start(struct bch_fs *c, int flags) +{ + struct bch_sb_field_members *mi; + unsigned meta_missing = 0; + unsigned data_missing = 0; + bool degraded = false; + unsigned i; + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb); + + for (i = 0; i < c->disk_sb->nr_devices; i++) + if (!c->devs[i] && + !bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) { + degraded = true; + if (BCH_MEMBER_HAS_METADATA(&mi->members[i])) + meta_missing++; + if (BCH_MEMBER_HAS_DATA(&mi->members[i])) + data_missing++; + } + mutex_unlock(&c->sb_lock); + + if (degraded && + !(flags & BCH_FORCE_IF_DEGRADED)) + return false; + + if (meta_missing && + !(flags & BCH_FORCE_IF_METADATA_DEGRADED)) + return false; + + if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) && + !(flags & BCH_FORCE_IF_METADATA_LOST)) + return false; + + if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return false; + + if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) && + !(flags & BCH_FORCE_IF_DATA_LOST)) + return false; + + return true; +} + +bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + lockdep_assert_held(&c->state_lock); + + if (new_state == BCH_MEMBER_STATE_RW) + return true; + + if (ca->mi.has_data && + !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return false; + + if (ca->mi.has_data && + c->sb.data_replicas_have <= 1 && + !(flags & BCH_FORCE_IF_DATA_LOST)) + return false; + + if (ca->mi.has_metadata && + !(flags & BCH_FORCE_IF_METADATA_DEGRADED)) + return false; + + if (ca->mi.has_metadata && + c->sb.meta_replicas_have <= 1 && + !(flags & BCH_FORCE_IF_METADATA_LOST)) + return false; + + return true; +} + +static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +{ + bch2_moving_gc_stop(ca); + + /* + * This stops new data writes (e.g. to existing open data + * buckets) and then waits for all existing writes to + * complete. 
+ */ + bch2_dev_allocator_stop(ca); + + bch2_dev_group_remove(&c->journal.devs, ca); +} + +static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +{ + lockdep_assert_held(&c->state_lock); + + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); + + if (bch2_dev_allocator_start(ca)) + return "error starting allocator thread"; + + if (bch2_moving_gc_start(ca)) + return "error starting moving GC thread"; + + if (bch2_tiering_start(c)) + return "error starting tiering thread"; + + return NULL; +} + +int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + struct bch_sb_field_members *mi; + + if (ca->mi.state == new_state) + return 0; + + if (!bch2_dev_state_allowed(c, ca, new_state, flags)) + return -EINVAL; + + if (new_state == BCH_MEMBER_STATE_RW) { + if (__bch2_dev_read_write(c, ca)) + return -ENOMEM; + } else { + __bch2_dev_read_only(c, ca); + } + + bch_notice(ca, "%s", bch2_dev_state[new_state]); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + int ret; + + mutex_lock(&c->state_lock); + ret = __bch2_dev_set_state(c, ca, new_state, flags); + mutex_unlock(&c->state_lock); + + return ret; +} + +/* Device add/removal: */ + +int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + struct bch_sb_field_members *mi; + unsigned dev_idx = ca->dev_idx; + int ret = -EINVAL; + + mutex_lock(&c->state_lock); + + percpu_ref_put(&ca->ref); /* XXX */ + + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + bch_err(ca, "Cannot remove RW device"); + goto err; + } + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + bch_err(ca, "Cannot remove without losing data"); + goto err; + } + + /* + * XXX: verify that dev_idx is really not in use anymore, anywhere + * + * flag_data_bad() does not check btree pointers + */ + ret = bch2_flag_data_bad(ca); + if (ret) { + bch_err(ca, "Remove failed"); + goto err; + } + + if (ca->mi.has_data || ca->mi.has_metadata) { + bch_err(ca, "Remove failed, still has data"); + goto err; + } + + /* + * Ok, really doing the remove: + * Drop device's prio pointer before removing it from superblock: + */ + spin_lock(&c->journal.lock); + c->journal.prio_buckets[dev_idx] = 0; + spin_unlock(&c->journal.lock); + + bch2_journal_meta(&c->journal); + + __bch2_dev_offline(ca); + bch2_dev_stop(ca); + bch2_dev_free(ca); + + /* + * Free this device's slot in the bch_member array - all pointers to + * this device must be gone: + */ + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb); + memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); + + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); + return 0; +err: + mutex_unlock(&c->state_lock); + return ret; +} + +int bch2_dev_add(struct bch_fs *c, const char *path) +{ + struct bcache_superblock sb; + const char *err; + struct bch_dev *ca = NULL; + struct bch_sb_field_members *mi, *dev_mi; + struct bch_member saved_mi; + unsigned dev_idx, nr_devices, u64s; + int ret = -EINVAL; + + err = bch2_read_super(&sb, bch2_opts_empty(), path); + if (err) + return -EINVAL; + + err = bch2_validate_cache_super(&sb); + if (err) + return -EINVAL; + + err = bch2_dev_may_add(sb.sb, c); + if (err) + return -EINVAL; + + 
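+	/*
+	 * Editorial summary (added commentary, not the author's text): the
+	 * remainder of bch2_dev_add() runs under state_lock and sb_lock -
+	 * find a free slot in the member array (or fail with -ENOSPC),
+	 * resize the member info field in both the filesystem's and the new
+	 * device's superblock, install the preserved bch_member entry, then
+	 * bch2_dev_alloc() + __bch2_dev_online() before bch2_write_super() -
+	 * so a concurrent mount or second device add never sees a
+	 * half-updated member array.
+	 */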
mutex_lock(&c->state_lock); + mutex_lock(&c->sb_lock); + + /* + * Preserve the old cache member information (esp. tier) + * before we start bashing the disk stuff. + */ + dev_mi = bch2_sb_get_members(sb.sb); + saved_mi = dev_mi->members[sb.sb->dev_idx]; + saved_mi.last_mount = cpu_to_le64(ktime_get_seconds()); + + if (dynamic_fault("bcachefs:add:no_slot")) + goto no_slot; + + mi = bch2_sb_get_members(c->disk_sb); + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) + if (dev_idx >= c->sb.nr_devices || + bch2_is_zero(mi->members[dev_idx].uuid.b, + sizeof(uuid_le))) + goto have_slot; +no_slot: + err = "no slots available in superblock"; + ret = -ENOSPC; + goto err_unlock; + +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devices) / sizeof(u64); + err = "no space in superblock for member info"; + + mi = bch2_fs_sb_resize_members(c, u64s); + if (!mi) + goto err_unlock; + + dev_mi = bch2_sb_resize_members(&sb, u64s); + if (!dev_mi) + goto err_unlock; + + memcpy(dev_mi, mi, u64s * sizeof(u64)); + dev_mi->members[dev_idx] = saved_mi; + + sb.sb->uuid = c->disk_sb->uuid; + sb.sb->dev_idx = dev_idx; + sb.sb->nr_devices = nr_devices; + + /* commit new member info */ + memcpy(mi, dev_mi, u64s * sizeof(u64)); + c->disk_sb->nr_devices = nr_devices; + c->sb.nr_devices = nr_devices; + + if (bch2_dev_alloc(c, dev_idx)) { + err = "cannot allocate memory"; + ret = -ENOMEM; + goto err_unlock; + } + + if (__bch2_dev_online(c, &sb)) { + err = "bch2_dev_online() error"; + ret = -ENOMEM; + goto err_unlock; + } + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + ca = c->devs[dev_idx]; + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = "journal alloc failed"; + if (bch2_dev_journal_alloc(ca)) + goto err; + + err = __bch2_dev_read_write(c, ca); + if (err) + goto err; + } + + mutex_unlock(&c->state_lock); + return 0; +err_unlock: + mutex_unlock(&c->sb_lock); +err: + mutex_unlock(&c->state_lock); + bch2_free_super(&sb); + + bch_err(c, "Unable to add device: %s", err); + return ret ?: -EINVAL; +} + +int bch2_dev_online(struct bch_fs *c, const char *path) +{ + struct bcache_superblock sb = { 0 }; + struct bch_dev *ca; + unsigned dev_idx; + const char *err; + + mutex_lock(&c->state_lock); + + err = bch2_read_super(&sb, bch2_opts_empty(), path); + if (err) + goto err; + + dev_idx = sb.sb->dev_idx; + + err = bch2_dev_in_fs(c->disk_sb, sb.sb); + if (err) + goto err; + + mutex_lock(&c->sb_lock); + if (__bch2_dev_online(c, &sb)) { + err = "__bch2_dev_online() error"; + mutex_unlock(&c->sb_lock); + goto err; + } + mutex_unlock(&c->sb_lock); + + ca = c->devs[dev_idx]; + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = __bch2_dev_read_write(c, ca); + if (err) + goto err; + } + + mutex_unlock(&c->state_lock); + return 0; +err: + mutex_unlock(&c->state_lock); + bch2_free_super(&sb); + bch_err(c, "error bringing %s online: %s", path, err); + return -EINVAL; +} + +int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + mutex_lock(&c->state_lock); + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + bch_err(ca, "Cannot offline required disk"); + mutex_unlock(&c->state_lock); + return -EINVAL; + } + + __bch2_dev_read_only(c, ca); + __bch2_dev_offline(ca); + + mutex_unlock(&c->state_lock); + return 0; +} + +int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca) +{ + int ret; + + mutex_lock(&c->state_lock); + + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + bch_err(ca, 
"Cannot migrate data off RW device"); + mutex_unlock(&c->state_lock); + return -EINVAL; + } + + mutex_unlock(&c->state_lock); + + ret = bch2_move_data_off_device(ca); + if (ret) { + bch_err(ca, "Error migrating data: %i", ret); + return ret; + } + + ret = bch2_move_metadata_off_device(ca); + if (ret) { + bch_err(ca, "Error migrating metadata: %i", ret); + return ret; + } + + if (ca->mi.has_data || ca->mi.has_metadata) { + bch_err(ca, "Migrate error: data still present"); + return -EINVAL; + } + + return 0; +} + +/* Filesystem open: */ + +const char *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_opts opts, struct bch_fs **ret) +{ + const char *err; + struct bch_fs *c = NULL; + struct bcache_superblock *sb; + unsigned i, best_sb = 0; + + if (!nr_devices) + return "need at least one device"; + + if (!try_module_get(THIS_MODULE)) + return "module unloading"; + + err = "cannot allocate memory"; + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); + if (!sb) + goto err; + + for (i = 0; i < nr_devices; i++) { + err = bch2_read_super(&sb[i], opts, devices[i]); + if (err) + goto err; + + err = "attempting to register backing device"; + if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version))) + goto err; + + err = bch2_validate_cache_super(&sb[i]); + if (err) + goto err; + } + + for (i = 1; i < nr_devices; i++) + if (le64_to_cpu(sb[i].sb->seq) > + le64_to_cpu(sb[best_sb].sb->seq)) + best_sb = i; + + for (i = 0; i < nr_devices; i++) { + err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + if (err) + goto err; + } + + err = "cannot allocate memory"; + c = bch2_fs_alloc(sb[best_sb].sb, opts); + if (!c) + goto err; + + err = "bch2_dev_online() error"; + mutex_lock(&c->sb_lock); + for (i = 0; i < nr_devices; i++) + if (__bch2_dev_online(c, &sb[i])) { + mutex_unlock(&c->sb_lock); + goto err; + } + mutex_unlock(&c->sb_lock); + + err = "insufficient devices"; + if (!bch2_fs_may_start(c, 0)) + goto err; + + if (!c->opts.nostart) { + err = __bch2_fs_start(c); + if (err) + goto err; + } + + err = bch2_fs_online(c); + if (err) + goto err; + + if (ret) + *ret = c; + else + closure_put(&c->cl); + + err = NULL; +out: + kfree(sb); + module_put(THIS_MODULE); + if (err) + c = NULL; + return err; +err: + if (c) + bch2_fs_stop(c); + + for (i = 0; i < nr_devices; i++) + bch2_free_super(&sb[i]); + goto out; +} + +static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb, + struct bch_opts opts) +{ + const char *err; + struct bch_fs *c; + bool allocated_fs = false; + + err = bch2_validate_cache_super(sb); + if (err) + return err; + + mutex_lock(&bch_fs_list_lock); + c = __bch2_uuid_to_fs(sb->sb->uuid); + if (c) { + closure_get(&c->cl); + + err = bch2_dev_in_fs(c->disk_sb, sb->sb); + if (err) + goto err; + } else { + c = bch2_fs_alloc(sb->sb, opts); + err = "cannot allocate memory"; + if (!c) + goto err; + + allocated_fs = true; + } + + err = "bch2_dev_online() error"; + + mutex_lock(&c->sb_lock); + if (__bch2_dev_online(c, sb)) { + mutex_unlock(&c->sb_lock); + goto err; + } + mutex_unlock(&c->sb_lock); + + if (!c->opts.nostart && bch2_fs_may_start(c, 0)) { + err = __bch2_fs_start(c); + if (err) + goto err; + } + + err = __bch2_fs_online(c); + if (err) + goto err; + + closure_put(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return NULL; +err: + mutex_unlock(&bch_fs_list_lock); + + if (allocated_fs) + bch2_fs_stop(c); + else if (c) + closure_put(&c->cl); + + return err; +} + +const char *bch2_fs_open_incremental(const char *path) +{ + struct bcache_superblock sb; + struct bch_opts opts = 
bch2_opts_empty(); + const char *err; + + err = bch2_read_super(&sb, opts, path); + if (err) + return err; + + if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) + err = __bch2_fs_open_incremental(&sb, opts); + else + err = "not a bcachefs superblock"; + + bch2_free_super(&sb); + + return err; +} + +/* Global interfaces/init */ + +static void bcachefs_exit(void) +{ + bch2_debug_exit(); + bch2_vfs_exit(); + bch2_chardev_exit(); + if (bcachefs_kset) + kset_unregister(bcachefs_kset); +} + +static int __init bcachefs_init(void) +{ + bch2_bkey_pack_test(); + + if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_chardev_init() || + bch2_vfs_init() || + bch2_debug_init()) + goto err; + + return 0; +err: + bcachefs_exit(); + return -ENOMEM; +} + +#define BCH_DEBUG_PARAM(name, description) \ + bool bch2_##name; \ + module_param_named(name, bch2_##name, bool, 0644); \ + MODULE_PARM_DESC(name, description); +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +module_exit(bcachefs_exit); +module_init(bcachefs_init); diff --git a/libbcachefs/super.h b/libbcachefs/super.h new file mode 100644 index 00000000..94424414 --- /dev/null +++ b/libbcachefs/super.h @@ -0,0 +1,130 @@ +#ifndef _BCACHE_SUPER_H +#define _BCACHE_SUPER_H + +#include "extents.h" + +#include "bcachefs_ioctl.h" + +static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +{ + return s >> ca->bucket_bits; +} + +static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) +{ + return ((sector_t) b) << ca->bucket_bits; +} + +static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) +{ + return s & (ca->mi.bucket_size - 1); +} + +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca = NULL; + + while (*iter < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define __for_each_member_device(ca, c, iter) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter))); (iter)++) + +#define for_each_member_device_rcu(ca, c, iter) \ + __for_each_member_device(ca, c, iter) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca; + + rcu_read_lock(); + if ((ca = __bch2_next_dev(c, iter))) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + return ca; +} + +/* + * If you break early, you must drop your ref on the current device + */ +#define for_each_member_device(ca, c, iter) \ + for ((iter) = 0; \ + (ca = bch2_get_next_dev(c, &(iter))); \ + percpu_ref_put(&ca->ref), (iter)++) + +static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch2_next_dev(c, iter)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) + +struct bch_fs *bch2_bdev_to_fs(struct block_device *); 
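+
+/*
+ * Illustrative sketch (not part of this patch) of the iterator rule
+ * above: a caller that breaks out of for_each_member_device() early
+ * must drop the ref held on the current device (likewise io_ref for
+ * the for_each_online_member() variants):
+ *
+ *	for_each_member_device(ca, c, i)
+ *		if (ca->dev_idx == dev_idx) {
+ *			percpu_ref_put(&ca->ref);
+ *			break;
+ *		}
+ */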
+struct bch_fs *bch2_uuid_to_fs(uuid_le); +int bch2_congested(struct bch_fs *, int); + +void bch2_dev_release(struct kobject *); + +bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); + +int bch2_dev_fail(struct bch_dev *, int); +int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); +int bch2_dev_add(struct bch_fs *, const char *); +int bch2_dev_online(struct bch_fs *, const char *); +int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); +int bch2_dev_evacuate(struct bch_fs *, struct bch_dev *); + +bool bch2_fs_emergency_read_only(struct bch_fs *); +void bch2_fs_read_only(struct bch_fs *); +const char *bch2_fs_read_write(struct bch_fs *); + +void bch2_fs_release(struct kobject *); +void bch2_fs_stop(struct bch_fs *); + +const char *bch2_fs_start(struct bch_fs *); +const char *bch2_fs_open(char * const *, unsigned, struct bch_opts, + struct bch_fs **); +const char *bch2_fs_open_incremental(const char *path); + +extern struct kobj_type bch2_fs_ktype; +extern struct kobj_type bch2_fs_internal_ktype; +extern struct kobj_type bch2_fs_time_stats_ktype; +extern struct kobj_type bch2_fs_opts_dir_ktype; +extern struct kobj_type bch2_dev_ktype; + +#endif /* _BCACHE_SUPER_H */ diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h new file mode 100644 index 00000000..69c747de --- /dev/null +++ b/libbcachefs/super_types.h @@ -0,0 +1,12 @@ +#ifndef _BCACHE_SUPER_TYPES_H +#define _BCACHE_SUPER_TYPES_H + +struct bcache_superblock { + struct bch_sb *sb; + struct block_device *bdev; + struct bio *bio; + unsigned page_order; + fmode_t mode; +}; + +#endif /* _BCACHE_SUPER_TYPES_H */ diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c new file mode 100644 index 00000000..11c6cdcc --- /dev/null +++ b/libbcachefs/sysfs.c @@ -0,0 +1,935 @@ +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
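+ *
+ * Attributes are declared with the read_attribute()/write_attribute()/
+ * rw_attribute() macros from sysfs.h; as a rough sketch,
+ * read_attribute(uuid) expands to
+ *
+ *	static struct attribute sysfs_uuid = {
+ *		.name = "uuid", .mode = S_IRUGO
+ *	};
+ *
+ * and the SHOW()/STORE() handlers below match on attr == &sysfs_uuid.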
+ */ + +#include "bcachefs.h" +#include "alloc.h" +#include "compress.h" +#include "sysfs.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_gc.h" +#include "buckets.h" +#include "inode.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "opts.h" +#include "super-io.h" +#include "tier.h" + +#include <linux/blkdev.h> +#include <linux/sort.h> + +write_attribute(trigger_btree_coalesce); +write_attribute(trigger_gc); +write_attribute(prune_cache); + +read_attribute(uuid); +read_attribute(minor); +read_attribute(bucket_size); +read_attribute(bucket_size_bytes); +read_attribute(block_size); +read_attribute(block_size_bytes); +read_attribute(btree_node_size); +read_attribute(btree_node_size_bytes); +read_attribute(first_bucket); +read_attribute(nbuckets); +read_attribute(tree_depth); +read_attribute(root_usage_percent); +read_attribute(read_priority_stats); +read_attribute(write_priority_stats); +read_attribute(fragmentation_stats); +read_attribute(oldest_gen_stats); +read_attribute(reserve_stats); +read_attribute(btree_cache_size); +read_attribute(cache_available_percent); +read_attribute(compression_stats); +read_attribute(written); +read_attribute(btree_written); +read_attribute(metadata_written); +read_attribute(journal_debug); +write_attribute(journal_flush); +read_attribute(internal_uuid); + +read_attribute(btree_gc_running); + +read_attribute(btree_nodes); +read_attribute(btree_used_percent); +read_attribute(average_key_size); +read_attribute(available_buckets); +read_attribute(free_buckets); +read_attribute(dirty_data); +read_attribute(dirty_bytes); +read_attribute(dirty_buckets); +read_attribute(cached_data); +read_attribute(cached_bytes); +read_attribute(cached_buckets); +read_attribute(meta_buckets); +read_attribute(alloc_buckets); +read_attribute(has_data); +read_attribute(has_metadata); +read_attribute(bset_tree_stats); +read_attribute(alloc_debug); + +read_attribute(cache_read_races); + +rw_attribute(journal_write_delay_ms); +rw_attribute(journal_reclaim_delay_ms); +read_attribute(journal_entry_size_max); + +rw_attribute(discard); +rw_attribute(cache_replacement_policy); + +rw_attribute(foreground_write_ratelimit_enabled); +rw_attribute(copy_gc_enabled); +sysfs_pd_controller_attribute(copy_gc); + +rw_attribute(tier); +rw_attribute(tiering_enabled); +rw_attribute(tiering_percent); +sysfs_pd_controller_attribute(tiering); + +sysfs_pd_controller_attribute(foreground_write); + +rw_attribute(pd_controllers_update_seconds); + +rw_attribute(foreground_target_percent); + +read_attribute(meta_replicas_have); +read_attribute(data_replicas_have); + +#define BCH_DEBUG_PARAM(name, description) \ + rw_attribute(name); + + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#define BCH_OPT(_name, _mode, ...) 
\ + static struct attribute sysfs_opt_##_name = { \ + .name = #_name, .mode = _mode, \ + }; + + BCH_VISIBLE_OPTS() +#undef BCH_OPT + +#define BCH_TIME_STAT(name, frequency_units, duration_units) \ + sysfs_time_stats_attribute(name, frequency_units, duration_units); + BCH_TIME_STATS() +#undef BCH_TIME_STAT + +static struct attribute sysfs_state_rw = { + .name = "state", + .mode = S_IRUGO +}; + +static int bch2_bset_print_stats(struct bch_fs *c, char *buf) +{ + struct bset_stats stats; + size_t nodes = 0; + struct btree *b; + struct bucket_table *tbl; + struct rhash_head *pos; + unsigned iter; + + memset(&stats, 0, sizeof(stats)); + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, iter, pos) { + bch2_btree_keys_stats(b, &stats); + nodes++; + } + rcu_read_unlock(); + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "written key bytes: %zu\n" + "unwritten sets: %zu\n" + "unwritten key bytes: %zu\n" + "no table sets: %zu\n" + "no table key bytes: %zu\n" + "floats: %zu\n" + "failed unpacked: %zu\n" + "failed prev: %zu\n" + "failed overflow: %zu\n", + nodes, + stats.sets[BSET_RO_AUX_TREE].nr, + stats.sets[BSET_RO_AUX_TREE].bytes, + stats.sets[BSET_RW_AUX_TREE].nr, + stats.sets[BSET_RW_AUX_TREE].bytes, + stats.sets[BSET_NO_AUX_TREE].nr, + stats.sets[BSET_NO_AUX_TREE].bytes, + stats.floats, + stats.failed_unpacked, + stats.failed_prev, + stats.failed_overflow); +} + +static unsigned bch2_root_usage(struct bch_fs *c) +{ + unsigned bytes = 0; + struct bkey_packed *k; + struct btree *b; + struct btree_node_iter iter; + + goto lock_root; + + do { + six_unlock_read(&b->lock); +lock_root: + b = c->btree_roots[BTREE_ID_EXTENTS].b; + six_lock_read(&b->lock); + } while (b != c->btree_roots[BTREE_ID_EXTENTS].b); + + for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b)) + bytes += bkey_bytes(k); + + six_unlock_read(&b->lock); + + return (bytes * 100) / btree_bytes(c); +} + +static size_t bch2_btree_cache_size(struct bch_fs *c) +{ + size_t ret = 0; + struct btree *b; + + mutex_lock(&c->btree_cache_lock); + list_for_each_entry(b, &c->btree_cache, list) + ret += btree_bytes(c); + + mutex_unlock(&c->btree_cache_lock); + return ret; +} + +static unsigned bch2_fs_available_percent(struct bch_fs *c) +{ + return div64_u64((u64) sectors_available(c) * 100, + c->capacity ?: 1); +} + +#if 0 +static unsigned bch2_btree_used(struct bch_fs *c) +{ + return div64_u64(c->gc_stats.key_bytes * 100, + (c->gc_stats.nodes ?: 1) * btree_bytes(c)); +} + +static unsigned bch2_average_key_size(struct bch_fs *c) +{ + return c->gc_stats.nkeys + ? 
div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; +} +#endif + +static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) +{ + struct bch_fs_usage stats = bch2_fs_usage_read(c); + + return scnprintf(buf, PAGE_SIZE, + "capacity:\t\t%llu\n" + "compressed:\n" + "\tmeta:\t\t%llu\n" + "\tdirty:\t\t%llu\n" + "\tcached:\t\t%llu\n" + "uncompressed:\n" + "\tmeta:\t\t%llu\n" + "\tdirty:\t\t%llu\n" + "\tcached:\t\t%llu\n" + "persistent reserved sectors:\t%llu\n" + "online reserved sectors:\t%llu\n", + c->capacity, + stats.s[S_COMPRESSED][S_META], + stats.s[S_COMPRESSED][S_DIRTY], + stats.s[S_COMPRESSED][S_CACHED], + stats.s[S_UNCOMPRESSED][S_META], + stats.s[S_UNCOMPRESSED][S_DIRTY], + stats.s[S_UNCOMPRESSED][S_CACHED], + stats.persistent_reserved, + stats.online_reserved); +} + +static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, + nr_compressed_extents = 0, + compressed_sectors_compressed = 0, + compressed_sectors_uncompressed = 0; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) + if (k.k->type == BCH_EXTENT) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + const union bch_extent_crc *crc; + + extent_for_each_ptr_crc(e, ptr, crc) { + if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) { + nr_uncompressed_extents++; + uncompressed_sectors += e.k->size; + } else { + nr_compressed_extents++; + compressed_sectors_compressed += + crc_compressed_size(e.k, crc); + compressed_sectors_uncompressed += + crc_uncompressed_size(e.k, crc); + } + + /* only looking at the first ptr */ + break; + } + } + bch2_btree_iter_unlock(&iter); + + return snprintf(buf, PAGE_SIZE, + "uncompressed data:\n" + " nr extents: %llu\n" + " size (bytes): %llu\n" + "compressed data:\n" + " nr extents: %llu\n" + " compressed size (bytes): %llu\n" + " uncompressed size (bytes): %llu\n", + nr_uncompressed_extents, + uncompressed_sectors << 9, + nr_compressed_extents, + compressed_sectors_compressed << 9, + compressed_sectors_uncompressed << 9); +} + +SHOW(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + sysfs_print(minor, c->minor); + + sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); + sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); + sysfs_hprint(journal_entry_size_max, c->journal.entry_size_max); + + sysfs_hprint(block_size, block_bytes(c)); + sysfs_print(block_size_bytes, block_bytes(c)); + sysfs_hprint(btree_node_size, c->sb.btree_node_size << 9); + sysfs_print(btree_node_size_bytes, c->sb.btree_node_size << 9); + + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + sysfs_print(cache_available_percent, bch2_fs_available_percent(c)); + + sysfs_print(btree_gc_running, c->gc_pos.phase != GC_PHASE_DONE); + +#if 0 + /* XXX: reimplement */ + sysfs_print(btree_used_percent, bch2_btree_used(c)); + sysfs_print(btree_nodes, c->gc_stats.nodes); + sysfs_hprint(average_key_size, bch2_average_key_size(c)); +#endif + + sysfs_print(cache_read_races, + atomic_long_read(&c->cache_read_races)); + + sysfs_printf(foreground_write_ratelimit_enabled, "%i", + c->foreground_write_ratelimit_enabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd); + + sysfs_print(pd_controllers_update_seconds, + c->pd_controllers_update_seconds); + sysfs_print(foreground_target_percent, c->foreground_target_percent); + + 
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); + sysfs_print(tiering_percent, c->tiering_percent); + + sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */ + + sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have); + sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have); + + /* Debugging: */ + + if (attr == &sysfs_journal_debug) + return bch2_journal_print_debug(&c->journal, buf); + +#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + + if (!bch2_fs_running(c)) + return -EPERM; + + if (attr == &sysfs_bset_tree_stats) + return bch2_bset_print_stats(c, buf); + if (attr == &sysfs_alloc_debug) + return show_fs_alloc_debug(c, buf); + + sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level); + sysfs_print(root_usage_percent, bch2_root_usage(c)); + + if (attr == &sysfs_compression_stats) + return bch2_compression_stats(c, buf); + + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + + return 0; +} + +STORE(__bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); + sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); + + sysfs_strtoul(foreground_write_ratelimit_enabled, + c->foreground_write_ratelimit_enabled); + + if (attr == &sysfs_copy_gc_enabled) { + struct bch_dev *ca; + unsigned i; + ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) + ?: (ssize_t) size; + + for_each_member_device(ca, c, i) + if (ca->moving_gc_read) + wake_up_process(ca->moving_gc_read); + return ret; + } + + if (attr == &sysfs_tiering_enabled) { + ssize_t ret = strtoul_safe(buf, c->tiering_enabled) + ?: (ssize_t) size; + + bch2_tiering_start(c); /* issue wakeups */ + return ret; + } + + sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); + + sysfs_strtoul(pd_controllers_update_seconds, + c->pd_controllers_update_seconds); + sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); + + sysfs_strtoul(tiering_percent, c->tiering_percent); + sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */ + + /* Debugging: */ + +#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + + if (!bch2_fs_running(c)) + return -EPERM; + + if (attr == &sysfs_journal_flush) { + bch2_journal_meta_async(&c->journal, NULL); + + return size; + } + + if (attr == &sysfs_trigger_btree_coalesce) + bch2_coalesce(c); + + /* Debugging: */ + + if (attr == &sysfs_trigger_gc) + bch2_gc(c); + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc); + } + + return size; +} + +STORE(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + mutex_lock(&c->state_lock); + size = __bch2_fs_store(kobj, attr, buf, size); + mutex_unlock(&c->state_lock); + + return size; +} + +static struct attribute *bch2_fs_files[] = { + &sysfs_journal_write_delay_ms, + &sysfs_journal_reclaim_delay_ms, + &sysfs_journal_entry_size_max, + + &sysfs_block_size, + &sysfs_block_size_bytes, + &sysfs_btree_node_size, + &sysfs_btree_node_size_bytes, + &sysfs_tree_depth, + &sysfs_root_usage_percent, + &sysfs_btree_cache_size, + &sysfs_cache_available_percent, + &sysfs_compression_stats, + + &sysfs_average_key_size, + + &sysfs_meta_replicas_have, + &sysfs_data_replicas_have, + + 
&sysfs_foreground_target_percent, + &sysfs_tiering_percent, + + &sysfs_journal_flush, + NULL +}; +KTYPE(bch2_fs); + +/* internal dir - just a wrapper */ + +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + return bch2_fs_show(&c->kobj, attr, buf); +} + +STORE(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + return bch2_fs_store(&c->kobj, attr, buf, size); +} + +static void bch2_fs_internal_release(struct kobject *k) +{ +} + +static struct attribute *bch2_fs_internal_files[] = { + &sysfs_journal_debug, + + &sysfs_alloc_debug, + + &sysfs_btree_gc_running, + + &sysfs_btree_nodes, + &sysfs_btree_used_percent, + + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, + + &sysfs_trigger_btree_coalesce, + &sysfs_trigger_gc, + &sysfs_prune_cache, + &sysfs_foreground_write_ratelimit_enabled, + &sysfs_copy_gc_enabled, + &sysfs_tiering_enabled, + sysfs_pd_controller_files(tiering), + sysfs_pd_controller_files(foreground_write), + &sysfs_internal_uuid, + +#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, + BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + + NULL +}; +KTYPE(bch2_fs_internal); + +/* options */ + +SHOW(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + + return bch2_opt_show(&c->opts, attr->name, buf, PAGE_SIZE); +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt; + enum bch_opt_id id; + u64 v; + + id = bch2_parse_sysfs_opt(attr->name, buf, &v); + if (id < 0) + return id; + + opt = &bch2_opt_table[id]; + + mutex_lock(&c->sb_lock); + + if (id == Opt_compression) { + int ret = bch2_check_set_has_compressed_data(c, v); + if (ret) { + mutex_unlock(&c->sb_lock); + return ret; + } + } + + if (opt->set_sb != SET_NO_SB_OPT) { + opt->set_sb(c->disk_sb, v); + bch2_write_super(c); + } + + bch2_opt_set(&c->opts, id, v); + + mutex_unlock(&c->sb_lock); + + return size; +} + +static void bch2_fs_opts_dir_release(struct kobject *k) +{ +} + +static struct attribute *bch2_fs_opts_dir_files[] = { +#define BCH_OPT(_name, ...) \ + &sysfs_opt_##_name, + + BCH_VISIBLE_OPTS() +#undef BCH_OPT + + NULL +}; +KTYPE(bch2_fs_opts_dir); + +/* time stats */ + +SHOW(bch2_fs_time_stats) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define BCH_TIME_STAT(name, frequency_units, duration_units) \ + sysfs_print_time_stats(&c->name##_time, name, \ + frequency_units, duration_units); + BCH_TIME_STATS() +#undef BCH_TIME_STAT + + return 0; +} + +STORE(bch2_fs_time_stats) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define BCH_TIME_STAT(name, frequency_units, duration_units) \ + sysfs_clear_time_stats(&c->name##_time, name); + BCH_TIME_STATS() +#undef BCH_TIME_STAT + + return size; +} + +static void bch2_fs_time_stats_release(struct kobject *k) +{ +} + +static struct attribute *bch2_fs_time_stats_files[] = { +#define BCH_TIME_STAT(name, frequency_units, duration_units) \ + sysfs_time_stats_attribute_list(name, frequency_units, duration_units) + BCH_TIME_STATS() +#undef BCH_TIME_STAT + + NULL +}; +KTYPE(bch2_fs_time_stats); + +typedef unsigned (bucket_map_fn)(struct bch_dev *, struct bucket *, void *); + +static unsigned bucket_priority_fn(struct bch_dev *ca, struct bucket *g, + void *private) +{ + int rw = (private ? 
1 : 0); + + return ca->fs->prio_clock[rw].hand - g->prio[rw]; +} + +static unsigned bucket_sectors_used_fn(struct bch_dev *ca, struct bucket *g, + void *private) +{ + return bucket_sectors_used(g); +} + +static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, struct bucket *g, + void *private) +{ + return bucket_gc_gen(ca, g); +} + +static ssize_t show_quantiles(struct bch_dev *ca, char *buf, + bucket_map_fn *fn, void *private) +{ + int cmp(const void *l, const void *r) + { return *((unsigned *) r) - *((unsigned *) l); } + + size_t n = ca->mi.nbuckets, i; + /* Compute 31 quantiles */ + unsigned q[31], *p; + ssize_t ret = 0; + + p = vzalloc(ca->mi.nbuckets * sizeof(unsigned)); + if (!p) + return -ENOMEM; + + for (i = ca->mi.first_bucket; i < n; i++) + p[i] = fn(ca, &ca->buckets[i], private); + + sort(p, n, sizeof(unsigned), cmp, NULL); + + while (n && + !p[n - 1]) + --n; + + for (i = 0; i < ARRAY_SIZE(q); i++) + q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; + + vfree(p); + + for (i = 0; i < ARRAY_SIZE(q); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%u ", q[i]); + buf[ret - 1] = '\n'; + + return ret; + +} + +static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) +{ + enum alloc_reserve i; + ssize_t ret; + + spin_lock(&ca->freelist_lock); + + ret = scnprintf(buf, PAGE_SIZE, + "free_inc:\t%zu\t%zu\n", + fifo_used(&ca->free_inc), + ca->free_inc.size); + + for (i = 0; i < RESERVE_NR; i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "free[%u]:\t%zu\t%zu\n", i, + fifo_used(&ca->free[i]), + ca->free[i].size); + + spin_unlock(&ca->freelist_lock); + + return ret; +} + +static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + + return scnprintf(buf, PAGE_SIZE, + "free_inc: %zu/%zu\n" + "free[RESERVE_PRIO]: %zu/%zu\n" + "free[RESERVE_BTREE]: %zu/%zu\n" + "free[RESERVE_MOVINGGC]: %zu/%zu\n" + "free[RESERVE_NONE]: %zu/%zu\n" + "alloc: %llu/%llu\n" + "meta: %llu/%llu\n" + "dirty: %llu/%llu\n" + "available: %llu/%llu\n" + "freelist_wait: %s\n" + "open buckets: %u/%u (reserved %u)\n" + "open_buckets_wait: %s\n", + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size, + fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket, + stats.buckets_meta, ca->mi.nbuckets - ca->mi.first_bucket, + stats.buckets_dirty, ca->mi.nbuckets - ca->mi.first_bucket, + __dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket, + c->freelist_wait.list.first ? "waiting" : "empty", + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, + c->open_buckets_wait.list.first ? 
"waiting" : "empty"); +} + +static u64 sectors_written(struct bch_dev *ca) +{ + u64 ret = 0; + int cpu; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(ca->sectors_written, cpu); + + return ret; +} + +SHOW(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + + sysfs_hprint(bucket_size, bucket_bytes(ca)); + sysfs_print(bucket_size_bytes, bucket_bytes(ca)); + sysfs_hprint(block_size, block_bytes(c)); + sysfs_print(block_size_bytes, block_bytes(c)); + sysfs_print(first_bucket, ca->mi.first_bucket); + sysfs_print(nbuckets, ca->mi.nbuckets); + sysfs_print(discard, ca->mi.discard); + sysfs_hprint(written, sectors_written(ca) << 9); + sysfs_hprint(btree_written, + atomic64_read(&ca->btree_sectors_written) << 9); + sysfs_hprint(metadata_written, + (atomic64_read(&ca->meta_sectors_written) + + atomic64_read(&ca->btree_sectors_written)) << 9); + + sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9); + sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9); + sysfs_print(dirty_buckets, stats.buckets_dirty); + sysfs_hprint(cached_data, stats.sectors[S_CACHED] << 9); + sysfs_print(cached_bytes, stats.sectors[S_CACHED] << 9); + sysfs_print(cached_buckets, stats.buckets_cached); + sysfs_print(meta_buckets, stats.buckets_meta); + sysfs_print(alloc_buckets, stats.buckets_alloc); + sysfs_print(available_buckets, dev_buckets_available(ca)); + sysfs_print(free_buckets, dev_buckets_free(ca)); + sysfs_print(has_data, ca->mi.has_data); + sysfs_print(has_metadata, ca->mi.has_metadata); + + sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); + + if (attr == &sysfs_cache_replacement_policy) + return bch2_snprint_string_list(buf, PAGE_SIZE, + bch2_cache_replacement_policies, + ca->mi.replacement); + + sysfs_print(tier, ca->mi.tier); + + if (attr == &sysfs_state_rw) + return bch2_snprint_string_list(buf, PAGE_SIZE, + bch2_dev_state, + ca->mi.state); + + if (attr == &sysfs_read_priority_stats) + return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0); + if (attr == &sysfs_write_priority_stats) + return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1); + if (attr == &sysfs_fragmentation_stats) + return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL); + if (attr == &sysfs_oldest_gen_stats) + return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL); + if (attr == &sysfs_reserve_stats) + return show_reserve_stats(ca, buf); + if (attr == &sysfs_alloc_debug) + return show_dev_alloc_debug(ca, buf); + + return 0; +} + +STORE(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + struct bch_member *mi; + + sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + + if (v != BCH_MEMBER_DISCARD(mi)) { + SET_BCH_MEMBER_DISCARD(mi, v); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_cache_replacement_policy) { + ssize_t v = bch2_read_string_list(buf, bch2_cache_replacement_policies); + + if (v < 0) + return v; + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + + if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { + SET_BCH_MEMBER_REPLACEMENT(mi, v); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_tier) { + unsigned prev_tier; + 
unsigned v = strtoul_restrict_or_return(buf, + 0, BCH_TIER_MAX - 1); + + mutex_lock(&c->sb_lock); + prev_tier = ca->mi.tier; + + if (v == ca->mi.tier) { + mutex_unlock(&c->sb_lock); + return size; + } + + mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_TIER(mi, v); + bch2_write_super(c); + + bch2_dev_group_remove(&c->tiers[prev_tier].devs, ca); + bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); + mutex_unlock(&c->sb_lock); + + bch2_recalc_capacity(c); + bch2_tiering_start(c); + } + + return size; +} + +static struct attribute *bch2_dev_files[] = { + &sysfs_uuid, + &sysfs_bucket_size, + &sysfs_bucket_size_bytes, + &sysfs_block_size, + &sysfs_block_size_bytes, + &sysfs_first_bucket, + &sysfs_nbuckets, + &sysfs_read_priority_stats, + &sysfs_write_priority_stats, + &sysfs_fragmentation_stats, + &sysfs_oldest_gen_stats, + &sysfs_reserve_stats, + &sysfs_available_buckets, + &sysfs_free_buckets, + &sysfs_dirty_data, + &sysfs_dirty_bytes, + &sysfs_dirty_buckets, + &sysfs_cached_data, + &sysfs_cached_bytes, + &sysfs_cached_buckets, + &sysfs_meta_buckets, + &sysfs_alloc_buckets, + &sysfs_has_data, + &sysfs_has_metadata, + &sysfs_discard, + &sysfs_written, + &sysfs_btree_written, + &sysfs_metadata_written, + &sysfs_cache_replacement_policy, + &sysfs_tier, + &sysfs_state_rw, + &sysfs_alloc_debug, + + sysfs_pd_controller_files(copy_gc), + NULL +}; +KTYPE(bch2_dev); diff --git a/libbcachefs/sysfs.h b/libbcachefs/sysfs.h new file mode 100644 index 00000000..d1f17cff --- /dev/null +++ b/libbcachefs/sysfs.h @@ -0,0 +1,103 @@ +#ifndef _BCACHE_SYSFS_H_ +#define _BCACHE_SYSFS_H_ + +#include "util.h" + +#define KTYPE(type) \ +struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &((const struct sysfs_ops) { \ + .show = type ## _show, \ + .store = type ## _store \ + }), \ + .default_attrs = type ## _files \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) +#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) +#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) + +#define sysfs_printf(file, fmt, ...) 
\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);	\
+} while (0)
+
+#define sysfs_print(file, var)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return snprint(buf, PAGE_SIZE, var);			\
+} while (0)
+
+#define sysfs_hprint(file, val)						\
+do {									\
+	if (attr == &sysfs_ ## file) {					\
+		ssize_t ret = bch2_hprint(buf, val);			\
+		strcat(buf, "\n");					\
+		return ret + 1;						\
+	}								\
+} while (0)
+
+#define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var)		sysfs_print(_var, var(_var))
+#define var_hprint(_var)	sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var)					\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max)			\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe_clamp(buf, var, min, max)		\
+			?: (ssize_t) size;				\
+} while (0)
+
+#define strtoul_or_return(cp)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (_r)								\
+		return _r;						\
+	_v;								\
+})
+
+#define strtoul_restrict_or_return(cp, min, max)			\
+({									\
+	unsigned long __v = 0;						\
+	int _r = strtoul_safe_restrict(cp, __v, min, max);		\
+	if (_r)								\
+		return _r;						\
+	__v;								\
+})
+
+#define strtoi_h_or_return(cp)						\
+({									\
+	u64 _v;								\
+	int _r = strtoi_h(cp, &_v);					\
+	if (_r)								\
+		return _r;						\
+	_v;								\
+})
+
+#define sysfs_hatoi(file, var)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoi_h(buf, &var) ?: (ssize_t) size;		\
+} while (0)
+
+#endif /* _BCACHE_SYSFS_H_ */
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
new file mode 100644
index 00000000..16d32928
--- /dev/null
+++ b/libbcachefs/tier.c
@@ -0,0 +1,282 @@
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "super-io.h"
+#include "tier.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcachefs.h>
+
+struct tiering_state {
+	struct bch_tier		*tier;
+	unsigned		sectors;
+	unsigned		stripe_size;
+	unsigned		dev_idx;
+	struct bch_dev		*ca;
+};
+
+static bool tiering_pred(struct bch_fs *c,
+			 struct tiering_state *s,
+			 struct bkey_s_c k)
+{
+	if (bkey_extent_is_data(k.k)) {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const struct bch_extent_ptr *ptr;
+		unsigned replicas = 0;
+
+		/* Make sure we have room to add a new pointer: */
+		if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+		    BKEY_EXTENT_VAL_U64s_MAX)
+			return false;
+
+		extent_for_each_ptr(e, ptr)
+			if (c->devs[ptr->dev]->mi.tier >= s->tier->idx)
+				replicas++;
+
+		return replicas < c->opts.data_replicas;
+	}
+
+	return false;
+}
+
+static void tier_put_device(struct tiering_state *s)
+{
+	if (s->ca)
+		percpu_ref_put(&s->ca->io_ref);
+	s->ca = NULL;
+}
+
+/**
+ * tier_next_device - move on to the next device in the tier, in
+ * round-robin order
+ */
+static void tier_next_device(struct bch_fs *c, struct tiering_state *s)
+{
+	if (!s->ca || s->sectors > s->stripe_size) {
+		tier_put_device(s);
+		s->sectors = 0;
+		s->dev_idx++;
+
+		spin_lock(&s->tier->devs.lock);
+		if (s->dev_idx >= s->tier->devs.nr)
+			s->dev_idx = 0;
+
+		if (s->tier->devs.nr) {
+			s->ca = s->tier->devs.d[s->dev_idx].dev;
+			percpu_ref_get(&s->ca->io_ref);
+		}
+		spin_unlock(&s->tier->devs.lock);
+	}
+}
+
+static int issue_tiering_move(struct bch_fs *c,
+			      struct tiering_state *s,
+			      struct moving_context *ctxt,
+			      struct bkey_s_c k)
+{
+	int ret;
+
+	ret = bch2_data_move(c, ctxt,
+			     &s->ca->tiering_write_point, k, NULL);
+	if (!ret) {
+		trace_tiering_copy(k.k);
+		s->sectors += k.k->size;
+	} else {
+		trace_tiering_alloc_fail(c, k.k->size);
+	}
+
+	return ret;
+}
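+
+/*
+ * Worked example for tiering_pred() above (illustrative): with
+ * c->opts.data_replicas == 2, an extent whose pointers are all on
+ * tier 0 has no replicas on this tier or above, 0 < 2, so it is
+ * picked for tiering; once two of its pointers land on tier >= idx
+ * it is skipped.
+ */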
+
+/**
+ * read_tiering - scan the extents btree and, for extents that are
+ * missing a copy on this tier, issue moves to the tier's devices in
+ * round robin order
+ */
+static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
+{
+	struct moving_context ctxt;
+	struct tiering_state s;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	unsigned nr_devices = READ_ONCE(tier->devs.nr);
+	int ret;
+
+	if (!nr_devices)
+		return 0;
+
+	trace_tiering_start(c);
+
+	memset(&s, 0, sizeof(s));
+	s.tier		= tier;
+	s.stripe_size	= 2048; /* 1 MB for now */
+
+	bch2_move_ctxt_init(&ctxt, &tier->pd.rate,
+			    nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
+	bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+	while (!kthread_should_stop() &&
+	       !bch2_move_ctxt_wait(&ctxt) &&
+	       (k = bch2_btree_iter_peek(&iter)).k &&
+	       !btree_iter_err(k)) {
+		if (!tiering_pred(c, &s, k))
+			goto next;
+
+		tier_next_device(c, &s);
+		if (!s.ca)
+			break;
+
+		ret = issue_tiering_move(c, &s, &ctxt, k);
+		if (ret) {
+			bch2_btree_iter_unlock(&iter);
+
+			/* memory allocation failure, wait for some IO to finish */
+			bch2_move_ctxt_wait_for_io(&ctxt);
+			continue;
+		}
+next:
+		bch2_btree_iter_advance_pos(&iter);
+		//bch2_btree_iter_cond_resched(&iter);
+
+		/* unlock before calling bch2_move_ctxt_wait() */
+		bch2_btree_iter_unlock(&iter);
+		cond_resched();
+	}
+
+	bch2_btree_iter_unlock(&iter);
+	tier_put_device(&s);
+	bch2_move_ctxt_exit(&ctxt);
+	trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
+
+	return ctxt.sectors_moved;
+}
+
+static int bch2_tiering_thread(void *arg)
+{
+	struct bch_tier *tier = arg;
+	struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
+	struct io_clock *clock = &c->io_clock[WRITE];
+	struct bch_dev *ca;
+	u64 tier_capacity, available_sectors;
+	unsigned long last;
+	unsigned i;
+
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		if (kthread_wait_freezable(c->tiering_enabled &&
+					   tier->devs.nr))
+			break;
+
+		while (1) {
+			struct bch_tier *faster_tier;
+
+			last = atomic_long_read(&clock->now);
+
+			tier_capacity = available_sectors = 0;
+			for (faster_tier = c->tiers;
+			     faster_tier != tier;
+			     faster_tier++) {
+				spin_lock(&faster_tier->devs.lock);
+				group_for_each_dev(ca, &faster_tier->devs, i) {
+					tier_capacity +=
+						(ca->mi.nbuckets -
+						 ca->mi.first_bucket) << ca->bucket_bits;
+					available_sectors +=
+						dev_buckets_available(ca) << ca->bucket_bits;
+				}
+				spin_unlock(&faster_tier->devs.lock);
+			}
+
+			if (available_sectors < (tier_capacity >> 1))
+				break;
+
+			bch2_kthread_io_clock_wait(clock,
+						   last +
+						   available_sectors -
+						   (tier_capacity >> 1));
+			if (kthread_should_stop())
+				return 0;
+		}
+
+		read_tiering(c, tier);
+	}
+
+	return 0;
+}
+
+static void __bch2_tiering_stop(struct bch_tier *tier)
+{
+	tier->pd.rate.rate = UINT_MAX;
+	bch2_ratelimit_reset(&tier->pd.rate);
+
+	if (tier->migrate)
+		kthread_stop(tier->migrate);
+
+	tier->migrate = NULL;
+}
+
+void bch2_tiering_stop(struct bch_fs *c)
+{
+	struct bch_tier *tier;
+
+	for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
+		__bch2_tiering_stop(tier);
+}
+
+static int __bch2_tiering_start(struct bch_tier *tier)
+{
+	if (!tier->migrate) {
+		struct task_struct *p =
+			kthread_create(bch2_tiering_thread, tier,
+				       "bch_tier[%u]", tier->idx);
+		if (IS_ERR(p))
+			return PTR_ERR(p);
+
+		tier->migrate = p;
+	}
+
+	wake_up_process(tier->migrate);
+	return 0;
+}
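+
+/*
+ * bch2_tiering_start() below only runs a migration thread for a tier
+ * when some faster, non-empty tier exists in front of it: e.g. with
+ * devices in tiers 0 and 1, only bch_tier[1] gets a migration thread.
+ */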
+
+int bch2_tiering_start(struct bch_fs *c)
+{
+	struct bch_tier *tier;
+	bool have_faster_tier = false;
+
+	if (c->opts.nochanges)
+		return 0;
+
+	for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+		if (!tier->devs.nr)
+			continue;
+
+		if (have_faster_tier) {
+			int ret = __bch2_tiering_start(tier);
+			if (ret)
+				return ret;
+		} else {
+			__bch2_tiering_stop(tier);
+		}
+
+		have_faster_tier = true;
+	}
+
+	return 0;
+}
+
+void bch2_fs_tiering_init(struct bch_fs *c)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+		c->tiers[i].idx = i;
+		bch2_pd_controller_init(&c->tiers[i].pd);
+	}
+}
diff --git a/libbcachefs/tier.h b/libbcachefs/tier.h
new file mode 100644
index 00000000..a4fd6225
--- /dev/null
+++ b/libbcachefs/tier.h
@@ -0,0 +1,8 @@
+#ifndef _BCACHE_TIER_H
+#define _BCACHE_TIER_H
+
+void bch2_tiering_stop(struct bch_fs *);
+int bch2_tiering_start(struct bch_fs *);
+void bch2_fs_tiering_init(struct bch_fs *);
+
+#endif /* _BCACHE_TIER_H */
diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c
new file mode 100644
index 00000000..13f0fc24
--- /dev/null
+++ b/libbcachefs/trace.c
@@ -0,0 +1,11 @@
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "btree_types.h"
+#include "keylist.h"
+
+#include <linux/blktrace_api.h>
+#include "keylist.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcachefs.h>
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
new file mode 100644
index 00000000..e4cd6317
--- /dev/null
+++ b/libbcachefs/util.c
@@ -0,0 +1,418 @@
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+#include "util.h"
+
+#define simple_strtoint(c, end, base)	simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base)	simple_strtoul(c, end, base)
+
+#define STRTO_H(name, type)					\
+int bch2_ ## name ## _h(const char *cp, type *res)		\
+{								\
+	int u = 0;						\
+	char *e;						\
+	type i = simple_ ## name(cp, &e, 10);			\
+								\
+	switch (tolower(*e)) {					\
+	default:						\
+		return -EINVAL;					\
+	case 'y':						\
+	case 'z':						\
+		u++;						\
+	case 'e':						\
+		u++;						\
+	case 'p':						\
+		u++;						\
+	case 't':						\
+		u++;						\
+	case 'g':						\
+		u++;						\
+	case 'm':						\
+		u++;						\
+	case 'k':						\
+		u++;						\
+		if (e++ == cp)					\
+			return -EINVAL;				\
+	case '\n':						\
+	case '\0':						\
+		if (*e == '\n')					\
+			e++;					\
+	}							\
+								\
+	if (*e)							\
+		return -EINVAL;					\
+								\
+	while (u--) {						\
+		if ((type) ~0 > 0 &&				\
+		    (type) ~0 / 1024 <= i)			\
+			return -EINVAL;				\
+		if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||	\
+		    (i < 0 && -ANYSINT_MAX(type) / 1024 > i))	\
+			return -EINVAL;				\
+		i *= 1024;					\
+	}							\
+								\
+	*res = i;						\
+	return 0;						\
+}								\
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+
+ssize_t bch2_hprint(char *buf, s64 v)
+{
+	static const char units[] = "?kMGTPEZY";
+	char dec[4] = "";
+	int u, t = 0;
+
+	for (u = 0; v >= 1024 || v <= -1024; u++) {
+		t = v & ~(~0 << 10);
+		v >>= 10;
+	}
+
+	if (!u)
+		return sprintf(buf, "%lli", v);
+
+	/*
+	 * 103 is magic: t is in the range [-1023, 1023] and we want
+	 * to turn it into [-9, 9]
+	 */
+	if (v < 100 && v > -100)
+		snprintf(dec, sizeof(dec), ".%i", t / 103);
+
+	return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+}
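+
+/*
+ * Worked example for bch2_hprint() above (illustrative): 1536 shifts
+ * down once, leaving v == 1, u == 1 ('k') and t == 512; 512 / 103 == 4,
+ * so the result is "1.4k" - the decimal digit truncates rather than
+ * rounds.
+ */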
+
+ssize_t bch2_snprint_string_list(char *buf, size_t size, const char * const list[],
+				 size_t selected)
+{
+	char *out = buf;
+	size_t i;
+
+	for (i = 0; list[i]; i++)
+		out += snprintf(out, buf + size - out,
+				i == selected ? "[%s] " : "%s ", list[i]);
+
+	out[-1] = '\n';
+	return out - buf;
+}
+
+ssize_t bch2_read_string_list(const char *buf, const char * const list[])
+{
+	size_t i;
+	char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	s = strim(d);
+
+	for (i = 0; list[i]; i++)
+		if (!strcmp(list[i], s))
+			break;
+
+	kfree(d);
+
+	if (!list[i])
+		return -EINVAL;
+
+	return i;
+}
+
+bool bch2_is_zero(const void *_p, size_t n)
+{
+	const char *p = _p;
+	size_t i;
+
+	for (i = 0; i < n; i++)
+		if (p[i])
+			return false;
+	return true;
+}
+
+void bch2_time_stats_clear(struct time_stats *stats)
+{
+	spin_lock(&stats->lock);
+
+	stats->count = 0;
+	stats->last_duration = 0;
+	stats->max_duration = 0;
+	stats->average_duration = 0;
+	stats->average_frequency = 0;
+	stats->last = 0;
+
+	spin_unlock(&stats->lock);
+}
+
+void __bch2_time_stats_update(struct time_stats *stats, u64 start_time)
+{
+	u64 now, duration, last;
+
+	stats->count++;
+
+	now		= local_clock();
+	duration	= time_after64(now, start_time)
+		? now - start_time : 0;
+	last		= time_after64(now, stats->last)
+		? now - stats->last : 0;
+
+	stats->last_duration = duration;
+	stats->max_duration = max(stats->max_duration, duration);
+
+	if (stats->last) {
+		stats->average_duration = ewma_add(stats->average_duration,
+						   duration << 8, 3);
+
+		if (stats->average_frequency)
+			stats->average_frequency =
+				ewma_add(stats->average_frequency,
+					 last << 8, 3);
+		else
+			stats->average_frequency = last << 8;
+	} else {
+		stats->average_duration = duration << 8;
+	}
+
+	stats->last = now ?: 1;
+}
+
+void bch2_time_stats_update(struct time_stats *stats, u64 start_time)
+{
+	spin_lock(&stats->lock);
+	__bch2_time_stats_update(stats, start_time);
+	spin_unlock(&stats->lock);
+}
+
+/**
+ * bch2_ratelimit_delay() - return how long to delay until the next time to do
+ * some work
+ *
+ * @d - the struct bch_ratelimit to update
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
+{
+	u64 now = local_clock();
+
+	return time_after64(d->next, now)
+		? nsecs_to_jiffies(d->next - now)
+		: 0;
+}
+
+/**
+ * bch2_ratelimit_increment() - increment @d by the amount of work done
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ */
+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+	u64 now = local_clock();
+
+	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+	if (time_before64(now + NSEC_PER_SEC, d->next))
+		d->next = now + NSEC_PER_SEC;
+
+	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+		d->next = now - NSEC_PER_SEC * 2;
+}
+
+int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
+{
+	while (1) {
+		u64 delay = bch2_ratelimit_delay(d);
+
+		if (delay)
+			set_current_state(TASK_INTERRUPTIBLE);
+
+		if (kthread_should_stop())
+			return 1;
+
+		if (!delay)
+			return 0;
+
+		schedule_timeout(delay);
+		try_to_freeze();
+	}
+}
+
+/*
+ * Updates pd_controller. Attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
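+ *
+ * Worked example (illustrative): for a controller where raising the
+ * rate shrinks the backlog, sign == -1; when actual > target,
+ * proportional is positive and change = -sign * (proportional +
+ * derivative) comes out positive, so the rate rises until actual falls
+ * back toward target.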
+ */ +void bch2_pd_controller_update(struct bch_pd_controller *pd, + s64 target, s64 actual, int sign) +{ + s64 proportional, derivative, change; + + unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; + + if (seconds_since_update == 0) + return; + + pd->last_update = jiffies; + + proportional = actual - target; + proportional *= seconds_since_update; + proportional = div_s64(proportional, pd->p_term_inverse); + + derivative = actual - pd->last_actual; + derivative = div_s64(derivative, seconds_since_update); + derivative = ewma_add(pd->smoothed_derivative, derivative, + (pd->d_term / seconds_since_update) ?: 1); + derivative = derivative * pd->d_term; + derivative = div_s64(derivative, pd->p_term_inverse); + + change = proportional + derivative; + + /* Don't increase rate if not keeping up */ + if (change > 0 && + pd->backpressure && + time_after64(local_clock(), + pd->rate.next + NSEC_PER_MSEC)) + change = 0; + + change *= (sign * -1); + + pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, + 1, UINT_MAX); + + pd->last_actual = actual; + pd->last_derivative = derivative; + pd->last_proportional = proportional; + pd->last_change = change; + pd->last_target = target; +} + +void bch2_pd_controller_init(struct bch_pd_controller *pd) +{ + pd->rate.rate = 1024; + pd->last_update = jiffies; + pd->p_term_inverse = 6000; + pd->d_term = 30; + pd->d_smooth = pd->d_term; + pd->backpressure = 1; +} + +size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) +{ + /* 2^64 - 1 is 20 digits, plus null byte */ + char rate[21]; + char actual[21]; + char target[21]; + char proportional[21]; + char derivative[21]; + char change[21]; + s64 next_io; + + bch2_hprint(rate, pd->rate.rate); + bch2_hprint(actual, pd->last_actual); + bch2_hprint(target, pd->last_target); + bch2_hprint(proportional, pd->last_proportional); + bch2_hprint(derivative, pd->last_derivative); + bch2_hprint(change, pd->last_change); + + next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); + + return sprintf(buf, + "rate:\t\t%s/sec\n" + "target:\t\t%s\n" + "actual:\t\t%s\n" + "proportional:\t%s\n" + "derivative:\t%s\n" + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, target, actual, proportional, + derivative, change, next_io); +} + +void bch2_bio_map(struct bio *bio, void *base) +{ + size_t size = bio->bi_iter.bi_size; + struct bio_vec *bv = bio->bi_io_vec; + + BUG_ON(!bio->bi_iter.bi_size); + BUG_ON(bio->bi_vcnt); + + bv->bv_offset = base ? offset_in_page(base) : 0; + goto start; + + for (; size; bio->bi_vcnt++, bv++) { + bv->bv_offset = 0; +start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, + size); + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + if (base) { + bv->bv_page = is_vmalloc_addr(base) + ? 
vmalloc_to_page(base) + : virt_to_page(base); + + base += bv->bv_len; + } + + size -= bv->bv_len; + } +} + +size_t bch2_rand_range(size_t max) +{ + size_t rand; + + do { + get_random_bytes(&rand, sizeof(rand)); + rand &= roundup_pow_of_two(max) - 1; + } while (rand >= max); + + return rand; +} + +void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, dst, iter, dst_iter) { + void *dstp = kmap_atomic(bv.bv_page); + memcpy(dstp + bv.bv_offset, src, bv.bv_len); + kunmap_atomic(dstp); + + src += bv.bv_len; + } +} + +void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, src, iter, src_iter) { + void *srcp = kmap_atomic(bv.bv_page); + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); + kunmap_atomic(srcp); + + dst += bv.bv_len; + } +} diff --git a/libbcachefs/util.h b/libbcachefs/util.h new file mode 100644 index 00000000..5f13c824 --- /dev/null +++ b/libbcachefs/util.h @@ -0,0 +1,748 @@ +#ifndef _BCACHE_UTIL_H +#define _BCACHE_UTIL_H + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/closure.h> +#include <linux/errno.h> +#include <linux/blkdev.h> +#include <linux/freezer.h> +#include <linux/kernel.h> +#include <linux/llist.h> +#include <linux/ratelimit.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) +#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) + +struct closure; + +#ifdef CONFIG_BCACHEFS_DEBUG + +#define EBUG_ON(cond) BUG_ON(cond) +#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) +#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) +#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) +#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) +#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) +#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) +#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) +#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) +#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) +#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) + +#define memcpy(_dst, _src, _len) \ +do { \ + BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ + (void *) (_dst) + (_len) <= (void *) (_src))); \ + memcpy(_dst, _src, _len); \ +} while (0) + +#else /* DEBUG */ + +#define EBUG_ON(cond) +#define atomic_dec_bug(v) atomic_dec(v) +#define atomic_inc_bug(v, i) atomic_inc(v) +#define atomic_sub_bug(i, v) atomic_sub(i, v) +#define atomic_add_bug(i, v) atomic_add(i, v) +#define atomic_long_dec_bug(v) atomic_long_dec(v) +#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) +#define atomic64_dec_bug(v) atomic64_dec(v) +#define atomic64_inc_bug(v, i) atomic64_inc(v) +#define atomic64_sub_bug(i, v) atomic64_sub(i, v) +#define atomic64_add_bug(i, v) atomic64_add(i, v) + +#endif + +#ifndef __CHECKER__ +#define __flatten __attribute__((flatten)) +#else +/* sparse doesn't know about attribute((flatten)) */ +#define __flatten +#endif + +#ifdef __LITTLE_ENDIAN +#define CPU_BIG_ENDIAN 0 +#else +#define CPU_BIG_ENDIAN 1 +#endif + +/* type hackery */ + +#define type_is_exact(_val, _type) \ + __builtin_types_compatible_p(typeof(_val), _type) + +#define type_is(_val, _type) \ + (__builtin_types_compatible_p(typeof(_val), _type) || \ + 
__builtin_types_compatible_p(typeof(_val), const _type)) + +static inline void *kvmalloc(size_t bytes, gfp_t gfp) +{ + if (bytes <= PAGE_SIZE || + !(gfp & GFP_KERNEL)) + return kmalloc(bytes, gfp); + + return ((bytes <= KMALLOC_MAX_SIZE) + ? kmalloc(bytes, gfp|__GFP_NOWARN) + : NULL) ?: + vmalloc(bytes); +} + +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + +#define init_heap(heap, _size, gfp) \ +({ \ + size_t _bytes; \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + _bytes = (heap)->size * sizeof(*(heap)->data); \ + (heap)->data = kvmalloc(_bytes, (gfp)); \ + (heap)->data; \ +}) + +#define free_heap(heap) \ +do { \ + kvfree((heap)->data); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_del(h, i, cmp) \ +do { \ + size_t _i = (i); \ + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ + heap_swap(h, _i, (h)->used); \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ +} while (0) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + heap_del(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) \ +({ \ + EBUG_ON(!(h)->used); \ + (h)->data[0]; \ +}) + +#define heap_full(h) ((h)->used == (h)->size) + +#define heap_resort(heap, cmp) \ +do { \ + ssize_t _i; \ + for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ + heap_sift(heap, _i, cmp); \ +} while (0) + +/* + * Simple array based allocator - preallocates a number of elements and you can + * never allocate more than that, also has no locking. + * + * Handy because if you know you only need a fixed number of elements you don't + * have to worry about memory allocation failure, and sometimes a mempool isn't + * what you want. + * + * We treat the free elements as entries in a singly linked list, and the + * freelist as a stack - allocating and freeing push and pop off the freelist. 
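+ *
+ * Purely as illustration - these names are hypothetical and appear nowhere
+ * else in this file - a caller might do:
+ *
+ *	DECLARE_ARRAY_ALLOCATOR(struct my_obj, my_pool, 128);
+ *
+ *	array_allocator_init(&my_pool);
+ *
+ *	struct my_obj *o = array_alloc(&my_pool);
+ *	if (o) {
+ *		... use o ...
+ *		array_free(&my_pool, o);
+ *	}
+ *
+ * (Each element must be at least pointer-sized, which
+ * array_allocator_init() enforces with a BUILD_BUG_ON.)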
+ */ + +#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ + struct { \ + type *freelist; \ + type data[size]; \ + } name + +#define array_alloc(array) \ +({ \ + typeof((array)->freelist) _ret = (array)->freelist; \ + \ + if (_ret) \ + (array)->freelist = *((typeof((array)->freelist) *) _ret);\ + \ + _ret; \ +}) + +#define array_free(array, ptr) \ +do { \ + typeof((array)->freelist) _ptr = ptr; \ + \ + *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ + (array)->freelist = _ptr; \ +} while (0) + +#define array_allocator_init(array) \ +do { \ + typeof((array)->freelist) _i; \ + \ + BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ + (array)->freelist = NULL; \ + \ + for (_i = (array)->data; \ + _i < (array)->data + ARRAY_SIZE((array)->data); \ + _i++) \ + array_free(array, _i); \ +} while (0) + +#define array_freelist_empty(array) ((array)->freelist == NULL) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +int bch2_strtoint_h(const char *, int *); +int bch2_strtouint_h(const char *, unsigned int *); +int bch2_strtoll_h(const char *, long long *); +int bch2_strtoull_h(const char *, unsigned long long *); + +static inline int bch2_strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtoint_h(cp, (int *) res); +#else + return bch2_strtoll_h(cp, (long long *) res); +#endif +} + +static inline int bch2_strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtouint_h(cp, (unsigned int *) res); +#else + return bch2_strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ + : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ + : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ + : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ + : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ + : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ + : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +#define strtoul_safe_restrict(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r && _v >= min && _v <= max) \ + var = _v; \ + else \ + _r = -EINVAL; \ + _r; \ +}) + +#define snprint(buf, size, var) \ + snprintf(buf, size, \ + type_is(var, int) ? "%i\n" \ + : type_is(var, unsigned) ? "%u\n" \ + : type_is(var, long) ? "%li\n" \ + : type_is(var, unsigned long) ? "%lu\n" \ + : type_is(var, s64) ? "%lli\n" \ + : type_is(var, u64) ? "%llu\n" \ + : type_is(var, char *) ? 
"%s\n" \ + : "%i\n", var) + +ssize_t bch2_hprint(char *buf, s64 v); + +bool bch2_is_zero(const void *, size_t); + +ssize_t bch2_snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected); + +ssize_t bch2_read_string_list(const char *buf, const char * const list[]); + +struct time_stats { + spinlock_t lock; + u64 count; + /* + * all fields are in nanoseconds, averages are ewmas stored left shifted + * by 8 + */ + u64 last_duration; + u64 max_duration; + u64 average_duration; + u64 average_frequency; + u64 last; +}; + +void bch2_time_stats_clear(struct time_stats *stats); +void __bch2_time_stats_update(struct time_stats *stats, u64 time); +void bch2_time_stats_update(struct time_stats *stats, u64 time); + +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + +#define NSEC_PER_ns 1L +#define NSEC_PER_us NSEC_PER_USEC +#define NSEC_PER_ms NSEC_PER_MSEC +#define NSEC_PER_sec NSEC_PER_SEC + +#define __print_time_stat(stats, name, stat, units) \ + sysfs_print(name ## _ ## stat ## _ ## units, \ + div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) + +#define sysfs_print_time_stats(stats, name, \ + frequency_units, \ + duration_units) \ +do { \ + __print_time_stat(stats, name, \ + average_frequency, frequency_units); \ + __print_time_stat(stats, name, \ + average_duration, duration_units); \ + sysfs_print(name ## _ ##count, (stats)->count); \ + sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \ + div_u64((stats)->last_duration, \ + NSEC_PER_ ## duration_units)); \ + sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ + div_u64((stats)->max_duration, \ + NSEC_PER_ ## duration_units)); \ + \ + sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ + ? div_s64(local_clock() - (stats)->last, \ + NSEC_PER_ ## frequency_units) \ + : -1LL); \ +} while (0) + +#define sysfs_clear_time_stats(stats, name) \ +do { \ + if (attr == &sysfs_ ## name ## _clear) \ + bch2_time_stats_clear(stats); \ +} while (0) + +#define sysfs_time_stats_attribute(name, \ + frequency_units, \ + duration_units) \ +write_attribute(name ## _clear); \ +read_attribute(name ## _count); \ +read_attribute(name ## _average_frequency_ ## frequency_units); \ +read_attribute(name ## _average_duration_ ## duration_units); \ +read_attribute(name ## _last_duration_ ## duration_units); \ +read_attribute(name ## _max_duration_ ## duration_units); \ +read_attribute(name ## _last_ ## frequency_units) + +#define sysfs_time_stats_attribute_list(name, \ + frequency_units, \ + duration_units) \ +&sysfs_ ## name ## _clear, \ +&sysfs_ ## name ## _count, \ +&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ +&sysfs_ ## name ## _average_duration_ ## duration_units, \ +&sysfs_ ## name ## _last_duration_ ## duration_units, \ +&sysfs_ ## name ## _max_duration_ ## duration_units, \ +&sysfs_ ## name ## _last_ ## frequency_units, + +#define ewma_add(ewma, val, weight) \ +({ \ + typeof(ewma) _ewma = (ewma); \ + typeof(weight) _weight = (weight); \ + \ + (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ +}) + +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ + u64 next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to + * bch2_ratelimit_increment() + */ + unsigned rate; +}; + +static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) +{ + d->next = local_clock(); +} + +u64 bch2_ratelimit_delay(struct bch_ratelimit *); +void bch2_ratelimit_increment(struct 
bch_ratelimit *, u64); +int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *); + +struct bch_pd_controller { + struct bch_ratelimit rate; + unsigned long last_update; + + s64 last_actual; + s64 smoothed_derivative; + + unsigned p_term_inverse; + unsigned d_smooth; + unsigned d_term; + + /* for exporting to sysfs (no effect on behavior) */ + s64 last_derivative; + s64 last_proportional; + s64 last_change; + s64 last_target; + + /* If true, the rate will not increase if bch2_ratelimit_delay() + * is not being called often enough. */ + bool backpressure; +}; + +void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); +void bch2_pd_controller_init(struct bch_pd_controller *); +size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); + +#define sysfs_pd_controller_attribute(name) \ + rw_attribute(name##_rate); \ + rw_attribute(name##_rate_bytes); \ + rw_attribute(name##_rate_d_term); \ + rw_attribute(name##_rate_p_term_inverse); \ + read_attribute(name##_rate_debug) + +#define sysfs_pd_controller_files(name) \ + &sysfs_##name##_rate, \ + &sysfs_##name##_rate_bytes, \ + &sysfs_##name##_rate_d_term, \ + &sysfs_##name##_rate_p_term_inverse, \ + &sysfs_##name##_rate_debug + +#define sysfs_pd_controller_show(name, var) \ +do { \ + sysfs_hprint(name##_rate, (var)->rate.rate); \ + sysfs_print(name##_rate_bytes, (var)->rate.rate); \ + sysfs_print(name##_rate_d_term, (var)->d_term); \ + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ + if (attr == &sysfs_##name##_rate_debug) \ + return bch2_pd_controller_print_debug(var, buf); \ +} while (0) + +#define sysfs_pd_controller_store(name, var) \ +do { \ + sysfs_strtoul_clamp(name##_rate, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul_clamp(name##_rate_bytes, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ + sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ + (var)->p_term_inverse, 1, INT_MAX); \ +} while (0) + +#define __DIV_SAFE(n, d, zero) \ +({ \ + typeof(n) _n = (n); \ + typeof(d) _d = (d); \ + _d ? _n / _d : zero; \ +}) + +#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? container_of(_ptr, type, member) : NULL; \ +}) + +#define RB_INSERT(root, new, member, cmp) \ +({ \ + __label__ dup; \ + struct rb_node **n = &(root)->rb_node, *parent = NULL; \ + typeof(new) this; \ + int res, ret = -1; \ + \ + while (*n) { \ + parent = *n; \ + this = container_of(*n, typeof(*(new)), member); \ + res = cmp(new, this); \ + if (!res) \ + goto dup; \ + n = res < 0 \ + ? &(*n)->rb_left \ + : &(*n)->rb_right; \ + } \ + \ + rb_link_node(&(new)->member, parent, n); \ + rb_insert_color(&(new)->member, root); \ + ret = 0; \ +dup: \ + ret; \ +}) + +#define RB_SEARCH(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (!res) { \ + ret = this; \ + break; \ + } \ + n = res < 0 \ + ? 
n->rb_left \ + : n->rb_right; \ + } \ + ret; \ +}) + +#define RB_GREATER(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (res < 0) { \ + ret = this; \ + n = n->rb_left; \ + } else \ + n = n->rb_right; \ + } \ + ret; \ +}) + +#define RB_FIRST(root, type, member) \ + container_of_or_null(rb_first(root), type, member) + +#define RB_LAST(root, type, member) \ + container_of_or_null(rb_last(root), type, member) + +#define RB_NEXT(ptr, member) \ + container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) + +#define RB_PREV(ptr, member) \ + container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) + +/* Does linear interpolation between powers of two */ +static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +{ + unsigned fract = x & ~(~0 << fract_bits); + + x >>= fract_bits; + x = 1 << x; + x += (x * fract) >> fract_bits; + + return x; +} + +void bch2_bio_map(struct bio *bio, void *base); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> 9; +} + +#define closure_bio_submit(bio, cl) \ +do { \ + closure_get(cl); \ + submit_bio(bio); \ +} while (0) + +#define kthread_wait_freezable(cond) \ +({ \ + int _ret = 0; \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + try_to_freeze(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + +size_t bch2_rand_range(size_t); + +void memcpy_to_bio(struct bio *, struct bvec_iter, void *); +void memcpy_from_bio(void *, struct bio *, struct bvec_iter); + +static inline void __memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ +#ifdef CONFIG_X86_64 + long d0, d1, d2; + asm volatile("rep ; movsq" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +#endif +} + +static inline void memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(!(dst >= src + u64s * sizeof(u64) || + dst + u64s * sizeof(u64) <= src)); + + __memcpy_u64s(dst, src, u64s); +} + +static inline void __memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + __memcpy_u64s(dst, src, u64s); +} + +static inline void memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down(dst, src, u64s); +} + +static inline void __memmove_u64s_up(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s - 1; + u64 *src = (u64 *) _src + u64s - 1; + +#ifdef CONFIG_X86_64 + long d0, d1, d2; + asm volatile("std ;\n" + "rep ; movsq\n" + "cld ;\n" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + while (u64s--) + *dst-- = *src--; +#endif +} + +static inline void memmove_u64s_up(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up(dst, src, u64s); +} + +static inline void memmove_u64s(void *dst, const void *src, + unsigned u64s) +{ + if (dst < src) + __memmove_u64s_down(dst, src, u64s); + else + __memmove_u64s_up(dst, src, u64s); +} + +static inline struct bio_vec next_contig_bvec(struct bio *bio, + struct bvec_iter *iter) +{ + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + bio_advance_iter(bio, iter, 
bv.bv_len); +#ifndef CONFIG_HIGHMEM + while (iter->bi_size) { + struct bio_vec next = bio_iter_iovec(bio, *iter); + + if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != + page_address(next.bv_page) + next.bv_offset) + break; + + bv.bv_len += next.bv_len; + bio_advance_iter(bio, iter, next.bv_len); + } +#endif + return bv; +} + +#define __bio_for_each_contig_segment(bv, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bv = next_contig_bvec((bio), &(iter))), 1);) + +#define bio_for_each_contig_segment(bv, bio, iter) \ + __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) + +#endif /* _BCACHE_UTIL_H */ diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h new file mode 100644 index 00000000..ce2cece0 --- /dev/null +++ b/libbcachefs/vstructs.h @@ -0,0 +1,62 @@ +#ifndef _VSTRUCTS_H +#define _VSTRUCTS_H + +#include "util.h" + +/* + * NOTE: we can't differentiate between __le64 and u64 with type_is - this + * assumes u64 is little endian: + */ +#define __vstruct_u64s(_s) \ +({ \ + ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \ + : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \ + : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \ + : ((_s)->u64s)); \ +}) + +#define __vstruct_bytes(_type, _u64s) \ +({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ + (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ +}) + +#define vstruct_bytes(_s) \ + __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) + +#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ + (round_up(__vstruct_bytes(_type, _u64s), \ + 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) + +#define vstruct_blocks(_s, _sector_block_bits) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) + +#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ + __vstruct_u64s(_s) + (_u64s)) + +#define vstruct_sectors(_s, _sector_block_bits) \ + (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) + +#define vstruct_next(_s) \ + ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_last(_s) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_end(_s) \ + ((void *) ((_s)->_data + __vstruct_u64s(_s))) + +#define vstruct_for_each(_s, _i) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s); \ + _i = vstruct_next(_i)) + +#define vstruct_for_each_safe(_s, _i, _t) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ + _i = _t) + +#define vstruct_idx(_s, _idx) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) + +#endif /* _VSTRUCTS_H */ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c new file mode 100644 index 00000000..62a08897 --- /dev/null +++ b/libbcachefs/xattr.c @@ -0,0 +1,365 @@ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "str_hash.h" +#include "xattr.h" + +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +struct xattr_search_key { + u8 type; + struct qstr name; +}; + +#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ + { .type = _type, .name = QSTR_INIT(_name, _len) }) + +static u64 bch2_xattr_hash(const struct bch_hash_info *info, + const struct xattr_search_key *key) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); + bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); + 
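+	/*
+	 * Note that the hash input is the type byte followed by the raw
+	 * name bytes, so the same name under two different type indexes
+	 * (e.g. user vs. trusted) produces two independent hash keys.
+	 */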
+ return bch2_str_hash_end(&ctx, info); +} + +#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len) + +static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_xattr_hash(info, key); +} + +static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); + + return bch2_xattr_hash(info, + &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); +} + +static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + const struct xattr_search_key *r = _r; + + return l.v->x_type != r->type || + l.v->x_name_len != r->name.len || + memcmp(l.v->x_name, r->name.name, r->name.len); +} + +static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); + + return l.v->x_type != r.v->x_type || + l.v->x_name_len != r.v->x_name_len || + memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); +} + +static const struct bch_hash_desc xattr_hash_desc = { + .btree_id = BTREE_ID_XATTRS, + .key_type = BCH_XATTR, + .whiteout_type = BCH_XATTR_WHITEOUT, + .hash_key = xattr_hash_key, + .hash_bkey = xattr_hash_bkey, + .cmp_key = xattr_cmp_key, + .cmp_bkey = xattr_cmp_bkey, +}; + +static const char *bch2_xattr_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_XATTR: + return bkey_val_bytes(k.k) < sizeof(struct bch_xattr) + ? "value too small" + : NULL; + + case BCH_XATTR_WHITEOUT: + return bkey_val_bytes(k.k) != 0 + ? "value size should be zero" + : NULL; + + default: + return "invalid type"; + } +} + +static void bch2_xattr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_xattr xattr; + int n; + + switch (k.k->type) { + case BCH_XATTR: + xattr = bkey_s_c_to_xattr(k); + + if (size) { + n = min_t(unsigned, size, xattr.v->x_name_len); + memcpy(buf, xattr.v->x_name, n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + n = scnprintf(buf, size, " -> "); + buf += n; + size -= n; + + if (size) { + n = min_t(unsigned, size, + le16_to_cpu(xattr.v->x_val_len)); + memcpy(buf, xattr_val(xattr.v), n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + break; + case BCH_XATTR_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +const struct bkey_ops bch2_bkey_xattr_ops = { + .key_invalid = bch2_xattr_invalid, + .val_to_text = bch2_xattr_to_text, +}; + +int bch2_xattr_get(struct bch_fs *c, struct inode *inode, + const char *name, void *buffer, size_t size, int type) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_xattr xattr; + int ret; + + k = bch2_hash_lookup(xattr_hash_desc, &ei->str_hash, c, + ei->vfs_inode.i_ino, &iter, + &X_SEARCH(type, name, strlen(name))); + if (IS_ERR(k.k)) + return bch2_btree_iter_unlock(&iter) ?: -ENODATA; + + xattr = bkey_s_c_to_xattr(k); + ret = le16_to_cpu(xattr.v->x_val_len); + if (buffer) { + if (ret > size) + ret = -ERANGE; + else + memcpy(buffer, xattr_val(xattr.v), ret); + } + + bch2_btree_iter_unlock(&iter); + return ret; +} + +int __bch2_xattr_set(struct bch_fs *c, u64 inum, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, + int flags, int type, u64 *journal_seq) +{ + struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); + int ret; + + if (!value) { + ret = bch2_hash_delete(xattr_hash_desc, 
hash_info, + c, inum, + journal_seq, &search); + } else { + struct bkey_i_xattr *xattr; + unsigned u64s = BKEY_U64s + + DIV_ROUND_UP(sizeof(struct bch_xattr) + + search.name.len + size, + sizeof(u64)); + + if (u64s > U8_MAX) + return -ERANGE; + + xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS); + if (!xattr) + return -ENOMEM; + + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = type; + xattr->v.x_name_len = search.name.len; + xattr->v.x_val_len = cpu_to_le16(size); + memcpy(xattr->v.x_name, search.name.name, search.name.len); + memcpy(xattr_val(&xattr->v), value, size); + + ret = bch2_hash_set(xattr_hash_desc, hash_info, c, + inum, journal_seq, + &xattr->k_i, + (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| + (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); + kfree(xattr); + } + + if (ret == -ENOENT) + ret = flags & XATTR_REPLACE ? -ENODATA : 0; + + return ret; +} + +int bch2_xattr_set(struct bch_fs *c, struct inode *inode, + const char *name, const void *value, size_t size, + int flags, int type) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + + return __bch2_xattr_set(c, inode->i_ino, &ei->str_hash, + name, value, size, flags, type, + &ei->journal_seq); +} + +static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); + +static size_t bch2_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + char *buffer, size_t buffer_size) +{ + const struct xattr_handler *handler = + bch2_xattr_type_to_handler(xattr->x_type); + + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + const size_t prefix_len = strlen(prefix); + const size_t total_len = prefix_len + xattr->x_name_len + 1; + + if (buffer && total_len <= buffer_size) { + memcpy(buffer, prefix, prefix_len); + memcpy(buffer + prefix_len, + xattr->x_name, xattr->x_name_len); + buffer[prefix_len + xattr->x_name_len] = '\0'; + } + + return total_len; + } else { + return 0; + } +} + +ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct bch_fs *c = dentry->d_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_xattr *xattr; + u64 inum = dentry->d_inode->i_ino; + ssize_t ret = 0; + size_t len; + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) { + BUG_ON(k.k->p.inode < inum); + + if (k.k->p.inode > inum) + break; + + if (k.k->type != BCH_XATTR) + continue; + + xattr = bkey_s_c_to_xattr(k).v; + + len = bch2_xattr_emit(dentry, xattr, buffer, buffer_size); + if (buffer) { + if (len > buffer_size) { + bch2_btree_iter_unlock(&iter); + return -ERANGE; + } + + buffer += len; + buffer_size -= len; + } + + ret += len; + + } + bch2_btree_iter_unlock(&iter); + + return ret; +} + +static int bch2_xattr_get_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct bch_fs *c = inode->i_sb->s_fs_info; + + return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); +} + +static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_fs *c = inode->i_sb->s_fs_info; + + return bch2_xattr_set(c, inode, name, value, size, flags, + handler->flags); +} + +static const struct xattr_handler bch_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags 
= BCH_XATTR_INDEX_USER,
+};
+
+static bool bch2_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= bch2_xattr_trusted_list,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= BCH_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= BCH_XATTR_INDEX_SECURITY,
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+	[BCH_XATTR_INDEX_USER]			= &bch_xattr_user_handler,
+	[BCH_XATTR_INDEX_POSIX_ACL_ACCESS]	=
+		&posix_acl_access_xattr_handler,
+	[BCH_XATTR_INDEX_POSIX_ACL_DEFAULT]	=
+		&posix_acl_default_xattr_handler,
+	[BCH_XATTR_INDEX_TRUSTED]		= &bch_xattr_trusted_handler,
+	[BCH_XATTR_INDEX_SECURITY]		= &bch_xattr_security_handler,
+};
+
+const struct xattr_handler *bch2_xattr_handlers[] = {
+	&bch_xattr_user_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+	&bch_xattr_trusted_handler,
+	&bch_xattr_security_handler,
+	NULL
+};
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
+{
+	return type < ARRAY_SIZE(bch_xattr_handler_map)
+		? bch_xattr_handler_map[type]
+		: NULL;
+} diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h new file mode 100644 index 00000000..14eba241 --- /dev/null +++ b/libbcachefs/xattr.h @@ -0,0 +1,20 @@ +#ifndef _BCACHE_XATTR_H
+#define _BCACHE_XATTR_H
+
+extern const struct bkey_ops bch2_bkey_xattr_ops;
+
+struct dentry;
+struct xattr_handler;
+struct bch_hash_info;
+
+int bch2_xattr_get(struct bch_fs *, struct inode *,
+		   const char *, void *, size_t, int);
+int __bch2_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *,
+		     const char *, const void *, size_t, int, int, u64 *);
+int bch2_xattr_set(struct bch_fs *, struct inode *,
+		   const char *, const void *, size_t, int, int);
+ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch2_xattr_handlers[];
+
+#endif /* _BCACHE_XATTR_H */
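
A worked illustration of fract_exp_two() from util.h above: the helper is pure integer math, so it can be exercised stand-alone. The sketch below is not part of the tree - main() and the loop bounds are illustrative - and the body of fract_exp_two() is copied from the header:

	#include <stdio.h>

	/* copied from libbcachefs/util.h: linear interpolation between powers of two */
	static unsigned fract_exp_two(unsigned x, unsigned fract_bits)
	{
		unsigned fract = x & ~(~0 << fract_bits);

		x >>= fract_bits;
		x = 1 << x;
		x += (x * fract) >> fract_bits;

		return x;
	}

	int main(void)
	{
		unsigned x;

		/*
		 * With 3 fractional bits, inputs 24..32 walk linearly from
		 * 2^3 = 8 up to 2^4 = 16: 8, 9, 10, ..., 16.
		 */
		for (x = 24; x <= 32; x++)
			printf("fract_exp_two(%u, 3) = %u\n",
			       x, fract_exp_two(x, 3));

		return 0;
	}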