Diffstat (limited to 'libbcache')
109 files changed, 0 insertions, 47946 deletions
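The files removed below (notably libbcache/alloc.c and libbcache/bcache.h) document bcache's bucket/generation scheme in their header comments: every bucket carries an 8-bit generation, every btree pointer embeds the generation it was allocated against, a pointer is only considered valid while the two match, and a bucket is reclaimed simply by bumping its gen (and persisting the new gen before reuse). As a rough, self-contained C sketch of that idea only - the struct and function names (bucket_sketch, ptr_sketch, ptr_is_stale, invalidate_bucket) are hypothetical and are not the helpers from this tree:

#include <stdint.h>
#include <stdbool.h>

/* Simplified stand-ins for the structures described in the deleted comments:
 * a bucket's current generation, and a pointer that recorded the generation
 * at allocation time. */
struct bucket_sketch {
	uint8_t gen;		/* bumped (mod 256) each time the bucket is reused */
};

struct ptr_sketch {
	uint64_t offset;	/* sector offset into the device */
	uint8_t  gen;		/* bucket gen this pointer was created against */
};

/* A pointer is valid only while its embedded gen matches the bucket's current
 * gen; once the allocator increments the bucket's gen, every older pointer
 * into that bucket becomes stale without touching the btree. */
bool ptr_is_stale(const struct bucket_sketch *bucket, const struct ptr_sketch *ptr)
{
	return bucket->gen != ptr->gen;
}

/* Reusing a bucket is just a gen bump; the new gen has to be written out
 * (batched with the prios) before the bucket's data may be overwritten. */
void invalidate_bucket(struct bucket_sketch *bucket)
{
	bucket->gen++;	/* uint8_t arithmetic wraps mod 256 */
}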
diff --git a/libbcache/acl.c b/libbcache/acl.c
deleted file mode 100644
index 4363c57e..00000000
--- a/libbcache/acl.c
+++ /dev/null
@@ -1,225 +0,0 @@
-#include "bcache.h"
-
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *bch_acl_from_disk(const void *value, size_t size)
-{
-	const char *end = (char *)value + size;
-	int n, count;
-	struct posix_acl *acl;
-
-	if (!value)
-		return NULL;
-	if (size < sizeof(bch_acl_header))
-		return ERR_PTR(-EINVAL);
-	if (((bch_acl_header *)value)->a_version !=
-	    cpu_to_le32(BCH_ACL_VERSION))
-		return ERR_PTR(-EINVAL);
-	value = (char *)value + sizeof(bch_acl_header);
-	count = bch_acl_count(size);
-	if (count < 0)
-		return ERR_PTR(-EINVAL);
-	if (count == 0)
-		return NULL;
-	acl = posix_acl_alloc(count, GFP_KERNEL);
-	if (!acl)
-		return ERR_PTR(-ENOMEM);
-	for (n = 0; n < count; n++) {
-		bch_acl_entry *entry =
-			(bch_acl_entry *)value;
-		if ((char *)value + sizeof(bch_acl_entry_short) > end)
-			goto fail;
-		acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
-		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		switch (acl->a_entries[n].e_tag) {
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			value = (char *)value +
-				sizeof(bch_acl_entry_short);
-			break;
-
-		case ACL_USER:
-			value = (char *)value + sizeof(bch_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_uid =
-				make_kuid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
-			break;
-		case ACL_GROUP:
-			value = (char *)value + sizeof(bch_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_gid =
-				make_kgid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
-			break;
-
-		default:
-			goto fail;
-		}
-	}
-	if (value != end)
-		goto fail;
-	return acl;
-
-fail:
-	posix_acl_release(acl);
-	return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */ -static void *bch_acl_to_disk(const struct posix_acl *acl, size_t *size) -{ - bch_acl_header *ext_acl; - char *e; - size_t n; - - *size = bch_acl_size(acl->a_count); - ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count * - sizeof(bch_acl_entry), GFP_KERNEL); - if (!ext_acl) - return ERR_PTR(-ENOMEM); - ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION); - e = (char *)ext_acl + sizeof(bch_acl_header); - for (n = 0; n < acl->a_count; n++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - bch_acl_entry *entry = (bch_acl_entry *)e; - - entry->e_tag = cpu_to_le16(acl_e->e_tag); - entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch (acl_e->e_tag) { - case ACL_USER: - entry->e_id = cpu_to_le32( - from_kuid(&init_user_ns, acl_e->e_uid)); - e += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - entry->e_id = cpu_to_le32( - from_kgid(&init_user_ns, acl_e->e_gid)); - e += sizeof(bch_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(bch_acl_entry_short); - break; - - default: - goto fail; - } - } - return (char *)ext_acl; - -fail: - kfree(ext_acl); - return ERR_PTR(-EINVAL); -} - -struct posix_acl *bch_get_acl(struct inode *inode, int type) -{ - struct bch_fs *c = inode->i_sb->s_fs_info; - int name_index; - char *value = NULL; - struct posix_acl *acl; - int ret; - - switch (type) { - case ACL_TYPE_ACCESS: - name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - default: - BUG(); - } - ret = bch_xattr_get(c, inode, "", NULL, 0, name_index); - if (ret > 0) { - value = kmalloc(ret, GFP_KERNEL); - if (!value) - return ERR_PTR(-ENOMEM); - ret = bch_xattr_get(c, inode, "", value, - ret, name_index); - } - if (ret > 0) - acl = bch_acl_from_disk(value, ret); - else if (ret == -ENODATA || ret == -ENOSYS) - acl = NULL; - else - acl = ERR_PTR(ret); - kfree(value); - - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - - return acl; -} - -int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - struct bch_fs *c = inode->i_sb->s_fs_info; - int name_index; - void *value = NULL; - size_t size = 0; - int ret; - - switch (type) { - case ACL_TYPE_ACCESS: - name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - ret = posix_acl_equiv_mode(acl, &inode->i_mode); - if (ret < 0) - return ret; - else { - inode->i_ctime = current_fs_time(inode->i_sb); - mark_inode_dirty(inode); - if (ret == 0) - acl = NULL; - } - } - break; - - case ACL_TYPE_DEFAULT: - name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? 
-EACCES : 0; - break; - - default: - return -EINVAL; - } - - if (acl) { - value = bch_acl_to_disk(acl, &size); - if (IS_ERR(value)) - return (int)PTR_ERR(value); - } - - ret = bch_xattr_set(c, inode, "", value, size, 0, name_index); - - kfree(value); - - if (ret == -ERANGE) - ret = -E2BIG; - - if (!ret) - set_cached_acl(inode, type, acl); - - return ret; -} diff --git a/libbcache/acl.h b/libbcache/acl.h deleted file mode 100644 index 079e5689..00000000 --- a/libbcache/acl.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - File: fs/bch/acl.h - - (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> -*/ - -#include <linux/posix_acl_xattr.h> - -#define BCH_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} bch_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} bch_acl_entry_short; - -typedef struct { - __le32 a_version; -} bch_acl_header; - -static inline size_t bch_acl_size(int count) -{ - if (count <= 4) { - return sizeof(bch_acl_header) + - count * sizeof(bch_acl_entry_short); - } else { - return sizeof(bch_acl_header) + - 4 * sizeof(bch_acl_entry_short) + - (count - 4) * sizeof(bch_acl_entry); - } -} - -static inline int bch_acl_count(size_t size) -{ - ssize_t s; - - size -= sizeof(bch_acl_header); - s = size - 4 * sizeof(bch_acl_entry_short); - if (s < 0) { - if (size % sizeof(bch_acl_entry_short)) - return -1; - return size / sizeof(bch_acl_entry_short); - } else { - if (s % sizeof(bch_acl_entry)) - return -1; - return s / sizeof(bch_acl_entry) + 4; - } -} - -extern struct posix_acl *bch_get_acl(struct inode *, int); -extern int bch_set_acl(struct inode *, struct posix_acl *, int); diff --git a/libbcache/alloc.c b/libbcache/alloc.c deleted file mode 100644 index 2f892914..00000000 --- a/libbcache/alloc.c +++ /dev/null @@ -1,1913 +0,0 @@ -/* - * Primary bucket allocation code - * - * Copyright 2012 Google, Inc. - * - * Allocation in bcache is done in terms of buckets: - * - * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in - * btree pointers - they must match for the pointer to be considered valid. - * - * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a - * bucket simply by incrementing its gen. - * - * The gens (along with the priorities; it's really the gens are important but - * the code is named as if it's the priorities) are written in an arbitrary list - * of buckets on disk, with a pointer to them in the journal header. - * - * When we invalidate a bucket, we have to write its new gen to disk and wait - * for that write to complete before we use it - otherwise after a crash we - * could have pointers that appeared to be good but pointed to data that had - * been overwritten. - * - * Since the gens and priorities are all stored contiguously on disk, we can - * batch this up: We fill up the free_inc list with freshly invalidated buckets, - * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. - * - * free_inc isn't the only freelist - if it was, we'd often have to sleep while - * priorities and gens were being written before we could allocate. c->free is a - * smaller freelist, and buckets on that list are always ready to be used. - * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * - * It's important to ensure that gens don't wrap around - with respect to - * either the oldest gen in the btree or the gen on disk. 
This is quite - * difficult to do in practice, but we explicitly guard against it anyways - if - * a bucket is in danger of wrapping around we simply skip invalidating it that - * time around, and we garbage collect or rewrite the priorities sooner than we - * would have otherwise. - * - * bch_bucket_alloc() allocates a single bucket from a specific device. - * - * bch_bucket_alloc_set() allocates one or more buckets from different devices - * in a given filesystem. - * - * invalidate_buckets() drives all the processes described above. It's called - * from bch_bucket_alloc() and a few other places that need to make sure free - * buckets are ready. - * - * invalidate_buckets_(lru|fifo)() find buckets that are available to be - * invalidated, and then invalidate them and stick them on the free_inc list - - * in either lru or fifo order. - */ - -#include "bcache.h" -#include "alloc.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "journal.h" -#include "super-io.h" - -#include <linux/blkdev.h> -#include <linux/kthread.h> -#include <linux/math64.h> -#include <linux/random.h> -#include <linux/rcupdate.h> -#include <trace/events/bcache.h> - -static void __bch_bucket_free(struct bch_dev *, struct bucket *); -static void bch_recalc_min_prio(struct bch_dev *, int); - -/* Allocation groups: */ - -void bch_dev_group_remove(struct dev_group *grp, struct bch_dev *ca) -{ - unsigned i; - - spin_lock(&grp->lock); - - for (i = 0; i < grp->nr; i++) - if (grp->d[i].dev == ca) { - grp->nr--; - memmove(&grp->d[i], - &grp->d[i + 1], - (grp->nr- i) * sizeof(grp->d[0])); - break; - } - - spin_unlock(&grp->lock); -} - -void bch_dev_group_add(struct dev_group *grp, struct bch_dev *ca) -{ - unsigned i; - - spin_lock(&grp->lock); - for (i = 0; i < grp->nr; i++) - if (grp->d[i].dev == ca) - goto out; - - BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX); - - grp->d[grp->nr++].dev = ca; -out: - spin_unlock(&grp->lock); -} - -/* Ratelimiting/PD controllers */ - -static void pd_controllers_update(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, - pd_controllers_update); - struct bch_dev *ca; - unsigned i, iter; - - /* All units are in bytes */ - u64 faster_tiers_size = 0; - u64 faster_tiers_dirty = 0; - - u64 fastest_tier_size = 0; - u64 fastest_tier_free = 0; - u64 copygc_can_free = 0; - - rcu_read_lock(); - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { - bch_pd_controller_update(&c->tiers[i].pd, - div_u64(faster_tiers_size * - c->tiering_percent, 100), - faster_tiers_dirty, - -1); - - spin_lock(&c->tiers[i].devs.lock); - group_for_each_dev(ca, &c->tiers[i].devs, iter) { - struct bch_dev_usage stats = bch_dev_usage_read(ca); - unsigned bucket_bits = ca->bucket_bits + 9; - - u64 size = (ca->mi.nbuckets - - ca->mi.first_bucket) << bucket_bits; - u64 dirty = stats.buckets_dirty << bucket_bits; - u64 free = __dev_buckets_free(ca, stats) << bucket_bits; - /* - * Bytes of internal fragmentation, which can be - * reclaimed by copy GC - */ - s64 fragmented = ((stats.buckets_dirty + - stats.buckets_cached) << - bucket_bits) - - ((stats.sectors[S_DIRTY] + - stats.sectors[S_CACHED] ) << 9); - - fragmented = max(0LL, fragmented); - - bch_pd_controller_update(&ca->moving_gc_pd, - free, fragmented, -1); - - faster_tiers_size += size; - faster_tiers_dirty += dirty; - - if (!c->fastest_tier || - c->fastest_tier == &c->tiers[i]) { - fastest_tier_size += size; - 
fastest_tier_free += free; - } - - copygc_can_free += fragmented; - } - spin_unlock(&c->tiers[i].devs.lock); - } - - rcu_read_unlock(); - - /* - * Throttle foreground writes if tier 0 is running out of free buckets, - * and either tiering or copygc can free up space. - * - * Target will be small if there isn't any work to do - we don't want to - * throttle foreground writes if we currently have all the free space - * we're ever going to have. - * - * Otherwise, if there's work to do, try to keep 20% of tier0 available - * for foreground writes. - */ - if (c->fastest_tier) - copygc_can_free = U64_MAX; - - bch_pd_controller_update(&c->foreground_write_pd, - min(copygc_can_free, - div_u64(fastest_tier_size * - c->foreground_target_percent, - 100)), - fastest_tier_free, - -1); - - schedule_delayed_work(&c->pd_controllers_update, - c->pd_controllers_update_seconds * HZ); -} - -/* - * Bucket priorities/gens: - * - * For each bucket, we store on disk its - * 8 bit gen - * 16 bit priority - * - * See alloc.c for an explanation of the gen. The priority is used to implement - * lru (and in the future other) cache replacement policies; for most purposes - * it's just an opaque integer. - * - * The gens and the priorities don't have a whole lot to do with each other, and - * it's actually the gens that must be written out at specific times - it's no - * big deal if the priorities don't get written, if we lose them we just reuse - * buckets in suboptimal order. - * - * On disk they're stored in a packed array, and in as many buckets are required - * to fit them all. The buckets we use to store them form a list; the journal - * header points to the first bucket, the first bucket points to the second - * bucket, et cetera. - * - * This code is used by the allocation code; periodically (whenever it runs out - * of buckets to allocate from) the allocation code will invalidate some - * buckets, but it can't use those buckets until their new gens are safely on - * disk. 
- */ - -static int prio_io(struct bch_dev *ca, uint64_t bucket, int op) -{ - bio_init(ca->bio_prio); - bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META); - - ca->bio_prio->bi_max_vecs = bucket_pages(ca); - ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs; - ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size; - ca->bio_prio->bi_bdev = ca->disk_sb.bdev; - ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca); - bch_bio_map(ca->bio_prio, ca->disk_buckets); - - return submit_bio_wait(ca->bio_prio); -} - -static struct nonce prio_nonce(struct prio_set *p) -{ - return (struct nonce) {{ - [0] = 0, - [1] = p->nonce[0], - [2] = p->nonce[1], - [3] = p->nonce[2]^BCH_NONCE_PRIO, - }}; -} - -static int bch_prio_write(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct journal *j = &c->journal; - struct journal_res res = { 0 }; - bool need_new_journal_entry; - int i, ret; - - if (c->opts.nochanges) - return 0; - - trace_bcache_prio_write_start(ca); - - atomic64_add(ca->mi.bucket_size * prio_buckets(ca), - &ca->meta_sectors_written); - - for (i = prio_buckets(ca) - 1; i >= 0; --i) { - struct bucket *g; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data; - struct bucket_disk *end = d + prios_per_bucket(ca); - size_t r; - - for (r = i * prios_per_bucket(ca); - r < ca->mi.nbuckets && d < end; - r++, d++) { - g = ca->buckets + r; - d->read_prio = cpu_to_le16(g->read_prio); - d->write_prio = cpu_to_le16(g->write_prio); - d->gen = ca->buckets[r].mark.gen; - } - - p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]); - p->magic = cpu_to_le64(pset_magic(c)); - get_random_bytes(&p->nonce, sizeof(p->nonce)); - - spin_lock(&ca->prio_buckets_lock); - r = bch_bucket_alloc(ca, RESERVE_PRIO); - BUG_ON(!r); - - /* - * goes here before dropping prio_buckets_lock to guard against - * it getting gc'd from under us - */ - ca->prio_buckets[i] = r; - bch_mark_metadata_bucket(ca, ca->buckets + r, - BUCKET_PRIOS, false); - spin_unlock(&ca->prio_buckets_lock); - - SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c)); - - bch_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - p->csum = bch_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - - ret = prio_io(ca, r, REQ_OP_WRITE); - if (bch_dev_fatal_io_err_on(ret, ca, - "prio write to bucket %zu", r) || - bch_meta_write_fault("prio")) - return ret; - } - - spin_lock(&j->lock); - j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]); - j->nr_prio_buckets = max_t(unsigned, - ca->dev_idx + 1, - j->nr_prio_buckets); - spin_unlock(&j->lock); - - do { - unsigned u64s = jset_u64s(0); - - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) - break; - - ret = bch_journal_res_get(j, &res, u64s, u64s); - if (ret) - return ret; - - need_new_journal_entry = j->buf[res.idx].nr_prio_buckets < - ca->dev_idx + 1; - bch_journal_res_put(j, &res); - - ret = bch_journal_flush_seq(j, res.seq); - if (ret) - return ret; - } while (need_new_journal_entry); - - /* - * Don't want the old priorities to get garbage collected until after we - * finish writing the new ones, and they're journalled - */ - - spin_lock(&ca->prio_buckets_lock); - - for (i = 0; i < prio_buckets(ca); i++) { - if (ca->prio_last_buckets[i]) - __bch_bucket_free(ca, - &ca->buckets[ca->prio_last_buckets[i]]); - - ca->prio_last_buckets[i] = ca->prio_buckets[i]; - } - - spin_unlock(&ca->prio_buckets_lock); - - 
trace_bcache_prio_write_end(ca); - return 0; -} - -int bch_prio_read(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; - struct bucket_mark new; - struct bch_csum csum; - unsigned bucket_nr = 0; - u64 bucket, expect, got; - size_t b; - int ret = 0; - - spin_lock(&c->journal.lock); - bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]); - spin_unlock(&c->journal.lock); - - /* - * If the device hasn't been used yet, there won't be a prio bucket ptr - */ - if (!bucket) - return 0; - - unfixable_fsck_err_on(bucket < ca->mi.first_bucket || - bucket >= ca->mi.nbuckets, c, - "bad prio bucket %llu", bucket); - - for (b = 0; b < ca->mi.nbuckets; b++, d++) { - if (d == end) { - ca->prio_last_buckets[bucket_nr] = bucket; - bucket_nr++; - - ret = prio_io(ca, bucket, REQ_OP_READ); - if (bch_dev_fatal_io_err_on(ret, ca, - "prior read from bucket %llu", - bucket) || - bch_meta_read_fault("prio")) - return -EIO; - - got = le64_to_cpu(p->magic); - expect = pset_magic(c); - unfixable_fsck_err_on(got != expect, c, - "bad magic (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket); - - unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c, - "prio bucket with unknown csum type %llu bucket %lluu", - PSET_CSUM_TYPE(p), bucket); - - csum = bch_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - unfixable_fsck_err_on(bch_crc_cmp(csum, p->csum), c, - "bad checksum reading prios from bucket %llu", - bucket); - - bch_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - bucket = le64_to_cpu(p->next_bucket); - d = p->data; - } - - ca->buckets[b].read_prio = le16_to_cpu(d->read_prio); - ca->buckets[b].write_prio = le16_to_cpu(d->write_prio); - - bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen); - } - - mutex_lock(&c->bucket_lock); - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - mutex_unlock(&c->bucket_lock); - - ret = 0; -fsck_err: - return ret; -} - -#define BUCKET_GC_GEN_MAX 96U - -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. 
- */ -static int wait_buckets_available(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = -1; - break; - } - - if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) { - if (c->gc_thread) { - trace_bcache_gc_cannot_inc_gens(ca->fs); - atomic_inc(&c->kick_gc); - wake_up_process(ca->fs->gc_thread); - } - - /* - * We are going to wait for GC to wake us up, even if - * bucket counters tell us enough buckets are available, - * because we are actually waiting for GC to rewrite - * nodes with stale pointers - */ - } else if (dev_buckets_available(ca) >= - fifo_free(&ca->free_inc)) - break; - - up_read(&ca->fs->gc_lock); - schedule(); - try_to_freeze(); - down_read(&ca->fs->gc_lock); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) -{ - if (expensive_debug_checks(ca->fs)) { - size_t iter; - long i; - unsigned j; - - for (iter = 0; iter < prio_buckets(ca) * 2; iter++) - BUG_ON(ca->prio_buckets[iter] == bucket); - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} - -/* Bucket heap / gen */ - -void bch_recalc_min_prio(struct bch_dev *ca, int rw) -{ - struct bch_fs *c = ca->fs; - struct prio_clock *clock = &c->prio_clock[rw]; - struct bucket *g; - u16 max_delta = 1; - unsigned i; - - lockdep_assert_held(&c->bucket_lock); - - /* Determine min prio for this particular cache */ - for_each_bucket(g, ca) - max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); - - ca->min_prio[rw] = clock->hand - max_delta; - - /* - * This may possibly increase the min prio for the whole cache, check - * that as well. 
- */ - max_delta = 1; - - for_each_member_device(ca, c, i) - max_delta = max(max_delta, - (u16) (clock->hand - ca->min_prio[rw])); - - clock->min_prio = clock->hand - max_delta; -} - -static void bch_rescale_prios(struct bch_fs *c, int rw) -{ - struct prio_clock *clock = &c->prio_clock[rw]; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_bcache_rescale_prios(c); - - for_each_member_device(ca, c, i) { - for_each_bucket(g, ca) - g->prio[rw] = clock->hand - - (clock->hand - g->prio[rw]) / 2; - - bch_recalc_min_prio(ca, rw); - } -} - -static void bch_inc_clock_hand(struct io_timer *timer) -{ - struct prio_clock *clock = container_of(timer, - struct prio_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, prio_clock[clock->rw]); - u64 capacity; - - mutex_lock(&c->bucket_lock); - - clock->hand++; - - /* if clock cannot be advanced more, rescale prio */ - if (clock->hand == (u16) (clock->min_prio - 1)) - bch_rescale_prios(c, clock->rw); - - mutex_unlock(&c->bucket_lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += capacity >> 10; - - bch_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch_prio_timer_init(struct bch_fs *c, int rw) -{ - struct prio_clock *clock = &c->prio_clock[rw]; - struct io_timer *timer = &clock->rescale; - - clock->rw = rw; - timer->fn = bch_inc_clock_hand; - timer->expire = c->capacity >> 10; -} - -/* - * Background allocation thread: scans for buckets to be invalidated, - * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. - */ - -static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g) -{ - return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX; -} - -static bool bch_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g) -{ - if (!is_available_bucket(READ_ONCE(g->mark))) - return false; - - if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1) - ca->inc_gen_needs_gc++; - - return can_inc_bucket_gen(ca, g); -} - -static void bch_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) -{ - spin_lock(&ca->freelist_lock); - - bch_invalidate_bucket(ca, g); - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; - - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - - spin_unlock(&ca->freelist_lock); -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - The difference between the bucket's current gen and oldest gen of any - * pointer into it, which gives us an indication of the cost of an eventual - * btree GC to rewrite nodes with stale pointers. 
- */ - -#define bucket_sort_key(g) \ -({ \ - unsigned long prio = g->read_prio - ca->min_prio[READ]; \ - prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - \ - ca->min_prio[READ]); \ - \ - (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\ -}) - -static void invalidate_buckets_lru(struct bch_dev *ca) -{ - struct bucket_heap_entry e; - struct bucket *g; - unsigned i; - - mutex_lock(&ca->heap_lock); - - ca->heap.used = 0; - - mutex_lock(&ca->fs->bucket_lock); - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for_each_bucket(g, ca) { - if (!bch_can_invalidate_bucket(ca, g)) - continue; - - bucket_heap_push(ca, g, bucket_sort_key(g)); - } - - /* Sort buckets by physical location on disk for better locality */ - for (i = 0; i < ca->heap.used; i++) { - struct bucket_heap_entry *e = &ca->heap.data[i]; - - e->val = e->g - ca->buckets; - } - - heap_resort(&ca->heap, bucket_max_cmp); - - /* - * If we run out of buckets to invalidate, bch_allocator_thread() will - * kick stuff and retry us - */ - while (!fifo_full(&ca->free_inc) && - heap_pop(&ca->heap, e, bucket_max_cmp)) { - BUG_ON(!bch_can_invalidate_bucket(ca, e.g)); - bch_invalidate_one_bucket(ca, e.g); - } - - mutex_unlock(&ca->fs->bucket_lock); - mutex_unlock(&ca->heap_lock); -} - -static void invalidate_buckets_fifo(struct bch_dev *ca) -{ - struct bucket *g; - size_t checked = 0; - - while (!fifo_full(&ca->free_inc)) { - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - g = ca->buckets + ca->fifo_last_bucket++; - - if (bch_can_invalidate_bucket(ca, g)) - bch_invalidate_one_bucket(ca, g); - - if (++checked >= ca->mi.nbuckets) - return; - } -} - -static void invalidate_buckets_random(struct bch_dev *ca) -{ - struct bucket *g; - size_t checked = 0; - - while (!fifo_full(&ca->free_inc)) { - size_t n = bch_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - g = ca->buckets + n; - - if (bch_can_invalidate_bucket(ca, g)) - bch_invalidate_one_bucket(ca, g); - - if (++checked >= ca->mi.nbuckets / 2) - return; - } -} - -static void invalidate_buckets(struct bch_dev *ca) -{ - ca->inc_gen_needs_gc = 0; - - switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(ca); - break; - case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(ca); - break; - case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(ca); - break; - } -} - -static bool __bch_allocator_push(struct bch_dev *ca, long bucket) -{ - if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_BTREE], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_NONE], bucket)) - goto success; - - return false; -success: - closure_wake_up(&ca->fs->freelist_wait); - return true; -} - -static bool bch_allocator_push(struct bch_dev *ca, long bucket) -{ - bool ret; - - spin_lock(&ca->freelist_lock); - ret = __bch_allocator_push(ca, bucket); - if (ret) - fifo_pop(&ca->free_inc, bucket); - spin_unlock(&ca->freelist_lock); - - return ret; -} - -static void bch_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - u16 last_seq_ondisk = c->journal.last_seq_ondisk; - struct bucket *g; - - for_each_bucket(g, 
ca) { - struct bucket_mark m = READ_ONCE(g->mark); - - if (is_available_bucket(m) && - !m.cached_sectors && - !m.had_metadata && - !bucket_needs_journal_commit(m, last_seq_ondisk)) { - spin_lock(&ca->freelist_lock); - - bch_mark_alloc_bucket(ca, g, true); - g->read_prio = c->prio_clock[READ].hand; - g->write_prio = c->prio_clock[WRITE].hand; - - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - - spin_unlock(&ca->freelist_lock); - - if (fifo_full(&ca->free_inc)) - break; - } - } -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by invalidate_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. - */ -static int bch_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - int ret; - - set_freezable(); - - bch_find_empty_buckets(c, ca); - - while (1) { - /* - * First, we pull buckets off of the free_inc list, possibly - * issue discards to them, then we add the bucket to a - * free list: - */ - - while (!fifo_empty(&ca->free_inc)) { - long bucket = fifo_peek(&ca->free_inc); - - /* - * Don't remove from free_inc until after it's added - * to freelist, so gc doesn't miss it while we've - * dropped bucket lock - */ - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (bch_allocator_push(ca, bucket)) - break; - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - goto out; - } - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - } - - down_read(&c->gc_lock); - - /* - * See if we have buckets we can reuse without invalidating them - * or forcing a journal commit: - */ - //bch_find_empty_buckets(c, ca); - - if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { - up_read(&c->gc_lock); - continue; - } - - /* We've run out of free buckets! */ - - while (!fifo_full(&ca->free_inc)) { - if (wait_buckets_available(ca)) { - up_read(&c->gc_lock); - goto out; - } - - /* - * Find some buckets that we can invalidate, either - * they're completely unused, or only contain clean data - * that's been written back to the backing device or - * another cache tier - */ - - invalidate_buckets(ca); - trace_bcache_alloc_batch(ca, fifo_used(&ca->free_inc), - ca->free_inc.size); - } - - up_read(&c->gc_lock); - - /* - * free_inc is full of newly-invalidated buckets, must write out - * prios and gens before they can be re-used - */ - ret = bch_prio_write(ca); - if (ret) { - /* - * Emergency read only - allocator thread has to - * shutdown. - * - * N.B. we better be going into RO mode, else - * allocations would hang indefinitely - whatever - * generated the error will have sent us into RO mode. 
- * - * Clear out the free_inc freelist so things are - * consistent-ish: - */ - spin_lock(&ca->freelist_lock); - while (!fifo_empty(&ca->free_inc)) { - long bucket; - - fifo_pop(&ca->free_inc, bucket); - bch_mark_free_bucket(ca, ca->buckets + bucket); - } - spin_unlock(&ca->freelist_lock); - goto out; - } - } -out: - /* - * Avoid a race with bch_usage_update() trying to wake us up after - * we've exited: - */ - synchronize_rcu(); - return 0; -} - -/* Allocation */ - -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -size_t bch_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve) -{ - struct bucket *g; - long r; - - spin_lock(&ca->freelist_lock); - if (fifo_pop(&ca->free[RESERVE_NONE], r) || - fifo_pop(&ca->free[reserve], r)) - goto out; - - spin_unlock(&ca->freelist_lock); - - trace_bcache_bucket_alloc_fail(ca, reserve); - return 0; -out: - verify_not_on_freelist(ca, r); - spin_unlock(&ca->freelist_lock); - - trace_bcache_bucket_alloc(ca, reserve); - - bch_wake_allocator(ca); - - g = ca->buckets + r; - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; - - return r; -} - -static void __bch_bucket_free(struct bch_dev *ca, struct bucket *g) -{ - bch_mark_free_bucket(ca, g); - - g->read_prio = ca->fs->prio_clock[READ].hand; - g->write_prio = ca->fs->prio_clock[WRITE].hand; -} - -enum bucket_alloc_ret { - ALLOC_SUCCESS, - NO_DEVICES, /* -EROFS */ - FREELIST_EMPTY, /* Allocator thread not keeping up */ -}; - -static void recalc_alloc_group_weights(struct bch_fs *c, - struct dev_group *devs) -{ - struct bch_dev *ca; - u64 available_buckets = 1; /* avoid a divide by zero... */ - unsigned i; - - for (i = 0; i < devs->nr; i++) { - ca = devs->d[i].dev; - - devs->d[i].weight = dev_buckets_free(ca); - available_buckets += devs->d[i].weight; - } - - for (i = 0; i < devs->nr; i++) { - const unsigned min_weight = U32_MAX >> 4; - const unsigned max_weight = U32_MAX; - - devs->d[i].weight = - min_weight + - div64_u64(devs->d[i].weight * - devs->nr * - (max_weight - min_weight), - available_buckets); - devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); - } -} - -static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c, - struct open_bucket *ob, - enum alloc_reserve reserve, - unsigned nr_replicas, - struct dev_group *devs, - long *devs_used) -{ - enum bucket_alloc_ret ret; - unsigned fail_idx = -1, i; - unsigned available = 0; - - BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); - - if (ob->nr_ptrs >= nr_replicas) - return ALLOC_SUCCESS; - - spin_lock(&devs->lock); - - for (i = 0; i < devs->nr; i++) - available += !test_bit(devs->d[i].dev->dev_idx, - devs_used); - - recalc_alloc_group_weights(c, devs); - - i = devs->cur_device; - - while (ob->nr_ptrs < nr_replicas) { - struct bch_dev *ca; - u64 bucket; - - if (!available) { - ret = NO_DEVICES; - goto err; - } - - i++; - i %= devs->nr; - - ret = FREELIST_EMPTY; - if (i == fail_idx) - goto err; - - ca = devs->d[i].dev; - - if (test_bit(ca->dev_idx, devs_used)) - continue; - - if (fail_idx == -1 && - get_random_int() > devs->d[i].weight) - continue; - - bucket = bch_bucket_alloc(ca, reserve); - if (!bucket) { - if (fail_idx == -1) - fail_idx = i; - continue; - } - - /* - * open_bucket_add_buckets expects new pointers at the head of - * the list: - */ - memmove(&ob->ptrs[1], - &ob->ptrs[0], - ob->nr_ptrs * sizeof(ob->ptrs[0])); - memmove(&ob->ptr_offset[1], - &ob->ptr_offset[0], - ob->nr_ptrs * 
sizeof(ob->ptr_offset[0])); - ob->nr_ptrs++; - ob->ptrs[0] = (struct bch_extent_ptr) { - .gen = ca->buckets[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), - .dev = ca->dev_idx, - }; - ob->ptr_offset[0] = 0; - - __set_bit(ca->dev_idx, devs_used); - available--; - devs->cur_device = i; - } - - ret = ALLOC_SUCCESS; -err: - EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); - spin_unlock(&devs->lock); - return ret; -} - -static enum bucket_alloc_ret __bch_bucket_alloc_set(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob, - unsigned nr_replicas, - enum alloc_reserve reserve, - long *devs_used) -{ - struct bch_tier *tier; - /* - * this should implement policy - for a given type of allocation, decide - * which devices to allocate from: - * - * XXX: switch off wp->type and do something more intelligent here - */ - if (wp->group) - return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - wp->group, devs_used); - - /* foreground writes: prefer fastest tier: */ - tier = READ_ONCE(c->fastest_tier); - if (tier) - bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &tier->devs, devs_used); - - return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &c->all_devs, devs_used); -} - -static int bch_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob, unsigned nr_replicas, - enum alloc_reserve reserve, long *devs_used, - struct closure *cl) -{ - bool waiting = false; - - while (1) { - switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs_used)) { - case ALLOC_SUCCESS: - if (waiting) - closure_wake_up(&c->freelist_wait); - - return 0; - - case NO_DEVICES: - if (waiting) - closure_wake_up(&c->freelist_wait); - return -EROFS; - - case FREELIST_EMPTY: - if (!cl || waiting) - trace_bcache_freelist_empty_fail(c, - reserve, cl); - - if (!cl) - return -ENOSPC; - - if (waiting) - return -EAGAIN; - - /* Retry allocation after adding ourself to waitlist: */ - closure_wait(&c->freelist_wait, cl); - waiting = true; - break; - default: - BUG(); - } - } -} - -/* Open buckets: */ - -/* - * Open buckets represent one or more buckets (on multiple devices) that are - * currently being allocated from. They serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. 
- */ - -static void __bch_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - const struct bch_extent_ptr *ptr; - - lockdep_assert_held(&c->open_buckets_lock); - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - - bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false); - } - - ob->nr_ptrs = 0; - - list_move(&ob->list, &c->open_buckets_free); - c->open_buckets_nr_free++; - closure_wake_up(&c->open_buckets_wait); -} - -void bch_open_bucket_put(struct bch_fs *c, struct open_bucket *b) -{ - if (atomic_dec_and_test(&b->pin)) { - spin_lock(&c->open_buckets_lock); - __bch_open_bucket_put(c, b); - spin_unlock(&c->open_buckets_lock); - } -} - -static struct open_bucket *bch_open_bucket_get(struct bch_fs *c, - unsigned nr_reserved, - struct closure *cl) -{ - struct open_bucket *ret; - - spin_lock(&c->open_buckets_lock); - - if (c->open_buckets_nr_free > nr_reserved) { - BUG_ON(list_empty(&c->open_buckets_free)); - ret = list_first_entry(&c->open_buckets_free, - struct open_bucket, list); - list_move(&ret->list, &c->open_buckets_open); - BUG_ON(ret->nr_ptrs); - - atomic_set(&ret->pin, 1); /* XXX */ - ret->has_full_ptrs = false; - - c->open_buckets_nr_free--; - trace_bcache_open_bucket_alloc(c, cl); - } else { - trace_bcache_open_bucket_alloc_fail(c, cl); - - if (cl) { - closure_wait(&c->open_buckets_wait, cl); - ret = ERR_PTR(-EAGAIN); - } else - ret = ERR_PTR(-ENOSPC); - } - - spin_unlock(&c->open_buckets_lock); - - return ret; -} - -static unsigned ob_ptr_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - struct bch_extent_ptr *ptr) -{ - struct bch_dev *ca = c->devs[ptr->dev]; - unsigned i = ptr - ob->ptrs; - unsigned bucket_size = ca->mi.bucket_size; - unsigned used = (ptr->offset & (bucket_size - 1)) + - ob->ptr_offset[i]; - - BUG_ON(used > bucket_size); - - return bucket_size - used; -} - -static unsigned open_bucket_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - unsigned nr_replicas) -{ - unsigned i, sectors_free = UINT_MAX; - - for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++) - sectors_free = min(sectors_free, - ob_ptr_sectors_free(c, ob, &ob->ptrs[i])); - - return sectors_free != UINT_MAX ? 
sectors_free : 0; -} - -static void open_bucket_copy_unused_ptrs(struct bch_fs *c, - struct open_bucket *new, - struct open_bucket *old) -{ - unsigned i; - - for (i = 0; i < old->nr_ptrs; i++) - if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) { - struct bch_extent_ptr tmp = old->ptrs[i]; - - tmp.offset += old->ptr_offset[i]; - new->ptrs[new->nr_ptrs] = tmp; - new->ptr_offset[new->nr_ptrs] = 0; - new->nr_ptrs++; - } -} - -static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) -{ -#ifdef CONFIG_BCACHE_DEBUG - const struct bch_extent_ptr *ptr; - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - - BUG_ON(ptr_stale(ca, ptr)); - } -#endif -} - -/* Sector allocator */ - -static struct open_bucket *lock_writepoint(struct bch_fs *c, - struct write_point *wp) -{ - struct open_bucket *ob; - - while ((ob = ACCESS_ONCE(wp->b))) { - mutex_lock(&ob->lock); - if (wp->b == ob) - break; - - mutex_unlock(&ob->lock); - } - - return ob; -} - -static int open_bucket_add_buckets(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; - unsigned i; - int ret; - - /* - * We might be allocating pointers to add to an existing extent - * (tiering/copygc/migration) - if so, some of the pointers in our - * existing open bucket might duplicate devices we already have. This is - * moderately annoying. - */ - - /* Short circuit all the fun stuff if posssible: */ - if (ob->nr_ptrs >= nr_replicas) - return 0; - - memset(devs_used, 0, sizeof(devs_used)); - - for (i = 0; i < ob->nr_ptrs; i++) - __set_bit(ob->ptrs[i].dev, devs_used); - - ret = bch_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs_used, cl); - - if (ret == -EROFS && - ob->nr_ptrs >= nr_replicas_required) - ret = 0; - - return ret; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -struct open_bucket *bch_alloc_sectors_start(struct bch_fs *c, - struct write_point *wp, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct open_bucket *ob; - unsigned open_buckets_reserved = wp == &c->btree_write_point - ? 0 : BTREE_NODE_RESERVE; - int ret; - - BUG_ON(!reserve); - BUG_ON(!nr_replicas); -retry: - ob = lock_writepoint(c, wp); - - /* - * If ob->sectors_free == 0, one or more of the buckets ob points to is - * full. We can't drop pointers from an open bucket - garbage collection - * still needs to find them; instead, we must allocate a new open bucket - * and copy any pointers to non-full buckets into the new open bucket. 
- */ - if (!ob || ob->has_full_ptrs) { - struct open_bucket *new_ob; - - new_ob = bch_open_bucket_get(c, open_buckets_reserved, cl); - if (IS_ERR(new_ob)) - return new_ob; - - mutex_lock(&new_ob->lock); - - /* - * We point the write point at the open_bucket before doing the - * allocation to avoid a race with shutdown: - */ - if (race_fault() || - cmpxchg(&wp->b, ob, new_ob) != ob) { - /* We raced: */ - mutex_unlock(&new_ob->lock); - bch_open_bucket_put(c, new_ob); - - if (ob) - mutex_unlock(&ob->lock); - goto retry; - } - - if (ob) { - open_bucket_copy_unused_ptrs(c, new_ob, ob); - mutex_unlock(&ob->lock); - bch_open_bucket_put(c, ob); - } - - ob = new_ob; - } - - ret = open_bucket_add_buckets(c, wp, ob, nr_replicas, - nr_replicas_required, - reserve, cl); - if (ret) { - mutex_unlock(&ob->lock); - return ERR_PTR(ret); - } - - ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); - - BUG_ON(!ob->sectors_free); - verify_not_stale(c, ob); - - return ob; -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, - unsigned nr_replicas, struct open_bucket *ob, - unsigned sectors) -{ - struct bch_extent_ptr tmp; - bool has_data = false; - unsigned i; - - /* - * We're keeping any existing pointer k has, and appending new pointers: - * __bch_write() will only write to the pointers we add here: - */ - - BUG_ON(sectors > ob->sectors_free); - - /* didn't use all the ptrs: */ - if (nr_replicas < ob->nr_ptrs) - has_data = true; - - for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) { - EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); - - tmp = ob->ptrs[i]; - tmp.cached = bkey_extent_is_cached(&e->k); - tmp.offset += ob->ptr_offset[i]; - extent_ptr_append(e, tmp); - - ob->ptr_offset[i] += sectors; - - this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors); - } -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch_alloc_sectors_done(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob) -{ - bool has_data = false; - unsigned i; - - for (i = 0; i < ob->nr_ptrs; i++) { - if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i])) - ob->has_full_ptrs = true; - else - has_data = true; - } - - if (likely(has_data)) - atomic_inc(&ob->pin); - else - BUG_ON(xchg(&wp->b, NULL) != ob); - - mutex_unlock(&ob->lock); -} - -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates k->size and k->offset (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, k->size indicates how many - * sectors were actually allocated. - * - * Return codes: - * - -EAGAIN: closure was added to waitlist - * - -ENOSPC: out of space and no closure provided - * - * @c - filesystem. - * @wp - write point to use for allocating sectors. - * @k - key to return the allocated space information. 
- * @cl - closure to wait for a bucket - */ -struct open_bucket *bch_alloc_sectors(struct bch_fs *c, - struct write_point *wp, - struct bkey_i_extent *e, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct open_bucket *ob; - - ob = bch_alloc_sectors_start(c, wp, nr_replicas, - nr_replicas_required, - reserve, cl); - if (IS_ERR_OR_NULL(ob)) - return ob; - - if (e->k.size > ob->sectors_free) - bch_key_resize(&e->k, ob->sectors_free); - - bch_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size); - - bch_alloc_sectors_done(c, wp, ob); - - return ob; -} - -/* Startup/shutdown (ro/rw): */ - -void bch_recalc_capacity(struct bch_fs *c) -{ - struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; - struct bch_dev *ca; - u64 total_capacity, capacity = 0, reserved_sectors = 0; - unsigned long ra_pages = 0; - unsigned i, j; - - for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = - blk_get_backing_dev_info(ca->disk_sb.bdev); - - ra_pages += bdi->ra_pages; - } - - c->bdi.ra_pages = ra_pages; - - /* Find fastest, slowest tiers with devices: */ - - for (tier = c->tiers; - tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { - if (!tier->devs.nr) - continue; - if (!fastest_tier) - fastest_tier = tier; - slowest_tier = tier; - } - - c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; - - c->promote_write_point.group = &fastest_tier->devs; - - if (!fastest_tier) - goto set_capacity; - - /* - * Capacity of the filesystem is the capacity of all the devices in the - * slowest (highest) tier - we don't include lower tier devices. - */ - spin_lock(&slowest_tier->devs.lock); - group_for_each_dev(ca, &slowest_tier->devs, i) { - size_t reserve = 0; - - /* - * We need to reserve buckets (from the number - * of currently available buckets) against - * foreground writes so that mainly copygc can - * make forward progress. - * - * We need enough to refill the various reserves - * from scratch - copygc will use its entire - * reserve all at once, then run against when - * its reserve is refilled (from the formerly - * available buckets). - * - * This reserve is just used when considering if - * allocations for foreground writes must wait - - * not -ENOSPC calculations. 
- */ - for (j = 0; j < RESERVE_NONE; j++) - reserve += ca->free[j].size; - - reserve += ca->free_inc.size; - - reserve += ARRAY_SIZE(c->write_points); - - if (ca->mi.tier) - reserve += 1; /* tiering write point */ - reserve += 1; /* btree write point */ - - reserved_sectors += reserve << ca->bucket_bits; - - capacity += (ca->mi.nbuckets - - ca->mi.first_bucket) << - ca->bucket_bits; - } - spin_unlock(&slowest_tier->devs.lock); -set_capacity: - total_capacity = capacity; - - capacity *= (100 - c->opts.gc_reserve_percent); - capacity = div64_u64(capacity, 100); - - BUG_ON(capacity + reserved_sectors > total_capacity); - - c->capacity = capacity; - - if (c->capacity) { - bch_io_timer_add(&c->io_clock[READ], - &c->prio_clock[READ].rescale); - bch_io_timer_add(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); - } else { - bch_io_timer_del(&c->io_clock[READ], - &c->prio_clock[READ].rescale); - bch_io_timer_del(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); - } - - /* Wake up case someone was waiting for buckets */ - closure_wake_up(&c->freelist_wait); -} - -static void bch_stop_write_point(struct bch_dev *ca, - struct write_point *wp) -{ - struct bch_fs *c = ca->fs; - struct open_bucket *ob; - struct bch_extent_ptr *ptr; - - ob = lock_writepoint(c, wp); - if (!ob) - return; - - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) - goto found; - - mutex_unlock(&ob->lock); - return; -found: - BUG_ON(xchg(&wp->b, NULL) != ob); - mutex_unlock(&ob->lock); - - /* Drop writepoint's ref: */ - bch_open_bucket_put(c, ob); -} - -static bool bch_dev_has_open_write_point(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_extent_ptr *ptr; - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) - if (atomic_read(&ob->pin)) { - mutex_lock(&ob->lock); - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) { - mutex_unlock(&ob->lock); - return true; - } - mutex_unlock(&ob->lock); - } - - return false; -} - -/* device goes ro: */ -void bch_dev_allocator_stop(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct task_struct *p; - struct closure cl; - unsigned i; - - closure_init_stack(&cl); - - /* First, remove device from allocation groups: */ - - bch_dev_group_remove(tier, ca); - bch_dev_group_remove(&c->all_devs, ca); - - bch_recalc_capacity(c); - - /* - * Stopping the allocator thread comes after removing from allocation - * groups, else pending allocations will hang: - */ - - p = ca->alloc_thread; - ca->alloc_thread = NULL; - smp_wmb(); - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid a race with bch_usage_update() - - * the allocator thread itself does a synchronize_rcu() on exit. - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - if (p) { - kthread_stop(p); - put_task_struct(p); - } - - /* Next, close write points that point to this device... 
*/ - - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch_stop_write_point(ca, &c->write_points[i]); - - bch_stop_write_point(ca, &ca->copygc_write_point); - bch_stop_write_point(ca, &c->promote_write_point); - bch_stop_write_point(ca, &ca->tiering_write_point); - bch_stop_write_point(ca, &c->migration_write_point); - bch_stop_write_point(ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch_open_bucket_put(c, a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - /* Avoid deadlocks.. */ - - closure_wake_up(&c->freelist_wait); - wake_up(&c->journal.wait); - - /* Now wait for any in flight writes: */ - - while (1) { - closure_wait(&c->open_buckets_wait, &cl); - - if (!bch_dev_has_open_write_point(ca)) { - closure_wake_up(&c->open_buckets_wait); - break; - } - - closure_sync(&cl); - } -} - -/* - * Startup the allocator thread for transition to RW mode: - */ -int bch_dev_allocator_start(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct bch_sb_field_journal *journal_buckets; - bool has_journal; - struct task_struct *k; - - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - k = kthread_create(bch_allocator_thread, ca, "bcache_allocator"); - if (IS_ERR(k)) - return 0; - - get_task_struct(k); - ca->alloc_thread = k; - - bch_dev_group_add(tier, ca); - bch_dev_group_add(&c->all_devs, ca); - - mutex_lock(&c->sb_lock); - journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); - has_journal = bch_nr_journal_buckets(journal_buckets) >= - BCH_JOURNAL_BUCKETS_MIN; - mutex_unlock(&c->sb_lock); - - if (has_journal) - bch_dev_group_add(&c->journal.devs, ca); - - bch_recalc_capacity(c); - - /* - * Don't wake up allocator thread until after adding device to - * allocator groups - otherwise, alloc thread could get a spurious - * -EROFS due to prio_write() -> journal_meta() not finding any devices: - */ - wake_up_process(k); - return 0; -} - -void bch_fs_allocator_init(struct bch_fs *c) -{ - unsigned i; - - INIT_LIST_HEAD(&c->open_buckets_open); - INIT_LIST_HEAD(&c->open_buckets_free); - spin_lock_init(&c->open_buckets_lock); - bch_prio_timer_init(c, READ); - bch_prio_timer_init(c, WRITE); - - /* open bucket 0 is a sentinal NULL: */ - mutex_init(&c->open_buckets[0].lock); - INIT_LIST_HEAD(&c->open_buckets[0].list); - - for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) { - mutex_init(&c->open_buckets[i].lock); - c->open_buckets_nr_free++; - list_add(&c->open_buckets[i].list, &c->open_buckets_free); - } - - spin_lock_init(&c->all_devs.lock); - - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) - spin_lock_init(&c->tiers[i].devs.lock); - - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - c->write_points[i].throttle = true; - - c->pd_controllers_update_seconds = 5; - INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); - - spin_lock_init(&c->foreground_write_pd_lock); - bch_pd_controller_init(&c->foreground_write_pd); - /* - * We do not want the write rate to have an effect on the computed - * rate, for two reasons: - * - * We do not call bch_ratelimit_delay() at all if the write rate - * exceeds 1GB/s. In this case, the PD controller will think we are - * not "keeping up" and not change the rate. 
- */ - c->foreground_write_pd.backpressure = 0; - init_timer(&c->foreground_write_wakeup); - - c->foreground_write_wakeup.data = (unsigned long) c; - c->foreground_write_wakeup.function = bch_wake_delayed_writes; -} diff --git a/libbcache/alloc.h b/libbcache/alloc.h deleted file mode 100644 index f8aa762d..00000000 --- a/libbcache/alloc.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef _BCACHE_ALLOC_H -#define _BCACHE_ALLOC_H - -#include "alloc_types.h" - -struct bkey; -struct bucket; -struct bch_dev; -struct bch_fs; -struct dev_group; - -static inline size_t prios_per_bucket(const struct bch_dev *ca) -{ - return (bucket_bytes(ca) - sizeof(struct prio_set)) / - sizeof(struct bucket_disk); -} - -static inline size_t prio_buckets(const struct bch_dev *ca) -{ - return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca)); -} - -void bch_dev_group_remove(struct dev_group *, struct bch_dev *); -void bch_dev_group_add(struct dev_group *, struct bch_dev *); - -int bch_prio_read(struct bch_dev *); - -size_t bch_bucket_alloc(struct bch_dev *, enum alloc_reserve); - -void bch_open_bucket_put(struct bch_fs *, struct open_bucket *); - -struct open_bucket *bch_alloc_sectors_start(struct bch_fs *, - struct write_point *, - unsigned, unsigned, - enum alloc_reserve, - struct closure *); - -void bch_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *, - unsigned, struct open_bucket *, unsigned); -void bch_alloc_sectors_done(struct bch_fs *, struct write_point *, - struct open_bucket *); - -struct open_bucket *bch_alloc_sectors(struct bch_fs *, struct write_point *, - struct bkey_i_extent *, unsigned, unsigned, - enum alloc_reserve, struct closure *); - -static inline void bch_wake_allocator(struct bch_dev *ca) -{ - struct task_struct *p; - - rcu_read_lock(); - if ((p = ACCESS_ONCE(ca->alloc_thread))) - wake_up_process(p); - rcu_read_unlock(); -} - -static inline struct bch_dev *dev_group_next(struct dev_group *devs, - unsigned *iter) -{ - struct bch_dev *ret = NULL; - - while (*iter < devs->nr && - !(ret = rcu_dereference_check(devs->d[*iter].dev, - lockdep_is_held(&devs->lock)))) - (*iter)++; - - return ret; -} - -#define group_for_each_dev(ca, devs, iter) \ - for ((iter) = 0; \ - ((ca) = dev_group_next((devs), &(iter))); \ - (iter)++) - -#define open_bucket_for_each_ptr(_ob, _ptr) \ - for ((_ptr) = (_ob)->ptrs; \ - (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \ - (_ptr)++) - -void bch_recalc_capacity(struct bch_fs *); -void bch_dev_allocator_stop(struct bch_dev *); -int bch_dev_allocator_start(struct bch_dev *); -void bch_fs_allocator_init(struct bch_fs *); - -#endif /* _BCACHE_ALLOC_H */ diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h deleted file mode 100644 index 1bf48ef9..00000000 --- a/libbcache/alloc_types.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef _BCACHE_ALLOC_TYPES_H -#define _BCACHE_ALLOC_TYPES_H - -#include <linux/mutex.h> - -#include "clock_types.h" - -/* - * There's two of these clocks, one for reads and one for writes: - * - * All fields protected by bucket_lock - */ -struct prio_clock { - /* - * "now" in (read/write) IO time - incremented whenever we do X amount - * of reads or writes. - * - * Goes with the bucket read/write prios: when we read or write to a - * bucket we reset the bucket's prio to the current hand; thus hand - - * prio = time since bucket was last read/written. 
- * - * The units are some amount (bytes/sectors) of data read/written, and - * the units can change on the fly if we need to rescale to fit - * everything in a u16 - your only guarantee is that the units are - * consistent. - */ - u16 hand; - u16 min_prio; - - int rw; - - struct io_timer rescale; -}; - -/* There is one reserve for each type of btree, one for prios and gens - * and one for moving GC */ -enum alloc_reserve { - RESERVE_PRIO, - RESERVE_BTREE, - RESERVE_METADATA_LAST = RESERVE_BTREE, - RESERVE_MOVINGGC, - - RESERVE_NONE, - RESERVE_NR, -}; - -static inline bool allocation_is_metadata(enum alloc_reserve id) -{ - return id <= RESERVE_METADATA_LAST; -} - -struct dev_group { - spinlock_t lock; - unsigned nr; - unsigned cur_device; - struct { - u64 weight; - struct bch_dev *dev; - } d[BCH_SB_MEMBERS_MAX]; -}; - -/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ -#define OPEN_BUCKETS_COUNT 256 - -#define WRITE_POINT_COUNT 16 - -struct open_bucket { - struct list_head list; - struct mutex lock; - atomic_t pin; - bool has_full_ptrs; - /* - * recalculated every time we allocate from this open_bucket based on - * how many pointers we're actually going to use: - */ - unsigned sectors_free; - unsigned nr_ptrs; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; - unsigned ptr_offset[BCH_REPLICAS_MAX]; -}; - -struct write_point { - struct open_bucket *b; - - /* - * Throttle writes to this write point if tier 0 is full? - */ - bool throttle; - - /* - * If not NULL, cache group for tiering, promotion and moving GC - - * always allocates a single replica - */ - struct dev_group *group; - - /* - * Otherwise do a normal replicated bucket allocation that could come - * from any device in tier 0 (foreground write) - */ -}; - -#endif /* _BCACHE_ALLOC_TYPES_H */ diff --git a/libbcache/bcache.h b/libbcache/bcache.h deleted file mode 100644 index 1d0e998c..00000000 --- a/libbcache/bcache.h +++ /dev/null @@ -1,831 +0,0 @@ -#ifndef _BCACHE_H -#define _BCACHE_H - -/* - * SOME HIGH LEVEL CODE DOCUMENTATION: - * - * Bcache mostly works with cache sets, cache devices, and backing devices. - * - * Support for multiple cache devices hasn't quite been finished off yet, but - * it's about 95% plumbed through. A cache set and its cache devices is sort of - * like a md raid array and its component devices. Most of the code doesn't care - * about individual cache devices, the main abstraction is the cache set. - * - * Multiple cache devices is intended to give us the ability to mirror dirty - * cached data and metadata, without mirroring clean cached data. - * - * Backing devices are different, in that they have a lifetime independent of a - * cache set. When you register a newly formatted backing device it'll come up - * in passthrough mode, and then you can attach and detach a backing device from - * a cache set at runtime - while it's mounted and in use. Detaching implicitly - * invalidates any cached data for that backing device. - * - * A cache set can have multiple (many) backing devices attached to it. - * - * There's also flash only volumes - this is the reason for the distinction - * between struct cached_dev and struct bcache_device. A flash only volume - * works much like a bcache device that has a backing device, except the - * "cached" data is always dirty. The end result is that we get thin - * provisioning with very little additional code. - * - * Flash only volumes work but they're not production ready because the moving - * garbage collector needs more work. 
More on that later. - * - * BUCKETS/ALLOCATION: - * - * Bcache is primarily designed for caching, which means that in normal - * operation all of our available space will be allocated. Thus, we need an - * efficient way of deleting things from the cache so we can write new things to - * it. - * - * To do this, we first divide the cache device up into buckets. A bucket is the - * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ - * works efficiently. - * - * Each bucket has a 16 bit priority, and an 8 bit generation associated with - * it. The gens and priorities for all the buckets are stored contiguously and - * packed on disk (in a linked list of buckets - aside from the superblock, all - * of bcache's metadata is stored in buckets). - * - * The priority is used to implement an LRU. We reset a bucket's priority when - * we allocate it or on a cache hit, and every so often we decrement the priority - * of each bucket. It could be used to implement something more sophisticated, - * if anyone ever gets around to it. - * - * The generation is used for invalidating buckets. Each pointer also has an 8 - * bit generation embedded in it; for a pointer to be considered valid, its gen - * must match the gen of the bucket it points into. Thus, to reuse a bucket all - * we have to do is increment its gen (and write its new gen to disk; we batch - * this up). - * - * Bcache is entirely COW - we never write twice to a bucket, even buckets that - * contain metadata (including btree nodes). - * - * THE BTREE: - * - * Bcache is in large part designed around the btree. - * - * At a high level, the btree is just an index of key -> ptr tuples. - * - * Keys represent extents, and thus have a size field. Keys also have a variable - * number of pointers attached to them (potentially zero, which is handy for - * invalidating the cache). - * - * The key itself is an inode:offset pair. The inode number corresponds to a - * backing device or a flash only volume. The offset is the ending offset of the - * extent within the inode - not the starting offset; this makes lookups - * slightly more convenient. - * - * Pointers contain the cache device id, the offset on that device, and an 8 bit - * generation number. More on the gen later. - * - * Index lookups are not fully abstracted - cache lookups in particular are - * still somewhat mixed in with the btree code, but things are headed in that - * direction. - * - * Updates are fairly well abstracted, though. There are two different ways of - * updating the btree; insert and replace. - * - * BTREE_INSERT will just take a list of keys and insert them into the btree - - * overwriting (possibly only partially) any extents they overlap with. This is - * used to update the index after a write. - * - * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is - * overwriting a key that matches another given key. This is used for inserting - * data into the cache after a cache miss, and for background writeback, and for - * the moving garbage collector. - * - * There is no "delete" operation; deleting things from the index is - * accomplished either by invalidating pointers (by incrementing a bucket's - * gen) or by inserting a key with 0 pointers - which will overwrite anything - * previously present at that location in the index. - * - * This means that there are always stale/invalid keys in the btree. They're - * filtered out by the code that iterates through a btree node, and removed when - * a btree node is rewritten. 
- * - * BTREE NODES: - * - * Our unit of allocation is a bucket, and we can't arbitrarily allocate and - * free smaller than a bucket - so, that's how big our btree nodes are. - * - * (If buckets are really big we'll only use part of the bucket for a btree node - * - no less than 1/4th - but a bucket still contains no more than a single - * btree node. I'd actually like to change this, but for now we rely on the - * bucket's gen for deleting btree nodes when we rewrite/split a node.) - * - * Anyways, btree nodes are big - big enough to be inefficient with a textbook - * btree implementation. - * - * The way this is solved is that btree nodes are internally log structured; we - * can append new keys to an existing btree node without rewriting it. This - * means each set of keys we write is sorted, but the node is not. - * - * We maintain this log structure in memory - keeping 1Mb of keys sorted would - * be expensive, and we have to distinguish between the keys we have written and - * the keys we haven't. So to do a lookup in a btree node, we have to search - * each sorted set. But we do merge written sets together lazily, so the cost of - * these extra searches is quite low (normally most of the keys in a btree node - * will be in one big set, and then there'll be one or two sets that are much - * smaller). - * - * This log structure makes bcache's btree more of a hybrid between a - * conventional btree and a compacting data structure, with some of the - * advantages of both. - * - * GARBAGE COLLECTION: - * - * We can't just invalidate any bucket - it might contain dirty data or - * metadata. If it once contained dirty data, other writes might overwrite it - * later, leaving no valid pointers into that bucket in the index. - * - * Thus, the primary purpose of garbage collection is to find buckets to reuse. - * It also counts how much valid data each bucket currently contains, so that - * allocation can reuse buckets sooner when they've been mostly overwritten. - * - * It also does some things that are really internal to the btree - * implementation. If a btree node contains pointers that are stale by more than - * some threshold, it rewrites the btree node to avoid the bucket's generation - * wrapping around. It also merges adjacent btree nodes if they're empty enough. - * - * THE JOURNAL: - * - * Bcache's journal is not necessary for consistency; we always strictly - * order metadata writes so that the btree and everything else is consistent on - * disk in the event of an unclean shutdown, and in fact bcache had writeback - * caching (with recovery from unclean shutdown) before journalling was - * implemented. - * - * Rather, the journal is purely a performance optimization; we can't complete a - * write until we've updated the index on disk, otherwise the cache would be - * inconsistent in the event of an unclean shutdown. This means that without the - * journal, on random write workloads we constantly have to update all the leaf - * nodes in the btree, and those writes will be mostly empty (appending at most - * a few keys each) - highly inefficient in terms of amount of metadata writes, - * and it puts more strain on the various btree resorting/compacting code. - * - * The journal is just a log of keys we've inserted; on startup we just reinsert - * all the keys in the open journal entries. That means that when we're updating - * a node in the btree, we can wait until a 4k block of keys fills up before - * writing them out. 
- * - * For simplicity, we only journal updates to leaf nodes; updates to parent - * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth - * the complexity to deal with journalling them (in particular, journal replay) - * - updates to non leaf nodes just happen synchronously (see btree_split()). - */ - -#undef pr_fmt -#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ - -#include <linux/bug.h> -#include <linux/bcache.h> -#include <linux/bio.h> -#include <linux/kobject.h> -#include <linux/lglock.h> -#include <linux/list.h> -#include <linux/mutex.h> -#include <linux/percpu-refcount.h> -#include <linux/radix-tree.h> -#include <linux/rbtree.h> -#include <linux/rhashtable.h> -#include <linux/rwsem.h> -#include <linux/seqlock.h> -#include <linux/shrinker.h> -#include <linux/types.h> -#include <linux/workqueue.h> - -#include "bset.h" -#include "fifo.h" -#include "util.h" -#include "closure.h" -#include "opts.h" - -#include <linux/dynamic_fault.h> - -#define bch_fs_init_fault(name) \ - dynamic_fault("bcache:bch_fs_init:" name) -#define bch_meta_read_fault(name) \ - dynamic_fault("bcache:meta:read:" name) -#define bch_meta_write_fault(name) \ - dynamic_fault("bcache:meta:write:" name) - -#ifndef bch_fmt -#define bch_fmt(_c, fmt) "bcache (%s): " fmt "\n", ((_c)->name) -#endif - -#define bch_info(c, fmt, ...) \ - printk(KERN_INFO bch_fmt(c, fmt), ##__VA_ARGS__) -#define bch_notice(c, fmt, ...) \ - printk(KERN_NOTICE bch_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn(c, fmt, ...) \ - printk(KERN_WARNING bch_fmt(c, fmt), ##__VA_ARGS__) -#define bch_err(c, fmt, ...) \ - printk(KERN_ERR bch_fmt(c, fmt), ##__VA_ARGS__) - -#define bch_verbose(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose_recovery) \ - bch_info(c, fmt, ##__VA_ARGS__); \ -} while (0) - -/* Parameters that are useful for debugging, but should always be compiled in: */ -#define BCH_DEBUG_PARAMS_ALWAYS() \ - BCH_DEBUG_PARAM(key_merging_disabled, \ - "Disables merging of extents") \ - BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ - "Causes mark and sweep to compact and rewrite every " \ - "btree node it traverses") \ - BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ - "Disables rewriting of btree nodes during mark and sweep")\ - BCH_DEBUG_PARAM(btree_gc_coalesce_disabled, \ - "Disables coalescing of btree nodes") \ - BCH_DEBUG_PARAM(btree_shrinker_disabled, \ - "Disables the shrinker callback for the btree node cache") - -/* Parameters that should only be compiled in in debug mode: */ -#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(expensive_debug_checks, \ - "Enables various runtime debugging checks that " \ - "significantly affect performance") \ - BCH_DEBUG_PARAM(debug_check_bkeys, \ - "Run bkey_debugcheck (primarily checking GC/allocation "\ - "information) when iterating over keys") \ - BCH_DEBUG_PARAM(version_stress_test, \ - "Assigns random version numbers to newly written " \ - "extents, to test overlapping extent cases") \ - BCH_DEBUG_PARAM(verify_btree_ondisk, \ - "Reread btree nodes at various points to verify the " \ - "mergesort in the read path against modifications " \ - "done in memory") \ - -#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() - -#ifdef CONFIG_BCACHE_DEBUG -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -#else -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -#endif - -/* name, frequency_units, duration_units */ -#define BCH_TIME_STATS() \ - BCH_TIME_STAT(mca_alloc, sec, us) \ - BCH_TIME_STAT(mca_scan, sec, ms) \ - BCH_TIME_STAT(btree_gc, sec, ms) \ 
- BCH_TIME_STAT(btree_coalesce, sec, ms) \ - BCH_TIME_STAT(btree_split, sec, us) \ - BCH_TIME_STAT(btree_sort, ms, us) \ - BCH_TIME_STAT(btree_read, ms, us) \ - BCH_TIME_STAT(journal_write, us, us) \ - BCH_TIME_STAT(journal_delay, ms, us) \ - BCH_TIME_STAT(journal_blocked, sec, ms) \ - BCH_TIME_STAT(journal_flush_seq, us, us) - -#include "alloc_types.h" -#include "blockdev_types.h" -#include "buckets_types.h" -#include "clock_types.h" -#include "io_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "keybuf_types.h" -#include "move_types.h" -#include "stats_types.h" -#include "super_types.h" - -/* 256k, in sectors */ -#define BTREE_NODE_SIZE_MAX 512 - -/* - * Number of nodes we might have to allocate in a worst case btree split - * operation - we split all the way up to the root, then allocate a new root. - */ -#define btree_reserve_required_nodes(depth) (((depth) + 1) * 2 + 1) - -/* Number of nodes btree coalesce will try to coalesce at once */ -#define GC_MERGE_NODES 4U - -/* Maximum number of nodes we might need to allocate atomically: */ -#define BTREE_RESERVE_MAX \ - (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES) - -/* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2) - -struct btree; -struct crypto_blkcipher; -struct crypto_ahash; - -enum gc_phase { - GC_PHASE_SB_METADATA = BTREE_ID_NR + 1, - GC_PHASE_PENDING_DELETE, - GC_PHASE_DONE -}; - -struct gc_pos { - enum gc_phase phase; - struct bpos pos; - unsigned level; -}; - -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u8 state; - u8 tier; - u8 has_metadata; - u8 has_data; - u8 replacement; - u8 discard; - u8 valid; -}; - -struct bch_dev { - struct kobject kobj; - struct percpu_ref ref; - struct percpu_ref io_ref; - struct completion stop_complete; - struct completion offline_complete; - - struct bch_fs *fs; - - u8 dev_idx; - /* - * Cached version of this device's member info from superblock - * Committed by bch_write_super() -> bch_fs_mi_update() - */ - struct bch_member_cpu mi; - uuid_le uuid; - char name[BDEVNAME_SIZE]; - - struct bcache_superblock disk_sb; - - struct dev_group self; - - /* biosets used in cloned bios for replicas and moving_gc */ - struct bio_set replica_set; - - struct task_struct *alloc_thread; - - struct prio_set *disk_buckets; - - /* - * When allocating new buckets, prio_write() gets first dibs - since we - * may not be able to allocate at all without writing priorities and gens. - * prio_last_buckets[] contains the last buckets we wrote priorities to - * (so gc can mark them as metadata). - */ - u64 *prio_buckets; - u64 *prio_last_buckets; - spinlock_t prio_buckets_lock; - struct bio *bio_prio; - - /* - * free: Buckets that are ready to be used - * - * free_inc: Incoming buckets - these are buckets that currently have - * cached data in them, and we can't reuse them until after we write - * their new gen to disk. 
After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) - */ - DECLARE_FIFO(long, free)[RESERVE_NR]; - DECLARE_FIFO(long, free_inc); - spinlock_t freelist_lock; - - size_t fifo_last_bucket; - - /* Allocation stuff: */ - - /* most out of date gen in the btree */ - u8 *oldest_gens; - struct bucket *buckets; - unsigned short bucket_bits; /* ilog2(bucket_size) */ - - /* last calculated minimum prio */ - u16 min_prio[2]; - - /* - * Bucket book keeping. The first element is updated by GC, the - * second contains a saved copy of the stats from the beginning - * of GC. - */ - struct bch_dev_usage __percpu *usage_percpu; - struct bch_dev_usage usage_cached; - - atomic_long_t saturated_count; - size_t inc_gen_needs_gc; - - struct mutex heap_lock; - DECLARE_HEAP(struct bucket_heap_entry, heap); - - /* Moving GC: */ - struct task_struct *moving_gc_read; - - struct bch_pd_controller moving_gc_pd; - - /* Tiering: */ - struct write_point tiering_write_point; - - struct write_point copygc_write_point; - - struct journal_device journal; - - struct work_struct io_error_work; - - /* The rest of this all shows up in sysfs */ -#define IO_ERROR_SHIFT 20 - atomic_t io_errors; - atomic_t io_count; - - atomic64_t meta_sectors_written; - atomic64_t btree_sectors_written; - u64 __percpu *sectors_written; -}; - -/* - * Flag bits for what phase of startup/shutdown the cache set is at, how we're - * shutting down, etc.: - * - * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching - * all the backing devices first (their cached data gets invalidated, and they - * won't automatically reattach). - */ -enum { - BCH_FS_INITIAL_GC_DONE, - BCH_FS_DETACHING, - BCH_FS_EMERGENCY_RO, - BCH_FS_WRITE_DISABLE_COMPLETE, - BCH_FS_GC_STOPPING, - BCH_FS_GC_FAILURE, - BCH_FS_BDEV_MOUNTED, - BCH_FS_ERROR, - BCH_FS_FSCK_FIXED_ERRORS, -}; - -struct btree_debug { - unsigned id; - struct dentry *btree; - struct dentry *btree_format; - struct dentry *failed; -}; - -struct bch_tier { - unsigned idx; - struct task_struct *migrate; - struct bch_pd_controller pd; - - struct dev_group devs; -}; - -enum bch_fs_state { - BCH_FS_STARTING = 0, - BCH_FS_STOPPING, - BCH_FS_RO, - BCH_FS_RW, -}; - -struct bch_fs { - struct closure cl; - - struct list_head list; - struct kobject kobj; - struct kobject internal; - struct kobject opts_dir; - struct kobject time_stats; - unsigned long flags; - - int minor; - struct device *chardev; - struct super_block *vfs_sb; - char name[40]; - - /* ro/rw, add/remove devices: */ - struct mutex state_lock; - enum bch_fs_state state; - - /* Counts outstanding writes, for clean transition to read-only */ - struct percpu_ref writes; - struct work_struct read_only_work; - - struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - - struct bch_opts opts; - - /* Updated by bch_sb_update():*/ - struct { - uuid_le uuid; - uuid_le user_uuid; - - u16 block_size; - u16 btree_node_size; - - u8 nr_devices; - u8 clean; - - u8 meta_replicas_have; - u8 data_replicas_have; - - u8 str_hash_type; - u8 encryption_type; - - u64 time_base_lo; - u32 time_base_hi; - u32 time_precision; - } sb; - - struct bch_sb *disk_sb; - unsigned disk_sb_order; - - unsigned short block_bits; /* ilog2(block_size) */ - - struct closure sb_write; - struct mutex sb_lock; - - struct backing_dev_info bdi; - - /* BTREE CACHE */ - struct bio_set btree_read_bio; - - struct btree_root btree_roots[BTREE_ID_NR]; - struct mutex btree_root_lock; - - bool btree_cache_table_init_done; - 
struct rhashtable btree_cache_table; - - /* - * We never free a struct btree, except on shutdown - we just put it on - * the btree_cache_freed list and reuse it later. This simplifies the - * code, and it doesn't cost us much memory as the memory usage is - * dominated by buffers that hold the actual btree node data and those - * can be freed - and the number of struct btrees allocated is - * effectively bounded. - * - * btree_cache_freeable effectively is a small cache - we use it because - * high order page allocations can be rather expensive, and it's quite - * common to delete and allocate btree nodes in quick succession. It - * should never grow past ~2-3 nodes in practice. - */ - struct mutex btree_cache_lock; - struct list_head btree_cache; - struct list_head btree_cache_freeable; - struct list_head btree_cache_freed; - - /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned btree_cache_used; - unsigned btree_cache_reserve; - struct shrinker btree_cache_shrink; - - /* - * If we need to allocate memory for a new btree node and that - * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation - lock to guarantee only one thread does - * this at a time: - */ - struct closure_waitlist mca_wait; - struct task_struct *btree_cache_alloc_lock; - - mempool_t btree_reserve_pool; - - /* - * Cache of allocated btree nodes - if we allocate a btree node and - * don't use it, if we free it that space can't be reused until going - * _all_ the way through the allocator (which exposes us to a livelock - * when allocating btree reserves fail halfway through) - instead, we - * can stick them here: - */ - struct btree_alloc { - struct open_bucket *ob; - BKEY_PADDED(k); - } btree_reserve_cache[BTREE_NODE_RESERVE * 2]; - unsigned btree_reserve_cache_nr; - struct mutex btree_reserve_cache_lock; - - mempool_t btree_interior_update_pool; - struct list_head btree_interior_update_list; - struct mutex btree_interior_update_lock; - - struct workqueue_struct *wq; - /* copygc needs its own workqueue for index updates.. */ - struct workqueue_struct *copygc_wq; - - /* ALLOCATION */ - struct bch_pd_controller foreground_write_pd; - struct delayed_work pd_controllers_update; - unsigned pd_controllers_update_seconds; - spinlock_t foreground_write_pd_lock; - struct bch_write_op *write_wait_head; - struct bch_write_op *write_wait_tail; - - struct timer_list foreground_write_wakeup; - - /* - * These contain all r/w devices - i.e. devices we can currently - * allocate from: - */ - struct dev_group all_devs; - struct bch_tier tiers[BCH_TIER_MAX]; - /* NULL if we only have devices in one tier: */ - struct bch_tier *fastest_tier; - - u64 capacity; /* sectors */ - - /* - * When capacity _decreases_ (due to a disk being removed), we - * increment capacity_gen - this invalidates outstanding reservations - * and forces them to be revalidated - */ - u32 capacity_gen; - - atomic64_t sectors_available; - - struct bch_fs_usage __percpu *usage_percpu; - struct bch_fs_usage usage_cached; - struct lglock usage_lock; - - struct mutex bucket_lock; - - struct closure_waitlist freelist_wait; - - /* - * When we invalidate buckets, we use both the priority and the amount - * of good data to determine which buckets to reuse first - to weight - * those together consistently we keep track of the smallest nonzero - * priority of any bucket. 
- */ - struct prio_clock prio_clock[2]; - - struct io_clock io_clock[2]; - - /* SECTOR ALLOCATOR */ - struct list_head open_buckets_open; - struct list_head open_buckets_free; - unsigned open_buckets_nr_free; - struct closure_waitlist open_buckets_wait; - spinlock_t open_buckets_lock; - struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; - - struct write_point btree_write_point; - - struct write_point write_points[WRITE_POINT_COUNT]; - struct write_point promote_write_point; - - /* - * This write point is used for migrating data off a device - * and can point to any other device. - * We can't use the normal write points because those will - * gang up n replicas, and for migration we want only one new - * replica. - */ - struct write_point migration_write_point; - - /* GARBAGE COLLECTION */ - struct task_struct *gc_thread; - atomic_t kick_gc; - - /* - * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] - * has been marked by GC. - * - * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) - * - * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not - * currently running, and gc marks are currently valid - * - * Protected by gc_pos_lock. Only written to by GC thread, so GC thread - * can read without a lock. - */ - seqcount_t gc_pos_lock; - struct gc_pos gc_pos; - - /* - * The allocation code needs gc_mark in struct bucket to be correct, but - * it's not while a gc is in progress. - */ - struct rw_semaphore gc_lock; - - /* IO PATH */ - struct bio_set bio_read; - struct bio_set bio_read_split; - struct bio_set bio_write; - struct mutex bio_bounce_pages_lock; - mempool_t bio_bounce_pages; - - mempool_t lz4_workspace_pool; - void *zlib_workspace; - struct mutex zlib_workspace_lock; - mempool_t compression_bounce[2]; - - struct crypto_blkcipher *chacha20; - struct crypto_shash *poly1305; - - atomic64_t key_version; - - /* For punting bio submissions to workqueue, io.c */ - struct bio_list bio_submit_list; - struct work_struct bio_submit_work; - spinlock_t bio_submit_lock; - - struct bio_list read_retry_list; - struct work_struct read_retry_work; - spinlock_t read_retry_lock; - - /* FILESYSTEM */ - wait_queue_head_t writeback_wait; - atomic_t writeback_pages; - unsigned writeback_pages_max; - atomic_long_t nr_inodes; - - /* NOTIFICATIONS */ - struct mutex uevent_lock; - struct kobj_uevent_env uevent_env; - - /* DEBUG JUNK */ - struct dentry *debug; - struct btree_debug btree_debug[BTREE_ID_NR]; -#ifdef CONFIG_BCACHE_DEBUG - struct btree *verify_data; - struct btree_node *verify_ondisk; - struct mutex verify_lock; -#endif - - u64 unused_inode_hint; - - /* - * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them - */ - mempool_t fill_iter; - - mempool_t btree_bounce_pool; - - struct journal journal; - - unsigned bucket_journal_seq; - - /* CACHING OTHER BLOCK DEVICES */ - mempool_t search; - struct radix_tree_root devices; - struct list_head cached_devs; - u64 cached_dev_sectors; - struct closure caching; - -#define CONGESTED_MAX 1024 - unsigned congested_last_us; - atomic_t congested; - - /* The rest of this all shows up in sysfs */ - unsigned congested_read_threshold_us; - unsigned congested_write_threshold_us; - - struct cache_accounting accounting; - atomic_long_t cache_read_races; - atomic_long_t writeback_keys_done; - atomic_long_t writeback_keys_failed; - - unsigned error_limit; - unsigned error_decay; - - unsigned foreground_write_ratelimit_enabled:1; - unsigned copy_gc_enabled:1; 
- unsigned tiering_enabled:1; - unsigned tiering_percent; - - /* - * foreground writes will be throttled when the number of free - * buckets is below this percentage - */ - unsigned foreground_target_percent; - -#define BCH_DEBUG_PARAM(name, description) bool name; - BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - struct time_stats name##_time; - BCH_TIME_STATS() -#undef BCH_TIME_STAT -}; - -static inline bool bch_fs_running(struct bch_fs *c) -{ - return c->state == BCH_FS_RO || c->state == BCH_FS_RW; -} - -static inline unsigned bucket_pages(const struct bch_dev *ca) -{ - return ca->mi.bucket_size / PAGE_SECTORS; -} - -static inline unsigned bucket_bytes(const struct bch_dev *ca) -{ - return ca->mi.bucket_size << 9; -} - -static inline unsigned block_bytes(const struct bch_fs *c) -{ - return c->sb.block_size << 9; -} - -#endif /* _BCACHE_H */ diff --git a/libbcache/bkey.c b/libbcache/bkey.c deleted file mode 100644 index 374237e2..00000000 --- a/libbcache/bkey.c +++ /dev/null @@ -1,1167 +0,0 @@ - -#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ - -#include <linux/kernel.h> - -#include "bkey.h" -#include "bset.h" -#include "util.h" - -const struct bkey_format bch_bkey_format_current = BKEY_FORMAT_CURRENT; - -struct bkey __bkey_unpack_key(const struct bkey_format *, - const struct bkey_packed *); - -void bch_to_binary(char *out, const u64 *p, unsigned nr_bits) -{ - unsigned bit = high_bit_offset, done = 0; - - while (1) { - while (bit < 64) { - if (done && !(done % 8)) - *out++ = ' '; - *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; - bit++; - done++; - if (done == nr_bits) { - *out++ = '\0'; - return; - } - } - - p = next_word(p); - bit = 0; - } -} - -#ifdef CONFIG_BCACHE_DEBUG - -static void bch_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) -{ - struct bkey tmp; - - BUG_ON(bkeyp_val_u64s(format, packed) != - bkey_val_u64s(unpacked)); - - BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); - - tmp = __bkey_unpack_key(format, packed); - - if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - char buf1[160], buf2[160]; - char buf3[160], buf4[160]; - - bch_bkey_to_text(buf1, sizeof(buf1), unpacked); - bch_bkey_to_text(buf2, sizeof(buf2), &tmp); - bch_to_binary(buf3, (void *) unpacked, 80); - bch_to_binary(buf4, high_word(format, packed), 80); - - panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", - format->key_u64s, - format->bits_per_field[0], - format->bits_per_field[1], - format->bits_per_field[2], - format->bits_per_field[3], - format->bits_per_field[4], - buf1, buf2, buf3, buf4); - } -} - -#else -static inline void bch_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) {} -#endif - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) -{ - char *out = buf, *end = buf + size; - -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - - p("u64s %u type %u %llu:%llu snap %u len %u ver %llu", - k->u64s, k->type, k->p.inode, k->p.offset, - k->p.snapshot, k->size, k->version.lo); - - BUG_ON(bkey_packed(k)); - - switch (k->type) { - case KEY_TYPE_DELETED: - p(" deleted"); - break; - case KEY_TYPE_DISCARD: - p(" discard"); - break; - case KEY_TYPE_ERROR: - p(" error"); - break; - case KEY_TYPE_COOKIE: - p(" cookie"); - break; - } -#undef p - - return out - buf; -} - -struct pack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct pack_state pack_state_init(const struct bkey_format *format, - struct bkey_packed *k) -{ - u64 *p = high_word(format, k); - - return (struct pack_state) { - .format = format, - .bits = 64 - high_bit_offset, - .w = 0, - .p = p, - }; -} - -__always_inline -static void pack_state_finish(struct pack_state *state, - struct bkey_packed *k) -{ - EBUG_ON(state->p < k->_data); - EBUG_ON(state->p >= k->_data + state->format->key_u64s); - - *state->p = state->w; -} - -struct unpack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - const u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct unpack_state unpack_state_init(const struct bkey_format *format, - const struct bkey_packed *k) -{ - const u64 *p = high_word(format, k); - - return (struct unpack_state) { - .format = format, - .bits = 64 - high_bit_offset, - .w = *p << high_bit_offset, - .p = p, - }; -} - -__always_inline -static u64 get_inc_field(struct unpack_state *state, unsigned field) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); - - if (bits >= state->bits) { - v = state->w >> (64 - bits); - bits -= state->bits; - - state->p = next_word(state->p); - state->w = *state->p; - state->bits = 64; - } - - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - v |= (state->w >> 1) >> (63 - bits); - state->w <<= bits; - state->bits -= bits; - - return v + offset; -} - -__always_inline -static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - - if (v < offset) - return false; - - v -= offset; - - if (fls64(v) > bits) - return false; - - if (bits > state->bits) { - bits -= state->bits; - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - - return true; -} - -/* - * Note: does NOT set out->format (we don't know what it should be here!) 
- * - * Also: doesn't work on extents - it doesn't preserve the invariant that - * if k is packed bkey_start_pos(k) will successfully pack - */ -static bool bch_bkey_transform_key(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - struct pack_state out_s = pack_state_init(out_f, out); - struct unpack_state in_s = unpack_state_init(in_f, in); - unsigned i; - - out->_data[0] = 0; - - for (i = 0; i < BKEY_NR_FIELDS; i++) - if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) - return false; - - /* Can't happen because the val would be too big to unpack: */ - EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); - - pack_state_finish(&out_s, out); - out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; - out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - return true; -} - -bool bch_bkey_transform(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - if (!bch_bkey_transform_key(out_f, out, in_f, in)) - return false; - - memcpy_u64s((u64 *) out + out_f->key_u64s, - (u64 *) in + in_f->key_u64s, - (in->u64s - in_f->key_u64s)); - return true; -} - -#define bkey_fields() \ - x(BKEY_FIELD_INODE, p.inode) \ - x(BKEY_FIELD_OFFSET, p.offset) \ - x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ - x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, version.hi) \ - x(BKEY_FIELD_VERSION_LO, version.lo) - -struct bkey __bkey_unpack_key(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bkey out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); - - out.u64s = BKEY_U64s + in->u64s - format->key_u64s; - out.format = KEY_FORMAT_CURRENT; - out.needs_whiteout = in->needs_whiteout; - out.type = in->type; - out.pad[0] = 0; - -#define x(id, field) out.field = get_inc_field(&state, id); - bkey_fields() -#undef x - - return out; -} - -#ifndef HAVE_BCACHE_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bpos out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - - out.inode = get_inc_field(&state, BKEY_FIELD_INODE); - out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); - out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); - - return out; -} -#endif - -/** - * bkey_pack_key -- pack just the key, not the value - */ -bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format *format) -{ - struct pack_state state = pack_state_init(format, out); - - EBUG_ON((void *) in == (void *) out); - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->format != KEY_FORMAT_CURRENT); - - out->_data[0] = 0; - -#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; - bkey_fields() -#undef x - - /* - * Extents - we have to guarantee that if an extent is packed, a trimmed - * version will also pack: - */ - if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET]) - return false; - - pack_state_finish(&state, out); - out->u64s = format->key_u64s + in->u64s - BKEY_U64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - 
out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - bch_bkey_pack_verify(out, in, format); - return true; -} - -/** - * bkey_unpack -- unpack the key and the value - */ -void bkey_unpack(const struct btree *b, struct bkey_i *dst, - const struct bkey_packed *src) -{ - dst->k = bkey_unpack_key(b, src); - - memcpy_u64s(&dst->v, - bkeyp_val(&b->format, src), - bkeyp_val_u64s(&b->format, src)); -} - -/** - * bkey_pack -- pack the key and the value - */ -bool bkey_pack(struct bkey_packed *out, const struct bkey_i *in, - const struct bkey_format *format) -{ - struct bkey_packed tmp; - - if (!bkey_pack_key(&tmp, &in->k, format)) - return false; - - memmove_u64s((u64 *) out + format->key_u64s, - &in->v, - bkey_val_u64s(&in->k)); - memcpy_u64s(out, &tmp, format->key_u64s); - - return true; -} - -__always_inline -static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - bool ret = true; - - EBUG_ON(v < offset); - v -= offset; - - if (fls64(v) > bits) { - v = ~(~0ULL << bits); - ret = false; - } - - if (bits > state->bits) { - bits -= state->bits; - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - - return ret; -} - -#ifdef CONFIG_BCACHE_DEBUG -static bool bkey_packed_successor(struct bkey_packed *out, - const struct btree *b, - struct bkey_packed k) -{ - const struct bkey_format *f = &b->format; - unsigned nr_key_bits = b->nr_key_bits; - unsigned first_bit, offset; - u64 *p; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - if (!nr_key_bits) - return false; - - *out = k; - - first_bit = high_bit_offset + nr_key_bits - 1; - p = nth_word(high_word(f, out), first_bit >> 6); - offset = 63 - (first_bit & 63); - - while (nr_key_bits) { - unsigned bits = min(64 - offset, nr_key_bits); - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if ((*p & mask) != mask) { - *p += 1ULL << offset; - EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); - return true; - } - - *p &= ~mask; - p = prev_word(p); - nr_key_bits -= bits; - offset = 0; - } - - return false; -} -#endif - -/* - * Returns a packed key that compares <= in - * - * This is used in bset_search_tree(), where we need a packed pos in order to be - * able to compare against the keys in the auxiliary search tree - and it's - * legal to use a packed pos that isn't equivalent to the original pos, - * _provided_ it compares <= to the original pos. 
- */ -enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *out, - struct bpos in, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - struct pack_state state = pack_state_init(f, out); -#ifdef CONFIG_BCACHE_DEBUG - struct bpos orig = in; -#endif - bool exact = true; - - out->_data[0] = 0; - - if (unlikely(in.snapshot < - le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { - if (!in.offset-- && - !in.inode--) - return BKEY_PACK_POS_FAIL; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.offset < - le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { - if (!in.inode--) - return BKEY_PACK_POS_FAIL; - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.inode < - le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) - return BKEY_PACK_POS_FAIL; - - if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) - exact = false; - - pack_state_finish(&state, out); - out->u64s = f->key_u64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - out->type = KEY_TYPE_DELETED; - -#ifdef CONFIG_BCACHE_DEBUG - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; - - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0); - } -#endif - - return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -} - -void bch_bkey_format_init(struct bkey_format_state *s) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) - s->field_min[i] = U64_MAX; - - for (i = 0; i < ARRAY_SIZE(s->field_max); i++) - s->field_max[i] = 0; - - /* Make sure we can store a size of 0: */ - s->field_min[BKEY_FIELD_SIZE] = 0; -} - -static void __bkey_format_add(struct bkey_format_state *s, - unsigned field, u64 v) -{ - s->field_min[field] = min(s->field_min[field], v); - s->field_max[field] = max(s->field_max[field], v); -} - -/* - * Changes @format so that @k can be successfully packed with @format - */ -void bch_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -{ -#define x(id, field) __bkey_format_add(s, id, k->field); - bkey_fields() -#undef x - __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); -} - -void bch_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -{ - unsigned field = 0; - - __bkey_format_add(s, field++, p.inode); - __bkey_format_add(s, field++, p.offset); - __bkey_format_add(s, field++, p.snapshot); -} - -/* - * We don't want it to be possible for the packed format to represent fields - * bigger than a u64... that will cause confusion and issues (like with - * bkey_packed_successor()) - */ -static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, - unsigned bits, u64 offset) -{ - offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); - - f->bits_per_field[i] = bits; - f->field_offset[i] = cpu_to_le64(offset); -} - -struct bkey_format bch_bkey_format_done(struct bkey_format_state *s) -{ - unsigned i, bits = KEY_PACKED_BITS_START; - struct bkey_format ret = { - .nr_fields = BKEY_NR_FIELDS, - }; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { - s->field_min[i] = min(s->field_min[i], s->field_max[i]); - - set_format_field(&ret, i, - fls64(s->field_max[i] - s->field_min[i]), - s->field_min[i]); - - bits += ret.bits_per_field[i]; - } - - /* allow for extent merging: */ - if (ret.bits_per_field[BKEY_FIELD_SIZE]) { - ret.bits_per_field[BKEY_FIELD_SIZE] += 4; - bits += 4; - } - - ret.key_u64s = DIV_ROUND_UP(bits, 64); - - /* if we have enough spare bits, round fields up to nearest byte */ - bits = ret.key_u64s * 64 - bits; - - for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { - unsigned r = round_up(ret.bits_per_field[i], 8) - - ret.bits_per_field[i]; - - if (r <= bits) { - set_format_field(&ret, i, - ret.bits_per_field[i] + r, - le64_to_cpu(ret.field_offset[i])); - bits -= r; - } - } - - EBUG_ON(bch_bkey_format_validate(&ret)); - return ret; -} - -const char *bch_bkey_format_validate(struct bkey_format *f) -{ - unsigned i, bits = KEY_PACKED_BITS_START; - - if (f->nr_fields != BKEY_NR_FIELDS) - return "invalid format: incorrect number of fields"; - - for (i = 0; i < f->nr_fields; i++) { - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f->bits_per_field[i] > 64) - return "invalid format: field too large"; - - if (field_offset && - (f->bits_per_field[i] == 64 || - (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < - field_offset))) - return "invalid format: offset + bits overflow"; - - bits += f->bits_per_field[i]; - } - - if (f->key_u64s != DIV_ROUND_UP(bits, 64)) - return "invalid format: incorrect key_u64s"; - - return NULL; -} - -/* - * Most significant differing bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bkey_greatest_differing_bit(const struct btree *b, - const struct bkey_packed *l_k, - const struct bkey_packed *r_k) -{ - const u64 *l = high_word(&b->format, l_k); - const u64 *r = high_word(&b->format, r_k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned word_bits = 64 - high_bit_offset; - u64 l_v, r_v; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - /* for big endian, skip past header */ - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (nr_key_bits) { - if (nr_key_bits < word_bits) { - l_v >>= word_bits - nr_key_bits; - r_v >>= word_bits - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= word_bits; - } - - if (l_v != r_v) - return fls64(l_v ^ r_v) - 1 + nr_key_bits; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - word_bits = 64; - } - - return 0; -} - -/* - * First set bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bkey_ffs(const struct btree *b, - const struct bkey_packed *k) -{ - const u64 *p = high_word(&b->format, k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned ret = 0, offset; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - offset = nr_key_bits; - while (offset > 64) { - p = next_word(p); - offset -= 64; - } - - offset = 64 - offset; - - while (nr_key_bits) { - unsigned bits = nr_key_bits + offset < 64 - ? 
nr_key_bits - : 64 - offset; - - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if (*p & mask) - return ret + __ffs64(*p & mask) - offset; - - p = prev_word(p); - nr_key_bits -= bits; - ret += bits; - offset = 0; - } - - return 0; -} - -#ifdef CONFIG_X86_64 - -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - long d0, d1, d2, d3; - int cmp; - - /* we shouldn't need asm for this, but gcc is being retarded: */ - - asm(".intel_syntax noprefix;" - "xor eax, eax;" - "xor edx, edx;" - "1:;" - "mov r8, [rdi];" - "mov r9, [rsi];" - "sub ecx, 64;" - "jl 2f;" - - "cmp r8, r9;" - "jnz 3f;" - - "lea rdi, [rdi - 8];" - "lea rsi, [rsi - 8];" - "jmp 1b;" - - "2:;" - "not ecx;" - "shr r8, 1;" - "shr r9, 1;" - "shr r8, cl;" - "shr r9, cl;" - "cmp r8, r9;" - - "3:\n" - "seta al;" - "setb dl;" - "sub eax, edx;" - ".att_syntax prefix;" - : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) - : "0" (l), "1" (r), "3" (nr_key_bits) - : "r8", "r9", "cc", "memory"); - - return cmp; -} - -#define I(_x) (*(out)++ = (_x)) -#define I1(i0) I(i0) -#define I2(i0, i1) (I1(i0), I(i1)) -#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) - -static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, - enum bch_bkey_fields field, - unsigned dst_offset, unsigned dst_size, - bool *eax_zeroed) -{ - unsigned byte = format->key_u64s * sizeof(u64); - unsigned bits = format->bits_per_field[field]; - u64 offset = format->field_offset[field]; - unsigned i, bit_offset = 0; - unsigned shl, shr; - - if (!bits && !offset) { - if (!*eax_zeroed) { - /* xor eax, eax */ - I2(0x31, 0xc0); - } - - *eax_zeroed = true; - goto set_field; - } - - if (!bits) { - /* just return offset: */ - - switch (dst_size) { - case 8: - if (offset > S32_MAX) { - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - - I3(0xc7, 0x47, dst_offset + 4); - memcpy(out, (void *) &offset + 4, 4); - out += 4; - } else { - /* mov [rdi + dst_offset], offset */ - /* sign extended */ - I4(0x48, 0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - } - break; - case 4: - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - break; - default: - BUG(); - } - - return out; - } - - for (i = 0; i <= field; i++) - bit_offset += format->bits_per_field[i]; - - byte -= DIV_ROUND_UP(bit_offset, 8); - bit_offset = round_up(bit_offset, 8) - bit_offset; - - *eax_zeroed = false; - - if (bit_offset == 0 && bits == 8) { - /* movzx eax, BYTE PTR [rsi + imm8] */ - I4(0x0f, 0xb6, 0x46, byte); - } else if (bit_offset == 0 && bits == 16) { - /* movzx eax, WORD PTR [rsi + imm8] */ - I4(0x0f, 0xb7, 0x46, byte); - } else if (bit_offset + bits <= 32) { - /* mov eax, [rsi + imm8] */ - I3(0x8b, 0x46, byte); - - if (bit_offset) { - /* shr eax, imm8 */ - I3(0xc1, 0xe8, bit_offset); - } - - if (bit_offset + bits < 32) { - unsigned mask = ~0U >> (32 - bits); - - /* and eax, imm32 */ - I1(0x25); - memcpy(out, &mask, 4); - out += 4; - } - } else if (bit_offset + bits <= 64) { - /* mov rax, [rsi + imm8] */ - I4(0x48, 0x8b, 0x46, byte); - - shl = 64 - bit_offset - bits; - shr = bit_offset + shl; - - if (shl) { - /* shl rax, imm8 */ - I4(0x48, 0xc1, 0xe0, shl); - } - - if (shr) { - /* shr rax, imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } else { - /* mov rax, [rsi + byte] */ - I4(0x48, 0x8b, 0x46, byte); - - /* mov edx, [rsi + 
byte + 8] */ - I3(0x8b, 0x56, byte + 8); - - /* bits from next word: */ - shr = bit_offset + bits - 64; - BUG_ON(shr > bit_offset); - - /* shr rax, bit_offset */ - I4(0x48, 0xc1, 0xe8, shr); - - /* shl rdx, imm8 */ - I4(0x48, 0xc1, 0xe2, 64 - shr); - - /* or rax, rdx */ - I3(0x48, 0x09, 0xd0); - - shr = bit_offset - shr; - - if (shr) { - /* shr rax, imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } - - /* rax += offset: */ - if (offset > S32_MAX) { - /* mov rdx, imm64 */ - I2(0x48, 0xba); - memcpy(out, &offset, 8); - out += 8; - /* add %rdx, %rax */ - I3(0x48, 0x01, 0xd0); - } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { - /* add rax, imm32 */ - I2(0x48, 0x05); - memcpy(out, &offset, 4); - out += 4; - } else if (offset) { - /* add eax, imm32 */ - I1(0x05); - memcpy(out, &offset, 4); - out += 4; - } -set_field: - switch (dst_size) { - case 8: - /* mov [rdi + dst_offset], rax */ - I4(0x48, 0x89, 0x47, dst_offset); - break; - case 4: - /* mov [rdi + dst_offset], eax */ - I3(0x89, 0x47, dst_offset); - break; - default: - BUG(); - } - - return out; -} - -int bch_compile_bkey_format(const struct bkey_format *format, void *_out) -{ - bool eax_zeroed = false; - u8 *out = _out; - - /* - * rdi: dst - unpacked key - * rsi: src - packed key - */ - - /* k->u64s, k->format, k->type */ - - /* mov eax, [rsi] */ - I2(0x8b, 0x06); - - /* add eax, BKEY_U64s - format->key_u64s */ - I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); - - /* and eax, imm32: mask out k->pad: */ - I5(0x25, 0xff, 0xff, 0xff, 0); - - /* mov [rdi], eax */ - I2(0x89, 0x07); - -#define x(id, field) \ - out = compile_bkey_field(format, out, id, \ - offsetof(struct bkey, field), \ - sizeof(((struct bkey *) NULL)->field), \ - &eax_zeroed); - bkey_fields() -#undef x - - /* retq */ - I1(0xc3); - - return (void *) out - _out; -} - -#else -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - u64 l_v, r_v; - - if (!nr_key_bits) - return 0; - - /* for big endian, skip past header */ - nr_key_bits += high_bit_offset; - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (1) { - if (nr_key_bits < 64) { - l_v >>= 64 - nr_key_bits; - r_v >>= 64 - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= 64; - } - - if (l_v != r_v) - return l_v < r_v ? 
-1 : 1; - - if (!nr_key_bits) - return 0; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - } -} -#endif - -__pure -int __bkey_cmp_packed_format_checked(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - int ret; - - EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - ret = __bkey_cmp_bits(high_word(f, l), - high_word(f, r), - b->nr_key_bits); - - EBUG_ON(ret != bkey_cmp(bkey_unpack_key_format_checked(b, l).p, - bkey_unpack_key_format_checked(b, r).p)); - return ret; -} - -__pure __flatten -int __bkey_cmp_left_packed_format_checked(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); -} - -__pure __flatten -int __bkey_cmp_packed(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - int packed = bkey_lr_packed(l, r); - - if (likely(packed == BKEY_PACKED_BOTH)) - return __bkey_cmp_packed_format_checked(l, r, b); - - switch (packed) { - case BKEY_PACKED_NONE: - return bkey_cmp(((struct bkey *) l)->p, - ((struct bkey *) r)->p); - case BKEY_PACKED_LEFT: - return __bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) l, - &((struct bkey *) r)->p); - case BKEY_PACKED_RIGHT: - return -__bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) r, - &((struct bkey *) l)->p); - default: - unreachable(); - } -} - -__pure __flatten -int bkey_cmp_left_packed(const struct btree *b, - const struct bkey_packed *l, const struct bpos *r) -{ - const struct bkey *l_unpacked; - - return unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bkey_cmp(l_unpacked->p, *r) - : __bkey_cmp_left_packed_format_checked(b, l, r); -} - -void bch_bpos_swab(struct bpos *p) -{ - u8 *l = (u8 *) p; - u8 *h = ((u8 *) &p[1]) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -void bch_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -{ - const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch_bkey_format_current; - u8 *l = k->key_start; - u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -#ifdef CONFIG_BCACHE_DEBUG -void bkey_pack_test(void) -{ - struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); - struct bkey_packed p; - - struct bkey_format test_format = { - .key_u64s = 2, - .nr_fields = BKEY_NR_FIELDS, - .bits_per_field = { - 13, - 64, - }, - }; - - struct unpack_state in_s = - unpack_state_init(&bch_bkey_format_current, (void *) &t); - struct pack_state out_s = pack_state_init(&test_format, &p); - unsigned i; - - for (i = 0; i < out_s.format->nr_fields; i++) { - u64 a, v = get_inc_field(&in_s, i); - - switch (i) { -#define x(id, field) case id: a = t.field; break; - bkey_fields() -#undef x - default: - BUG(); - } - - if (a != v) - panic("got %llu actual %llu i %u\n", v, a, i); - - if (!set_inc_field(&out_s, i, v)) - panic("failed at %u\n", i); - } - - BUG_ON(!bkey_pack_key(&p, &t, &test_format)); -} -#endif diff --git a/libbcache/bkey.h b/libbcache/bkey.h deleted file mode 100644 index 0893134f..00000000 --- a/libbcache/bkey.h +++ /dev/null @@ -1,606 +0,0 @@ -#ifndef _BCACHE_BKEY_H -#define _BCACHE_BKEY_H - -#include <linux/bug.h> -#include <linux/bcache.h> - -#include "util.h" -#include "vstructs.h" - -void bch_to_binary(char *, const u64 *, unsigned); -int bch_bkey_to_text(char *, size_t, const struct bkey *); - -#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) - -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_next(_k) vstruct_next(_k) - -static inline unsigned bkey_val_u64s(const struct bkey *k) -{ - return k->u64s - BKEY_U64s; -} - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - k->u64s = BKEY_U64s + val_u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); -} - -/* - * Mark a key as deleted without changing the size of the value (i.e. 
modifying - * keys in the btree in place) - */ -static inline void __set_bkey_deleted(struct bkey *k) -{ - k->type = KEY_TYPE_DELETED; -} - -static inline void set_bkey_deleted(struct bkey *k) -{ - __set_bkey_deleted(k); - set_bkey_val_u64s(k, 0); -} - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD) - -#define bkey_packed_typecheck(_k) \ -({ \ - BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ - !type_is(_k, struct bkey_packed *)); \ - type_is(_k, struct bkey_packed *); \ -}) - -enum bkey_lr_packed { - BKEY_PACKED_BOTH, - BKEY_PACKED_RIGHT, - BKEY_PACKED_LEFT, - BKEY_PACKED_NONE, -}; - -#define bkey_lr_packed_typecheck(_l, _r) \ - (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) - -#define bkey_lr_packed(_l, _r) \ - ((_l)->format + ((_r)->format << 1)) - -#define bkey_copy(_dst, _src) \ -do { \ - BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ - !type_is(_dst, struct bkey_packed *)); \ - BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ - !type_is(_src, struct bkey_packed *)); \ - EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ - (u64 *) (_dst) < (u64 *) (_src) + \ - ((struct bkey *) (_src))->u64s); \ - \ - __memmove_u64s_down((_dst), (_src), \ - ((struct bkey *) (_src))->u64s); \ -} while (0) - -struct btree; - -struct bkey_format_state { - u64 field_min[BKEY_NR_FIELDS]; - u64 field_max[BKEY_NR_FIELDS]; -}; - -void bch_bkey_format_init(struct bkey_format_state *); -void bch_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); -void bch_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -struct bkey_format bch_bkey_format_done(struct bkey_format_state *); -const char *bch_bkey_format_validate(struct bkey_format *); - -__pure -unsigned bkey_greatest_differing_bit(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -__pure -unsigned bkey_ffs(const struct btree *, const struct bkey_packed *); - -__pure -int __bkey_cmp_packed_format_checked(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); - -__pure -int __bkey_cmp_left_packed_format_checked(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -__pure -int __bkey_cmp_packed(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); - -__pure -int bkey_cmp_left_packed(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -/* - * we prefer to pass bpos by ref, but it's often enough terribly convenient to - * pass it by by val... 
as much as I hate c++, const ref would be nice here: - */ -__pure __flatten -static inline int bkey_cmp_left_packed_byval(const struct btree *b, - const struct bkey_packed *l, - struct bpos r) -{ - return bkey_cmp_left_packed(b, l, &r); -} - -/* - * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to - * skip dispatching on k->format: - */ -#define bkey_cmp_packed(_b, _l, _r) \ -({ \ - int _cmp; \ - \ - switch (bkey_lr_packed_typecheck(_l, _r)) { \ - case BKEY_PACKED_NONE: \ - _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ - ((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_LEFT: \ - _cmp = bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_l), \ - &((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_RIGHT: \ - _cmp = -bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_r), \ - &((struct bkey *) (_l))->p); \ - break; \ - case BKEY_PACKED_BOTH: \ - _cmp = __bkey_cmp_packed((void *) (_l), \ - (void *) (_r), (_b)); \ - break; \ - } \ - _cmp; \ -}) - -#if 1 -static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -{ - if (l.inode != r.inode) - return l.inode < r.inode ? -1 : 1; - if (l.offset != r.offset) - return l.offset < r.offset ? -1 : 1; - if (l.snapshot != r.snapshot) - return l.snapshot < r.snapshot ? -1 : 1; - return 0; -} -#else -int bkey_cmp(struct bpos l, struct bpos r); -#endif - -static inline struct bpos bpos_min(struct bpos l, struct bpos r) -{ - return bkey_cmp(l, r) < 0 ? l : r; -} - -void bch_bpos_swab(struct bpos *); -void bch_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); - -static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -{ - if (l.hi != r.hi) - return l.hi < r.hi ? -1 : 1; - if (l.lo != r.lo) - return l.lo < r.lo ? -1 : 1; - return 0; -} - -#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) - -static __always_inline int bversion_zero(struct bversion v) -{ - return !bversion_cmp(v, ZERO_VERSION); -} - -#ifdef CONFIG_BCACHE_DEBUG -/* statement expressions confusing unlikely()? */ -#define bkey_packed(_k) \ - ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ - (_k)->format != KEY_FORMAT_CURRENT; }) -#else -#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -#endif - -/* - * It's safe to treat an unpacked bkey as a packed one, but not the reverse - */ -static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -{ - return (struct bkey_packed *) k; -} - -static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -{ - return (const struct bkey_packed *) k; -} - -static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -{ - return bkey_packed(k) ? NULL : (struct bkey_i *) k; -} - -static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -{ - return bkey_packed(k) ? 
NULL : (const struct bkey *) k; -} - -static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -{ - return format->bits_per_field[BKEY_FIELD_INODE] + - format->bits_per_field[BKEY_FIELD_OFFSET] + - format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -} - -static inline struct bpos bkey_successor(struct bpos p) -{ - struct bpos ret = p; - - if (!++ret.offset) - BUG_ON(!++ret.inode); - - return ret; -} - -static inline u64 bkey_start_offset(const struct bkey *k) -{ - return k->p.offset - k->size; -} - -static inline struct bpos bkey_start_pos(const struct bkey *k) -{ - return (struct bpos) { - .inode = k->p.inode, - .offset = bkey_start_offset(k), - .snapshot = k->p.snapshot, - }; -} - -/* Packed helpers */ - -static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; - - EBUG_ON(k->u64s < ret); - return ret; -} - -static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_key_u64s(format, k) * sizeof(u64); -} - -static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return k->u64s - bkeyp_key_u64s(format, k); -} - -static inline size_t bkeyp_val_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_val_u64s(format, k) * sizeof(u64); -} - -static inline void set_bkeyp_val_u64s(const struct bkey_format *format, - struct bkey_packed *k, unsigned val_u64s) -{ - k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -} - -#define bkeyp_val(_format, _k) \ - ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) - -extern const struct bkey_format bch_bkey_format_current; - -bool bch_bkey_transform(const struct bkey_format *, - struct bkey_packed *, - const struct bkey_format *, - const struct bkey_packed *); - -struct bkey __bkey_unpack_key(const struct bkey_format *, - const struct bkey_packed *); - -#ifndef HAVE_BCACHE_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *, - const struct bkey_packed *); -#endif - -bool bkey_pack_key(struct bkey_packed *, const struct bkey *, - const struct bkey_format *); - -enum bkey_pack_pos_ret { - BKEY_PACK_POS_EXACT, - BKEY_PACK_POS_SMALLER, - BKEY_PACK_POS_FAIL, -}; - -enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, - const struct btree *); - -static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, - const struct btree *b) -{ - return bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -} - -void bkey_unpack(const struct btree *, struct bkey_i *, - const struct bkey_packed *); -bool bkey_pack(struct bkey_packed *, const struct bkey_i *, - const struct bkey_format *); - -static inline u64 bkey_field_max(const struct bkey_format *f, - enum bch_bkey_fields nr) -{ - return f->bits_per_field[nr] < 64 - ? 
f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr]) - : U64_MAX; -} - -#ifdef CONFIG_X86_64 -#define HAVE_BCACHE_COMPILED_UNPACK 1 - -int bch_compile_bkey_format(const struct bkey_format *, void *); - -#else - -static inline int bch_compile_bkey_format(const struct bkey_format *format, - void *out) { return 0; } - -#endif - -static inline void bkey_reassemble(struct bkey_i *dst, - struct bkey_s_c src) -{ - BUG_ON(bkey_packed(src.k)); - dst->k = *src.k; - memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k)); -} - -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - _assert(k->k.type, nr); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - _assert(k->k.type, nr); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - _assert(k.k->type, nr); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - _assert(k.k->type, nr); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - _assert(k->k.type, nr); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ 
- _assert(k->k.type, nr); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bch_##name * \ -bkey_p_##name##_val(const struct bkey_format *f, \ - struct bkey_packed *k) \ -{ \ - return container_of(bkeyp_val(f, k), struct bch_##name, v); \ -} \ - \ -static inline const struct bch_##name * \ -bkey_p_c_##name##_val(const struct bkey_format *f, \ - const struct bkey_packed *k) \ -{ \ - return container_of(bkeyp_val(f, k), struct bch_##name, v); \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = nr; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr) - -#define BKEY_VAL_ACCESSORS(name, _nr) \ - static inline void __bch_##name##_assert(u8 type, u8 nr) \ - { \ - EBUG_ON(type != _nr); \ - } \ - \ - __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert) - -BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE); - -static inline void __bch_extent_assert(u8 type, u8 nr) -{ - EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED); -} - -__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch_extent_assert); -BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION); - -BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); -BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); - -BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); - -BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); - -/* byte order helpers */ - -#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) -#error edit for your odd byteorder. -#endif - -#ifdef __LITTLE_ENDIAN - -#define high_bit_offset 0 -#define __high_word(u64s, k) ((k)->_data + (u64s) - 1) -#define nth_word(p, n) ((p) - (n)) - -#else - -#define high_bit_offset KEY_PACKED_BITS_START -#define __high_word(u64s, k) ((k)->_data) -#define nth_word(p, n) ((p) + (n)) - -#endif - -#define high_word(format, k) __high_word((format)->key_u64s, k) -#define next_word(p) nth_word(p, 1) -#define prev_word(p) nth_word(p, -1) - -#ifdef CONFIG_BCACHE_DEBUG -void bkey_pack_test(void); -#else -static inline void bkey_pack_test(void) {} -#endif - -#endif /* _BCACHE_BKEY_H */ diff --git a/libbcache/bkey_methods.c b/libbcache/bkey_methods.c deleted file mode 100644 index 2908489c..00000000 --- a/libbcache/bkey_methods.c +++ /dev/null @@ -1,127 +0,0 @@ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_types.h" -#include "dirent.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "xattr.h" - -const struct bkey_ops *bch_bkey_ops[] = { - [BKEY_TYPE_EXTENTS] = &bch_bkey_extent_ops, - [BKEY_TYPE_INODES] = &bch_bkey_inode_ops, - [BKEY_TYPE_DIRENTS] = &bch_bkey_dirent_ops, - [BKEY_TYPE_XATTRS] = &bch_bkey_xattr_ops, - [BKEY_TYPE_BTREE] = &bch_bkey_btree_ops, -}; - -/* Returns string indicating reason for being invalid, or NULL if valid: */ -const char *bkey_invalid(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) -{ - const struct bkey_ops *ops = bch_bkey_ops[type]; - - if (k.k->u64s < BKEY_U64s) - return "u64s too small"; - - if (k.k->size && - (bkey_deleted(k.k) || !ops->is_extents)) - return "nonzero size field"; - - switch (k.k->type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - return NULL; - - case KEY_TYPE_ERROR: - return bkey_val_bytes(k.k) != 0 - ? 
"value size should be zero" - : NULL; - - case KEY_TYPE_COOKIE: - return bkey_val_bytes(k.k) != sizeof(struct bch_cookie) - ? "incorrect value size" - : NULL; - - default: - if (k.k->type < KEY_TYPE_GENERIC_NR) - return "invalid type"; - - return ops->key_invalid(c, k); - } -} - -const char *btree_bkey_invalid(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) -{ - if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) - return "key before start of btree node"; - - if (bkey_cmp(k.k->p, b->data->max_key) > 0) - return "key past end of btree node"; - - if (k.k->p.snapshot) - return "nonzero snapshot"; - - return bkey_invalid(c, btree_node_type(b), k); -} - -void bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -{ - enum bkey_type type = btree_node_type(b); - const struct bkey_ops *ops = bch_bkey_ops[type]; - const char *invalid; - - BUG_ON(!k.k->u64s); - - invalid = btree_bkey_invalid(c, b, k); - if (invalid) { - char buf[160]; - - bch_bkey_val_to_text(c, type, buf, sizeof(buf), k); - bch_fs_bug(c, "invalid bkey %s: %s", buf, invalid); - return; - } - - if (k.k->type >= KEY_TYPE_GENERIC_NR && - ops->key_debugcheck) - ops->key_debugcheck(c, b, k); -} - -void bch_val_to_text(struct bch_fs *c, enum bkey_type type, - char *buf, size_t size, struct bkey_s_c k) -{ - const struct bkey_ops *ops = bch_bkey_ops[type]; - - if (k.k->type >= KEY_TYPE_GENERIC_NR && - ops->val_to_text) - ops->val_to_text(c, buf, size, k); -} - -void bch_bkey_val_to_text(struct bch_fs *c, enum bkey_type type, - char *buf, size_t size, struct bkey_s_c k) -{ - const struct bkey_ops *ops = bch_bkey_ops[type]; - char *out = buf, *end = buf + size; - - out += bch_bkey_to_text(out, end - out, k.k); - - if (k.k->type >= KEY_TYPE_GENERIC_NR && - ops->val_to_text) { - out += scnprintf(out, end - out, " -> "); - ops->val_to_text(c, out, end - out, k); - } -} - -void bch_bkey_swab(enum bkey_type type, - const struct bkey_format *f, - struct bkey_packed *k) -{ - const struct bkey_ops *ops = bch_bkey_ops[type]; - - bch_bkey_swab_key(f, k); - - if (ops->swab) - ops->swab(f, k); -} diff --git a/libbcache/bkey_methods.h b/libbcache/bkey_methods.h deleted file mode 100644 index 111b1789..00000000 --- a/libbcache/bkey_methods.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef _BCACHE_BKEY_METHODS_H -#define _BCACHE_BKEY_METHODS_H - -#include "bkey.h" - -#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val, - -enum bkey_type { - DEFINE_BCH_BTREE_IDS() - BKEY_TYPE_BTREE, -}; - -/* Type of a key in btree @id at level @level: */ -static inline enum bkey_type bkey_type(unsigned level, enum btree_id id) -{ - return level ? 
BKEY_TYPE_BTREE : id; -} - -static inline bool btree_type_has_ptrs(enum bkey_type type) -{ - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - return true; - default: - return false; - } -} - -struct bch_fs; -struct btree; -struct bkey; - -enum merge_result { - BCH_MERGE_NOMERGE, - - /* - * The keys were mergeable, but would have overflowed size - so instead - * l was changed to the maximum size, and both keys were modified: - */ - BCH_MERGE_PARTIAL, - BCH_MERGE_MERGE, -}; - -typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *, - struct bkey_s); -typedef enum merge_result (*key_merge_fn)(struct bch_fs *, - struct btree *, - struct bkey_i *, struct bkey_i *); - -struct bkey_ops { - /* Returns reason for being invalid if invalid, else NULL: */ - const char * (*key_invalid)(const struct bch_fs *, - struct bkey_s_c); - void (*key_debugcheck)(struct bch_fs *, struct btree *, - struct bkey_s_c); - void (*val_to_text)(struct bch_fs *, char *, - size_t, struct bkey_s_c); - void (*swab)(const struct bkey_format *, struct bkey_packed *); - key_filter_fn key_normalize; - key_merge_fn key_merge; - bool is_extents; -}; - -const char *bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); -const char *btree_bkey_invalid(struct bch_fs *, struct btree *, - struct bkey_s_c); - -void bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -void bch_val_to_text(struct bch_fs *, enum bkey_type, - char *, size_t, struct bkey_s_c); -void bch_bkey_val_to_text(struct bch_fs *, enum bkey_type, - char *, size_t, struct bkey_s_c); - -void bch_bkey_swab(enum bkey_type, const struct bkey_format *, - struct bkey_packed *); - -extern const struct bkey_ops *bch_bkey_ops[]; - -#undef DEF_BTREE_ID - -#endif /* _BCACHE_BKEY_METHODS_H */ diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c deleted file mode 100644 index a4522ad2..00000000 --- a/libbcache/blockdev.c +++ /dev/null @@ -1,819 +0,0 @@ - -#include "bcache.h" -#include "blockdev.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "checksum.h" -#include "error.h" -#include "inode.h" -#include "request.h" -#include "super-io.h" -#include "writeback.h" - -#include <linux/kthread.h> -#include <linux/module.h> -#include <linux/random.h> - -static int bch_blockdev_major; -static DEFINE_IDA(bch_blockdev_minor); -static LIST_HEAD(uncached_devices); -static DEFINE_MUTEX(bch_blockdev_lock); - -static struct kmem_cache *bch_search_cache; - -static void write_bdev_super_endio(struct bio *bio) -{ - struct cached_dev *dc = bio->bi_private; - /* XXX: error checking */ - - closure_put(&dc->sb_write); -} - -static void bch_write_bdev_super_unlock(struct closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); - - up(&dc->sb_write_mutex); -} - -void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) -{ - struct backingdev_sb *sb = dc->disk_sb.sb; - struct closure *cl = &dc->sb_write; - struct bio *bio = dc->disk_sb.bio; - - down(&dc->sb_write_mutex); - closure_init(cl, parent); - - sb->csum = csum_vstruct(NULL, BCH_CSUM_CRC64, - (struct nonce) { 0 }, sb).lo; - - bio_reset(bio); - bio->bi_bdev = dc->disk_sb.bdev; - bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); - bio->bi_iter.bi_size = - roundup(vstruct_bytes(sb), - bdev_logical_block_size(dc->disk_sb.bdev)); - bio->bi_end_io = write_bdev_super_endio; - bio->bi_private = dc; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA|REQ_META); - bch_bio_map(bio, sb); - - closure_get(cl); - - closure_return_with_destructor(cl, 
bch_write_bdev_super_unlock); -} - -static int open_dev(struct block_device *b, fmode_t mode) -{ - struct bcache_device *d = b->bd_disk->private_data; - - if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) - return -ENXIO; - - closure_get(&d->cl); - return 0; -} - -static void release_dev(struct gendisk *b, fmode_t mode) -{ - struct bcache_device *d = b->private_data; - - closure_put(&d->cl); -} - -static int ioctl_dev(struct block_device *b, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct bcache_device *d = b->bd_disk->private_data; - - return d->ioctl(d, mode, cmd, arg); -} - -static const struct block_device_operations bcache_ops = { - .open = open_dev, - .release = release_dev, - .ioctl = ioctl_dev, - .owner = THIS_MODULE, -}; - -void bch_blockdev_stop(struct bcache_device *d) -{ - if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) - closure_queue(&d->cl); -} - -static void bcache_device_unlink(struct bcache_device *d) -{ - if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { - sysfs_remove_link(&d->c->kobj, d->name); - sysfs_remove_link(&d->kobj, "cache"); - } -} - -static void bcache_device_link(struct bcache_device *d, struct bch_fs *c, - const char *name) -{ - snprintf(d->name, BCACHEDEVNAME_SIZE, - "%s%llu", name, bcache_dev_inum(d)); - - WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || - sysfs_create_link(&c->kobj, &d->kobj, d->name), - "Couldn't create device <-> cache set symlinks"); - - clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); -} - -static void bcache_device_detach(struct bcache_device *d) -{ - if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { - mutex_lock(&d->inode_lock); - bch_inode_rm(d->c, bcache_dev_inum(d)); - mutex_unlock(&d->inode_lock); - } - - bcache_device_unlink(d); - - radix_tree_delete(&d->c->devices, bcache_dev_inum(d)); - - closure_put(&d->c->caching); - d->c = NULL; -} - -static int bcache_device_attach(struct bcache_device *d, struct bch_fs *c) -{ - int ret; - - ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d); - if (ret) { - pr_err("radix_tree_insert() error for inum %llu", - bcache_dev_inum(d)); - return ret; - } - - d->c = c; - closure_get(&c->caching); - - return ret; -} - -static void bcache_device_free(struct bcache_device *d) -{ - pr_info("%s stopped", d->disk->disk_name); - - if (d->c) - bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { - ida_simple_remove(&bch_blockdev_minor, d->disk->first_minor); - put_disk(d->disk); - } - - bioset_exit(&d->bio_split); - - closure_debug_destroy(&d->cl); -} - -static int bcache_device_init(struct bcache_device *d, unsigned block_size, - sector_t sectors) -{ - struct request_queue *q; - int minor; - - mutex_init(&d->inode_lock); - - minor = ida_simple_get(&bch_blockdev_minor, 0, MINORMASK + 1, GFP_KERNEL); - if (minor < 0) { - pr_err("cannot allocate minor"); - return minor; - } - - if (!(d->disk = alloc_disk(1)) || - bioset_init(&d->bio_split, 4, offsetof(struct bch_read_bio, bio))) { - pr_err("cannot allocate disk"); - ida_simple_remove(&bch_blockdev_minor, minor); - return -ENOMEM; - } - - set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); - - d->disk->major = bch_blockdev_major; - d->disk->first_minor = minor; - d->disk->fops = &bcache_ops; - d->disk->private_data = d; - - q = blk_alloc_queue(GFP_KERNEL); - if (!q) { - pr_err("cannot allocate queue"); - return -ENOMEM; - } - - 
blk_queue_make_request(q, NULL); - d->disk->queue = q; - q->queuedata = d; - q->backing_dev_info.congested_data = d; - q->limits.max_hw_sectors = UINT_MAX; - q->limits.max_sectors = UINT_MAX; - q->limits.max_segment_size = UINT_MAX; - q->limits.max_segments = BIO_MAX_PAGES; - blk_queue_max_discard_sectors(q, UINT_MAX); - q->limits.discard_granularity = 512; - q->limits.io_min = block_size; - q->limits.logical_block_size = block_size; - q->limits.physical_block_size = block_size; - set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); - clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); - set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); - - blk_queue_write_cache(q, true, true); - - return 0; -} - -/* Cached device */ - -static void calc_cached_dev_sectors(struct bch_fs *c) -{ - u64 sectors = 0; - struct cached_dev *dc; - - list_for_each_entry(dc, &c->cached_devs, list) - sectors += bdev_sectors(dc->disk_sb.bdev); - - c->cached_dev_sectors = sectors; -} - -void bch_cached_dev_run(struct cached_dev *dc) -{ - struct bcache_device *d = &dc->disk; - char buf[BCH_SB_LABEL_SIZE + 1]; - char *env[] = { - "DRIVER=bcache", - kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", - dc->disk_sb.sb->disk_uuid.b), - NULL, - NULL, - }; - - memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); - buf[BCH_SB_LABEL_SIZE] = '\0'; - env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - - if (atomic_xchg(&dc->running, 1)) { - kfree(env[1]); - kfree(env[2]); - return; - } - - if (!d->c && - BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_NONE) { - struct closure cl; - - closure_init_stack(&cl); - - SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_STALE); - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - } - - add_disk(d->disk); - bd_link_disk_holder(dc->disk_sb.bdev, dc->disk.disk); - /* won't show up in the uevent file, use udevadm monitor -e instead - * only class / kset properties are persistent */ - kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); - kfree(env[1]); - kfree(env[2]); - - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) - pr_debug("error creating sysfs link"); -} - -static void cached_dev_detach_finish(struct work_struct *w) -{ - struct cached_dev *dc = container_of(w, struct cached_dev, detach); - char buf[BDEVNAME_SIZE]; - struct closure cl; - - closure_init_stack(&cl); - - BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(atomic_read(&dc->count)); - - mutex_lock(&bch_blockdev_lock); - - memset(&dc->disk_sb.sb->set_uuid, 0, 16); - SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE); - - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - - bcache_device_detach(&dc->disk); - list_move(&dc->list, &uncached_devices); - - clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); - clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); - - mutex_unlock(&bch_blockdev_lock); - - pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf)); - - /* Drop ref we took in cached_dev_detach() */ - closure_put(&dc->disk.cl); -} - -void bch_cached_dev_detach(struct cached_dev *dc) -{ - if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) - return; - - if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - return; - - /* - * Block the device from being closed and freed until we're finished - * detaching - */ - closure_get(&dc->disk.cl); - - dc->writeback_pd.rate.rate = UINT_MAX; - bch_writeback_queue(dc); - cached_dev_put(dc); -} - -int bch_cached_dev_attach(struct cached_dev 
*dc, struct bch_fs *c) -{ - __le64 rtime = cpu_to_le64(ktime_get_seconds()); - char buf[BDEVNAME_SIZE]; - bool found; - int ret; - - lockdep_assert_held(&c->state_lock); - - bdevname(dc->disk_sb.bdev, buf); - - if (memcmp(&dc->disk_sb.sb->set_uuid, - &c->sb.uuid, - sizeof(c->sb.uuid))) - return -ENOENT; - - if (dc->disk.c) { - pr_err("Can't attach %s: already attached", buf); - return -EINVAL; - } - - if (!bch_fs_running(c)) { - pr_err("Can't attach %s: not running", buf); - return -EINVAL; - } - - if (le16_to_cpu(dc->disk_sb.sb->block_size) < c->sb.block_size) { - /* Will die */ - pr_err("Couldn't attach %s: block size less than set's block size", - buf); - return -EINVAL; - } - - found = !bch_cached_dev_inode_find_by_uuid(c, - &dc->disk_sb.sb->disk_uuid, - &dc->disk.inode); - - if (!found && BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) { - pr_err("Couldn't find uuid for %s in set", buf); - return -ENOENT; - } - - if (found && - (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE || - BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE)) { - found = false; - bch_inode_rm(c, bcache_dev_inum(&dc->disk)); - } - - /* Deadlocks since we're called via sysfs... - sysfs_remove_file(&dc->kobj, &sysfs_attach); - */ - - if (!found) { - struct closure cl; - - closure_init_stack(&cl); - - bkey_inode_blockdev_init(&dc->disk.inode.k_i); - dc->disk.inode.k.type = BCH_INODE_BLOCKDEV; - SET_CACHED_DEV(&dc->disk.inode.v, true); - dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid; - memcpy(dc->disk.inode.v.i_label, - dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); - dc->disk.inode.v.i_ctime = rtime; - dc->disk.inode.v.i_mtime = rtime; - - ret = bch_inode_create(c, &dc->disk.inode.k_i, - 0, BLOCKDEV_INODE_MAX, - &c->unused_inode_hint); - if (ret) { - pr_err("Error %d, not caching %s", ret, buf); - return ret; - } - - pr_info("attached inode %llu", bcache_dev_inum(&dc->disk)); - - dc->disk_sb.sb->set_uuid = c->sb.uuid; - SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN); - - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - } else { - dc->disk.inode.v.i_mtime = rtime; - bch_btree_update(c, BTREE_ID_INODES, - &dc->disk.inode.k_i, NULL); - } - - /* Count dirty sectors before attaching */ - if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) - bch_sectors_dirty_init(dc, c); - - ret = bcache_device_attach(&dc->disk, c); - if (ret) - return ret; - - list_move(&dc->list, &c->cached_devs); - calc_cached_dev_sectors(c); - - /* - * dc->c must be set before dc->count != 0 - paired with the mb in - * cached_dev_get() - */ - smp_wmb(); - atomic_set(&dc->count, 1); - - if (bch_cached_dev_writeback_start(dc)) - return -ENOMEM; - - if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) { - atomic_set(&dc->has_dirty, 1); - atomic_inc(&dc->count); - } - - bch_cached_dev_run(dc); - bcache_device_link(&dc->disk, c, "bdev"); - - pr_info("Caching %s as %s on set %pU", - bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name, - dc->disk.c->sb.uuid.b); - return 0; -} - -void bch_attach_backing_devs(struct bch_fs *c) -{ - struct cached_dev *dc, *t; - - lockdep_assert_held(&c->state_lock); - - mutex_lock(&bch_blockdev_lock); - - list_for_each_entry_safe(dc, t, &uncached_devices, list) - bch_cached_dev_attach(dc, c); - - mutex_unlock(&bch_blockdev_lock); -} - -void bch_cached_dev_release(struct kobject *kobj) -{ - struct cached_dev *dc = container_of(kobj, struct cached_dev, - disk.kobj); - kfree(dc); - module_put(THIS_MODULE); -} - -static void cached_dev_free(struct closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct 
cached_dev, disk.cl); - - bch_cached_dev_writeback_stop(dc); - bch_cached_dev_writeback_free(dc); - - mutex_lock(&bch_blockdev_lock); - - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk); - bcache_device_free(&dc->disk); - list_del(&dc->list); - - mutex_unlock(&bch_blockdev_lock); - - bch_free_super((void *) &dc->disk_sb); - - kobject_put(&dc->disk.kobj); -} - -static void cached_dev_flush(struct closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - struct bcache_device *d = &dc->disk; - - bch_cache_accounting_destroy(&dc->accounting); - bcache_device_unlink(d); - kobject_del(&d->kobj); - - continue_at(cl, cached_dev_free, system_wq); -} - -static int cached_dev_init(struct cached_dev *dc, unsigned block_size) -{ - int ret; - struct io *io; - struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev); - - dc->sequential_cutoff = 4 << 20; - - for (io = dc->io; io < dc->io + RECENT_IO; io++) { - list_add(&io->lru, &dc->io_lru); - hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); - } - - dc->disk.stripe_size = q->limits.io_opt >> 9; - - if (dc->disk.stripe_size) - dc->partial_stripes_expensive = - q->limits.raid_partial_stripes_expensive; - - ret = bcache_device_init(&dc->disk, block_size, - dc->disk_sb.bdev->bd_part->nr_sects - - le64_to_cpu(dc->disk_sb.sb->data_offset)); - if (ret) - return ret; - - dc->disk.disk->queue->backing_dev_info.ra_pages = - max(dc->disk.disk->queue->backing_dev_info.ra_pages, - q->backing_dev_info.ra_pages); - - bch_cached_dev_request_init(dc); - ret = bch_cached_dev_writeback_init(dc); - if (ret) - return ret; - - return 0; -} - -/* Cached device - bcache superblock */ - -static const char *bdev_validate_super(struct backingdev_sb *sb) -{ - switch (le64_to_cpu(sb->version)) { - case BCACHE_SB_VERSION_BDEV: - sb->data_offset = cpu_to_le64(BDEV_DATA_START_DEFAULT); - break; - case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: - if (le64_to_cpu(sb->data_offset) < BDEV_DATA_START_DEFAULT) - return "Bad data offset"; - - break; - default: - return"Unsupported superblock version"; - } - - sb->last_mount = cpu_to_le32(get_seconds()); - - return NULL; -} - -const char *bch_backing_dev_register(struct bcache_superblock *sb) -{ - char name[BDEVNAME_SIZE]; - const char *err; - struct bch_fs *c; - struct cached_dev *dc; - - dc = kzalloc(sizeof(*dc), GFP_KERNEL); - if (!dc) - return "cannot allocate memory"; - - __module_get(THIS_MODULE); - INIT_LIST_HEAD(&dc->list); - closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); - kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); - INIT_WORK(&dc->detach, cached_dev_detach_finish); - sema_init(&dc->sb_write_mutex, 1); - INIT_LIST_HEAD(&dc->io_lru); - spin_lock_init(&dc->io_lock); - bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - - memcpy(&dc->disk_sb, sb, sizeof(*sb)); - dc->disk_sb.bdev->bd_holder = dc; - memset(sb, 0, sizeof(*sb)); - - err = bdev_validate_super(dc->disk_sb.sb); - if (err) - goto err; - - if (cached_dev_init(dc, le16_to_cpu(dc->disk_sb.sb->block_size) << 9)) - goto err; - - err = "error creating kobject"; - if (kobject_add(&dc->disk.kobj, - &part_to_dev(dc->disk_sb.bdev->bd_part)->kobj, - "bcache")) - goto err; - - err = "error accounting kobject"; - if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) - goto err; - - pr_info("registered backing device %s", - bdevname(dc->disk_sb.bdev, name)); - - list_add(&dc->list, &uncached_devices); - c = 
bch_uuid_to_fs(dc->disk_sb.sb->set_uuid); - if (c) { - bch_cached_dev_attach(dc, c); - closure_put(&c->cl); - } - - if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE || - BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE) - bch_cached_dev_run(dc); - - return NULL; -err: - bch_blockdev_stop(&dc->disk); - return err; -} - -/* Flash only volumes */ - -void bch_blockdev_volume_release(struct kobject *kobj) -{ - struct bcache_device *d = container_of(kobj, struct bcache_device, - kobj); - kfree(d); -} - -static void blockdev_volume_free(struct closure *cl) -{ - struct bcache_device *d = container_of(cl, struct bcache_device, cl); - - bcache_device_free(d); - kobject_put(&d->kobj); -} - -static void blockdev_volume_flush(struct closure *cl) -{ - struct bcache_device *d = container_of(cl, struct bcache_device, cl); - - bcache_device_unlink(d); - kobject_del(&d->kobj); - continue_at(cl, blockdev_volume_free, system_wq); -} - -static int blockdev_volume_run(struct bch_fs *c, - struct bkey_s_c_inode_blockdev inode) -{ - struct bcache_device *d = kzalloc(sizeof(struct bcache_device), - GFP_KERNEL); - int ret = -ENOMEM; - - if (!d) - return ret; - - bkey_reassemble(&d->inode.k_i, inode.s_c); - - closure_init(&d->cl, NULL); - set_closure_fn(&d->cl, blockdev_volume_flush, system_wq); - - kobject_init(&d->kobj, &bch_blockdev_volume_ktype); - - ret = bcache_device_init(d, block_bytes(c), - le64_to_cpu(inode.v->i_size) >> 9); - if (ret) - goto err; - - ret = bcache_device_attach(d, c); - if (ret) - goto err; - - bch_blockdev_volume_request_init(d); - add_disk(d->disk); - - if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) - goto err; - - bcache_device_link(d, c, "volume"); - - return 0; -err: - kobject_put(&d->kobj); - return ret; -} - -int bch_blockdev_volumes_start(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_inode_blockdev inode; - int ret = 0; - - if (!bch_fs_running(c)) - return -EINVAL; - - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { - if (k.k->p.inode >= BLOCKDEV_INODE_MAX) - break; - - if (k.k->type != BCH_INODE_BLOCKDEV) - continue; - - inode = bkey_s_c_to_inode_blockdev(k); - - ret = blockdev_volume_run(c, inode); - if (ret) - break; - } - bch_btree_iter_unlock(&iter); - - return ret; -} - -int bch_blockdev_volume_create(struct bch_fs *c, u64 size) -{ - __le64 rtime = cpu_to_le64(ktime_get_seconds()); - struct bkey_i_inode_blockdev inode; - int ret; - - bkey_inode_blockdev_init(&inode.k_i); - get_random_bytes(&inode.v.i_uuid, sizeof(inode.v.i_uuid)); - inode.v.i_ctime = rtime; - inode.v.i_mtime = rtime; - inode.v.i_size = cpu_to_le64(size); - - ret = bch_inode_create(c, &inode.k_i, 0, BLOCKDEV_INODE_MAX, - &c->unused_inode_hint); - if (ret) { - pr_err("Can't create volume: %d", ret); - return ret; - } - - return blockdev_volume_run(c, inode_blockdev_i_to_s_c(&inode)); -} - -void bch_blockdevs_stop(struct bch_fs *c) -{ - struct cached_dev *dc; - struct bcache_device *d; - struct radix_tree_iter iter; - void **slot; - - mutex_lock(&bch_blockdev_lock); - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &c->devices, &iter, 0) { - d = radix_tree_deref_slot(slot); - - if (CACHED_DEV(&d->inode.v) && - test_bit(BCH_FS_DETACHING, &c->flags)) { - dc = container_of(d, struct cached_dev, disk); - bch_cached_dev_detach(dc); - } else { - bch_blockdev_stop(d); - } - } - - rcu_read_unlock(); - mutex_unlock(&bch_blockdev_lock); -} - -void bch_fs_blockdev_exit(struct bch_fs *c) -{ - mempool_exit(&c->search); -} - -int 
bch_fs_blockdev_init(struct bch_fs *c) -{ - return mempool_init_slab_pool(&c->search, 1, bch_search_cache); -} - -void bch_blockdev_exit(void) -{ - kmem_cache_destroy(bch_search_cache); - - if (bch_blockdev_major >= 0) - unregister_blkdev(bch_blockdev_major, "bcache"); -} - -int __init bch_blockdev_init(void) -{ - bch_blockdev_major = register_blkdev(0, "bcache"); - if (bch_blockdev_major < 0) - return bch_blockdev_major; - - bch_search_cache = KMEM_CACHE(search, 0); - if (!bch_search_cache) - return -ENOMEM; - - return 0; -} diff --git a/libbcache/blockdev.h b/libbcache/blockdev.h deleted file mode 100644 index 5423d776..00000000 --- a/libbcache/blockdev.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef _BCACHE_BLOCKDEV_H -#define _BCACHE_BLOCKDEV_H - -#include "blockdev_types.h" -#include "io_types.h" - -struct search { - /* Stack frame for bio_complete */ - struct closure cl; - - union { - struct bch_read_bio rbio; - struct bch_write_bio wbio; - }; - /* Not modified */ - struct bio *orig_bio; - struct bcache_device *d; - - unsigned inode; - unsigned write:1; - - /* Flags only used for reads */ - unsigned recoverable:1; - unsigned read_dirty_data:1; - unsigned cache_miss:1; - - /* - * For reads: bypass read from cache and insertion into cache - * For writes: discard key range from cache, sending the write to - * the backing device (if there is a backing device) - */ - unsigned bypass:1; - - unsigned long start_time; - - /* - * Mostly only used for writes. For reads, we still make use of - * some trivial fields: - * - c - * - error - */ - struct bch_write_op iop; -}; - -#ifndef NO_BCACHE_BLOCKDEV - -extern struct kobj_type bch_cached_dev_ktype; -extern struct kobj_type bch_blockdev_volume_ktype; - -void bch_write_bdev_super(struct cached_dev *, struct closure *); - -void bch_cached_dev_release(struct kobject *); -void bch_blockdev_volume_release(struct kobject *); - -int bch_cached_dev_attach(struct cached_dev *, struct bch_fs *); -void bch_attach_backing_devs(struct bch_fs *); - -void bch_cached_dev_detach(struct cached_dev *); -void bch_cached_dev_run(struct cached_dev *); -void bch_blockdev_stop(struct bcache_device *); - -const char *bch_backing_dev_register(struct bcache_superblock *); - -int bch_blockdev_volume_create(struct bch_fs *, u64); -int bch_blockdev_volumes_start(struct bch_fs *); - -void bch_blockdevs_stop(struct bch_fs *); - -void bch_fs_blockdev_exit(struct bch_fs *); -int bch_fs_blockdev_init(struct bch_fs *); -void bch_blockdev_exit(void); -int bch_blockdev_init(void); - -#else - -static inline void bch_write_bdev_super(struct cached_dev *dc, - struct closure *cl) {} - -static inline void bch_cached_dev_release(struct kobject *kobj) {} -static inline void bch_blockdev_volume_release(struct kobject *kobj) {} - -static inline int bch_cached_dev_attach(struct cached_dev *dc, struct bch_fs *c) -{ - return 0; -} -static inline void bch_attach_backing_devs(struct bch_fs *c) {} - -static inline void bch_cached_dev_detach(struct cached_dev *dc) {} -static inline void bch_cached_dev_run(struct cached_dev *dc) {} -static inline void bch_blockdev_stop(struct bcache_device *d) {} - -static inline const char *bch_backing_dev_register(struct bcache_superblock *sb) -{ - return "not implemented"; -} - -static inline int bch_blockdev_volume_create(struct bch_fs *c, u64 s) { return 0; } -static inline int bch_blockdev_volumes_start(struct bch_fs *c) { return 0; } - -static inline void bch_blockdevs_stop(struct bch_fs *c) {} -static inline void bch_fs_blockdev_exit(struct bch_fs *c) {} -static 
inline int bch_fs_blockdev_init(struct bch_fs *c) { return 0; } -static inline void bch_blockdev_exit(void) {} -static inline int bch_blockdev_init(void) { return 0; } - -#endif - -static inline void cached_dev_put(struct cached_dev *dc) -{ - if (atomic_dec_and_test(&dc->count)) - schedule_work(&dc->detach); -} - -static inline bool cached_dev_get(struct cached_dev *dc) -{ - if (!atomic_inc_not_zero(&dc->count)) - return false; - - /* Paired with the mb in cached_dev_attach */ - smp_mb__after_atomic(); - return true; -} - -static inline u64 bcache_dev_inum(struct bcache_device *d) -{ - return d->inode.k.p.inode; -} - -static inline struct bcache_device *bch_dev_find(struct bch_fs *c, u64 inode) -{ - return radix_tree_lookup(&c->devices, inode); -} - -#endif /* _BCACHE_BLOCKDEV_H */ diff --git a/libbcache/blockdev_types.h b/libbcache/blockdev_types.h deleted file mode 100644 index e5172004..00000000 --- a/libbcache/blockdev_types.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _BCACHE_BLOCKDEV_TYPES_H -#define _BCACHE_BLOCKDEV_TYPES_H - -#include "keybuf_types.h" -#include "stats_types.h" -#include "super_types.h" -#include "util.h" - -struct bcache_device { - struct closure cl; - - struct kobject kobj; - - struct bch_fs *c; - - struct rb_node node; - struct bkey_i_inode_blockdev inode; - struct mutex inode_lock; - -#define BCACHEDEVNAME_SIZE 12 - char name[BCACHEDEVNAME_SIZE]; - - struct gendisk *disk; - - unsigned long flags; -#define BCACHE_DEV_CLOSING 0 -#define BCACHE_DEV_DETACHING 1 -#define BCACHE_DEV_UNLINK_DONE 2 - - unsigned nr_stripes; - unsigned stripe_size; - atomic_t *stripe_sectors_dirty; - unsigned long *full_dirty_stripes; - - struct bio_set bio_split; - - unsigned data_csum:1; - - int (*ioctl)(struct bcache_device *, fmode_t, unsigned, unsigned long); -}; - -struct io { - /* Used to track sequential IO so it can be skipped */ - struct hlist_node hash; - struct list_head lru; - - unsigned long last_io; - unsigned sequential; - sector_t last; -}; - -struct cached_dev { - struct list_head list; - struct bcache_device disk; - - //struct backingdev_sb sb; - - struct { - struct backingdev_sb *sb; - struct block_device *bdev; - struct bio *bio; - unsigned page_order; - } disk_sb; - struct closure sb_write; - struct semaphore sb_write_mutex; - - /* Refcount on the cache set. Always nonzero when we're caching. */ - atomic_t count; - struct work_struct detach; - - /* - * Device might not be running if it's dirty and the cache set hasn't - * showed up yet. - */ - atomic_t running; - - /* - * Writes take a shared lock from start to finish; scanning for dirty - * data to refill the rb tree requires an exclusive lock. - */ - struct rw_semaphore writeback_lock; - - /* - * Nonzero, and writeback has a refcount (d->count), iff there is dirty - * data in the cache. Protected by writeback_lock; must have an - * shared lock to set and exclusive lock to clear. 
- */ - atomic_t has_dirty; - - /* for dynamic rate control of writeback */ - struct bch_pd_controller writeback_pd; - struct delayed_work writeback_pd_update; - unsigned writeback_pd_update_seconds; - - struct task_struct *writeback_thread; - struct keybuf writeback_keys; - mempool_t writeback_io_pool; - mempool_t writeback_page_pool; - - /* For tracking sequential IO */ -#define RECENT_IO_BITS 7 -#define RECENT_IO (1 << RECENT_IO_BITS) - struct io io[RECENT_IO]; - struct hlist_head io_hash[RECENT_IO + 1]; - struct list_head io_lru; - spinlock_t io_lock; - - struct cache_accounting accounting; - - /* The rest of this all shows up in sysfs */ - unsigned sequential_cutoff; - unsigned readahead; - - unsigned verify:1; - unsigned bypass_torture_test:1; - - unsigned partial_stripes_expensive:1; - unsigned writeback_metadata:1; - unsigned writeback_running:1; - unsigned char writeback_percent; -}; - -#endif /* _BCACHE_BLOCKDEV_TYPES_H */ diff --git a/libbcache/bset.c b/libbcache/bset.c deleted file mode 100644 index a88d8017..00000000 --- a/libbcache/bset.c +++ /dev/null @@ -1,1846 +0,0 @@ -/* - * Code for working with individual keys, and sorted sets of keys with in a - * btree node - * - * Copyright 2012 Google, Inc. - */ - -#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ - -#include "eytzinger.h" -#include "util.h" -#include "bset.h" - -#include <asm/unaligned.h> -#include <linux/dynamic_fault.h> -#include <linux/console.h> -#include <linux/random.h> -#include <linux/prefetch.h> - -/* hack.. */ -#include "alloc_types.h" -#include <trace/events/bcache.h> - -struct bset_tree *bch_bkey_to_bset(struct btree *b, struct bkey_packed *k) -{ - struct bset_tree *t; - - for_each_bset(b, t) - if (k >= btree_bkey_first(b, t) && - k < btree_bkey_last(b, t)) - return t; - - BUG(); -} - -/* - * There are never duplicate live keys in the btree - but including keys that - * have been flagged as deleted (and will be cleaned up later) we _will_ see - * duplicates. - * - * Thus the sort order is: usual key comparison first, but for keys that compare - * equal the deleted key(s) come first, and the (at most one) live version comes - * last. - * - * The main reason for this is insertion: to handle overwrites, we first iterate - * over keys that compare equal to our insert key, and then insert immediately - * prior to the first key greater than the key we're inserting - our insert - * position will be after all keys that compare equal to our insert key, which - * by the time we actually do the insert will all be deleted. 
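 *
 * Editor's sketch (not part of the deleted file): a minimal, standalone
 * illustration of the ordering described above.  The types and helpers are
 * hypothetical stand-ins -- the position comparison is simplified (no
 * snapshot field) and none of these names exist in the tree.  It only shows
 * that keys order by position first and, at equal positions, deleted
 * (whiteout) keys sort before the single live key, so a new key is inserted
 * after all the keys it compares equal to.
 */

#include <stdbool.h>
#include <stdint.h>

struct demo_pos { uint64_t inode, offset; };

struct demo_key {
	struct demo_pos	pos;
	bool		deleted;
};

static int demo_pos_cmp(struct demo_pos l, struct demo_pos r)
{
	if (l.inode != r.inode)
		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

/* position first; at equal positions, deleted keys order before the live key */
static int demo_key_cmp(const struct demo_key *l, const struct demo_key *r)
{
	int cmp = demo_pos_cmp(l->pos, r->pos);

	if (cmp)
		return cmp;
	if (l->deleted != r->deleted)
		return l->deleted ? -1 : 1;
	return 0;
}

/*
 * End of editor's sketch; the original comment's closing delimiter and the
 * deleted code continue below.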
- */ - -void bch_dump_bset(struct btree *b, struct bset *i, unsigned set) -{ - struct bkey_packed *_k, *_n; - struct bkey k, n; - char buf[120]; - - if (!i->u64s) - return; - - for (_k = i->start, k = bkey_unpack_key(b, _k); - _k < vstruct_last(i); - _k = _n, k = n) { - _n = bkey_next(_k); - - bch_bkey_to_text(buf, sizeof(buf), &k); - printk(KERN_ERR "block %u key %zi/%u: %s\n", set, - _k->_data - i->_data, i->u64s, buf); - - if (_n == vstruct_last(i)) - continue; - - n = bkey_unpack_key(b, _n); - - if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) { - printk(KERN_ERR "Key skipped backwards\n"); - continue; - } - - /* - * Weird check for duplicate non extent keys: extents are - * deleted iff they have 0 size, so if it has zero size and it's - * not deleted these aren't extents: - */ - if (((!k.size && !bkey_deleted(&k)) || - (!n.size && !bkey_deleted(&n))) && - !bkey_deleted(&k) && - !bkey_cmp(n.p, k.p)) - printk(KERN_ERR "Duplicate keys\n"); - } -} - -void bch_dump_btree_node(struct btree *b) -{ - struct bset_tree *t; - - console_lock(); - for_each_bset(b, t) - bch_dump_bset(b, bset(b, t), t - b->set); - console_unlock(); -} - -void bch_dump_btree_node_iter(struct btree *b, - struct btree_node_iter *iter) -{ - struct btree_node_iter_set *set; - - printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets); - - btree_node_iter_for_each(iter, set) { - struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch_bkey_to_bset(b, k); - struct bkey uk = bkey_unpack_key(b, k); - char buf[100]; - - bch_bkey_to_text(buf, sizeof(buf), &uk); - printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, - k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); - } -} - -#ifdef CONFIG_BCACHE_DEBUG - -static bool keys_out_of_order(struct btree *b, - const struct bkey_packed *prev, - const struct bkey_packed *next, - bool is_extents) -{ - struct bkey nextu = bkey_unpack_key(b, next); - - return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 || - ((is_extents - ? 
!bkey_deleted(next) - : !bkey_deleted(prev)) && - !bkey_cmp_packed(b, prev, next)); -} - -void __bch_verify_btree_nr_keys(struct btree *b) -{ - struct bset_tree *t; - struct bkey_packed *k; - struct btree_nr_keys nr = { 0 }; - - for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) - if (!bkey_whiteout(k)) - btree_keys_account_key_add(&nr, t - b->set, k); - - BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -} - -static void bch_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) -{ - const struct bkey_packed *n = bch_btree_node_iter_peek_all(iter, b); - - bkey_unpack_key(b, k); - - if (n && - keys_out_of_order(b, k, n, iter->is_extents)) { - struct bkey ku = bkey_unpack_key(b, k); - struct bkey nu = bkey_unpack_key(b, n); - char buf1[80], buf2[80]; - - bch_dump_btree_node(b); - bch_bkey_to_text(buf1, sizeof(buf1), &ku); - bch_bkey_to_text(buf2, sizeof(buf2), &nu); - panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2); - } -} - -void bch_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) -{ - struct btree_node_iter_set *set; - struct bset_tree *t; - struct bkey_packed *k, *first; - - BUG_ON(iter->used > MAX_BSETS); - - if (!iter->used) - return; - - btree_node_iter_for_each(iter, set) { - k = __btree_node_offset_to_key(b, set->k); - t = bch_bkey_to_bset(b, k); - - BUG_ON(__btree_node_offset_to_key(b, set->end) != - btree_bkey_last(b, t)); - - BUG_ON(set + 1 < iter->data + iter->used && - btree_node_iter_cmp(iter, b, set[0], set[1]) > 0); - } - - first = __btree_node_offset_to_key(b, iter->data[0].k); - - for_each_bset(b, t) - if (bch_btree_node_iter_bset_pos(iter, b, t) == - btree_bkey_last(b, t) && - (k = bkey_prev_all(b, t, btree_bkey_last(b, t)))) - BUG_ON(__btree_node_iter_cmp(iter->is_extents, b, - k, first) > 0); -} - -void bch_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) -{ - struct bset_tree *t = bch_bkey_to_bset(b, where); - struct bkey_packed *k, *prev; - struct bkey uk, uw = bkey_unpack_key(b, where); - - k = bkey_prev_all(b, t, where); - if (k && - keys_out_of_order(b, k, where, iter->is_extents)) { - char buf1[100], buf2[100]; - - bch_dump_btree_node(b); - uk = bkey_unpack_key(b, k); - bch_bkey_to_text(buf1, sizeof(buf1), &uk); - bch_bkey_to_text(buf2, sizeof(buf2), &uw); - panic("out of order with prev:\n%s\n%s\n", - buf1, buf2); - } - - k = bkey_next(where); - BUG_ON(k != btree_bkey_last(b, t) && - keys_out_of_order(b, where, k, iter->is_extents)); - - for_each_bset(b, t) { - if (where >= btree_bkey_first(b, t) || - where < btree_bkey_last(b, t)) - continue; - - k = bch_btree_node_iter_bset_pos(iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bkey_prev_all(b, t, k); - - while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 && - (prev = bkey_prev_all(b, t, k))) - k = prev; - - for (; - k != btree_bkey_last(b, t); - k = bkey_next(k)) { - uk = bkey_unpack_key(b, k); - - if (iter->is_extents) { - BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 || - bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0)); - } else { - BUG_ON(!bkey_cmp(uw.p, uk.p) && - !bkey_deleted(&uk)); - } - - if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0) - break; - } - } -} - -#else - -static void bch_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) {} - -#endif - -/* Auxiliary search trees */ - -#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) -#define BFLOAT_FAILED_PREV (U8_MAX - 1) 
-#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) -#define BFLOAT_FAILED (U8_MAX - 2) - -#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) - -struct bkey_float { - u8 exponent; - u8 key_offset; - union { - u32 mantissa32; - struct { - u16 mantissa16; - u16 _pad; - }; - }; -} __packed; - -#define BFLOAT_32BIT_NR 32U - -static unsigned bkey_float_byte_offset(unsigned idx) -{ - int d = (idx - BFLOAT_32BIT_NR) << 1; - - d &= ~(d >> 31); - - return idx * 6 - d; -} - -struct ro_aux_tree { - struct bkey_float _d[0]; -}; - -struct rw_aux_tree { - u16 offset; - struct bpos k; -}; - -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but I realized the lookup code would touch slightly less - * memory if it was 128. - * - * It definites the number of bytes (in struct bset) per struct bkey_float in - * the auxiliar search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. - */ - -#define BSET_CACHELINE 128 - -/* Space required for the btree node keys */ -static inline size_t btree_keys_bytes(struct btree *b) -{ - return PAGE_SIZE << b->page_order; -} - -static inline size_t btree_keys_cachelines(struct btree *b) -{ - return btree_keys_bytes(b) / BSET_CACHELINE; -} - -static inline size_t btree_aux_data_bytes(struct btree *b) -{ - return btree_keys_cachelines(b) * 8; -} - -static inline size_t btree_aux_data_u64s(struct btree *b) -{ - return btree_aux_data_bytes(b) / sizeof(u64); -} - -static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -{ - BUG_ON(t->aux_data_offset == U16_MAX); - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - return t->aux_data_offset; - case BSET_RO_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(bkey_float_byte_offset(t->size) + - sizeof(u8) * t->size, 8); - case BSET_RW_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); - default: - BUG(); - } -} - -static unsigned bset_aux_tree_buf_start(const struct btree *b, - const struct bset_tree *t) -{ - return t == b->set - ? 
DIV_ROUND_UP(b->unpack_fn_len, 8) - : bset_aux_tree_buf_end(t - 1); -} - -static void *__aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - return b->aux_data + t->aux_data_offset * 8; -} - -static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t); -} - -static u8 *ro_aux_tree_prev(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -} - -static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, - unsigned idx) -{ - return (void *) b + bkey_float_byte_offset(idx); -} - -static struct bkey_float *bkey_float(const struct btree *b, - const struct bset_tree *t, - unsigned idx) -{ - return bkey_float_get(ro_aux_tree_base(b, t), idx); -} - -static void bset_aux_tree_verify(struct btree *b) -{ -#ifdef CONFIG_BCACHE_DEBUG - struct bset_tree *t; - - for_each_bset(b, t) { - if (t->aux_data_offset == U16_MAX) - continue; - - BUG_ON(t != b->set && - t[-1].aux_data_offset == U16_MAX); - - BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); - BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); - BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); - } -#endif -} - -/* Memory allocation */ - -void bch_btree_keys_free(struct btree *b) -{ - vfree(b->aux_data); - b->aux_data = NULL; -} - -int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) -{ - b->page_order = page_order; - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); - if (!b->aux_data) - return -ENOMEM; - - return 0; -} - -void bch_btree_keys_init(struct btree *b, bool *expensive_debug_checks) -{ - unsigned i; - - b->nsets = 0; - memset(&b->nr, 0, sizeof(b->nr)); -#ifdef CONFIG_BCACHE_DEBUG - b->expensive_debug_checks = expensive_debug_checks; -#endif - for (i = 0; i < MAX_BSETS; i++) - b->set[i].data_offset = U16_MAX; - - bch_bset_set_no_aux_tree(b, b->set); -} - -/* Binary tree stuff for auxiliary search trees */ - -/* - * Cacheline/offset <-> bkey pointer arithmetic: - * - * t->tree is a binary search tree in an array; each node corresponds to a key - * in one cacheline in t->set (BSET_CACHELINE bytes). - * - * This means we don't have to store the full index of the key that a node in - * the binary tree points to; eytzinger_to_inorder() gives us the cacheline, and - * then bkey_float->m gives us the offset within that cacheline, in units of 8 - * bytes. - * - * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to - * make this work. - * - * To construct the bfloat for an arbitrary key we need to know what the key - * immediately preceding it is: we have to check if the two keys differ in the - * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size - * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
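 *
 * Editor's sketch (not part of the deleted file): the arithmetic above in
 * one illustrative helper.  "keys_base" stands for the cacheline-aligned
 * start of the bset's keys; the names are hypothetical, and the real code
 * path is cacheline_to_bkey() just below.  With BSET_CACHELINE == 128 (as
 * defined earlier), in-order cacheline 3 and a key_offset of 5 give byte
 * 3 * 128 + 5 * 8 = 424 from keys_base -- so each tree node only needs a
 * cacheline index plus a small 8-byte-granular offset, not a full pointer.
 */

static inline void *demo_cacheline_to_key(void *keys_base, unsigned cacheline,
					  unsigned offset_in_u64s)
{
	return (char *) keys_base + cacheline * 128 + offset_in_u64s * 8;
}

/*
 * End of editor's sketch; the original comment closes below.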
- */ - -static inline void *bset_cacheline(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline) -{ - return (void *) round_down((unsigned long) btree_bkey_first(b, t), - L1_CACHE_BYTES) + - cacheline * BSET_CACHELINE; -} - -static struct bkey_packed *cacheline_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - unsigned offset) -{ - return bset_cacheline(b, t, cacheline) + offset * 8; -} - -static unsigned bkey_to_cacheline(const struct btree *b, - const struct bset_tree *t, - const struct bkey_packed *k) -{ - return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -} - -static ssize_t __bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -} - -static unsigned bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); - - EBUG_ON(m > U8_MAX); - return m; -} - -static inline struct bkey_packed *tree_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - return cacheline_to_bkey(b, t, - __eytzinger_to_inorder(j, t->size, t->extra), - bkey_float(b, t, j)->key_offset); -} - -static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; - - return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); -} - -static struct rw_aux_tree *rw_aux_tree(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - - return __aux_tree_base(b, t); -} - -/* - * For the write set - the one we're currently inserting keys into - we don't - * maintain a full search tree, we just keep a simple lookup table in t->prev. 
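 *
 * Editor's sketch (not part of the deleted file): the lookup over that
 * table, rw_aux_tree_bsearch() further down, is a standard lower-bound
 * binary search over the sorted u16 offsets.  A standalone equivalent with
 * hypothetical names:
 */

#include <stddef.h>
#include <stdint.h>

/* returns the index of the first element >= target, or n if none is */
static size_t demo_lower_bound(const uint16_t *a, size_t n, uint16_t target)
{
	size_t l = 0, r = n;

	while (l < r) {
		size_t m = l + (r - l) / 2;

		if (a[m] < target)
			l = m + 1;
		else
			r = m;
	}

	return l;
}

/*
 * End of editor's sketch; the original comment closes below.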
- */ -static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, - struct bset_tree *t, - unsigned j) -{ - return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -} - -static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, - unsigned j, struct bkey_packed *k) -{ - BUG_ON(k >= btree_bkey_last(b, t)); - - rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { - .offset = __btree_node_key_to_offset(b, k), - .k = bkey_unpack_pos(b, k), - }; -} - -static void bch_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) -{ - struct bkey_packed *k = btree_bkey_first(b, t); - unsigned j = 0; - - if (!btree_keys_expensive_checks(b)) - return; - - BUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - BUG_ON(t->size < 1); - BUG_ON(rw_aux_to_bkey(b, t, j) != k); - - goto start; - while (1) { - if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, - bkey_unpack_pos(b, k))); -start: - if (++j == t->size) - break; - - BUG_ON(rw_aux_tree(b, t)[j].offset <= - rw_aux_tree(b, t)[j - 1].offset); - } - - k = bkey_next(k); - BUG_ON(k >= btree_bkey_last(b, t)); - } -} - -/* returns idx of first entry >= offset: */ -static unsigned rw_aux_tree_bsearch(struct btree *b, - struct bset_tree *t, - unsigned offset) -{ - unsigned l = 0, r = t->size; - - BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - - while (l < r) { - unsigned m = (l + r) >> 1; - - if (rw_aux_tree(b, t)[m].offset < offset) - l = m + 1; - else - r = m; - } - - BUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset < offset); - BUG_ON(l && - rw_aux_tree(b, t)[l - 1].offset >= offset); - - BUG_ON(l > r); - BUG_ON(l > t->size); - - return l; -} - -static inline unsigned bfloat_mantissa(const struct bkey_float *f, - unsigned idx) -{ - return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; -} - -static inline void bfloat_mantissa_set(struct bkey_float *f, - unsigned idx, unsigned mantissa) -{ - if (idx < BFLOAT_32BIT_NR) - f->mantissa32 = mantissa; - else - f->mantissa16 = mantissa; -} - -static inline unsigned bkey_mantissa(const struct bkey_packed *k, - const struct bkey_float *f, - unsigned idx) -{ - u64 v; - - EBUG_ON(!bkey_packed(k)); - - v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); - - /* - * In little endian, we're shifting off low bits (and then the bits we - * want are at the low end), in big endian we're shifting off high bits - * (and then the bits we want are at the high end, so we shift them - * back down): - */ -#ifdef __LITTLE_ENDIAN - v >>= f->exponent & 7; -#else - v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); -#endif - return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; -} - -static void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) -{ - struct bkey_float *f = bkey_float(b, t, j); - struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *p = tree_to_prev_bkey(b, t, j); - struct bkey_packed *l, *r; - unsigned bits = j < BFLOAT_32BIT_NR ? 
32 : 16; - unsigned mantissa; - int shift, exponent; - - EBUG_ON(bkey_next(p) != m); - - if (is_power_of_2(j)) { - l = min_key; - - if (!l->u64s) { - if (!bkey_pack_pos(l, b->data->min_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = b->data->min_key; - bkey_copy(l, &tmp); - } - } - } else { - l = tree_to_prev_bkey(b, t, j >> ffs(j)); - - EBUG_ON(m < l); - } - - if (is_power_of_2(j + 1)) { - r = max_key; - - if (!r->u64s) { - if (!bkey_pack_pos(r, t->max_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = t->max_key; - bkey_copy(r, &tmp); - } - } - } else { - r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - EBUG_ON(m > r); - } - - /* - * for failed bfloats, the lookup code falls back to comparing against - * the original key. - */ - - if (!bkey_packed(l) || !bkey_packed(r) || - !bkey_packed(p) || !bkey_packed(m)) { - f->exponent = BFLOAT_FAILED_UNPACKED; - return; - } - - /* - * The greatest differing bit of l and r is the first bit we must - * include in the bfloat mantissa we're creating in order to do - * comparisons - that bit always becomes the high bit of - * bfloat->mantissa, and thus the exponent we're calculating here is - * the position of what will become the low bit in bfloat->mantissa: - * - * Note that this may be negative - we may be running off the low end - * of the key: we handle this later: - */ - exponent = (int) bkey_greatest_differing_bit(b, l, r) - (bits - 1); - - /* - * Then we calculate the actual shift value, from the start of the key - * (k->_data), to get the key bits starting at exponent: - */ -#ifdef __LITTLE_ENDIAN - shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - - EBUG_ON(shift + bits > b->format.key_u64s * 64); -#else - shift = high_bit_offset + - b->nr_key_bits - - exponent - - bits; - - EBUG_ON(shift < KEY_PACKED_BITS_START); -#endif - EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); - - f->exponent = shift; - mantissa = bkey_mantissa(m, f, j); - - /* - * If we've got garbage bits, set them to all 1s - it's legal for the - * bfloat to compare larger than the original key, but not smaller: - */ - if (exponent < 0) - mantissa |= ~(~0U << -exponent); - - bfloat_mantissa_set(f, j, mantissa); - - /* - * The bfloat must be able to tell its key apart from the previous key - - * if its key and the previous key don't differ in the required bits, - * flag as failed - unless the keys are actually equal, in which case - * we aren't required to return a specific one: - */ - if (exponent > 0 && - bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && - bkey_cmp_packed(b, p, m)) { - f->exponent = BFLOAT_FAILED_PREV; - return; - } - - /* - * f->mantissa must compare >= the original key - for transitivity with - * the comparison in bset_search_tree. If we're dropping set bits, - * increment it: - */ - if (exponent > (int) bkey_ffs(b, m)) { - if (j < BFLOAT_32BIT_NR - ? 
f->mantissa32 == U32_MAX - : f->mantissa16 == U16_MAX) - f->exponent = BFLOAT_FAILED_OVERFLOW; - - if (j < BFLOAT_32BIT_NR) - f->mantissa32++; - else - f->mantissa16++; - } -} - -/* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) -{ - bset_aux_tree_verify(b); - - return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); -} - -static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) -{ - unsigned bytes = __bset_tree_capacity(b, t); - - if (bytes < 7 * BFLOAT_32BIT_NR) - return bytes / 7; - - bytes -= 7 * BFLOAT_32BIT_NR; - - return BFLOAT_32BIT_NR + bytes / 5; -} - -static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) -{ - return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -} - -static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k; - - t->size = 1; - t->extra = BSET_RW_AUX_TREE_VAL; - rw_aux_tree(b, t)[0].offset = - __btree_node_key_to_offset(b, btree_bkey_first(b, t)); - - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) { - if (t->size == bset_rw_tree_capacity(b, t)) - break; - - if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > - L1_CACHE_BYTES) - rw_aux_tree_set(b, t, t->size++, k); - } -} - -static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); - struct bkey_packed min_key, max_key; - unsigned j, cacheline = 1; - - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - - t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), - bset_ro_tree_capacity(b, t)); -retry: - if (t->size < 2) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - return; - } - - t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; - - /* First we figure out where the first key in each cacheline is */ - eytzinger_for_each(j, t->size) { - while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next(k); - - if (k >= btree_bkey_last(b, t)) { - t->size--; - goto retry; - } - - ro_aux_tree_prev(b, t)[j] = prev->u64s; - bkey_float(b, t, j)->key_offset = - bkey_to_cacheline_offset(b, t, cacheline++, k); - - BUG_ON(tree_to_prev_bkey(b, t, j) != prev); - BUG_ON(tree_to_bkey(b, t, j) != k); - } - - while (bkey_next(k) != btree_bkey_last(b, t)) - k = bkey_next(k); - - t->max_key = bkey_unpack_pos(b, k); - - /* Then we build the tree */ - eytzinger_for_each(j, t->size) - make_bfloat(b, t, j, &min_key, &max_key); -} - -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -{ - struct bset_tree *i; - - for (i = b->set; i != t; i++) - BUG_ON(bset_has_rw_aux_tree(i)); - - bch_bset_set_no_aux_tree(b, t); - - /* round up to next cacheline: */ - t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), - SMP_CACHE_BYTES / sizeof(u64)); - - bset_aux_tree_verify(b); -} - -void bch_bset_build_aux_tree(struct btree *b, struct bset_tree *t, - bool writeable) -{ - if (writeable - ? 
bset_has_rw_aux_tree(t) - : bset_has_ro_aux_tree(t)) - return; - - bset_alloc_tree(b, t); - - if (!__bset_tree_capacity(b, t)) - return; - - if (writeable) - __build_rw_aux_tree(b, t); - else - __build_ro_aux_tree(b, t); - - bset_aux_tree_verify(b); -} - -void bch_bset_init_first(struct btree *b, struct bset *i) -{ - struct bset_tree *t; - - BUG_ON(b->nsets); - - memset(i, 0, sizeof(*i)); - get_random_bytes(&i->seq, sizeof(i->seq)); - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -void bch_bset_init_next(struct btree *b, struct bset *i) -{ - struct bset_tree *t; - - BUG_ON(b->nsets >= MAX_BSETS); - - memset(i, 0, sizeof(*i)); - i->seq = btree_bset_first(b)->seq; - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed *p; - unsigned offset; - int j; - - EBUG_ON(k < btree_bkey_first(b, t) || - k > btree_bkey_last(b, t)); - - if (k == btree_bkey_first(b, t)) - return NULL; - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - p = btree_bkey_first(b, t); - break; - case BSET_RO_AUX_TREE: - j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); - - do { - p = j ? tree_to_bkey(b, t, - __inorder_to_eytzinger(j--, - t->size, t->extra)) - : btree_bkey_first(b, t); - } while (p >= k); - break; - case BSET_RW_AUX_TREE: - offset = __btree_node_key_to_offset(b, k); - j = rw_aux_tree_bsearch(b, t, offset); - p = j ? rw_aux_to_bkey(b, t, j - 1) - : btree_bkey_first(b, t); - break; - } - - return p; -} - -struct bkey_packed *bkey_prev_all(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed *p; - - p = __bkey_prev(b, t, k); - if (!p) - return NULL; - - while (bkey_next(p) != k) - p = bkey_next(p); - - return p; -} - -struct bkey_packed *bkey_prev(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - while (1) { - struct bkey_packed *p, *i, *ret = NULL; - - p = __bkey_prev(b, t, k); - if (!p) - return NULL; - - for (i = p; i != k; i = bkey_next(i)) - if (!bkey_deleted(i)) - ret = i; - - if (ret) - return ret; - - k = p; - } -} - -/* Insert */ - -static void rw_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - unsigned offset = __btree_node_key_to_offset(b, k); - unsigned j = rw_aux_tree_bsearch(b, t, offset); - - if (j < t->size && - rw_aux_tree(b, t)[j].offset == offset) - rw_aux_tree_set(b, t, j, k); - - bch_bset_verify_rw_aux_tree(b, t); -} - -static void ro_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed min_key, max_key; - unsigned inorder, j; - - BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - - if (bkey_next(k) == btree_bkey_last(b, t)) { - t->max_key = bkey_unpack_pos(b, k); - - for (j = 1; j < t->size; j = j * 2 + 1) - make_bfloat(b, t, j, &min_key, &max_key); - } - - inorder = bkey_to_cacheline(b, t, k); - - if (inorder && - inorder < t->size) { - j = __inorder_to_eytzinger(inorder, t->size, t->extra); - - if (k == tree_to_bkey(b, t, j)) { - /* Fix the node this key corresponds to */ - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the right boundary */ - for (j = eytzinger_left_child(j); - j < t->size; - j = eytzinger_right_child(j)) - make_bfloat(b, t, j, 
&min_key, &max_key); - } - } - - if (inorder + 1 < t->size) { - j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra); - - if (k == tree_to_prev_bkey(b, t, j)) { - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the left boundary */ - for (j = eytzinger_right_child(j); - j < t->size; - j = eytzinger_left_child(j)) - make_bfloat(b, t, j, &min_key, &max_key); - } - } -} - -/** - * bch_bset_fix_invalidated_key() - given an existing key @k that has been - * modified, fix any auxiliary search tree by remaking all the nodes in the - * auxiliary search tree that @k corresponds to - */ -void bch_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - break; - case BSET_RO_AUX_TREE: - ro_aux_tree_fix_invalidated_key(b, t, k); - break; - case BSET_RW_AUX_TREE: - rw_aux_tree_fix_invalidated_key(b, t, k); - break; - } -} - -static void bch_bset_fix_lookup_table(struct btree *b, - struct bset_tree *t, - struct bkey_packed *_where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - int shift = new_u64s - clobber_u64s; - unsigned l, j, where = __btree_node_key_to_offset(b, _where); - - BUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - l = rw_aux_tree_bsearch(b, t, where); - - /* l is first >= than @where */ - - BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); - BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); - - if (!l) /* never delete first entry */ - l++; - else if (l < t->size && - where < t->end_offset && - rw_aux_tree(b, t)[l].offset == where) - rw_aux_tree_set(b, t, l++, _where); - - /* l now > where */ - - for (j = l; - j < t->size && - rw_aux_tree(b, t)[j].offset < where + clobber_u64s; - j++) - ; - - if (j < t->size && - rw_aux_tree(b, t)[j].offset + shift == - rw_aux_tree(b, t)[l - 1].offset) - j++; - - memmove(&rw_aux_tree(b, t)[l], - &rw_aux_tree(b, t)[j], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[j]); - t->size -= j - l; - - for (j = l; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; - - BUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset == - rw_aux_tree(b, t)[l - 1].offset); - - if (t->size < bset_rw_tree_capacity(b, t) && - (l < t->size - ? rw_aux_tree(b, t)[l].offset - : t->end_offset) - - rw_aux_tree(b, t)[l - 1].offset > - L1_CACHE_BYTES / sizeof(u64)) { - struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); - struct bkey_packed *end = l < t->size - ? 
rw_aux_to_bkey(b, t, l) - : btree_bkey_last(b, t); - struct bkey_packed *k = start; - - while (1) { - k = bkey_next(k); - if (k == end) - break; - - if ((void *) k - (void *) start >= L1_CACHE_BYTES) { - memmove(&rw_aux_tree(b, t)[l + 1], - &rw_aux_tree(b, t)[l], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[l]); - t->size++; - rw_aux_tree_set(b, t, l, k); - break; - } - } - } - - bch_bset_verify_rw_aux_tree(b, t); - bset_aux_tree_verify(b); -} - -void bch_bset_insert(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where, - struct bkey_i *insert, - unsigned clobber_u64s) -{ - struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed packed, *src = bkey_to_packed(insert); - - bch_bset_verify_rw_aux_tree(b, t); - - if (bkey_pack_key(&packed, &insert->k, f)) - src = &packed; - - if (!bkey_whiteout(&insert->k)) - btree_keys_account_key_add(&b->nr, t - b->set, src); - - if (src->u64s != clobber_u64s) { - u64 *src_p = where->_data + clobber_u64s; - u64 *dst_p = where->_data + src->u64s; - - BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < - (int) clobber_u64s - src->u64s); - - memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); - set_btree_bset_end(b, t); - } - - memcpy_u64s(where, src, - bkeyp_key_u64s(f, src)); - memcpy_u64s(bkeyp_val(f, where), &insert->v, - bkeyp_val_u64s(f, src)); - - bch_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); - - bch_verify_key_order(b, iter, where); - bch_verify_btree_nr_keys(b); -} - -void bch_bset_delete(struct btree *b, - struct bkey_packed *where, - unsigned clobber_u64s) -{ - struct bset_tree *t = bset_tree_last(b); - u64 *src_p = where->_data + clobber_u64s; - u64 *dst_p = where->_data; - - bch_bset_verify_rw_aux_tree(b, t); - - BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); - - memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); - set_btree_bset_end(b, t); - - bch_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -} - -/* Lookup */ - -__flatten -static struct bkey_packed *bset_search_write_set(const struct btree *b, - struct bset_tree *t, - struct bpos search, - const struct bkey_packed *packed_search) -{ - unsigned l = 0, r = t->size; - - while (l + 1 != r) { - unsigned m = (l + r) >> 1; - - if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0) - l = m; - else - r = m; - } - - return rw_aux_to_bkey(b, t, l); -} - -noinline -static int bset_search_tree_slowpath(const struct btree *b, - struct bset_tree *t, struct bpos *search, - const struct bkey_packed *packed_search, - unsigned n) -{ - return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), - packed_search, search) < 0; -} - -__flatten -static struct bkey_packed *bset_search_tree(const struct btree *b, - struct bset_tree *t, - struct bpos search, - const struct bkey_packed *packed_search) -{ - struct ro_aux_tree *base = ro_aux_tree_base(b, t); - struct bkey_float *f = bkey_float_get(base, 1); - void *p; - unsigned inorder, n = 1; - - while (1) { - if (likely(n << 4 < t->size)) { - p = bkey_float_get(base, n << 4); - prefetch(p); - } else if (n << 3 < t->size) { - inorder = __eytzinger_to_inorder(n, t->size, t->extra); - p = bset_cacheline(b, t, inorder); -#ifdef CONFIG_X86_64 - asm(".intel_syntax noprefix;" - "prefetcht0 [%0 - 127 + 64 * 0];" - "prefetcht0 [%0 - 127 + 64 * 1];" - "prefetcht0 [%0 - 127 + 64 * 2];" - "prefetcht0 [%0 - 127 + 64 * 3];" - ".att_syntax 
prefix;" - : - : "r" (p + 127)); -#else - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - prefetch(p + L1_CACHE_BYTES * 3); -#endif - } else if (n >= t->size) - break; - - f = bkey_float_get(base, n); - - if (packed_search && - likely(f->exponent < BFLOAT_FAILED)) - n = n * 2 + (bfloat_mantissa(f, n) < - bkey_mantissa(packed_search, f, n)); - else - n = n * 2 + bset_search_tree_slowpath(b, t, - &search, packed_search, n); - } while (n < t->size); - - inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra); - - /* - * n would have been the node we recursed to - the low bit tells us if - * we recursed left or recursed right. - */ - if (n & 1) { - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else { - if (--inorder) { - n = eytzinger_prev(n >> 1, t->size); - f = bkey_float_get(base, n); - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else - return btree_bkey_first(b, t); - } -} - -/* - * Returns the first key greater than or equal to @search - */ -__always_inline __flatten -static struct bkey_packed *bch_bset_search(struct btree *b, - struct bset_tree *t, - struct bpos search, - struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search, - bool strictly_greater) -{ - struct bkey_packed *m; - - /* - * First, we search for a cacheline, then lastly we do a linear search - * within that cacheline. - * - * To search for the cacheline, there's three different possibilities: - * * The set is too small to have a search tree, so we just do a linear - * search over the whole set. - * * The set is the one we're currently inserting into; keeping a full - * auxiliary search tree up to date would be too expensive, so we - * use a much simpler lookup table to do a binary search - - * bset_search_write_set(). 
- * * Or we use the auxiliary search tree we constructed earlier - - * bset_search_tree() - */ - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - m = btree_bkey_first(b, t); - break; - case BSET_RW_AUX_TREE: - m = bset_search_write_set(b, t, search, lossy_packed_search); - break; - case BSET_RO_AUX_TREE: - /* - * Each node in the auxiliary search tree covers a certain range - * of bits, and keys above and below the set it covers might - * differ outside those bits - so we have to special case the - * start and end - handle that here: - */ - - if (bkey_cmp(search, t->max_key) > 0) - return btree_bkey_last(b, t); - - m = bset_search_tree(b, t, search, lossy_packed_search); - break; - } - - if (lossy_packed_search) - while (m != btree_bkey_last(b, t) && - !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search, - m, strictly_greater)) - m = bkey_next(m); - - if (!packed_search) - while (m != btree_bkey_last(b, t) && - !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater)) - m = bkey_next(m); - - if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { - struct bkey_packed *prev = bkey_prev_all(b, t, m); - - BUG_ON(prev && - btree_iter_pos_cmp_p_or_unp(b, search, packed_search, - prev, strictly_greater)); - } - - return m; -} - -/* Btree node iterator */ - -void bch_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set *pos, n = - ((struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }); - - btree_node_iter_for_each(iter, pos) - if (btree_node_iter_cmp(iter, b, n, *pos) <= 0) - break; - - memmove(pos + 1, pos, - (void *) (iter->data + iter->used) - (void *) pos); - iter->used++; - *pos = n; - } -} - -noinline __flatten __attribute__((cold)) -static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, - struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) -{ - struct bset_tree *t; - - trace_bkey_pack_pos_fail(search); - - for_each_bset(b, t) - __bch_btree_node_iter_push(iter, b, - bch_bset_search(b, t, search, NULL, NULL, - strictly_greater), - btree_bkey_last(b, t)); - - bch_btree_node_iter_sort(iter, b); -} - -/** - * bch_btree_node_iter_init - initialize a btree node iterator, starting from a - * given position - * - * Main entry point to the lookup code for individual btree nodes: - * - * NOTE: - * - * When you don't filter out deleted keys, btree nodes _do_ contain duplicate - * keys. This doesn't matter for most code, but it does matter for lookups. - * - * Some adjacent keys with a string of equal keys: - * i j k k k k l m - * - * If you search for k, the lookup code isn't guaranteed to return you any - * specific k. The lookup code is conceptually doing a binary search and - * iterating backwards is very expensive so if the pivot happens to land at the - * last k that's what you'll get. - * - * This works out ok, but it's something to be aware of: - * - * - For non extents, we guarantee that the live key comes last - see - * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't - * see will only be deleted keys you don't care about. - * - * - For extents, deleted keys sort last (see the comment at the top of this - * file). But when you're searching for extents, you actually want the first - * key strictly greater than your search key - an extent that compares equal - * to the search key is going to have 0 sectors after the search key. 
- * - * But this does mean that we can't just search for - * bkey_successor(start_of_range) to get the first extent that overlaps with - * the range we want - if we're unlucky and there's an extent that ends - * exactly where we searched, then there could be a deleted key at the same - * position and we'd get that when we search instead of the preceding extent - * we needed. - * - * So we've got to search for start_of_range, then after the lookup iterate - * past any extents that compare equal to the position we searched for. - */ -void bch_btree_node_iter_init(struct btree_node_iter *iter, - struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) -{ - struct bset_tree *t; - struct bkey_packed p, *packed_search = NULL; - - EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); - bset_aux_tree_verify(b); - - __bch_btree_node_iter_init(iter, is_extents); - - //if (bkey_cmp(search, b->curr_max_key) > 0) - // return; - - switch (bkey_pack_pos_lossy(&p, search, b)) { - case BKEY_PACK_POS_EXACT: - packed_search = &p; - break; - case BKEY_PACK_POS_SMALLER: - packed_search = NULL; - break; - case BKEY_PACK_POS_FAIL: - btree_node_iter_init_pack_failed(iter, b, search, - strictly_greater, is_extents); - return; - } - - for_each_bset(b, t) - __bch_btree_node_iter_push(iter, b, - bch_bset_search(b, t, search, - packed_search, &p, - strictly_greater), - btree_bkey_last(b, t)); - - bch_btree_node_iter_sort(iter, b); -} - -void bch_btree_node_iter_init_from_start(struct btree_node_iter *iter, - struct btree *b, - bool is_extents) -{ - struct bset_tree *t; - - __bch_btree_node_iter_init(iter, is_extents); - - for_each_bset(b, t) - __bch_btree_node_iter_push(iter, b, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - bch_btree_node_iter_sort(iter, b); -} - -struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *iter, - struct btree *b, - struct bset_tree *t) -{ - struct btree_node_iter_set *set; - - BUG_ON(iter->used > MAX_BSETS); - - btree_node_iter_for_each(iter, set) - if (set->end == t->end_offset) - return __btree_node_offset_to_key(b, set->k); - - return btree_bkey_last(b, t); -} - -static inline void btree_node_iter_sift(struct btree_node_iter *iter, - struct btree *b, - unsigned start) -{ - unsigned i; - - EBUG_ON(iter->used > MAX_BSETS); - - for (i = start; - i + 1 < iter->used && - btree_node_iter_cmp(iter, b, iter->data[i], iter->data[i + 1]) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void btree_node_iter_sort_two(struct btree_node_iter *iter, - struct btree *b, - unsigned first) -{ - if (btree_node_iter_cmp(iter, b, - iter->data[first], - iter->data[first + 1]) > 0) - swap(iter->data[first], iter->data[first + 1]); -} - -void bch_btree_node_iter_sort(struct btree_node_iter *iter, - struct btree *b) -{ - EBUG_ON(iter->used > 3); - - /* unrolled bubble sort: */ - - if (iter->used > 2) { - btree_node_iter_sort_two(iter, b, 0); - btree_node_iter_sort_two(iter, b, 1); - } - - if (iter->used > 1) - btree_node_iter_sort_two(iter, b, 0); -} -EXPORT_SYMBOL(bch_btree_node_iter_sort); - -/** - * bch_btree_node_iter_advance - advance @iter by one key - * - * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might - * momentarily have out of order extents. 
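The iterator above is essentially an N-way merge with a very small N (at most MAX_BSETS sets), which is why an unrolled sort plus sifting beats a heap. A simplified sketch of the same idea over plain sorted int arrays (the ex_* names are invented for illustration, not from this code):

#include <stddef.h>

struct ex_set { const int *k, *end; };

/* Emit the next smallest element across a handful of sorted arrays and
 * advance that array; returns 0 once every array is exhausted. */
static int ex_iter_next(struct ex_set *sets, size_t used, int *out)
{
	size_t i, best = used;

	for (i = 0; i < used; i++)
		if (sets[i].k != sets[i].end &&
		    (best == used || *sets[i].k < *sets[best].k))
			best = i;

	if (best == used)
		return 0;

	*out = *sets[best].k++;
	return 1;
}

The real iterator keeps the set heads pre-sorted instead of rescanning them, so advancing is usually just one compare and swap of adjacent slots.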
- */ -void bch_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k = bch_btree_node_iter_peek_all(iter, b); - - iter->data->k += __bch_btree_node_iter_peek_all(iter, b)->u64s; - - BUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) { - BUG_ON(iter->used == 0); - iter->data[0] = iter->data[--iter->used]; - } - - btree_node_iter_sift(iter, b, 0); - - bch_btree_node_iter_next_check(iter, b, k); -} - -/* - * Expensive: - */ -struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k, *prev = NULL; - struct btree_node_iter_set *set; - struct bset_tree *t; - struct bset_tree *prev_t; - unsigned end; - - bch_btree_node_iter_verify(iter, b); - - for_each_bset(b, t) { - k = bkey_prev_all(b, t, - bch_btree_node_iter_bset_pos(iter, b, t)); - if (k && - (!prev || __btree_node_iter_cmp(iter->is_extents, b, - k, prev) > 0)) { - prev = k; - prev_t = t; - } - } - - if (!prev) - return NULL; - - /* - * We're manually memmoving instead of just calling sort() to ensure the - * prev we picked ends up in slot 0 - sort won't necessarily put it - * there because of duplicate deleted keys: - */ - end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t)); - btree_node_iter_for_each(iter, set) - if (set->end == end) { - memmove(&iter->data[1], - &iter->data[0], - (void *) set - (void *) &iter->data[0]); - goto out; - } - - memmove(&iter->data[1], - &iter->data[0], - (void *) &iter->data[iter->used] - (void *) &iter->data[0]); - iter->used++; -out: - iter->data[0].k = __btree_node_key_to_offset(b, prev); - iter->data[0].end = end; - return prev; -} - -struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k; - - do { - k = bch_btree_node_iter_prev_all(iter, b); - } while (k && bkey_deleted(k)); - - return k; -} - -struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *iter, - struct btree *b, - struct bkey *u) -{ - struct bkey_packed *k = bch_btree_node_iter_peek(iter, b); - - return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; -} -EXPORT_SYMBOL(bch_btree_node_iter_peek_unpack); - -/* Mergesort */ - -void bch_btree_keys_stats(struct btree *b, struct bset_stats *stats) -{ - struct bset_tree *t; - - for_each_bset(b, t) { - enum bset_aux_tree_type type = bset_aux_tree_type(t); - size_t j; - - stats->sets[type].nr++; - stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * - sizeof(u64); - - if (bset_has_ro_aux_tree(t)) { - stats->floats += t->size - 1; - - for (j = 1; j < t->size; j++) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - stats->failed_unpacked++; - break; - case BFLOAT_FAILED_PREV: - stats->failed_prev++; - break; - case BFLOAT_FAILED_OVERFLOW: - stats->failed_overflow++; - break; - } - } - } -} - -int bch_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, - char *buf, size_t size) -{ - struct bset_tree *t = bch_bkey_to_bset(b, k); - struct bkey_packed *l, *r, *p; - struct bkey uk, up; - char buf1[200], buf2[200]; - unsigned j; - - if (!size) - return 0; - - if (!bset_has_ro_aux_tree(t)) - goto out; - - j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra); - if (j && - j < t->size && - k == tree_to_bkey(b, t, j)) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed unpacked at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - case BFLOAT_FAILED_PREV: - p = tree_to_prev_bkey(b, t, j); - l = is_power_of_2(j) - ? btree_bkey_first(b, t) - : tree_to_prev_bkey(b, t, j >> ffs(j)); - r = is_power_of_2(j + 1) - ? bkey_prev_all(b, t, btree_bkey_last(b, t)) - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - up = bkey_unpack_key(b, p); - uk = bkey_unpack_key(b, k); - bch_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); - bch_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - - return scnprintf(buf, size, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bkey_greatest_differing_bit(b, l, r), - bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); - case BFLOAT_FAILED_OVERFLOW: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - } -out: - *buf = '\0'; - return 0; -} diff --git a/libbcache/bset.h b/libbcache/bset.h deleted file mode 100644 index 70868c51..00000000 --- a/libbcache/bset.h +++ /dev/null @@ -1,615 +0,0 @@ -#ifndef _BCACHE_BSET_H -#define _BCACHE_BSET_H - -#include <linux/bcache.h> -#include <linux/kernel.h> -#include <linux/types.h> - -#include "bkey.h" -#include "bkey_methods.h" -#include "btree_types.h" -#include "util.h" /* for time_stats */ -#include "vstructs.h" - -/* - * BKEYS: - * - * A bkey contains a key, a size field, a variable number of pointers, and some - * ancillary flag bits. - * - * We use two different functions for validating bkeys, bkey_invalid and - * bkey_deleted(). - * - * The one exception to the rule that ptr_invalid() filters out invalid keys is - * that it also filters out keys of size 0 - these are keys that have been - * completely overwritten. It'd be safe to delete these in memory while leaving - * them on disk, just unnecessary work - so we filter them out when resorting - * instead. 
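In outline, the resort-time filtering of completely overwritten keys that the comment above describes is just a skip in the copy loop. A sketch with a made-up, heavily simplified key struct (nothing here reflects the real bkey layout):

#include <stddef.h>

struct ex_key { unsigned size; /* ...rest of the key... */ };

/* Copy keys, dropping the size-0 ones (completely overwritten) rather
 * than deleting them eagerly at overwrite time. */
static size_t ex_drop_overwritten(struct ex_key *dst, const struct ex_key *src,
				  size_t nr)
{
	size_t i, out = 0;

	for (i = 0; i < nr; i++)
		if (src[i].size)
			dst[out++] = src[i];

	return out;
}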
- * - * We can't filter out stale keys when we're resorting, because garbage - * collection needs to find them to ensure bucket gens don't wrap around - - * unless we're rewriting the btree node those stale keys still exist on disk. - * - * We also implement functions here for removing some number of sectors from the - * front or the back of a bkey - this is mainly used for fixing overlapping - * extents, by removing the overlapping sectors from the older key. - * - * BSETS: - * - * A bset is an array of bkeys laid out contiguously in memory in sorted order, - * along with a header. A btree node is made up of a number of these, written at - * different times. - * - * There could be many of them on disk, but we never allow there to be more than - * 4 in memory - we lazily resort as needed. - * - * We implement code here for creating and maintaining auxiliary search trees - * (described below) for searching an individial bset, and on top of that we - * implement a btree iterator. - * - * BTREE ITERATOR: - * - * Most of the code in bcache doesn't care about an individual bset - it needs - * to search entire btree nodes and iterate over them in sorted order. - * - * The btree iterator code serves both functions; it iterates through the keys - * in a btree node in sorted order, starting from either keys after a specific - * point (if you pass it a search key) or the start of the btree node. - * - * AUXILIARY SEARCH TREES: - * - * Since keys are variable length, we can't use a binary search on a bset - we - * wouldn't be able to find the start of the next key. But binary searches are - * slow anyways, due to terrible cache behaviour; bcache originally used binary - * searches and that code topped out at under 50k lookups/second. - * - * So we need to construct some sort of lookup table. Since we only insert keys - * into the last (unwritten) set, most of the keys within a given btree node are - * usually in sets that are mostly constant. We use two different types of - * lookup tables to take advantage of this. - * - * Both lookup tables share in common that they don't index every key in the - * set; they index one key every BSET_CACHELINE bytes, and then a linear search - * is used for the rest. - * - * For sets that have been written to disk and are no longer being inserted - * into, we construct a binary search tree in an array - traversing a binary - * search tree in an array gives excellent locality of reference and is very - * fast, since both children of any node are adjacent to each other in memory - * (and their grandchildren, and great grandchildren...) - this means - * prefetching can be used to great effect. - * - * It's quite useful performance wise to keep these nodes small - not just - * because they're more likely to be in L2, but also because we can prefetch - * more nodes on a single cacheline and thus prefetch more iterations in advance - * when traversing this tree. - * - * Nodes in the auxiliary search tree must contain both a key to compare against - * (we don't want to fetch the key from the set, that would defeat the purpose), - * and a pointer to the key. We use a few tricks to compress both of these. - * - * To compress the pointer, we take advantage of the fact that one node in the - * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have - * a function (to_inorder()) that takes the index of a node in a binary tree and - * returns what its index would be in an inorder traversal, so we only have to - * store the low bits of the offset. 
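The "binary search tree in an array" referred to above is laid out in BFS order, so node j's children are 2j and 2j+1 and the top of any subtree is contiguous in memory, which is what makes prefetching so effective. A minimal descent over such a layout (illustrative only; the real tree nodes hold packed bfloats rather than plain ints, and indexing is 1-based with slot 0 unused):

#include <stddef.h>

/* Descend a tree stored in BFS array order: go right when tree[j] < search,
 * left otherwise. Returns the index one step past the last node visited;
 * the real code converts this back to an in-order (cacheline) position. */
static size_t ex_bfs_search(const int *tree, size_t size, int search)
{
	size_t j = 1;

	while (j < size)
		j = j * 2 + (tree[j] < search);

	return j;
}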
- * - * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To - * compress that, we take advantage of the fact that when we're traversing the - * search tree at every iteration we know that both our search key and the key - * we're looking for lie within some range - bounded by our previous - * comparisons. (We special case the start of a search so that this is true even - * at the root of the tree). - * - * So we know the key we're looking for is between a and b, and a and b don't - * differ higher than bit 50, we don't need to check anything higher than bit - * 50. - * - * We don't usually need the rest of the bits, either; we only need enough bits - * to partition the key range we're currently checking. Consider key n - the - * key our auxiliary search tree node corresponds to, and key p, the key - * immediately preceding n. The lowest bit we need to store in the auxiliary - * search tree is the highest bit that differs between n and p. - * - * Note that this could be bit 0 - we might sometimes need all 80 bits to do the - * comparison. But we'd really like our nodes in the auxiliary search tree to be - * of fixed size. - * - * The solution is to make them fixed size, and when we're constructing a node - * check if p and n differed in the bits we needed them to. If they don't we - * flag that node, and when doing lookups we fallback to comparing against the - * real key. As long as this doesn't happen to often (and it seems to reliably - * happen a bit less than 1% of the time), we win - even on failures, that key - * is then more likely to be in cache than if we were doing binary searches all - * the way, since we're touching so much less memory. - * - * The keys in the auxiliary search tree are stored in (software) floating - * point, with an exponent and a mantissa. The exponent needs to be big enough - * to address all the bits in the original key, but the number of bits in the - * mantissa is somewhat arbitrary; more bits just gets us fewer failures. - * - * We need 7 bits for the exponent and 3 bits for the key's offset (since keys - * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. - * We need one node per 128 bytes in the btree node, which means the auxiliary - * search trees take up 3% as much memory as the btree itself. - * - * Constructing these auxiliary search trees is moderately expensive, and we - * don't want to be constantly rebuilding the search tree for the last set - * whenever we insert another key into it. For the unwritten set, we use a much - * simpler lookup table - it's just a flat array, so index i in the lookup table - * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing - * within each byte range works the same as with the auxiliary search trees. - * - * These are much easier to keep up to date when we insert a key - we do it - * somewhat lazily; when we shift a key up we usually just increment the pointer - * to it, only when it would overflow do we go to the trouble of finding the - * first key in that range of bytes again. 
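The mantissa extraction described above amounts to shifting the packed key right by the exponent and keeping a fixed number of low bits; ordering two keys by these slices is valid whenever they first differ inside the slice, and the node is flagged as failed otherwise. A sketch using a 16-bit slice of a single 64-bit word (the ex_* names and widths are assumptions chosen for illustration):

#include <stdint.h>

#define EX_MANTISSA_BITS 16

/* The bits of key from position 'exponent' upward, truncated to the
 * mantissa width; on a failed node the lookup falls back to full keys. */
static inline unsigned ex_mantissa(uint64_t key, unsigned exponent)
{
	return (unsigned) ((key >> exponent) & ((1U << EX_MANTISSA_BITS) - 1));
}

For example, two keys that first differ at bit 40 can be ordered by comparing ex_mantissa(key, 25) of each, since bit 40 is the top bit of the 16-bit window starting at bit 25.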
- */ - -struct btree_node_iter; -struct btree_node_iter_set; - -enum bset_aux_tree_type { - BSET_NO_AUX_TREE, - BSET_RO_AUX_TREE, - BSET_RW_AUX_TREE, -}; - -#define BSET_TREE_NR_TYPES 3 - -#define BSET_NO_AUX_TREE_VAL (U16_MAX) -#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) - -static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) -{ - switch (t->extra) { - case BSET_NO_AUX_TREE_VAL: - EBUG_ON(t->size); - return BSET_NO_AUX_TREE; - case BSET_RW_AUX_TREE_VAL: - EBUG_ON(!t->size); - return BSET_RW_AUX_TREE; - default: - EBUG_ON(!t->size); - return BSET_RO_AUX_TREE; - } -} - -typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); - -static inline struct bkey -bkey_unpack_key_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ - struct bkey dst; - -#ifdef HAVE_BCACHE_COMPILED_UNPACK - { - compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(&dst, src); - - if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { - struct bkey dst2 = __bkey_unpack_key(&b->format, src); - - BUG_ON(memcmp(&dst, &dst2, sizeof(dst))); - } - } -#else - dst = __bkey_unpack_key(&b->format, src); -#endif - return dst; -} - -/** - * bkey_unpack_key -- unpack just the key, not the value - */ -static inline struct bkey bkey_unpack_key(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? bkey_unpack_key_format_checked(b, src) - : *packed_to_bkey_c(src); -} - -static inline struct bpos -bkey_unpack_pos_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ -#ifdef HAVE_BCACHE_COMPILED_UNPACK - return bkey_unpack_key_format_checked(b, src).p; -#else - return __bkey_unpack_pos(&b->format, src); -#endif -} - -static inline struct bpos bkey_unpack_pos(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? 
bkey_unpack_pos_format_checked(b, src) - : packed_to_bkey_c(src)->p; -} - -/* Disassembled bkeys */ - -static inline struct bkey_s_c bkey_disassemble(struct btree *b, - const struct bkey_packed *k, - struct bkey *u) -{ - *u = bkey_unpack_key(b, k); - - return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -} - -/* non const version: */ -static inline struct bkey_s __bkey_disassemble(struct btree *b, - struct bkey_packed *k, - struct bkey *u) -{ - *u = bkey_unpack_key(b, k); - - return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -} - -#define for_each_bset(_b, _t) \ - for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) - -extern bool bch_expensive_debug_checks; - -static inline bool btree_keys_expensive_checks(struct btree *b) -{ -#ifdef CONFIG_BCACHE_DEBUG - return bch_expensive_debug_checks || *b->expensive_debug_checks; -#else - return false; -#endif -} - -static inline bool bset_has_ro_aux_tree(struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -} - -static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -} - -static inline void bch_bset_set_no_aux_tree(struct btree *b, - struct bset_tree *t) -{ - BUG_ON(t < b->set); - - for (; t < b->set + ARRAY_SIZE(b->set); t++) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - t->aux_data_offset = U16_MAX; - } -} - -static inline void btree_node_set_format(struct btree *b, - struct bkey_format f) -{ - int len; - - b->format = f; - b->nr_key_bits = bkey_format_key_bits(&f); - - len = bch_compile_bkey_format(&b->format, b->aux_data); - BUG_ON(len < 0 || len > U8_MAX); - - b->unpack_fn_len = len; - - bch_bset_set_no_aux_tree(b, b->set); -} - -static inline struct bset *bset_next_set(struct btree *b, - unsigned block_bytes) -{ - struct bset *i = btree_bset_last(b); - - EBUG_ON(!is_power_of_2(block_bytes)); - - return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -} - -void bch_btree_keys_free(struct btree *); -int bch_btree_keys_alloc(struct btree *, unsigned, gfp_t); -void bch_btree_keys_init(struct btree *, bool *); - -void bch_bset_init_first(struct btree *, struct bset *); -void bch_bset_init_next(struct btree *, struct bset *); -void bch_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch_bset_fix_invalidated_key(struct btree *, struct bset_tree *, - struct bkey_packed *); - -void bch_bset_insert(struct btree *, struct btree_node_iter *, - struct bkey_packed *, struct bkey_i *, unsigned); -void bch_bset_delete(struct btree *, struct bkey_packed *, unsigned); - -/* Bkey utility code */ - -/* packed or unpacked */ -static inline int bkey_cmp_p_or_unp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r_packed, - struct bpos *r) -{ - EBUG_ON(r_packed && !bkey_packed(r_packed)); - - if (unlikely(!bkey_packed(l))) - return bkey_cmp(packed_to_bkey_c(l)->p, *r); - - if (likely(r_packed)) - return __bkey_cmp_packed_format_checked(l, r_packed, b); - - return __bkey_cmp_left_packed_format_checked(b, l, r); -} - -/* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp(struct bpos pos, const struct bkey *k, - bool strictly_greater) -{ - int cmp = bkey_cmp(k->p, pos); - - return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); -} - -static inline bool btree_iter_pos_cmp_packed(const struct btree *b, - struct bpos *pos, - const struct bkey_packed *k, - bool strictly_greater) -{ - int cmp = bkey_cmp_left_packed(b, k, pos); - - 
return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); -} - -static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, - struct bpos pos, - const struct bkey_packed *pos_packed, - const struct bkey_packed *k, - bool strictly_greater) -{ - int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos); - - return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); -} - -struct bset_tree *bch_bkey_to_bset(struct btree *, struct bkey_packed *); -struct bkey_packed *bkey_prev_all(struct btree *, struct bset_tree *, - struct bkey_packed *); -struct bkey_packed *bkey_prev(struct btree *, struct bset_tree *, - struct bkey_packed *); - -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch_extent_overlap(const struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_cmp(k->p, m->p) < 0; - int cmp2 = bkey_cmp(bkey_start_pos(k), - bkey_start_pos(m)) > 0; - - return (cmp1 << 1) + cmp2; -} - -/* Btree key iteration */ - -struct btree_node_iter { - u8 is_extents; - u16 used; - - struct btree_node_iter_set { - u16 k, end; - } data[MAX_BSETS]; -}; - -static inline void __bch_btree_node_iter_init(struct btree_node_iter *iter, - bool is_extents) -{ - iter->used = 0; - iter->is_extents = is_extents; -} - -void bch_btree_node_iter_push(struct btree_node_iter *, struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -void bch_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos, bool, bool); -void bch_btree_node_iter_init_from_start(struct btree_node_iter *, - struct btree *, bool); -struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *, - struct btree *, - struct bset_tree *); - -void bch_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -void bch_btree_node_iter_advance(struct btree_node_iter *, struct btree *); - -#define btree_node_iter_for_each(_iter, _set) \ - for (_set = (_iter)->data; \ - _set < (_iter)->data + (_iter)->used; \ - _set++) - -static inline bool bch_btree_node_iter_end(struct btree_node_iter *iter) -{ - return !iter->used; -} - -static inline int __btree_node_iter_cmp(bool is_extents, - struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - /* - * For non extents, when keys compare equal the deleted keys have to - * come first - so that bch_btree_node_iter_next_check() can detect - * duplicate nondeleted keys (and possibly other reasons?) - * - * For extents, bkey_deleted() is used as a proxy for k->size == 0, so - * deleted keys have to sort last. - */ - return bkey_cmp_packed(b, l, r) ?: is_extents - ? 
(int) bkey_deleted(l) - (int) bkey_deleted(r) - : (int) bkey_deleted(r) - (int) bkey_deleted(l); -} - -static inline int btree_node_iter_cmp(struct btree_node_iter *iter, - struct btree *b, - struct btree_node_iter_set l, - struct btree_node_iter_set r) -{ - return __btree_node_iter_cmp(iter->is_extents, b, - __btree_node_offset_to_key(b, l.k), - __btree_node_offset_to_key(b, r.k)); -} - -static inline void __bch_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) - iter->data[iter->used++] = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }; -} - -static inline struct bkey_packed * -__bch_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) -{ - return __btree_node_offset_to_key(b, iter->data->k); -} - -static inline struct bkey_packed * -bch_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) -{ - return bch_btree_node_iter_end(iter) - ? NULL - : __bch_btree_node_iter_peek_all(iter, b); -} - -static inline struct bkey_packed * -bch_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *ret; - - while ((ret = bch_btree_node_iter_peek_all(iter, b)) && - bkey_deleted(ret)) - bch_btree_node_iter_advance(iter, b); - - return ret; -} - -static inline struct bkey_packed * -bch_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *ret = bch_btree_node_iter_peek_all(iter, b); - - if (ret) - bch_btree_node_iter_advance(iter, b); - - return ret; -} - -struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *, - struct btree *); - -/* - * Iterates over all _live_ keys - skipping deleted (and potentially - * overlapping) keys - */ -#define for_each_btree_node_key(b, k, iter, _is_extents) \ - for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ - ((k) = bch_btree_node_iter_peek(iter, b)); \ - bch_btree_node_iter_advance(iter, b)) - -struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *, - struct btree *, - struct bkey *); - -#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\ - for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ - (k = bch_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ - bch_btree_node_iter_advance(iter, b)) - -/* Accounting: */ - -static inline void btree_keys_account_key(struct btree_nr_keys *n, - unsigned bset, - struct bkey_packed *k, - int sign) -{ - n->live_u64s += k->u64s * sign; - n->bset_u64s[bset] += k->u64s * sign; - - if (bkey_packed(k)) - n->packed_keys += sign; - else - n->unpacked_keys += sign; -} - -#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, 1) -#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, -1) - -struct bset_stats { - struct { - size_t nr, bytes; - } sets[BSET_TREE_NR_TYPES]; - - size_t floats; - size_t failed_unpacked; - size_t failed_prev; - size_t failed_overflow; -}; - -void bch_btree_keys_stats(struct btree *, struct bset_stats *); -int bch_bkey_print_bfloat(struct btree *, struct bkey_packed *, - char *, size_t); - -/* Debug stuff */ - -void bch_dump_bset(struct btree *, struct bset *, unsigned); -void bch_dump_btree_node(struct btree *); -void bch_dump_btree_node_iter(struct 
btree *, struct btree_node_iter *); - -#ifdef CONFIG_BCACHE_DEBUG - -void __bch_verify_btree_nr_keys(struct btree *); -void bch_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -void bch_verify_key_order(struct btree *, struct btree_node_iter *, - struct bkey_packed *); - -#else - -static inline void __bch_verify_btree_nr_keys(struct btree *b) {} -static inline void bch_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) {} -static inline void bch_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) {} -#endif - -static inline void bch_verify_btree_nr_keys(struct btree *b) -{ - if (btree_keys_expensive_checks(b)) - __bch_verify_btree_nr_keys(b); -} - -#endif diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c deleted file mode 100644 index a43e12da..00000000 --- a/libbcache/btree_cache.c +++ /dev/null @@ -1,756 +0,0 @@ - -#include "bcache.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "debug.h" -#include "extents.h" - -#include <trace/events/bcache.h> - -#define DEF_BTREE_ID(kwd, val, name) name, - -const char * const bch_btree_ids[] = { - DEFINE_BCH_BTREE_IDS() - NULL -}; - -#undef DEF_BTREE_ID - -void bch_recalc_btree_reserve(struct bch_fs *c) -{ - unsigned i, reserve = 16; - - if (!c->btree_roots[0].b) - reserve += 8; - - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - reserve += min_t(unsigned, 1, - c->btree_roots[i].b->level) * 8; - - c->btree_cache_reserve = reserve; -} - -#define mca_can_free(c) \ - max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve) - -static void __mca_data_free(struct bch_fs *c, struct btree *b) -{ - EBUG_ON(btree_node_write_in_flight(b)); - - free_pages((unsigned long) b->data, btree_page_order(c)); - b->data = NULL; - bch_btree_keys_free(b); -} - -static void mca_data_free(struct bch_fs *c, struct btree *b) -{ - __mca_data_free(c, b); - c->btree_cache_used--; - list_move(&b->list, &c->btree_cache_freed); -} - -#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) - -static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, key.v), - .key_len = sizeof(struct bch_extent_ptr), -}; - -static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -{ - unsigned order = ilog2(btree_pages(c)); - - b->data = (void *) __get_free_pages(gfp, order); - if (!b->data) - goto err; - - if (bch_btree_keys_alloc(b, order, gfp)) - goto err; - - c->btree_cache_used++; - list_move(&b->list, &c->btree_cache_freeable); - return; -err: - free_pages((unsigned long) b->data, order); - b->data = NULL; - list_move(&b->list, &c->btree_cache_freed); -} - -static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp) -{ - struct btree *b = kzalloc(sizeof(struct btree), gfp); - if (!b) - return NULL; - - six_lock_init(&b->lock); - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->write_blocked); - - mca_data_alloc(c, b, gfp); - return b->data ? 
b : NULL; -} - -/* Btree in memory cache - hash table */ - -void mca_hash_remove(struct bch_fs *c, struct btree *b) -{ - BUG_ON(btree_node_dirty(b)); - - b->nsets = 0; - - rhashtable_remove_fast(&c->btree_cache_table, &b->hash, - bch_btree_cache_params); - - /* Cause future lookups for this node to fail: */ - bkey_i_to_extent(&b->key)->v._data[0] = 0; -} - -int mca_hash_insert(struct bch_fs *c, struct btree *b, - unsigned level, enum btree_id id) -{ - int ret; - b->level = level; - b->btree_id = id; - - ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash, - bch_btree_cache_params); - if (ret) - return ret; - - mutex_lock(&c->btree_cache_lock); - list_add(&b->list, &c->btree_cache); - mutex_unlock(&c->btree_cache_lock); - - return 0; -} - -__flatten -static inline struct btree *mca_find(struct bch_fs *c, - const struct bkey_i *k) -{ - return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k), - bch_btree_cache_params); -} - -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush) -{ - lockdep_assert_held(&c->btree_cache_lock); - - if (!six_trylock_intent(&b->lock)) - return -ENOMEM; - - if (!six_trylock_write(&b->lock)) - goto out_unlock_intent; - - if (btree_node_write_error(b) || - btree_node_noevict(b)) - goto out_unlock; - - if (!list_empty(&b->write_blocked)) - goto out_unlock; - - if (!flush && - (btree_node_dirty(b) || - btree_node_write_in_flight(b))) - goto out_unlock; - - /* - * Using the underscore version because we don't want to compact bsets - * after the write, since this node is about to be evicted - unless - * btree verify mode is enabled, since it runs out of the post write - * cleanup: - */ - if (btree_node_dirty(b)) { - if (verify_btree_ondisk(c)) - bch_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1); - else - __bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1); - } - - /* wait for any in flight btree write */ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); - - return 0; -out_unlock: - six_unlock_write(&b->lock); -out_unlock_intent: - six_unlock_intent(&b->lock); - return -ENOMEM; -} - -static int mca_reap(struct bch_fs *c, struct btree *b, bool flush) -{ - int ret = mca_reap_notrace(c, b, flush); - - trace_bcache_mca_reap(c, b, ret); - return ret; -} - -static unsigned long bch_mca_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = container_of(shrink, struct bch_fs, - btree_cache_shrink); - struct btree *b, *t; - unsigned long nr = sc->nr_to_scan; - unsigned long can_free; - unsigned long touched = 0; - unsigned long freed = 0; - unsigned i; - - u64 start_time = local_clock(); - - if (btree_shrinker_disabled(c)) - return SHRINK_STOP; - - if (c->btree_cache_alloc_lock) - return SHRINK_STOP; - - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_IO) - mutex_lock(&c->btree_cache_lock); - else if (!mutex_trylock(&c->btree_cache_lock)) - return -1; - - /* - * It's _really_ critical that we don't free too many btree nodes - we - * have to always leave ourselves a reserve. 
The reserve is how we - * guarantee that allocating memory for a new btree node can always - * succeed, so that inserting keys into the btree can always succeed and - * IO can always make forward progress: - */ - nr /= btree_pages(c); - can_free = mca_can_free(c); - nr = min_t(unsigned long, nr, can_free); - - i = 0; - list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { - touched++; - - if (freed >= nr) - break; - - if (++i > 3 && - !mca_reap_notrace(c, b, false)) { - mca_data_free(c, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); - freed++; - } - } -restart: - list_for_each_entry_safe(b, t, &c->btree_cache, list) { - touched++; - - if (freed >= nr) { - /* Save position */ - if (&t->list != &c->btree_cache) - list_move_tail(&c->btree_cache, &t->list); - break; - } - - if (!btree_node_accessed(b) && - !mca_reap(c, b, false)) { - /* can't call mca_hash_remove under btree_cache_lock */ - freed++; - if (&t->list != &c->btree_cache) - list_move_tail(&c->btree_cache, &t->list); - - mca_data_free(c, b); - mutex_unlock(&c->btree_cache_lock); - - mca_hash_remove(c, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); - - if (freed >= nr) - goto out; - - if (sc->gfp_mask & __GFP_IO) - mutex_lock(&c->btree_cache_lock); - else if (!mutex_trylock(&c->btree_cache_lock)) - goto out; - goto restart; - } else - clear_btree_node_accessed(b); - } - - mutex_unlock(&c->btree_cache_lock); -out: - bch_time_stats_update(&c->mca_scan_time, start_time); - - trace_bcache_mca_scan(c, - touched * btree_pages(c), - freed * btree_pages(c), - can_free * btree_pages(c), - sc->nr_to_scan); - - return (unsigned long) freed * btree_pages(c); -} - -static unsigned long bch_mca_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = container_of(shrink, struct bch_fs, - btree_cache_shrink); - - if (btree_shrinker_disabled(c)) - return 0; - - if (c->btree_cache_alloc_lock) - return 0; - - return mca_can_free(c) * btree_pages(c); -} - -void bch_fs_btree_exit(struct bch_fs *c) -{ - struct btree *b; - unsigned i; - - if (c->btree_cache_shrink.list.next) - unregister_shrinker(&c->btree_cache_shrink); - - mutex_lock(&c->btree_cache_lock); - -#ifdef CONFIG_BCACHE_DEBUG - if (c->verify_data) - list_move(&c->verify_data->list, &c->btree_cache); - - free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c))); -#endif - - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - list_add(&c->btree_roots[i].b->list, &c->btree_cache); - - list_splice(&c->btree_cache_freeable, - &c->btree_cache); - - while (!list_empty(&c->btree_cache)) { - b = list_first_entry(&c->btree_cache, struct btree, list); - - if (btree_node_dirty(b)) - bch_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(b); - - mca_data_free(c, b); - } - - while (!list_empty(&c->btree_cache_freed)) { - b = list_first_entry(&c->btree_cache_freed, - struct btree, list); - list_del(&b->list); - kfree(b); - } - - mutex_unlock(&c->btree_cache_lock); - - if (c->btree_cache_table_init_done) - rhashtable_destroy(&c->btree_cache_table); -} - -int bch_fs_btree_init(struct bch_fs *c) -{ - unsigned i; - int ret; - - ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params); - if (ret) - return ret; - - c->btree_cache_table_init_done = true; - - bch_recalc_btree_reserve(c); - - for (i = 0; i < c->btree_cache_reserve; i++) - if (!mca_bucket_alloc(c, GFP_KERNEL)) - return -ENOMEM; - - list_splice_init(&c->btree_cache, - &c->btree_cache_freeable); - -#ifdef 
CONFIG_BCACHE_DEBUG - mutex_init(&c->verify_lock); - - c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c))); - if (!c->verify_ondisk) - return -ENOMEM; - - c->verify_data = mca_bucket_alloc(c, GFP_KERNEL); - if (!c->verify_data) - return -ENOMEM; - - list_del_init(&c->verify_data->list); -#endif - - c->btree_cache_shrink.count_objects = bch_mca_count; - c->btree_cache_shrink.scan_objects = bch_mca_scan; - c->btree_cache_shrink.seeks = 4; - c->btree_cache_shrink.batch = btree_pages(c) * 2; - register_shrinker(&c->btree_cache_shrink); - - return 0; -} - -/* - * We can only have one thread cannibalizing other cached btree nodes at a time, - * or we'll deadlock. We use an open coded mutex to ensure that, which a - * cannibalize_bucket() will take. This means every time we unlock the root of - * the btree, we need to release this lock if we have it held. - */ -void mca_cannibalize_unlock(struct bch_fs *c) -{ - if (c->btree_cache_alloc_lock == current) { - trace_bcache_mca_cannibalize_unlock(c); - c->btree_cache_alloc_lock = NULL; - closure_wake_up(&c->mca_wait); - } -} - -int mca_cannibalize_lock(struct bch_fs *c, struct closure *cl) -{ - struct task_struct *old; - - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); - if (old == NULL || old == current) - goto success; - - if (!cl) { - trace_bcache_mca_cannibalize_lock_fail(c); - return -ENOMEM; - } - - closure_wait(&c->mca_wait, cl); - - /* Try again, after adding ourselves to waitlist */ - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); - if (old == NULL || old == current) { - /* We raced */ - closure_wake_up(&c->mca_wait); - goto success; - } - - trace_bcache_mca_cannibalize_lock_fail(c); - return -EAGAIN; - -success: - trace_bcache_mca_cannibalize_lock(c); - return 0; -} - -static struct btree *mca_cannibalize(struct bch_fs *c) -{ - struct btree *b; - - list_for_each_entry_reverse(b, &c->btree_cache, list) - if (!mca_reap(c, b, false)) - return b; - - while (1) { - list_for_each_entry_reverse(b, &c->btree_cache, list) - if (!mca_reap(c, b, true)) - return b; - - /* - * Rare case: all nodes were intent-locked. - * Just busy-wait. - */ - WARN_ONCE(1, "btree cache cannibalize failed\n"); - cond_resched(); - } -} - -struct btree *mca_alloc(struct bch_fs *c) -{ - struct btree *b; - u64 start_time = local_clock(); - - mutex_lock(&c->btree_cache_lock); - - /* - * btree_free() doesn't free memory; it sticks the node on the end of - * the list. Check if there's any freed nodes there: - */ - list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap_notrace(c, b, false)) - goto out_unlock; - - /* - * We never free struct btree itself, just the memory that holds the on - * disk node. 
Check the freed list before allocating a new one: - */ - list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap_notrace(c, b, false)) { - mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); - if (b->data) - goto out_unlock; - - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); - goto err; - } - - b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO); - if (!b) - goto err; - - BUG_ON(!six_trylock_intent(&b->lock)); - BUG_ON(!six_trylock_write(&b->lock)); -out_unlock: - BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key)); - BUG_ON(btree_node_write_in_flight(b)); - - list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); -out: - b->flags = 0; - b->written = 0; - b->nsets = 0; - b->sib_u64s[0] = 0; - b->sib_u64s[1] = 0; - b->whiteout_u64s = 0; - b->uncompacted_whiteout_u64s = 0; - bch_btree_keys_init(b, &c->expensive_debug_checks); - - bch_time_stats_update(&c->mca_alloc_time, start_time); - - return b; -err: - /* Try to cannibalize another cached btree node: */ - if (c->btree_cache_alloc_lock == current) { - b = mca_cannibalize(c); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); - - mca_hash_remove(c, b); - - trace_bcache_mca_cannibalize(c); - goto out; - } - - mutex_unlock(&c->btree_cache_lock); - return ERR_PTR(-ENOMEM); -} - -/* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter, - const struct bkey_i *k, - unsigned level, - enum six_lock_type lock_type) -{ - struct bch_fs *c = iter->c; - struct btree *b; - - b = mca_alloc(c); - if (IS_ERR(b)) - return b; - - bkey_copy(&b->key, k); - if (mca_hash_insert(c, b, level, iter->btree_id)) { - /* raced with another fill: */ - - /* mark as unhashed... */ - bkey_i_to_extent(&b->key)->v._data[0] = 0; - - mutex_lock(&c->btree_cache_lock); - list_add(&b->list, &c->btree_cache_freeable); - mutex_unlock(&c->btree_cache_lock); - - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); - return NULL; - } - - /* - * If the btree node wasn't cached, we can't drop our lock on - * the parent until after it's added to the cache - because - * otherwise we could race with a btree_split() freeing the node - * we're trying to lock. - * - * But the deadlock described below doesn't exist in this case, - * so it's safe to not drop the parent lock until here: - */ - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); - - bch_btree_node_read(c, b); - six_unlock_write(&b->lock); - - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->lock); - - return b; -} - -/** - * bch_btree_node_get - find a btree node in the cache and lock it, reading it - * in from disk if necessary. - * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. - * - * The btree node will have either a read or a write lock held, depending on - * the @write parameter. 
- */ -struct btree *bch_btree_node_get(struct btree_iter *iter, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type) -{ - struct btree *b; - struct bset_tree *t; - - BUG_ON(level >= BTREE_MAX_DEPTH); -retry: - rcu_read_lock(); - b = mca_find(iter->c, k); - rcu_read_unlock(); - - if (unlikely(!b)) { - /* - * We must have the parent locked to call bch_btree_node_fill(), - * else we could read in a btree node from disk that's been - * freed: - */ - b = bch_btree_node_fill(iter, k, level, lock_type); - - /* We raced and found the btree node in the cache */ - if (!b) - goto retry; - - if (IS_ERR(b)) - return b; - } else { - /* - * There's a potential deadlock with splits and insertions into - * interior nodes we have to avoid: - * - * The other thread might be holding an intent lock on the node - * we want, and they want to update its parent node so they're - * going to upgrade their intent lock on the parent node to a - * write lock. - * - * But if we're holding a read lock on the parent, and we're - * trying to get the intent lock they're holding, we deadlock. - * - * So to avoid this we drop the read locks on parent nodes when - * we're starting to take intent locks - and handle the race. - * - * The race is that they might be about to free the node we - * want, and dropping our read lock on the parent node lets them - * update the parent marking the node we want as freed, and then - * free it: - * - * To guard against this, btree nodes are evicted from the cache - * when they're freed - and PTR_HASH() is zeroed out, which we - * check for after we lock the node. - * - * Then, btree_node_relock() on the parent will fail - because - * the parent was modified, when the pointer to the node we want - * was removed - and we'll bail out: - */ - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); - - if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) - return ERR_PTR(-EINTR); - - if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || - b->level != level || - race_fault())) { - six_unlock_type(&b->lock, lock_type); - if (btree_node_relock(iter, level + 1)) - goto retry; - - return ERR_PTR(-EINTR); - } - } - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - /* avoid atomic set bit if it's not needed: */ - if (btree_node_accessed(b)) - set_btree_node_accessed(b); - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->lock, lock_type); - return ERR_PTR(-EIO); - } - - EBUG_ON(!b->written); - EBUG_ON(b->btree_id != iter->btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); - - return b; -} - -int bch_print_btree_node(struct bch_fs *c, struct btree *b, - char *buf, size_t len) -{ - const struct bkey_format *f = &b->format; - struct bset_stats stats; - char ptrs[100]; - - memset(&stats, 0, sizeof(stats)); - - bch_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs), - bkey_i_to_s_c(&b->key)); - bch_btree_keys_stats(b, &stats); - - return scnprintf(buf, len, - "l %u %llu:%llu - %llu:%llu:\n" - " ptrs: %s\n" - " format: u64s %u fields %u %u %u %u %u\n" - " unpack fn len: %u\n" - " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" - " nr packed keys %u\n" - " nr unpacked keys %u\n" - " floats %zu\n" - " failed unpacked %zu\n" - " failed prev %zu\n" - " failed overflow %zu\n", - b->level, - 
b->data->min_key.inode, - b->data->min_key.offset, - b->data->max_key.inode, - b->data->max_key.offset, - ptrs, - f->key_u64s, - f->bits_per_field[0], - f->bits_per_field[1], - f->bits_per_field[2], - f->bits_per_field[3], - f->bits_per_field[4], - b->unpack_fn_len, - b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), - b->nr.live_u64s * 100 / btree_max_u64s(c), - b->sib_u64s[0], - b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), - b->nr.packed_keys, - b->nr.unpacked_keys, - stats.floats, - stats.failed_unpacked, - stats.failed_prev, - stats.failed_overflow); -} diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h deleted file mode 100644 index 0d1c00c4..00000000 --- a/libbcache/btree_cache.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef _BCACHE_BTREE_CACHE_H -#define _BCACHE_BTREE_CACHE_H - -#include "bcache.h" -#include "btree_types.h" - -struct btree_iter; - -extern const char * const bch_btree_ids[]; - -void bch_recalc_btree_reserve(struct bch_fs *); - -void mca_hash_remove(struct bch_fs *, struct btree *); -int mca_hash_insert(struct bch_fs *, struct btree *, - unsigned, enum btree_id); - -void mca_cannibalize_unlock(struct bch_fs *); -int mca_cannibalize_lock(struct bch_fs *, struct closure *); - -struct btree *mca_alloc(struct bch_fs *); - -struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *, - unsigned, enum six_lock_type); - -void bch_fs_btree_exit(struct bch_fs *); -int bch_fs_btree_init(struct bch_fs *); - -#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ - for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ - &(_c)->btree_cache_table), \ - _iter = 0; _iter < (_tbl)->size; _iter++) \ - rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) - -static inline size_t btree_bytes(struct bch_fs *c) -{ - return c->sb.btree_node_size << 9; -} - -static inline size_t btree_max_u64s(struct bch_fs *c) -{ - return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); -} - -static inline size_t btree_pages(struct bch_fs *c) -{ - return c->sb.btree_node_size >> (PAGE_SHIFT - 9); -} - -static inline size_t btree_page_order(struct bch_fs *c) -{ - return ilog2(btree_pages(c)); -} - -static inline unsigned btree_blocks(struct bch_fs *c) -{ - return c->sb.btree_node_size >> c->block_bits; -} - -#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) - -#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) - -#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) - -int bch_print_btree_node(struct bch_fs *, struct btree *, - char *, size_t); - -#endif /* _BCACHE_BTREE_CACHE_H */ diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c deleted file mode 100644 index 5270d442..00000000 --- a/libbcache/btree_gc.c +++ /dev/null @@ -1,955 +0,0 @@ -/* - * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright (C) 2014 Datera Inc. 
- */ - -#include "bcache.h" -#include "alloc.h" -#include "bkey_methods.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_io.h" -#include "btree_gc.h" -#include "buckets.h" -#include "clock.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "super-io.h" -#include "writeback.h" - -#include <linux/slab.h> -#include <linux/bitops.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/rcupdate.h> -#include <trace/events/bcache.h> - -struct range_checks { - struct range_level { - struct bpos min; - struct bpos max; - } l[BTREE_MAX_DEPTH]; - unsigned depth; -}; - -static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) -{ - unsigned i; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - r->l[i].min = r->l[i].max = POS_MIN; - r->depth = depth; -} - -static void btree_node_range_checks(struct bch_fs *c, struct btree *b, - struct range_checks *r) -{ - struct range_level *l = &r->l[b->level]; - - struct bpos expected_min = bkey_cmp(l->min, l->max) - ? btree_type_successor(b->btree_id, l->max) - : l->max; - - bch_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, - "btree node has incorrect min key: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - expected_min.inode, - expected_min.offset); - - l->max = b->data->max_key; - - if (b->level > r->depth) { - l = &r->l[b->level - 1]; - - bch_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, - "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - l->min.inode, - l->min.offset); - - bch_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, - "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", - b->data->max_key.inode, - b->data->max_key.offset, - l->max.inode, - l->max.offset); - - if (bkey_cmp(b->data->max_key, POS_MAX)) - l->min = l->max = - btree_type_successor(b->btree_id, - b->data->max_key); - } -} - -u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_ptr *ptr; - u8 max_stale = 0; - - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - size_t b = PTR_BUCKET_NR(ca, ptr); - - if (__gen_after(ca->oldest_gens[b], ptr->gen)) - ca->oldest_gens[b] = ptr->gen; - - max_stale = max(max_stale, ptr_stale(ca, ptr)); - } - } - - return max_stale; -} - -/* - * For runtime mark and sweep: - */ -static u8 bch_btree_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) -{ - switch (type) { - case BKEY_TYPE_BTREE: - bch_gc_mark_key(c, k, c->sb.btree_node_size, true); - return 0; - case BKEY_TYPE_EXTENTS: - bch_gc_mark_key(c, k, k.k->size, false); - return bch_btree_key_recalc_oldest_gen(c, k); - default: - BUG(); - } -} - -u8 bch_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) -{ - atomic64_set(&c->key_version, - max_t(u64, k.k->version.lo, - atomic64_read(&c->key_version))); - - return bch_btree_mark_key(c, type, k); -} - -static bool btree_gc_mark_node(struct bch_fs *c, struct btree *b) -{ - if (btree_node_has_ptrs(b)) { - struct btree_node_iter iter; - struct bkey unpacked; - struct bkey_s_c k; - u8 stale = 0; - - for_each_btree_node_key_unpack(b, k, &iter, - btree_node_is_extents(b), - &unpacked) { - bkey_debugcheck(c, b, k); - stale = max(stale, 
bch_btree_mark_key(c, - btree_node_type(b), k)); - } - - if (btree_gc_rewrite_disabled(c)) - return false; - - if (stale > 10) - return true; - } - - if (btree_gc_always_rewrite(c)) - return true; - - return false; -} - -static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - write_seqcount_begin(&c->gc_pos_lock); - c->gc_pos = new_pos; - write_seqcount_end(&c->gc_pos_lock); -} - -static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); - __gc_pos_set(c, new_pos); -} - -static int bch_gc_btree(struct bch_fs *c, enum btree_id btree_id) -{ - struct btree_iter iter; - struct btree *b; - bool should_rewrite; - struct range_checks r; - unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1; - int ret; - - /* - * if expensive_debug_checks is on, run range_checks on all leaf nodes: - */ - if (expensive_debug_checks(c)) - depth = 0; - - btree_node_range_checks_init(&r, depth); - - for_each_btree_node(&iter, c, btree_id, POS_MIN, depth, b) { - btree_node_range_checks(c, b, &r); - - bch_verify_btree_nr_keys(b); - - should_rewrite = btree_gc_mark_node(c, b); - - gc_pos_set(c, gc_pos_btree_node(b)); - - if (should_rewrite) - bch_btree_node_rewrite(&iter, b, NULL); - - bch_btree_iter_cond_resched(&iter); - } - ret = bch_btree_iter_unlock(&iter); - if (ret) - return ret; - - mutex_lock(&c->btree_root_lock); - - b = c->btree_roots[btree_id].b; - bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); - gc_pos_set(c, gc_pos_btree_root(b->btree_id)); - - mutex_unlock(&c->btree_root_lock); - return 0; -} - -static void bch_mark_allocator_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct open_bucket *ob; - size_t i, j, iter; - unsigned ci; - - for_each_member_device(ca, c, ci) { - spin_lock(&ca->freelist_lock); - - fifo_for_each_entry(i, &ca->free_inc, iter) - bch_mark_alloc_bucket(ca, &ca->buckets[i], true); - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - bch_mark_alloc_bucket(ca, &ca->buckets[i], true); - - spin_unlock(&ca->freelist_lock); - } - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - const struct bch_extent_ptr *ptr; - - mutex_lock(&ob->lock); - open_bucket_for_each_ptr(ob, ptr) { - ca = c->devs[ptr->dev]; - bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true); - } - mutex_unlock(&ob->lock); - } -} - -static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end, - enum bucket_data_type type) -{ - u64 b = start >> ca->bucket_bits; - - do { - bch_mark_metadata_bucket(ca, ca->buckets + b, type, true); - b++; - } while (b < end >> ca->bucket_bits); -} - -static void bch_dev_mark_superblocks(struct bch_dev *ca) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - unsigned i; - - for (i = 0; i < layout->nr_superblocks; i++) { - if (layout->sb_offset[i] == BCH_SB_SECTOR) - mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, - BUCKET_SB); - - mark_metadata_sectors(ca, - layout->sb_offset[i], - layout->sb_offset[i] + - (1 << layout->sb_max_size_bits), - BUCKET_SB); - } -} - -/* - * Mark non btree metadata - prios, journal - */ -void bch_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned i; - u64 b; - - lockdep_assert_held(&c->sb_lock); - - bch_dev_mark_superblocks(ca); - - spin_lock(&c->journal.lock); - - for (i = 0; i < ca->journal.nr; i++) { - b = ca->journal.buckets[i]; - bch_mark_metadata_bucket(ca, ca->buckets + b, - BUCKET_JOURNAL, true); - } - - spin_unlock(&c->journal.lock); - - 
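/*
 * Editorial aside (not part of the deleted file): the metadata marking above
 * turns sector ranges into bucket indices by shifting with ca->bucket_bits,
 * and mark_metadata_sectors() uses a do/while so a range smaller than one
 * bucket (e.g. the superblock at BCH_SB_SECTOR) still marks the bucket that
 * contains it.  A minimal sketch of that arithmetic, with hypothetical
 * values; it is compiled out and meant only as illustration:
 */
#if 0
static u64 sector_to_bucket_example(u64 sector, unsigned bucket_bits)
{
	/* with 128-sector buckets (bucket_bits == 7), sector 300 is in bucket 2 */
	return sector >> bucket_bits;
}
#endif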
spin_lock(&ca->prio_buckets_lock); - - for (i = 0; i < prio_buckets(ca) * 2; i++) { - b = ca->prio_buckets[i]; - if (b) - bch_mark_metadata_bucket(ca, ca->buckets + b, - BUCKET_PRIOS, true); - } - - spin_unlock(&ca->prio_buckets_lock); -} - -static void bch_mark_metadata(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - - mutex_lock(&c->sb_lock); - gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA)); - - for_each_online_member(ca, c, i) - bch_mark_dev_metadata(c, ca); - mutex_unlock(&c->sb_lock); -} - -/* Also see bch_pending_btree_node_free_insert_done() */ -static void bch_mark_pending_btree_node_frees(struct bch_fs *c) -{ - struct bch_fs_usage stats = { 0 }; - struct btree_interior_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&c->btree_interior_update_lock); - gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); - - for_each_pending_btree_node_free(c, as, d) - if (d->index_update_done) - __bch_gc_mark_key(c, bkey_i_to_s_c(&d->key), - c->sb.btree_node_size, true, - &stats); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ - - mutex_unlock(&c->btree_interior_update_lock); -} - -/** - * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes - */ -void bch_gc(struct bch_fs *c) -{ - struct bch_dev *ca; - struct bucket *g; - struct bucket_mark new; - u64 start_time = local_clock(); - unsigned i; - int cpu; - - /* - * Walk _all_ references to buckets, and recompute them: - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. 
- * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return; - - trace_bcache_gc_start(c); - - /* - * Do this before taking gc_lock - bch_disk_reservation_get() blocks on - * gc_lock if sectors_available goes to 0: - */ - bch_recalc_sectors_available(c); - - down_write(&c->gc_lock); - - lg_global_lock(&c->usage_lock); - - /* - * Indicates to buckets code that gc is now in progress - done under - * usage_lock to avoid racing with bch_mark_key(): - */ - __gc_pos_set(c, GC_POS_MIN); - - /* Save a copy of the existing bucket stats while we recompute them: */ - for_each_member_device(ca, c, i) { - ca->usage_cached = __bch_dev_usage_read(ca); - for_each_possible_cpu(cpu) { - struct bch_dev_usage *p = - per_cpu_ptr(ca->usage_percpu, cpu); - memset(p, 0, sizeof(*p)); - } - } - - c->usage_cached = __bch_fs_usage_read(c); - for_each_possible_cpu(cpu) { - struct bch_fs_usage *p = - per_cpu_ptr(c->usage_percpu, cpu); - - memset(p->s, 0, sizeof(p->s)); - p->persistent_reserved = 0; - } - - lg_global_unlock(&c->usage_lock); - - /* Clear bucket marks: */ - for_each_member_device(ca, c, i) - for_each_bucket(g, ca) { - bucket_cmpxchg(g, new, ({ - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - })); - ca->oldest_gens[g - ca->buckets] = new.gen; - } - - /* Walk allocator's references: */ - bch_mark_allocator_buckets(c); - - /* Walk btree: */ - while (c->gc_pos.phase < (int) BTREE_ID_NR) { - int ret = c->btree_roots[c->gc_pos.phase].b - ? bch_gc_btree(c, (int) c->gc_pos.phase) - : 0; - - if (ret) { - bch_err(c, "btree gc failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); - up_write(&c->gc_lock); - return; - } - - gc_pos_set(c, gc_phase(c->gc_pos.phase + 1)); - } - - bch_mark_metadata(c); - bch_mark_pending_btree_node_frees(c); - bch_writeback_recalc_oldest_gens(c); - - for_each_member_device(ca, c, i) - atomic_long_set(&ca->saturated_count, 0); - - /* Indicates that gc is no longer in progress: */ - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); - - up_write(&c->gc_lock); - trace_bcache_gc_end(c); - bch_time_stats_update(&c->btree_gc_time, start_time); - - /* - * Wake up allocator in case it was waiting for buckets - * because of not being able to inc gens - */ - for_each_member_device(ca, c, i) - bch_wake_allocator(ca); -} - -/* Btree coalescing */ - -static void recalc_packed_keys(struct btree *b) -{ - struct bkey_packed *k; - - memset(&b->nr, 0, sizeof(b->nr)); - - BUG_ON(b->nsets != 1); - - for (k = btree_bkey_first(b, b->set); - k != btree_bkey_last(b, b->set); - k = bkey_next(k)) - btree_keys_account_key_add(&b->nr, 0, k); -} - -static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], - struct btree_iter *iter) -{ - struct btree *parent = iter->nodes[old_nodes[0]->level + 1]; - struct bch_fs *c = iter->c; - unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; - unsigned blocks = btree_blocks(c) * 2 / 3; - struct btree *new_nodes[GC_MERGE_NODES]; - struct btree_interior_update *as; - struct btree_reserve *res; - struct keylist keylist; - struct bkey_format_state format_state; - struct bkey_format new_format; - - memset(new_nodes, 0, sizeof(new_nodes)); - bch_keylist_init(&keylist, NULL, 0); - - /* Count keys that are not deleted */ - for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) - u64s += old_nodes[i]->nr.live_u64s; - - nr_old_nodes = nr_new_nodes = i; - - /* 
Check if all keys in @old_nodes could fit in one fewer node */ - if (nr_old_nodes <= 1 || - __vstruct_blocks(struct btree_node, c->block_bits, - DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) - return; - - res = bch_btree_reserve_get(c, parent, nr_old_nodes, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - NULL); - if (IS_ERR(res)) { - trace_bcache_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_RESERVE_GET); - return; - } - - if (bch_keylist_realloc(&keylist, NULL, 0, - (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { - trace_bcache_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); - goto out; - } - - /* Find a format that all keys in @old_nodes can pack into */ - bch_bkey_format_init(&format_state); - - for (i = 0; i < nr_old_nodes; i++) - __bch_btree_calc_format(&format_state, old_nodes[i]); - - new_format = bch_bkey_format_done(&format_state); - - /* Check if repacking would make any nodes too big to fit */ - for (i = 0; i < nr_old_nodes; i++) - if (!bch_btree_node_format_fits(c, old_nodes[i], &new_format)) { - trace_bcache_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_FORMAT_FITS); - goto out; - } - - trace_bcache_btree_gc_coalesce(c, parent, nr_old_nodes); - - as = bch_btree_interior_update_alloc(c); - - for (i = 0; i < nr_old_nodes; i++) - bch_btree_interior_update_will_free_node(c, as, old_nodes[i]); - - /* Repack everything with @new_format and sort down to one bset */ - for (i = 0; i < nr_old_nodes; i++) - new_nodes[i] = __btree_node_alloc_replacement(c, old_nodes[i], - new_format, res); - - /* - * Conceptually we concatenate the nodes together and slice them - * up at different boundaries. - */ - for (i = nr_new_nodes - 1; i > 0; --i) { - struct btree *n1 = new_nodes[i]; - struct btree *n2 = new_nodes[i - 1]; - - struct bset *s1 = btree_bset_first(n1); - struct bset *s2 = btree_bset_first(n2); - struct bkey_packed *k, *last = NULL; - - /* Calculate how many keys from @n2 we could fit inside @n1 */ - u64s = 0; - - for (k = s2->start; - k < vstruct_last(s2) && - vstruct_blocks_plus(n1->data, c->block_bits, - u64s + k->u64s) <= blocks; - k = bkey_next(k)) { - last = k; - u64s += k->u64s; - } - - if (u64s == le16_to_cpu(s2->u64s)) { - /* n2 fits entirely in n1 */ - n1->key.k.p = n1->data->max_key = n2->data->max_key; - - memcpy_u64s(vstruct_last(s1), - s2->start, - le16_to_cpu(s2->u64s)); - le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); - - set_btree_bset_end(n1, n1->set); - - six_unlock_write(&n2->lock); - bch_btree_node_free_never_inserted(c, n2); - six_unlock_intent(&n2->lock); - - memmove(new_nodes + i - 1, - new_nodes + i, - sizeof(new_nodes[0]) * (nr_new_nodes - i)); - new_nodes[--nr_new_nodes] = NULL; - } else if (u64s) { - /* move part of n2 into n1 */ - n1->key.k.p = n1->data->max_key = - bkey_unpack_pos(n1, last); - - n2->data->min_key = - btree_type_successor(iter->btree_id, - n1->data->max_key); - - memcpy_u64s(vstruct_last(s1), - s2->start, u64s); - le16_add_cpu(&s1->u64s, u64s); - - memmove(s2->start, - vstruct_idx(s2, u64s), - (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); - s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); - - set_btree_bset_end(n1, n1->set); - set_btree_bset_end(n2, n2->set); - } - } - - for (i = 0; i < nr_new_nodes; i++) { - struct btree *n = new_nodes[i]; - - recalc_packed_keys(n); - btree_node_reset_sib_u64s(n); - - bch_btree_build_aux_trees(n); - six_unlock_write(&n->lock); - - bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1); - } - - /* - * The keys for the old nodes get deleted. 
We don't want to insert keys - * that compare equal to the keys for the new nodes we'll also be - * inserting - we can't because keys on a keylist must be strictly - * greater than the previous keys, and we also don't need to since the - * key for the new node will serve the same purpose (overwriting the key - * for the old node). - */ - for (i = 0; i < nr_old_nodes; i++) { - struct bkey_i delete; - unsigned j; - - for (j = 0; j < nr_new_nodes; j++) - if (!bkey_cmp(old_nodes[i]->key.k.p, - new_nodes[j]->key.k.p)) - goto next; - - bkey_init(&delete.k); - delete.k.p = old_nodes[i]->key.k.p; - bch_keylist_add_in_order(&keylist, &delete); -next: - i = i; - } - - /* - * Keys for the new nodes get inserted: bch_btree_insert_keys() only - * does the lookup once and thus expects the keys to be in sorted order - * so we have to make sure the new keys are correctly ordered with - * respect to the deleted keys added in the previous loop - */ - for (i = 0; i < nr_new_nodes; i++) - bch_keylist_add_in_order(&keylist, &new_nodes[i]->key); - - /* Insert the newly coalesced nodes */ - bch_btree_insert_node(parent, iter, &keylist, res, as); - - BUG_ON(!bch_keylist_empty(&keylist)); - - BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]); - - BUG_ON(!bch_btree_iter_node_replace(iter, new_nodes[0])); - - for (i = 0; i < nr_new_nodes; i++) - btree_open_bucket_put(c, new_nodes[i]); - - /* Free the old nodes and update our sliding window */ - for (i = 0; i < nr_old_nodes; i++) { - bch_btree_node_free_inmem(iter, old_nodes[i]); - six_unlock_intent(&old_nodes[i]->lock); - - /* - * the index update might have triggered a split, in which case - * the nodes we coalesced - the new nodes we just created - - * might not be sibling nodes anymore - don't add them to the - * sliding window (except the first): - */ - if (!i) { - old_nodes[i] = new_nodes[i]; - } else { - old_nodes[i] = NULL; - if (new_nodes[i]) - six_unlock_intent(&new_nodes[i]->lock); - } - } -out: - bch_keylist_free(&keylist, NULL); - bch_btree_reserve_put(c, res); -} - -static int bch_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) -{ - struct btree_iter iter; - struct btree *b; - unsigned i; - - /* Sliding window of adjacent btree nodes */ - struct btree *merge[GC_MERGE_NODES]; - u32 lock_seq[GC_MERGE_NODES]; - - /* - * XXX: We don't have a good way of positively matching on sibling nodes - * that have the same parent - this code works by handling the cases - * where they might not have the same parent, and is thus fragile. Ugh. - * - * Perhaps redo this to use multiple linked iterators? 
- */ - memset(merge, 0, sizeof(merge)); - - __for_each_btree_node(&iter, c, btree_id, POS_MIN, 0, b, U8_MAX) { - memmove(merge + 1, merge, - sizeof(merge) - sizeof(merge[0])); - memmove(lock_seq + 1, lock_seq, - sizeof(lock_seq) - sizeof(lock_seq[0])); - - merge[0] = b; - - for (i = 1; i < GC_MERGE_NODES; i++) { - if (!merge[i] || - !six_relock_intent(&merge[i]->lock, lock_seq[i])) - break; - - if (merge[i]->level != merge[0]->level) { - six_unlock_intent(&merge[i]->lock); - break; - } - } - memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); - - bch_coalesce_nodes(merge, &iter); - - for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { - lock_seq[i] = merge[i]->lock.state.seq; - six_unlock_intent(&merge[i]->lock); - } - - lock_seq[0] = merge[0]->lock.state.seq; - - if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) { - bch_btree_iter_unlock(&iter); - return -ESHUTDOWN; - } - - bch_btree_iter_cond_resched(&iter); - - /* - * If the parent node wasn't relocked, it might have been split - * and the nodes in our sliding window might not have the same - * parent anymore - blow away the sliding window: - */ - if (iter.nodes[iter.level + 1] && - !btree_node_intent_locked(&iter, iter.level + 1)) - memset(merge + 1, 0, - (GC_MERGE_NODES - 1) * sizeof(merge[0])); - } - return bch_btree_iter_unlock(&iter); -} - -/** - * bch_coalesce - coalesce adjacent nodes with low occupancy - */ -void bch_coalesce(struct bch_fs *c) -{ - u64 start_time; - enum btree_id id; - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return; - - down_read(&c->gc_lock); - trace_bcache_gc_coalesce_start(c); - start_time = local_clock(); - - for (id = 0; id < BTREE_ID_NR; id++) { - int ret = c->btree_roots[id].b - ? bch_coalesce_btree(c, id) - : 0; - - if (ret) { - if (ret != -ESHUTDOWN) - bch_err(c, "btree coalescing failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); - return; - } - } - - bch_time_stats_update(&c->btree_coalesce_time, start_time); - trace_bcache_gc_coalesce_end(c); - up_read(&c->gc_lock); -} - -static int bch_gc_thread(void *arg) -{ - struct bch_fs *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic_long_read(&clock->now); - unsigned last_kick = atomic_read(&c->kick_gc); - - set_freezable(); - - while (1) { - unsigned long next = last + c->capacity / 16; - - while (atomic_long_read(&clock->now) < next) { - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - - if (atomic_read(&c->kick_gc) != last_kick) { - __set_current_state(TASK_RUNNING); - break; - } - - bch_io_clock_schedule_timeout(clock, next); - try_to_freeze(); - } - - last = atomic_long_read(&clock->now); - last_kick = atomic_read(&c->kick_gc); - - bch_gc(c); - if (!btree_gc_coalesce_disabled(c)) - bch_coalesce(c); - - debug_check_no_locks_held(); - } - - return 0; -} - -void bch_gc_thread_stop(struct bch_fs *c) -{ - set_bit(BCH_FS_GC_STOPPING, &c->flags); - - if (c->gc_thread) - kthread_stop(c->gc_thread); - - c->gc_thread = NULL; - clear_bit(BCH_FS_GC_STOPPING, &c->flags); -} - -int bch_gc_thread_start(struct bch_fs *c) -{ - struct task_struct *p; - - BUG_ON(c->gc_thread); - - p = kthread_create(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(p)) - return PTR_ERR(p); - - c->gc_thread = p; - wake_up_process(c->gc_thread); - return 0; -} - -/* Initial GC computes bucket marks during startup */ - -static void bch_initial_gc_btree(struct bch_fs *c, enum btree_id id) -{ - struct btree_iter iter; - struct btree *b; - struct range_checks r; - 
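/*
 * Editorial aside (not part of the deleted file): bch_gc_thread() above paces
 * itself with the write IO clock - it sleeps until roughly 1/16th of the
 * device capacity has been written (in the clock's units) since the last
 * pass, or until it is kicked via c->kick_gc.  A minimal sketch of that
 * scheduling arithmetic, with a hypothetical capacity; compiled out, for
 * illustration only:
 */
#if 0
static unsigned long next_gc_time_example(unsigned long now, u64 capacity)
{
	/* e.g. capacity == 1 << 30 -> next pass after another 1 << 26 clock units */
	return now + capacity / 16;
}
#endif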
- btree_node_range_checks_init(&r, 0); - - if (!c->btree_roots[id].b) - return; - - /* - * We have to hit every btree node before starting journal replay, in - * order for the journal seq blacklist machinery to work: - */ - for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { - btree_node_range_checks(c, b, &r); - - if (btree_node_has_ptrs(b)) { - struct btree_node_iter node_iter; - struct bkey unpacked; - struct bkey_s_c k; - - for_each_btree_node_key_unpack(b, k, &node_iter, - btree_node_is_extents(b), - &unpacked) - bch_btree_mark_key_initial(c, btree_node_type(b), k); - } - - bch_btree_iter_cond_resched(&iter); - } - - bch_btree_iter_unlock(&iter); - - bch_btree_mark_key(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&c->btree_roots[id].b->key)); -} - -int bch_initial_gc(struct bch_fs *c, struct list_head *journal) -{ - enum btree_id id; - - for (id = 0; id < BTREE_ID_NR; id++) - bch_initial_gc_btree(c, id); - - if (journal) - bch_journal_mark(c, journal); - - bch_mark_metadata(c); - - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type) - atomic64_add(1 << 16, &c->key_version); - - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - - return 0; -} diff --git a/libbcache/btree_gc.h b/libbcache/btree_gc.h deleted file mode 100644 index f1794fdf..00000000 --- a/libbcache/btree_gc.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef _BCACHE_GC_H -#define _BCACHE_GC_H - -#include "btree_types.h" - -enum bkey_type; - -void bch_coalesce(struct bch_fs *); -void bch_gc(struct bch_fs *); -void bch_gc_thread_stop(struct bch_fs *); -int bch_gc_thread_start(struct bch_fs *); -int bch_initial_gc(struct bch_fs *, struct list_head *); -u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); -u8 bch_btree_mark_key_initial(struct bch_fs *, enum bkey_type, - struct bkey_s_c); -void bch_mark_dev_metadata(struct bch_fs *, struct bch_dev *); - -/* - * For concurrent mark and sweep (with other index updates), we define a total - * ordering of _all_ references GC walks: - * - * Note that some references will have the same GC position as others - e.g. - * everything within the same btree node; in those cases we're relying on - * whatever locking exists for where those references live, i.e. the write lock - * on a btree node. - * - * That locking is also required to ensure GC doesn't pass the updater in - * between the updater adding/removing the reference and updating the GC marks; - * without that, we would at best double count sometimes. - * - * That part is important - whenever calling bch_mark_pointers(), a lock _must_ - * be held that prevents GC from passing the position the updater is at. - * - * (What about the start of gc, when we're clearing all the marks? GC clears the - * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc - * position inside its cmpxchg loop, so crap magically works). - */ - -/* Position of (the start of) a gc phase: */ -static inline struct gc_pos gc_phase(enum gc_phase phase) -{ - return (struct gc_pos) { - .phase = phase, - .pos = POS_MIN, - .level = 0, - }; -} - -#define GC_POS_MIN gc_phase(0) - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - if (l.phase != r.phase) - return l.phase < r.phase ? -1 : 1; - if (bkey_cmp(l.pos, r.pos)) - return bkey_cmp(l.pos, r.pos); - if (l.level != r.level) - return l.level < r.level ? 
-1 : 1; - return 0; -} - -/* - * GC position of the pointers within a btree node: note, _not_ for &b->key - * itself, that lives in the parent node: - */ -static inline struct gc_pos gc_pos_btree_node(struct btree *b) -{ - return (struct gc_pos) { - .phase = b->btree_id, - .pos = b->key.k.p, - .level = b->level, - }; -} - -/* - * GC position of the pointer to a btree root: we don't use - * gc_pos_pointer_to_btree_node() here to avoid a potential race with - * btree_split() increasing the tree depth - the new root will have level > the - * old root and thus have a greater gc position than the old root, but that - * would be incorrect since once gc has marked the root it's not coming back. - */ -static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -{ - return (struct gc_pos) { - .phase = (int) id, - .pos = POS_MAX, - .level = U8_MAX, - }; -} - -static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) -{ - unsigned seq; - bool ret; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(c->gc_pos, pos) < 0; - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - return ret; -} - -#endif diff --git a/libbcache/btree_io.c b/libbcache/btree_io.c deleted file mode 100644 index 737e54ec..00000000 --- a/libbcache/btree_io.c +++ /dev/null @@ -1,1738 +0,0 @@ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "checksum.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "journal.h" -#include "super-io.h" - -#include <trace/events/bcache.h> - -static void verify_no_dups(struct btree *b, - struct bkey_packed *start, - struct bkey_packed *end) -{ -#ifdef CONFIG_BCACHE_DEBUG - struct bkey_packed *k; - - for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { - struct bkey l = bkey_unpack_key(b, k); - struct bkey r = bkey_unpack_key(b, bkey_next(k)); - - BUG_ON(btree_node_is_extents(b) - ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 - : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); - } -#endif -} - -static void clear_needs_whiteout(struct bset *i) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = false; -} - -static void set_needs_whiteout(struct bset *i) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = true; -} - -static void btree_bounce_free(struct bch_fs *c, unsigned order, - bool used_mempool, void *p) -{ - if (used_mempool) - mempool_free(virt_to_page(p), &c->btree_bounce_pool); - else - free_pages((unsigned long) p, order); -} - -static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, - bool *used_mempool) -{ - void *p; - - BUG_ON(1 << order > btree_pages(c)); - - *used_mempool = false; - p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); - if (p) - return p; - - *used_mempool = true; - return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO)); -} - -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); - -struct sort_iter { - struct btree *b; - unsigned used; - - struct sort_iter_set { - struct bkey_packed *k, *end; - } data[MAX_BSETS + 1]; -}; - -static void sort_iter_init(struct sort_iter *iter, struct btree *b) -{ - memset(iter, 0, sizeof(*iter)); - iter->b = b; -} - -static inline void __sort_iter_sift(struct sort_iter *iter, - unsigned from, - sort_cmp_fn cmp) -{ - unsigned i; - - for (i = from; - i + 1 < iter->used && - cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -{ - - __sort_iter_sift(iter, 0, cmp); -} - -static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -{ - unsigned i = iter->used; - - while (i--) - __sort_iter_sift(iter, i, cmp); -} - -static void sort_iter_add(struct sort_iter *iter, - struct bkey_packed *k, - struct bkey_packed *end) -{ - BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); - - if (k != end) - iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -} - -static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -{ - return iter->used ? 
iter->data->k : NULL; -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - iter->data->k = bkey_next(iter->data->k); - - BUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - memmove(&iter->data[0], - &iter->data[1], - sizeof(iter->data[0]) * --iter->used); - else - sort_iter_sift(iter, cmp); -} - -static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, - sort_cmp_fn cmp) -{ - struct bkey_packed *ret = sort_iter_peek(iter); - - if (ret) - sort_iter_advance(iter, cmp); - - return ret; -} - -static inline int sort_key_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r); -} - -static unsigned sort_key_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_key_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extent_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); -} - -static unsigned sort_extent_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *out = dst; - struct bkey_i l, r; - bool prev = false, l_packed = false; - u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); - u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); - u64 new_size; - - max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); - - sort_iter_sort(iter, sort_extent_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { - EBUG_ON(bkeyp_val_u64s(f, in)); - EBUG_ON(in->type != KEY_TYPE_DISCARD); - - r.k = bkey_unpack_key(iter->b, in); - - if (prev && - bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - new_size = l_packed - ? 
min(max_packed_size, max_packed_offset - - bkey_start_offset(&l.k)) - : KEY_SIZE_MAX; - - new_size = min(new_size, r.k.p.offset - - bkey_start_offset(&l.k)); - - BUG_ON(new_size < l.k.size); - - bch_key_resize(&l.k, new_size); - - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - bch_cut_front(l.k.p, &r); - } - - if (prev) { - if (!bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - l = r; - prev = true; - l_packed = bkey_packed(in); - } - - if (prev) { - if (!bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, - bool compacting, - enum compact_mode mode) -{ - unsigned live_u64s = b->nr.bset_u64s[t - b->set]; - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - - if (live_u64s == bset_u64s) - return 0; - - if (mode == COMPACT_LAZY) { - if (live_u64s * 4 < bset_u64s * 3 || - (compacting && bset_unwritten(b, bset(b, t)))) - return bset_u64s - live_u64s; - } else { - if (bset_written(b, bset(b, t))) - return bset_u64s - live_u64s; - } - - return 0; -} - -bool __bch_compact_whiteouts(struct bch_fs *c, struct btree *b, - enum compact_mode mode) -{ - const struct bkey_format *f = &b->format; - struct bset_tree *t; - struct bkey_packed *whiteouts = NULL; - struct bkey_packed *u_start, *u_pos; - struct sort_iter sort_iter; - unsigned order, whiteout_u64s = 0, u64s; - bool used_mempool, compacting = false; - - for_each_bset(b, t) - whiteout_u64s += should_compact_bset(b, t, - whiteout_u64s != 0, mode); - - if (!whiteout_u64s) - return false; - - sort_iter_init(&sort_iter, b); - - whiteout_u64s += b->whiteout_u64s; - order = get_order(whiteout_u64s * sizeof(u64)); - - whiteouts = btree_bounce_alloc(c, order, &used_mempool); - u_start = u_pos = whiteouts; - - memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); - - sort_iter_add(&sort_iter, u_start, u_pos); - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - struct btree_node_entry *src = NULL, *dst = NULL; - - if (t != b->set && bset_unwritten(b, i)) { - src = container_of(i, struct btree_node_entry, keys); - dst = max(write_block(b), - (void *) btree_bkey_last(b, t -1)); - } - - if (!should_compact_bset(b, t, compacting, mode)) { - if (src != dst) { - memmove(dst, src, sizeof(*src) + - le16_to_cpu(src->keys.u64s) * - sizeof(u64)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - continue; - } - - compacting = true; - u_start = u_pos; - start = i->start; - end = vstruct_last(i); - - if (src != dst) { - memmove(dst, src, sizeof(*src)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_next(k); - - if (bkey_deleted(k) && btree_node_is_extents(b)) - continue; - - if (bkey_whiteout(k) && !k->needs_whiteout) - continue; - - if (bkey_whiteout(k)) { - unreserve_whiteout(b, t, k); - memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); - set_bkeyp_val_u64s(f, u_pos, 0); - u_pos = bkey_next(u_pos); - } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { - bkey_copy(out, k); - out = bkey_next(out); - } - } - - sort_iter_add(&sort_iter, u_start, u_pos); - - if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch_bset_set_no_aux_tree(b, t); - } - } - - b->whiteout_u64s = (u64 *) u_pos 
- (u64 *) whiteouts; - - BUG_ON((void *) unwritten_whiteouts_start(c, b) < - (void *) btree_bkey_last(b, bset_tree_last(b))); - - u64s = btree_node_is_extents(b) - ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter) - : sort_key_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter); - - BUG_ON(u64s > b->whiteout_u64s); - BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); - BUG_ON(u_pos != whiteouts && !u64s); - - if (u64s != b->whiteout_u64s) { - void *src = unwritten_whiteouts_start(c, b); - - b->whiteout_u64s = u64s; - memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); - } - - verify_no_dups(b, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); - - btree_bounce_free(c, order, used_mempool, whiteouts); - - if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) - bch_btree_build_aux_trees(b); - - bch_btree_keys_u64s_remaining(c, b); - bch_verify_btree_nr_keys(b); - - return true; -} - -static bool bch_drop_whiteouts(struct btree *b) -{ - struct bset_tree *t; - bool ret = false; - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - - if (!should_compact_bset(b, t, true, true)) - continue; - - start = btree_bkey_first(b, t); - end = btree_bkey_last(b, t); - - if (bset_unwritten(b, i) && - t != b->set) { - struct bset *dst = - max_t(struct bset *, write_block(b), - (void *) btree_bkey_last(b, t -1)); - - memmove(dst, i, sizeof(struct bset)); - i = dst; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_next(k); - - if (!bkey_whiteout(k)) { - bkey_copy(out, k); - out = bkey_next(out); - } - } - - i->u64s = cpu_to_le16((u64 *) out - i->_data); - bch_bset_set_no_aux_tree(b, t); - ret = true; - } - - bch_verify_btree_nr_keys(b); - - return ret; -} - -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; -} - -static unsigned sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *next, *out = dst; - - sort_iter_sort(iter, sort_keys_cmp); - - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - if (bkey_whiteout(in) && - (next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - /* - * XXX racy, called with read lock from write path - * - * leads to spurious BUG_ON() in bkey_unpack_key() in - * debug mode - */ - next->needs_whiteout |= in->needs_whiteout; - continue; - } - - if (bkey_whiteout(in)) { - memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_copy(out, in); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extents_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_deleted(l) - (int) bkey_deleted(r); -} - -static unsigned sort_extents(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_extents_cmp); - - while ((in = sort_iter_next(iter, sort_extents_cmp))) { - if (bkey_deleted(in)) - continue; - - if (bkey_whiteout(in) && - 
(filter_whiteouts || !in->needs_whiteout)) - continue; - - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static void btree_node_sort(struct bch_fs *c, struct btree *b, - struct btree_iter *iter, - unsigned start_idx, - unsigned end_idx, - bool filter_whiteouts) -{ - struct btree_node *out; - struct sort_iter sort_iter; - struct bset_tree *t; - struct bset *start_bset = bset(b, &b->set[start_idx]); - bool used_mempool = false; - u64 start_time; - unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; - bool sorting_entire_node = start_idx == 0 && - end_idx == b->nsets; - - sort_iter_init(&sort_iter, b); - - for (t = b->set + start_idx; - t < b->set + end_idx; - t++) { - u64s += le16_to_cpu(bset(b, t)->u64s); - sort_iter_add(&sort_iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - } - - order = sorting_entire_node - ? btree_page_order(c) - : get_order(__vstruct_bytes(struct btree_node, u64s)); - - out = btree_bounce_alloc(c, order, &used_mempool); - - start_time = local_clock(); - - if (btree_node_is_extents(b)) - filter_whiteouts = bset_written(b, start_bset); - - u64s = btree_node_is_extents(b) - ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts) - : sort_keys(out->keys.start, &sort_iter, filter_whiteouts); - - out->keys.u64s = cpu_to_le16(u64s); - - BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); - - if (sorting_entire_node) - bch_time_stats_update(&c->btree_sort_time, start_time); - - /* Make sure we preserve bset journal_seq: */ - for (t = b->set + start_idx + 1; - t < b->set + end_idx; - t++) - start_bset->journal_seq = - max(start_bset->journal_seq, - bset(b, t)->journal_seq); - - if (sorting_entire_node) { - unsigned u64s = le16_to_cpu(out->keys.u64s); - - BUG_ON(order != btree_page_order(c)); - - /* - * Our temporary buffer is the same size as the btree node's - * buffer, we can just swap buffers instead of doing a big - * memcpy() - */ - *out = *b->data; - out->keys.u64s = cpu_to_le16(u64s); - swap(out, b->data); - set_btree_bset(b, b->set, &b->data->keys); - } else { - start_bset->u64s = out->keys.u64s; - memcpy_u64s(start_bset->start, - out->keys.start, - le16_to_cpu(out->keys.u64s)); - } - - for (i = start_idx + 1; i < end_idx; i++) - b->nr.bset_u64s[start_idx] += - b->nr.bset_u64s[i]; - - b->nsets -= shift; - - for (i = start_idx + 1; i < b->nsets; i++) { - b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; - b->set[i] = b->set[i + shift]; - } - - for (i = b->nsets; i < MAX_BSETS; i++) - b->nr.bset_u64s[i] = 0; - - set_btree_bset_end(b, &b->set[start_idx]); - bch_bset_set_no_aux_tree(b, &b->set[start_idx]); - - btree_bounce_free(c, order, used_mempool, out); - - bch_verify_btree_nr_keys(b); -} - -/* Sort + repack in a new format: */ -static struct btree_nr_keys sort_repack(struct bset *dst, - struct btree *src, - struct btree_node_iter *src_iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = vstruct_last(dst); - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - while ((in = bch_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_whiteout(in)) - continue; - - if (bch_bkey_transform(out_f, out, bkey_packed(in) - ? 
in_f : &bch_bkey_format_current, in)) - out->format = KEY_FORMAT_LOCAL_BTREE; - else - bkey_unpack(src, (void *) out, in); - - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Sort, repack, and merge: */ -static struct btree_nr_keys sort_repack_merge(struct bch_fs *c, - struct bset *dst, - struct btree *src, - struct btree_node_iter *iter, - struct bkey_format *out_f, - bool filter_whiteouts, - key_filter_fn filter, - key_merge_fn merge) -{ - struct bkey_packed *k, *prev = NULL, *out; - struct btree_nr_keys nr; - BKEY_PADDED(k) tmp; - - memset(&nr, 0, sizeof(nr)); - - while ((k = bch_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_whiteout(k)) - continue; - - /* - * The filter might modify pointers, so we have to unpack the - * key and values to &tmp.k: - */ - bkey_unpack(src, &tmp.k, k); - - if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) - continue; - - /* prev is always unpacked, for key merging: */ - - if (prev && - merge && - merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) - continue; - - /* - * the current key becomes the new prev: advance prev, then - * copy the current key - but first pack prev (in place): - */ - if (prev) { - bkey_pack(prev, (void *) prev, out_f); - - btree_keys_account_key_add(&nr, 0, prev); - prev = bkey_next(prev); - } else { - prev = vstruct_last(dst); - } - - bkey_copy(prev, &tmp.k); - } - - if (prev) { - bkey_pack(prev, (void *) prev, out_f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = vstruct_last(dst); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -void bch_btree_sort_into(struct bch_fs *c, - struct btree *dst, - struct btree *src) -{ - struct btree_nr_keys nr; - struct btree_node_iter src_iter; - u64 start_time = local_clock(); - - BUG_ON(dst->nsets != 1); - - bch_bset_set_no_aux_tree(dst, dst->set); - - bch_btree_node_iter_init_from_start(&src_iter, src, - btree_node_is_extents(src)); - - if (btree_node_ops(src)->key_normalize || - btree_node_ops(src)->key_merge) - nr = sort_repack_merge(c, btree_bset_first(dst), - src, &src_iter, - &dst->format, - true, - btree_node_ops(src)->key_normalize, - btree_node_ops(src)->key_merge); - else - nr = sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); - - bch_time_stats_update(&c->btree_sort_time, start_time); - - set_btree_bset_end(dst, dst->set); - - dst->nr.live_u64s += nr.live_u64s; - dst->nr.bset_u64s[0] += nr.bset_u64s[0]; - dst->nr.packed_keys += nr.packed_keys; - dst->nr.unpacked_keys += nr.unpacked_keys; - - bch_verify_btree_nr_keys(dst); -} - -#define SORT_CRIT (4096 / sizeof(u64)) - -/* - * We're about to add another bset to the btree node, so if there's currently - * too many bsets - sort some of them together: - */ -static bool btree_node_compact(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) -{ - unsigned unwritten_idx; - bool ret = false; - - for (unwritten_idx = 0; - unwritten_idx < b->nsets; - unwritten_idx++) - if (bset_unwritten(b, bset(b, &b->set[unwritten_idx]))) - break; - - if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, iter, unwritten_idx, - b->nsets, false); - ret = true; - } - - if (unwritten_idx > 1) { - btree_node_sort(c, b, iter, 0, unwritten_idx, false); - ret = true; - } - - return ret; -} - -void bch_btree_build_aux_trees(struct btree *b) -{ - struct bset_tree *t; - - for_each_bset(b, t) - bch_bset_build_aux_tree(b, t, - 
bset_unwritten(b, bset(b, t)) && - t == bset_tree_last(b)); -} - -/* - * @bch_btree_init_next - initialize a new (unwritten) bset that can then be - * inserted into - * - * Safe to call if there already is an unwritten bset - will only add a new bset - * if @b doesn't already have one. - * - * Returns true if we sorted (i.e. invalidated iterators - */ -void bch_btree_init_next(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) -{ - struct btree_node_entry *bne; - bool did_sort; - - EBUG_ON(!(b->lock.state.seq & 1)); - EBUG_ON(iter && iter->nodes[b->level] != b); - - did_sort = btree_node_compact(c, b, iter); - - bne = want_new_bset(c, b); - if (bne) - bch_bset_init_next(b, &bne->keys); - - bch_btree_build_aux_trees(b); - - if (iter && did_sort) - bch_btree_iter_reinit_node(iter, b); -} - -static struct nonce btree_nonce(struct btree *b, - struct bset *i, - unsigned offset) -{ - return (struct nonce) {{ - [0] = cpu_to_le32(offset), - [1] = ((__le32 *) &i->seq)[0], - [2] = ((__le32 *) &i->seq)[1], - [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, - }}; -} - -static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce) -{ - bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); -} - -#define btree_node_error(b, c, ptr, fmt, ...) \ - bch_fs_inconsistent(c, \ - "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\ - (b)->btree_id, (b)->level, btree_node_root(c, b) \ - ? btree_node_root(c, b)->level : -1, \ - PTR_BUCKET_NR(ca, ptr), (b)->written, \ - le16_to_cpu((i)->u64s), ##__VA_ARGS__) - -static const char *validate_bset(struct bch_fs *c, struct btree *b, - struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - struct bset *i, unsigned sectors, - unsigned *whiteout_u64s) -{ - struct bkey_packed *k, *prev = NULL; - struct bpos prev_pos = POS_MIN; - bool seen_non_whiteout = false; - - if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) - return "unsupported bset version"; - - if (b->written + sectors > c->sb.btree_node_size) - return "bset past end of btree node"; - - if (i != &b->data->keys && !i->u64s) - btree_node_error(b, c, ptr, "empty set"); - - if (!BSET_SEPARATE_WHITEOUTS(i)) { - seen_non_whiteout = true; - whiteout_u64s = 0; - } - - for (k = i->start; - k != vstruct_last(i);) { - struct bkey_s_c u; - struct bkey tmp; - const char *invalid; - - if (!k->u64s) { - btree_node_error(b, c, ptr, - "KEY_U64s 0: %zu bytes of metadata lost", - vstruct_end(i) - (void *) k); - - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - - if (bkey_next(k) > vstruct_last(i)) { - btree_node_error(b, c, ptr, - "key extends past end of bset"); - - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - - if (k->format > KEY_FORMAT_CURRENT) { - btree_node_error(b, c, ptr, - "invalid bkey format %u", k->format); - - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } - - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch_bkey_swab(btree_node_type(b), &b->format, k); - - u = bkey_disassemble(b, k, &tmp); - - invalid = btree_bkey_invalid(c, b, u); - if (invalid) { - char buf[160]; - - bch_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), u); - btree_node_error(b, c, ptr, - "invalid bkey %s: %s", buf, invalid); - - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } - - /* - * with the separate whiteouts thing 
(used for extents), the - * second set of keys actually can have whiteouts too, so we - * can't solely go off bkey_whiteout()... - */ - - if (!seen_non_whiteout && - (!bkey_whiteout(k) || - (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { - *whiteout_u64s = k->_data - i->_data; - seen_non_whiteout = true; - } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { - btree_node_error(b, c, ptr, - "keys out of order: %llu:%llu > %llu:%llu", - prev_pos.inode, - prev_pos.offset, - u.k->p.inode, - bkey_start_offset(u.k)); - /* XXX: repair this */ - } - - prev_pos = u.k->p; - prev = k; - k = bkey_next(k); - } - - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - return NULL; -} - -static bool extent_contains_ptr(struct bkey_s_c_extent e, - struct bch_extent_ptr match) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (!memcmp(ptr, &match, sizeof(*ptr))) - return true; - - return false; -} - -void bch_btree_node_read_done(struct bch_fs *c, struct btree *b, - struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - struct btree_node_entry *bne; - struct bset *i = &b->data->keys; - struct btree_node_iter *iter; - struct btree_node *sorted; - bool used_mempool; - unsigned u64s; - const char *err; - struct bch_csum csum; - struct nonce nonce; - int ret; - - iter = mempool_alloc(&c->fill_iter, GFP_NOIO); - __bch_btree_node_iter_init(iter, btree_node_is_extents(b)); - - err = "dynamic fault"; - if (bch_meta_read_fault("btree")) - goto err; - - while (b->written < c->sb.btree_node_size) { - unsigned sectors, whiteout_u64s = 0; - - if (!b->written) { - i = &b->data->keys; - - err = "bad magic"; - if (le64_to_cpu(b->data->magic) != bset_magic(c)) - goto err; - - err = "bad btree header"; - if (!b->data->keys.seq) - goto err; - - err = "unknown checksum type"; - if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - goto err; - - /* XXX: retry checksum errors */ - - nonce = btree_nonce(b, i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - - err = "bad checksum"; - if (bch_crc_cmp(csum, b->data->csum)) - goto err; - - bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, - &b->data->flags, - (void *) &b->data->keys - - (void *) &b->data->flags); - nonce = nonce_add(nonce, - round_up((void *) &b->data->keys - - (void *) &b->data->flags, - CHACHA20_BLOCK_SIZE)); - bset_encrypt(c, i, nonce); - - sectors = vstruct_sectors(b->data, c->block_bits); - - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &b->data->ptr; - - *p = swab64(*p); - bch_bpos_swab(&b->data->min_key); - bch_bpos_swab(&b->data->max_key); - } - - err = "incorrect btree id"; - if (BTREE_NODE_ID(b->data) != b->btree_id) - goto err; - - err = "incorrect level"; - if (BTREE_NODE_LEVEL(b->data) != b->level) - goto err; - - err = "incorrect max key"; - if (bkey_cmp(b->data->max_key, b->key.k.p)) - goto err; - - err = "incorrect backpointer"; - if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), - b->data->ptr)) - goto err; - - err = bch_bkey_format_validate(&b->data->format); - if (err) - goto err; - - set_btree_bset(b, b->set, &b->data->keys); - - btree_node_set_format(b, b->data->format); - } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - - err = "unknown checksum type"; - if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - goto err; - - nonce = btree_nonce(b, i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - err = "bad checksum"; - if (memcmp(&csum, &bne->csum, sizeof(csum))) - goto err; - - bset_encrypt(c, 
i, nonce); - - sectors = vstruct_sectors(bne, c->block_bits); - } - - err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s); - if (err) - goto err; - - b->written += sectors; - - err = "insufficient memory"; - ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); - if (ret < 0) - goto err; - - if (ret) - continue; - - __bch_btree_node_iter_push(iter, b, - i->start, - vstruct_idx(i, whiteout_u64s)); - - __bch_btree_node_iter_push(iter, b, - vstruct_idx(i, whiteout_u64s), - vstruct_last(i)); - } - - err = "corrupted btree"; - for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); - bne = (void *) bne + block_bytes(c)) - if (bne->keys.seq == b->data->keys.seq) - goto err; - - sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool); - sorted->keys.u64s = 0; - - b->nr = btree_node_is_extents(b) - ? bch_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) - : bch_key_sort_fix_overlapping(&sorted->keys, b, iter); - - u64s = le16_to_cpu(sorted->keys.u64s); - *sorted = *b->data; - sorted->keys.u64s = cpu_to_le16(u64s); - swap(sorted, b->data); - set_btree_bset(b, b->set, &b->data->keys); - b->nsets = 1; - - BUG_ON(b->nr.live_u64s != u64s); - - btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted); - - bch_bset_build_aux_tree(b, b->set, false); - - set_needs_whiteout(btree_bset_first(b)); - - btree_node_reset_sib_u64s(b); -out: - mempool_free(iter, &c->fill_iter); - return; -err: - set_btree_node_read_error(b); - btree_node_error(b, c, ptr, "%s", err); - goto out; -} - -static void btree_node_read_endio(struct bio *bio) -{ - closure_put(bio->bi_private); -} - -void bch_btree_node_read(struct bch_fs *c, struct btree *b) -{ - uint64_t start_time = local_clock(); - struct closure cl; - struct bio *bio; - struct extent_pick_ptr pick; - - trace_bcache_btree_read(c, b); - - closure_init_stack(&cl); - - pick = bch_btree_pick_ptr(c, b); - if (bch_fs_fatal_err_on(!pick.ca, c, - "no cache device for btree node")) { - set_btree_node_read_error(b); - return; - } - - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); - bio->bi_bdev = pick.ca->disk_sb.bdev; - bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); - bio->bi_end_io = btree_node_read_endio; - bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); - - bch_bio_map(bio, b->data); - - closure_get(&cl); - bch_generic_make_request(bio, c); - closure_sync(&cl); - - if (bch_dev_fatal_io_err_on(bio->bi_error, - pick.ca, "IO error reading bucket %zu", - PTR_BUCKET_NR(pick.ca, &pick.ptr)) || - bch_meta_read_fault("btree")) { - set_btree_node_read_error(b); - goto out; - } - - bch_btree_node_read_done(c, b, pick.ca, &pick.ptr); - bch_time_stats_update(&c->btree_read_time, start_time); -out: - bio_put(bio); - percpu_ref_put(&pick.ca->io_ref); -} - -int bch_btree_root_read(struct bch_fs *c, enum btree_id id, - const struct bkey_i *k, unsigned level) -{ - struct closure cl; - struct btree *b; - int ret; - - closure_init_stack(&cl); - - do { - ret = mca_cannibalize_lock(c, &cl); - closure_sync(&cl); - } while (ret); - - b = mca_alloc(c); - mca_cannibalize_unlock(c); - - BUG_ON(IS_ERR(b)); - - bkey_copy(&b->key, k); - BUG_ON(mca_hash_insert(c, b, level, id)); - - bch_btree_node_read(c, b); - six_unlock_write(&b->lock); - - if (btree_node_read_error(b)) { - six_unlock_intent(&b->lock); - return -EIO; - } - - bch_btree_set_root_initial(c, b, NULL); - six_unlock_intent(&b->lock); - - return 0; -} - -void 
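For illustration, the read loop above decides which blocks belong to the node purely by whether a bset's seq matches the seq in the node header, and the trailing scan reuses the same test to detect stray bsets past the written area. A reduced sketch of that seq test, with hypothetical demo types, stepping one block at a time (the real code advances by each bset's actual on-disk size and also verifies checksums):

    struct demo_bset       { unsigned long long seq; unsigned short u64s; };
    struct demo_node_entry { struct demo_bset keys; };

    static unsigned demo_count_bsets(const void *node, unsigned node_bytes,
    				 unsigned block_bytes,
    				 unsigned long long hdr_seq)
    {
    	const char *p   = (const char *) node + block_bytes; /* skip header block */
    	const char *end = (const char *) node + node_bytes;
    	unsigned nr = 1;				/* the header's own bset */

    	for (; p < end; p += block_bytes) {
    		const struct demo_node_entry *bne =
    			(const struct demo_node_entry *) p;

    		if (bne->keys.seq != hdr_seq)
    			break;			/* first mismatch ends the node */
    		nr++;
    	}

    	return nr;
    }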
bch_btree_complete_write(struct bch_fs *c, struct btree *b, - struct btree_write *w) -{ - bch_journal_pin_drop(&c->journal, &w->journal); - closure_wake_up(&w->wait); -} - -static void btree_node_write_done(struct bch_fs *c, struct btree *b) -{ - struct btree_write *w = btree_prev_write(b); - - /* - * Before calling bch_btree_complete_write() - if the write errored, we - * have to halt new journal writes before they see this btree node - * write as completed: - */ - if (btree_node_write_error(b)) - bch_journal_halt(&c->journal); - - bch_btree_complete_write(c, b, w); - btree_node_io_unlock(b); -} - -static void btree_node_write_endio(struct bio *bio) -{ - struct btree *b = bio->bi_private; - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_fs *c = wbio->c; - struct bio *orig = wbio->split ? wbio->orig : NULL; - struct closure *cl = !wbio->split ? wbio->cl : NULL; - struct bch_dev *ca = wbio->ca; - - if (bch_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") || - bch_meta_write_fault("btree")) - set_btree_node_write_error(b); - - if (wbio->bounce) - btree_bounce_free(c, - wbio->order, - wbio->used_mempool, - page_address(bio->bi_io_vec[0].bv_page)); - - if (wbio->put_bio) - bio_put(bio); - - if (orig) { - bio_endio(orig); - } else { - btree_node_write_done(c, b); - if (cl) - closure_put(cl); - } - - if (ca) - percpu_ref_put(&ca->io_ref); -} - -void __bch_btree_node_write(struct bch_fs *c, struct btree *b, - struct closure *parent, - enum six_lock_type lock_type_held, - int idx_to_write) -{ - struct bio *bio; - struct bch_write_bio *wbio; - struct bset_tree *t; - struct bset *i; - struct btree_node *bn = NULL; - struct btree_node_entry *bne = NULL; - BKEY_PADDED(key) k; - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - struct sort_iter sort_iter; - struct nonce nonce; - unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; - u64 seq = 0; - bool used_mempool; - unsigned long old, new; - void *data; - - /* - * We may only have a read lock on the btree node - the dirty bit is our - * "lock" against racing with other threads that may be trying to start - * a write, we do a write iff we clear the dirty bit. Since setting the - * dirty bit requires a write lock, we can't race with other threads - * redirtying it: - */ - do { - old = new = READ_ONCE(b->flags); - - if (!(old & (1 << BTREE_NODE_dirty))) - return; - - if (idx_to_write >= 0 && - idx_to_write != !!(old & (1 << BTREE_NODE_write_idx))) - return; - - if (old & (1 << BTREE_NODE_write_in_flight)) { - wait_on_bit_io(&b->flags, - BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); - continue; - } - - new &= ~(1 << BTREE_NODE_dirty); - new |= (1 << BTREE_NODE_write_in_flight); - new |= (1 << BTREE_NODE_just_written); - new ^= (1 << BTREE_NODE_write_idx); - } while (cmpxchg_acquire(&b->flags, old, new) != old); - - BUG_ON(!list_empty(&b->write_blocked)); - - BUG_ON(b->written >= c->sb.btree_node_size); - BUG_ON(bset_written(b, btree_bset_last(b))); - BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); - BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - - if (lock_type_held == SIX_LOCK_intent) { - six_lock_write(&b->lock); - __bch_compact_whiteouts(c, b, COMPACT_WRITTEN); - six_unlock_write(&b->lock); - } else { - __bch_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); - } - - BUG_ON(b->uncompacted_whiteout_u64s); - - sort_iter_init(&sort_iter, b); - - bytes = !b->written - ? 
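The do/while cmpxchg loop at the top of __bch_btree_node_write() above treats the dirty bit as the write "lock": whoever clears it owns the write. A standalone sketch of the same claim pattern using C11 atomics and hypothetical flag names (simplified: it backs off instead of sleeping when a write is already in flight, and omits the write_idx/just_written bookkeeping):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define DEMO_DIRTY		(1u << 0)
    #define DEMO_WRITE_IN_FLIGHT	(1u << 1)

    static bool demo_claim_write(_Atomic unsigned *flags)
    {
    	unsigned old = atomic_load(flags), new;

    	do {
    		if (!(old & DEMO_DIRTY))
    			return false;		/* nothing to write */
    		if (old & DEMO_WRITE_IN_FLIGHT)
    			return false;		/* someone else owns the write */

    		new = (old & ~DEMO_DIRTY) | DEMO_WRITE_IN_FLIGHT;
    	} while (!atomic_compare_exchange_weak(flags, &old, new));

    	return true;	/* we cleared the dirty bit, so this write is ours */
    }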
sizeof(struct btree_node) - : sizeof(struct btree_node_entry); - - bytes += b->whiteout_u64s * sizeof(u64); - - for_each_bset(b, t) { - i = bset(b, t); - - if (bset_written(b, i)) - continue; - - bytes += le16_to_cpu(i->u64s) * sizeof(u64); - sort_iter_add(&sort_iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - seq = max(seq, le64_to_cpu(i->journal_seq)); - } - - order = get_order(bytes); - data = btree_bounce_alloc(c, order, &used_mempool); - - if (!b->written) { - bn = data; - *bn = *b->data; - i = &bn->keys; - } else { - bne = data; - bne->keys = b->data->keys; - i = &bne->keys; - } - - i->journal_seq = cpu_to_le64(seq); - i->u64s = 0; - - if (!btree_node_is_extents(b)) { - sort_iter_add(&sort_iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); - SET_BSET_SEPARATE_WHITEOUTS(i, false); - } else { - memcpy_u64s(i->start, - unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - i->u64s = cpu_to_le16(b->whiteout_u64s); - SET_BSET_SEPARATE_WHITEOUTS(i, true); - } - - b->whiteout_u64s = 0; - - u64s = btree_node_is_extents(b) - ? sort_extents(vstruct_last(i), &sort_iter, false) - : sort_keys(i->start, &sort_iter, false); - le16_add_cpu(&i->u64s, u64s); - - clear_needs_whiteout(i); - - if (b->written && !i->u64s) { - /* Nothing to write: */ - btree_bounce_free(c, order, used_mempool, data); - btree_node_write_done(c, b); - return; - } - - BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); - BUG_ON(i->seq != b->data->keys.seq); - - i->version = cpu_to_le16(BCACHE_BSET_VERSION); - SET_BSET_CSUM_TYPE(i, bch_meta_checksum_type(c)); - - nonce = btree_nonce(b, i, b->written << 9); - - if (bn) { - bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, - &bn->flags, - (void *) &b->data->keys - - (void *) &b->data->flags); - nonce = nonce_add(nonce, - round_up((void *) &b->data->keys - - (void *) &b->data->flags, - CHACHA20_BLOCK_SIZE)); - bset_encrypt(c, i, nonce); - - nonce = btree_nonce(b, i, b->written << 9); - bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); - } else { - bset_encrypt(c, i, nonce); - - bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - } - - bytes_to_write = vstruct_end(i) - data; - sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; - - memset(data + bytes_to_write, 0, - (sectors_to_write << 9) - bytes_to_write); - - BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size); - - trace_bcache_btree_write(b, bytes_to_write, sectors_to_write); - - /* - * We handle btree write errors by immediately halting the journal - - * after we've done that, we can't issue any subsequent btree writes - * because they might have pointers to new nodes that failed to write. 
- * - * Furthermore, there's no point in doing any more btree writes because - * with the journal stopped, we're never going to update the journal to - * reflect that those writes were done and the data flushed from the - * journal: - * - * Make sure to update b->written so bch_btree_init_next() doesn't - * break: - */ - if (bch_journal_error(&c->journal) || - c->opts.nochanges) { - set_btree_node_noevict(b); - b->written += sectors_to_write; - - btree_bounce_free(c, order, used_mempool, data); - btree_node_write_done(c, b); - return; - } - - bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write); - - wbio = to_wbio(bio); - wbio->cl = parent; - wbio->bounce = true; - wbio->put_bio = true; - wbio->order = order; - wbio->used_mempool = used_mempool; - bio->bi_iter.bi_size = sectors_to_write << 9; - bio->bi_end_io = btree_node_write_endio; - bio->bi_private = b; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); - - if (parent) - closure_get(parent); - - bch_bio_map(bio, data); - - /* - * If we're appending to a leaf node, we don't technically need FUA - - * this write just needs to be persisted before the next journal write, - * which will be marked FLUSH|FUA. - * - * Similarly if we're writing a new btree root - the pointer is going to - * be in the next journal entry. - * - * But if we're writing a new btree node (that isn't a root) or - * appending to a non leaf btree node, we need either FUA or a flush - * when we write the parent with the new pointer. FUA is cheaper than a - * flush, and writes appending to leaf nodes aren't blocking anything so - * just make all btree node writes FUA to keep things sane. - */ - - bkey_copy(&k.key, &b->key); - e = bkey_i_to_s_extent(&k.key); - - extent_for_each_ptr(e, ptr) - ptr->offset += b->written; - - extent_for_each_ptr(e, ptr) - atomic64_add(sectors_to_write, - &c->devs[ptr->dev]->btree_sectors_written); - - b->written += sectors_to_write; - - bch_submit_wbio_replicas(wbio, c, &k.key, true); -} - -/* - * Work that must be done with write lock held: - */ -bool bch_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -{ - bool invalidated_iter = false; - struct btree_node_entry *bne; - struct bset_tree *t; - - if (!btree_node_just_written(b)) - return false; - - BUG_ON(b->whiteout_u64s); - BUG_ON(b->uncompacted_whiteout_u64s); - - clear_btree_node_just_written(b); - - /* - * Note: immediately after write, bset_unwritten()/bset_written() don't - * work - the amount of data we had to write after compaction might have - * been smaller than the offset of the last bset. 
- * - * However, we know that all bsets have been written here, as long as - * we're still holding the write lock: - */ - - /* - * XXX: decide if we really want to unconditionally sort down to a - * single bset: - */ - if (b->nsets > 1) { - btree_node_sort(c, b, NULL, 0, b->nsets, true); - invalidated_iter = true; - } else { - invalidated_iter = bch_drop_whiteouts(b); - } - - for_each_bset(b, t) - set_needs_whiteout(bset(b, t)); - - bch_btree_verify(c, b); - - /* - * If later we don't unconditionally sort down to a single bset, we have - * to ensure this is still true: - */ - BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); - - bne = want_new_bset(c, b); - if (bne) - bch_bset_init_next(b, &bne->keys); - - bch_btree_build_aux_trees(b); - - return invalidated_iter; -} - -/* - * Use this one if the node is intent locked: - */ -void bch_btree_node_write(struct bch_fs *c, struct btree *b, - struct closure *parent, - enum six_lock_type lock_type_held, - int idx_to_write) -{ - BUG_ON(lock_type_held == SIX_LOCK_write); - - if (lock_type_held == SIX_LOCK_intent || - six_trylock_convert(&b->lock, SIX_LOCK_read, - SIX_LOCK_intent)) { - __bch_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write); - - six_lock_write(&b->lock); - bch_btree_post_write_cleanup(c, b); - six_unlock_write(&b->lock); - - if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->lock); - } else { - __bch_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write); - } -} - -static void bch_btree_node_write_dirty(struct bch_fs *c, struct btree *b, - struct closure *parent) -{ - six_lock_read(&b->lock); - BUG_ON(b->level); - - bch_btree_node_write(c, b, parent, SIX_LOCK_read, -1); - six_unlock_read(&b->lock); -} - -/* - * Write all dirty btree nodes to disk, including roots - */ -void bch_btree_flush(struct bch_fs *c) -{ - struct closure cl; - struct btree *b; - struct bucket_table *tbl; - struct rhash_head *pos; - bool dropped_lock; - unsigned i; - - closure_init_stack(&cl); - - rcu_read_lock(); - - do { - dropped_lock = false; - i = 0; -restart: - tbl = rht_dereference_rcu(c->btree_cache_table.tbl, - &c->btree_cache_table); - - for (; i < tbl->size; i++) - rht_for_each_entry_rcu(b, pos, tbl, i, hash) - /* - * XXX - locking for b->level, when called from - * bch_journal_move() - */ - if (!b->level && btree_node_dirty(b)) { - rcu_read_unlock(); - bch_btree_node_write_dirty(c, b, &cl); - dropped_lock = true; - rcu_read_lock(); - goto restart; - } - } while (dropped_lock); - - rcu_read_unlock(); - - closure_sync(&cl); -} - -/** - * bch_btree_node_flush_journal - flush any journal entries that contain keys - * from this node - * - * The bset's journal sequence number is used for preserving ordering of index - * updates across unclean shutdowns - it's used to ignore bsets newer than the - * most recent journal entry. - * - * But when rewriting btree nodes we compact all the bsets in a btree node - and - * if we compacted a bset that should be ignored with bsets we do need, that - * would be bad. So to avoid that, prior to making the new node visible ensure - * that the journal has been flushed so that all the bsets we compacted should - * be visible. 
- */ -void bch_btree_node_flush_journal_entries(struct bch_fs *c, - struct btree *b, - struct closure *cl) -{ - int i = b->nsets; - - /* - * Journal sequence numbers in the different bsets will always be in - * ascending order, we only need to flush the highest - except that the - * most recent bset might not have a journal sequence number yet, so we - * need to loop: - */ - while (i--) { - u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq); - - if (seq) { - bch_journal_flush_seq_async(&c->journal, seq, cl); - break; - } - } -} diff --git a/libbcache/btree_io.h b/libbcache/btree_io.h deleted file mode 100644 index 0f75f456..00000000 --- a/libbcache/btree_io.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef _BCACHE_BTREE_IO_H -#define _BCACHE_BTREE_IO_H - -struct bch_fs; -struct btree_write; -struct btree; -struct btree_iter; - -static inline void btree_node_io_unlock(struct btree *b) -{ - EBUG_ON(!btree_node_write_in_flight(b)); - clear_btree_node_write_in_flight(b); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -} - -static inline void btree_node_io_lock(struct btree *b) -{ - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -enum compact_mode { - COMPACT_LAZY, - COMPACT_WRITTEN, - COMPACT_WRITTEN_NO_WRITE_LOCK, -}; - -bool __bch_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); - -static inline bool bch_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -{ - struct bset_tree *t; - - for_each_bset(b, t) { - unsigned live_u64s = b->nr.bset_u64s[t - b->set]; - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - - if (live_u64s * 4 < bset_u64s * 3) - goto compact; - } - - return false; -compact: - return __bch_compact_whiteouts(c, b, COMPACT_LAZY); -} - -void bch_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); - -void bch_btree_build_aux_trees(struct btree *); -void bch_btree_init_next(struct bch_fs *, struct btree *, - struct btree_iter *); - -void bch_btree_node_read_done(struct bch_fs *, struct btree *, - struct bch_dev *, const struct bch_extent_ptr *); -void bch_btree_node_read(struct bch_fs *, struct btree *); -int bch_btree_root_read(struct bch_fs *, enum btree_id, - const struct bkey_i *, unsigned); - -void bch_btree_complete_write(struct bch_fs *, struct btree *, - struct btree_write *); - -void __bch_btree_node_write(struct bch_fs *, struct btree *, - struct closure *, enum six_lock_type, int); -bool bch_btree_post_write_cleanup(struct bch_fs *, struct btree *); - -void bch_btree_node_write(struct bch_fs *, struct btree *, - struct closure *, enum six_lock_type, int); - -void bch_btree_flush(struct bch_fs *); -void bch_btree_node_flush_journal_entries(struct bch_fs *, struct btree *, - struct closure *); - -#endif /* _BCACHE_BTREE_IO_H */ diff --git a/libbcache/btree_iter.c b/libbcache/btree_iter.c deleted file mode 100644 index 04b4bc2e..00000000 --- a/libbcache/btree_iter.c +++ /dev/null @@ -1,1150 +0,0 @@ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "debug.h" -#include "extents.h" - -#include <trace/events/bcache.h> - -#define BTREE_ITER_NOT_END ((struct btree *) 1) - -static inline bool is_btree_node(struct btree_iter *iter, unsigned l) -{ - return iter->nodes[l] && iter->nodes[l] != BTREE_ITER_NOT_END; -} - -/* Btree node locking: */ - -/* - * Updates the saved lock sequence number, so that btree_node_relock() will - * succeed: - */ -void btree_node_unlock_write(struct btree *b, struct 
btree_iter *iter) -{ - struct btree_iter *linked; - - EBUG_ON(iter->nodes[b->level] != b); - EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); - - for_each_linked_btree_node(iter, b, linked) - linked->lock_seq[b->level] += 2; - - iter->lock_seq[b->level] += 2; - - six_unlock_write(&b->lock); -} - -void btree_node_lock_write(struct btree *b, struct btree_iter *iter) -{ - struct btree_iter *linked; - unsigned readers = 0; - - EBUG_ON(iter->nodes[b->level] != b); - EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); - - if (six_trylock_write(&b->lock)) - return; - - for_each_linked_btree_iter(iter, linked) - if (linked->nodes[b->level] == b && - btree_node_read_locked(linked, b->level)) - readers++; - - if (likely(!readers)) { - six_lock_write(&b->lock); - } else { - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->lock.state.counter); - six_lock_write(&b->lock); - atomic64_add(__SIX_VAL(read_lock, readers), - &b->lock.state.counter); - } -} - -/* versions that allow iter to be null: */ -void __btree_node_unlock_write(struct btree *b, struct btree_iter *iter) -{ - if (likely(iter)) - btree_node_unlock_write(b, iter); - else - six_unlock_write(&b->lock); -} - -void __btree_node_lock_write(struct btree *b, struct btree_iter *iter) -{ - if (likely(iter)) - btree_node_lock_write(b, iter); - else - six_lock_write(&b->lock); -} - -bool btree_node_relock(struct btree_iter *iter, unsigned level) -{ - struct btree_iter *linked; - struct btree *b = iter->nodes[level]; - enum btree_node_locked_type want = btree_lock_want(iter, level); - enum btree_node_locked_type have = btree_node_locked_type(iter, level); - - if (want == have) - return true; - - if (!is_btree_node(iter, level)) - return false; - - if (race_fault()) - return false; - - if (have != BTREE_NODE_UNLOCKED - ? 
six_trylock_convert(&b->lock, have, want) - : six_relock_type(&b->lock, want, iter->lock_seq[level])) - goto success; - - for_each_linked_btree_iter(iter, linked) - if (linked->nodes[level] == b && - btree_node_locked_type(linked, level) == want && - iter->lock_seq[level] == b->lock.state.seq) { - btree_node_unlock(iter, level); - six_lock_increment(&b->lock, want); - goto success; - } - - return false; -success: - mark_btree_node_unlocked(iter, level); - mark_btree_node_locked(iter, level, want); - return true; -} - -/* Slowpath: */ -bool __bch_btree_node_lock(struct btree *b, struct bpos pos, - unsigned level, - struct btree_iter *iter, - enum six_lock_type type) -{ - struct btree_iter *linked; - - /* Can't have children locked before ancestors: */ - EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); - - /* - * Can't hold any read locks while we block taking an intent lock - see - * below for reasoning, and we should have already dropped any read - * locks in the current iterator - */ - EBUG_ON(type == SIX_LOCK_intent && - iter->nodes_locked != iter->nodes_intent_locked); - - for_each_linked_btree_iter(iter, linked) - if (linked->nodes[level] == b && - btree_node_locked_type(linked, level) == type) { - six_lock_increment(&b->lock, type); - return true; - } - - /* - * Must lock btree nodes in key order - this case hapens when locking - * the prev sibling in btree node merging: - */ - if (iter->nodes_locked && - __ffs(iter->nodes_locked) == level && - __btree_iter_cmp(iter->btree_id, pos, iter)) - return false; - - for_each_linked_btree_iter(iter, linked) { - if (!linked->nodes_locked) - continue; - - /* - * Can't block taking an intent lock if we have _any_ nodes read - * locked: - * - * - Our read lock blocks another thread with an intent lock on - * the same node from getting a write lock, and thus from - * dropping its intent lock - * - * - And the other thread may have multiple nodes intent locked: - * both the node we want to intent lock, and the node we - * already have read locked - deadlock: - */ - if (type == SIX_LOCK_intent && - linked->nodes_locked != linked->nodes_intent_locked) { - linked->locks_want = max(linked->locks_want, - iter->locks_want); - return false; - } - - /* We have to lock btree nodes in key order: */ - if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) - return false; - - /* - * Interior nodes must be locked before their descendants: if - * another iterator has possible descendants locked of the node - * we're about to lock, it must have the ancestors locked too: - */ - if (linked->btree_id == iter->btree_id && - level > __fls(linked->nodes_locked)) { - linked->locks_want = max(linked->locks_want, - iter->locks_want); - return false; - } - } - - six_lock_type(&b->lock, type); - return true; -} - -/* Btree iterator locking: */ - - -static void btree_iter_drop_extra_locks(struct btree_iter *iter) -{ - unsigned l; - - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) > iter->locks_want) { - if (!btree_node_locked(iter, l)) - panic("l %u nodes_locked %u\n", l, iter->nodes_locked); - - if (l > iter->level) { - btree_node_unlock(iter, l); - } else if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->nodes[l]->lock); - iter->nodes_intent_locked ^= 1 << l; - } - } -} - -bool __bch_btree_iter_set_locks_want(struct btree_iter *iter, - unsigned new_locks_want) -{ - struct btree_iter *linked; - unsigned l; - - /* Drop locks we don't want anymore: */ - if (new_locks_want < iter->locks_want) - for_each_linked_btree_iter(iter, 
linked) - if (linked->locks_want > new_locks_want) { - linked->locks_want = max_t(unsigned, 1, - new_locks_want); - btree_iter_drop_extra_locks(linked); - } - - iter->locks_want = new_locks_want; - btree_iter_drop_extra_locks(iter); - - for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++) - if (!btree_node_relock(iter, l)) - goto fail; - - return true; -fail: - /* - * Just an optimization: ancestor nodes must be locked before child - * nodes, so set locks_want on iterators that might lock ancestors - * before us to avoid getting -EINTR later: - */ - for_each_linked_btree_iter(iter, linked) - if (linked->btree_id == iter->btree_id && - btree_iter_cmp(linked, iter) <= 0) - linked->locks_want = max_t(unsigned, linked->locks_want, - new_locks_want); - return false; -} - -static int __bch_btree_iter_unlock(struct btree_iter *iter) -{ - BUG_ON(iter->error == -EINTR); - - while (iter->nodes_locked) - btree_node_unlock(iter, __ffs(iter->nodes_locked)); - - return iter->error; -} - -int bch_btree_iter_unlock(struct btree_iter *iter) -{ - struct btree_iter *linked; - - for_each_linked_btree_iter(iter, linked) - __bch_btree_iter_unlock(linked); - return __bch_btree_iter_unlock(iter); -} - -/* Btree iterator: */ - -#ifdef CONFIG_BCACHE_DEBUG - -static void __bch_btree_iter_verify(struct btree_iter *iter, - struct btree *b) -{ - struct btree_node_iter *node_iter = &iter->node_iters[b->level]; - struct btree_node_iter tmp = *node_iter; - struct bkey_packed *k; - - bch_btree_node_iter_verify(node_iter, b); - - /* - * For interior nodes, the iterator will have skipped past - * deleted keys: - */ - k = b->level - ? bch_btree_node_iter_prev(&tmp, b) - : bch_btree_node_iter_prev_all(&tmp, b); - if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->is_extents)) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); - - bch_bkey_to_text(buf, sizeof(buf), &uk); - panic("prev key should be before after pos:\n%s\n%llu:%llu\n", - buf, iter->pos.inode, iter->pos.offset); - } - - k = bch_btree_node_iter_peek_all(node_iter, b); - if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->is_extents)) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); - - bch_bkey_to_text(buf, sizeof(buf), &uk); - panic("next key should be before iter pos:\n%llu:%llu\n%s\n", - iter->pos.inode, iter->pos.offset, buf); - } -} - -void bch_btree_iter_verify(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - - if (iter->nodes[b->level] == b) - __bch_btree_iter_verify(iter, b); - - for_each_linked_btree_node(iter, b, linked) - __bch_btree_iter_verify(iter, b); -} - -#endif - -static void __bch_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - const struct bkey_packed *end = btree_bkey_last(b, t); - struct btree_node_iter_set *set; - unsigned offset = __btree_node_key_to_offset(b, where); - int shift = new_u64s - clobber_u64s; - unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift; - - btree_node_iter_for_each(node_iter, set) - if (set->end == old_end) - goto found; - - /* didn't find the bset in the iterator - might have to readd it: */ - if (new_u64s && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->is_extents)) - bch_btree_node_iter_push(node_iter, b, where, end); - return; -found: - set->end = (int) set->end + shift; - - /* Iterator hasn't gotten to the key that changed yet: */ - if (set->k < 
offset) - return; - - if (new_u64s && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->is_extents)) { - set->k = offset; - bch_btree_node_iter_sort(node_iter, b); - } else if (set->k < offset + clobber_u64s) { - set->k = offset + new_u64s; - if (set->k == set->end) - *set = node_iter->data[--node_iter->used]; - bch_btree_node_iter_sort(node_iter, b); - } else { - set->k = (int) set->k + shift; - } - - /* - * Interior nodes are special because iterators for interior nodes don't - * obey the usual invariants regarding the iterator position: - * - * We may have whiteouts that compare greater than the iterator - * position, and logically should be in the iterator, but that we - * skipped past to find the first live key greater than the iterator - * position. This becomes an issue when we insert a new key that is - * greater than the current iterator position, but smaller than the - * whiteouts we've already skipped past - this happens in the course of - * a btree split. - * - * We have to rewind the iterator past to before those whiteouts here, - * else bkey_node_iter_prev() is not going to work and who knows what - * else would happen. And we have to do it manually, because here we've - * already done the insert and the iterator is currently inconsistent: - * - * We've got multiple competing invariants, here - we have to be careful - * about rewinding iterators for interior nodes, because they should - * always point to the key for the child node the btree iterator points - * to. - */ - if (b->level && new_u64s && !bkey_deleted(where) && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->is_extents)) { - struct bset_tree *t; - struct bkey_packed *k; - - for_each_bset(b, t) { - if (bch_bkey_to_bset(b, where) == t) - continue; - - k = bkey_prev_all(b, t, - bch_btree_node_iter_bset_pos(node_iter, b, t)); - if (k && - __btree_node_iter_cmp(node_iter, b, - k, where) > 0) { - struct btree_node_iter_set *set; - unsigned offset = - __btree_node_key_to_offset(b, bkey_next(k)); - - btree_node_iter_for_each(node_iter, set) - if (set->k == offset) { - set->k = __btree_node_key_to_offset(b, k); - bch_btree_node_iter_sort(node_iter, b); - goto next_bset; - } - - bch_btree_node_iter_push(node_iter, b, k, - btree_bkey_last(b, t)); - } -next_bset: - t = t; - } - } -} - -void bch_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - struct btree_iter *linked; - - if (node_iter != &iter->node_iters[b->level]) - __bch_btree_node_iter_fix(iter, b, node_iter, t, - where, clobber_u64s, new_u64s); - - if (iter->nodes[b->level] == b) - __bch_btree_node_iter_fix(iter, b, - &iter->node_iters[b->level], t, - where, clobber_u64s, new_u64s); - - for_each_linked_btree_node(iter, b, linked) - __bch_btree_node_iter_fix(linked, b, - &linked->node_iters[b->level], t, - where, clobber_u64s, new_u64s); - - /* interior node iterators are... special... 
*/ - if (!b->level) - bch_btree_iter_verify(iter, b); -} - -/* peek_all() doesn't skip deleted keys */ -static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter) -{ - struct btree *b = iter->nodes[iter->level]; - struct bkey_packed *k = - bch_btree_node_iter_peek_all(&iter->node_iters[iter->level], b); - struct bkey_s_c ret; - - EBUG_ON(!btree_node_locked(iter, iter->level)); - - if (!k) - return bkey_s_c_null; - - ret = bkey_disassemble(b, k, &iter->k); - - if (debug_check_bkeys(iter->c)) - bkey_debugcheck(iter->c, b, ret); - - return ret; -} - -static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter) -{ - struct btree *b = iter->nodes[iter->level]; - struct bkey_packed *k = - bch_btree_node_iter_peek(&iter->node_iters[iter->level], b); - struct bkey_s_c ret; - - EBUG_ON(!btree_node_locked(iter, iter->level)); - - if (!k) - return bkey_s_c_null; - - ret = bkey_disassemble(b, k, &iter->k); - - if (debug_check_bkeys(iter->c)) - bkey_debugcheck(iter->c, b, ret); - - return ret; -} - -static inline void __btree_iter_advance(struct btree_iter *iter) -{ - bch_btree_node_iter_advance(&iter->node_iters[iter->level], - iter->nodes[iter->level]); -} - -/* - * Verify that iterator for parent node points to child node: - */ -static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) -{ - bool parent_locked; - struct bkey_packed *k; - - if (!IS_ENABLED(CONFIG_BCACHE_DEBUG) || - !iter->nodes[b->level + 1]) - return; - - parent_locked = btree_node_locked(iter, b->level + 1); - - if (!btree_node_relock(iter, b->level + 1)) - return; - - k = bch_btree_node_iter_peek_all(&iter->node_iters[b->level + 1], - iter->nodes[b->level + 1]); - if (!k || - bkey_deleted(k) || - bkey_cmp_left_packed(iter->nodes[b->level + 1], - k, &b->key.k.p)) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); - - bch_bkey_to_text(buf, sizeof(buf), &uk); - panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", - buf, b->key.k.p.inode, b->key.k.p.offset); - } - - if (!parent_locked) - btree_node_unlock(iter, b->level + 1); -} - -static inline void __btree_iter_init(struct btree_iter *iter, - struct btree *b) -{ - bch_btree_node_iter_init(&iter->node_iters[b->level], b, - iter->pos, iter->is_extents, - btree_node_is_extents(b)); - - /* Skip to first non whiteout: */ - if (b->level) - bch_btree_node_iter_peek(&iter->node_iters[b->level], b); -} - -static inline bool btree_iter_pos_in_node(struct btree_iter *iter, - struct btree *b) -{ - return iter->btree_id == b->btree_id && - bkey_cmp(iter->pos, b->data->min_key) >= 0 && - btree_iter_pos_cmp(iter->pos, &b->key.k, iter->is_extents); -} - -static inline void btree_iter_node_set(struct btree_iter *iter, - struct btree *b) -{ - btree_iter_verify_new_node(iter, b); - - EBUG_ON(!btree_iter_pos_in_node(iter, b)); - EBUG_ON(b->lock.state.seq & 1); - - iter->lock_seq[b->level] = b->lock.state.seq; - iter->nodes[b->level] = b; - __btree_iter_init(iter, b); -} - -/* - * A btree node is being replaced - update the iterator to point to the new - * node: - */ -bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - - for_each_linked_btree_iter(iter, linked) - if (btree_iter_pos_in_node(linked, b)) { - /* - * bch_btree_iter_node_drop() has already been called - - * the old node we're replacing has already been - * unlocked and the pointer invalidated - */ - BUG_ON(btree_node_locked(linked, b->level)); - - /* - * If @linked wants this node read locked, we don't want - * to 
actually take the read lock now because it's not - * legal to hold read locks on other nodes while we take - * write locks, so the journal can make forward - * progress... - * - * Instead, btree_iter_node_set() sets things up so - * btree_node_relock() will succeed: - */ - - if (btree_want_intent(linked, b->level)) { - six_lock_increment(&b->lock, SIX_LOCK_intent); - mark_btree_node_intent_locked(linked, b->level); - } - - btree_iter_node_set(linked, b); - } - - if (!btree_iter_pos_in_node(iter, b)) { - six_unlock_intent(&b->lock); - return false; - } - - mark_btree_node_intent_locked(iter, b->level); - btree_iter_node_set(iter, b); - return true; -} - -void bch_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - unsigned level = b->level; - - for_each_linked_btree_iter(iter, linked) - if (linked->nodes[level] == b) { - btree_node_unlock(linked, level); - linked->nodes[level] = BTREE_ITER_NOT_END; - } -} - -void bch_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) -{ - unsigned level = b->level; - - if (iter->nodes[level] == b) { - BUG_ON(b->lock.state.intent_lock != 1); - btree_node_unlock(iter, level); - iter->nodes[level] = BTREE_ITER_NOT_END; - } -} - -/* - * A btree node has been modified in such a way as to invalidate iterators - fix - * them: - */ -void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - - for_each_linked_btree_node(iter, b, linked) - __btree_iter_init(linked, b); - __btree_iter_init(iter, b); -} - -static inline int btree_iter_lock_root(struct btree_iter *iter, - unsigned depth_want) -{ - struct bch_fs *c = iter->c; - struct btree *b; - enum six_lock_type lock_type; - unsigned i; - - EBUG_ON(iter->nodes_locked); - - while (1) { - b = READ_ONCE(c->btree_roots[iter->btree_id].b); - iter->level = READ_ONCE(b->level); - - if (unlikely(iter->level < depth_want)) { - /* - * the root is at a lower depth than the depth we want: - * got to the end of the btree, or we're walking nodes - * greater than some depth and there are no nodes >= - * that depth - */ - iter->level = depth_want; - iter->nodes[iter->level] = NULL; - return 0; - } - - lock_type = btree_lock_want(iter, iter->level); - if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, - iter, lock_type))) - return -EINTR; - - if (likely(b == c->btree_roots[iter->btree_id].b && - b->level == iter->level && - !race_fault())) { - for (i = 0; i < iter->level; i++) - iter->nodes[i] = BTREE_ITER_NOT_END; - iter->nodes[iter->level] = b; - - mark_btree_node_locked(iter, iter->level, lock_type); - btree_iter_node_set(iter, b); - return 0; - - } - - six_unlock_type(&b->lock, lock_type); - } -} - -static inline int btree_iter_down(struct btree_iter *iter) -{ - struct btree *b; - struct bkey_s_c k = __btree_iter_peek(iter); - unsigned level = iter->level - 1; - enum six_lock_type lock_type = btree_lock_want(iter, level); - BKEY_PADDED(k) tmp; - - bkey_reassemble(&tmp.k, k); - - b = bch_btree_node_get(iter, &tmp.k, level, lock_type); - if (unlikely(IS_ERR(b))) - return PTR_ERR(b); - - iter->level = level; - mark_btree_node_locked(iter, level, lock_type); - btree_iter_node_set(iter, b); - return 0; -} - -static void btree_iter_up(struct btree_iter *iter) -{ - btree_node_unlock(iter, iter->level++); -} - -int __must_check __bch_btree_iter_traverse(struct btree_iter *); - -static int btree_iter_traverse_error(struct btree_iter *iter, int ret) -{ - struct bch_fs *c = iter->c; - struct btree_iter *linked, *sorted_iters, 
**i; -retry_all: - bch_btree_iter_unlock(iter); - - if (ret != -ENOMEM && ret != -EINTR) - goto io_error; - - if (ret == -ENOMEM) { - struct closure cl; - - closure_init_stack(&cl); - - do { - ret = mca_cannibalize_lock(c, &cl); - closure_sync(&cl); - } while (ret); - } - - /* - * Linked iters are normally a circular singly linked list - break cycle - * while we sort them: - */ - linked = iter->next; - iter->next = NULL; - sorted_iters = NULL; - - while (linked) { - iter = linked; - linked = linked->next; - - i = &sorted_iters; - while (*i && btree_iter_cmp(iter, *i) > 0) - i = &(*i)->next; - - iter->next = *i; - *i = iter; - } - - /* Make list circular again: */ - iter = sorted_iters; - while (iter->next) - iter = iter->next; - iter->next = sorted_iters; - - /* Now, redo traversals in correct order: */ - - iter = sorted_iters; - do { -retry: - ret = __bch_btree_iter_traverse(iter); - if (unlikely(ret)) { - if (ret == -EINTR) - goto retry; - goto retry_all; - } - - iter = iter->next; - } while (iter != sorted_iters); - - ret = btree_iter_linked(iter) ? -EINTR : 0; -out: - mca_cannibalize_unlock(c); - return ret; -io_error: - BUG_ON(ret != -EIO); - - iter->error = ret; - iter->nodes[iter->level] = NULL; - goto out; -} - -/* - * This is the main state machine for walking down the btree - walks down to a - * specified depth - * - * Returns 0 on success, -EIO on error (error reading in a btree node). - * - * On error, caller (peek_node()/peek_key()) must return NULL; the error is - * stashed in the iterator and returned from bch_btree_iter_unlock(). - */ -int __must_check __bch_btree_iter_traverse(struct btree_iter *iter) -{ - unsigned depth_want = iter->level; - - /* make sure we have all the intent locks we need - ugh */ - if (unlikely(iter->nodes[iter->level] && - iter->level + 1 < iter->locks_want)) { - unsigned i; - - for (i = iter->level + 1; - i < iter->locks_want && iter->nodes[i]; - i++) - if (!btree_node_relock(iter, i)) { - while (iter->nodes[iter->level] && - iter->level + 1 < iter->locks_want) - btree_iter_up(iter); - break; - } - } - - /* - * If the current node isn't locked, go up until we have a locked node - * or run out of nodes: - */ - while (iter->nodes[iter->level] && - !(is_btree_node(iter, iter->level) && - btree_node_relock(iter, iter->level) && - btree_iter_pos_cmp(iter->pos, - &iter->nodes[iter->level]->key.k, - iter->is_extents))) - btree_iter_up(iter); - - /* - * If we've got a btree node locked (i.e. we aren't about to relock the - * root) - advance its node iterator if necessary: - */ - if (iter->nodes[iter->level]) { - struct bkey_s_c k; - - while ((k = __btree_iter_peek_all(iter)).k && - !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents)) - __btree_iter_advance(iter); - } - - /* - * Note: iter->nodes[iter->level] may be temporarily NULL here - that - * would indicate to other code that we got to the end of the btree, - * here it indicates that relocking the root failed - it's critical that - * btree_iter_lock_root() comes next and that it can't fail - */ - while (iter->level > depth_want) { - int ret = iter->nodes[iter->level] - ? 
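btree_iter_traverse_error() above recovers from lock failures by breaking the circular list of linked iterators, insertion-sorting them by btree_iter_cmp(), closing the cycle again, and then redoing the traversals in that order so locks are always taken consistently. A minimal sketch of just the list manipulation, on a hypothetical demo node type (sort_key stands in for the (btree_id, pos) comparison):

    struct demo_iter {
    	int		  sort_key;
    	struct demo_iter *next;
    };

    /* Takes any member of the ring; returns the new (sorted) head. */
    static struct demo_iter *demo_sort_ring(struct demo_iter *iter)
    {
    	struct demo_iter *linked = iter->next, *sorted = NULL, **i;

    	iter->next = NULL;			/* break the cycle */

    	while (linked) {
    		struct demo_iter *cur = linked;

    		linked = linked->next;

    		i = &sorted;			/* insertion sort */
    		while (*i && cur->sort_key > (*i)->sort_key)
    			i = &(*i)->next;
    		cur->next = *i;
    		*i = cur;
    	}

    	for (iter = sorted; iter->next; iter = iter->next)
    		;
    	iter->next = sorted;			/* make it circular again */

    	return sorted;
    }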
btree_iter_down(iter) - : btree_iter_lock_root(iter, depth_want); - if (unlikely(ret)) { - iter->level = depth_want; - return ret; - } - } - - return 0; -} - -int __must_check bch_btree_iter_traverse(struct btree_iter *iter) -{ - int ret; - - if (unlikely(!iter->nodes[iter->level])) - return 0; - - iter->at_end_of_leaf = false; - - ret = __bch_btree_iter_traverse(iter); - if (unlikely(ret)) - ret = btree_iter_traverse_error(iter, ret); - - return ret; -} - -/* Iterate across nodes (leaf and interior nodes) */ - -struct btree *bch_btree_iter_peek_node(struct btree_iter *iter) -{ - struct btree *b; - int ret; - - EBUG_ON(iter->is_extents); - - ret = bch_btree_iter_traverse(iter); - if (ret) - return NULL; - - b = iter->nodes[iter->level]; - - if (b) { - EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = b->key.k.p; - } - - return b; -} - -struct btree *bch_btree_iter_next_node(struct btree_iter *iter, unsigned depth) -{ - struct btree *b; - int ret; - - EBUG_ON(iter->is_extents); - - btree_iter_up(iter); - - if (!iter->nodes[iter->level]) - return NULL; - - /* parent node usually won't be locked: redo traversal if necessary */ - ret = bch_btree_iter_traverse(iter); - if (ret) - return NULL; - - b = iter->nodes[iter->level]; - if (!b) - return b; - - if (bkey_cmp(iter->pos, b->key.k.p) < 0) { - /* Haven't gotten to the end of the parent node: */ - - /* ick: */ - iter->pos = iter->btree_id == BTREE_ID_INODES - ? btree_type_successor(iter->btree_id, iter->pos) - : bkey_successor(iter->pos); - iter->level = depth; - - ret = bch_btree_iter_traverse(iter); - if (ret) - return NULL; - - b = iter->nodes[iter->level]; - } - - iter->pos = b->key.k.p; - - return b; -} - -/* Iterate across keys (in leaf nodes only) */ - -void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) -{ - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - struct bkey_packed *k; - - EBUG_ON(iter->level != 0); - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); - EBUG_ON(!btree_node_locked(iter, 0)); - EBUG_ON(bkey_cmp(new_pos, b->key.k.p) > 0); - - while ((k = bch_btree_node_iter_peek_all(node_iter, b)) && - !btree_iter_pos_cmp_packed(b, &new_pos, k, - iter->is_extents)) - bch_btree_node_iter_advance(node_iter, b); - - if (!k && - !btree_iter_pos_cmp(new_pos, &b->key.k, iter->is_extents)) - iter->at_end_of_leaf = true; - - iter->pos = new_pos; -} - -void bch_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -{ - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */ - iter->pos = new_pos; -} - -void bch_btree_iter_advance_pos(struct btree_iter *iter) -{ - /* - * We use iter->k instead of iter->pos for extents: iter->pos will be - * equal to the start of the extent we returned, but we need to advance - * to the end of the extent we returned. 
- */ - bch_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); -} - -/* XXX: expensive */ -void bch_btree_iter_rewind(struct btree_iter *iter, struct bpos pos) -{ - /* incapable of rewinding across nodes: */ - BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0); - - iter->pos = pos; - __btree_iter_init(iter, iter->nodes[iter->level]); -} - -struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter) -{ - struct bkey_s_c k; - int ret; - - while (1) { - ret = bch_btree_iter_traverse(iter); - if (unlikely(ret)) { - iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); - return bkey_s_c_err(ret); - } - - k = __btree_iter_peek(iter); - if (likely(k.k)) { - /* - * iter->pos should always be equal to the key we just - * returned - except extents can straddle iter->pos: - */ - if (!iter->is_extents || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - bch_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - return k; - } - - iter->pos = iter->nodes[0]->key.k.p; - - if (!bkey_cmp(iter->pos, POS_MAX)) { - iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); - bch_btree_iter_unlock(iter); - return bkey_s_c_null; - } - - iter->pos = btree_type_successor(iter->btree_id, iter->pos); - } -} - -struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter) -{ - struct bkey_s_c k; - struct bkey n; - int ret; - - while (1) { - ret = bch_btree_iter_traverse(iter); - if (unlikely(ret)) { - iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); - return bkey_s_c_err(ret); - } - - k = __btree_iter_peek_all(iter); -recheck: - if (!k.k || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) { - /* hole */ - bkey_init(&n); - n.p = iter->pos; - - if (iter->is_extents) { - if (n.p.offset == KEY_OFFSET_MAX) { - iter->pos = bkey_successor(iter->pos); - goto recheck; - } - - if (!k.k) - k.k = &iter->nodes[0]->key.k; - - bch_key_resize(&n, - min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? 
bkey_start_offset(k.k) - : KEY_OFFSET_MAX) - - n.p.offset)); - - EBUG_ON(!n.size); - } - - iter->k = n; - return (struct bkey_s_c) { &iter->k, NULL }; - } else if (!bkey_deleted(k.k)) { - return k; - } else { - __btree_iter_advance(iter); - } - } -} - -void __bch_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned depth) -{ - iter->level = depth; - /* bch_bkey_ops isn't used much, this would be a cache miss */ - /* iter->is_extents = bch_bkey_ops[btree_id]->is_extents; */ - iter->is_extents = btree_id == BTREE_ID_EXTENTS; - iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - iter->locks_want = min(locks_want, BTREE_MAX_DEPTH); - iter->btree_id = btree_id; - iter->at_end_of_leaf = 0; - iter->error = 0; - iter->c = c; - iter->pos = pos; - memset(iter->nodes, 0, sizeof(iter->nodes)); - iter->nodes[iter->level] = BTREE_ITER_NOT_END; - iter->next = iter; - - prefetch(c->btree_roots[btree_id].b); -} - -void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) -{ - BUG_ON(btree_iter_linked(new)); - - new->next = iter->next; - iter->next = new; - - if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { - unsigned nr_iters = 1; - - for_each_linked_btree_iter(iter, new) - nr_iters++; - - BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE); - } -} - -void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) -{ - bch_btree_iter_unlock(dst); - memcpy(dst, src, offsetof(struct btree_iter, next)); - dst->nodes_locked = dst->nodes_intent_locked = 0; -} diff --git a/libbcache/btree_iter.h b/libbcache/btree_iter.h deleted file mode 100644 index acca2c68..00000000 --- a/libbcache/btree_iter.h +++ /dev/null @@ -1,282 +0,0 @@ -#ifndef _BCACHE_BTREE_ITER_H -#define _BCACHE_BTREE_ITER_H - -#include "btree_types.h" - -struct btree_iter { - /* Current btree depth */ - u8 level; - - /* - * Used in bch_btree_iter_traverse(), to indicate whether we're - * searching for @pos or the first key strictly greater than @pos - */ - u8 is_extents; - - /* Bitmasks for read/intent locks held per level */ - u8 nodes_locked; - u8 nodes_intent_locked; - - /* Btree level below which we start taking intent locks */ - u8 locks_want; - - enum btree_id btree_id:8; - - /* - * indicates we need to call bch_btree_iter_traverse() to revalidate - * iterator: - */ - u8 at_end_of_leaf; - - s8 error; - - struct bch_fs *c; - - /* Current position of the iterator */ - struct bpos pos; - - u32 lock_seq[BTREE_MAX_DEPTH]; - - /* - * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root(). - * - * This is because iter->nodes[iter->level] == NULL is how - * btree_iter_next_node() knows that it's finished with a depth first - * traversal. Just unlocking a node (with btree_node_unlock()) is fine, - * and if you really don't want that node used again (e.g. btree_split() - * freed it) decrementing lock_seq will cause btree_node_relock() to - * always fail (but since freeing a btree node takes a write lock on the - * node, which increments the node's lock seq, that's not actually - * necessary in that example). - * - * One extra slot for a sentinel NULL: - */ - struct btree *nodes[BTREE_MAX_DEPTH + 1]; - struct btree_node_iter node_iters[BTREE_MAX_DEPTH]; - - /* - * Current unpacked key - so that bch_btree_iter_next()/ - * bch_btree_iter_next_with_holes() can correctly advance pos. - */ - struct bkey k; - - /* - * Circular linked list of linked iterators: linked iterators share - * locks (e.g. 
two linked iterators may have the same node intent - * locked, or read and write locked, at the same time), and insertions - * through one iterator won't invalidate the other linked iterators. - */ - - /* Must come last: */ - struct btree_iter *next; -}; - -static inline bool btree_iter_linked(const struct btree_iter *iter) -{ - return iter->next != iter; -} - -/** - * for_each_linked_btree_iter - iterate over all iterators linked with @_iter - */ -#define for_each_linked_btree_iter(_iter, _linked) \ - for ((_linked) = (_iter)->next; \ - (_linked) != (_iter); \ - (_linked) = (_linked)->next) - -static inline struct btree_iter * -__next_linked_btree_node(struct btree_iter *iter, struct btree *b, - struct btree_iter *linked) -{ - do { - linked = linked->next; - - if (linked == iter) - return NULL; - - /* - * We don't compare the low bits of the lock sequence numbers - * because @iter might have taken a write lock on @b, and we - * don't want to skip the linked iterator if the sequence - * numbers were equal before taking that write lock. The lock - * sequence number is incremented by taking and releasing write - * locks and is even when unlocked: - */ - } while (linked->nodes[b->level] != b || - linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1); - - return linked; -} - -/** - * for_each_linked_btree_node - iterate over all iterators linked with @_iter - * that also point to @_b - * - * @_b is assumed to be locked by @_iter - * - * Filters out iterators that don't have a valid btree_node iterator for @_b - - * i.e. iterators for which btree_node_relock() would not succeed. - */ -#define for_each_linked_btree_node(_iter, _b, _linked) \ - for ((_linked) = (_iter); \ - ((_linked) = __next_linked_btree_node(_iter, _b, _linked));) - -#ifdef CONFIG_BCACHE_DEBUG -void bch_btree_iter_verify(struct btree_iter *, struct btree *); -#else -static inline void bch_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} -#endif - -void bch_btree_node_iter_fix(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bset_tree *, - struct bkey_packed *, unsigned, unsigned); - -int bch_btree_iter_unlock(struct btree_iter *); -bool __bch_btree_iter_set_locks_want(struct btree_iter *, unsigned); - -static inline bool bch_btree_iter_set_locks_want(struct btree_iter *iter, - unsigned new_locks_want) -{ - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - - if (iter->locks_want == new_locks_want && - iter->nodes_intent_locked == (1 << new_locks_want) - 1) - return true; - - return __bch_btree_iter_set_locks_want(iter, new_locks_want); -} - -bool bch_btree_iter_node_replace(struct btree_iter *, struct btree *); -void bch_btree_iter_node_drop_linked(struct btree_iter *, struct btree *); -void bch_btree_iter_node_drop(struct btree_iter *, struct btree *); - -void bch_btree_iter_reinit_node(struct btree_iter *, struct btree *); - -int __must_check bch_btree_iter_traverse(struct btree_iter *); - -struct btree *bch_btree_iter_peek_node(struct btree_iter *); -struct btree *bch_btree_iter_next_node(struct btree_iter *, unsigned); - -struct bkey_s_c bch_btree_iter_peek(struct btree_iter *); -struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *); -void bch_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); -void bch_btree_iter_set_pos(struct btree_iter *, struct bpos); -void bch_btree_iter_advance_pos(struct btree_iter *); -void bch_btree_iter_rewind(struct btree_iter *, struct bpos); - -void __bch_btree_iter_init(struct btree_iter *, struct bch_fs *, - enum 
btree_id, struct bpos, unsigned , unsigned); - -static inline void bch_btree_iter_init(struct btree_iter *iter, - struct bch_fs *c, - enum btree_id btree_id, - struct bpos pos) -{ - __bch_btree_iter_init(iter, c, btree_id, pos, 0, 0); -} - -static inline void bch_btree_iter_init_intent(struct btree_iter *iter, - struct bch_fs *c, - enum btree_id btree_id, - struct bpos pos) -{ - __bch_btree_iter_init(iter, c, btree_id, pos, 1, 0); -} - -void bch_btree_iter_link(struct btree_iter *, struct btree_iter *); -void bch_btree_iter_copy(struct btree_iter *, struct btree_iter *); - -static inline struct bpos btree_type_successor(enum btree_id id, - struct bpos pos) -{ - if (id == BTREE_ID_INODES) { - pos.inode++; - pos.offset = 0; - } else if (id != BTREE_ID_EXTENTS) { - pos = bkey_successor(pos); - } - - return pos; -} - -static inline int __btree_iter_cmp(enum btree_id id, - struct bpos pos, - const struct btree_iter *r) -{ - if (id != r->btree_id) - return id < r->btree_id ? -1 : 1; - return bkey_cmp(pos, r->pos); -} - -static inline int btree_iter_cmp(const struct btree_iter *l, - const struct btree_iter *r) -{ - return __btree_iter_cmp(l->btree_id, l->pos, r); -} - -#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \ - _b, _locks_want) \ - for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \ - _start, _locks_want, _depth), \ - (_iter)->is_extents = false, \ - _b = bch_btree_iter_peek_node(_iter); \ - (_b); \ - (_b) = bch_btree_iter_next_node(_iter, _depth)) - -#define for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b) \ - __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b, 0) - -#define __for_each_btree_key(_iter, _c, _btree_id, _start, \ - _k, _locks_want) \ - for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \ - _start, _locks_want, 0); \ - !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek(_iter)).k); \ - bch_btree_iter_advance_pos(_iter)) - -#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \ - __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 0) - -#define for_each_btree_key_intent(_iter, _c, _btree_id, _start, _k) \ - __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 1) - -#define __for_each_btree_key_with_holes(_iter, _c, _btree_id, \ - _start, _k, _locks_want) \ - for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \ - _start, _locks_want, 0); \ - !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek_with_holes(_iter)).k);\ - bch_btree_iter_advance_pos(_iter)) - -#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \ - __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 0) - -#define for_each_btree_key_with_holes_intent(_iter, _c, _btree_id, \ - _start, _k) \ - __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1) - -static inline int btree_iter_err(struct bkey_s_c k) -{ - return IS_ERR(k.k) ? 
PTR_ERR(k.k) : 0; -} - -/* - * Unlocks before scheduling - * Note: does not revalidate iterator - */ -static inline void bch_btree_iter_cond_resched(struct btree_iter *iter) -{ - struct btree_iter *linked; - - if (need_resched()) { - for_each_linked_btree_iter(iter, linked) - bch_btree_iter_unlock(linked); - bch_btree_iter_unlock(iter); - schedule(); - } else if (race_fault()) { - for_each_linked_btree_iter(iter, linked) - bch_btree_iter_unlock(linked); - bch_btree_iter_unlock(iter); - } -} - -#endif /* _BCACHE_BTREE_ITER_H */ diff --git a/libbcache/btree_locking.h b/libbcache/btree_locking.h deleted file mode 100644 index 76f85c0d..00000000 --- a/libbcache/btree_locking.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef _BCACHE_BTREE_LOCKING_H -#define _BCACHE_BTREE_LOCKING_H - -/* - * Only for internal btree use: - * - * The btree iterator tracks what locks it wants to take, and what locks it - * currently has - here we have wrappers for locking/unlocking btree nodes and - * updating the iterator state - */ - -#include "btree_iter.h" -#include "six.h" - -/* matches six lock types */ -enum btree_node_locked_type { - BTREE_NODE_UNLOCKED = -1, - BTREE_NODE_READ_LOCKED = SIX_LOCK_read, - BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, -}; - -static inline int btree_node_locked_type(struct btree_iter *iter, - unsigned level) -{ - /* - * We're relying on the fact that if nodes_intent_locked is set - * nodes_locked must be set as well, so that we can compute without - * branches: - */ - return BTREE_NODE_UNLOCKED + - ((iter->nodes_locked >> level) & 1) + - ((iter->nodes_intent_locked >> level) & 1); -} - -static inline bool btree_node_intent_locked(struct btree_iter *iter, - unsigned level) -{ - return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; -} - -static inline bool btree_node_read_locked(struct btree_iter *iter, - unsigned level) -{ - return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; -} - -static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) -{ - return iter->nodes_locked & (1 << level); -} - -static inline void mark_btree_node_unlocked(struct btree_iter *iter, - unsigned level) -{ - iter->nodes_locked &= ~(1 << level); - iter->nodes_intent_locked &= ~(1 << level); -} - -static inline void mark_btree_node_locked(struct btree_iter *iter, - unsigned level, - enum six_lock_type type) -{ - /* relying on this to avoid a branch */ - BUILD_BUG_ON(SIX_LOCK_read != 0); - BUILD_BUG_ON(SIX_LOCK_intent != 1); - - iter->nodes_locked |= 1 << level; - iter->nodes_intent_locked |= type << level; -} - -static inline void mark_btree_node_intent_locked(struct btree_iter *iter, - unsigned level) -{ - mark_btree_node_locked(iter, level, SIX_LOCK_intent); -} - -static inline enum six_lock_type -btree_lock_want(struct btree_iter *iter, int level) -{ - return level < iter->locks_want - ? 
SIX_LOCK_intent - : SIX_LOCK_read; -} - -static inline bool btree_want_intent(struct btree_iter *iter, int level) -{ - return btree_lock_want(iter, level) == SIX_LOCK_intent; -} - -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -{ - int lock_type = btree_node_locked_type(iter, level); - - if (lock_type != BTREE_NODE_UNLOCKED) - six_unlock_type(&iter->nodes[level]->lock, lock_type); - mark_btree_node_unlocked(iter, level); -} - -bool __bch_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type); - -static inline bool btree_node_lock(struct btree *b, struct bpos pos, - unsigned level, - struct btree_iter *iter, - enum six_lock_type type) -{ - return likely(six_trylock_type(&b->lock, type)) || - __bch_btree_node_lock(b, pos, level, iter, type); -} - -bool btree_node_relock(struct btree_iter *, unsigned); - -void btree_node_unlock_write(struct btree *, struct btree_iter *); -void btree_node_lock_write(struct btree *, struct btree_iter *); - -void __btree_node_unlock_write(struct btree *, struct btree_iter *); -void __btree_node_lock_write(struct btree *, struct btree_iter *); - -#endif /* _BCACHE_BTREE_LOCKING_H */ diff --git a/libbcache/btree_types.h b/libbcache/btree_types.h deleted file mode 100644 index cfca12ea..00000000 --- a/libbcache/btree_types.h +++ /dev/null @@ -1,311 +0,0 @@ -#ifndef _BCACHE_BTREE_TYPES_H -#define _BCACHE_BTREE_TYPES_H - -#include <linux/bcache.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/rhashtable.h> -#include <linux/semaphore.h> -#include <linux/workqueue.h> - -#include "bkey_methods.h" -#include "journal_types.h" -#include "six.h" - -struct open_bucket; -struct btree_interior_update; - -#define MAX_BSETS 3U - -struct btree_nr_keys { - - /* - * Amount of live metadata (i.e. size of node after a compaction) in - * units of u64s - */ - u16 live_u64s; - u16 bset_u64s[MAX_BSETS]; - - /* live keys only: */ - u16 packed_keys; - u16 unpacked_keys; -}; - -struct bset_tree { - /* - * We construct a binary tree in an array as if the array - * started at 1, so that things line up on the same cachelines - * better: see comments in bset.c at cacheline_to_bkey() for - * details - */ - - /* size of the binary tree and prev array */ - u16 size; - - /* function of size - precalculated for to_inorder() */ - u16 extra; - - u16 data_offset; - u16 aux_data_offset; - u16 end_offset; - - struct bpos max_key; -}; - -struct btree_write { - struct journal_entry_pin journal; - struct closure_waitlist wait; -}; - -struct btree { - /* Hottest entries first */ - struct rhash_head hash; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - - struct six_lock lock; - - unsigned long flags; - u16 written; - u8 level; - u8 btree_id; - u8 nsets; - u8 nr_key_bits; - - struct bkey_format format; - - struct btree_node *data; - void *aux_data; - - /* - * Sets of sorted keys - the real btree node - plus a binary search tree - * - * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. Additionally, - * set[0]->data points to the entire btree node as it exists on disk. - */ - struct bset_tree set[MAX_BSETS]; - - struct btree_nr_keys nr; - u16 sib_u64s[2]; - u16 whiteout_u64s; - u16 uncompacted_whiteout_u64s; - u8 page_order; - u8 unpack_fn_len; - - /* - * XXX: add a delete sequence number, so when btree_node_relock() fails - * because the lock sequence number has changed - i.e. 
the contents were - * modified - we can still relock the node if it's still the one we - * want, without redoing the traversal - */ - - /* - * For asynchronous splits/interior node updates: - * When we do a split, we allocate new child nodes and update the parent - * node to point to them: we update the parent in memory immediately, - * but then we must wait until the children have been written out before - * the update to the parent can be written - this is a list of the - * btree_interior_updates that are blocking this node from being - * written: - */ - struct list_head write_blocked; - - struct open_bucket *ob; - - /* lru list */ - struct list_head list; - - struct btree_write writes[2]; - -#ifdef CONFIG_BCACHE_DEBUG - bool *expensive_debug_checks; -#endif -}; - -#define BTREE_FLAG(flag) \ -static inline bool btree_node_ ## flag(struct btree *b) \ -{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void set_btree_node_ ## flag(struct btree *b) \ -{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void clear_btree_node_ ## flag(struct btree *b) \ -{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } - -enum btree_flags { - BTREE_NODE_read_error, - BTREE_NODE_write_error, - BTREE_NODE_dirty, - BTREE_NODE_noevict, - BTREE_NODE_write_idx, - BTREE_NODE_accessed, - BTREE_NODE_write_in_flight, - BTREE_NODE_just_written, -}; - -BTREE_FLAG(read_error); -BTREE_FLAG(write_error); -BTREE_FLAG(dirty); -BTREE_FLAG(noevict); -BTREE_FLAG(write_idx); -BTREE_FLAG(accessed); -BTREE_FLAG(write_in_flight); -BTREE_FLAG(just_written); - -static inline struct btree_write *btree_current_write(struct btree *b) -{ - return b->writes + btree_node_write_idx(b); -} - -static inline struct btree_write *btree_prev_write(struct btree *b) -{ - return b->writes + (btree_node_write_idx(b) ^ 1); -} - -static inline struct bset_tree *bset_tree_last(struct btree *b) -{ - EBUG_ON(!b->nsets); - return b->set + b->nsets - 1; -} - -static inline struct bset *bset(const struct btree *b, - const struct bset_tree *t) -{ - return (void *) b->data + t->data_offset * sizeof(u64); -} - -static inline struct bset *btree_bset_first(struct btree *b) -{ - return bset(b, b->set); -} - -static inline struct bset *btree_bset_last(struct btree *b) -{ - return bset(b, bset_tree_last(b)); -} - -static inline u16 -__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -{ - size_t ret = (u64 *) k - (u64 *) b->data - 1; - - EBUG_ON(ret > U16_MAX); - return ret; -} - -static inline struct bkey_packed * -__btree_node_offset_to_key(const struct btree *b, u16 k) -{ - return (void *) ((u64 *) b->data + k + 1); -} - -#define btree_bkey_first(_b, _t) (bset(_b, _t)->start) - -#define btree_bkey_last(_b, _t) \ -({ \ - EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ - vstruct_last(bset(_b, _t))); \ - \ - __btree_node_offset_to_key(_b, (_t)->end_offset); \ -}) - -static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -{ - t->end_offset = - __btree_node_key_to_offset(b, vstruct_last(bset(b, t))); - btree_bkey_last(b, t); -} - -static inline void set_btree_bset(struct btree *b, struct bset_tree *t, - const struct bset *i) -{ - t->data_offset = (u64 *) i - (u64 *) b->data; - - EBUG_ON(bset(b, t) != i); - - set_btree_bset_end(b, t); -} - -static inline unsigned bset_byte_offset(struct btree *b, void *i) -{ - return i - (void *) b->data; -} - -/* Type of keys @b contains: */ -static inline enum bkey_type btree_node_type(struct btree *b) -{ - return b->level ? 
BKEY_TYPE_BTREE : b->btree_id; -} - -static inline const struct bkey_ops *btree_node_ops(struct btree *b) -{ - return bch_bkey_ops[btree_node_type(b)]; -} - -static inline bool btree_node_has_ptrs(struct btree *b) -{ - return btree_type_has_ptrs(btree_node_type(b)); -} - -static inline bool btree_node_is_extents(struct btree *b) -{ - return btree_node_type(b) == BKEY_TYPE_EXTENTS; -} - -struct btree_root { - struct btree *b; - - struct btree_interior_update *as; - - /* On disk root - see async splits: */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - u8 level; - u8 alive; -}; - -/* - * Optional hook that will be called just prior to a btree node update, when - * we're holding the write lock and we know what key is about to be overwritten: - */ - -struct btree_iter; -struct btree_node_iter; - -enum extent_insert_hook_ret { - BTREE_HOOK_DO_INSERT, - BTREE_HOOK_NO_INSERT, - BTREE_HOOK_RESTART_TRANS, -}; - -struct extent_insert_hook { - enum extent_insert_hook_ret - (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, - struct bkey_s_c, const struct bkey_i *); -}; - -enum btree_insert_ret { - BTREE_INSERT_OK, - /* extent spanned multiple leaf nodes: have to traverse to next node: */ - BTREE_INSERT_NEED_TRAVERSE, - /* write lock held for too long */ - BTREE_INSERT_NEED_RESCHED, - /* leaf node needs to be split */ - BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_JOURNAL_RES_FULL, - BTREE_INSERT_ENOSPC, - BTREE_INSERT_NEED_GC_LOCK, -}; - -enum btree_gc_coalesce_fail_reason { - BTREE_GC_COALESCE_FAIL_RESERVE_GET, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, - BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -}; - -typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, - struct btree *, - struct btree_node_iter *); - -#endif /* _BCACHE_BTREE_TYPES_H */ diff --git a/libbcache/btree_update.c b/libbcache/btree_update.c deleted file mode 100644 index 751a51c2..00000000 --- a/libbcache/btree_update.c +++ /dev/null @@ -1,2345 +0,0 @@ - -#include "bcache.h" -#include "alloc.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "extents.h" -#include "journal.h" -#include "keylist.h" -#include "super-io.h" - -#include <linux/random.h> -#include <linux/sort.h> -#include <trace/events/bcache.h> - -static void btree_interior_update_updated_root(struct bch_fs *, - struct btree_interior_update *, - enum btree_id); - -/* Calculate ideal packed bkey format for new btree nodes: */ - -void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b) -{ - struct bkey_packed *k; - struct bset_tree *t; - struct bkey uk; - - bch_bkey_format_add_pos(s, b->data->min_key); - - for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) - if (!bkey_whiteout(k)) { - uk = bkey_unpack_key(b, k); - bch_bkey_format_add_key(s, &uk); - } -} - -static struct bkey_format bch_btree_calc_format(struct btree *b) -{ - struct bkey_format_state s; - - bch_bkey_format_init(&s); - __bch_btree_calc_format(&s, b); - - return bch_bkey_format_done(&s); -} - -static size_t btree_node_u64s_with_format(struct btree *b, - struct bkey_format *new_f) -{ - struct bkey_format *old_f = &b->format; - - /* stupid integer promotion rules */ - ssize_t delta = - (((int) new_f->key_u64s - old_f->key_u64s) * - (int) b->nr.packed_keys) + - (((int) new_f->key_u64s - BKEY_U64s) * - (int) b->nr.unpacked_keys); - - BUG_ON(delta + 
b->nr.live_u64s < 0); - - return b->nr.live_u64s + delta; -} - -/** - * btree_node_format_fits - check if we could rewrite node with a new format - * - * This assumes all keys can pack with the new format -- it just checks if - * the re-packed keys would fit inside the node itself. - */ -bool bch_btree_node_format_fits(struct bch_fs *c, struct btree *b, - struct bkey_format *new_f) -{ - size_t u64s = btree_node_u64s_with_format(b, new_f); - - return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); -} - -/* Btree node freeing/allocation: */ - -/* - * We're doing the index update that makes @b unreachable, update stuff to - * reflect that: - * - * Must be called _before_ btree_interior_update_updated_root() or - * btree_interior_update_updated_btree: - */ -static void bch_btree_node_free_index(struct bch_fs *c, struct btree *b, - enum btree_id id, struct bkey_s_c k, - struct bch_fs_usage *stats) -{ - struct btree_interior_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&c->btree_interior_update_lock); - - for_each_pending_btree_node_free(c, as, d) - if (!bkey_cmp(k.k->p, d->key.k.p) && - bkey_val_bytes(k.k) == bkey_val_bytes(&d->key.k) && - !memcmp(k.v, &d->key.v, bkey_val_bytes(k.k))) - goto found; - - BUG(); -found: - d->index_update_done = true; - - /* - * Btree nodes are accounted as freed in bch_alloc_stats when they're - * freed from the index: - */ - stats->s[S_COMPRESSED][S_META] -= c->sb.btree_node_size; - stats->s[S_UNCOMPRESSED][S_META] -= c->sb.btree_node_size; - - /* - * We're dropping @k from the btree, but it's still live until the - * index update is persistent so we need to keep a reference around for - * mark and sweep to find - that's primarily what the - * btree_node_pending_free list is for. - * - * So here (when we set index_update_done = true), we're moving an - * existing reference to a different part of the larger "gc keyspace" - - * and the new position comes after the old position, since GC marks - * the pending free list after it walks the btree. - * - * If we move the reference while mark and sweep is _between_ the old - * and the new position, mark and sweep will see the reference twice - * and it'll get double accounted - so check for that here and subtract - * to cancel out one of mark and sweep's markings if necessary: - */ - - /* - * bch_mark_key() compares the current gc pos to the pos we're - * moving this reference from, hence one comparison here: - */ - if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { - struct bch_fs_usage tmp = { 0 }; - - bch_mark_key(c, bkey_i_to_s_c(&d->key), - -c->sb.btree_node_size, true, b - ? 
gc_pos_btree_node(b) - : gc_pos_btree_root(id), - &tmp, 0); - /* - * Don't apply tmp - pending deletes aren't tracked in - * bch_alloc_stats: - */ - } - - mutex_unlock(&c->btree_interior_update_lock); -} - -static void __btree_node_free(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) -{ - trace_bcache_btree_node_free(c, b); - - BUG_ON(b == btree_node_root(c, b)); - BUG_ON(b->ob); - BUG_ON(!list_empty(&b->write_blocked)); - - six_lock_write(&b->lock); - - if (btree_node_dirty(b)) - bch_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(b); - - mca_hash_remove(c, b); - - mutex_lock(&c->btree_cache_lock); - list_move(&b->list, &c->btree_cache_freeable); - mutex_unlock(&c->btree_cache_lock); - - /* - * By using six_unlock_write() directly instead of - * btree_node_unlock_write(), we don't update the iterator's sequence - * numbers and cause future btree_node_relock() calls to fail: - */ - six_unlock_write(&b->lock); -} - -void bch_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) -{ - struct open_bucket *ob = b->ob; - - b->ob = NULL; - - __btree_node_free(c, b, NULL); - - bch_open_bucket_put(c, ob); -} - -void bch_btree_node_free_inmem(struct btree_iter *iter, struct btree *b) -{ - bch_btree_iter_node_drop_linked(iter, b); - - __btree_node_free(iter->c, b, iter); - - bch_btree_iter_node_drop(iter, b); -} - -static void bch_btree_node_free_ondisk(struct bch_fs *c, - struct pending_btree_node_free *pending) -{ - struct bch_fs_usage stats = { 0 }; - - BUG_ON(!pending->index_update_done); - - bch_mark_key(c, bkey_i_to_s_c(&pending->key), - -c->sb.btree_node_size, true, - gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ -} - -void btree_open_bucket_put(struct bch_fs *c, struct btree *b) -{ - bch_open_bucket_put(c, b->ob); - b->ob = NULL; -} - -static struct btree *__bch_btree_node_alloc(struct bch_fs *c, - bool use_reserve, - struct disk_reservation *res, - struct closure *cl) -{ - BKEY_PADDED(k) tmp; - struct open_bucket *ob; - struct btree *b; - unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE; - - mutex_lock(&c->btree_reserve_cache_lock); - if (c->btree_reserve_cache_nr > reserve) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - ob = a->ob; - bkey_copy(&tmp.k, &a->k); - mutex_unlock(&c->btree_reserve_cache_lock); - goto mem_alloc; - } - mutex_unlock(&c->btree_reserve_cache_lock); - -retry: - /* alloc_sectors is weird, I suppose */ - bkey_extent_init(&tmp.k); - tmp.k.k.size = c->sb.btree_node_size, - - ob = bch_alloc_sectors(c, &c->btree_write_point, - bkey_i_to_extent(&tmp.k), - res->nr_replicas, - c->opts.metadata_replicas_required, - use_reserve ? 
RESERVE_BTREE : RESERVE_NONE, - cl); - if (IS_ERR(ob)) - return ERR_CAST(ob); - - if (tmp.k.k.size < c->sb.btree_node_size) { - bch_open_bucket_put(c, ob); - goto retry; - } -mem_alloc: - b = mca_alloc(c); - - /* we hold cannibalize_lock: */ - BUG_ON(IS_ERR(b)); - BUG_ON(b->ob); - - bkey_copy(&b->key, &tmp.k); - b->key.k.size = 0; - b->ob = ob; - - return b; -} - -static struct btree *bch_btree_node_alloc(struct bch_fs *c, - unsigned level, enum btree_id id, - struct btree_reserve *reserve) -{ - struct btree *b; - - BUG_ON(!reserve->nr); - - b = reserve->b[--reserve->nr]; - - BUG_ON(mca_hash_insert(c, b, level, id)); - - set_btree_node_accessed(b); - set_btree_node_dirty(b); - - bch_bset_init_first(b, &b->data->keys); - memset(&b->nr, 0, sizeof(b->nr)); - b->data->magic = cpu_to_le64(bset_magic(c)); - b->data->flags = 0; - SET_BTREE_NODE_ID(b->data, id); - SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr; - - bch_btree_build_aux_trees(b); - - bch_check_mark_super(c, &b->key, true); - - trace_bcache_btree_node_alloc(c, b); - return b; -} - -struct btree *__btree_node_alloc_replacement(struct bch_fs *c, - struct btree *b, - struct bkey_format format, - struct btree_reserve *reserve) -{ - struct btree *n; - - n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve); - - n->data->min_key = b->data->min_key; - n->data->max_key = b->data->max_key; - n->data->format = format; - - btree_node_set_format(n, format); - - bch_btree_sort_into(c, n, b); - - btree_node_reset_sib_u64s(n); - - n->key.k.p = b->key.k.p; - trace_bcache_btree_node_alloc_replacement(c, b, n); - - return n; -} - -struct btree *btree_node_alloc_replacement(struct bch_fs *c, - struct btree *b, - struct btree_reserve *reserve) -{ - struct bkey_format new_f = bch_btree_calc_format(b); - - /* - * The keys might expand with the new format - if they wouldn't fit in - * the btree node anymore, use the old format for now: - */ - if (!bch_btree_node_format_fits(c, b, &new_f)) - new_f = b->format; - - return __btree_node_alloc_replacement(c, b, new_f, reserve); -} - -static void bch_btree_set_root_inmem(struct bch_fs *c, struct btree *b, - struct btree_reserve *btree_reserve) -{ - struct btree *old = btree_node_root(c, b); - - /* Root nodes cannot be reaped */ - mutex_lock(&c->btree_cache_lock); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); - - mutex_lock(&c->btree_root_lock); - btree_node_root(c, b) = b; - mutex_unlock(&c->btree_root_lock); - - if (btree_reserve) { - /* - * New allocation (we're not being called because we're in - * bch_btree_root_read()) - do marking while holding - * btree_root_lock: - */ - struct bch_fs_usage stats = { 0 }; - - bch_mark_key(c, bkey_i_to_s_c(&b->key), - c->sb.btree_node_size, true, - gc_pos_btree_root(b->btree_id), - &stats, 0); - - if (old) - bch_btree_node_free_index(c, NULL, old->btree_id, - bkey_i_to_s_c(&old->key), - &stats); - bch_fs_usage_apply(c, &stats, &btree_reserve->disk_res, - gc_pos_btree_root(b->btree_id)); - } - - bch_recalc_btree_reserve(c); -} - -static void bch_btree_set_root_ondisk(struct bch_fs *c, struct btree *b) -{ - struct btree_root *r = &c->btree_roots[b->btree_id]; - - mutex_lock(&c->btree_root_lock); - - BUG_ON(b != r->b); - bkey_copy(&r->key, &b->key); - r->level = b->level; - r->alive = true; - - mutex_unlock(&c->btree_root_lock); -} - -/* - * Only for filesystem bringup, when first reading the btree roots or allocating - * btree roots when initializing a new filesystem: - */ -void 
bch_btree_set_root_initial(struct bch_fs *c, struct btree *b, - struct btree_reserve *btree_reserve) -{ - BUG_ON(btree_node_root(c, b)); - - bch_btree_set_root_inmem(c, b, btree_reserve); - bch_btree_set_root_ondisk(c, b); -} - -/** - * bch_btree_set_root - update the root in memory and on disk - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. However, you must hold an intent lock on the - * old root. - * - * Note: This allocates a journal entry but doesn't add any keys to - * it. All the btree roots are part of every journal write, so there - * is nothing new to be done. This just guarantees that there is a - * journal write. - */ -static void bch_btree_set_root(struct btree_iter *iter, struct btree *b, - struct btree_interior_update *as, - struct btree_reserve *btree_reserve) -{ - struct bch_fs *c = iter->c; - struct btree *old; - - trace_bcache_btree_set_root(c, b); - BUG_ON(!b->written); - - old = btree_node_root(c, b); - - /* - * Ensure no one is using the old root while we switch to the - * new root: - */ - btree_node_lock_write(old, iter); - - bch_btree_set_root_inmem(c, b, btree_reserve); - - btree_interior_update_updated_root(c, as, iter->btree_id); - - /* - * Unlock old root after new root is visible: - * - * The new root isn't persistent, but that's ok: we still have - * an intent lock on the new root, and any updates that would - * depend on the new root would have to update the new root. - */ - btree_node_unlock_write(old, iter); -} - -static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level, - enum btree_id id, - struct btree_reserve *reserve) -{ - struct btree *b = bch_btree_node_alloc(c, level, id, reserve); - - b->data->min_key = POS_MIN; - b->data->max_key = POS_MAX; - b->data->format = bch_btree_calc_format(b); - b->key.k.p = POS_MAX; - - btree_node_set_format(b, b->data->format); - bch_btree_build_aux_trees(b); - - six_unlock_write(&b->lock); - - return b; -} - -void bch_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) -{ - bch_disk_reservation_put(c, &reserve->disk_res); - - mutex_lock(&c->btree_reserve_cache_lock); - - while (reserve->nr) { - struct btree *b = reserve->b[--reserve->nr]; - - six_unlock_write(&b->lock); - - if (c->btree_reserve_cache_nr < - ARRAY_SIZE(c->btree_reserve_cache)) { - struct btree_alloc *a = - &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; - - a->ob = b->ob; - b->ob = NULL; - bkey_copy(&a->k, &b->key); - } else { - bch_open_bucket_put(c, b->ob); - b->ob = NULL; - } - - __btree_node_free(c, b, NULL); - - six_unlock_intent(&b->lock); - } - - mutex_unlock(&c->btree_reserve_cache_lock); - - mempool_free(reserve, &c->btree_reserve_pool); -} - -static struct btree_reserve *__bch_btree_reserve_get(struct bch_fs *c, - unsigned nr_nodes, - unsigned flags, - struct closure *cl) -{ - struct btree_reserve *reserve; - struct btree *b; - struct disk_reservation disk_res = { 0, 0 }; - unsigned sectors = nr_nodes * c->sb.btree_node_size; - int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD| - BCH_DISK_RESERVATION_METADATA; - - if (flags & BTREE_INSERT_NOFAIL) - disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; - - /* - * This check isn't necessary for correctness - it's just to potentially - * prevent us from doing a lot of work that'll end up being wasted: - */ - ret = bch_journal_error(&c->journal); - if (ret) - return ERR_PTR(ret); - - if (bch_disk_reservation_get(c, &disk_res, sectors, disk_res_flags)) - return ERR_PTR(-ENOSPC); - - BUG_ON(nr_nodes > 
BTREE_RESERVE_MAX); - - /* - * Protects reaping from the btree node cache and using the btree node - * open bucket reserve: - */ - ret = mca_cannibalize_lock(c, cl); - if (ret) { - bch_disk_reservation_put(c, &disk_res); - return ERR_PTR(ret); - } - - reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO); - - reserve->disk_res = disk_res; - reserve->nr = 0; - - while (reserve->nr < nr_nodes) { - b = __bch_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE, - &disk_res, cl); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err_free; - } - - reserve->b[reserve->nr++] = b; - } - - mca_cannibalize_unlock(c); - return reserve; -err_free: - bch_btree_reserve_put(c, reserve); - mca_cannibalize_unlock(c); - trace_bcache_btree_reserve_get_fail(c, nr_nodes, cl); - return ERR_PTR(ret); -} - -struct btree_reserve *bch_btree_reserve_get(struct bch_fs *c, - struct btree *b, - unsigned extra_nodes, - unsigned flags, - struct closure *cl) -{ - unsigned depth = btree_node_root(c, b)->level - b->level; - unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes; - - return __bch_btree_reserve_get(c, nr_nodes, flags, cl); - -} - -int bch_btree_root_alloc(struct bch_fs *c, enum btree_id id, - struct closure *writes) -{ - struct closure cl; - struct btree_reserve *reserve; - struct btree *b; - - closure_init_stack(&cl); - - while (1) { - /* XXX haven't calculated capacity yet :/ */ - reserve = __bch_btree_reserve_get(c, 1, 0, &cl); - if (!IS_ERR(reserve)) - break; - - if (PTR_ERR(reserve) == -ENOSPC) - return PTR_ERR(reserve); - - closure_sync(&cl); - } - - b = __btree_root_alloc(c, 0, id, reserve); - - bch_btree_node_write(c, b, writes, SIX_LOCK_intent, -1); - - bch_btree_set_root_initial(c, b, reserve); - btree_open_bucket_put(c, b); - six_unlock_intent(&b->lock); - - bch_btree_reserve_put(c, reserve); - - return 0; -} - -static void bch_insert_fixup_btree_ptr(struct btree_iter *iter, - struct btree *b, - struct bkey_i *insert, - struct btree_node_iter *node_iter, - struct disk_reservation *disk_res) -{ - struct bch_fs *c = iter->c; - struct bch_fs_usage stats = { 0 }; - struct bkey_packed *k; - struct bkey tmp; - - if (bkey_extent_is_data(&insert->k)) - bch_mark_key(c, bkey_i_to_s_c(insert), - c->sb.btree_node_size, true, - gc_pos_btree_node(b), &stats, 0); - - while ((k = bch_btree_node_iter_peek_all(node_iter, b)) && - !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) - bch_btree_node_iter_advance(node_iter, b); - - /* - * If we're overwriting, look up pending delete and mark so that gc - * marks it on the pending delete list: - */ - if (k && !bkey_cmp_packed(b, k, &insert->k)) - bch_btree_node_free_index(c, b, iter->btree_id, - bkey_disassemble(b, k, &tmp), - &stats); - - bch_fs_usage_apply(c, &stats, disk_res, gc_pos_btree_node(b)); - - bch_btree_bset_insert_key(iter, b, node_iter, insert); - set_btree_node_dirty(b); -} - -/* Inserting into a given leaf node (last stage of insert): */ - -/* Handle overwrites and do insert, for non extents: */ -bool bch_btree_bset_insert_key(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - const struct bkey_format *f = &b->format; - struct bkey_packed *k; - struct bset_tree *t; - unsigned clobber_u64s; - - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || - bkey_cmp(insert->k.p, b->data->max_key) > 0); - 
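/*
 * A minimal standalone model of the whiteout rule that the non-extent insert
 * path just below (bch_btree_bset_insert_key) follows: a key that has already
 * been written out in an on-disk bset cannot simply be dropped from memory -
 * a "whiteout" (deleted marker) has to stay behind so the deletion is seen on
 * the next read - while a key that only ever existed in the unwritten bset can
 * be removed outright. The toy_* names and fields here are hypothetical
 * stand-ins for illustration, not libbcache types.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_key {
	unsigned	id;
	bool		written;	/* already persisted in an on-disk bset? */
	bool		deleted;	/* whiteout left behind */
	bool		present;	/* still occupies a slot in memory */
};

static void toy_delete(struct toy_key *k)
{
	if (k->written) {
		/* needs a whiteout: keep the slot, mark it deleted */
		k->deleted = true;
	} else {
		/* never hit disk: just drop it from the in-memory bset */
		k->present = false;
	}
}

int main(void)
{
	struct toy_key a = { .id = 1, .written = true,  .present = true };
	struct toy_key b = { .id = 2, .written = false, .present = true };

	toy_delete(&a);
	toy_delete(&b);

	printf("key 1: present=%d deleted=%d (whiteout kept)\n", a.present, a.deleted);
	printf("key 2: present=%d deleted=%d (dropped outright)\n", b.present, b.deleted);
	return 0;
}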
BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b)); - - k = bch_btree_node_iter_peek_all(node_iter, b); - if (k && !bkey_cmp_packed(b, k, &insert->k)) { - BUG_ON(bkey_whiteout(k)); - - t = bch_bkey_to_bset(b, k); - - if (bset_unwritten(b, bset(b, t)) && - bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { - BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k)); - - k->type = insert->k.type; - memcpy_u64s(bkeyp_val(f, k), &insert->v, - bkey_val_u64s(&insert->k)); - return true; - } - - insert->k.needs_whiteout = k->needs_whiteout; - - btree_keys_account_key_drop(&b->nr, t - b->set, k); - - if (t == bset_tree_last(b)) { - clobber_u64s = k->u64s; - - /* - * If we're deleting, and the key we're deleting doesn't - * need a whiteout (it wasn't overwriting a key that had - * been written to disk) - just delete it: - */ - if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { - bch_bset_delete(b, k, clobber_u64s); - bch_btree_node_iter_fix(iter, b, node_iter, t, - k, clobber_u64s, 0); - return true; - } - - goto overwrite; - } - - k->type = KEY_TYPE_DELETED; - bch_btree_node_iter_fix(iter, b, node_iter, t, k, - k->u64s, k->u64s); - - if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, t, k); - return true; - } else { - k->needs_whiteout = false; - } - } else { - /* - * Deleting, but the key to delete wasn't found - nothing to do: - */ - if (bkey_whiteout(&insert->k)) - return false; - - insert->k.needs_whiteout = false; - } - - t = bset_tree_last(b); - k = bch_btree_node_iter_bset_pos(node_iter, b, t); - clobber_u64s = 0; -overwrite: - bch_bset_insert(b, node_iter, k, insert, clobber_u64s); - if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch_btree_node_iter_fix(iter, b, node_iter, t, k, - clobber_u64s, k->u64s); - return true; -} - -static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - - six_lock_read(&b->lock); - /* - * Reusing a btree node can race with the journal reclaim code calling - * the journal pin flush fn, and there's no good fix for this: we don't - * really want journal_pin_drop() to block until the flush fn is no - * longer running, because journal_pin_drop() is called from the btree - * node write endio function, and we can't wait on the flush fn to - * finish running in mca_reap() - where we make reused btree nodes ready - * to use again - because there, we're holding the lock this function - * needs - deadlock. 
- * - * So, the b->level check is a hack so we don't try to write nodes we - * shouldn't: - */ - if (!b->level) - bch_btree_node_write(c, b, NULL, SIX_LOCK_read, i); - six_unlock_read(&b->lock); -} - -static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin) -{ - return __btree_node_flush(j, pin, 0); -} - -static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin) -{ - return __btree_node_flush(j, pin, 1); -} - -void bch_btree_journal_key(struct btree_insert *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree *b = iter->nodes[0]; - struct btree_write *w = btree_current_write(b); - - EBUG_ON(iter->level || b->level); - EBUG_ON(!trans->journal_res.ref && - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - - if (!journal_pin_active(&w->journal)) - bch_journal_pin_add(j, &w->journal, - btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); - - if (trans->journal_res.ref) { - u64 seq = trans->journal_res.seq; - bool needs_whiteout = insert->k.needs_whiteout; - - /* - * have a bug where we're seeing an extent with an invalid crc - * entry in the journal, trying to track it down: - */ - BUG_ON(bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert))); - - /* ick */ - insert->k.needs_whiteout = false; - bch_journal_add_keys(j, &trans->journal_res, - b->btree_id, insert); - insert->k.needs_whiteout = needs_whiteout; - - if (trans->journal_seq) - *trans->journal_seq = seq; - btree_bset_last(b)->journal_seq = cpu_to_le64(seq); - } - - if (!btree_node_dirty(b)) - set_btree_node_dirty(b); -} - -static enum btree_insert_ret -bch_insert_fixup_key(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - struct btree_iter *iter = insert->iter; - - BUG_ON(iter->level); - - if (bch_btree_bset_insert_key(iter, - iter->nodes[0], - &iter->node_iters[0], - insert->k)) - bch_btree_journal_key(trans, iter, insert->k); - - trans->did_work = true; - return BTREE_INSERT_OK; -} - -static void verify_keys_sorted(struct keylist *l) -{ -#ifdef CONFIG_BCACHE_DEBUG - struct bkey_i *k; - - for_each_keylist_key(l, k) - BUG_ON(bkey_next(k) != l->top && - bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); -#endif -} - -static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter) -{ - struct bch_fs *c = iter->c; - - btree_node_lock_write(b, iter); - - if (btree_node_just_written(b) && - bch_btree_post_write_cleanup(c, b)) - bch_btree_iter_reinit_node(iter, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch_btree_init_next(c, b, iter); -} - -/* Asynchronous interior node update machinery */ - -struct btree_interior_update * -bch_btree_interior_update_alloc(struct bch_fs *c) -{ - struct btree_interior_update *as; - - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); - memset(as, 0, sizeof(*as)); - closure_init(&as->cl, &c->cl); - as->c = c; - as->mode = BTREE_INTERIOR_NO_UPDATE; - - bch_keylist_init(&as->parent_keys, as->inline_keys, - ARRAY_SIZE(as->inline_keys)); - - mutex_lock(&c->btree_interior_update_lock); - list_add(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - return as; -} - -static void btree_interior_update_free(struct closure *cl) -{ - struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl); - - mempool_free(as, &as->c->btree_interior_update_pool); -} - -static 
void btree_interior_update_nodes_reachable(struct closure *cl) -{ - struct btree_interior_update *as = - container_of(cl, struct btree_interior_update, cl); - struct bch_fs *c = as->c; - unsigned i; - - bch_journal_pin_drop(&c->journal, &as->journal); - - mutex_lock(&c->btree_interior_update_lock); - - for (i = 0; i < as->nr_pending; i++) - bch_btree_node_free_ondisk(c, &as->pending[i]); - as->nr_pending = 0; - - mutex_unlock(&c->btree_interior_update_lock); - - mutex_lock(&c->btree_interior_update_lock); - list_del(&as->list); - mutex_unlock(&c->btree_interior_update_lock); - - closure_wake_up(&as->wait); - - closure_return_with_destructor(cl, btree_interior_update_free); -} - -static void btree_interior_update_nodes_written(struct closure *cl) -{ - struct btree_interior_update *as = - container_of(cl, struct btree_interior_update, cl); - struct bch_fs *c = as->c; - struct btree *b; - - if (bch_journal_error(&c->journal)) { - /* XXX what? */ - } - - /* XXX: missing error handling, damnit */ - - /* check for journal error, bail out if we flushed */ - - /* - * We did an update to a parent node where the pointers we added pointed - * to child nodes that weren't written yet: now, the child nodes have - * been written so we can write out the update to the interior node. - */ -retry: - mutex_lock(&c->btree_interior_update_lock); - switch (as->mode) { - case BTREE_INTERIOR_NO_UPDATE: - BUG(); - case BTREE_INTERIOR_UPDATING_NODE: - /* The usual case: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->lock)) { - mutex_unlock(&c->btree_interior_update_lock); - six_lock_read(&b->lock); - six_unlock_read(&b->lock); - goto retry; - } - - BUG_ON(!btree_node_dirty(b)); - closure_wait(&btree_current_write(b)->wait, cl); - - list_del(&as->write_blocked_list); - - if (list_empty(&b->write_blocked)) - bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1); - six_unlock_read(&b->lock); - break; - - case BTREE_INTERIOR_UPDATING_AS: - /* - * The btree node we originally updated has been freed and is - * being rewritten - so we need to write anything here, we just - * need to signal to that btree_interior_update that it's ok to make the - * new replacement node visible: - */ - closure_put(&as->parent_as->cl); - - /* - * and then we have to wait on that btree_interior_update to finish: - */ - closure_wait(&as->parent_as->wait, cl); - break; - - case BTREE_INTERIOR_UPDATING_ROOT: - /* b is the new btree root: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->lock)) { - mutex_unlock(&c->btree_interior_update_lock); - six_lock_read(&b->lock); - six_unlock_read(&b->lock); - goto retry; - } - - BUG_ON(c->btree_roots[b->btree_id].as != as); - c->btree_roots[b->btree_id].as = NULL; - - bch_btree_set_root_ondisk(c, b); - - /* - * We don't have to wait anything anything here (before - * btree_interior_update_nodes_reachable frees the old nodes - * ondisk) - we've ensured that the very next journal write will - * have the pointer to the new root, and before the allocator - * can reuse the old nodes it'll have to do a journal commit: - */ - six_unlock_read(&b->lock); - } - mutex_unlock(&c->btree_interior_update_lock); - - continue_at(cl, btree_interior_update_nodes_reachable, system_wq); -} - -/* - * We're updating @b with pointers to nodes that haven't finished writing yet: - * block @b from being written until @as completes - */ -static void btree_interior_update_updated_btree(struct bch_fs *c, - struct btree_interior_update *as, - struct btree *b) -{ - mutex_lock(&c->btree_interior_update_lock); - - 
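/*
 * A stripped-down model of the write-blocking described above: an interior
 * node update must not be written out until the child nodes its new pointers
 * reference are themselves persistent. The real code expresses this with
 * closures, b->write_blocked and the btree_interior_update state machine;
 * here it is reduced to a pending-children counter. The toy_* names are
 * hypothetical, not libbcache API.
 */
#include <stdio.h>

struct toy_interior_update {
	int	children_pending;	/* child node writes still in flight */
	int	written;		/* parent update written out yet? */
};

static void toy_maybe_write_parent(struct toy_interior_update *as)
{
	if (!as->children_pending && !as->written) {
		as->written = 1;
		printf("all children persistent: writing parent update\n");
	}
}

static void toy_child_write_done(struct toy_interior_update *as)
{
	as->children_pending--;
	toy_maybe_write_parent(as);
}

int main(void)
{
	struct toy_interior_update as = { .children_pending = 2 };

	toy_child_write_done(&as);	/* first child written - parent still blocked */
	toy_child_write_done(&as);	/* second child written - parent can be written */
	return 0;
}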
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); - BUG_ON(!btree_node_dirty(b)); - - as->mode = BTREE_INTERIOR_UPDATING_NODE; - as->b = b; - list_add(&as->write_blocked_list, &b->write_blocked); - - mutex_unlock(&c->btree_interior_update_lock); - - bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); - - continue_at(&as->cl, btree_interior_update_nodes_written, - system_freezable_wq); -} - -static void btree_interior_update_updated_root(struct bch_fs *c, - struct btree_interior_update *as, - enum btree_id btree_id) -{ - struct btree_root *r = &c->btree_roots[btree_id]; - - mutex_lock(&c->btree_interior_update_lock); - - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); - - /* - * Old root might not be persistent yet - if so, redirect its - * btree_interior_update operation to point to us: - */ - if (r->as) { - BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT); - - r->as->b = NULL; - r->as->mode = BTREE_INTERIOR_UPDATING_AS; - r->as->parent_as = as; - closure_get(&as->cl); - } - - as->mode = BTREE_INTERIOR_UPDATING_ROOT; - as->b = r->b; - r->as = as; - - mutex_unlock(&c->btree_interior_update_lock); - - bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); - - continue_at(&as->cl, btree_interior_update_nodes_written, - system_freezable_wq); -} - -static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin) -{ - struct btree_interior_update *as = - container_of(pin, struct btree_interior_update, journal); - - bch_journal_flush_seq_async(j, as->journal_seq, NULL); -} - -/* - * @b is being split/rewritten: it may have pointers to not-yet-written btree - * nodes and thus outstanding btree_interior_updates - redirect @b's - * btree_interior_updates to point to this btree_interior_update: - */ -void bch_btree_interior_update_will_free_node(struct bch_fs *c, - struct btree_interior_update *as, - struct btree *b) -{ - struct btree_interior_update *p, *n; - struct pending_btree_node_free *d; - struct bset_tree *t; - - /* - * Does this node have data that hasn't been written in the journal? - * - * If so, we have to wait for the corresponding journal entry to be - * written before making the new nodes reachable - we can't just carry - * over the bset->journal_seq tracking, since we'll be mixing those keys - * in with keys that aren't in the journal anymore: - */ - for_each_bset(b, t) - as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq); - - /* - * Does this node have unwritten data that has a pin on the journal? - * - * If so, transfer that pin to the btree_interior_update operation - - * note that if we're freeing multiple nodes, we only need to keep the - * oldest pin of any of the nodes we're freeing. We'll release the pin - * when the new nodes are persistent and reachable on disk: - */ - bch_journal_pin_add_if_older(&c->journal, - &b->writes[0].journal, - &as->journal, interior_update_flush); - bch_journal_pin_add_if_older(&c->journal, - &b->writes[1].journal, - &as->journal, interior_update_flush); - - mutex_lock(&c->btree_interior_update_lock); - - /* - * Does this node have any btree_interior_update operations preventing - * it from being written? 
- * - * If so, redirect them to point to this btree_interior_update: we can - * write out our new nodes, but we won't make them visible until those - * operations complete - */ - list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { - BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE); - - p->mode = BTREE_INTERIOR_UPDATING_AS; - list_del(&p->write_blocked_list); - p->b = NULL; - p->parent_as = as; - closure_get(&as->cl); - } - - /* Add this node to the list of nodes being freed: */ - BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); - - d = &as->pending[as->nr_pending++]; - d->index_update_done = false; - d->seq = b->data->keys.seq; - d->btree_id = b->btree_id; - d->level = b->level; - bkey_copy(&d->key, &b->key); - - mutex_unlock(&c->btree_interior_update_lock); -} - -static void btree_node_interior_verify(struct btree *b) -{ - struct btree_node_iter iter; - struct bkey_packed *k; - - BUG_ON(!b->level); - - bch_btree_node_iter_init(&iter, b, b->key.k.p, false, false); -#if 1 - BUG_ON(!(k = bch_btree_node_iter_peek(&iter, b)) || - bkey_cmp_left_packed(b, k, &b->key.k.p)); - - BUG_ON((bch_btree_node_iter_advance(&iter, b), - !bch_btree_node_iter_end(&iter))); -#else - const char *msg; - - msg = "not found"; - k = bch_btree_node_iter_peek(&iter, b); - if (!k) - goto err; - - msg = "isn't what it should be"; - if (bkey_cmp_left_packed(b, k, &b->key.k.p)) - goto err; - - bch_btree_node_iter_advance(&iter, b); - - msg = "isn't last key"; - if (!bch_btree_node_iter_end(&iter)) - goto err; - return; -err: - bch_dump_btree_node(b); - printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, - b->key.k.p.offset, msg); - BUG(); -#endif -} - -static enum btree_insert_ret -bch_btree_insert_keys_interior(struct btree *b, - struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_interior_update *as, - struct btree_reserve *res) -{ - struct bch_fs *c = iter->c; - struct btree_iter *linked; - struct btree_node_iter node_iter; - struct bkey_i *insert = bch_keylist_front(insert_keys); - struct bkey_packed *k; - - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); - BUG_ON(!b->level); - BUG_ON(!as || as->b); - verify_keys_sorted(insert_keys); - - btree_node_lock_for_insert(b, iter); - - if (bch_keylist_u64s(insert_keys) > - bch_btree_keys_u64s_remaining(c, b)) { - btree_node_unlock_write(b, iter); - return BTREE_INSERT_BTREE_NODE_FULL; - } - - /* Don't screw up @iter's position: */ - node_iter = iter->node_iters[b->level]; - - /* - * btree_split(), btree_gc_coalesce() will insert keys before - * the iterator's current position - they know the keys go in - * the node the iterator points to: - */ - while ((k = bch_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_packed(b, k, &insert->k) >= 0)) - ; - - while (!bch_keylist_empty(insert_keys)) { - insert = bch_keylist_front(insert_keys); - - bch_insert_fixup_btree_ptr(iter, b, insert, - &node_iter, &res->disk_res); - bch_keylist_pop_front(insert_keys); - } - - btree_interior_update_updated_btree(c, as, b); - - for_each_linked_btree_node(iter, b, linked) - bch_btree_node_iter_peek(&linked->node_iters[b->level], - b); - bch_btree_node_iter_peek(&iter->node_iters[b->level], b); - - bch_btree_iter_verify(iter, b); - - if (bch_maybe_compact_whiteouts(c, b)) - bch_btree_iter_reinit_node(iter, b); - - btree_node_unlock_write(b, iter); - - btree_node_interior_verify(b); - return BTREE_INSERT_OK; -} - -/* - * Move keys from n1 (original replacement node, now lower node) to n2 (higher - * node) - */ -static 
struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1, - struct btree_reserve *reserve) -{ - size_t nr_packed = 0, nr_unpacked = 0; - struct btree *n2; - struct bset *set1, *set2; - struct bkey_packed *k, *prev = NULL; - - n2 = bch_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve); - n2->data->max_key = n1->data->max_key; - n2->data->format = n1->format; - n2->key.k.p = n1->key.k.p; - - btree_node_set_format(n2, n2->data->format); - - set1 = btree_bset_first(n1); - set2 = btree_bset_first(n2); - - /* - * Has to be a linear search because we don't have an auxiliary - * search tree yet - */ - k = set1->start; - while (1) { - if (bkey_next(k) == vstruct_last(set1)) - break; - if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) - break; - - if (bkey_packed(k)) - nr_packed++; - else - nr_unpacked++; - - prev = k; - k = bkey_next(k); - } - - BUG_ON(!prev); - - n1->key.k.p = bkey_unpack_pos(n1, prev); - n1->data->max_key = n1->key.k.p; - n2->data->min_key = - btree_type_successor(n1->btree_id, n1->key.k.p); - - set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); - set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); - - set_btree_bset_end(n1, n1->set); - set_btree_bset_end(n2, n2->set); - - n2->nr.live_u64s = le16_to_cpu(set2->u64s); - n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); - n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; - n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; - - n1->nr.live_u64s = le16_to_cpu(set1->u64s); - n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); - n1->nr.packed_keys = nr_packed; - n1->nr.unpacked_keys = nr_unpacked; - - BUG_ON(!set1->u64s); - BUG_ON(!set2->u64s); - - memcpy_u64s(set2->start, - vstruct_end(set1), - le16_to_cpu(set2->u64s)); - - btree_node_reset_sib_u64s(n1); - btree_node_reset_sib_u64s(n2); - - bch_verify_btree_nr_keys(n1); - bch_verify_btree_nr_keys(n2); - - if (n1->level) { - btree_node_interior_verify(n1); - btree_node_interior_verify(n2); - } - - return n2; -} - -/* - * For updates to interior nodes, we've got to do the insert before we split - * because the stuff we're inserting has to be inserted atomically. Post split, - * the keys might have to go in different nodes and the split would no longer be - * atomic. 
- * - * Worse, if the insert is from btree node coalescing, if we do the insert after - * we do the split (and pick the pivot) - the pivot we pick might be between - * nodes that were coalesced, and thus in the middle of a child node post - * coalescing: - */ -static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b, - struct keylist *keys, - struct btree_reserve *res) -{ - struct btree_node_iter node_iter; - struct bkey_i *k = bch_keylist_front(keys); - struct bkey_packed *p; - struct bset *i; - - BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); - - bch_btree_node_iter_init(&node_iter, b, k->k.p, false, false); - - while (!bch_keylist_empty(keys)) { - k = bch_keylist_front(keys); - - BUG_ON(bch_keylist_u64s(keys) > - bch_btree_keys_u64s_remaining(iter->c, b)); - BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); - BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); - - bch_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res); - bch_keylist_pop_front(keys); - } - - /* - * We can't tolerate whiteouts here - with whiteouts there can be - * duplicate keys, and it would be rather bad if we picked a duplicate - * for the pivot: - */ - i = btree_bset_first(b); - p = i->start; - while (p != vstruct_last(i)) - if (bkey_deleted(p)) { - le16_add_cpu(&i->u64s, -p->u64s); - set_btree_bset_end(b, b->set); - memmove_u64s_down(p, bkey_next(p), - (u64 *) vstruct_last(i) - - (u64 *) p); - } else - p = bkey_next(p); - - BUG_ON(b->nsets != 1 || - b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); - - btree_node_interior_verify(b); -} - -static void btree_split(struct btree *b, struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_reserve *reserve, - struct btree_interior_update *as) -{ - struct bch_fs *c = iter->c; - struct btree *parent = iter->nodes[b->level + 1]; - struct btree *n1, *n2 = NULL, *n3 = NULL; - u64 start_time = local_clock(); - - BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); - - bch_btree_interior_update_will_free_node(c, as, b); - - n1 = btree_node_alloc_replacement(c, b, reserve); - if (b->level) - btree_split_insert_keys(iter, n1, insert_keys, reserve); - - if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { - trace_bcache_btree_node_split(c, b, b->nr.live_u64s); - - n2 = __btree_split_node(iter, n1, reserve); - - bch_btree_build_aux_trees(n2); - bch_btree_build_aux_trees(n1); - six_unlock_write(&n2->lock); - six_unlock_write(&n1->lock); - - bch_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1); - - /* - * Note that on recursive parent_keys == insert_keys, so we - * can't start adding new keys to parent_keys before emptying it - * out (which we did with btree_split_insert_keys() above) - */ - bch_keylist_add(&as->parent_keys, &n1->key); - bch_keylist_add(&as->parent_keys, &n2->key); - - if (!parent) { - /* Depth increases, make a new root */ - n3 = __btree_root_alloc(c, b->level + 1, - iter->btree_id, - reserve); - n3->sib_u64s[0] = U16_MAX; - n3->sib_u64s[1] = U16_MAX; - - btree_split_insert_keys(iter, n3, &as->parent_keys, - reserve); - bch_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1); - } - } else { - trace_bcache_btree_node_compact(c, b, b->nr.live_u64s); - - bch_btree_build_aux_trees(n1); - six_unlock_write(&n1->lock); - - bch_keylist_add(&as->parent_keys, &n1->key); - } - - bch_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1); - - /* New nodes all written, now make them visible: */ - - if (parent) { - /* Split a non root 
node */ - bch_btree_insert_node(parent, iter, &as->parent_keys, - reserve, as); - } else if (n3) { - bch_btree_set_root(iter, n3, as, reserve); - } else { - /* Root filled up but didn't need to be split */ - bch_btree_set_root(iter, n1, as, reserve); - } - - btree_open_bucket_put(c, n1); - if (n2) - btree_open_bucket_put(c, n2); - if (n3) - btree_open_bucket_put(c, n3); - - /* - * Note - at this point other linked iterators could still have @b read - * locked; we're depending on the bch_btree_iter_node_replace() calls - * below removing all references to @b so we don't return with other - * iterators pointing to a node they have locked that's been freed. - * - * We have to free the node first because the bch_iter_node_replace() - * calls will drop _our_ iterator's reference - and intent lock - to @b. - */ - bch_btree_node_free_inmem(iter, b); - - /* Successful split, update the iterator to point to the new nodes: */ - - if (n3) - bch_btree_iter_node_replace(iter, n3); - if (n2) - bch_btree_iter_node_replace(iter, n2); - bch_btree_iter_node_replace(iter, n1); - - bch_time_stats_update(&c->btree_split_time, start_time); -} - -/** - * bch_btree_insert_node - insert bkeys into a given btree node - * - * @iter: btree iterator - * @insert_keys: list of keys to insert - * @hook: insert callback - * @persistent: if not null, @persistent will wait on journal write - * - * Inserts as many keys as it can into a given btree node, splitting it if full. - * If a split occurred, this function will return early. This can only happen - * for leaf nodes -- inserts into interior nodes have to be atomic. - */ -void bch_btree_insert_node(struct btree *b, - struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_reserve *reserve, - struct btree_interior_update *as) -{ - BUG_ON(!b->level); - BUG_ON(!reserve || !as); - - switch (bch_btree_insert_keys_interior(b, iter, insert_keys, - as, reserve)) { - case BTREE_INSERT_OK: - break; - case BTREE_INSERT_BTREE_NODE_FULL: - btree_split(b, iter, insert_keys, reserve, as); - break; - default: - BUG(); - } -} - -static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags) -{ - struct bch_fs *c = iter->c; - struct btree *b = iter->nodes[0]; - struct btree_reserve *reserve; - struct btree_interior_update *as; - struct closure cl; - int ret = 0; - - closure_init_stack(&cl); - - /* Hack, because gc and splitting nodes doesn't mix yet: */ - if (!down_read_trylock(&c->gc_lock)) { - bch_btree_iter_unlock(iter); - down_read(&c->gc_lock); - } - - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ - if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) { - ret = -EINTR; - goto out; - } - - reserve = bch_btree_reserve_get(c, b, 0, flags, &cl); - if (IS_ERR(reserve)) { - ret = PTR_ERR(reserve); - if (ret == -EAGAIN) { - bch_btree_iter_unlock(iter); - up_read(&c->gc_lock); - closure_sync(&cl); - return -EINTR; - } - goto out; - } - - as = bch_btree_interior_update_alloc(c); - - btree_split(b, iter, NULL, reserve, as); - bch_btree_reserve_put(c, reserve); - - bch_btree_iter_set_locks_want(iter, 1); -out: - up_read(&c->gc_lock); - return ret; -} - -enum btree_node_sibling { - btree_prev_sib, - btree_next_sib, -}; - -static struct btree *btree_node_get_sibling(struct btree_iter *iter, - struct btree *b, - enum btree_node_sibling sib) -{ - struct btree *parent; - struct btree_node_iter node_iter; - struct bkey_packed *k; - BKEY_PADDED(k) tmp; - struct btree *ret; - unsigned level = b->level; - - 
parent = iter->nodes[level + 1]; - if (!parent) - return NULL; - - if (!btree_node_relock(iter, level + 1)) { - bch_btree_iter_set_locks_want(iter, level + 2); - return ERR_PTR(-EINTR); - } - - node_iter = iter->node_iters[parent->level]; - - k = bch_btree_node_iter_peek_all(&node_iter, parent); - BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - - do { - k = sib == btree_prev_sib - ? bch_btree_node_iter_prev_all(&node_iter, parent) - : (bch_btree_node_iter_advance(&node_iter, parent), - bch_btree_node_iter_peek_all(&node_iter, parent)); - if (!k) - return NULL; - } while (bkey_deleted(k)); - - bkey_unpack(parent, &tmp.k, k); - - ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); - - if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) { - btree_node_unlock(iter, level); - ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); - } - - if (!IS_ERR(ret) && !btree_node_relock(iter, level)) { - six_unlock_intent(&ret->lock); - ret = ERR_PTR(-EINTR); - } - - return ret; -} - -static int __foreground_maybe_merge(struct btree_iter *iter, - enum btree_node_sibling sib) -{ - struct bch_fs *c = iter->c; - struct btree_reserve *reserve; - struct btree_interior_update *as; - struct bkey_format_state new_s; - struct bkey_format new_f; - struct bkey_i delete; - struct btree *b, *m, *n, *prev, *next, *parent; - struct closure cl; - size_t sib_u64s; - int ret = 0; - - closure_init_stack(&cl); -retry: - if (!btree_node_relock(iter, iter->level)) - return 0; - - b = iter->nodes[iter->level]; - - parent = iter->nodes[b->level + 1]; - if (!parent) - return 0; - - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - return 0; - - /* XXX: can't be holding read locks */ - m = btree_node_get_sibling(iter, b, sib); - if (IS_ERR(m)) { - ret = PTR_ERR(m); - goto out; - } - - /* NULL means no sibling: */ - if (!m) { - b->sib_u64s[sib] = U16_MAX; - return 0; - } - - if (sib == btree_prev_sib) { - prev = m; - next = b; - } else { - prev = b; - next = m; - } - - bch_bkey_format_init(&new_s); - __bch_btree_calc_format(&new_s, b); - __bch_btree_calc_format(&new_s, m); - new_f = bch_bkey_format_done(&new_s); - - sib_u64s = btree_node_u64s_with_format(b, &new_f) + - btree_node_u64s_with_format(m, &new_f); - - if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { - sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); - sib_u64s /= 2; - sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); - } - - sib_u64s = min(sib_u64s, btree_max_u64s(c)); - b->sib_u64s[sib] = sib_u64s; - - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { - six_unlock_intent(&m->lock); - return 0; - } - - /* We're changing btree topology, doesn't mix with gc: */ - if (!down_read_trylock(&c->gc_lock)) { - six_unlock_intent(&m->lock); - bch_btree_iter_unlock(iter); - - down_read(&c->gc_lock); - up_read(&c->gc_lock); - ret = -EINTR; - goto out; - } - - if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) { - ret = -EINTR; - goto out_unlock; - } - - reserve = bch_btree_reserve_get(c, b, 0, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - &cl); - if (IS_ERR(reserve)) { - ret = PTR_ERR(reserve); - goto out_unlock; - } - - as = bch_btree_interior_update_alloc(c); - - bch_btree_interior_update_will_free_node(c, as, b); - bch_btree_interior_update_will_free_node(c, as, m); - - n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve); - n->data->min_key = prev->data->min_key; - n->data->max_key = next->data->max_key; - n->data->format = new_f; - n->key.k.p = next->key.k.p; - - btree_node_set_format(n, new_f); - - 
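[Editor's note: the sib_u64s computation above damps the merge decision. The two siblings' keys are re-measured in the new common format, and if the sum is above the hysteresis point the cached estimate only moves halfway back towards it, so a node hovering around the merge threshold isn't merged and re-split over and over. A rough standalone sketch of that damping; the threshold, hysteresis and max values here are made up, in the real code they are derived from the btree node size:]

#include <stdbool.h>

#define MERGE_THRESHOLD		1000	/* illustrative, not the real formula */
#define MERGE_HYSTERESIS	1500	/* illustrative */
#define NODE_MAX_U64S		2000	/* illustrative */

/* returns true if the two siblings should be merged now */
bool should_merge(unsigned b_u64s, unsigned m_u64s, unsigned *sib_u64s_out)
{
	unsigned sib_u64s = b_u64s + m_u64s;

	if (sib_u64s > MERGE_HYSTERESIS) {
		/* damp the estimate: only move halfway down to the hysteresis point */
		sib_u64s -= MERGE_HYSTERESIS;
		sib_u64s /= 2;
		sib_u64s += MERGE_HYSTERESIS;
	}

	if (sib_u64s > NODE_MAX_U64S)
		sib_u64s = NODE_MAX_U64S;

	*sib_u64s_out = sib_u64s;	/* cached in b->sib_u64s[sib] */
	return sib_u64s <= MERGE_THRESHOLD;
}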
bch_btree_sort_into(c, n, prev); - bch_btree_sort_into(c, n, next); - - bch_btree_build_aux_trees(n); - six_unlock_write(&n->lock); - - bkey_init(&delete.k); - delete.k.p = prev->key.k.p; - bch_keylist_add(&as->parent_keys, &delete); - bch_keylist_add(&as->parent_keys, &n->key); - - bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1); - - bch_btree_insert_node(parent, iter, &as->parent_keys, reserve, as); - - btree_open_bucket_put(c, n); - bch_btree_node_free_inmem(iter, b); - bch_btree_node_free_inmem(iter, m); - bch_btree_iter_node_replace(iter, n); - - bch_btree_iter_verify(iter, n); - - bch_btree_reserve_put(c, reserve); -out_unlock: - if (ret != -EINTR && ret != -EAGAIN) - bch_btree_iter_set_locks_want(iter, 1); - six_unlock_intent(&m->lock); - up_read(&c->gc_lock); -out: - if (ret == -EAGAIN || ret == -EINTR) { - bch_btree_iter_unlock(iter); - ret = -EINTR; - } - - closure_sync(&cl); - - if (ret == -EINTR) { - ret = bch_btree_iter_traverse(iter); - if (!ret) - goto retry; - } - - return ret; -} - -static int inline foreground_maybe_merge(struct btree_iter *iter, - enum btree_node_sibling sib) -{ - struct bch_fs *c = iter->c; - struct btree *b; - - if (!btree_node_locked(iter, iter->level)) - return 0; - - b = iter->nodes[iter->level]; - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - return 0; - - return __foreground_maybe_merge(iter, sib); -} - -/** - * btree_insert_key - insert a key one key into a leaf node - */ -static enum btree_insert_ret -btree_insert_key(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct btree *b = iter->nodes[0]; - enum btree_insert_ret ret; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - - ret = !btree_node_is_extents(b) - ? 
bch_insert_fixup_key(trans, insert) - : bch_insert_fixup_extent(trans, insert); - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch_maybe_compact_whiteouts(iter->c, b)) - bch_btree_iter_reinit_node(iter, b); - - trace_bcache_btree_insert_key(c, b, insert->k); - return ret; -} - -static bool same_leaf_as_prev(struct btree_insert *trans, - struct btree_insert_entry *i) -{ - /* - * Because we sorted the transaction entries, if multiple iterators - * point to the same leaf node they'll always be adjacent now: - */ - return i != trans->entries && - i[0].iter->nodes[0] == i[-1].iter->nodes[0]; -} - -#define trans_for_each_entry(trans, i) \ - for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) - -static void multi_lock_write(struct btree_insert *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - btree_node_lock_for_insert(i->iter->nodes[0], i->iter); -} - -static void multi_unlock_write(struct btree_insert *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - btree_node_unlock_write(i->iter->nodes[0], i->iter); -} - -static int btree_trans_entry_cmp(const void *_l, const void *_r) -{ - const struct btree_insert_entry *l = _l; - const struct btree_insert_entry *r = _r; - - return btree_iter_cmp(l->iter, r->iter); -} - -/* Normal update interface: */ - -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -int __bch_btree_insert_at(struct btree_insert *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_iter *split = NULL; - bool cycle_gc_lock = false; - unsigned u64s; - int ret; - - trans_for_each_entry(trans, i) { - EBUG_ON(i->iter->level); - EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - } - - sort(trans->entries, trans->nr, sizeof(trans->entries[0]), - btree_trans_entry_cmp, NULL); - - if (unlikely(!percpu_ref_tryget(&c->writes))) - return -EROFS; -retry_locks: - ret = -EINTR; - trans_for_each_entry(trans, i) - if (!bch_btree_iter_set_locks_want(i->iter, 1)) - goto err; -retry: - trans->did_work = false; - u64s = 0; - trans_for_each_entry(trans, i) - if (!i->done) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - - ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) - ? 
bch_journal_res_get(&c->journal, - &trans->journal_res, - u64s, u64s) - : 0; - if (ret) - goto err; - - multi_lock_write(trans); - - u64s = 0; - trans_for_each_entry(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - /* - * bch_btree_node_insert_fits() must be called under write lock: - * with only an intent lock, another thread can still call - * bch_btree_node_write(), converting an unwritten bset to a - * written one - */ - if (!i->done) { - u64s += i->k->k.u64s + i->extra_res; - if (!bch_btree_node_insert_fits(c, - i->iter->nodes[0], u64s)) { - split = i->iter; - goto unlock; - } - } - } - - ret = 0; - split = NULL; - cycle_gc_lock = false; - - trans_for_each_entry(trans, i) { - if (i->done) - continue; - - switch (btree_insert_key(trans, i)) { - case BTREE_INSERT_OK: - i->done = true; - break; - case BTREE_INSERT_JOURNAL_RES_FULL: - case BTREE_INSERT_NEED_TRAVERSE: - ret = -EINTR; - break; - case BTREE_INSERT_NEED_RESCHED: - ret = -EAGAIN; - break; - case BTREE_INSERT_BTREE_NODE_FULL: - split = i->iter; - break; - case BTREE_INSERT_ENOSPC: - ret = -ENOSPC; - break; - case BTREE_INSERT_NEED_GC_LOCK: - cycle_gc_lock = true; - ret = -EINTR; - break; - default: - BUG(); - } - - if (!trans->did_work && (ret || split)) - break; - } -unlock: - multi_unlock_write(trans); - bch_journal_res_put(&c->journal, &trans->journal_res); - - if (split) - goto split; - if (ret) - goto err; - - /* - * hack: iterators are inconsistent when they hit end of leaf, until - * traversed again - */ - trans_for_each_entry(trans, i) - if (i->iter->at_end_of_leaf) - goto out; - - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) { - foreground_maybe_merge(i->iter, btree_prev_sib); - foreground_maybe_merge(i->iter, btree_next_sib); - } -out: - /* make sure we didn't lose an error: */ - if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG)) - trans_for_each_entry(trans, i) - BUG_ON(!i->done); - - percpu_ref_put(&c->writes); - return ret; -split: - /* - * have to drop journal res before splitting, because splitting means - * allocating new btree nodes, and holding a journal reservation - * potentially blocks the allocator: - */ - ret = bch_btree_split_leaf(split, trans->flags); - if (ret) - goto err; - /* - * if the split didn't have to drop locks the insert will still be - * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked() - * and is overwriting won't have changed) - */ - goto retry_locks; -err: - if (cycle_gc_lock) { - down_read(&c->gc_lock); - up_read(&c->gc_lock); - } - - if (ret == -EINTR) { - trans_for_each_entry(trans, i) { - int ret2 = bch_btree_iter_traverse(i->iter); - if (ret2) { - ret = ret2; - goto out; - } - } - - /* - * BTREE_ITER_ATOMIC means we have to return -EINTR if we - * dropped locks: - */ - if (!(trans->flags & BTREE_INSERT_ATOMIC)) - goto retry; - } - - goto out; -} - -int bch_btree_insert_list_at(struct btree_iter *iter, - struct keylist *keys, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq, unsigned flags) -{ - BUG_ON(flags & BTREE_INSERT_ATOMIC); - BUG_ON(bch_keylist_empty(keys)); - verify_keys_sorted(keys); - - while (!bch_keylist_empty(keys)) { - /* need to traverse between each insert */ - int ret = bch_btree_iter_traverse(iter); - if (ret) - return ret; - - ret = bch_btree_insert_at(iter->c, disk_res, hook, - journal_seq, flags, - BTREE_INSERT_ENTRY(iter, bch_keylist_front(keys))); - if (ret) - return ret; - - bch_keylist_pop_front(keys); - } - - return 0; -} 
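[Editor's note: multi-entry transactions lean on the sort in __bch_btree_insert_at(): because entries are ordered by iterator position, all entries landing in the same leaf are adjacent, so multi_lock_write()/multi_unlock_write() only compare each entry with its predecessor to take and drop each leaf's write lock exactly once. A small self-contained illustration of that dedup-by-sorting pattern; the entry/leaf types here are made up:]

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct entry {
	int leaf;	/* stand-in for iter->nodes[0] */
	int pos;	/* stand-in for the iterator position */
};

static int entry_cmp(const void *l, const void *r)
{
	const struct entry *a = l, *b = r;

	return a->pos - b->pos;
}

static bool same_leaf_as_prev(const struct entry *e, size_t i)
{
	return i > 0 && e[i].leaf == e[i - 1].leaf;
}

int main(void)
{
	struct entry e[] = {
		{ .leaf = 7, .pos = 30 },
		{ .leaf = 3, .pos = 10 },
		{ .leaf = 7, .pos = 25 },
	};
	size_t i, nr = sizeof(e) / sizeof(e[0]);

	qsort(e, nr, sizeof(e[0]), entry_cmp);

	/* keys in disjoint leaves sort into runs: lock each leaf only once */
	for (i = 0; i < nr; i++)
		if (!same_leaf_as_prev(e, i))
			printf("write lock leaf %d\n", e[i].leaf);
	return 0;
}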
- -/** - * bch_btree_insert_check_key - insert dummy key into btree - * - * We insert a random key on a cache miss, then compare exchange on it - * once the cache promotion or backing device read completes. This - * ensures that if this key is written to after the read, the read will - * lose and not overwrite the key with stale data. - * - * Return values: - * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation - * -EINTR: btree node was changed while upgrading to write lock - */ -int bch_btree_insert_check_key(struct btree_iter *iter, - struct bkey_i *check_key) -{ - struct bpos saved_pos = iter->pos; - struct bkey_i_cookie *cookie; - BKEY_PADDED(key) tmp; - int ret; - - BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k))); - - check_key->k.type = KEY_TYPE_COOKIE; - set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie)); - - cookie = bkey_i_to_cookie(check_key); - get_random_bytes(&cookie->v, sizeof(cookie->v)); - - bkey_copy(&tmp.key, check_key); - - ret = bch_btree_insert_at(iter->c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(iter, &tmp.key)); - - bch_btree_iter_rewind(iter, saved_pos); - - return ret; -} - -/** - * bch_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @insert_keys: list of keys to insert - * @hook: insert callback - */ -int bch_btree_insert(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq, int flags) -{ - struct btree_iter iter; - int ret, ret2; - - bch_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k)); - - ret = bch_btree_iter_traverse(&iter); - if (unlikely(ret)) - goto out; - - ret = bch_btree_insert_at(c, disk_res, hook, journal_seq, flags, - BTREE_INSERT_ENTRY(&iter, k)); -out: ret2 = bch_btree_iter_unlock(&iter); - - return ret ?: ret2; -} - -/** - * bch_btree_update - like bch_btree_insert(), but asserts that we're - * overwriting an existing key - */ -int bch_btree_update(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, u64 *journal_seq) -{ - struct btree_iter iter; - struct bkey_s_c u; - int ret; - - EBUG_ON(id == BTREE_ID_EXTENTS); - - bch_btree_iter_init_intent(&iter, c, id, k->k.p); - - u = bch_btree_iter_peek_with_holes(&iter); - ret = btree_iter_err(u); - if (ret) - return ret; - - if (bkey_deleted(u.k)) { - bch_btree_iter_unlock(&iter); - return -ENOENT; - } - - ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, 0, - BTREE_INSERT_ENTRY(&iter, k)); - bch_btree_iter_unlock(&iter); - return ret; -} - -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, - struct bpos end, - struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch_btree_iter_init_intent(&iter, c, id, start); - - while ((k = bch_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - /* really shouldn't be using a bare, unpadded bkey_i */ - struct bkey_i delete; - - if (bkey_cmp(iter.pos, end) >= 0) - break; - - bkey_init(&delete.k); - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non extents they always will be the - * same). 
It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). - */ - delete.k.p = iter.pos; - delete.k.version = version; - - if (iter.is_extents) { - /* - * The extents btree is special - KEY_TYPE_DISCARD is - * used for deletions, not KEY_TYPE_DELETED. This is an - * internal implementation detail that probably - * shouldn't be exposed (internally, KEY_TYPE_DELETED is - * used as a proxy for k->size == 0): - */ - delete.k.type = KEY_TYPE_DISCARD; - - /* create the biggest key we can */ - bch_key_resize(&delete.k, max_sectors); - bch_cut_back(end, &delete.k); - } - - ret = bch_btree_insert_at(c, disk_res, hook, journal_seq, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &delete)); - if (ret) - break; - - bch_btree_iter_cond_resched(&iter); - } - - bch_btree_iter_unlock(&iter); - return ret; -} - -/** - * bch_btree_node_rewrite - Rewrite/move a btree node - * - * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. - * btree_check_reserve() has to wait) - */ -int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b, - struct closure *cl) -{ - struct bch_fs *c = iter->c; - struct btree *n, *parent = iter->nodes[b->level + 1]; - struct btree_reserve *reserve; - struct btree_interior_update *as; - unsigned flags = BTREE_INSERT_NOFAIL; - - /* - * if caller is going to wait if allocating reserve fails, then this is - * a rewrite that must succeed: - */ - if (cl) - flags |= BTREE_INSERT_USE_RESERVE; - - if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) - return -EINTR; - - reserve = bch_btree_reserve_get(c, b, 0, flags, cl); - if (IS_ERR(reserve)) { - trace_bcache_btree_gc_rewrite_node_fail(c, b); - return PTR_ERR(reserve); - } - - as = bch_btree_interior_update_alloc(c); - - bch_btree_interior_update_will_free_node(c, as, b); - - n = btree_node_alloc_replacement(c, b, reserve); - - bch_btree_build_aux_trees(n); - six_unlock_write(&n->lock); - - trace_bcache_btree_gc_rewrite_node(c, b); - - bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1); - - if (parent) { - bch_btree_insert_node(parent, iter, - &keylist_single(&n->key), - reserve, as); - } else { - bch_btree_set_root(iter, n, as, reserve); - } - - btree_open_bucket_put(c, n); - - bch_btree_node_free_inmem(iter, b); - - BUG_ON(!bch_btree_iter_node_replace(iter, n)); - - bch_btree_reserve_put(c, reserve); - return 0; -} diff --git a/libbcache/btree_update.h b/libbcache/btree_update.h deleted file mode 100644 index 0be71862..00000000 --- a/libbcache/btree_update.h +++ /dev/null @@ -1,424 +0,0 @@ -#ifndef _BCACHE_BTREE_INSERT_H -#define _BCACHE_BTREE_INSERT_H - -#include "btree_cache.h" -#include "btree_iter.h" -#include "buckets.h" -#include "journal.h" -#include "vstructs.h" - -struct bch_fs; -struct bkey_format_state; -struct bkey_format; -struct btree; - -static inline void btree_node_reset_sib_u64s(struct btree *b) -{ - b->sib_u64s[0] = b->nr.live_u64s; - b->sib_u64s[1] = b->nr.live_u64s; -} - -struct btree_reserve { - struct disk_reservation disk_res; - unsigned nr; - struct btree *b[BTREE_RESERVE_MAX]; -}; - -void __bch_btree_calc_format(struct bkey_format_state *, struct btree *); -bool bch_btree_node_format_fits(struct bch_fs *c, struct btree *, - struct bkey_format *); - -/* Btree node freeing/allocation: */ - -/* - * Tracks a btree node that has been (or is about to be) freed in memory, but - * has _not_ yet been freed on disk (because the write that makes 
the new - * node(s) visible and frees the old hasn't completed yet) - */ -struct pending_btree_node_free { - bool index_update_done; - - __le64 seq; - enum btree_id btree_id; - unsigned level; - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -}; - -/* - * Tracks an in progress split/rewrite of a btree node and the update to the - * parent node: - * - * When we split/rewrite a node, we do all the updates in memory without - * waiting for any writes to complete - we allocate the new node(s) and update - * the parent node, possibly recursively up to the root. - * - * The end result is that we have one or more new nodes being written - - * possibly several, if there were multiple splits - and then a write (updating - * an interior node) which will make all these new nodes visible. - * - * Additionally, as we split/rewrite nodes we free the old nodes - but the old - * nodes can't be freed (their space on disk can't be reclaimed) until the - * update to the interior node that makes the new node visible completes - - * until then, the old nodes are still reachable on disk. - * - */ -struct btree_interior_update { - struct closure cl; - struct bch_fs *c; - - struct list_head list; - - /* What kind of update are we doing? */ - enum { - BTREE_INTERIOR_NO_UPDATE, - BTREE_INTERIOR_UPDATING_NODE, - BTREE_INTERIOR_UPDATING_ROOT, - BTREE_INTERIOR_UPDATING_AS, - } mode; - - /* - * BTREE_INTERIOR_UPDATING_NODE: - * The update that made the new nodes visible was a regular update to an - * existing interior node - @b. We can't write out the update to @b - * until the new nodes we created are finished writing, so we block @b - * from writing by putting this btree_interior update on the - * @b->write_blocked list with @write_blocked_list: - */ - struct btree *b; - struct list_head write_blocked_list; - - /* - * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now - * we're now blocking another btree_interior_update - * @parent_as - btree_interior_update that's waiting on our nodes to finish - * writing, before it can make new nodes visible on disk - * @wait - list of child btree_interior_updates that are waiting on this - * btree_interior_update to make all the new nodes visible before they can free - * their old btree nodes - */ - struct btree_interior_update *parent_as; - struct closure_waitlist wait; - - /* - * We may be freeing nodes that were dirty, and thus had journal entries - * pinned: we need to transfer the oldest of those pins to the - * btree_interior_update operation, and release it when the new node(s) - * are all persistent and reachable: - */ - struct journal_entry_pin journal; - - u64 journal_seq; - - /* - * Nodes being freed: - * Protected by c->btree_node_pending_free_lock - */ - struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; - unsigned nr_pending; - - /* Only here to reduce stack usage on recursive splits: */ - struct keylist parent_keys; - /* - * Enough room for btree_split's keys without realloc - btree node - * pointers never have crc/compression info, so we only need to acount - * for the pointers for three keys - */ - u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; -}; - -#define for_each_pending_btree_node_free(c, as, p) \ - list_for_each_entry(as, &c->btree_interior_update_list, list) \ - for (p = as->pending; p < as->pending + as->nr_pending; p++) - -void bch_btree_node_free_inmem(struct btree_iter *, struct btree *); -void bch_btree_node_free_never_inserted(struct bch_fs *, struct btree *); - -void btree_open_bucket_put(struct bch_fs *c, 
struct btree *); - -struct btree *__btree_node_alloc_replacement(struct bch_fs *, - struct btree *, - struct bkey_format, - struct btree_reserve *); -struct btree *btree_node_alloc_replacement(struct bch_fs *, struct btree *, - struct btree_reserve *); - -struct btree_interior_update * -bch_btree_interior_update_alloc(struct bch_fs *); - -void bch_btree_interior_update_will_free_node(struct bch_fs *, - struct btree_interior_update *, - struct btree *); - -void bch_btree_set_root_initial(struct bch_fs *, struct btree *, - struct btree_reserve *); - -void bch_btree_reserve_put(struct bch_fs *, struct btree_reserve *); -struct btree_reserve *bch_btree_reserve_get(struct bch_fs *, - struct btree *, unsigned, - unsigned, struct closure *); - -int bch_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *); - -/* Inserting into a given leaf node (last stage of insert): */ - -bool bch_btree_bset_insert_key(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_i *); -void bch_btree_journal_key(struct btree_insert *trans, struct btree_iter *, - struct bkey_i *); - -static inline void *btree_data_end(struct bch_fs *c, struct btree *b) -{ - return (void *) b->data + btree_bytes(c); -} - -static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, - struct btree *b) -{ - return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); -} - -static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, - struct btree *b) -{ - return btree_data_end(c, b); -} - -static inline void *write_block(struct btree *b) -{ - return (void *) b->data + (b->written << 9); -} - -static inline bool bset_written(struct btree *b, struct bset *i) -{ - return (void *) i < write_block(b); -} - -static inline bool bset_unwritten(struct btree *b, struct bset *i) -{ - return (void *) i > write_block(b); -} - -static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b, - struct bset *i) -{ - return round_up(bset_byte_offset(b, vstruct_end(i)), - block_bytes(c)) >> 9; -} - -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) -{ - struct bset *i = btree_bset_last(b); - unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) + - b->whiteout_u64s + - b->uncompacted_whiteout_u64s; - unsigned total = c->sb.btree_node_size << 6; - - EBUG_ON(used > total); - - if (bset_written(b, i)) - return 0; - - return total - used; -} - -static inline unsigned btree_write_set_buffer(struct btree *b) -{ - /* - * Could buffer up larger amounts of keys for btrees with larger keys, - * pending benchmarking: - */ - return 4 << 10; -} - -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, - struct btree *b) -{ - struct bset *i = btree_bset_last(b); - unsigned offset = max_t(unsigned, b->written << 9, - bset_byte_offset(b, vstruct_end(i))); - ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t) - (offset + sizeof(struct btree_node_entry) + - b->whiteout_u64s * sizeof(u64) + - b->uncompacted_whiteout_u64s * sizeof(u64)); - - EBUG_ON(offset > btree_bytes(c)); - - if ((unlikely(bset_written(b, i)) && n > 0) || - (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && - n > btree_write_set_buffer(b))) - return (void *) b->data + offset; - - return NULL; -} - -/* - * write lock must be held on @b (else the dirty bset that we were going to - * insert into could be written out from under us) - */ -static inline bool bch_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) -{ - if 
(btree_node_is_extents(b)) { - /* The insert key might split an existing key - * (bch_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: - */ - u64s += BKEY_EXTENT_U64s_MAX; - } - - return u64s <= bch_btree_keys_u64s_remaining(c, b); -} - -static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - if (bset_written(b, bset(b, t))) { - EBUG_ON(b->uncompacted_whiteout_u64s < - bkeyp_key_u64s(&b->format, k)); - b->uncompacted_whiteout_u64s -= - bkeyp_key_u64s(&b->format, k); - } -} - -static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - if (bset_written(b, bset(b, t))) { - BUG_ON(!k->needs_whiteout); - b->uncompacted_whiteout_u64s += - bkeyp_key_u64s(&b->format, k); - } -} - -void bch_btree_insert_node(struct btree *, struct btree_iter *, - struct keylist *, struct btree_reserve *, - struct btree_interior_update *as); - -/* Normal update interface: */ - -struct btree_insert { - struct bch_fs *c; - struct disk_reservation *disk_res; - struct journal_res journal_res; - u64 *journal_seq; - struct extent_insert_hook *hook; - unsigned flags; - bool did_work; - - unsigned short nr; - struct btree_insert_entry { - struct btree_iter *iter; - struct bkey_i *k; - unsigned extra_res; - /* - * true if entire key was inserted - can only be false for - * extents - */ - bool done; - } *entries; -}; - -int __bch_btree_insert_at(struct btree_insert *); - - -#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N -#define COUNT_ARGS(...) _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1) - -#define BTREE_INSERT_ENTRY(_iter, _k) \ - ((struct btree_insert_entry) { \ - .iter = (_iter), \ - .k = (_k), \ - .done = false, \ - }) - -#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \ - ((struct btree_insert_entry) { \ - .iter = (_iter), \ - .k = (_k), \ - .extra_res = (_extra), \ - .done = false, \ - }) - -/** - * bch_btree_insert_at - insert one or more keys at iterator positions - * @iter: btree iterator - * @insert_key: key to insert - * @disk_res: disk reservation - * @hook: extent insert callback - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -#define bch_btree_insert_at(_c, _disk_res, _hook, \ - _journal_seq, _flags, ...) 
\ - __bch_btree_insert_at(&(struct btree_insert) { \ - .c = (_c), \ - .disk_res = (_disk_res), \ - .journal_seq = (_journal_seq), \ - .hook = (_hook), \ - .flags = (_flags), \ - .nr = COUNT_ARGS(__VA_ARGS__), \ - .entries = (struct btree_insert_entry[]) { \ - __VA_ARGS__ \ - }}) - -/* - * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent - * locks, -EAGAIN if need to wait on btree reserve - */ -#define BTREE_INSERT_ATOMIC (1 << 0) - -/* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL (1 << 1) - -/* for copygc, or when merging btree nodes */ -#define BTREE_INSERT_USE_RESERVE (1 << 2) - -/* - * Insert is for journal replay: don't get journal reservations, or mark extents - * (bch_mark_key) - */ -#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3) - -int bch_btree_insert_list_at(struct btree_iter *, struct keylist *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, unsigned); - -static inline bool journal_res_insert_fits(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - unsigned u64s = 0; - struct btree_insert_entry *i; - - /* - * If we didn't get a journal reservation, we're in journal replay and - * we're not journalling updates: - */ - if (!trans->journal_res.ref) - return true; - - for (i = insert; i < trans->entries + trans->nr; i++) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - return u64s <= trans->journal_res.u64s; -} - -int bch_btree_insert_check_key(struct btree_iter *, struct bkey_i *); -int bch_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, int flags); -int bch_btree_update(struct bch_fs *, enum btree_id, - struct bkey_i *, u64 *); - -int bch_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, struct bversion, - struct disk_reservation *, - struct extent_insert_hook *, u64 *); - -int bch_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *); - -#endif /* _BCACHE_BTREE_INSERT_H */ - diff --git a/libbcache/buckets.c b/libbcache/buckets.c deleted file mode 100644 index 7be943d1..00000000 --- a/libbcache/buckets.c +++ /dev/null @@ -1,750 +0,0 @@ -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. - * - * Bucket states: - * - free bucket: mark == 0 - * The bucket contains no data and will not be read - * - * - allocator bucket: owned_by_allocator == 1 - * The bucket is on a free list, or it is an open bucket - * - * - cached bucket: owned_by_allocator == 0 && - * dirty_sectors == 0 && - * cached_sectors > 0 - * The bucket contains data but may be safely discarded as there are - * enough replicas of the data on other cache devices, or it has been - * written back to the backing device - * - * - dirty bucket: owned_by_allocator == 0 && - * dirty_sectors > 0 - * The bucket contains data that we must not discard (either only copy, - * or one of the 'main copies' for data requiring multiple replicas) - * - * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 - * This is a btree node, journal or gen/prio bucket - * - * Lifecycle: - * - * bucket invalidated => bucket on freelist => open bucket => - * [dirty bucket =>] cached bucket => bucket invalidated => ... - * - * Note that cache promotion can skip the dirty bucket step, as data - * is copied from a deeper tier to a shallower tier, onto a cached - * bucket. - * Note also that a cached bucket can spontaneously become dirty -- - * see below. 
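[Editor's note: the bucket states listed above reduce to a handful of checks on the mark's fields. A simplified classifier, using a stripped-down, hypothetical stand-in for struct bucket_mark (field names roughly follow the ones used later in this file):]

#include <stdbool.h>
#include <stdio.h>

struct mark {
	bool owned_by_allocator;
	bool is_metadata;
	unsigned dirty_sectors;
	unsigned cached_sectors;
};

const char *bucket_state(struct mark m)
{
	if (m.owned_by_allocator)
		return "allocator";	/* on a freelist or an open bucket */
	if (m.is_metadata)
		return "metadata";	/* btree node, journal, gen/prio bucket */
	if (m.dirty_sectors)
		return "dirty";		/* data we must not discard */
	if (m.cached_sectors)
		return "cached";	/* discardable copy */
	return "free";
}

int main(void)
{
	struct mark m = { .cached_sectors = 8 };

	printf("%s\n", bucket_state(m));	/* prints "cached" */
	return 0;
}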
- * - * Only a traversal of the key space can determine whether a bucket is - * truly dirty or cached. - * - * Transitions: - * - * - free => allocator: bucket was invalidated - * - cached => allocator: bucket was invalidated - * - * - allocator => dirty: open bucket was filled up - * - allocator => cached: open bucket was filled up - * - allocator => metadata: metadata was allocated - * - * - dirty => cached: dirty sectors were copied to a deeper tier - * - dirty => free: dirty sectors were overwritten or moved (copy gc) - * - cached => free: cached sectors were overwritten - * - * - metadata => free: metadata was freed - * - * Oddities: - * - cached => dirty: a device was removed so formerly replicated data - * is no longer sufficiently replicated - * - free => cached: cannot happen - * - free => dirty: cannot happen - * - free => metadata: cannot happen - */ - -#include "bcache.h" -#include "alloc.h" -#include "btree_gc.h" -#include "buckets.h" -#include "error.h" - -#include <linux/preempt.h> -#include <trace/events/bcache.h> - -#ifdef DEBUG_BUCKETS - -#define lg_local_lock lg_global_lock -#define lg_local_unlock lg_global_unlock - -static void bch_fs_stats_verify(struct bch_fs *c) -{ - struct bch_fs_usage stats = - __bch_fs_usage_read(c); - - if ((s64) stats.sectors_dirty < 0) - panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty); - - if ((s64) stats.sectors_cached < 0) - panic("sectors_cached underflow: %lli\n", stats.sectors_cached); - - if ((s64) stats.sectors_meta < 0) - panic("sectors_meta underflow: %lli\n", stats.sectors_meta); - - if ((s64) stats.sectors_persistent_reserved < 0) - panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved); - - if ((s64) stats.sectors_online_reserved < 0) - panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved); -} - -#else - -static void bch_fs_stats_verify(struct bch_fs *c) {} - -#endif - -/* - * Clear journal_seq_valid for buckets for which it's not needed, to prevent - * wraparound: - */ -void bch_bucket_seq_cleanup(struct bch_fs *c) -{ - u16 last_seq_ondisk = c->journal.last_seq_ondisk; - struct bch_dev *ca; - struct bucket *g; - struct bucket_mark m; - unsigned i; - - for_each_member_device(ca, c, i) - for_each_bucket(g, ca) { - bucket_cmpxchg(g, m, ({ - if (!m.journal_seq_valid || - bucket_needs_journal_commit(m, last_seq_ondisk)) - break; - - m.journal_seq_valid = 0; - })); - } -} - -#define bch_usage_add(_acc, _stats) \ -do { \ - typeof(_acc) _a = (_acc), _s = (_stats); \ - unsigned i; \ - \ - for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \ - ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \ -} while (0) - -#define bch_usage_read_raw(_stats) \ -({ \ - typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \ - int cpu; \ - \ - for_each_possible_cpu(cpu) \ - bch_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \ - \ - _acc; \ -}) - -#define bch_usage_read_cached(_c, _cached, _uncached) \ -({ \ - typeof(_cached) _ret; \ - unsigned _seq; \ - \ - do { \ - _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \ - _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \ - ? 
bch_usage_read_raw(_uncached) \ - : (_cached); \ - } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \ - \ - _ret; \ -}) - -struct bch_dev_usage __bch_dev_usage_read(struct bch_dev *ca) -{ - return bch_usage_read_raw(ca->usage_percpu); -} - -struct bch_dev_usage bch_dev_usage_read(struct bch_dev *ca) -{ - return bch_usage_read_cached(ca->fs, - ca->usage_cached, - ca->usage_percpu); -} - -struct bch_fs_usage -__bch_fs_usage_read(struct bch_fs *c) -{ - return bch_usage_read_raw(c->usage_percpu); -} - -struct bch_fs_usage -bch_fs_usage_read(struct bch_fs *c) -{ - return bch_usage_read_cached(c, - c->usage_cached, - c->usage_percpu); -} - -static inline int is_meta_bucket(struct bucket_mark m) -{ - return m.data_type != BUCKET_DATA; -} - -static inline int is_dirty_bucket(struct bucket_mark m) -{ - return m.data_type == BUCKET_DATA && !!m.dirty_sectors; -} - -static inline int is_cached_bucket(struct bucket_mark m) -{ - return m.data_type == BUCKET_DATA && - !m.dirty_sectors && !!m.cached_sectors; -} - -static inline enum s_alloc bucket_type(struct bucket_mark m) -{ - return is_meta_bucket(m) ? S_META : S_DIRTY; -} - -static bool bucket_became_unavailable(struct bch_fs *c, - struct bucket_mark old, - struct bucket_mark new) -{ - return is_available_bucket(old) && - !is_available_bucket(new) && - c && c->gc_pos.phase == GC_PHASE_DONE; -} - -void bch_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *stats, - struct disk_reservation *disk_res, - struct gc_pos gc_pos) -{ - s64 added = - stats->s[S_COMPRESSED][S_META] + - stats->s[S_COMPRESSED][S_DIRTY] + - stats->persistent_reserved + - stats->online_reserved; - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0)); - - if (added > 0) { - disk_res->sectors -= added; - stats->online_reserved -= added; - } - - lg_local_lock(&c->usage_lock); - /* online_reserved not subject to gc: */ - this_cpu_ptr(c->usage_percpu)->online_reserved += - stats->online_reserved; - stats->online_reserved = 0; - - if (!gc_will_visit(c, gc_pos)) - bch_usage_add(this_cpu_ptr(c->usage_percpu), stats); - - bch_fs_stats_verify(c); - lg_local_unlock(&c->usage_lock); - - memset(stats, 0, sizeof(*stats)); -} - -static void bch_fs_usage_update(struct bch_fs_usage *fs_usage, - struct bucket_mark old, struct bucket_mark new) -{ - fs_usage->s[S_COMPRESSED][S_CACHED] += - (int) new.cached_sectors - (int) old.cached_sectors; - fs_usage->s[S_COMPRESSED][bucket_type(old)] -= - old.dirty_sectors; - fs_usage->s[S_COMPRESSED][bucket_type(new)] += - new.dirty_sectors; -} - -static void bch_dev_usage_update(struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage *dev_usage; - - bch_fs_inconsistent_on(old.data_type && new.data_type && - old.data_type != new.data_type, c, - "different types of metadata in same bucket: %u, %u", - old.data_type, new.data_type); - - preempt_disable(); - dev_usage = this_cpu_ptr(ca->usage_percpu); - - dev_usage->sectors[S_CACHED] += - (int) new.cached_sectors - (int) old.cached_sectors; - - dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; - dev_usage->sectors[bucket_type(new)] += new.dirty_sectors; - - dev_usage->buckets_alloc += - (int) new.owned_by_allocator - (int) old.owned_by_allocator; - - dev_usage->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old); - dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old); - dev_usage->buckets_dirty += 
is_dirty_bucket(new) - is_dirty_bucket(old); - preempt_enable(); - - if (!is_available_bucket(old) && is_available_bucket(new)) - bch_wake_allocator(ca); -} - -#define bucket_data_cmpxchg(ca, g, new, expr) \ -({ \ - struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ - \ - bch_dev_usage_update(ca, _old, new); \ - _old; \ -}) - -void bch_invalidate_bucket(struct bch_dev *ca, struct bucket *g) -{ - struct bch_fs_usage stats = { 0 }; - struct bucket_mark old, new; - - old = bucket_data_cmpxchg(ca, g, new, ({ - new.owned_by_allocator = 1; - new.had_metadata = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - new.copygc = 0; - new.gen++; - })); - - /* XXX: we're not actually updating fs usage's cached sectors... */ - bch_fs_usage_update(&stats, old, new); - - if (!old.owned_by_allocator && old.cached_sectors) - trace_bcache_invalidate(ca, g - ca->buckets, - old.cached_sectors); -} - -void bch_mark_free_bucket(struct bch_dev *ca, struct bucket *g) -{ - struct bucket_mark old, new; - - old = bucket_data_cmpxchg(ca, g, new, ({ - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - })); - - BUG_ON(bucket_became_unavailable(ca->fs, old, new)); -} - -void bch_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g, - bool owned_by_allocator) -{ - struct bucket_mark new; - - bucket_data_cmpxchg(ca, g, new, ({ - new.owned_by_allocator = owned_by_allocator; - })); -} - -void bch_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g, - enum bucket_data_type type, - bool may_make_unavailable) -{ - struct bucket_mark old, new; - - BUG_ON(!type); - - old = bucket_data_cmpxchg(ca, g, new, ({ - new.data_type = type; - new.had_metadata = 1; - })); - - BUG_ON(old.cached_sectors); - BUG_ON(old.dirty_sectors); - BUG_ON(!may_make_unavailable && - bucket_became_unavailable(ca->fs, old, new)); -} - -#define saturated_add(ca, dst, src, max) \ -do { \ - BUG_ON((int) (dst) + (src) < 0); \ - if ((dst) == (max)) \ - ; \ - else if ((dst) + (src) <= (max)) \ - dst += (src); \ - else { \ - dst = (max); \ - trace_bcache_sectors_saturated(ca); \ - } \ -} while (0) - -#if 0 -/* Reverting this until the copygc + compression issue is fixed: */ - -static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) -{ - return crc_compression_type(crc) - ? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc) - : sectors; -} - -static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors) -{ - return crc_compression_type(crc) - ? min_t(unsigned, crc_compressed_size(crc), sectors) - : sectors; -} -#else -static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) -{ - return sectors; -} - -static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors) -{ - return sectors; -} -#endif - -/* - * Checking against gc's position has to be done here, inside the cmpxchg() - * loop, to avoid racing with the start of gc clearing all the marks - GC does - * that with the gc pos seqlock held. 
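[Editor's note: the saturated_add() macro defined a bit earlier clamps the 16-bit per-bucket sector counts at a maximum instead of letting them wrap; once a count pins at the maximum, only a full GC can bring it back down (see GC_MAX_SECTORS_USED in buckets.h). The same logic as a plain function, without the tracepoint:]

#include <assert.h>

unsigned saturating_add(unsigned dst, int delta, unsigned max)
{
	assert((int) dst + delta >= 0);	/* going negative is a bug, as in the macro */

	if (dst == max)
		return max;		/* already saturated: stay put */
	if ((int) dst + delta <= (int) max)
		return dst + delta;
	return max;			/* saturate; the real macro also emits a tracepoint */
}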
- */ -static void bch_mark_pointer(struct bch_fs *c, - struct bkey_s_c_extent e, - const union bch_extent_crc *crc, - const struct bch_extent_ptr *ptr, - s64 sectors, enum s_alloc type, - bool may_make_unavailable, - struct bch_fs_usage *stats, - bool gc_will_visit, u64 journal_seq) -{ - struct bucket_mark old, new; - unsigned saturated; - struct bch_dev *ca = c->devs[ptr->dev]; - struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); - unsigned old_sectors, new_sectors; - int disk_sectors, compressed_sectors; - - if (sectors > 0) { - old_sectors = 0; - new_sectors = sectors; - } else { - old_sectors = e.k->size; - new_sectors = e.k->size + sectors; - } - - disk_sectors = -__disk_sectors(crc, old_sectors) - + __disk_sectors(crc, new_sectors); - compressed_sectors = -__compressed_sectors(crc, old_sectors) - + __compressed_sectors(crc, new_sectors); - - if (gc_will_visit) { - if (journal_seq) - bucket_cmpxchg(g, new, new.journal_seq = journal_seq); - - goto out; - } - - old = bucket_data_cmpxchg(ca, g, new, ({ - saturated = 0; - - /* - * Check this after reading bucket mark to guard against - * the allocator invalidating a bucket after we've already - * checked the gen - */ - if (gen_after(new.gen, ptr->gen)) { - EBUG_ON(type != S_CACHED && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - return; - } - - EBUG_ON(type != S_CACHED && - !may_make_unavailable && - is_available_bucket(new) && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - - if (type != S_CACHED && - new.dirty_sectors == GC_MAX_SECTORS_USED && - disk_sectors < 0) - saturated = -disk_sectors; - - if (type == S_CACHED) - saturated_add(ca, new.cached_sectors, disk_sectors, - GC_MAX_SECTORS_USED); - else - saturated_add(ca, new.dirty_sectors, disk_sectors, - GC_MAX_SECTORS_USED); - - if (!new.dirty_sectors && - !new.cached_sectors) { - new.data_type = 0; - - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - } else { - new.data_type = type == S_META - ? BUCKET_BTREE : BUCKET_DATA; - } - - new.had_metadata |= is_meta_bucket(new); - })); - - BUG_ON(!may_make_unavailable && - bucket_became_unavailable(c, old, new)); - - if (saturated && - atomic_long_add_return(saturated, - &ca->saturated_count) >= - ca->free_inc.size << ca->bucket_bits) { - if (c->gc_thread) { - trace_bcache_gc_sectors_saturated(c); - wake_up_process(c->gc_thread); - } - } -out: - stats->s[S_COMPRESSED][type] += compressed_sectors; - stats->s[S_UNCOMPRESSED][type] += sectors; -} - -static void bch_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e, - s64 sectors, bool metadata, - bool may_make_unavailable, - struct bch_fs_usage *stats, - bool gc_will_visit, u64 journal_seq) -{ - const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - enum s_alloc type = metadata ? S_META : S_DIRTY; - - BUG_ON(metadata && bkey_extent_is_cached(e.k)); - BUG_ON(!sectors); - - extent_for_each_ptr_crc(e, ptr, crc) - bch_mark_pointer(c, e, crc, ptr, sectors, - ptr->cached ? 
S_CACHED : type, - may_make_unavailable, - stats, gc_will_visit, journal_seq); -} - -static void __bch_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, - bool may_make_unavailable, - struct bch_fs_usage *stats, - bool gc_will_visit, u64 journal_seq) -{ - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata, - may_make_unavailable, stats, - gc_will_visit, journal_seq); - break; - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - stats->persistent_reserved += r.v->nr_replicas * sectors; - break; - } - } -} - -void __bch_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, - struct bch_fs_usage *stats) -{ - __bch_mark_key(c, k, sectors, metadata, true, stats, false, 0); -} - -void bch_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata) -{ - struct bch_fs_usage stats = { 0 }; - - __bch_gc_mark_key(c, k, sectors, metadata, &stats); - - preempt_disable(); - bch_usage_add(this_cpu_ptr(c->usage_percpu), &stats); - preempt_enable(); -} - -void bch_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, struct gc_pos gc_pos, - struct bch_fs_usage *stats, u64 journal_seq) -{ - /* - * synchronization w.r.t. GC: - * - * Normally, bucket sector counts/marks are updated on the fly, as - * references are added/removed from the btree, the lists of buckets the - * allocator owns, other metadata buckets, etc. - * - * When GC is in progress and going to mark this reference, we do _not_ - * mark this reference here, to avoid double counting - GC will count it - * when it gets to it. - * - * To know whether we should mark a given reference (GC either isn't - * running, or has already marked references at this position) we - * construct a total order for everything GC walks. Then, we can simply - * compare the position of the reference we're marking - @gc_pos - with - * GC's current position. If GC is going to mark this reference, GC's - * current position will be less than @gc_pos; if GC's current position - * is greater than @gc_pos GC has either already walked this position, - * or isn't running. - * - * To avoid racing with GC's position changing, we have to deal with - * - GC's position being set to GC_POS_MIN when GC starts: - * usage_lock guards against this - * - GC's position overtaking @gc_pos: we guard against this with - * whatever lock protects the data structure the reference lives in - * (e.g. the btree node lock, or the relevant allocator lock). 
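[Editor's note: the ordering argument in the comment above boils down to a three-way compare on GC's walk position: if GC's current position is still before the reference being updated, GC will account for that reference itself when it gets there, so marking it here too would double count. A simplified sketch of that check; the real gc_pos also carries a btree id and level:]

#include <stdbool.h>

/* simplified stand-in for struct gc_pos */
struct gc_pos {
	int		phase;		/* GC walks phases in a fixed order */
	unsigned long	offset;		/* position within the current phase */
};

int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
	if (l.phase != r.phase)
		return l.phase < r.phase ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

/*
 * True if GC hasn't reached @pos yet and will therefore mark the reference
 * at @pos itself -- in which case the caller must not mark it here, to
 * avoid counting it twice.
 */
bool gc_will_visit_pos(struct gc_pos gc_cur, struct gc_pos pos)
{
	return gc_pos_cmp(gc_cur, pos) < 0;
}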
- */ - lg_local_lock(&c->usage_lock); - __bch_mark_key(c, k, sectors, metadata, false, stats, - gc_will_visit(c, gc_pos), journal_seq); - - bch_fs_stats_verify(c); - lg_local_unlock(&c->usage_lock); -} - -static u64 __recalc_sectors_available(struct bch_fs *c) -{ - return c->capacity - bch_fs_sectors_used(c); -} - -/* Used by gc when it's starting: */ -void bch_recalc_sectors_available(struct bch_fs *c) -{ - int cpu; - - lg_global_lock(&c->usage_lock); - - for_each_possible_cpu(cpu) - per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; - - atomic64_set(&c->sectors_available, - __recalc_sectors_available(c)); - - lg_global_unlock(&c->usage_lock); -} - -void bch_disk_reservation_put(struct bch_fs *c, - struct disk_reservation *res) -{ - if (res->sectors) { - lg_local_lock(&c->usage_lock); - this_cpu_sub(c->usage_percpu->online_reserved, - res->sectors); - - bch_fs_stats_verify(c); - lg_local_unlock(&c->usage_lock); - - res->sectors = 0; - } -} - -#define SECTORS_CACHE 1024 - -int bch_disk_reservation_add(struct bch_fs *c, - struct disk_reservation *res, - unsigned sectors, int flags) -{ - struct bch_fs_usage *stats; - u64 old, new, v; - s64 sectors_available; - int ret; - - sectors *= res->nr_replicas; - - lg_local_lock(&c->usage_lock); - stats = this_cpu_ptr(c->usage_percpu); - - if (sectors >= stats->available_cache) - goto out; - - v = atomic64_read(&c->sectors_available); - do { - old = v; - if (old < sectors) { - lg_local_unlock(&c->usage_lock); - goto recalculate; - } - - new = max_t(s64, 0, old - sectors - SECTORS_CACHE); - } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, new)) != old); - - stats->available_cache += old - new; -out: - stats->available_cache -= sectors; - stats->online_reserved += sectors; - res->sectors += sectors; - - bch_fs_stats_verify(c); - lg_local_unlock(&c->usage_lock); - return 0; - -recalculate: - /* - * GC recalculates sectors_available when it starts, so that hopefully - * we don't normally end up blocking here: - */ - - /* - * Piss fuck, we can be called from extent_insert_fixup() with btree - * locks held: - */ - - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) { - if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD)) - down_read(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) - return -EINTR; - } - lg_global_lock(&c->usage_lock); - - sectors_available = __recalc_sectors_available(c); - - if (sectors <= sectors_available || - (flags & BCH_DISK_RESERVATION_NOFAIL)) { - atomic64_set(&c->sectors_available, - max_t(s64, 0, sectors_available - sectors)); - stats->online_reserved += sectors; - res->sectors += sectors; - ret = 0; - } else { - atomic64_set(&c->sectors_available, sectors_available); - ret = -ENOSPC; - } - - bch_fs_stats_verify(c); - lg_global_unlock(&c->usage_lock); - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) - up_read(&c->gc_lock); - - return ret; -} - -int bch_disk_reservation_get(struct bch_fs *c, - struct disk_reservation *res, - unsigned sectors, int flags) -{ - res->sectors = 0; - res->gen = c->capacity_gen; - res->nr_replicas = (flags & BCH_DISK_RESERVATION_METADATA) - ? c->opts.metadata_replicas - : c->opts.data_replicas; - - return bch_disk_reservation_add(c, res, sectors, flags); -} diff --git a/libbcache/buckets.h b/libbcache/buckets.h deleted file mode 100644 index 81355576..00000000 --- a/libbcache/buckets.h +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. 
- */ - -#ifndef _BUCKETS_H -#define _BUCKETS_H - -#include "buckets_types.h" -#include "super.h" - -#define for_each_bucket(b, ca) \ - for (b = (ca)->buckets + (ca)->mi.first_bucket; \ - b < (ca)->buckets + (ca)->mi.nbuckets; b++) - -#define bucket_cmpxchg(g, new, expr) \ -({ \ - u64 _v = READ_ONCE((g)->_mark.counter); \ - struct bucket_mark _old; \ - \ - do { \ - (new).counter = _old.counter = _v; \ - expr; \ - } while ((_v = cmpxchg(&(g)->_mark.counter, \ - _old.counter, \ - (new).counter)) != _old.counter);\ - _old; \ -}) - -/* - * bucket_gc_gen() returns the difference between the bucket's current gen and - * the oldest gen of any pointer into that bucket in the btree. - */ - -static inline u8 bucket_gc_gen(struct bch_dev *ca, struct bucket *g) -{ - unsigned long r = g - ca->buckets; - return g->mark.gen - ca->oldest_gens[r]; -} - -static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return sector_to_bucket(ca, ptr->offset); -} - -/* - * Returns 0 if no pointers or device offline - only for tracepoints! - */ -static inline size_t PTR_BUCKET_NR_TRACE(const struct bch_fs *c, - const struct bkey_i *k, - unsigned ptr) -{ - size_t bucket = 0; -#if 0 - if (bkey_extent_is_data(&k->k)) { - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(bkey_i_to_s_c_extent(k), ptr) { - const struct bch_dev *ca = c->devs[ptr->dev]; - bucket = PTR_BUCKET_NR(ca, ptr); - break; - } - } -#endif - return bucket; -} - -static inline struct bucket *PTR_BUCKET(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return ca->buckets + PTR_BUCKET_NR(ca, ptr); -} - -static inline u8 __gen_after(u8 a, u8 b) -{ - u8 r = a - b; - - return r > 128U ? 0 : r; -} - -static inline u8 gen_after(u8 a, u8 b) -{ - u8 r = a - b; - - BUG_ON(r > 128U); - - return r; -} - -/** - * ptr_stale() - check if a pointer points into a bucket that has been - * invalidated. - */ -static inline u8 ptr_stale(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); -} - -/* bucket heaps */ - -static inline bool bucket_min_cmp(struct bucket_heap_entry l, - struct bucket_heap_entry r) -{ - return l.val < r.val; -} - -static inline bool bucket_max_cmp(struct bucket_heap_entry l, - struct bucket_heap_entry r) -{ - return l.val > r.val; -} - -static inline void bucket_heap_push(struct bch_dev *ca, struct bucket *g, - unsigned long val) -{ - struct bucket_heap_entry new = { g, val }; - - if (!heap_full(&ca->heap)) - heap_add(&ca->heap, new, bucket_min_cmp); - else if (bucket_min_cmp(new, heap_peek(&ca->heap))) { - ca->heap.data[0] = new; - heap_sift(&ca->heap, 0, bucket_min_cmp); - } -} - -/* bucket gc marks */ - -/* The dirty and cached sector counts saturate. If this occurs, - * reference counting alone will not free the bucket, and a btree - * GC must be performed. 
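[Editor's note: bucket_cmpxchg() above is the standard lock-free read-modify-write loop: the whole mark lives in one 64-bit word, so an update snapshots it, edits the copy, and retries if another CPU changed the word in the meantime. A self-contained sketch with C11 atomics; the fields here are a reduced, hypothetical subset of struct bucket_mark:]

#include <stdatomic.h>
#include <stdint.h>

/* all fields share one 64-bit word so a whole mark updates in one CAS */
union mark {
	uint64_t counter;
	struct {
		uint8_t  gen;
		uint16_t dirty_sectors;
		uint16_t cached_sectors;
	};
};

/* add dirty sectors to one bucket's mark without taking a lock */
union mark mark_add_dirty(_Atomic uint64_t *mark, uint16_t sectors)
{
	union mark old, new;

	old.counter = atomic_load(mark);
	do {
		new = old;
		new.dirty_sectors += sectors;
	} while (!atomic_compare_exchange_weak(mark, &old.counter, new.counter));

	return old;	/* comparing old vs. new is what feeds the usage accounting */
}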
*/ -#define GC_MAX_SECTORS_USED ((1U << 15) - 1) - -static inline bool bucket_unused(struct bucket *g) -{ - return !g->mark.counter; -} - -static inline unsigned bucket_sectors_used(struct bucket *g) -{ - return g->mark.dirty_sectors + g->mark.cached_sectors; -} - -/* Per device stats: */ - -struct bch_dev_usage __bch_dev_usage_read(struct bch_dev *); -struct bch_dev_usage bch_dev_usage_read(struct bch_dev *); - -static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage stats) -{ - return max_t(s64, 0, - ca->mi.nbuckets - ca->mi.first_bucket - - stats.buckets_dirty - - stats.buckets_alloc - - stats.buckets_meta); -} - -/* - * Number of reclaimable buckets - only for use by the allocator thread: - */ -static inline u64 dev_buckets_available(struct bch_dev *ca) -{ - return __dev_buckets_available(ca, bch_dev_usage_read(ca)); -} - -static inline u64 __dev_buckets_free(struct bch_dev *ca, - struct bch_dev_usage stats) -{ - return __dev_buckets_available(ca, stats) + - fifo_used(&ca->free[RESERVE_NONE]) + - fifo_used(&ca->free_inc); -} - -static inline u64 dev_buckets_free(struct bch_dev *ca) -{ - return __dev_buckets_free(ca, bch_dev_usage_read(ca)); -} - -/* Cache set stats: */ - -struct bch_fs_usage __bch_fs_usage_read(struct bch_fs *); -struct bch_fs_usage bch_fs_usage_read(struct bch_fs *); -void bch_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); - -static inline u64 __bch_fs_sectors_used(struct bch_fs *c) -{ - struct bch_fs_usage stats = __bch_fs_usage_read(c); - u64 reserved = stats.persistent_reserved + - stats.online_reserved; - - return stats.s[S_COMPRESSED][S_META] + - stats.s[S_COMPRESSED][S_DIRTY] + - reserved + - (reserved >> 7); -} - -static inline u64 bch_fs_sectors_used(struct bch_fs *c) -{ - return min(c->capacity, __bch_fs_sectors_used(c)); -} - -/* XXX: kill? 
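One detail of __bch_fs_sectors_used() above that is easy to miss: reserved + (reserved >> 7) counts every reserved sector plus 1/128th extra, i.e. roughly 0.8% of slop, so the filesystem reports itself full slightly before reservations could genuinely exhaust the device. As a worked example, 1,048,576 reserved sectors (512 MiB) contribute 1,048,576 + 8,192 = 1,056,768 sectors (516 MiB) to the total that bch_fs_sectors_used() then caps at c->capacity.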
*/ -static inline u64 sectors_available(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - u64 ret = 0; - - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i) - ret += dev_buckets_available(ca) << ca->bucket_bits; - rcu_read_unlock(); - - return ret; -} - -static inline bool is_available_bucket(struct bucket_mark mark) -{ - return (!mark.owned_by_allocator && - mark.data_type == BUCKET_DATA && - !mark.dirty_sectors && - !mark.nouse); -} - -static inline bool bucket_needs_journal_commit(struct bucket_mark m, - u16 last_seq_ondisk) -{ - return m.journal_seq_valid && - ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -} - -void bch_bucket_seq_cleanup(struct bch_fs *); - -void bch_invalidate_bucket(struct bch_dev *, struct bucket *); -void bch_mark_free_bucket(struct bch_dev *, struct bucket *); -void bch_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool); -void bch_mark_metadata_bucket(struct bch_dev *, struct bucket *, - enum bucket_data_type, bool); - -void __bch_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, - struct bch_fs_usage *); -void bch_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool); -void bch_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, - struct gc_pos, struct bch_fs_usage *, u64); - -void bch_recalc_sectors_available(struct bch_fs *); - -void bch_disk_reservation_put(struct bch_fs *, - struct disk_reservation *); - -#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -#define BCH_DISK_RESERVATION_METADATA (1 << 1) -#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 2) -#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 3) - -int bch_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - unsigned, int); -int bch_disk_reservation_get(struct bch_fs *, - struct disk_reservation *, - unsigned, int); - -#endif /* _BUCKETS_H */ diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h deleted file mode 100644 index ca187099..00000000 --- a/libbcache/buckets_types.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef _BUCKETS_TYPES_H -#define _BUCKETS_TYPES_H - -enum bucket_data_type { - BUCKET_DATA = 0, - BUCKET_BTREE, - BUCKET_PRIOS, - BUCKET_JOURNAL, - BUCKET_SB, -}; - -struct bucket_mark { - union { - struct { - u64 counter; - }; - - struct { - u8 gen; - - /* generation copygc is going to move this bucket into */ - unsigned copygc:1; - - unsigned journal_seq_valid:1; - - /* - * If this bucket had metadata while at the current generation - * number, the allocator must increment its gen before we reuse - * it: - */ - unsigned had_metadata:1; - - unsigned owned_by_allocator:1; - - unsigned data_type:3; - - unsigned nouse:1; - - u16 dirty_sectors; - u16 cached_sectors; - - /* - * low bits of journal sequence number when this bucket was most - * recently modified: if journal_seq_valid is set, this bucket - * can't be reused until the journal sequence number written to - * disk is >= the bucket's journal sequence number: - */ - u16 journal_seq; - }; - }; -}; - -struct bucket { - union { - struct { - u16 read_prio; - u16 write_prio; - }; - u16 prio[2]; - }; - - union { - struct bucket_mark _mark; - const struct bucket_mark mark; - }; -}; - -enum s_compressed { - S_COMPRESSED, - S_UNCOMPRESSED, - S_COMPRESSED_NR, -}; - -enum s_alloc { - S_META, - S_DIRTY, - S_CACHED, - S_ALLOC_NR, -}; - -struct bch_dev_usage { - u64 buckets_dirty; - u64 buckets_cached; - u64 buckets_meta; - u64 buckets_alloc; - - u64 sectors[S_ALLOC_NR]; -}; - -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ - u64 s[S_COMPRESSED_NR][S_ALLOC_NR]; 
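bucket_needs_journal_commit() above compares 16-bit journal sequence numbers that are allowed to wrap: casting to s16 before subtracting gives the right answer as long as the two values are within 2^15 of each other. A small standalone illustration of that wrap-safe comparison (the values are arbitrary):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Wrap-safe "a is newer than b" for 16-bit sequence numbers, as in
 * bucket_needs_journal_commit(). */
static bool seq_after(uint16_t a, uint16_t b)
{
	return (int16_t)a - (int16_t)b > 0;
}

int main(void)
{
	assert(seq_after(10, 5));	/* ordinary case */
	assert(seq_after(3, 0xfffe));	/* still newer, even though the counter wrapped */
	assert(!seq_after(0xfffe, 3));
	return 0;
}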
- u64 persistent_reserved; - u64 online_reserved; - u64 available_cache; -}; - -struct bucket_heap_entry { - struct bucket *g; - unsigned long val; -}; - -/* - * A reservation for space on disk: - */ -struct disk_reservation { - u64 sectors; - u32 gen; - unsigned nr_replicas; -}; - -#endif /* _BUCKETS_TYPES_H */ diff --git a/libbcache/chardev.c b/libbcache/chardev.c deleted file mode 100644 index da6d827f..00000000 --- a/libbcache/chardev.c +++ /dev/null @@ -1,407 +0,0 @@ -#include "bcache.h" -#include "super.h" -#include "super-io.h" - -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/major.h> -#include <linux/cdev.h> -#include <linux/device.h> -#include <linux/ioctl.h> -#include <linux/uaccess.h> -#include <linux/slab.h> -#include <linux/bcache-ioctl.h> - -static long bch_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -{ - struct bch_ioctl_assemble arg; - const char *err; - u64 *user_devs = NULL; - char **devs = NULL; - unsigned i; - int ret = -EFAULT; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); - if (!devs) - return -ENOMEM; - - devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); - - if (copy_from_user(user_devs, user_arg->devs, - sizeof(u64) * arg.nr_devs)) - goto err; - - for (i = 0; i < arg.nr_devs; i++) { - devs[i] = strndup_user((const char __user *)(unsigned long) - user_devs[i], - PATH_MAX); - if (!devs[i]) { - ret = -ENOMEM; - goto err; - } - } - - err = bch_fs_open(devs, arg.nr_devs, bch_opts_empty(), NULL); - if (err) { - pr_err("Could not open filesystem: %s", err); - ret = -EINVAL; - goto err; - } - - ret = 0; -err: - if (devs) - for (i = 0; i < arg.nr_devs; i++) - kfree(devs[i]); - kfree(devs); - return ret; -} - -static long bch_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -{ - struct bch_ioctl_incremental arg; - const char *err; - char *path; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; - - err = bch_fs_open_incremental(path); - kfree(path); - - if (err) { - pr_err("Could not register bcache devices: %s", err); - return -EINVAL; - } - - return 0; -} - -static long bch_global_ioctl(unsigned cmd, void __user *arg) -{ - switch (cmd) { - case BCH_IOCTL_ASSEMBLE: - return bch_ioctl_assemble(arg); - case BCH_IOCTL_INCREMENTAL: - return bch_ioctl_incremental(arg); - default: - return -ENOTTY; - } -} - -static long bch_ioctl_query_uuid(struct bch_fs *c, - struct bch_ioctl_query_uuid __user *user_arg) -{ - return copy_to_user(&user_arg->uuid, - &c->sb.user_uuid, - sizeof(c->sb.user_uuid)); -} - -static long bch_ioctl_start(struct bch_fs *c, struct bch_ioctl_start __user *user_arg) -{ - struct bch_ioctl_start arg; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - return bch_fs_start(c) ? 
-EIO : 0; -} - -static long bch_ioctl_stop(struct bch_fs *c) -{ - bch_fs_stop(c); - return 0; -} - -/* returns with ref on ca->ref */ -static struct bch_dev *bch_device_lookup(struct bch_fs *c, - const char __user *dev) -{ - struct block_device *bdev; - struct bch_dev *ca; - char *path; - unsigned i; - - path = strndup_user(dev, PATH_MAX); - if (!path) - return ERR_PTR(-ENOMEM); - - bdev = lookup_bdev(strim(path)); - kfree(path); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); - - for_each_member_device(ca, c, i) - if (ca->disk_sb.bdev == bdev) - goto found; - - ca = NULL; -found: - bdput(bdev); - return ca; -} - -#if 0 -static struct bch_member *bch_uuid_lookup(struct bch_fs *c, uuid_le uuid) -{ - struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb); - unsigned i; - - lockdep_assert_held(&c->sb_lock); - - for (i = 0; i < c->disk_sb->nr_devices; i++) - if (!memcmp(&mi->members[i].uuid, &uuid, sizeof(uuid))) - return &mi->members[i]; - - return NULL; -} -#endif - -static long bch_ioctl_disk_add(struct bch_fs *c, - struct bch_ioctl_disk __user *user_arg) -{ - struct bch_ioctl_disk arg; - char *path; - int ret; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; - - ret = bch_dev_add(c, path); - kfree(path); - - return ret; -} - -static long bch_ioctl_disk_remove(struct bch_fs *c, - struct bch_ioctl_disk __user *user_arg) -{ - struct bch_ioctl_disk arg; - struct bch_dev *ca; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - return bch_dev_remove(c, ca, arg.flags); -} - -static long bch_ioctl_disk_online(struct bch_fs *c, - struct bch_ioctl_disk __user *user_arg) -{ - struct bch_ioctl_disk arg; - char *path; - int ret; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; - - ret = bch_dev_online(c, path); - kfree(path); - return ret; -} - -static long bch_ioctl_disk_offline(struct bch_fs *c, - struct bch_ioctl_disk __user *user_arg) -{ - struct bch_ioctl_disk arg; - struct bch_dev *ca; - int ret; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.pad) - return -EINVAL; - - ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch_dev_offline(c, ca, arg.flags); - percpu_ref_put(&ca->ref); - return ret; -} - -static long bch_ioctl_disk_set_state(struct bch_fs *c, - struct bch_ioctl_disk_set_state __user *user_arg) -{ - struct bch_ioctl_disk_set_state arg; - struct bch_dev *ca; - int ret; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch_dev_set_state(c, ca, arg.new_state, arg.flags); - - percpu_ref_put(&ca->ref); - return ret; -} - -static long bch_ioctl_disk_evacuate(struct bch_fs *c, - struct bch_ioctl_disk __user *user_arg) -{ - struct bch_ioctl_disk arg; - struct bch_dev *ca; - int ret; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev); - if 
(IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch_dev_evacuate(c, ca); - - percpu_ref_put(&ca->ref); - return ret; -} - -long bch_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -{ - /* ioctls that don't require admin cap: */ - switch (cmd) { - case BCH_IOCTL_QUERY_UUID: - return bch_ioctl_query_uuid(c, arg); - } - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* ioctls that do require admin cap: */ - switch (cmd) { - case BCH_IOCTL_START: - return bch_ioctl_start(c, arg); - case BCH_IOCTL_STOP: - return bch_ioctl_stop(c); - - case BCH_IOCTL_DISK_ADD: - return bch_ioctl_disk_add(c, arg); - case BCH_IOCTL_DISK_REMOVE: - return bch_ioctl_disk_remove(c, arg); - case BCH_IOCTL_DISK_ONLINE: - return bch_ioctl_disk_online(c, arg); - case BCH_IOCTL_DISK_OFFLINE: - return bch_ioctl_disk_offline(c, arg); - case BCH_IOCTL_DISK_SET_STATE: - return bch_ioctl_disk_set_state(c, arg); - case BCH_IOCTL_DISK_EVACUATE: - return bch_ioctl_disk_evacuate(c, arg); - - default: - return -ENOTTY; - } -} - -static long bch_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -{ - struct bch_fs *c = filp->private_data; - void __user *arg = (void __user *) v; - - return c - ? bch_fs_ioctl(c, cmd, arg) - : bch_global_ioctl(cmd, arg); -} - -static const struct file_operations bch_chardev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = bch_chardev_ioctl, - .open = nonseekable_open, -}; - -static int bch_chardev_major; -static struct class *bch_chardev_class; -static struct device *bch_chardev; -static DEFINE_IDR(bch_chardev_minor); - -void bch_fs_chardev_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->chardev)) - device_unregister(c->chardev); - if (c->minor >= 0) - idr_remove(&bch_chardev_minor, c->minor); -} - -int bch_fs_chardev_init(struct bch_fs *c) -{ - c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); - if (c->minor < 0) - return c->minor; - - c->chardev = device_create(bch_chardev_class, NULL, - MKDEV(bch_chardev_major, c->minor), NULL, - "bcache%u-ctl", c->minor); - if (IS_ERR(c->chardev)) - return PTR_ERR(c->chardev); - - return 0; -} - -void bch_chardev_exit(void) -{ - if (!IS_ERR_OR_NULL(bch_chardev_class)) - device_destroy(bch_chardev_class, - MKDEV(bch_chardev_major, 255)); - if (!IS_ERR_OR_NULL(bch_chardev_class)) - class_destroy(bch_chardev_class); - if (bch_chardev_major > 0) - unregister_chrdev(bch_chardev_major, "bcache"); -} - -int __init bch_chardev_init(void) -{ - bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops); - if (bch_chardev_major < 0) - return bch_chardev_major; - - bch_chardev_class = class_create(THIS_MODULE, "bcache"); - if (IS_ERR(bch_chardev_class)) - return PTR_ERR(bch_chardev_class); - - bch_chardev = device_create(bch_chardev_class, NULL, - MKDEV(bch_chardev_major, 255), - NULL, "bcache-ctl"); - if (IS_ERR(bch_chardev)) - return PTR_ERR(bch_chardev); - - return 0; -} diff --git a/libbcache/chardev.h b/libbcache/chardev.h deleted file mode 100644 index 61a4c2b5..00000000 --- a/libbcache/chardev.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _BCACHE_CHARDEV_H -#define _BCACHE_CHARDEV_H - -#ifndef NO_BCACHE_CHARDEV - -long bch_fs_ioctl(struct bch_fs *, unsigned, void __user *); - -void bch_fs_chardev_exit(struct bch_fs *); -int bch_fs_chardev_init(struct bch_fs *); - -void bch_chardev_exit(void); -int __init bch_chardev_init(void); - -#else - -static inline long bch_fs_ioctl(struct bch_fs *c, - unsigned cmd, void __user * arg) -{ - return -ENOSYS; -} - -static inline void bch_fs_chardev_exit(struct bch_fs *c) {} -static 
inline int bch_fs_chardev_init(struct bch_fs *c) { return 0; } - -static inline void bch_chardev_exit(void) {} -static inline int __init bch_chardev_init(void) { return 0; } - -#endif - -#endif /* _BCACHE_CHARDEV_H */ diff --git a/libbcache/checksum.c b/libbcache/checksum.c deleted file mode 100644 index b96050db..00000000 --- a/libbcache/checksum.c +++ /dev/null @@ -1,590 +0,0 @@ - -#include "bcache.h" -#include "checksum.h" -#include "super.h" -#include "super-io.h" - -#include <linux/crc32c.h> -#include <linux/crypto.h> -#include <linux/key.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <crypto/algapi.h> -#include <crypto/chacha20.h> -#include <crypto/hash.h> -#include <crypto/poly1305.h> -#include <keys/user-type.h> - -/* - * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any - * use permitted, subject to terms of PostgreSQL license; see.) - - * If we have a 64-bit integer type, then a 64-bit CRC looks just like the - * usual sort of implementation. (See Ross Williams' excellent introduction - * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from - * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) - * If we have no working 64-bit type, then fake it with two 32-bit registers. - * - * The present implementation is a normal (not "reflected", in Williams' - * terms) 64-bit CRC, using initial all-ones register contents and a final - * bit inversion. The chosen polynomial is borrowed from the DLT1 spec - * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): - * - * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + - * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + - * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + - * x^7 + x^4 + x + 1 -*/ - -static const u64 crc_table[256] = { - 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, - 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, - 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, - 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, - 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, - 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, - 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, - 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, - 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, - 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, - 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, - 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, - 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, - 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, - 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, - 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, - 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, - 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, - 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, - 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, - 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, - 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, - 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, - 0xD1C685BB4DC4FB63ULL, 
0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, - 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, - 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, - 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, - 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, - 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, - 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, - 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, - 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, - 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, - 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, - 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, - 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, - 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, - 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, - 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, - 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, - 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, - 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, - 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, - 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, - 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, - 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, - 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, - 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, - 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, - 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, - 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, - 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, - 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, - 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, - 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, - 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, - 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, - 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, - 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, - 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, - 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, - 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, - 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, - 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, - 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, - 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, - 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, - 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, - 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, - 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, - 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, - 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, - 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, - 0x1FB5719CE1045206ULL, 
0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, - 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, - 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, - 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, - 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, - 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, - 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, - 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, - 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, - 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, - 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, - 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, - 0x9AFCE626CE85B507ULL, -}; - -u64 bch_crc64_update(u64 crc, const void *_data, size_t len) -{ - const unsigned char *data = _data; - - while (len--) { - int i = ((int) (crc >> 56) ^ *data++) & 0xFF; - crc = crc_table[i] ^ (crc << 8); - } - - return crc; -} - -static u64 bch_checksum_init(unsigned type) -{ - switch (type) { - case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C: - return U32_MAX; - case BCH_CSUM_CRC64: - return U64_MAX; - default: - BUG(); - } -} - -static u64 bch_checksum_final(unsigned type, u64 crc) -{ - switch (type) { - case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C: - return crc ^ U32_MAX; - case BCH_CSUM_CRC64: - return crc ^ U64_MAX; - default: - BUG(); - } -} - -static u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len) -{ - switch (type) { - case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C: - return crc32c(crc, data, len); - case BCH_CSUM_CRC64: - return bch_crc64_update(crc, data, len); - default: - BUG(); - } -} - -static inline void do_encrypt_sg(struct crypto_blkcipher *tfm, - struct nonce nonce, - struct scatterlist *sg, size_t len) -{ - struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d }; - int ret; - - ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); - BUG_ON(ret); -} - -static inline void do_encrypt(struct crypto_blkcipher *tfm, - struct nonce nonce, - void *buf, size_t len) -{ - struct scatterlist sg; - - sg_init_one(&sg, buf, len); - do_encrypt_sg(tfm, nonce, &sg, len); -} - -int bch_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, - void *buf, size_t len) -{ - struct crypto_blkcipher *chacha20 = - crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC); - int ret; - - if (!chacha20) - return PTR_ERR(chacha20); - - ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key)); - if (ret) - goto err; - - do_encrypt(chacha20, nonce, buf, len); -err: - crypto_free_blkcipher(chacha20); - return ret; -} - -static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, - struct nonce nonce) -{ - u8 key[POLY1305_KEY_SIZE]; - - nonce.d[3] ^= BCH_NONCE_POLY; - - memset(key, 0, sizeof(key)); - do_encrypt(c->chacha20, nonce, key, sizeof(key)); - - desc->tfm = c->poly1305; - desc->flags = 0; - crypto_shash_init(desc); - crypto_shash_update(desc, key, sizeof(key)); -} - -struct bch_csum bch_checksum(struct bch_fs *c, unsigned type, - struct nonce nonce, const void *data, size_t len) -{ - switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: { - u64 crc = bch_checksum_init(type); - - crc = bch_checksum_update(type, crc, data, len); - crc = bch_checksum_final(type, crc); - - return (struct bch_csum) { .lo = crc }; - } - - case 
BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { - SHASH_DESC_ON_STACK(desc, c->poly1305); - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - gen_poly_key(c, desc, nonce); - - crypto_shash_update(desc, data, len); - crypto_shash_final(desc, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - BUG(); - } -} - -void bch_encrypt(struct bch_fs *c, unsigned type, - struct nonce nonce, void *data, size_t len) -{ - if (!bch_csum_type_is_encryption(type)) - return; - - do_encrypt(c->chacha20, nonce, data, len); -} - -struct bch_csum bch_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bio_vec bv; - struct bvec_iter iter; - - switch (type) { - case BCH_CSUM_NONE: - return (struct bch_csum) { 0 }; - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: { - u64 crc = bch_checksum_init(type); - - bio_for_each_contig_segment(bv, bio, iter) { - void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; - crc = bch_checksum_update(type, - crc, p, bv.bv_len); - kunmap_atomic(p); - } - - crc = bch_checksum_final(type, crc); - return (struct bch_csum) { .lo = crc }; - } - - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { - SHASH_DESC_ON_STACK(desc, c->poly1305); - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - gen_poly_key(c, desc, nonce); - - bio_for_each_contig_segment(bv, bio, iter) { - void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; - - crypto_shash_update(desc, p, bv.bv_len); - kunmap_atomic(p); - } - - crypto_shash_final(desc, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - BUG(); - } -} - -void bch_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bio_vec bv; - struct bvec_iter iter; - struct scatterlist sgl[16], *sg = sgl; - size_t bytes = 0; - - if (!bch_csum_type_is_encryption(type)) - return; - - sg_init_table(sgl, ARRAY_SIZE(sgl)); - - bio_for_each_contig_segment(bv, bio, iter) { - if (sg == sgl + ARRAY_SIZE(sgl)) { - sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); - - le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE); - bytes = 0; - - sg_init_table(sgl, ARRAY_SIZE(sgl)); - sg = sgl; - } - - sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); - bytes += bv.bv_len; - - } - - sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -} - -#ifdef __KERNEL__ -int bch_request_key(struct bch_sb *sb, struct bch_key *key) -{ - char key_description[60]; - struct key *keyring_key; - const struct user_key_payload *ukp; - int ret; - - snprintf(key_description, sizeof(key_description), - "bcache:%pUb", &sb->user_uuid); - - keyring_key = request_key(&key_type_logon, key_description, NULL); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - - down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); - if (ukp->datalen == sizeof(*key)) { - memcpy(key, ukp->data, ukp->datalen); - ret = 0; - } else { - ret = -EINVAL; - } - up_read(&keyring_key->sem); - key_put(keyring_key); - - return ret; -} -#else -#include <keyutils.h> -#include <uuid/uuid.h> - -int bch_request_key(struct bch_sb *sb, struct bch_key *key) -{ - key_serial_t key_id; - char key_description[60]; - char uuid[40]; - - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(key_description, "bcache:%s", uuid); - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_USER_KEYRING); - if (key_id < 0) - return -errno; - - if (keyctl_read(key_id, (void 
*) key, sizeof(*key)) != sizeof(*key)) - return -1; - - return 0; -} -#endif - -static int bch_decrypt_sb_key(struct bch_fs *c, - struct bch_sb_field_crypt *crypt, - struct bch_key *key) -{ - struct bch_encrypted_key sb_key = crypt->key; - struct bch_key user_key; - int ret = 0; - - /* is key encrypted? */ - if (!bch_key_is_encrypted(&sb_key)) - goto out; - - ret = bch_request_key(c->disk_sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key"); - goto err; - } - - /* decrypt real key: */ - ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c), - &sb_key, sizeof(sb_key)); - if (ret) - goto err; - - if (bch_key_is_encrypted(&sb_key)) { - bch_err(c, "incorrect encryption key"); - ret = -EINVAL; - goto err; - } -out: - *key = sb_key.key; -err: - memzero_explicit(&sb_key, sizeof(sb_key)); - memzero_explicit(&user_key, sizeof(user_key)); - return ret; -} - -static int bch_alloc_ciphers(struct bch_fs *c) -{ - if (!c->chacha20) - c->chacha20 = crypto_alloc_blkcipher("chacha20", 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(c->chacha20)) - return PTR_ERR(c->chacha20); - - if (!c->poly1305) - c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); - if (IS_ERR(c->poly1305)) - return PTR_ERR(c->poly1305); - - return 0; -} - -int bch_disable_encryption(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - crypt = bch_sb_get_crypt(c->disk_sb); - if (!crypt) - goto out; - - /* is key encrypted? */ - ret = 0; - if (bch_key_is_encrypted(&crypt->key)) - goto out; - - ret = bch_decrypt_sb_key(c, crypt, &key); - if (ret) - goto out; - - crypt->key.magic = BCH_KEY_MAGIC; - crypt->key.key = key; - - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0); - bch_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret; -} - -int bch_enable_encryption(struct bch_fs *c, bool keyed) -{ - struct bch_encrypted_key key; - struct bch_key user_key; - struct bch_sb_field_crypt *crypt; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - /* Do we already have an encryption key? 
*/ - if (bch_sb_get_crypt(c->disk_sb)) - goto err; - - ret = bch_alloc_ciphers(c); - if (ret) - goto err; - - key.magic = BCH_KEY_MAGIC; - get_random_bytes(&key.key, sizeof(key.key)); - - if (keyed) { - ret = bch_request_key(c->disk_sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key"); - goto err; - } - - ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c), - &key, sizeof(key)); - if (ret) - goto err; - } - - ret = crypto_blkcipher_setkey(c->chacha20, - (void *) &key.key, sizeof(key.key)); - if (ret) - goto err; - - crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64)); - if (!crypt) { - ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ - goto err; - } - - crypt->key = key; - - /* write superblock */ - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1); - bch_write_super(c); -err: - mutex_unlock(&c->sb_lock); - memzero_explicit(&user_key, sizeof(user_key)); - memzero_explicit(&key, sizeof(key)); - return ret; -} - -void bch_fs_encryption_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->poly1305)) - crypto_free_shash(c->poly1305); - if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_blkcipher(c->chacha20); -} - -int bch_fs_encryption_init(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret; - - crypt = bch_sb_get_crypt(c->disk_sb); - if (!crypt) - return 0; - - ret = bch_alloc_ciphers(c); - if (ret) - return ret; - - ret = bch_decrypt_sb_key(c, crypt, &key); - if (ret) - goto err; - - ret = crypto_blkcipher_setkey(c->chacha20, - (void *) &key.key, sizeof(key.key)); -err: - memzero_explicit(&key, sizeof(key)); - return ret; -} diff --git a/libbcache/checksum.h b/libbcache/checksum.h deleted file mode 100644 index 10f62e5b..00000000 --- a/libbcache/checksum.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef _BCACHE_CHECKSUM_H -#define _BCACHE_CHECKSUM_H - -#include "bcache.h" -#include "super-io.h" - -#include <crypto/chacha20.h> - -u64 bch_crc64_update(u64, const void *, size_t); - -#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -#define BCH_NONCE_POLY cpu_to_le32(1 << 31) - -struct bch_csum bch_checksum(struct bch_fs *, unsigned, struct nonce, - const void *, size_t); - -/* - * This is used for various on disk data structures - bch_sb, prio_set, bset, - * jset: The checksum is _always_ the first field of these structs - */ -#define csum_vstruct(_c, _type, _nonce, _i) \ -({ \ - const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ - const void *end = vstruct_end(_i); \ - \ - bch_checksum(_c, _type, _nonce, start, end - start); \ -}) - -int bch_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); -int bch_request_key(struct bch_sb *, struct bch_key *); - -void bch_encrypt(struct bch_fs *, unsigned, struct nonce, - void *data, size_t); - -struct bch_csum bch_checksum_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); -void bch_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); - -int bch_disable_encryption(struct bch_fs *); -int bch_enable_encryption(struct bch_fs *, bool); - -void bch_fs_encryption_exit(struct bch_fs *); -int bch_fs_encryption_init(struct bch_fs *); - -static inline unsigned bch_data_checksum_type(struct bch_fs *c) -{ - if (c->sb.encryption_type) - return c->opts.wide_macs - ? 
BCH_CSUM_CHACHA20_POLY1305_128 - : BCH_CSUM_CHACHA20_POLY1305_80; - - return c->opts.data_checksum; -} - -static inline unsigned bch_meta_checksum_type(struct bch_fs *c) -{ - return c->sb.encryption_type - ? BCH_CSUM_CHACHA20_POLY1305_128 - : c->opts.metadata_checksum; -} - -static inline bool bch_checksum_type_valid(const struct bch_fs *c, - unsigned type) -{ - if (type >= BCH_CSUM_NR) - return false; - - if (bch_csum_type_is_encryption(type) && !c->chacha20) - return false; - - return true; -} - -static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -}; - -static inline bool bch_crc_cmp(struct bch_csum l, struct bch_csum r) -{ - /* - * XXX: need some way of preventing the compiler from optimizing this - * into a form that isn't constant time.. - */ - return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -} - -/* for skipping ahead and encrypting/decrypting at an offset: */ -static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -{ - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); - - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); - return nonce; -} - -static inline bool bch_key_is_encrypted(struct bch_encrypted_key *key) -{ - return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -} - -static inline struct nonce __bch_sb_key_nonce(struct bch_sb *sb) -{ - __le64 magic = __bch_sb_magic(sb); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -static inline struct nonce bch_sb_key_nonce(struct bch_fs *c) -{ - __le64 magic = bch_sb_magic(c); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -#endif /* _BCACHE_CHECKSUM_H */ diff --git a/libbcache/clock.c b/libbcache/clock.c deleted file mode 100644 index 85891a03..00000000 --- a/libbcache/clock.c +++ /dev/null @@ -1,161 +0,0 @@ -#include "bcache.h" -#include "clock.h" - -#include <linux/freezer.h> -#include <linux/kthread.h> - -static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r) -{ - return time_after(l->expire, r->expire); -} - -void bch_io_timer_add(struct io_clock *clock, struct io_timer *timer) -{ - size_t i; - - spin_lock(&clock->timer_lock); - for (i = 0; i < clock->timers.used; i++) - if (clock->timers.data[i] == timer) - goto out; - - BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp)); -out: - spin_unlock(&clock->timer_lock); -} - -void bch_io_timer_del(struct io_clock *clock, struct io_timer *timer) -{ - size_t i; - - spin_lock(&clock->timer_lock); - - for (i = 0; i < clock->timers.used; i++) - if (clock->timers.data[i] == timer) { - heap_del(&clock->timers, i, io_timer_cmp); - break; - } - - spin_unlock(&clock->timer_lock); -} - -struct io_clock_wait { - struct io_timer timer; - struct task_struct *task; - int expired; -}; - -static void io_clock_wait_fn(struct io_timer *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - -void bch_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) -{ - struct io_clock_wait wait; - - /* XXX: calculate sleep time rigorously */ - wait.timer.expire = until; - wait.timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; - bch_io_timer_add(clock, &wait.timer); - - schedule(); - - bch_io_timer_del(clock, &wait.timer); -} - -/* - * _only_ to be used 
from a kthread - */ -void bch_kthread_io_clock_wait(struct io_clock *clock, - unsigned long until) -{ - struct io_clock_wait wait; - - /* XXX: calculate sleep time rigorously */ - wait.timer.expire = until; - wait.timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; - bch_io_timer_add(clock, &wait.timer); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; - - if (wait.expired) - break; - - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - bch_io_timer_del(clock, &wait.timer); -} - -static struct io_timer *get_expired_timer(struct io_clock *clock, - unsigned long now) -{ - struct io_timer *ret = NULL; - - spin_lock(&clock->timer_lock); - - if (clock->timers.used && - time_after_eq(now, clock->timers.data[0]->expire)) - heap_pop(&clock->timers, ret, io_timer_cmp); - - spin_unlock(&clock->timer_lock); - - return ret; -} - -void bch_increment_clock(struct bch_fs *c, unsigned sectors, int rw) -{ - struct io_clock *clock = &c->io_clock[rw]; - struct io_timer *timer; - unsigned long now; - - /* Buffer up one megabyte worth of IO in the percpu counter */ - preempt_disable(); - - if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < - IO_CLOCK_PCPU_SECTORS)) { - preempt_enable(); - return; - } - - sectors = this_cpu_xchg(*clock->pcpu_buf, 0); - preempt_enable(); - now = atomic_long_add_return(sectors, &clock->now); - - while ((timer = get_expired_timer(clock, now))) - timer->fn(timer); -} - -void bch_io_clock_exit(struct io_clock *clock) -{ - free_heap(&clock->timers); - free_percpu(clock->pcpu_buf); -} - -int bch_io_clock_init(struct io_clock *clock) -{ - atomic_long_set(&clock->now, 0); - spin_lock_init(&clock->timer_lock); - - clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); - if (!clock->pcpu_buf) - return -ENOMEM; - - if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) - return -ENOMEM; - - return 0; -} diff --git a/libbcache/clock.h b/libbcache/clock.h deleted file mode 100644 index 9e081d7d..00000000 --- a/libbcache/clock.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _BCACHE_CLOCK_H -#define _BCACHE_CLOCK_H - -void bch_io_timer_add(struct io_clock *, struct io_timer *); -void bch_io_timer_del(struct io_clock *, struct io_timer *); -void bch_kthread_io_clock_wait(struct io_clock *, unsigned long); -void bch_increment_clock(struct bch_fs *, unsigned, int); - -void bch_io_clock_schedule_timeout(struct io_clock *, unsigned long); - -#define bch_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_timeout(wq, condition, timeout); \ - __ret; \ -}) - -void bch_io_clock_exit(struct io_clock *); -int bch_io_clock_init(struct io_clock *); - -#endif /* _BCACHE_CLOCK_H */ diff --git a/libbcache/clock_types.h b/libbcache/clock_types.h deleted file mode 100644 index 4a02f467..00000000 --- a/libbcache/clock_types.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _BCACHE_CLOCK_TYPES_H -#define _BCACHE_CLOCK_TYPES_H - -#include "util.h" - -#define NR_IO_TIMERS 8 - -/* - * Clocks/timers in units of sectors of IO: - * - * Note - they use percpu batching, so they're only approximate. 
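bch_increment_clock() above is the percpu batching this comment refers to: each CPU accumulates sectors locally and only folds them into the shared atomic clock once IO_CLOCK_PCPU_SECTORS have built up, which is why timers can fire up to one batch late. A standalone sketch of the pattern, with a thread-local counter standing in for the per-cpu buffer (names are illustrative):

#include <stdatomic.h>
#include <stdint.h>

#define IO_CLOCK_PCPU_SECTORS	128

static _Atomic uint64_t io_clock_now;		/* coarse shared clock, in sectors */
static _Thread_local unsigned io_clock_buf;	/* stands in for the per-cpu buffer */

/* Advance the clock by @sectors; returns the new clock value when the local
 * batch is flushed, or 0 when the update stayed in the local buffer. */
static uint64_t increment_clock(unsigned sectors)
{
	io_clock_buf += sectors;
	if (io_clock_buf < IO_CLOCK_PCPU_SECTORS)
		return 0;

	sectors = io_clock_buf;
	io_clock_buf = 0;

	/* the caller would now fire any timers whose expiry is <= this value */
	return atomic_fetch_add(&io_clock_now, sectors) + sectors;
}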
- */ - -struct io_timer; -typedef void (*io_timer_fn)(struct io_timer *); - -struct io_timer { - io_timer_fn fn; - unsigned long expire; -}; - -/* Amount to buffer up on a percpu counter */ -#define IO_CLOCK_PCPU_SECTORS 128 - -struct io_clock { - atomic_long_t now; - u16 __percpu *pcpu_buf; - - spinlock_t timer_lock; - DECLARE_HEAP(struct io_timer *, timers); -}; - -#endif /* _BCACHE_CLOCK_TYPES_H */ - diff --git a/libbcache/closure.c b/libbcache/closure.c deleted file mode 100644 index f6f4dd99..00000000 --- a/libbcache/closure.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Asynchronous refcounty things - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/seq_file.h> - -#include "closure.h" - -static inline void closure_put_after_sub(struct closure *cl, int flags) -{ - int r = flags & CLOSURE_REMAINING_MASK; - - BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); - - if (!r) { - if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { - atomic_set(&cl->remaining, - CLOSURE_REMAINING_INITIALIZER); - closure_queue(cl); - } else { - struct closure *parent = cl->parent; - closure_fn *destructor = cl->fn; - - closure_debug_destroy(cl); - - if (destructor) - destructor(cl); - - if (parent) - closure_put(parent); - } - } -} - -/* For clearing flags with the same atomic op as a put */ -void closure_sub(struct closure *cl, int v) -{ - closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); -} -EXPORT_SYMBOL(closure_sub); - -/** - * closure_put - decrement a closure's refcount - */ -void closure_put(struct closure *cl) -{ - closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); -} -EXPORT_SYMBOL(closure_put); - -/** - * closure_wake_up - wake up all closures on a wait list, without memory barrier - */ -void __closure_wake_up(struct closure_waitlist *wait_list) -{ - struct llist_node *list, *next; - struct closure *cl; - - /* - * Grab entire list, reverse order to preserve FIFO ordering, and wake - * everything up - */ - for (list = llist_reverse_order(llist_del_all(&wait_list->list)); - list; - list = next) { - next = llist_next(list); - cl = container_of(list, struct closure, list); - - closure_set_waiting(cl, 0); - closure_sub(cl, CLOSURE_WAITING + 1); - } -} -EXPORT_SYMBOL(__closure_wake_up); - -/** - * closure_wait - add a closure to a waitlist - * - * @waitlist will own a ref on @cl, which will be released when - * closure_wake_up() is called on @waitlist. 
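The usual calling pattern for closure_wait()/closure_wake_up() is: try to grab a resource, and if it is not available, park the closure on the resource's wait list and reschedule yourself; the waker calls closure_wake_up() after freeing the resource. A hedged sketch in that style - thing_wait, alloc_thing() and use_thing() are hypothetical, and a real caller would re-check the condition after closure_wait() to avoid a lost wakeup:

static struct closure_waitlist thing_wait;

/* Runs as a closure function, e.g. queued via continue_at(). */
static void read_thing(struct closure *cl)
{
	struct thing *t = alloc_thing();	/* hypothetical, may fail */

	if (!t) {
		/* the wait list now owns a ref on cl; closure_wake_up()
		 * drops it, which requeues us on system_wq */
		closure_wait(&thing_wait, cl);
		continue_at(cl, read_thing, system_wq);
	}

	use_thing(t);
	closure_return(cl);
}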
- * - */ -bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) -{ - if (atomic_read(&cl->remaining) & CLOSURE_WAITING) - return false; - - closure_set_waiting(cl, _RET_IP_); - atomic_add(CLOSURE_WAITING + 1, &cl->remaining); - llist_add(&cl->list, &waitlist->list); - - return true; -} -EXPORT_SYMBOL(closure_wait); - -struct closure_syncer { - struct task_struct *task; - int done; -}; - -static void closure_sync_fn(struct closure *cl) -{ - cl->s->done = 1; - wake_up_process(cl->s->task); -} - -void __sched __closure_sync(struct closure *cl) -{ - struct closure_syncer s = { .task = current }; - - cl->s = &s; - continue_at_noreturn(cl, closure_sync_fn, NULL); - - while (1) { - __set_current_state(TASK_UNINTERRUPTIBLE); - smp_mb(); - if (s.done) - break; - schedule(); - } - - __set_current_state(TASK_RUNNING); -} -EXPORT_SYMBOL(__closure_sync); - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - -static LIST_HEAD(closure_list); -static DEFINE_SPINLOCK(closure_list_lock); - -void closure_debug_create(struct closure *cl) -{ - unsigned long flags; - - BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); - cl->magic = CLOSURE_MAGIC_ALIVE; - - spin_lock_irqsave(&closure_list_lock, flags); - list_add(&cl->all, &closure_list); - spin_unlock_irqrestore(&closure_list_lock, flags); -} -EXPORT_SYMBOL(closure_debug_create); - -void closure_debug_destroy(struct closure *cl) -{ - unsigned long flags; - - BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); - cl->magic = CLOSURE_MAGIC_DEAD; - - spin_lock_irqsave(&closure_list_lock, flags); - list_del(&cl->all); - spin_unlock_irqrestore(&closure_list_lock, flags); -} -EXPORT_SYMBOL(closure_debug_destroy); - -static struct dentry *debug; - -static int debug_seq_show(struct seq_file *f, void *data) -{ - struct closure *cl; - - spin_lock_irq(&closure_list_lock); - - list_for_each_entry(cl, &closure_list, all) { - int r = atomic_read(&cl->remaining); - - seq_printf(f, "%p: %pF -> %pf p %p r %i ", - cl, (void *) cl->ip, cl->fn, cl->parent, - r & CLOSURE_REMAINING_MASK); - - seq_printf(f, "%s%s\n", - test_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(&cl->work)) ? "Q" : "", - r & CLOSURE_RUNNING ? "R" : ""); - - if (r & CLOSURE_WAITING) - seq_printf(f, " W %pF\n", - (void *) cl->waiting_on); - - seq_puts(f, "\n"); - } - - spin_unlock_irq(&closure_list_lock); - return 0; -} - -static int debug_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_seq_show, NULL); -} - -static const struct file_operations debug_ops = { - .owner = THIS_MODULE, - .open = debug_seq_open, - .read = seq_read, - .release = single_release -}; - -void __init closure_debug_init(void) -{ - debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); -} - -#endif diff --git a/libbcache/closure.h b/libbcache/closure.h deleted file mode 100644 index b55254b6..00000000 --- a/libbcache/closure.h +++ /dev/null @@ -1,387 +0,0 @@ -#ifndef _LINUX_CLOSURE_H -#define _LINUX_CLOSURE_H - -#include <linux/llist.h> -#include <linux/sched.h> -#include <linux/workqueue.h> - -/* - * Closure is perhaps the most overused and abused term in computer science, but - * since I've been unable to come up with anything better you're stuck with it - * again. - * - * What are closures? - * - * They embed a refcount. The basic idea is they count "things that are in - * progress" - in flight bios, some other thread that's doing something else - - * anything you might want to wait on. - * - * The refcount may be manipulated with closure_get() and closure_put(). 
- * closure_put() is where many of the interesting things happen, when it causes - * the refcount to go to 0. - * - * Closures can be used to wait on things both synchronously and asynchronously, - * and synchronous and asynchronous use can be mixed without restriction. To - * wait synchronously, use closure_sync() - you will sleep until your closure's - * refcount hits 1. - * - * To wait asynchronously, use - * continue_at(cl, next_function, workqueue); - * - * passing it, as you might expect, the function to run when nothing is pending - * and the workqueue to run that function out of. - * - * continue_at() also, critically, requires a 'return' immediately following the - * location where this macro is referenced, to return to the calling function. - * There's good reason for this. - * - * To use safely closures asynchronously, they must always have a refcount while - * they are running owned by the thread that is running them. Otherwise, suppose - * you submit some bios and wish to have a function run when they all complete: - * - * foo_endio(struct bio *bio) - * { - * closure_put(cl); - * } - * - * closure_init(cl); - * - * do_stuff(); - * closure_get(cl); - * bio1->bi_endio = foo_endio; - * bio_submit(bio1); - * - * do_more_stuff(); - * closure_get(cl); - * bio2->bi_endio = foo_endio; - * bio_submit(bio2); - * - * continue_at(cl, complete_some_read, system_wq); - * - * If closure's refcount started at 0, complete_some_read() could run before the - * second bio was submitted - which is almost always not what you want! More - * importantly, it wouldn't be possible to say whether the original thread or - * complete_some_read()'s thread owned the closure - and whatever state it was - * associated with! - * - * So, closure_init() initializes a closure's refcount to 1 - and when a - * closure_fn is run, the refcount will be reset to 1 first. - * - * Then, the rule is - if you got the refcount with closure_get(), release it - * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount - * on a closure because you called closure_init() or you were run out of a - * closure - _always_ use continue_at(). Doing so consistently will help - * eliminate an entire class of particularly pernicious races. - * - * Lastly, you might have a wait list dedicated to a specific event, and have no - * need for specifying the condition - you just want to wait until someone runs - * closure_wake_up() on the appropriate wait list. In that case, just use - * closure_wait(). It will return either true or false, depending on whether the - * closure was already on a wait list or not - a closure can only be on one wait - * list at a time. - * - * Parents: - * - * closure_init() takes two arguments - it takes the closure to initialize, and - * a (possibly null) parent. - * - * If parent is non null, the new closure will have a refcount for its lifetime; - * a closure is considered to be "finished" when its refcount hits 0 and the - * function to run is null. Hence - * - * continue_at(cl, NULL, NULL); - * - * returns up the (spaghetti) stack of closures, precisely like normal return - * returns up the C stack. continue_at() with non null fn is better thought of - * as doing a tail call. - * - * All this implies that a closure should typically be embedded in a particular - * struct (which its refcount will normally control the lifetime of), and that - * struct can very much be thought of as a stack frame. 
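To make the rules above concrete, here is a hedged sketch of a two-stage asynchronous operation in this style: the caller's closure becomes the parent, the first stage "tail calls" into the second with continue_at(), and the final closure_return() drops the ref on the parent. struct my_op, write_data() and finalize() are hypothetical:

/* Stage 2: runs out of system_wq once every ref taken in stage 1 is dropped. */
static void my_op_finish(struct closure *cl)
{
	struct my_op *op = container_of(cl, struct my_op, cl);

	finalize(op);		/* hypothetical */
	closure_return(cl);	/* drops op->cl's ref on its parent */
}

/* Stage 1: submit the IO, then hand off to my_op_finish(). */
static void my_op_start(struct my_op *op, struct closure *parent)
{
	closure_init(&op->cl, parent);

	/* hypothetical: takes a ref per bio with closure_get(&op->cl) and
	 * drops it from the endio with closure_put(&op->cl) */
	write_data(op, &op->cl);

	continue_at(&op->cl, my_op_finish, system_wq);
}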
- */ - -struct closure; -struct closure_syncer; -typedef void (closure_fn) (struct closure *); - -struct closure_waitlist { - struct llist_head list; -}; - -enum closure_state { - /* - * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by - * the thread that owns the closure, and cleared by the thread that's - * waking up the closure. - * - * The rest are for debugging and don't affect behaviour: - * - * CLOSURE_RUNNING: Set when a closure is running (i.e. by - * closure_init() and when closure_put() runs then next function), and - * must be cleared before remaining hits 0. Primarily to help guard - * against incorrect usage and accidentally transferring references. - * continue_at() and closure_return() clear it for you, if you're doing - * something unusual you can use closure_set_dead() which also helps - * annotate where references are being transferred. - */ - - CLOSURE_BITS_START = (1U << 27), - CLOSURE_DESTRUCTOR = (1U << 27), - CLOSURE_WAITING = (1U << 29), - CLOSURE_RUNNING = (1U << 31), -}; - -#define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) - -#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) -#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) - -struct closure { - union { - struct { - struct workqueue_struct *wq; - struct closure_syncer *s; - struct llist_node list; - closure_fn *fn; - }; - struct work_struct work; - }; - - struct closure *parent; - - atomic_t remaining; - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -#define CLOSURE_MAGIC_DEAD 0xc054dead -#define CLOSURE_MAGIC_ALIVE 0xc054a11e - - unsigned magic; - struct list_head all; - unsigned long ip; - unsigned long waiting_on; -#endif -}; - -void closure_sub(struct closure *cl, int v); -void closure_put(struct closure *cl); -void __closure_wake_up(struct closure_waitlist *list); -bool closure_wait(struct closure_waitlist *list, struct closure *cl); -void __closure_sync(struct closure *cl); - -/** - * closure_sync - sleep until a closure a closure has nothing left to wait on - * - * Sleeps until the refcount hits 1 - the thread that's running the closure owns - * the last refcount. 
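For the synchronous case, the usual shape is a stack-allocated closure: take one ref per outstanding IO and let closure_sync() sleep until only the caller's ref remains. A short sketch, where submit_one_bio() is hypothetical and its completion handler is assumed to call closure_put():

static void read_all_sync(struct bio **bios, unsigned nr)
{
	struct closure cl;
	unsigned i;

	closure_init_stack(&cl);

	for (i = 0; i < nr; i++) {
		closure_get(&cl);		/* one ref per in-flight bio */
		submit_one_bio(bios[i], &cl);	/* hypothetical */
	}

	closure_sync(&cl);			/* sleeps until the refcount is back to 1 */
}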
- */ -static inline void closure_sync(struct closure *cl) -{ - if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) - __closure_sync(cl); -} - -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - -void closure_debug_init(void); -void closure_debug_create(struct closure *cl); -void closure_debug_destroy(struct closure *cl); - -#else - -static inline void closure_debug_init(void) {} -static inline void closure_debug_create(struct closure *cl) {} -static inline void closure_debug_destroy(struct closure *cl) {} - -#endif - -static inline void closure_set_ip(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->ip = _THIS_IP_; -#endif -} - -static inline void closure_set_ret_ip(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->ip = _RET_IP_; -#endif -} - -static inline void closure_set_waiting(struct closure *cl, unsigned long f) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->waiting_on = f; -#endif -} - -static inline void closure_set_stopped(struct closure *cl) -{ - atomic_sub(CLOSURE_RUNNING, &cl->remaining); -} - -static inline void set_closure_fn(struct closure *cl, closure_fn *fn, - struct workqueue_struct *wq) -{ - closure_set_ip(cl); - cl->fn = fn; - cl->wq = wq; - /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic(); -} - -static inline void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - queue_work(wq, &cl->work); - } else - cl->fn(cl); -} - -/** - * closure_get - increment a closure's refcount - */ -static inline void closure_get(struct closure *cl) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - BUG_ON((atomic_inc_return(&cl->remaining) & - CLOSURE_REMAINING_MASK) <= 1); -#else - atomic_inc(&cl->remaining); -#endif -} - -/** - * closure_init - Initialize a closure, setting the refcount to 1 - * @cl: closure to initialize - * @parent: parent of the new closure. cl will take a refcount on it for its - * lifetime; may be NULL. - */ -static inline void closure_init(struct closure *cl, struct closure *parent) -{ - cl->fn = NULL; - cl->parent = parent; - if (parent) - closure_get(parent); - - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); - - closure_debug_create(cl); - closure_set_ip(cl); -} - -static inline void closure_init_stack(struct closure *cl) -{ - memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -} - -/** - * closure_wake_up - wake up all closures on a wait list. - */ -static inline void closure_wake_up(struct closure_waitlist *list) -{ - smp_mb(); - __closure_wake_up(list); -} - -#define continue_at_noreturn(_cl, _fn, _wq) \ -do { \ - set_closure_fn(_cl, _fn, _wq); \ - closure_sub(_cl, CLOSURE_RUNNING + 1); \ -} while (0) - -/** - * continue_at - jump to another function with barrier - * - * After @cl is no longer waiting on anything (i.e. all outstanding refs have - * been dropped with closure_put()), it will resume execution at @fn running out - * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). - * - * NOTE: This macro expands to a return in the calling function! - * - * This is because after calling continue_at() you no longer have a ref on @cl, - * and whatever @cl owns may be freed out from under you - a running closure fn - * has a ref on its own closure which continue_at() drops. 
- */ -#define continue_at(_cl, _fn, _wq) \ -do { \ - continue_at_noreturn(_cl, _fn, _wq); \ - return; \ -} while (0) - -/** - * closure_return - finish execution of a closure - * - * This is used to indicate that @cl is finished: when all outstanding refs on - * @cl have been dropped @cl's ref on its parent closure (as passed to - * closure_init()) will be dropped, if one was specified - thus this can be - * thought of as returning to the parent closure. - */ -#define closure_return(_cl) continue_at((_cl), NULL, NULL) - -/** - * continue_at_nobarrier - jump to another function without barrier - * - * Causes @fn to be executed out of @cl, in @wq context (or called directly if - * @wq is NULL). - * - * NOTE: like continue_at(), this macro expands to a return in the caller! - * - * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, - * thus it's not safe to touch anything protected by @cl after a - * continue_at_nobarrier(). - */ -#define continue_at_nobarrier(_cl, _fn, _wq) \ -do { \ - closure_set_ip(cl); \ - if (_wq) { \ - INIT_WORK(&(_cl)->work, (void *) _fn); \ - queue_work((_wq), &(_cl)->work); \ - } else { \ - (_fn)(_cl); \ - } \ - return; \ -} while (0) - -#define closure_return_with_destructor_noreturn(_cl, _destructor) \ -do { \ - set_closure_fn(_cl, _destructor, NULL); \ - closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ -} while (0) - -/** - * closure_return - finish execution of a closure, with destructor - * - * Works like closure_return(), except @destructor will be called when all - * outstanding refs on @cl have been dropped; @destructor may be used to safely - * free the memory occupied by @cl, and it is called with the ref on the parent - * closure still held - so @destructor could safely return an item to a - * freelist protected by @cl's parent. - */ -#define closure_return_with_destructor(_cl, _destructor) \ -do { \ - closure_return_with_destructor_noreturn(_cl, _destructor); \ - return; \ -} while (0) - -/** - * closure_call - execute @fn out of a new, uninitialized closure - * - * Typically used when running out of one closure, and we want to run @fn - * asynchronously out of a new closure - @parent will then wait for @cl to - * finish. 
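closure_call() is the usual way to fan work out: the child closure takes a ref on the parent passed to closure_init(), so the parent cannot finish until the child has done closure_return(). A hypothetical sketch (struct my_op, my_sub_work() and my_wq are invented; assumes a kernel build and the API above):

/* Illustrative only; not part of the bcache sources. */
struct my_op {
	struct closure	cl;	/* parent: the overall operation */
	struct closure	sub;	/* child: an async sub-operation */
};

static void my_sub_work(struct closure *sub)
{
	/* ... do the sub-operation ... */

	/* drops the ref the child holds on its parent (op->cl) */
	closure_return(sub);
}

static void my_op_start(struct my_op *op)
{
	/*
	 * Equivalent to closure_init(&op->sub, &op->cl) followed by
	 * running my_sub_work() out of my_wq: the child takes a ref on
	 * op->cl, so op->cl cannot finish before the child does.
	 */
	closure_call(&op->sub, my_sub_work, my_wq, &op->cl);
}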
- */ -static inline void closure_call(struct closure *cl, closure_fn fn, - struct workqueue_struct *wq, - struct closure *parent) -{ - closure_init(cl, parent); - continue_at_nobarrier(cl, fn, wq); -} - -#endif /* _LINUX_CLOSURE_H */ diff --git a/libbcache/compress.c b/libbcache/compress.c deleted file mode 100644 index d9a64c38..00000000 --- a/libbcache/compress.c +++ /dev/null @@ -1,500 +0,0 @@ -#include "bcache.h" -#include "compress.h" -#include "extents.h" -#include "io.h" -#include "super-io.h" - -#include <linux/lz4.h> -#include <linux/zlib.h> - -enum bounced { - BOUNCED_CONTIG, - BOUNCED_MAPPED, - BOUNCED_KMALLOCED, - BOUNCED_VMALLOCED, - BOUNCED_MEMPOOLED, -}; - -static void *__bounce_alloc(struct bch_fs *c, unsigned size, - unsigned *bounced, int direction) -{ - void *data; - - *bounced = BOUNCED_KMALLOCED; - data = kmalloc(size, GFP_NOIO|__GFP_NOWARN); - if (data) - return data; - - *bounced = BOUNCED_MEMPOOLED; - data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT); - if (data) - return page_address(data); - - *bounced = BOUNCED_VMALLOCED; - data = vmalloc(size); - if (data) - return data; - - *bounced = BOUNCED_MEMPOOLED; - data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO); - return page_address(data); -} - -static void *__bio_map_or_bounce(struct bch_fs *c, - struct bio *bio, struct bvec_iter start, - unsigned *bounced, int direction) -{ - struct bio_vec bv; - struct bvec_iter iter; - unsigned nr_pages = 0; - struct page *stack_pages[16]; - struct page **pages = NULL; - bool first = true; - unsigned prev_end = PAGE_SIZE; - void *data; - - BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX); - -#ifndef CONFIG_HIGHMEM - *bounced = BOUNCED_CONTIG; - - __bio_for_each_contig_segment(bv, bio, iter, start) { - if (bv.bv_len == start.bi_size) - return page_address(bv.bv_page) + bv.bv_offset; - } -#endif - *bounced = BOUNCED_MAPPED; - - __bio_for_each_segment(bv, bio, iter, start) { - if ((!first && bv.bv_offset) || - prev_end != PAGE_SIZE) - goto bounce; - - prev_end = bv.bv_offset + bv.bv_len; - nr_pages++; - } - - BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); - - pages = nr_pages > ARRAY_SIZE(stack_pages) - ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) - : stack_pages; - if (!pages) - goto bounce; - - nr_pages = 0; - __bio_for_each_segment(bv, bio, iter, start) - pages[nr_pages++] = bv.bv_page; - - data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (pages != stack_pages) - kfree(pages); - - return data + bio_iter_offset(bio, start); -bounce: - data = __bounce_alloc(c, start.bi_size, bounced, direction); - - if (direction == READ) - memcpy_from_bio(data, bio, start); - - return data; -} - -static void *bio_map_or_bounce(struct bch_fs *c, struct bio *bio, - unsigned *bounced, int direction) -{ - return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction); -} - -static void bio_unmap_or_unbounce(struct bch_fs *c, void *data, - unsigned bounced, int direction) -{ - if (!data) - return; - - switch (bounced) { - case BOUNCED_MAPPED: - vunmap((void *) ((unsigned long) data & PAGE_MASK)); - return; - case BOUNCED_KMALLOCED: - kfree(data); - return; - case BOUNCED_VMALLOCED: - vfree(data); - return; - case BOUNCED_MEMPOOLED: - mempool_free(virt_to_page(data), &c->compression_bounce[direction]); - return; - } -} - -static inline void zlib_set_workspace(z_stream *strm, void *workspace) -{ -#ifdef __KERNEL__ - strm->workspace = workspace; -#endif -} - -static int __bio_uncompress(struct bch_fs *c, struct bio *src, - void *dst_data, struct bch_extent_crc128 crc) -{ - void *src_data = NULL; - unsigned src_bounced; - size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; - int ret; - - src_data = bio_map_or_bounce(c, src, &src_bounced, READ); - - switch (crc.compression_type) { - case BCH_COMPRESSION_LZ4: - ret = lz4_decompress(src_data, &src_len, - dst_data, dst_len); - if (ret) { - ret = -EIO; - goto err; - } - break; - case BCH_COMPRESSION_GZIP: { - void *workspace; - z_stream strm; - - workspace = kmalloc(zlib_inflate_workspacesize(), - GFP_NOIO|__GFP_NOWARN); - if (!workspace) { - mutex_lock(&c->zlib_workspace_lock); - workspace = c->zlib_workspace; - } - - strm.next_in = src_data; - strm.avail_in = src_len; - strm.next_out = dst_data; - strm.avail_out = dst_len; - zlib_set_workspace(&strm, workspace); - zlib_inflateInit2(&strm, -MAX_WBITS); - - ret = zlib_inflate(&strm, Z_FINISH); - - if (workspace == c->zlib_workspace) - mutex_unlock(&c->zlib_workspace_lock); - else - kfree(workspace); - - if (ret != Z_STREAM_END) { - ret = -EIO; - goto err; - } - break; - } - default: - BUG(); - } - ret = 0; -err: - bio_unmap_or_unbounce(c, src_data, src_bounced, READ); - return ret; -} - -int bch_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - unsigned live_data_sectors, - struct bch_extent_crc128 crc) -{ - void *dst_data = NULL; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; - int ret = -ENOMEM; - - BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs); - - /* XXX mempoolify */ - dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN); - if (!dst_data) { - dst_data = vmalloc(dst_len); - if (!dst_data) - goto err; - } - - ret = __bio_uncompress(c, bio, dst_data, crc); - if (ret) - goto err; - - while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - - bv->bv_page = alloc_page(GFP_NOIO); - if (!bv->bv_page) - goto use_mempool; - - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; - bio->bi_vcnt++; - } - - bio->bi_iter.bi_size = live_data_sectors << 9; -copy_data: - memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9)); -err: - 
kvfree(dst_data); - return ret; -use_mempool: - /* - * We already allocated from mempool, we can't allocate from it again - * without freeing the pages we already allocated or else we could - * deadlock: - */ - - bch_bio_free_pages_pool(c, bio); - bch_bio_alloc_pages_pool(c, bio, live_data_sectors << 9); - goto copy_data; -} - -int bch_bio_uncompress(struct bch_fs *c, struct bio *src, - struct bio *dst, struct bvec_iter dst_iter, - struct bch_extent_crc128 crc) -{ - void *dst_data = NULL; - unsigned dst_bounced; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; - int ret = -ENOMEM; - - dst_data = dst_len == dst_iter.bi_size - ? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE) - : __bounce_alloc(c, dst_len, &dst_bounced, WRITE); - - ret = __bio_uncompress(c, src, dst_data, crc); - if (ret) - goto err; - - if (dst_bounced) - memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9)); -err: - bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE); - return ret; -} - -static int __bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned compression_type) -{ - void *src_data = NULL, *dst_data = NULL; - unsigned src_bounced, dst_bounced, pad; - int ret = -1; - - dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE); - src_data = bio_map_or_bounce(c, src, &src_bounced, READ); - - switch (compression_type) { - case BCH_COMPRESSION_LZ4: { - void *workspace; - - *dst_len = dst->bi_iter.bi_size; - *src_len = src->bi_iter.bi_size; - - workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO); - - while (*src_len > block_bytes(c) && - (ret = lz4_compress(src_data, *src_len, - dst_data, dst_len, - workspace))) { - /* - * On error, the compressed data was bigger than - * dst_len, and -ret is the amount of data we were able - * to compress - round down to nearest block and try - * again: - */ - BUG_ON(ret > 0); - BUG_ON(-ret >= *src_len); - - *src_len = round_down(-ret, block_bytes(c)); - } - - mempool_free(workspace, &c->lz4_workspace_pool); - - if (ret) - goto err; - break; - } - case BCH_COMPRESSION_GZIP: { - void *workspace; - z_stream strm; - - workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS, - DEF_MEM_LEVEL), - GFP_NOIO|__GFP_NOWARN); - if (!workspace) { - mutex_lock(&c->zlib_workspace_lock); - workspace = c->zlib_workspace; - } - - strm.next_in = src_data; - strm.avail_in = min(src->bi_iter.bi_size, - dst->bi_iter.bi_size); - strm.next_out = dst_data; - strm.avail_out = dst->bi_iter.bi_size; - zlib_set_workspace(&strm, workspace); - zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY); - - ret = zlib_deflate(&strm, Z_FINISH); - if (ret != Z_STREAM_END) { - ret = -EIO; - goto zlib_err; - } - - ret = zlib_deflateEnd(&strm); - if (ret != Z_OK) { - ret = -EIO; - goto zlib_err; - } - - ret = 0; -zlib_err: - if (workspace == c->zlib_workspace) - mutex_unlock(&c->zlib_workspace_lock); - else - kfree(workspace); - - if (ret) - goto err; - - *dst_len = strm.total_out; - *src_len = strm.total_in; - break; - } - default: - BUG(); - } - - BUG_ON(!*dst_len); - BUG_ON(*dst_len > dst->bi_iter.bi_size); - - BUG_ON(*src_len & (block_bytes(c) - 1)); - BUG_ON(*src_len > src->bi_iter.bi_size); - - /* Didn't get smaller: */ - if (round_up(*dst_len, block_bytes(c)) >= *src_len) { - ret = -1; - goto err; - } - - pad = round_up(*dst_len, block_bytes(c)) - *dst_len; - - memset(dst_data + *dst_len, 0, pad); - *dst_len += pad; - - if (dst_bounced) - memcpy_to_bio(dst, 
dst->bi_iter, dst_data); -err: - bio_unmap_or_unbounce(c, src_data, src_bounced, READ); - bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE); - return ret; -} - -void bch_bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned *compression_type) -{ - unsigned orig_dst = dst->bi_iter.bi_size; - unsigned orig_src = src->bi_iter.bi_size; - - /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ - src->bi_iter.bi_size = - min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9); - - /* Don't generate a bigger output than input: */ - dst->bi_iter.bi_size = - min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - - /* If it's only one block, don't bother trying to compress: */ - if (*compression_type != BCH_COMPRESSION_NONE && - bio_sectors(src) > c->sb.block_size && - !__bio_compress(c, dst, dst_len, src, src_len, *compression_type)) - goto out; - - /* If compressing failed (didn't get smaller), just copy: */ - *compression_type = BCH_COMPRESSION_NONE; - *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - bio_copy_data(dst, src); -out: - dst->bi_iter.bi_size = orig_dst; - src->bi_iter.bi_size = orig_src; -} - -/* doesn't write superblock: */ -int bch_check_set_has_compressed_data(struct bch_fs *c, - unsigned compression_type) -{ - switch (compression_type) { - case BCH_COMPRESSION_NONE: - return 0; - case BCH_COMPRESSION_LZ4: - if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) - return 0; - - bch_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4); - break; - case BCH_COMPRESSION_GZIP: - if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) - return 0; - - bch_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP); - break; - } - - return bch_fs_compress_init(c); -} - -void bch_fs_compress_exit(struct bch_fs *c) -{ - vfree(c->zlib_workspace); - mempool_exit(&c->lz4_workspace_pool); - mempool_exit(&c->compression_bounce[WRITE]); - mempool_exit(&c->compression_bounce[READ]); -} - -#define COMPRESSION_WORKSPACE_SIZE \ - max_t(size_t, zlib_inflate_workspacesize(), \ - zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)) - -int bch_fs_compress_init(struct bch_fs *c) -{ - unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); - int ret; - - if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && - !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) - return 0; - - if (!mempool_initialized(&c->compression_bounce[READ])) { - ret = mempool_init_page_pool(&c->compression_bounce[READ], - 1, order); - if (ret) - return ret; - } - - if (!mempool_initialized(&c->compression_bounce[WRITE])) { - ret = mempool_init_page_pool(&c->compression_bounce[WRITE], - 1, order); - if (ret) - return ret; - } - - if (!mempool_initialized(&c->lz4_workspace_pool) && - bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) { - ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, - 1, LZ4_MEM_COMPRESS); - if (ret) - return ret; - } - - if (!c->zlib_workspace && - bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) { - c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE); - if (!c->zlib_workspace) - return -ENOMEM; - } - - return 0; -} diff --git a/libbcache/compress.h b/libbcache/compress.h deleted file mode 100644 index e8d208a0..00000000 --- a/libbcache/compress.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _BCACHE_COMPRESS_H -#define _BCACHE_COMPRESS_H - -int bch_bio_uncompress_inplace(struct bch_fs *, struct bio *, - unsigned, struct bch_extent_crc128); -int bch_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, - struct bvec_iter, struct 
bch_extent_crc128); -void bch_bio_compress(struct bch_fs *, struct bio *, size_t *, - struct bio *, size_t *, unsigned *); - -int bch_check_set_has_compressed_data(struct bch_fs *, unsigned); -void bch_fs_compress_exit(struct bch_fs *); -int bch_fs_compress_init(struct bch_fs *); - -#endif /* _BCACHE_COMPRESS_H */ diff --git a/libbcache/debug.c b/libbcache/debug.c deleted file mode 100644 index bddff979..00000000 --- a/libbcache/debug.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Assorted bcache debug code - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "buckets.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "fs-gc.h" -#include "inode.h" -#include "io.h" -#include "super.h" - -#include <linux/console.h> -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/seq_file.h> - -static struct dentry *bch_debug; - -#ifdef CONFIG_BCACHE_DEBUG - -static void btree_verify_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - - closure_put(cl); -} - -void __bch_btree_verify(struct bch_fs *c, struct btree *b) -{ - struct btree *v = c->verify_data; - struct btree_node *n_ondisk, *n_sorted, *n_inmemory; - struct bset *sorted, *inmemory; - struct extent_pick_ptr pick; - struct bio *bio; - struct closure cl; - - if (c->opts.nochanges) - return; - - closure_init_stack(&cl); - - btree_node_io_lock(b); - mutex_lock(&c->verify_lock); - - n_ondisk = c->verify_ondisk; - n_sorted = c->verify_data->data; - n_inmemory = b->data; - - bkey_copy(&v->key, &b->key); - v->written = 0; - v->level = b->level; - v->btree_id = b->btree_id; - bch_btree_keys_init(v, &c->expensive_debug_checks); - - pick = bch_btree_pick_ptr(c, b); - if (IS_ERR_OR_NULL(pick.ca)) - return; - - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); - bio->bi_bdev = pick.ca->disk_sb.bdev; - bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); - bio->bi_private = &cl; - bio->bi_end_io = btree_verify_endio; - bch_bio_map(bio, n_sorted); - - closure_get(&cl); - bch_generic_make_request(bio, c); - closure_sync(&cl); - - bio_put(bio); - - memcpy(n_ondisk, n_sorted, btree_bytes(c)); - - bch_btree_node_read_done(c, v, pick.ca, &pick.ptr); - n_sorted = c->verify_data->data; - - percpu_ref_put(&pick.ca->io_ref); - - sorted = &n_sorted->keys; - inmemory = &n_inmemory->keys; - - if (inmemory->u64s != sorted->u64s || - memcmp(inmemory->start, - sorted->start, - vstruct_end(inmemory) - (void *) inmemory->start)) { - unsigned offset = 0, sectors; - struct bset *i; - unsigned j; - - console_lock(); - - printk(KERN_ERR "*** in memory:\n"); - bch_dump_bset(b, inmemory, 0); - - printk(KERN_ERR "*** read back in:\n"); - bch_dump_bset(v, sorted, 0); - - while (offset < b->written) { - if (!offset ) { - i = &n_ondisk->keys; - sectors = vstruct_blocks(n_ondisk, c->block_bits) << - c->block_bits; - } else { - struct btree_node_entry *bne = - (void *) n_ondisk + (offset << 9); - i = &bne->keys; - - sectors = vstruct_blocks(bne, c->block_bits) << - c->block_bits; - } - - printk(KERN_ERR "*** on disk block %u:\n", offset); - bch_dump_bset(b, i, offset); - - offset += sectors; - } - - printk(KERN_ERR "*** block %u/%u not written\n", - offset >> c->block_bits, 
btree_blocks(c)); - - for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) - if (inmemory->_data[j] != sorted->_data[j]) - break; - - printk(KERN_ERR "b->written %u\n", b->written); - - console_unlock(); - panic("verify failed at %u\n", j); - } - - mutex_unlock(&c->verify_lock); - btree_node_io_unlock(b); -} - -void bch_data_verify(struct cached_dev *dc, struct bio *bio) -{ - char name[BDEVNAME_SIZE]; - struct bio *check; - struct bio_vec bv; - struct bvec_iter iter; - - check = bio_clone(bio, GFP_NOIO); - if (!check) - return; - bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC); - - if (bio_alloc_pages(check, GFP_NOIO)) - goto out_put; - - submit_bio_wait(check); - - bio_for_each_segment(bv, bio, iter) { - void *p1 = kmap_atomic(bv.bv_page); - void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page); - - if (memcmp(p1 + bv.bv_offset, - p2 + bv.bv_offset, - bv.bv_len)) - panic("verify failed at dev %s sector %llu\n", - bdevname(dc->disk_sb.bdev, name), - (uint64_t) bio->bi_iter.bi_sector); - - kunmap_atomic(p1); - } - - bio_free_pages(check); -out_put: - bio_put(check); -} - -#endif - -#ifdef CONFIG_DEBUG_FS - -/* XXX: bch_fs refcounting */ - -struct dump_iter { - struct bpos from; - struct bch_fs *c; - enum btree_id id; - - char buf[PAGE_SIZE]; - size_t bytes; /* what's currently in buf */ - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -static int flush_buf(struct dump_iter *i) -{ - if (i->bytes) { - size_t bytes = min(i->bytes, i->size); - int err = copy_to_user(i->ubuf, i->buf, bytes); - - if (err) - return err; - - i->ret += bytes; - i->ubuf += bytes; - i->size -= bytes; - i->bytes -= bytes; - memmove(i->buf, i->buf + bytes, i->bytes); - } - - return 0; -} - -static int bch_dump_open(struct inode *inode, struct file *file) -{ - struct btree_debug *bd = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - file->private_data = i; - i->from = POS_MIN; - i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); - i->id = bd->id; - - return 0; -} - -static int bch_dump_release(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - return 0; -} - -static ssize_t bch_read_btree(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct btree_iter iter; - struct bkey_s_c k; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - return i->ret; - - bch_btree_iter_init(&iter, i->c, i->id, i->from); - - while ((k = bch_btree_iter_peek(&iter)).k && - !(err = btree_iter_err(k))) { - bch_bkey_val_to_text(i->c, bkey_type(0, i->id), - i->buf, sizeof(i->buf), k); - i->bytes = strlen(i->buf); - BUG_ON(i->bytes >= PAGE_SIZE); - i->buf[i->bytes] = '\n'; - i->bytes++; - - bch_btree_iter_advance_pos(&iter); - i->from = iter.pos; - - err = flush_buf(i); - if (err) - break; - - if (!i->size) - break; - } - bch_btree_iter_unlock(&iter); - - return err < 0 ? 
err : i->ret; -} - -static const struct file_operations btree_debug_ops = { - .owner = THIS_MODULE, - .open = bch_dump_open, - .release = bch_dump_release, - .read = bch_read_btree, -}; - -static ssize_t bch_read_btree_formats(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct btree_iter iter; - struct btree *b; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - err = flush_buf(i); - if (err) - return err; - - if (!i->size || !bkey_cmp(POS_MAX, i->from)) - return i->ret; - - for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) { - i->bytes = bch_print_btree_node(i->c, b, i->buf, - sizeof(i->buf)); - err = flush_buf(i); - if (err) - break; - - /* - * can't easily correctly restart a btree node traversal across - * all nodes, meh - */ - i->from = bkey_cmp(POS_MAX, b->key.k.p) - ? bkey_successor(b->key.k.p) - : b->key.k.p; - - if (!i->size) - break; - } - bch_btree_iter_unlock(&iter); - - return err < 0 ? err : i->ret; -} - -static const struct file_operations btree_format_debug_ops = { - .owner = THIS_MODULE, - .open = bch_dump_open, - .release = bch_dump_release, - .read = bch_read_btree_formats, -}; - -static ssize_t bch_read_bfloat_failed(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct btree_iter iter; - struct bkey_s_c k; - struct btree *prev_node = NULL; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - err = flush_buf(i); - if (err) - return err; - - if (!i->size) - return i->ret; - - bch_btree_iter_init(&iter, i->c, i->id, i->from); - - while ((k = bch_btree_iter_peek(&iter)).k && - !(err = btree_iter_err(k))) { - struct btree *b = iter.nodes[0]; - struct btree_node_iter *node_iter = &iter.node_iters[0]; - struct bkey_packed *_k = bch_btree_node_iter_peek(node_iter, b); - - if (iter.nodes[0] != prev_node) { - i->bytes = bch_print_btree_node(i->c, b, i->buf, - sizeof(i->buf)); - err = flush_buf(i); - if (err) - break; - } - prev_node = iter.nodes[0]; - - i->bytes = bch_bkey_print_bfloat(b, _k, i->buf, sizeof(i->buf)); - - err = flush_buf(i); - if (err) - break; - - bch_btree_iter_advance_pos(&iter); - i->from = iter.pos; - - err = flush_buf(i); - if (err) - break; - - if (!i->size) - break; - } - bch_btree_iter_unlock(&iter); - - return err < 0 ? 
err : i->ret; -} - -static const struct file_operations bfloat_failed_debug_ops = { - .owner = THIS_MODULE, - .open = bch_dump_open, - .release = bch_dump_release, - .read = bch_read_bfloat_failed, -}; - -void bch_fs_debug_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove_recursive(c->debug); -} - -void bch_fs_debug_init(struct bch_fs *c) -{ - struct btree_debug *bd; - char name[100]; - - if (IS_ERR_OR_NULL(bch_debug)) - return; - - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); - c->debug = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->debug)) - return; - - for (bd = c->btree_debug; - bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); - bd++) { - bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch_btree_ids[bd->id], - 0400, c->debug, bd, - &btree_debug_ops); - - snprintf(name, sizeof(name), "%s-formats", - bch_btree_ids[bd->id]); - - bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, - &btree_format_debug_ops); - - snprintf(name, sizeof(name), "%s-bfloat-failed", - bch_btree_ids[bd->id]); - - bd->failed = debugfs_create_file(name, 0400, c->debug, bd, - &bfloat_failed_debug_ops); - } -} - -#endif - -void bch_debug_exit(void) -{ - if (!IS_ERR_OR_NULL(bch_debug)) - debugfs_remove_recursive(bch_debug); -} - -int __init bch_debug_init(void) -{ - int ret = 0; - - bch_debug = debugfs_create_dir("bcache", NULL); - return ret; -} diff --git a/libbcache/debug.h b/libbcache/debug.h deleted file mode 100644 index 63e74304..00000000 --- a/libbcache/debug.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _BCACHE_DEBUG_H -#define _BCACHE_DEBUG_H - -#include "bcache.h" - -struct bio; -struct btree; -struct cached_dev; -struct bch_fs; - -#define BCH_DEBUG_PARAM(name, description) extern bool bch_##name; -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch_##name || c->name; } -BCH_DEBUG_PARAMS_ALWAYS() -#undef BCH_DEBUG_PARAM - -#ifdef CONFIG_BCACHE_DEBUG - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch_##name || c->name; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - -void __bch_btree_verify(struct bch_fs *, struct btree *); -void bch_data_verify(struct cached_dev *, struct bio *); - -#define bypass_torture_test(d) ((d)->bypass_torture_test) - -#else /* DEBUG */ - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) { return false; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - -static inline void __bch_btree_verify(struct bch_fs *c, struct btree *b) {} -static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} - -#define bypass_torture_test(d) 0 - -#endif - -static inline void bch_btree_verify(struct bch_fs *c, struct btree *b) -{ - if (verify_btree_ondisk(c)) - __bch_btree_verify(c, b); -} - -#ifdef CONFIG_DEBUG_FS -void bch_fs_debug_exit(struct bch_fs *); -void bch_fs_debug_init(struct bch_fs *); -#else -static inline void bch_fs_debug_exit(struct bch_fs *c) {} -static inline void bch_fs_debug_init(struct bch_fs *c) {} -#endif - -void bch_debug_exit(void); -int bch_debug_init(void); - -#endif diff --git a/libbcache/dirent.c b/libbcache/dirent.c deleted file mode 100644 index f961e881..00000000 --- a/libbcache/dirent.c +++ /dev/null @@ -1,427 +0,0 @@ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "dirent.h" -#include "fs.h" -#include "keylist.h" 
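The debugfs read handlers above (bch_read_btree() and friends) all follow the same pattern: format one record at a time into a page-sized staging buffer, then let flush_buf() drain as much as fits into the caller's buffer and carry the remainder over to the next read() call. A standalone userspace sketch of that drain step, with copy_to_user() replaced by memcpy() and all names invented:

/* Userspace sketch of the flush_buf() pattern: drain as much of a
 * staging buffer as fits in the destination, keep the rest staged. */
#include <stdio.h>
#include <string.h>

struct dump {
	char	buf[64];	/* staging buffer */
	size_t	bytes;		/* bytes currently staged */
	char	*ubuf;		/* destination */
	size_t	size;		/* space left in destination */
	size_t	ret;		/* bytes written so far */
};

static void flush_buf(struct dump *i)
{
	size_t n = i->bytes < i->size ? i->bytes : i->size;

	memcpy(i->ubuf, i->buf, n);
	i->ret   += n;
	i->ubuf  += n;
	i->size  -= n;
	i->bytes -= n;
	memmove(i->buf, i->buf + n, i->bytes);
}

int main(void)
{
	char out[16];
	struct dump i = { .ubuf = out, .size = sizeof(out) - 1 };

	for (int rec = 0; rec < 3 && i.size; rec++) {
		i.bytes += snprintf(i.buf + i.bytes, sizeof(i.buf) - i.bytes,
				    "record %d\n", rec);
		flush_buf(&i);	/* stops filling once out is full */
	}
	out[i.ret] = '\0';
	fputs(out, stdout);
	return 0;
}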
-#include "str_hash.h" - -#include <linux/dcache.h> - -unsigned bch_dirent_name_bytes(struct bkey_s_c_dirent d) -{ - unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent); - - while (len && !d.v->d_name[len - 1]) - --len; - - return len; -} - -static u64 bch_dirent_hash(const struct bch_hash_info *info, - const struct qstr *name) -{ - struct bch_str_hash_ctx ctx; - - bch_str_hash_init(&ctx, info); - bch_str_hash_update(&ctx, info, name->name, name->len); - - /* [0,2) reserved for dots */ - return max_t(u64, bch_str_hash_end(&ctx, info), 2); -} - -static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch_dirent_hash(info, key); -} - -static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = QSTR_INIT(d.v->d_name, bch_dirent_name_bytes(d)); - - return bch_dirent_hash(info, &name); -} - -static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - int len = bch_dirent_name_bytes(l); - const struct qstr *r = _r; - - return len - r->len ?: memcmp(l.v->d_name, r->name, len); -} - -static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - int l_len = bch_dirent_name_bytes(l); - int r_len = bch_dirent_name_bytes(r); - - return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); -} - -static const struct bch_hash_desc dirent_hash_desc = { - .btree_id = BTREE_ID_DIRENTS, - .key_type = BCH_DIRENT, - .whiteout_type = BCH_DIRENT_WHITEOUT, - .hash_key = dirent_hash_key, - .hash_bkey = dirent_hash_bkey, - .cmp_key = dirent_cmp_key, - .cmp_bkey = dirent_cmp_bkey, -}; - -static const char *bch_dirent_invalid(const struct bch_fs *c, - struct bkey_s_c k) -{ - switch (k.k->type) { - case BCH_DIRENT: - return bkey_val_bytes(k.k) < sizeof(struct bch_dirent) - ? "value too small" - : NULL; - - case BCH_DIRENT_WHITEOUT: - return bkey_val_bytes(k.k) != 0 - ? 
"value size should be zero" - : NULL; - - default: - return "invalid type"; - } -} - -static void bch_dirent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d; - - switch (k.k->type) { - case BCH_DIRENT: - d = bkey_s_c_to_dirent(k); - - if (size) { - unsigned n = min_t(unsigned, size, - bch_dirent_name_bytes(d)); - memcpy(buf, d.v->d_name, n); - buf[size - 1] = '\0'; - buf += n; - size -= n; - } - - scnprintf(buf, size, " -> %llu", d.v->d_inum); - break; - case BCH_DIRENT_WHITEOUT: - scnprintf(buf, size, "whiteout"); - break; - } -} - -const struct bkey_ops bch_bkey_dirent_ops = { - .key_invalid = bch_dirent_invalid, - .val_to_text = bch_dirent_to_text, -}; - -static struct bkey_i_dirent *dirent_create_key(u8 type, - const struct qstr *name, u64 dst) -{ - struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + - DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len, - sizeof(u64)); - - dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (!dirent) - return NULL; - - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = u64s; - dirent->v.d_inum = cpu_to_le64(dst); - dirent->v.d_type = type; - - memcpy(dirent->v.d_name, name->name, name->len); - memset(dirent->v.d_name + name->len, 0, - bkey_val_bytes(&dirent->k) - - (sizeof(struct bch_dirent) + name->len)); - - EBUG_ON(bch_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); - - return dirent; -} - -int bch_dirent_create(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *journal_seq, int flags) -{ - struct bkey_i_dirent *dirent; - int ret; - - dirent = dirent_create_key(type, name, dst_inum); - if (!dirent) - return -ENOMEM; - - ret = bch_hash_set(dirent_hash_desc, hash_info, c, dir_inum, - journal_seq, &dirent->k_i, flags); - kfree(dirent); - - return ret; -} - -static void dirent_copy_target(struct bkey_i_dirent *dst, - struct bkey_s_c_dirent src) -{ - dst->v.d_inum = src.v->d_inum; - dst->v.d_type = src.v->d_type; -} - -static struct bpos bch_dirent_pos(struct bch_inode_info *ei, - const struct qstr *name) -{ - return POS(ei->vfs_inode.i_ino, bch_dirent_hash(&ei->str_hash, name)); -} - -int bch_dirent_rename(struct bch_fs *c, - struct inode *src_dir, const struct qstr *src_name, - struct inode *dst_dir, const struct qstr *dst_name, - u64 *journal_seq, enum bch_rename_mode mode) -{ - struct bch_inode_info *src_ei = to_bch_ei(src_dir); - struct bch_inode_info *dst_ei = to_bch_ei(dst_dir); - struct btree_iter src_iter, dst_iter, whiteout_iter; - struct bkey_s_c old_src, old_dst; - struct bkey delete; - struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos src_pos = bch_dirent_pos(src_ei, src_name); - struct bpos dst_pos = bch_dirent_pos(dst_ei, dst_name); - bool need_whiteout; - int ret = -ENOMEM; - - bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos); - bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos); - bch_btree_iter_link(&src_iter, &dst_iter); - - bch_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos); - bch_btree_iter_link(&src_iter, &whiteout_iter); - - if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(0, src_name, 0); - if (!new_src) - goto err; - } else { - new_src = (void *) &delete; - } - - new_dst = dirent_create_key(0, dst_name, 0); - if (!new_dst) - goto err; -retry: - /* - * Note that on -EINTR/dropped locks we're not restarting the lookup - * from the original hashed position (like we do when creating dirents, - * in 
bch_hash_set) - we never move existing dirents to different slot: - */ - old_src = bch_hash_lookup_at(dirent_hash_desc, - &src_ei->str_hash, - &src_iter, src_name); - if ((ret = btree_iter_err(old_src))) - goto err; - - ret = bch_hash_needs_whiteout(dirent_hash_desc, - &src_ei->str_hash, - &whiteout_iter, &src_iter); - if (ret < 0) - goto err; - need_whiteout = ret; - - /* - * Note that in BCH_RENAME mode, we're _not_ checking if - * the target already exists - we're relying on the VFS - * to do that check for us for correctness: - */ - old_dst = mode == BCH_RENAME - ? bch_hash_hole_at(dirent_hash_desc, &dst_iter) - : bch_hash_lookup_at(dirent_hash_desc, - &dst_ei->str_hash, - &dst_iter, dst_name); - if ((ret = btree_iter_err(old_dst))) - goto err; - - switch (mode) { - case BCH_RENAME: - bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - /* - * If we couldn't insert new_dst at its hashed - * position (dst_pos) due to a hash collision, - * and we're going to be deleting in - * between the hashed position and first empty - * slot we found - just overwrite the pos we - * were going to delete: - * - * Note: this is a correctness issue, in this - * situation bch_hash_needs_whiteout() could - * return false when the whiteout would have - * been needed if we inserted at the pos - * __dirent_find_hole() found - */ - new_dst->k.p = src_iter.pos; - ret = bch_btree_insert_at(c, NULL, NULL, - journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, - &new_dst->k_i)); - goto err; - } - - if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_OVERWRITE: - bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - /* - * Same case described above - - * bch_hash_needs_whiteout could spuriously - * return false, but we have to insert at - * dst_iter.pos because we're overwriting - * another dirent: - */ - new_src->k.type = BCH_DIRENT_WHITEOUT; - } else if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_EXCHANGE: - dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - break; - } - - new_src->k.p = src_iter.pos; - new_dst->k.p = dst_iter.pos; - ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i), - BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i)); -err: - if (ret == -EINTR) - goto retry; - - bch_btree_iter_unlock(&whiteout_iter); - bch_btree_iter_unlock(&dst_iter); - bch_btree_iter_unlock(&src_iter); - - if (new_src != (void *) &delete) - kfree(new_src); - kfree(new_dst); - return ret; -} - -int bch_dirent_delete(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name, - u64 *journal_seq) -{ - return bch_hash_delete(dirent_hash_desc, hash_info, - c, dir_inum, journal_seq, name); -} - -u64 bch_dirent_lookup(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 inum; - - k = bch_hash_lookup(dirent_hash_desc, hash_info, c, - dir_inum, &iter, name); - if (IS_ERR(k.k)) { - bch_btree_iter_unlock(&iter); - return 0; - } - - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - bch_btree_iter_unlock(&iter); - - 
return inum; -} - -int bch_empty_dir(struct bch_fs *c, u64 dir_inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) { - if (k.k->p.inode > dir_inum) - break; - - if (k.k->type == BCH_DIRENT) { - ret = -ENOTEMPTY; - break; - } - } - bch_btree_iter_unlock(&iter); - - return ret; -} - -int bch_readdir(struct bch_fs *c, struct file *file, - struct dir_context *ctx) -{ - struct inode *inode = file_inode(file); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent dirent; - unsigned len; - - if (!dir_emit_dots(file, ctx)) - return 0; - - pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos); - - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(inode->i_ino, ctx->pos), k) { - if (k.k->type != BCH_DIRENT) - continue; - - dirent = bkey_s_c_to_dirent(k); - - pr_debug("saw %llu:%llu (%s) -> %llu", - k.k->p.inode, k.k->p.offset, - dirent.v->d_name, dirent.v->d_inum); - - if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0) - continue; - - if (k.k->p.inode > inode->i_ino) - break; - - len = bch_dirent_name_bytes(dirent); - - pr_debug("emitting %s", dirent.v->d_name); - - /* - * XXX: dir_emit() can fault and block, while we're holding - * locks - */ - if (!dir_emit(ctx, dirent.v->d_name, len, - le64_to_cpu(dirent.v->d_inum), - dirent.v->d_type)) - break; - - ctx->pos = k.k->p.offset + 1; - } - bch_btree_iter_unlock(&iter); - - return 0; -} diff --git a/libbcache/dirent.h b/libbcache/dirent.h deleted file mode 100644 index 158d4cae..00000000 --- a/libbcache/dirent.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _BCACHE_DIRENT_H -#define _BCACHE_DIRENT_H - -extern const struct bkey_ops bch_bkey_dirent_ops; - -struct qstr; -struct file; -struct dir_context; -struct bch_fs; -struct bch_hash_info; - -unsigned bch_dirent_name_bytes(struct bkey_s_c_dirent); -int bch_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, - u8, const struct qstr *, u64, u64 *, int); -int bch_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *, u64 *); - -enum bch_rename_mode { - BCH_RENAME, - BCH_RENAME_OVERWRITE, - BCH_RENAME_EXCHANGE, -}; - -int bch_dirent_rename(struct bch_fs *, - struct inode *, const struct qstr *, - struct inode *, const struct qstr *, - u64 *, enum bch_rename_mode); - -u64 bch_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *); - -int bch_empty_dir(struct bch_fs *, u64); -int bch_readdir(struct bch_fs *, struct file *, struct dir_context *); - -#endif /* _BCACHE_DIRENT_H */ - diff --git a/libbcache/error.c b/libbcache/error.c deleted file mode 100644 index ba46d2d1..00000000 --- a/libbcache/error.c +++ /dev/null @@ -1,140 +0,0 @@ -#include "bcache.h" -#include "error.h" -#include "io.h" -#include "notify.h" -#include "super.h" - -void bch_inconsistent_error(struct bch_fs *c) -{ - set_bit(BCH_FS_ERROR, &c->flags); - - switch (c->opts.errors) { - case BCH_ON_ERROR_CONTINUE: - break; - case BCH_ON_ERROR_RO: - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { - /* XXX do something better here? 
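bch_readdir() above scans the dirent btree from POS(inode, ctx->pos), skips keys that sort before the requested position, and stops as soon as a key's inode field moves past the directory. A standalone userspace sketch of that windowed scan over a sorted (inode, offset) keyspace, with made-up sample data:

/* Userspace sketch of the bch_readdir()-style scan: iterate keys
 * sorted by (inode, offset), emitting only entries for one directory
 * inode starting at a given offset.  Sample data is invented. */
#include <stdint.h>
#include <stdio.h>

struct key { uint64_t inode, offset; const char *name; };

static const struct key keys[] = {
	{ 4096, 2, "a" }, { 4096, 7, "b" }, { 4096, 9, "c" },
	{ 4097, 3, "unrelated" },
};

static void readdir_from(uint64_t dir, uint64_t pos)
{
	for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		const struct key *k = &keys[i];

		if (k->inode < dir || (k->inode == dir && k->offset < pos))
			continue;	/* before the requested position */
		if (k->inode > dir)
			break;		/* past this directory */

		printf("%s at %llu\n", k->name, (unsigned long long) k->offset);
		pos = k->offset + 1;	/* resume point, like ctx->pos */
	}
}

int main(void)
{
	readdir_from(4096, 3);	/* prints "b at 7" and "c at 9" */
	return 0;
}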
*/ - bch_fs_stop_async(c); - return; - } - - if (bch_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); - break; - case BCH_ON_ERROR_PANIC: - panic(bch_fmt(c, "panic after error")); - break; - } -} - -void bch_fatal_error(struct bch_fs *c) -{ - if (bch_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); -} - -/* Nonfatal IO errors, IO error/latency accounting: */ - -/* Just does IO error accounting: */ -void bch_account_io_completion(struct bch_dev *ca) -{ - /* - * The halflife of an error is: - * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh - */ - - if (ca->fs->error_decay) { - unsigned count = atomic_inc_return(&ca->io_count); - - while (count > ca->fs->error_decay) { - unsigned errors; - unsigned old = count; - unsigned new = count - ca->fs->error_decay; - - /* - * First we subtract refresh from count; each time we - * succesfully do so, we rescale the errors once: - */ - - count = atomic_cmpxchg(&ca->io_count, old, new); - - if (count == old) { - count = new; - - errors = atomic_read(&ca->io_errors); - do { - old = errors; - new = ((uint64_t) errors * 127) / 128; - errors = atomic_cmpxchg(&ca->io_errors, - old, new); - } while (old != errors); - } - } - } -} - -/* IO error accounting and latency accounting: */ -void bch_account_io_completion_time(struct bch_dev *ca, - unsigned submit_time_us, int op) -{ - struct bch_fs *c; - unsigned threshold; - - if (!ca) - return; - - c = ca->fs; - threshold = op_is_write(op) - ? c->congested_write_threshold_us - : c->congested_read_threshold_us; - - if (threshold && submit_time_us) { - unsigned t = local_clock_us(); - - int us = t - submit_time_us; - int congested = atomic_read(&c->congested); - - if (us > (int) threshold) { - int ms = us / 1024; - c->congested_last_us = t; - - ms = min(ms, CONGESTED_MAX + congested); - atomic_sub(ms, &c->congested); - } else if (congested < 0) - atomic_inc(&c->congested); - } - - bch_account_io_completion(ca); -} - -void bch_nonfatal_io_error_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); - struct bch_fs *c = ca->fs; - unsigned errors = atomic_read(&ca->io_errors); - bool dev; - - if (errors < c->error_limit) { - bch_notify_dev_error(ca, false); - } else { - bch_notify_dev_error(ca, true); - - mutex_lock(&c->state_lock); - dev = bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, - BCH_FORCE_IF_DEGRADED); - if (dev - ? __bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, - BCH_FORCE_IF_DEGRADED) - : bch_fs_emergency_read_only(c)) - bch_err(ca, - "too many IO errors, setting %s RO", - dev ? "device" : "filesystem"); - mutex_unlock(&c->state_lock); - } -} - -void bch_nonfatal_io_error(struct bch_dev *ca) -{ - atomic_add(1 << IO_ERROR_SHIFT, &ca->io_errors); - queue_work(system_long_wq, &ca->io_error_work); -} diff --git a/libbcache/error.h b/libbcache/error.h deleted file mode 100644 index 726b20d4..00000000 --- a/libbcache/error.h +++ /dev/null @@ -1,240 +0,0 @@ -#ifndef _BCACHE_ERROR_H -#define _BCACHE_ERROR_H - -#include <linux/printk.h> - -struct bch_dev; -struct bch_fs; - -/* - * XXX: separate out errors that indicate on disk data is inconsistent, and flag - * superblock as such - */ - -/* Error messages: */ - -/* - * Very fatal logic/inconsistency errors: these indicate that we've majorly - * screwed up at runtime, i.e. it's not likely that it was just caused by the - * data on disk being inconsistent. These BUG(): - * - * XXX: audit and convert to inconsistent() checks - */ - -#define bch_fs_bug(c, ...) 
\ -do { \ - bch_err(c, __VA_ARGS__); \ - BUG(); \ -} while (0) - -#define bch_fs_bug_on(cond, c, ...) \ -do { \ - if (cond) \ - bch_fs_bug(c, __VA_ARGS__); \ -} while (0) - -/* - * Inconsistency errors: The on disk data is inconsistent. If these occur during - * initial recovery, they don't indicate a bug in the running code - we walk all - * the metadata before modifying anything. If they occur at runtime, they - * indicate either a bug in the running code or (less likely) data is being - * silently corrupted under us. - * - * XXX: audit all inconsistent errors and make sure they're all recoverable, in - * BCH_ON_ERROR_CONTINUE mode - */ - -void bch_inconsistent_error(struct bch_fs *); - -#define bch_fs_inconsistent(c, ...) \ -do { \ - bch_err(c, __VA_ARGS__); \ - bch_inconsistent_error(c); \ -} while (0) - -#define bch_fs_inconsistent_on(cond, c, ...) \ -({ \ - int _ret = !!(cond); \ - \ - if (_ret) \ - bch_fs_inconsistent(c, __VA_ARGS__); \ - _ret; \ -}) - -/* - * Later we might want to mark only the particular device inconsistent, not the - * entire filesystem: - */ - -#define bch_dev_inconsistent(ca, ...) \ -do { \ - bch_err(ca, __VA_ARGS__); \ - bch_inconsistent_error((ca)->fs); \ -} while (0) - -#define bch_dev_inconsistent_on(cond, ca, ...) \ -({ \ - int _ret = !!(cond); \ - \ - if (_ret) \ - bch_dev_inconsistent(ca, __VA_ARGS__); \ - _ret; \ -}) - -/* - * Fsck errors: inconsistency errors we detect at mount time, and should ideally - * be able to repair: - */ - -enum { - BCH_FSCK_OK = 0, - BCH_FSCK_ERRORS_NOT_FIXED = 1, - BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, - BCH_FSCK_REPAIR_IMPOSSIBLE = 3, - BCH_FSCK_UNKNOWN_VERSION = 4, -}; - -/* These macros return true if error should be fixed: */ - -/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ - -#ifndef __fsck_err -#define __fsck_err(c, _can_fix, _can_ignore, _nofix_msg, msg, ...) \ -({ \ - bool _fix = false; \ - \ - if (_can_fix && (c)->opts.fix_errors) { \ - bch_err(c, msg ", fixing", ##__VA_ARGS__); \ - set_bit(BCH_FS_FSCK_FIXED_ERRORS, &(c)->flags); \ - _fix = true; \ - } else if (_can_ignore && \ - (c)->opts.errors == BCH_ON_ERROR_CONTINUE) { \ - bch_err(c, msg " (ignoring)", ##__VA_ARGS__); \ - } else { \ - bch_err(c, msg " ("_nofix_msg")", ##__VA_ARGS__); \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ - goto fsck_err; \ - } \ - \ - BUG_ON(!_fix && !_can_ignore); \ - _fix; \ -}) -#endif - -#define __fsck_err_on(cond, c, _can_fix, _can_ignore, _nofix_msg, ...) \ - ((cond) ? __fsck_err(c, _can_fix, _can_ignore, \ - _nofix_msg, ##__VA_ARGS__) : false) - -#define unfixable_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, false, true, "repair unimplemented", ##__VA_ARGS__) - -#define need_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, false, true, "run fsck to correct", ##__VA_ARGS__) - -#define mustfix_fsck_err(c, ...) \ - __fsck_err(c, true, false, "not fixing", ##__VA_ARGS__) - -#define mustfix_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, true, false, "not fixing", ##__VA_ARGS__) - -#define fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, true, true, "not fixing", ##__VA_ARGS__) - -/* - * Fatal errors: these don't indicate a bug, but we can't continue running in RW - * mode - pretty much just due to metadata IO errors: - */ - -void bch_fatal_error(struct bch_fs *); - -#define bch_fs_fatal_error(c, ...) \ -do { \ - bch_err(c, __VA_ARGS__); \ - bch_fatal_error(c); \ -} while (0) - -#define bch_fs_fatal_err_on(cond, c, ...) 
\ -({ \ - int _ret = !!(cond); \ - \ - if (_ret) \ - bch_fs_fatal_error(c, __VA_ARGS__); \ - _ret; \ -}) - -#define bch_dev_fatal_error(ca, ...) \ -do { \ - bch_err(ca, __VA_ARGS__); \ - bch_fatal_error(c); \ -} while (0) - -#define bch_dev_fatal_io_error(ca, fmt, ...) \ -do { \ - printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \ - "fatal IO error on %s for " fmt), \ - (ca)->name, ##__VA_ARGS__); \ - bch_fatal_error((ca)->fs); \ -} while (0) - -#define bch_dev_fatal_io_err_on(cond, ca, ...) \ -({ \ - int _ret = !!(cond); \ - \ - if (_ret) \ - bch_dev_fatal_io_error(ca, __VA_ARGS__); \ - _ret; \ -}) - -/* - * Nonfatal IO errors: either recoverable metadata IO (because we have - * replicas), or data IO - we need to log it and print out a message, but we - * don't (necessarily) want to shut down the fs: - */ - -void bch_account_io_completion(struct bch_dev *); -void bch_account_io_completion_time(struct bch_dev *, unsigned, int); - -void bch_nonfatal_io_error_work(struct work_struct *); - -/* Does the error handling without logging a message */ -void bch_nonfatal_io_error(struct bch_dev *); - -#if 0 -#define bch_fs_nonfatal_io_error(c, ...) \ -do { \ - bch_err(c, __VA_ARGS__); \ - bch_nonfatal_io_error(c); \ -} while (0) -#endif - -/* Logs message and handles the error: */ -#define bch_dev_nonfatal_io_error(ca, fmt, ...) \ -do { \ - printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \ - "IO error on %s for " fmt), \ - (ca)->name, ##__VA_ARGS__); \ - bch_nonfatal_io_error(ca); \ -} while (0) - -#define bch_dev_nonfatal_io_err_on(cond, ca, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) \ - bch_dev_nonfatal_io_error(ca, __VA_ARGS__); \ - _ret; \ -}) - -/* kill? */ - -#define __bcache_io_error(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch_fmt(c, \ - "IO error: " fmt), ##__VA_ARGS__) - -#define bcache_io_error(c, bio, fmt, ...) \ -do { \ - __bcache_io_error(c, fmt, ##__VA_ARGS__); \ - (bio)->bi_error = -EIO; \ -} while (0) - -#endif /* _BCACHE_ERROR_H */ diff --git a/libbcache/extents.c b/libbcache/extents.c deleted file mode 100644 index 4b422fb1..00000000 --- a/libbcache/extents.c +++ /dev/null @@ -1,2498 +0,0 @@ -/* - * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> - * - * Code for managing the extent btree and dynamically updating the writeback - * dirty sector count. - */ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "checksum.h" -#include "debug.h" -#include "dirent.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "journal.h" -#include "super-io.h" -#include "writeback.h" -#include "xattr.h" - -#include <trace/events/bcache.h> - -static enum merge_result bch_extent_merge(struct bch_fs *, struct btree *, - struct bkey_i *, struct bkey_i *); - -static void sort_key_next(struct btree_node_iter *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - i->k += __btree_node_offset_to_key(b, i->k)->u64s; - - if (i->k == i->end) - *i = iter->data[--iter->used]; -} - -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. - */ -#define key_sort_cmp(l, r) \ -({ \ - int _c = bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - _c ? 
_c > 0 : (l).k > (r).k; \ -}) - -static inline bool should_drop_next_key(struct btree_node_iter *iter, - struct btree *b) -{ - struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; - struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); - - if (bkey_whiteout(k)) - return true; - - if (iter->used < 2) - return false; - - if (iter->used > 2 && - key_sort_cmp(r[0], r[1])) - r++; - - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older and - * should be dropped. - */ - return !bkey_cmp_packed(b, - __btree_node_offset_to_key(b, l->k), - __btree_node_offset_to_key(b, r->k)); -} - -struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *dst, - struct btree *b, - struct btree_node_iter *iter) -{ - struct bkey_packed *out = dst->start; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - heap_resort(iter, key_sort_cmp); - - while (!bch_btree_node_iter_end(iter)) { - if (!should_drop_next_key(iter, b)) { - struct bkey_packed *k = - __btree_node_offset_to_key(b, iter->data->k); - - bkey_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } - - sort_key_next(iter, b, iter->data); - heap_sift(iter, 0, key_sort_cmp); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Common among btree and extent ptrs */ - -const struct bch_extent_ptr * -bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - -unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e) -{ - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; - - extent_for_each_ptr(e, ptr) - nr_ptrs++; - - return nr_ptrs; -} - -unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c k) -{ - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; - - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr(e, ptr) - nr_ptrs += !ptr->cached; - break; - - case BCH_RESERVATION: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return nr_ptrs; -} - -/* returns true if equal */ -static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r) -{ - return extent_crc_type(l) == extent_crc_type(r) && - !memcmp(l, r, extent_entry_bytes(to_entry(l))); -} - -/* Increment pointers after @crc by crc's offset until the next crc entry: */ -void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc) -{ - union bch_extent_entry *entry; - - extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) { - if (!extent_entry_is_ptr(entry)) - return; - - entry->ptr.offset += crc_offset(crc); - } -} - -/* - * We're writing another replica for this extent, so while we've got the data in - * memory we'll be computing a new checksum for the currently live data. 
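bch_key_sort_fix_overlapping() above merges several sorted bsets through a heap whose comparison, key_sort_cmp(), breaks ties so the newer of two equal keys is emitted first, letting should_drop_next_key() discard the older duplicate. A simplified, standalone userspace sketch of that newest-wins merge, reduced to two sorted integer arrays and no heap:

/* Userspace sketch of a newest-wins merge: when the same key appears
 * in both inputs, keep the one from the newer set and drop the other. */
#include <stdio.h>

static void merge_newest_wins(const int *newer, int n_newer,
			      const int *older, int n_older)
{
	int i = 0, j = 0;

	while (i < n_newer || j < n_older) {
		if (j == n_older || (i < n_newer && newer[i] < older[j]))
			printf("%d (newer set)\n", newer[i++]);
		else if (i == n_newer || older[j] < newer[i])
			printf("%d (older set)\n", older[j++]);
		else {
			/* equal keys: the newer one wins, the older is dropped */
			printf("%d (newer wins, duplicate dropped)\n", newer[i++]);
			j++;
		}
	}
}

int main(void)
{
	const int newer[] = { 2, 5, 9 };
	const int older[] = { 1, 5, 7 };

	merge_newest_wins(newer, 3, older, 3);
	return 0;
}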
- * - * If there are other replicas we aren't moving, and they are checksummed but - * not compressed, we can modify them to point to only the data that is - * currently live (so that readers won't have to bounce) while we've got the - * checksum we need: - * - * XXX: to guard against data being corrupted while in memory, instead of - * recomputing the checksum here, it would be better in the read path to instead - * of computing the checksum of the entire extent: - * - * | extent | - * - * compute the checksums of the live and dead data separately - * | dead data || live data || dead data | - * - * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then - * use crc_live here (that we verified was correct earlier) - * - * note: doesn't work with encryption - */ -void bch_extent_narrow_crcs(struct bkey_s_extent e) -{ - union bch_extent_crc *crc; - bool have_wide = false, have_narrow = false; - struct bch_csum csum = { 0 }; - unsigned csum_type = 0; - - extent_for_each_crc(e, crc) { - if (crc_compression_type(crc) || - bch_csum_type_is_encryption(crc_csum_type(crc))) - continue; - - if (crc_uncompressed_size(e.k, crc) != e.k->size) { - have_wide = true; - } else { - have_narrow = true; - csum = crc_csum(crc); - csum_type = crc_csum_type(crc); - } - } - - if (!have_wide || !have_narrow) - return; - - extent_for_each_crc(e, crc) { - if (crc_compression_type(crc)) - continue; - - if (crc_uncompressed_size(e.k, crc) != e.k->size) { - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - BUG(); - case BCH_EXTENT_CRC32: - if (bch_crc_bytes[csum_type] > 4) - continue; - - bch_extent_crc_narrow_pointers(e, crc); - crc->crc32._compressed_size = e.k->size - 1; - crc->crc32._uncompressed_size = e.k->size - 1; - crc->crc32.offset = 0; - crc->crc32.csum_type = csum_type; - crc->crc32.csum = csum.lo; - break; - case BCH_EXTENT_CRC64: - if (bch_crc_bytes[csum_type] > 10) - continue; - - bch_extent_crc_narrow_pointers(e, crc); - crc->crc64._compressed_size = e.k->size - 1; - crc->crc64._uncompressed_size = e.k->size - 1; - crc->crc64.offset = 0; - crc->crc64.csum_type = csum_type; - crc->crc64.csum_lo = csum.lo; - crc->crc64.csum_hi = csum.hi; - break; - case BCH_EXTENT_CRC128: - if (bch_crc_bytes[csum_type] > 16) - continue; - - bch_extent_crc_narrow_pointers(e, crc); - crc->crc128._compressed_size = e.k->size - 1; - crc->crc128._uncompressed_size = e.k->size - 1; - crc->crc128.offset = 0; - crc->crc128.csum_type = csum_type; - crc->crc128.csum = csum; - break; - } - } - } -} - -void bch_extent_drop_redundant_crcs(struct bkey_s_extent e) -{ - union bch_extent_entry *entry = e.v->start; - union bch_extent_crc *crc, *prev = NULL; - - while (entry != extent_entry_last(e)) { - union bch_extent_entry *next = extent_entry_next(entry); - size_t crc_u64s = extent_entry_u64s(entry); - - if (!extent_entry_is_crc(entry)) - goto next; - - crc = entry_to_crc(entry); - - if (next == extent_entry_last(e)) { - /* crc entry with no pointers after it: */ - goto drop; - } - - if (extent_entry_is_crc(next)) { - /* no pointers before next crc entry: */ - goto drop; - } - - if (prev && crc_cmp(crc, prev)) { - /* identical to previous crc entry: */ - goto drop; - } - - if (!prev && - !crc_csum_type(crc) && - !crc_compression_type(crc)) { - /* null crc entry: */ - bch_extent_crc_narrow_pointers(e, crc); - goto drop; - } - - prev = crc; -next: - entry = next; - continue; -drop: - memmove_u64s_down(crc, next, - (u64 *) extent_entry_last(e) - (u64 *) next); - e.k->u64s -= crc_u64s; - } - - 
EBUG_ON(bkey_val_u64s(e.k) && !bch_extent_nr_ptrs(e.c)); -} - -static bool should_drop_ptr(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) -{ - return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr); -} - -static void bch_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) -{ - struct bch_extent_ptr *ptr = &e.v->start->ptr; - bool dropped = false; - - while ((ptr = extent_ptr_next(e, ptr))) - if (should_drop_ptr(c, e.c, ptr)) { - __bch_extent_drop_ptr(e, ptr); - dropped = true; - } else - ptr++; - - if (dropped) - bch_extent_drop_redundant_crcs(e); -} - -static bool bch_ptr_normalize(struct bch_fs *c, struct btree *bk, - struct bkey_s k) -{ - return bch_extent_normalize(c, k); -} - -static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) -{ - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; - - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); - - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = swab64(entry->crc64.csum_hi); - entry->crc128.csum.lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_ptr: - break; - } - } - break; - } - } -} - -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; - - if (ptr->dev >= c->sb.nr_devices) - return "pointer to invalid device"; - - ca = c->devs[ptr->dev]; - if (!ca) - return "pointer to invalid device"; - - extent_for_each_ptr(e, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; - - if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets) - return "offset past end of device"; - - if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket) - return "offset before first bucket"; - - if ((ptr->offset & (ca->mi.bucket_size - 1)) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; - - if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data)) - return "device not marked as containing data"; - - return NULL; -} - -static size_t extent_print_ptrs(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c_extent e) -{ - char *out = buf, *end = buf + size; - const union bch_extent_entry *entry; - const union bch_extent_crc *crc; - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - bool first = true; - -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - - extent_for_each_entry(e, entry) { - if (!first) - p(" "); - - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = entry_to_crc(entry); - - p("crc: c_size %u size %u offset %u csum %u compress %u", - crc_compressed_size(e.k, crc), - crc_uncompressed_size(e.k, crc), - crc_offset(crc), crc_csum_type(crc), - crc_compression_type(crc)); - break; - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = c->devs[ptr->dev]; - - p("ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ca && ptr_stale(ca, ptr) - ? " stale" : ""); - break; - default: - p("(invalid extent entry %.16llx)", *((u64 *) entry)); - goto out; - } - - first = false; - } -out: - if (bkey_extent_is_cached(e.k)) - p(" cached"); -#undef p - return out - buf; -} - -/* Btree ptrs */ - -static const char *bch_btree_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k) -{ - if (bkey_extent_is_cached(k.k)) - return "cached"; - - if (k.k->size) - return "nonzero key size"; - - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; - - switch (k.k->type) { - case BCH_EXTENT: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - const char *reason; - - extent_for_each_entry(e, entry) - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - extent_for_each_ptr_crc(e, ptr, crc) { - reason = extent_ptr_invalid(c, e, ptr, - c->sb.btree_node_size, - true); - if (reason) - return reason; - } - - if (crc) - return "has crc field"; - - return NULL; - } - - default: - return "invalid value type"; - } -} - -static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - unsigned seq; - const char *err; - char buf[160]; - struct bucket *g; - struct bch_dev *ca; - unsigned replicas = 0; - bool bad; - - extent_for_each_ptr(e, ptr) { - ca = c->devs[ptr->dev]; - g = PTR_BUCKET(ca, ptr); - replicas++; - - err = "stale"; - if (ptr_stale(ca, ptr)) - goto err; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - g->mark.data_type != BUCKET_BTREE; - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - err = "inconsistent"; - if (bad) - goto err; - } - - if (replicas < c->sb.meta_replicas_have) { - bch_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); - bch_fs_bug(c, - "btree key bad (too few replicas, %u < %u): %s", - replicas, c->sb.meta_replicas_have, buf); - return; - } - - return; -err: - bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - bch_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i " - "gen %i last_gc %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, - ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], - (unsigned) g->mark.counter); -} - -static void bch_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - char *out = buf, *end = buf + size; - const char *invalid; - -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); - - invalid = bch_btree_ptr_invalid(c, k); - if (invalid) - p(" invalid: %s", invalid); -#undef p -} - -struct extent_pick_ptr -bch_btree_pick_ptr(struct bch_fs *c, const struct btree *b) -{ - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - const union bch_extent_crc *crc; - const struct bch_extent_ptr *ptr; - struct extent_pick_ptr pick = { .ca = NULL }; - - extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = c->devs[ptr->dev]; - struct btree *root = btree_node_root(c, b); - - if (bch_fs_inconsistent_on(crc, c, - "btree node pointer with crc at btree %u level %u/%u bucket %zu", - b->btree_id, b->level, root ? root->level : -1, - PTR_BUCKET_NR(ca, ptr))) - break; - - if (bch_dev_inconsistent_on(ptr_stale(ca, ptr), ca, - "stale btree node pointer at btree %u level %u/%u bucket %zu", - b->btree_id, b->level, root ? root->level : -1, - PTR_BUCKET_NR(ca, ptr))) - continue; - - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - continue; - - if (pick.ca && pick.ca->mi.tier < ca->mi.tier) - continue; - - if (!percpu_ref_tryget(&ca->io_ref)) - continue; - - if (pick.ca) - percpu_ref_put(&pick.ca->io_ref); - - pick.ca = ca; - pick.ptr = *ptr; - } - - return pick; -} - -const struct bkey_ops bch_bkey_btree_ops = { - .key_invalid = bch_btree_ptr_invalid, - .key_debugcheck = btree_ptr_debugcheck, - .val_to_text = bch_btree_ptr_to_text, - .swab = bch_ptr_swab, -}; - -/* Extents */ - -static bool __bch_cut_front(struct bpos where, struct bkey_s k) -{ - u64 len = 0; - - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return false; - - EBUG_ON(bkey_cmp(where, k.k->p) > 0); - - len = k.k->p.offset - where.offset; - - BUG_ON(len > k.k->size); - - /* - * Don't readjust offset if the key size is now 0, because that could - * cause offset to point to the next bucket: - */ - if (!len) - __set_bkey_deleted(k.k); - else if (bkey_extent_is_data(k.k)) { - struct bkey_s_extent e = bkey_s_to_extent(k); - struct bch_extent_ptr *ptr; - union bch_extent_crc *crc, *prev_crc = NULL; - - extent_for_each_ptr_crc(e, ptr, crc) { - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - ptr->offset += e.k->size - len; - break; - case BCH_EXTENT_CRC32: - if (prev_crc != crc) - crc->crc32.offset += e.k->size - len; - break; - case BCH_EXTENT_CRC64: - if (prev_crc != crc) - crc->crc64.offset += e.k->size - len; - break; - case BCH_EXTENT_CRC128: - if (prev_crc != crc) - crc->crc128.offset += e.k->size - len; - break; - } - prev_crc = crc; - } - } - - k.k->size = len; - - return true; -} - -bool bch_cut_front(struct bpos where, struct bkey_i *k) -{ - return __bch_cut_front(where, bkey_i_to_s(k)); -} - -bool bch_cut_back(struct bpos where, struct bkey *k) -{ - u64 len = 0; - - if (bkey_cmp(where, k->p) >= 0) - return false; - - EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); - - len = where.offset - bkey_start_offset(k); - - BUG_ON(len > k->size); - - k->p = where; - k->size = len; - - if (!len) - __set_bkey_deleted(k); - - return true; -} - -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -void bch_key_resize(struct bkey *k, - unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} - -/* - * In extent_sort_fix_overlapping(), insert_fixup_extent(), - * extent_merge_inline() - we're modifying keys in place that are packed. 
To do - * that we have to unpack the key, modify the unpacked key - then this - * copies/repacks the unpacked to the original as necessary. - */ -static bool __extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - bool ret; - - if ((dst_unpacked = packed_to_bkey(dst))) { - dst_unpacked->k = *src; - ret = true; - } else { - ret = bkey_pack_key(dst, src, f); - } - - if (ret && iter) - bch_verify_key_order(b, iter, dst); - - return ret; -} - -static void extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) -{ - BUG_ON(!__extent_save(b, iter, dst, src)); -} - -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. - */ -#define extent_sort_cmp(l, r) \ -({ \ - struct bkey _ul = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (l).k)); \ - struct bkey _ur = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \ - _c ? _c > 0 : (l).k < (r).k; \ -}) - -static inline void extent_sort_sift(struct btree_node_iter *iter, - struct btree *b, size_t i) -{ - heap_sift(iter, i, extent_sort_cmp); -} - -static inline void extent_sort_next(struct btree_node_iter *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - sort_key_next(iter, b, i); - heap_sift(iter, i - iter->data, extent_sort_cmp); -} - -static void extent_sort_append(struct bch_fs *c, - struct btree *b, - struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev, - struct bkey_packed *k) -{ - struct bkey_format *f = &b->format; - BKEY_PADDED(k) tmp; - - if (bkey_whiteout(k)) - return; - - bkey_unpack(b, &tmp.k, k); - - if (*prev && - bch_extent_merge(c, b, (void *) *prev, &tmp.k)) - return; - - if (*prev) { - bkey_pack(*prev, (void *) *prev, f); - - btree_keys_account_key_add(nr, 0, *prev); - *prev = bkey_next(*prev); - } else { - *prev = start; - } - - bkey_copy(*prev, &tmp.k); -} - -struct btree_nr_keys bch_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *dst, - struct btree *b, - struct btree_node_iter *iter) -{ - struct bkey_format *f = &b->format; - struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *out, *lk, *rk; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - heap_resort(iter, extent_sort_cmp); - - while (!bch_btree_node_iter_end(iter)) { - lk = __btree_node_offset_to_key(b, _l->k); - - if (iter->used == 1) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - _r = iter->data + 1; - if (iter->used > 2 && - extent_sort_cmp(_r[0], _r[1])) - _r++; - - rk = __btree_node_offset_to_key(b, _r->k); - - l = __bkey_disassemble(b, lk, &l_unpacked); - r = __bkey_disassemble(b, rk, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_sort_next(iter, b, _r); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't 
overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. - */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - sort_key_next(iter, b, _r); - } else { - __bch_cut_front(l.k->p, r); - extent_save(b, NULL, rk, r.k); - } - - extent_sort_sift(iter, b, _r - iter->data); - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_reassemble(&tmp.k, l.s_c); - bch_cut_back(bkey_start_pos(r.k), &tmp.k.k); - - __bch_cut_front(r.k->p, l); - extent_save(b, NULL, lk, l.k); - - extent_sort_sift(iter, b, 0); - - extent_sort_append(c, b, &nr, dst->start, &prev, - bkey_to_packed(&tmp.k)); - } else { - bch_cut_back(bkey_start_pos(r.k), l.k); - extent_save(b, NULL, lk, l.k); - } - } - - if (prev) { - bkey_pack(prev, (void *) prev, f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = dst->start; - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; - struct bch_fs_usage stats; - - /* for deleting: */ - struct bkey_i whiteout; - bool do_journal; - bool deleting; -}; - -static void bch_add_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - struct bch_fs *c = s->trans->c; - struct btree *b = s->insert->iter->nodes[0]; - - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); - - if (!sectors) - return; - - bch_mark_key(c, k, sectors, false, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq); - - if (bkey_extent_is_data(k.k) && - !bkey_extent_is_cached(k.k)) - bcache_dev_sectors_dirty_add(c, k.k->p.inode, offset, sectors); -} - -static void bch_subtract_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - bch_add_sectors(s, k, offset, -sectors); -} - -/* These wrappers subtract exactly the sectors that we're removing from @k */ -static void bch_cut_subtract_back(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch_subtract_sectors(s, k.s_c, where.offset, - k.k->p.offset - where.offset); - bch_cut_back(where, k.k); -} - -static void bch_cut_subtract_front(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), - where.offset - bkey_start_offset(k.k)); - __bch_cut_front(where, k); -} - -static void bch_drop_subtract(struct extent_insert_state *s, struct bkey_s k) -{ - if (k.k->size) - bch_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - k.k->size = 0; - __set_bkey_deleted(k.k); -} - -/* - * Note: If this returns true because only some pointers matched, - * we can lose some caching that had happened in the interim. - * Because cache promotion only promotes the part of the extent - * actually read, and not the whole extent, and due to the key - * splitting done in bch_extent_insert_fixup, preserving such - * caching is difficult. 
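The overlap handling in bch_extent_sort_fix_overlapping() above reduces to a small case analysis on the older key: keep it untouched, trim its front, trim its back, split it around the newer key, or drop it entirely. A simplified, self-contained sketch of those cases on plain [start, end) ranges; the demo_* names are invented, and the real code of course operates on packed bkeys and adjusts device pointers as it trims:

#include <assert.h>
#include <stdio.h>

/* Simplified extent: a half-open range [start, end) in sectors. */
struct demo_extent {
	unsigned long long start, end;
};

/* What survives of the OLDER extent after trimming it against the NEWER
 * one: nothing, one trimmed piece, or two pieces (the split case). */
struct demo_overlap_result {
	int nr_pieces;
	struct demo_extent piece[2];
};

static struct demo_overlap_result
demo_trim_older(struct demo_extent old, struct demo_extent new)
{
	struct demo_overlap_result r = { 0 };

	if (old.end <= new.start || new.end <= old.start) {
		/* no overlap: the older extent survives untouched */
		r.piece[r.nr_pieces++] = old;
		return r;
	}

	if (old.start < new.start)	/* cut back: keep the front of old */
		r.piece[r.nr_pieces++] =
			(struct demo_extent) { old.start, new.start };

	if (old.end > new.end)		/* cut front: keep the back of old */
		r.piece[r.nr_pieces++] =
			(struct demo_extent) { new.end, old.end };

	/* if the newer extent covers the older one completely, nothing is kept */
	return r;
}

int main(void)
{
	struct demo_extent old = { 0, 100 }, new = { 40, 60 };
	struct demo_overlap_result r = demo_trim_older(old, new);
	int i;

	assert(r.nr_pieces == 2);	/* old gets split around new */
	for (i = 0; i < r.nr_pieces; i++)
		printf("surviving piece of old: [%llu, %llu)\n",
		       r.piece[i].start, r.piece[i].end);
	return 0;
}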
- */ -static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r) -{ - struct bkey_s_c_extent le, re; - const struct bch_extent_ptr *lp, *rp; - s64 offset; - - BUG_ON(!l.k->size || !r.k->size); - - if (l.k->type != r.k->type || - bversion_cmp(l.k->version, r.k->version)) - return false; - - switch (l.k->type) { - case KEY_TYPE_COOKIE: - return !memcmp(bkey_s_c_to_cookie(l).v, - bkey_s_c_to_cookie(r).v, - sizeof(struct bch_cookie)); - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - le = bkey_s_c_to_extent(l); - re = bkey_s_c_to_extent(r); - - /* - * bkey_cmpxchg() handles partial matches - when either l or r - * has been trimmed - so we need just to handle l or r not - * starting at the same place when checking for a match here. - * - * If the starts of the keys are different, we just apply that - * offset to the device pointer offsets when checking those - - * matching how bch_cut_front() adjusts device pointer offsets - * when adjusting the start of a key: - */ - offset = bkey_start_offset(l.k) - bkey_start_offset(r.k); - - /* - * XXX: perhaps we only raced with copygc or tiering replacing - * one of the pointers: it should suffice to find _any_ matching - * pointer - */ - - if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k)) - return false; - - extent_for_each_ptr(le, lp) { - const union bch_extent_entry *entry = - vstruct_idx(re.v, (u64 *) lp - le.v->_data); - - if (!extent_entry_is_ptr(entry)) - return false; - - rp = &entry->ptr; - - if (lp->offset != rp->offset + offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return false; - } - - return true; - default: - return false; - } - -} - -/* - * Returns true on success, false on failure (and false means @new no longer - * overlaps with @k) - * - * If returned true, we may have inserted up to one key in @b. - * If returned false, we may have inserted up to two keys in @b. - * - * On return, there is room in @res for at least one more key of the same size - * as @new. - */ -enum extent_insert_hook_ret bch_extent_cmpxchg(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *new) -{ - struct bch_replace_info *replace = container_of(hook, - struct bch_replace_info, hook); - struct bkey_i *old = &replace->key; - - EBUG_ON(bkey_cmp(committed_pos, bkey_start_pos(&new->k)) < 0); - - /* must have something to compare against */ - EBUG_ON(!bkey_val_u64s(&old->k)); - - /* new must be a subset of old */ - EBUG_ON(bkey_cmp(new->k.p, old->k.p) > 0 || - bkey_cmp(bkey_start_pos(&new->k), bkey_start_pos(&old->k)) < 0); - - if (k.k && bch_extent_cmpxchg_cmp(k, bkey_i_to_s_c(old))) { - replace->successes++; - return BTREE_HOOK_DO_INSERT; - } else { - replace->failures++; - return BTREE_HOOK_NO_INSERT; - } -} - -static bool bch_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) - -static enum btree_insert_ret -extent_insert_should_stop(struct extent_insert_state *s) -{ - struct btree *b = s->insert->iter->nodes[0]; - - /* - * Check if we have sufficient space in both the btree node and the - * journal reservation: - * - * Each insert checks for room in the journal entry, but we check for - * room in the btree node up-front. In the worst case, bkey_cmpxchg() - * will insert two keys, and one iteration of this room will insert one - * key, so we need room for three keys. 
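The worst-case accounting described in the comment above is checked once, up front, rather than on every key insertion. A toy sketch of that pattern, assuming a flat pool of u64s as a stand-in for the btree node; the demo_* names are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Toy btree node: a fixed pool of u64s, some of which are already used. */
struct demo_node {
	unsigned capacity_u64s;
	unsigned used_u64s;
};

/* Worst case per pass, per the comment above: a cmpxchg hook may add two
 * keys and the pass itself one more, so demand room for three. */
#define DEMO_WORST_CASE_KEYS	3

static bool demo_insert_fits(const struct demo_node *b, unsigned key_u64s)
{
	return b->capacity_u64s - b->used_u64s >=
		DEMO_WORST_CASE_KEYS * key_u64s;
}

int main(void)
{
	struct demo_node b = { .capacity_u64s = 512, .used_u64s = 500 };

	/* a 6-u64 key needs 18 u64s of slack in the worst case */
	printf("%d\n", demo_insert_fits(&b, 6));	/* prints 0 */

	b.used_u64s = 400;
	printf("%d\n", demo_insert_fits(&b, 6));	/* prints 1 */
	return 0;
}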
- */ - if (!bch_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; - else if (!journal_res_insert_fits(s->trans, s->insert)) - return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ - else - return BTREE_INSERT_OK; -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed *where = - bch_btree_node_iter_bset_pos(node_iter, b, t); - struct bkey_packed *prev = bkey_prev(b, t, where); - struct bkey_packed *next_live_key = where; - unsigned clobber_u64s; - - if (prev) - where = bkey_next(prev); - - while (next_live_key != btree_bkey_last(b, t) && - bkey_deleted(next_live_key)) - next_live_key = bkey_next(next_live_key); - - /* - * Everything between where and next_live_key is now deleted keys, and - * is overwritten: - */ - clobber_u64s = (u64 *) next_live_key - (u64 *) where; - - if (prev && - bch_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) - goto drop_deleted_keys; - - if (next_live_key != btree_bkey_last(b, t) && - bch_extent_merge_inline(c, iter, bkey_to_packed(insert), - next_live_key, false)) - goto drop_deleted_keys; - - bch_bset_insert(b, node_iter, where, insert, clobber_u64s); - bch_btree_node_iter_fix(iter, b, node_iter, t, where, - clobber_u64s, where->u64s); - return; -drop_deleted_keys: - bch_bset_delete(b, where, clobber_u64s); - bch_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0); -} - -static void extent_insert_committed(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = !s->deleting - ? s->insert->k - : &s->whiteout; - BKEY_PADDED(k) split; - - EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); - EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - - if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) - return; - - if (s->deleting && !s->do_journal) { - bch_cut_front(s->committed, insert); - goto done; - } - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - bkey_copy(&split.k, insert); - - if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - bkey_cmp(s->committed, insert->k.p) && - bkey_extent_is_compressed(bkey_i_to_s_c(insert))) { - /* XXX: possibly need to increase our reservation? */ - bch_cut_subtract_back(s, s->committed, - bkey_i_to_s(&split.k)); - bch_cut_front(s->committed, insert); - bch_add_sectors(s, bkey_i_to_s_c(insert), - bkey_start_offset(&insert->k), - insert->k.size); - } else { - bch_cut_back(s->committed, &split.k.k); - bch_cut_front(s->committed, insert); - } - - if (debug_check_bkeys(c)) - bkey_debugcheck(c, iter->nodes[iter->level], - bkey_i_to_s_c(&split.k)); - - bch_btree_journal_key(s->trans, iter, &split.k); - - if (!s->deleting) - extent_bset_insert(c, iter, &split.k); -done: - bch_btree_iter_set_pos_same_leaf(iter, s->committed); - - insert->k.needs_whiteout = false; - s->do_journal = false; - s->trans->did_work = true; -} - -static enum extent_insert_hook_ret -__extent_insert_advance_pos(struct extent_insert_state *s, - struct bpos next_pos, - struct bkey_s_c k) -{ - struct extent_insert_hook *hook = s->trans->hook; - enum extent_insert_hook_ret ret; -#if 0 - /* - * Currently disabled for encryption - broken with fcollapse. 
Will have - * to reenable when versions are exposed for send/receive - versions - * will have to be monotonic then: - */ - if (k.k && k.k->size && - !bversion_zero(s->insert->k->k.version) && - bversion_cmp(k.k->version, s->insert->k->k.version) > 0) { - ret = BTREE_HOOK_NO_INSERT; - } else -#endif - if (hook) - ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); - else - ret = BTREE_HOOK_DO_INSERT; - - EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); - - switch (ret) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - extent_insert_committed(s); - bch_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k)); - - bch_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos); - break; - case BTREE_HOOK_RESTART_TRANS: - return ret; - } - - s->committed = next_pos; - return ret; -} - -/* - * Update iter->pos, marking how much of @insert we've processed, and call hook - * fn: - */ -static enum extent_insert_hook_ret -extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) -{ - struct btree *b = s->insert->iter->nodes[0]; - struct bpos next_pos = bpos_min(s->insert->k->k.p, - k.k ? k.k->p : b->key.k.p); - - /* hole? */ - if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - bool have_uncommitted = bkey_cmp(s->committed, - bkey_start_pos(&s->insert->k->k)) > 0; - - switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - /* - * we had to split @insert and insert the committed - * part - need to bail out and recheck journal - * reservation/btree node before we advance pos past @k: - */ - if (have_uncommitted) - return BTREE_HOOK_NO_INSERT; - break; - case BTREE_HOOK_RESTART_TRANS: - return BTREE_HOOK_RESTART_TRANS; - } - } - - /* avoid redundant calls to hook fn: */ - if (!bkey_cmp(s->committed, next_pos)) - return BTREE_HOOK_DO_INSERT; - - return __extent_insert_advance_pos(s, next_pos, k); -} - -static enum btree_insert_ret -extent_insert_check_split_compressed(struct extent_insert_state *s, - struct bkey_s_c k, - enum bch_extent_overlap overlap) -{ - struct bch_fs *c = s->trans->c; - unsigned sectors; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bkey_extent_is_compressed(k))) { - int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - - if (s->trans->flags & BTREE_INSERT_NOFAIL) - flags |= BCH_DISK_RESERVATION_NOFAIL; - - switch (bch_disk_reservation_add(c, - s->trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - case -EINTR: - return BTREE_INSERT_NEED_GC_LOCK; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} - -static enum btree_insert_ret -extent_squash(struct extent_insert_state *s, struct bkey_i *insert, - struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - bch_cut_subtract_front(s, insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); - break; - - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - bch_cut_subtract_back(s, bkey_start_pos(&insert->k), k); - BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); - - /* - * As the auxiliary tree is indexed by the end of the - * 
key and we've just changed the end, update the - * auxiliary tree. - */ - bch_bset_fix_invalidated_key(b, t, _k); - bch_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); - break; - - case BCH_EXTENT_OVERLAP_ALL: { - struct bpos orig_pos = k.k->p; - - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - - bch_drop_subtract(s, k); - k.k->p = bkey_start_pos(&insert->k); - if (!__extent_save(b, node_iter, _k, k.k)) { - /* - * Couldn't repack: we aren't necessarily able - * to repack if the new key is outside the range - * of the old extent, so we have to split - * @insert: - */ - k.k->p = orig_pos; - extent_save(b, node_iter, _k, k.k); - - if (extent_insert_advance_pos(s, k.s_c) == - BTREE_HOOK_RESTART_TRANS) - return BTREE_INSERT_NEED_TRAVERSE; - - extent_insert_committed(s); - /* - * We split and inserted upto at k.k->p - that - * has to coincide with iter->pos, so that we - * don't have anything more we have to insert - * until we recheck our journal reservation: - */ - EBUG_ON(bkey_cmp(s->committed, k.k->p)); - } else { - bch_bset_fix_invalidated_key(b, t, _k); - bch_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); - } - - break; - } - case BCH_EXTENT_OVERLAP_MIDDLE: { - BKEY_PADDED(k) split; - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); - - bch_cut_back(bkey_start_pos(&insert->k), &split.k.k); - BUG_ON(bkey_deleted(&split.k.k)); - - bch_cut_subtract_front(s, insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); - - bch_add_sectors(s, bkey_i_to_s_c(&split.k), - bkey_start_offset(&split.k.k), - split.k.k.size); - extent_bset_insert(c, iter, &split.k); - break; - } - } - - return BTREE_INSERT_OK; -} - -static enum btree_insert_ret -bch_delete_fixup_extent(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - s->whiteout = *insert; - s->do_journal = false; - - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (bkey_whiteout(k.k)) { - s->committed = bpos_min(insert->k.p, k.k->p); - goto next; - } - - overlap = bch_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret != BTREE_INSERT_OK) - goto stop; - - switch 
(extent_insert_advance_pos(s, k.s_c)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - continue; - case BTREE_HOOK_RESTART_TRANS: - ret = BTREE_INSERT_NEED_TRAVERSE; - goto stop; - } - - s->do_journal = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL) { - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - bch_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, t, _k); - } else if (k.k->needs_whiteout || - bset_written(b, bset(b, t))) { - struct bkey_i discard = *insert; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - bch_cut_front(bkey_start_pos(k.k), &discard); - break; - case BCH_EXTENT_OVERLAP_BACK: - bch_cut_back(k.k->p, &discard.k); - break; - default: - break; - } - - discard.k.needs_whiteout = true; - - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - - extent_bset_insert(c, iter, &discard); - } else { - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - } -next: - bch_cut_front(s->committed, insert); - bch_btree_iter_set_pos_same_leaf(iter, s->committed); - } - - if (bkey_cmp(s->committed, insert->k.p) < 0 && - ret == BTREE_INSERT_OK && - extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) - ret = BTREE_INSERT_NEED_TRAVERSE; -stop: - extent_insert_committed(s); - - bch_fs_usage_apply(c, &s->stats, s->trans->disk_res, - gc_pos_btree_node(b)); - - EBUG_ON(bkey_cmp(iter->pos, s->committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); - - bch_cut_front(iter->pos, insert); - - if (insert->k.size && iter->at_end_of_leaf) - ret = BTREE_INSERT_NEED_TRAVERSE; - - EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK); - - return ret; -} - -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. 
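The cross-bset invariant stated above can be checked mechanically: walking keys in position order, a nonzero-size key may not start before the previous nonzero-size key ended, while zero-size keys are exempt. A small self-contained checker expressing just that rule; the demo_* names are invented, though the layout mirrors real bkeys, which store the end position in k->p and the length in k->size:

#include <stdbool.h>
#include <stdio.h>

/* Toy extent key: "pos" is the end of the extent, start = pos - size. */
struct demo_bkey {
	unsigned long long	pos;
	unsigned		size;
};

static unsigned long long demo_start(const struct demo_bkey *k)
{
	return k->pos - k->size;
}

/* Check the invariant from the comment above: among keys of nonzero size,
 * no two may overlap. Assumes keys[] is already sorted by pos. */
static bool demo_check_no_overlap(const struct demo_bkey *keys, unsigned nr)
{
	unsigned long long prev_end = 0;
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (!keys[i].size)
			continue;	/* zero-size keys may overlap */
		if (demo_start(&keys[i]) < prev_end)
			return false;
		prev_end = keys[i].pos;
	}
	return true;
}

int main(void)
{
	struct demo_bkey ok[]  = { { 8, 8 }, { 10, 0 }, { 16, 8 } };
	struct demo_bkey bad[] = { { 8, 8 }, { 12, 8 } };	/* [4,12) overlaps [0,8) */

	printf("%d %d\n", demo_check_no_overlap(ok, 3),
			  demo_check_no_overlap(bad, 2));	/* prints 1 0 */
	return 0;
}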
- */ -enum btree_insert_ret -bch_insert_fixup_extent(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - struct bkey_packed *_k; - struct bkey unpacked; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - struct extent_insert_state s = { - .trans = trans, - .insert = insert, - .committed = insert->iter->pos, - .deleting = bkey_whiteout(&insert->k->k), - }; - - EBUG_ON(iter->level); - EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size); - - if (s.deleting) - return bch_delete_fixup_extent(&s); - - /* - * As we process overlapping extents, we advance @iter->pos both to - * signal to our caller (btree_insert_key()) how much of @insert->k has - * been inserted, and also to keep @iter->pos consistent with - * @insert->k and the node iterator that we're advancing: - */ - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - - if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch_add_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - - while (bkey_cmp(s.committed, insert->k->k.p) < 0 && - (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK && - (_k = bch_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) - break; - - overlap = bch_extent_overlap(&insert->k->k, k.k); - - ret = extent_insert_check_split_compressed(&s, k.s_c, overlap); - if (ret != BTREE_INSERT_OK) - goto stop; - - if (!k.k->size) - goto squash; - - /* - * Only call advance pos & call hook for nonzero size extents: - * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer - * overlaps with @k: - */ - switch (extent_insert_advance_pos(&s, k.s_c)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - continue; - case BTREE_HOOK_RESTART_TRANS: - ret = BTREE_INSERT_NEED_TRAVERSE; - goto stop; - } - - if (k.k->size && - (k.k->needs_whiteout || bset_written(b, bset(b, t)))) - insert->k->k.needs_whiteout = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(b, t, _k); - _k->needs_whiteout = false; - } -squash: - ret = extent_squash(&s, insert->k, t, _k, k, overlap); - if (ret != BTREE_INSERT_OK) - goto stop; - } - - if (bkey_cmp(s.committed, insert->k->k.p) < 0 && - ret == BTREE_INSERT_OK && - extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) - ret = BTREE_INSERT_NEED_TRAVERSE; -stop: - extent_insert_committed(&s); - /* - * Subtract any remaining sectors from @insert, if we bailed out early - * and didn't fully insert @insert: - */ - if (insert->k->k.size && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch_subtract_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - - bch_fs_usage_apply(c, &s.stats, trans->disk_res, - gc_pos_btree_node(b)); - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, s.committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); - - if (insert->k->k.size && iter->at_end_of_leaf) - ret = BTREE_INSERT_NEED_TRAVERSE; - - 
EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK); - - return ret; -} - -static const char *bch_extent_invalid(const struct bch_fs *c, - struct bkey_s_c k) -{ - if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - - if (!k.k->size) - return "zero key size"; - - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - const union bch_extent_crc *crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (extent_entry_is_crc(entry)) { - crc = entry_to_crc(entry); - - if (crc_offset(crc) + e.k->size > - crc_uncompressed_size(e.k, crc)) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc_compressed_size(e.k, crc); - - if (!bch_checksum_type_valid(c, crc_csum_type(crc))) - return "invalid checksum type"; - - if (crc_compression_type(crc) >= BCH_COMPRESSION_NR) - return "invalid compression type"; - } else { - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - } - } - - return NULL; - } - - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; - } - - default: - return "invalid value type"; - } -} - -static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, - struct bkey_s_c_extent e) -{ - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - struct bucket *g; - unsigned seq, stale; - char buf[160]; - bool bad; - unsigned ptrs_per_tier[BCH_TIER_MAX]; - unsigned replicas = 0; - - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); - - extent_for_each_ptr(e, ptr) { - ca = c->devs[ptr->dev]; - g = PTR_BUCKET(ca, ptr); - replicas++; - ptrs_per_tier[ca->mi.tier]++; - - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - continue; - - stale = 0; - - do { - struct bucket_mark mark; - - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = READ_ONCE(g->mark); - - /* between mark and bucket gen */ - smp_rmb(); - - stale = ptr_stale(ca, ptr); - - bch_fs_bug_on(stale && !ptr->cached, c, - "stale dirty pointer"); - - bch_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); - - if (stale) - break; - - bad = (mark.data_type != BUCKET_DATA || - (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - !mark.owned_by_allocator && - !(ptr->cached - ? 
mark.cached_sectors - : mark.dirty_sectors))); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - if (bad) - goto bad_ptr; - } - - if (replicas > BCH_REPLICAS_MAX) { - bch_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch_fs_bug(c, - "extent key bad (too many replicas: %u): %s", - replicas, buf); - return; - } - - if (!bkey_extent_is_cached(e.k) && - replicas < c->sb.data_replicas_have) { - bch_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch_fs_bug(c, - "extent key bad (too few replicas, %u < %u): %s", - replicas, c->sb.data_replicas_have, buf); - return; - } - - return; - -bad_ptr: - bch_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i " - "gen %i last_gc %i mark 0x%08x", - buf, PTR_BUCKET_NR(ca, ptr), - g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen, - ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], - (unsigned) g->mark.counter); - return; -} - -static void bch_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) -{ - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); - break; - case BCH_RESERVATION: - break; - default: - BUG(); - } -} - -static void bch_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - char *out = buf, *end = buf + size; - const char *invalid; - -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); - - invalid = bch_extent_invalid(c, k); - if (invalid) - p(" invalid: %s", invalid); -#undef p -} - -static unsigned PTR_TIER(struct bch_fs *c, - const struct bch_extent_ptr *ptr) -{ - return c->devs[ptr->dev]->mi.tier; -} - -static void bch_extent_crc_init(union bch_extent_crc *crc, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type) -{ - if (bch_crc_bytes[csum_type] <= 4 && - uncompressed_size <= CRC32_SIZE_MAX && - nonce <= CRC32_NONCE_MAX) { - crc->crc32 = (struct bch_extent_crc32) { - .type = 1 << BCH_EXTENT_ENTRY_crc32, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .compression_type = compression_type, - .csum_type = csum_type, - .csum = *((__le32 *) &csum.lo), - }; - return; - } - - if (bch_crc_bytes[csum_type] <= 10 && - uncompressed_size <= CRC64_SIZE_MAX && - nonce <= CRC64_NONCE_MAX) { - crc->crc64 = (struct bch_extent_crc64) { - .type = 1 << BCH_EXTENT_ENTRY_crc64, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .nonce = nonce, - .compression_type = compression_type, - .csum_type = csum_type, - .csum_lo = csum.lo, - .csum_hi = *((__le16 *) &csum.hi), - }; - return; - } - - if (bch_crc_bytes[csum_type] <= 16 && - uncompressed_size <= CRC128_SIZE_MAX && - nonce <= CRC128_NONCE_MAX) { - crc->crc128 = (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .nonce = nonce, - .compression_type = compression_type, - .csum_type = csum_type, - .csum = csum, - }; - return; - } - - BUG(); -} - -void bch_extent_crc_append(struct bkey_i_extent *e, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, 
unsigned csum_type) -{ - union bch_extent_crc *crc; - - BUG_ON(compressed_size > uncompressed_size); - BUG_ON(uncompressed_size != e->k.size); - BUG_ON(!compressed_size || !uncompressed_size); - - /* - * Look up the last crc entry, so we can check if we need to add - * another: - */ - extent_for_each_crc(extent_i_to_s(e), crc) - ; - - if (!crc && !csum_type && !compression_type) - return; - - if (crc && - crc_compressed_size(&e->k, crc) == compressed_size && - crc_uncompressed_size(&e->k, crc) == uncompressed_size && - crc_offset(crc) == 0 && - crc_nonce(crc) == nonce && - crc_csum_type(crc) == csum_type && - crc_compression_type(crc) == compression_type && - crc_csum(crc).lo == csum.lo && - crc_csum(crc).hi == csum.hi) - return; - - bch_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), - compressed_size, - uncompressed_size, - compression_type, - nonce, csum, csum_type); - __extent_entry_push(e); -} - -/* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. - * - * Returns true if @k should be dropped entirely - * - * For existing keys, only called when btree nodes are being rewritten, not when - * they're merely being compacted/resorted in memory. - */ -bool bch_extent_normalize(struct bch_fs *c, struct bkey_s k) -{ - struct bkey_s_extent e; - - switch (k.k->type) { - case KEY_TYPE_ERROR: - return false; - - case KEY_TYPE_DELETED: - case KEY_TYPE_COOKIE: - return true; - - case KEY_TYPE_DISCARD: - return bversion_zero(k.k->version); - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_to_extent(k); - - bch_extent_drop_stale(c, e); - - if (!bkey_val_u64s(e.k)) { - if (bkey_extent_is_cached(e.k)) { - k.k->type = KEY_TYPE_DISCARD; - if (bversion_zero(k.k->version)) - return true; - } else { - k.k->type = KEY_TYPE_ERROR; - } - } - - return false; - case BCH_RESERVATION: - return false; - default: - BUG(); - } -} - -void bch_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned nr_cached) -{ - struct bch_extent_ptr *ptr; - bool have_higher_tier; - unsigned tier = 0; - - if (!nr_cached) - return; - - do { - have_higher_tier = false; - - extent_for_each_ptr(e, ptr) { - if (!ptr->cached && - PTR_TIER(c, ptr) == tier) { - ptr->cached = true; - nr_cached--; - if (!nr_cached) - return; - } - - if (PTR_TIER(c, ptr) > tier) - have_higher_tier = true; - } - - tier++; - } while (have_higher_tier); -} - -/* - * This picks a non-stale pointer, preferabbly from a device other than - * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale - * pointers to other devices, it will still pick a pointer from avoid. - * Note that it prefers lowered-numbered pointers to higher-numbered pointers - * as the pointers are sorted by tier, hence preferring pointers to tier 0 - * rather than pointers to tier 1. 
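A simplified sketch of the selection policy the comment above describes: skip stale replicas, step off the avoided device whenever an alternative exists, and otherwise prefer the lower (faster) tier. The demo_* names are invented; the real function additionally checks device state and takes io_ref references, which this sketch omits:

#include <stdio.h>

/* Toy replica descriptor: which device it lives on, that device's tier,
 * and whether this copy has gone stale. */
struct demo_replica {
	unsigned dev;
	unsigned tier;
	int	 stale;
};

/* Returns the index of the chosen replica, or -1 if all copies are stale. */
static int demo_pick_replica(const struct demo_replica *r, unsigned nr,
			     int avoid_dev)
{
	int pick = -1;
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (r[i].stale)
			continue;
		if (pick < 0) {
			pick = i;	/* even the avoided device beats nothing */
			continue;
		}
		/* prefer leaving the avoided device if we can */
		if ((int) r[pick].dev == avoid_dev &&
		    (int) r[i].dev != avoid_dev) {
			pick = i;
			continue;
		}
		if ((int) r[i].dev == avoid_dev)
			continue;
		/* otherwise prefer the faster (lower) tier */
		if (r[i].tier < r[pick].tier)
			pick = i;
	}
	return pick;
}

int main(void)
{
	struct demo_replica r[] = {
		{ .dev = 0, .tier = 0, .stale = 1 },
		{ .dev = 1, .tier = 0, .stale = 0 },
		{ .dev = 2, .tier = 1, .stale = 0 },
	};

	/* device 1 holds the best non-stale copy... */
	printf("%d\n", demo_pick_replica(r, 3, -1));	/* prints 1 */
	/* ...but if told to avoid it, fall back to tier 1 on device 2 */
	printf("%d\n", demo_pick_replica(r, 3, 1));	/* prints 2 */
	return 0;
}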
- */ -void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, - struct bch_dev *avoid, - struct extent_pick_ptr *ret) -{ - struct bkey_s_c_extent e; - const union bch_extent_crc *crc; - const struct bch_extent_ptr *ptr; - - switch (k.k->type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_COOKIE: - ret->ca = NULL; - return; - - case KEY_TYPE_ERROR: - ret->ca = ERR_PTR(-EIO); - return; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - ret->ca = NULL; - - extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = c->devs[ptr->dev]; - - if (ptr->cached && ptr_stale(ca, ptr)) - continue; - - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - continue; - - if (ret->ca && - (ca == avoid || - ret->ca->mi.tier < ca->mi.tier)) - continue; - - if (!percpu_ref_tryget(&ca->io_ref)) - continue; - - if (ret->ca) - percpu_ref_put(&ret->ca->io_ref); - - *ret = (struct extent_pick_ptr) { - .crc = crc_to_128(e.k, crc), - .ptr = *ptr, - .ca = ca, - }; - } - - if (!ret->ca && !bkey_extent_is_cached(e.k)) - ret->ca = ERR_PTR(-EIO); - return; - - case BCH_RESERVATION: - ret->ca = NULL; - return; - - default: - BUG(); - } -} - -static enum merge_result bch_extent_merge(struct bch_fs *c, - struct btree *bk, - struct bkey_i *l, struct bkey_i *r) -{ - struct bkey_s_extent el, er; - union bch_extent_entry *en_l, *en_r; - - if (key_merging_disabled(c)) - return BCH_MERGE_NOMERGE; - - /* - * Generic header checks - * Assumes left and right are in order - * Left and right must be exactly aligned - */ - - if (l->k.u64s != r->k.u64s || - l->k.type != r->k.type || - bversion_cmp(l->k.version, r->k.version) || - bkey_cmp(l->k.p, bkey_start_pos(&r->k))) - return BCH_MERGE_NOMERGE; - - switch (l->k.type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_ERROR: - /* These types are mergeable, and no val to check */ - break; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - el = bkey_i_to_s_extent(l); - er = bkey_i_to_s_extent(r); - - extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - unsigned bucket_size; - - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - extent_entry_is_crc(en_l)) - return BCH_MERGE_NOMERGE; - - lp = &en_l->ptr; - rp = &en_r->ptr; - - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; - - /* We don't allow extents to straddle buckets: */ - bucket_size = c->devs[lp->dev]->mi.bucket_size; - - if ((lp->offset & ~((u64) bucket_size - 1)) != - (rp->offset & ~((u64) bucket_size - 1))) - return BCH_MERGE_NOMERGE; - } - - break; - case BCH_RESERVATION: { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); - - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) - return BCH_MERGE_NOMERGE; - break; - } - default: - return BCH_MERGE_NOMERGE; - } - - l->k.needs_whiteout |= r->k.needs_whiteout; - - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch_key_resize(&l->k, KEY_SIZE_MAX); - bch_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; - } - - bch_key_resize(&l->k, l->k.size + r->k.size); - - return BCH_MERGE_MERGE; -} - -static void extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - - 
BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); - - /* - * We don't want the bch_verify_key_order() call in extent_save(), - * because we may be out of order with deleted keys that are about to be - * removed by extent_bset_insert() - */ - - if ((dst_unpacked = packed_to_bkey(dst))) - bkey_copy(dst_unpacked, src); - else - BUG_ON(!bkey_pack(dst, src, f)); -} - -static bool extent_merge_one_overlapping(struct btree_iter *iter, - struct bpos new_pos, - struct bset_tree *t, - struct bkey_packed *k, struct bkey uk, - bool check, bool could_pack) -{ - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - - BUG_ON(!bkey_deleted(k)); - - if (check) { - return !bkey_packed(k) || could_pack; - } else { - uk.p = new_pos; - extent_save(b, node_iter, k, &uk); - bch_bset_fix_invalidated_key(b, t, k); - bch_btree_node_iter_fix(iter, b, node_iter, t, - k, k->u64s, k->u64s); - return true; - } -} - -static bool extent_merge_do_overlapping(struct btree_iter *iter, - struct bkey *m, bool back_merge) -{ - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - struct bset_tree *t; - struct bkey_packed *k; - struct bkey uk; - struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m); - bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); - bool check = true; - - /* - * @m is the new merged extent: - * - * The merge took place in the last bset; we know there can't be any 0 - * size extents overlapping with m there because if so they would have - * been between the two extents we merged. - * - * But in the other bsets, we have to check for and fix such extents: - */ -do_fixup: - for_each_bset(b, t) { - if (t == bset_tree_last(b)) - break; - - /* - * if we don't find this bset in the iterator we already got to - * the end of that bset, so start searching from the end. - */ - k = bch_btree_node_iter_bset_pos(node_iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bkey_prev_all(b, t, k); - if (!k) - continue; - - if (back_merge) { - /* - * Back merge: 0 size extents will be before the key - * that was just inserted (and thus the iterator - * position) - walk backwards to find them - */ - for (; - k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(m)) > 0); - k = bkey_prev_all(b, t, k)) { - if (bkey_cmp(uk.p, m->p) >= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } else { - /* Front merge - walk forwards */ - for (; - k != btree_bkey_last(b, t) && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, m->p) < 0); - k = bkey_next(k)) { - if (bkey_cmp(uk.p, - bkey_start_pos(m)) <= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } - } - - if (check) { - check = false; - goto do_fixup; - } - - return true; -} - -/* - * When merging an extent that we're inserting into a btree node, the new merged - * extent could overlap with an existing 0 size extent - if we don't fix that, - * it'll break the btree node iterator so this code finds those 0 size extents - * and shifts them out of the way. - * - * Also unpacks and repacks. 
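bch_extent_merge() above only merges extents that are logically adjacent, physically contiguous on the same device and generation, confined to one bucket, and small enough that the merged size still fits in a key. A self-contained sketch of those per-pointer checks on a toy single-pointer extent; the demo_* names and the size cap value are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_KEY_SIZE_MAX	(1U << 20)	/* stand-in for KEY_SIZE_MAX */

/* One replica pointer of a toy extent: device, generation, start offset. */
struct demo_ptr {
	unsigned		dev;
	unsigned		gen;
	unsigned long long	offset;
};

/* Toy extent: logical end position, size, and a single replica pointer. */
struct demo_extent {
	unsigned long long	pos;	/* logical end */
	unsigned		size;
	struct demo_ptr		ptr;
};

/* Mirror of the per-pointer checks: logically adjacent, contiguous on disk
 * on the same device/generation, same bucket, and no size overflow. */
static bool demo_extents_mergeable(const struct demo_extent *l,
				   const struct demo_extent *r,
				   unsigned bucket_size)
{
	/* logically adjacent: l ends exactly where r starts */
	if (l->pos != r->pos - r->size)
		return false;

	/* physically contiguous on the same device and generation */
	if (l->ptr.dev != r->ptr.dev ||
	    l->ptr.gen != r->ptr.gen ||
	    l->ptr.offset + l->size != r->ptr.offset)
		return false;

	/* extents may not straddle buckets */
	if (l->ptr.offset / bucket_size != r->ptr.offset / bucket_size)
		return false;

	/* merged key size must stay representable */
	return (unsigned long long) l->size + r->size <= DEMO_KEY_SIZE_MAX;
}

int main(void)
{
	struct demo_extent l = { .pos = 64, .size = 64,
				 .ptr = { .dev = 0, .gen = 3, .offset = 1024 } };
	struct demo_extent r = { .pos = 128, .size = 64,
				 .ptr = { .dev = 0, .gen = 3, .offset = 1088 } };

	printf("%d\n", demo_extents_mergeable(&l, &r, 1024));	/* prints 1 */

	r.ptr.gen = 4;	/* replica was rewritten: no longer mergeable */
	printf("%d\n", demo_extents_mergeable(&l, &r, 1024));	/* prints 0 */
	return 0;
}

When the combined size would exceed the cap, the real code falls back to a partial merge instead: it resizes l to KEY_SIZE_MAX, cuts the front off r, and returns BCH_MERGE_PARTIAL.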
- */ -static bool bch_extent_merge_inline(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_packed *l, - struct bkey_packed *r, - bool back_merge) -{ - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - const struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed *m; - BKEY_PADDED(k) li; - BKEY_PADDED(k) ri; - struct bkey_i *mi; - struct bkey tmp; - - /* - * We need to save copies of both l and r, because we might get a - * partial merge (which modifies both) and then fails to repack - */ - bkey_unpack(b, &li.k, l); - bkey_unpack(b, &ri.k, r); - - m = back_merge ? l : r; - mi = back_merge ? &li.k : &ri.k; - - /* l & r should be in last bset: */ - EBUG_ON(bch_bkey_to_bset(b, m) != t); - - switch (bch_extent_merge(c, b, &li.k, &ri.k)) { - case BCH_MERGE_NOMERGE: - return false; - case BCH_MERGE_PARTIAL: - if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &mi->k, f)) - return false; - - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) - return false; - - extent_i_save(b, m, mi); - bch_bset_fix_invalidated_key(b, t, m); - - /* - * Update iterator to reflect what we just inserted - otherwise, - * the iter_fix() call is going to put us _before_ the key we - * just partially merged with: - */ - if (back_merge) - bch_btree_iter_set_pos_same_leaf(iter, li.k.k.p); - - bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter, - t, m, m->u64s, m->u64s); - - if (!back_merge) - bkey_copy(packed_to_bkey(l), &li.k); - else - bkey_copy(packed_to_bkey(r), &ri.k); - return false; - case BCH_MERGE_MERGE: - if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &li.k.k, f)) - return false; - - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) - return false; - - extent_i_save(b, m, &li.k); - bch_bset_fix_invalidated_key(b, t, m); - - bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter, - t, m, m->u64s, m->u64s); - return true; - default: - BUG(); - } -} - -const struct bkey_ops bch_bkey_extent_ops = { - .key_invalid = bch_extent_invalid, - .key_debugcheck = bch_extent_debugcheck, - .val_to_text = bch_extent_to_text, - .swab = bch_ptr_swab, - .key_normalize = bch_ptr_normalize, - .key_merge = bch_extent_merge, - .is_extents = true, -}; diff --git a/libbcache/extents.h b/libbcache/extents.h deleted file mode 100644 index 1d63b79d..00000000 --- a/libbcache/extents.h +++ /dev/null @@ -1,587 +0,0 @@ -#ifndef _BCACHE_EXTENTS_H -#define _BCACHE_EXTENTS_H - -#include "bcache.h" -#include "bkey.h" - -#include <linux/bcache.h> - -struct btree_node_iter; -struct btree_insert; -struct btree_insert_entry; -struct extent_insert_hook; - -struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *, - struct btree *, - struct btree_node_iter *); -struct btree_nr_keys bch_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *, - struct btree *, - struct btree_node_iter *); - -extern const struct bkey_ops bch_bkey_btree_ops; -extern const struct bkey_ops bch_bkey_extent_ops; - -struct bch_fs; -struct journal_res; - -struct extent_pick_ptr { - struct bch_extent_crc128 crc; - struct bch_extent_ptr ptr; - struct bch_dev *ca; -}; - -struct extent_pick_ptr -bch_btree_pick_ptr(struct bch_fs *, const struct btree *); - -void bch_extent_pick_ptr_avoiding(struct bch_fs *, struct bkey_s_c, - struct bch_dev *, struct extent_pick_ptr *); - -static inline void -bch_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct extent_pick_ptr *ret) -{ - bch_extent_pick_ptr_avoiding(c, k, NULL, ret); -} - -enum 
extent_insert_hook_ret -bch_extent_cmpxchg(struct extent_insert_hook *, struct bpos, struct bpos, - struct bkey_s_c, const struct bkey_i *); - -enum btree_insert_ret -bch_insert_fixup_extent(struct btree_insert *, - struct btree_insert_entry *); - -bool bch_extent_normalize(struct bch_fs *, struct bkey_s); -void bch_extent_mark_replicas_cached(struct bch_fs *, - struct bkey_s_extent, unsigned); - -unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent); -unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c); - -static inline bool bkey_extent_is_data(const struct bkey *k) -{ - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_allocation(const struct bkey *k) -{ - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - case BCH_RESERVATION: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_cached(const struct bkey *k) -{ - return k->type == BCH_EXTENT_CACHED; -} - -static inline void bkey_extent_set_cached(struct bkey *k, bool cached) -{ - EBUG_ON(k->type != BCH_EXTENT && - k->type != BCH_EXTENT_CACHED); - - k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT; -} - -static inline unsigned -__extent_entry_type(const union bch_extent_entry *e) -{ - return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -} - -static inline enum bch_extent_entry_type -extent_entry_type(const union bch_extent_entry *e) -{ - int ret = __ffs(e->type); - - EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); - - return ret; -} - -static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -{ - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - return sizeof(struct bch_extent_crc32); - case BCH_EXTENT_ENTRY_crc64: - return sizeof(struct bch_extent_crc64); - case BCH_EXTENT_ENTRY_crc128: - return sizeof(struct bch_extent_crc128); - case BCH_EXTENT_ENTRY_ptr: - return sizeof(struct bch_extent_ptr); - default: - BUG(); - } -} - -static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -{ - return extent_entry_bytes(entry) / sizeof(u64); -} - -static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -{ - return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; -} - -static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -{ - return !extent_entry_is_ptr(e); -} - -union bch_extent_crc { - u8 type; - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; -}; - -/* downcast, preserves const */ -#define to_entry(_entry) \ -({ \ - BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *)); \ - \ - __builtin_choose_expr( \ - (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *)), \ - (const union bch_extent_entry *) (_entry), \ - (union bch_extent_entry *) (_entry)); \ -}) - -#define __entry_to_crc(_entry) \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const union bch_extent_crc *) (_entry), \ - (union bch_extent_crc *) (_entry)) - -#define entry_to_crc(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ - \ - __entry_to_crc(_entry); \ -}) - -#define entry_to_ptr(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ - \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const struct bch_extent_ptr *) (_entry), \ - (struct bch_extent_ptr *) (_entry)); \ 
-}) - -enum bch_extent_crc_type { - BCH_EXTENT_CRC_NONE, - BCH_EXTENT_CRC32, - BCH_EXTENT_CRC64, - BCH_EXTENT_CRC128, -}; - -static inline enum bch_extent_crc_type -__extent_crc_type(const union bch_extent_crc *crc) -{ - if (!crc) - return BCH_EXTENT_CRC_NONE; - - switch (extent_entry_type(to_entry(crc))) { - case BCH_EXTENT_ENTRY_crc32: - return BCH_EXTENT_CRC32; - case BCH_EXTENT_ENTRY_crc64: - return BCH_EXTENT_CRC64; - case BCH_EXTENT_ENTRY_crc128: - return BCH_EXTENT_CRC128; - default: - BUG(); - } -} - -#define extent_crc_type(_crc) \ -({ \ - BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \ - !type_is(_crc, struct bch_extent_crc64 *) && \ - !type_is(_crc, struct bch_extent_crc128 *) && \ - !type_is(_crc, union bch_extent_crc *)); \ - \ - type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \ - : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \ - : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \ - : __extent_crc_type((union bch_extent_crc *) _crc); \ -}) - -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) - -#define extent_entry_last(_e) \ - vstruct_idx((_e).v, bkey_val_u64s((_e).k)) - -/* Iterate over all entries: */ - -#define extent_for_each_entry_from(_e, _entry, _start) \ - for ((_entry) = _start; \ - (_entry) < extent_entry_last(_e); \ - (_entry) = extent_entry_next(_entry)) - -#define extent_for_each_entry(_e, _entry) \ - extent_for_each_entry_from(_e, _entry, (_e).v->start) - -/* Iterate over crcs only: */ - -#define extent_crc_next(_e, _p) \ -({ \ - typeof(&(_e).v->start[0]) _entry = _p; \ - \ - while ((_entry) < extent_entry_last(_e) && \ - !extent_entry_is_crc(_entry)) \ - (_entry) = extent_entry_next(_entry); \ - \ - entry_to_crc(_entry < extent_entry_last(_e) ? 
_entry : NULL); \ -}) - -#define extent_for_each_crc(_e, _crc) \ - for ((_crc) = extent_crc_next(_e, (_e).v->start); \ - (_crc); \ - (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) - -/* Iterate over pointers, with crcs: */ - -#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \ -({ \ - __label__ out; \ - typeof(&(_e).v->start[0]) _entry; \ - \ - extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ - if (extent_entry_is_crc(_entry)) { \ - (_crc) = entry_to_crc(_entry); \ - } else { \ - _ptr = entry_to_ptr(_entry); \ - if (_filter) \ - goto out; \ - } \ - \ - _ptr = NULL; \ -out: \ - _ptr; \ -}) - -#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \ - for ((_crc) = NULL, \ - (_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\ - (_ptr)++) - -#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ - extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true) - -/* Iterate over pointers only, and from a given position: */ - -#define extent_ptr_next_filter(_e, _ptr, _filter) \ -({ \ - typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \ - \ - extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \ -}) - -#define extent_ptr_next(_e, _ptr) \ - extent_ptr_next_filter(_e, _ptr, true) - -#define extent_for_each_ptr_filter(_e, _ptr, _filter) \ - for ((_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \ - (_ptr)++) - -#define extent_for_each_ptr(_e, _ptr) \ - extent_for_each_ptr_filter(_e, _ptr, true) - -#define extent_ptr_prev(_e, _ptr) \ -({ \ - typeof(&(_e).v->start->ptr) _p; \ - typeof(&(_e).v->start->ptr) _prev = NULL; \ - \ - extent_for_each_ptr(_e, _p) { \ - if (_p == (_ptr)) \ - break; \ - _prev = _p; \ - } \ - \ - _prev; \ -}) - -/* - * Use this when you'll be dropping pointers as you iterate. 
Quadratic, - * unfortunately: - */ -#define extent_for_each_ptr_backwards(_e, _ptr) \ - for ((_ptr) = extent_ptr_prev(_e, NULL); \ - (_ptr); \ - (_ptr) = extent_ptr_prev(_e, _ptr)) - -void bch_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned, - unsigned, unsigned, struct bch_csum, unsigned); - -static inline void __extent_entry_push(struct bkey_i_extent *e) -{ - union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); - - EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > - BKEY_EXTENT_VAL_U64s_MAX); - - e->k.u64s += extent_entry_u64s(entry); -} - -static inline void extent_ptr_append(struct bkey_i_extent *e, - struct bch_extent_ptr ptr) -{ - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - extent_entry_last(extent_i_to_s(e))->ptr = ptr; - __extent_entry_push(e); -} - -static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k, - const union bch_extent_crc *crc) -{ - EBUG_ON(!k->size); - - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return (struct bch_extent_crc128) { - ._compressed_size = k->size - 1, - ._uncompressed_size = k->size - 1, - }; - case BCH_EXTENT_CRC32: - return (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = crc->crc32._compressed_size, - ._uncompressed_size = crc->crc32._uncompressed_size, - .offset = crc->crc32.offset, - .csum_type = crc->crc32.csum_type, - .compression_type = crc->crc32.compression_type, - .csum.lo = crc->crc32.csum, - }; - case BCH_EXTENT_CRC64: - return (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = crc->crc64._compressed_size, - ._uncompressed_size = crc->crc64._uncompressed_size, - .offset = crc->crc64.offset, - .nonce = crc->crc64.nonce, - .csum_type = crc->crc64.csum_type, - .compression_type = crc->crc64.compression_type, - .csum.lo = crc->crc64.csum_lo, - .csum.hi = crc->crc64.csum_hi, - }; - case BCH_EXTENT_CRC128: - return crc->crc128; - default: - BUG(); - } -} - -#define crc_compressed_size(_k, _crc) \ -({ \ - unsigned _size = 0; \ - \ - switch (extent_crc_type(_crc)) { \ - case BCH_EXTENT_CRC_NONE: \ - _size = ((const struct bkey *) (_k))->size; \ - break; \ - case BCH_EXTENT_CRC32: \ - _size = ((struct bch_extent_crc32 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC64: \ - _size = ((struct bch_extent_crc64 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC128: \ - _size = ((struct bch_extent_crc128 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - } \ - _size; \ -}) - -#define crc_uncompressed_size(_k, _crc) \ -({ \ - unsigned _size = 0; \ - \ - switch (extent_crc_type(_crc)) { \ - case BCH_EXTENT_CRC_NONE: \ - _size = ((const struct bkey *) (_k))->size; \ - break; \ - case BCH_EXTENT_CRC32: \ - _size = ((struct bch_extent_crc32 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC64: \ - _size = ((struct bch_extent_crc64 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC128: \ - _size = ((struct bch_extent_crc128 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - } \ - _size; \ -}) - -static inline unsigned crc_offset(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return crc->crc32.offset; - case BCH_EXTENT_CRC64: - return crc->crc64.offset; - case BCH_EXTENT_CRC128: - return crc->crc128.offset; - default: - BUG(); - } -} - -static inline unsigned crc_nonce(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) 
{ - case BCH_EXTENT_CRC_NONE: - case BCH_EXTENT_CRC32: - return 0; - case BCH_EXTENT_CRC64: - return crc->crc64.nonce; - case BCH_EXTENT_CRC128: - return crc->crc128.nonce; - default: - BUG(); - } -} - -static inline unsigned crc_csum_type(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return crc->crc32.csum_type; - case BCH_EXTENT_CRC64: - return crc->crc64.csum_type; - case BCH_EXTENT_CRC128: - return crc->crc128.csum_type; - default: - BUG(); - } -} - -static inline unsigned crc_compression_type(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return crc->crc32.compression_type; - case BCH_EXTENT_CRC64: - return crc->crc64.compression_type; - case BCH_EXTENT_CRC128: - return crc->crc128.compression_type; - default: - BUG(); - } -} - -static inline struct bch_csum crc_csum(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return (struct bch_csum) { 0 }; - case BCH_EXTENT_CRC32: - return (struct bch_csum) { .lo = crc->crc32.csum }; - case BCH_EXTENT_CRC64: - return (struct bch_csum) { - .lo = crc->crc64.csum_lo, - .hi = crc->crc64.csum_hi, - }; - case BCH_EXTENT_CRC128: - return crc->crc128.csum; - default: - BUG(); - } -} - -static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k) -{ - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - unsigned ret = 0; - - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc_compression_type(crc) != BCH_COMPRESSION_NONE && - crc_compressed_size(e.k, crc) < k.k->size) - ret = max_t(unsigned, ret, - crc_compressed_size(e.k, crc)); - } - - return ret; -} - -static inline unsigned extent_current_nonce(struct bkey_s_c_extent e) -{ - const union bch_extent_crc *crc; - - extent_for_each_crc(e, crc) - if (bch_csum_type_is_encryption(crc_csum_type(crc))) - return crc_offset(crc) + crc_nonce(crc); - - return 0; -} - -void bch_extent_narrow_crcs(struct bkey_s_extent); -void bch_extent_drop_redundant_crcs(struct bkey_s_extent); - -/* Doesn't cleanup redundant crcs */ -static inline void __bch_extent_drop_ptr(struct bkey_s_extent e, - struct bch_extent_ptr *ptr) -{ - EBUG_ON(ptr < &e.v->start->ptr || - ptr >= &extent_entry_last(e)->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - memmove_u64s_down(ptr, ptr + 1, - (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); -} - -static inline void bch_extent_drop_ptr(struct bkey_s_extent e, - struct bch_extent_ptr *ptr) -{ - __bch_extent_drop_ptr(e, ptr); - bch_extent_drop_redundant_crcs(e); -} - -const struct bch_extent_ptr * -bch_extent_has_device(struct bkey_s_c_extent, unsigned); - -bool bch_cut_front(struct bpos, struct bkey_i *); -bool bch_cut_back(struct bpos, struct bkey *); -void bch_key_resize(struct bkey *, unsigned); - -#endif /* _BCACHE_EXTENTS_H */ diff --git a/libbcache/eytzinger.h b/libbcache/eytzinger.h deleted file mode 100644 index 13d54e5e..00000000 --- a/libbcache/eytzinger.h +++ /dev/null @@ -1,196 +0,0 @@ -#ifndef _EYTZINGER_H -#define _EYTZINGER_H - -#include <linux/bitops.h> -#include <linux/log2.h> - -#include "util.h" - -/* - * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array - * - * We used one based indexing, not zero 
based: with one based indexing, each - * level of the tree starts at a power of two - leading to better alignment - - * and it's what you want for implementing next/prev and to/from inorder. - * - * To/from inorder also uses 1 based indexing. - * - * Size parameter is treated as if we were using 0 based indexing, however: - * valid nodes, and inorder indices, are in the range [1..size) - */ - -static inline unsigned eytzinger_child(unsigned j, unsigned child) -{ - EBUG_ON(child > 1); - - return (j << 1) + child; -} - -static inline unsigned eytzinger_left_child(unsigned j) -{ - return eytzinger_child(j, 0); -} - -static inline unsigned eytzinger_right_child(unsigned j) -{ - return eytzinger_child(j, 1); -} - -static inline unsigned eytzinger_first(unsigned size) -{ - return rounddown_pow_of_two(size - 1); -} - -static inline unsigned eytzinger_last(unsigned size) -{ - return rounddown_pow_of_two(size) - 1; -} - -/* - * eytzinger_next() and eytzinger_prev() have the nice properties that - * - * eytzinger_next(0) == eytzinger_first()) - * eytzinger_prev(0) == eytzinger_last()) - * - * eytzinger_prev(eytzinger_first()) == 0 - * eytzinger_next(eytzinger_last()) == 0 - */ - -static inline unsigned eytzinger_next(unsigned j, unsigned size) -{ - EBUG_ON(j >= size); - - if (eytzinger_right_child(j) < size) { - j = eytzinger_right_child(j); - - j <<= __fls(size) - __fls(j); - j >>= j >= size; - } else { - j >>= ffz(j) + 1; - } - - return j; -} - -static inline unsigned eytzinger_prev(unsigned j, unsigned size) -{ - EBUG_ON(j >= size); - - if (eytzinger_left_child(j) < size) { - j = eytzinger_left_child(j); - - j <<= __fls(size) - __fls(j); - j -= 1; - j >>= j >= size; - } else { - j >>= __ffs(j) + 1; - } - - return j; -} - -static inline unsigned eytzinger_extra(unsigned size) -{ - return (size - rounddown_pow_of_two(size - 1)) << 1; -} - -static inline unsigned __eytzinger_to_inorder(unsigned j, unsigned size, - unsigned extra) -{ - unsigned b = __fls(j); - unsigned shift = __fls(size - 1) - b; - int s; - - EBUG_ON(!j || j >= size); - - j ^= 1U << b; - j <<= 1; - j |= 1; - j <<= shift; - - /* - * sign bit trick: - * - * if (j > extra) - * j -= (j - extra) >> 1; - */ - s = extra - j; - j += (s >> 1) & (s >> 31); - - return j; -} - -static inline unsigned __inorder_to_eytzinger(unsigned j, unsigned size, - unsigned extra) -{ - unsigned shift; - int s; - - EBUG_ON(!j || j >= size); - - /* - * sign bit trick: - * - * if (j > extra) - * j += j - extra; - */ - s = extra - j; - j -= s & (s >> 31); - - shift = __ffs(j); - - j >>= shift + 1; - j |= 1U << (__fls(size - 1) - shift); - - return j; -} - -static inline unsigned eytzinger_to_inorder(unsigned j, unsigned size) -{ - return __eytzinger_to_inorder(j, size, eytzinger_extra(size)); -} - -static inline unsigned inorder_to_eytzinger(unsigned j, unsigned size) -{ - return __inorder_to_eytzinger(j, size, eytzinger_extra(size)); -} - -#define eytzinger_for_each(_i, _size) \ - for ((_i) = eytzinger_first((_size)); \ - (_i) != 0; \ - (_i) = eytzinger_next((_i), (_size))) - -#if 0 -void eytzinger_test(void) -{ - unsigned i, j, size; - - for (size = 2; - size < 65536000; - size++) { - if (!(size % 4096)) - printk(KERN_INFO "tree size %u\n", size); - - assert(eytzinger_prev(0, size) == eytzinger_last(size)); - assert(eytzinger_next(0, size) == eytzinger_first(size)); - - assert(eytzinger_prev(eytzinger_first(size), size) == 0); - assert(eytzinger_next(eytzinger_last(size), size) == 0); - - eytzinger_for_each(j, size) { - assert(from_inorder(i, size) == j); - 
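		/*
		 * A worked example of the mapping these asserts exercise,
		 * assuming size == 8 (seven valid nodes, 1..7): the eytzinger
		 * traversal visits 4, 2, 5, 1, 6, 3, 7 - i.e. in inorder -
		 * so the two index spaces pair up as
		 *
		 *	eytzinger index:  1 2 3 4 5 6 7
		 *	inorder index:    4 2 6 1 3 5 7
		 *
		 * e.g. eytzinger_to_inorder(1, 8) == 4 and
		 * inorder_to_eytzinger(4, 8) == 1.
		 */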
assert(to_inorder(j, size) == i); - - if (j != eytzinger_last(size)) { - unsigned next = eytzinger_next(j, size); - - assert(eytzinger_prev(next, size) == j); - } - } - } - -} -#endif - -#endif /* _EYTZINGER_H */ diff --git a/libbcache/fifo.h b/libbcache/fifo.h deleted file mode 100644 index 2908ca23..00000000 --- a/libbcache/fifo.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _BCACHE_FIFO_H -#define _BCACHE_FIFO_H - -#define DECLARE_FIFO(type, name) \ - struct { \ - size_t front, back, size, mask; \ - type *data; \ - } name - -#define init_fifo(fifo, _size, _gfp) \ -({ \ - bool _ret = true; \ - gfp_t gfp_flags = (_gfp); \ - \ - if (gfp_flags & GFP_KERNEL) \ - gfp_flags |= __GFP_NOWARN; \ - \ - (fifo)->size = (_size); \ - (fifo)->front = (fifo)->back = 0; \ - (fifo)->data = NULL; \ - \ - if ((fifo)->size) { \ - size_t _allocated_size, _bytes; \ - \ - _allocated_size = roundup_pow_of_two((fifo)->size); \ - _bytes = _allocated_size * sizeof(*(fifo)->data); \ - \ - (fifo)->mask = _allocated_size - 1; \ - \ - if (_bytes < KMALLOC_MAX_SIZE) \ - (fifo)->data = kmalloc(_bytes, gfp_flags); \ - if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL)) \ - (fifo)->data = vmalloc(_bytes); \ - if ((!(fifo)->data)) \ - _ret = false; \ - } \ - _ret; \ -}) - -#define free_fifo(fifo) \ -do { \ - kvfree((fifo)->data); \ - (fifo)->data = NULL; \ -} while (0) - -#define fifo_swap(l, r) \ -do { \ - swap((l)->front, (r)->front); \ - swap((l)->back, (r)->back); \ - swap((l)->size, (r)->size); \ - swap((l)->mask, (r)->mask); \ - swap((l)->data, (r)->data); \ -} while (0) - -#define fifo_move(dest, src) \ -do { \ - typeof(*((dest)->data)) _t; \ - while (!fifo_full(dest) && \ - fifo_pop(src, _t)) \ - fifo_push(dest, _t); \ -} while (0) - -#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) - -#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) - -#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) - -#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) - -#define fifo_push_back(fifo, i) \ -({ \ - bool _r = !fifo_full((fifo)); \ - if (_r) \ - (fifo)->data[(fifo)->back++ & (fifo)->mask] = (i); \ - _r; \ -}) - -#define fifo_pop_front(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ - _r; \ -}) - -#define fifo_push_front(fifo, i) \ -({ \ - bool _r = !fifo_full((fifo)); \ - if (_r) \ - (fifo)->data[--(fifo)->front & (fifo)->mask] = (i); \ - _r; \ -}) - -#define fifo_pop_back(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \ - _r; \ -}) - -#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -#define fifo_peek(fifo) fifo_peek_front(fifo) - -#define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (_iter = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) - -#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (_iter = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) - -#endif /* _BCACHE_FIFO_H */ - diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c deleted file mode 100644 index 1f6a65ec..00000000 --- a/libbcache/fs-gc.c +++ /dev/null @@ 
-1,924 +0,0 @@ - -#include "bcache.h" -#include "btree_update.h" -#include "dirent.h" -#include "error.h" -#include "fs.h" -#include "fs-gc.h" -#include "inode.h" -#include "keylist.h" -#include "super.h" - -#include <linux/generic-radix-tree.h> - -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - -static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, - struct bkey_s_c_dirent dirent) -{ - struct qstr name; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - u64 dir_inum = dirent.k->p.inode; - int ret; - char *buf; - - name.len = bch_dirent_name_bytes(dirent); - buf = kmalloc(name.len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - memcpy(buf, dirent.v->d_name, name.len); - buf[name.len] = '\0'; - name.name = buf; - - /* Unlock iter so we don't deadlock, after copying name: */ - bch_btree_iter_unlock(iter); - - ret = bch_inode_find_by_inum(c, dir_inum, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch_hash_info_init(&dir_inode); - - ret = bch_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); -err: - kfree(buf); - return ret; -} - -static int reattach_inode(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - u64 inum) -{ - struct bch_hash_info lostfound_hash_info = - bch_hash_info_init(lostfound_inode); - struct bkey_inode_buf packed; - char name_buf[20]; - struct qstr name; - int ret; - - snprintf(name_buf, sizeof(name_buf), "%llu", inum); - name = (struct qstr) QSTR(name_buf); - - lostfound_inode->i_nlink++; - - bch_inode_pack(&packed, lostfound_inode); - - ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, 0); - if (ret) - return ret; - - return bch_dirent_create(c, lostfound_inode->inum, - &lostfound_hash_info, - DT_DIR, &name, inum, NULL, 0); -} - -struct inode_walker { - bool first_this_inode; - bool have_inode; - u64 cur_inum; - struct bch_inode_unpacked inode; -}; - -static struct inode_walker inode_walker_init(void) -{ - return (struct inode_walker) { - .cur_inum = -1, - .have_inode = false, - }; -} - -static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) -{ - w->first_this_inode = inum != w->cur_inum; - w->cur_inum = inum; - - if (w->first_this_inode) { - int ret = bch_inode_find_by_inum(c, inum, &w->inode); - - if (ret && ret != -ENOENT) - return ret; - - w->have_inode = !ret; - } - - return 0; -} - -/* - * Walk extents: verify that extents have a corresponding S_ISREG inode, and - * that i_size an i_sectors are consistent - */ -noinline_for_stack -static int check_extents(struct bch_fs *c) -{ - struct inode_walker w = inode_walker_init(); - struct btree_iter iter; - struct bkey_s_c k; - u64 i_sectors; - int ret = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(BCACHE_ROOT_INO, 0), k) { - if (k.k->type == KEY_TYPE_DISCARD) - continue; - - ret = walk_inode(c, &w, k.k->p.inode); - if (ret) - break; - - unfixable_fsck_err_on(!w.have_inode, c, - "extent type %u for missing inode %llu", - k.k->type, k.k->p.inode); - - unfixable_fsck_err_on(w.first_this_inode && w.have_inode && - w.inode.i_sectors != - (i_sectors = bch_count_inode_sectors(c, w.cur_inum)), - c, "i_sectors wrong: got %llu, should be %llu", - w.inode.i_sectors, i_sectors); - - unfixable_fsck_err_on(w.have_inode && - !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c, - "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.inode.i_mode); - - unfixable_fsck_err_on(k.k->type != BCH_RESERVATION && - k.k->p.offset > round_up(w.inode.i_size, 
PAGE_SIZE) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size); - } -fsck_err: - return bch_btree_iter_unlock(&iter) ?: ret; -} - -/* - * Walk dirents: verify that they all have a corresponding S_ISDIR inode, - * validate d_type - */ -noinline_for_stack -static int check_dirents(struct bch_fs *c) -{ - struct inode_walker w = inode_walker_init(); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(BCACHE_ROOT_INO, 0), k) { - struct bkey_s_c_dirent d; - struct bch_inode_unpacked target; - bool have_target; - u64 d_inum; - - ret = walk_inode(c, &w, k.k->p.inode); - if (ret) - break; - - unfixable_fsck_err_on(!w.have_inode, c, - "dirent in nonexisting directory %llu", - k.k->p.inode); - - unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c, - "dirent in non directory inode %llu, type %u", - k.k->p.inode, mode_to_type(w.inode.i_mode)); - - if (k.k->type != BCH_DIRENT) - continue; - - d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); - - if (fsck_err_on(d_inum == d.k->p.inode, c, - "dirent points to own directory")) { - ret = remove_dirent(c, &iter, d); - if (ret) - goto err; - continue; - } - - ret = bch_inode_find_by_inum(c, d_inum, &target); - if (ret && ret != -ENOENT) - break; - - have_target = !ret; - ret = 0; - - if (fsck_err_on(!have_target, c, - "dirent points to missing inode %llu, type %u filename %s", - d_inum, d.v->d_type, d.v->d_name)) { - ret = remove_dirent(c, &iter, d); - if (ret) - goto err; - continue; - } - - if (fsck_err_on(have_target && - d.v->d_type != - mode_to_type(le16_to_cpu(target.i_mode)), c, - "incorrect d_type: got %u should be %u, filename %s", - d.v->d_type, - mode_to_type(le16_to_cpu(target.i_mode)), - d.v->d_name)) { - struct bkey_i_dirent *n; - - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto err; - } - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(le16_to_cpu(target.i_mode)); - - ret = bch_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &n->k_i)); - kfree(n); - if (ret) - goto err; - - } - } -err: -fsck_err: - return bch_btree_iter_unlock(&iter) ?: ret; -} - -/* - * Walk xattrs: verify that they all have a corresponding inode - */ -noinline_for_stack -static int check_xattrs(struct bch_fs *c) -{ - struct inode_walker w = inode_walker_init(); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, - POS(BCACHE_ROOT_INO, 0), k) { - ret = walk_inode(c, &w, k.k->p.inode); - if (ret) - break; - - unfixable_fsck_err_on(!w.have_inode, c, - "xattr for missing inode %llu", - k.k->p.inode); - } -fsck_err: - return bch_btree_iter_unlock(&iter) ?: ret; -} - -/* Get root directory, create if it doesn't exist: */ -static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) -{ - struct bkey_inode_buf packed; - int ret; - - ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode); - if (ret && ret != -ENOENT) - return ret; - - if (fsck_err_on(ret, c, "root directory missing")) - goto create_root; - - if (fsck_err_on(!S_ISDIR(root_inode->i_mode), c, - "root inode not a directory")) - goto create_root; - - return 0; -fsck_err: - return ret; -create_root: - bch_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - root_inode->inum = BCACHE_ROOT_INO; - - bch_inode_pack(&packed, root_inode); - - return bch_btree_insert(c, BTREE_ID_INODES, 
&packed.inode.k_i, - NULL, NULL, NULL, 0); -} - -/* Get lost+found, create if it doesn't exist: */ -static int check_lostfound(struct bch_fs *c, - struct bch_inode_unpacked *root_inode, - struct bch_inode_unpacked *lostfound_inode) -{ - struct qstr lostfound = QSTR("lost+found"); - struct bch_hash_info root_hash_info = - bch_hash_info_init(root_inode); - struct bkey_inode_buf packed; - u64 inum; - int ret; - - inum = bch_dirent_lookup(c, BCACHE_ROOT_INO, &root_hash_info, - &lostfound); - if (!inum) { - bch_notice(c, "creating lost+found"); - goto create_lostfound; - } - - ret = bch_inode_find_by_inum(c, inum, lostfound_inode); - if (ret && ret != -ENOENT) - return ret; - - if (fsck_err_on(ret, c, "lost+found missing")) - goto create_lostfound; - - if (fsck_err_on(!S_ISDIR(lostfound_inode->i_mode), c, - "lost+found inode not a directory")) - goto create_lostfound; - - return 0; -fsck_err: - return ret; -create_lostfound: - root_inode->i_nlink++; - - bch_inode_pack(&packed, root_inode); - - ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, 0); - if (ret) - return ret; - - bch_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - bch_inode_pack(&packed, lostfound_inode); - - ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); - if (ret) - return ret; - - lostfound_inode->inum = packed.inode.k.p.inode; - - ret = bch_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode->inum, NULL, 0); - if (ret) - return ret; - - return 0; -} - -struct inode_bitmap { - unsigned long *bits; - size_t size; -}; - -static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) -{ - return nr < b->size ? test_bit(nr, b->bits) : false; -} - -static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) -{ - if (nr >= b->size) { - size_t new_size = max(max(PAGE_SIZE * 8, - b->size * 2), - nr + 1); - void *n; - - new_size = roundup_pow_of_two(new_size); - n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); - if (!n) - return -ENOMEM; - - b->bits = n; - b->size = new_size; - } - - __set_bit(nr, b->bits); - return 0; -} - -struct pathbuf { - size_t nr; - size_t size; - - struct pathbuf_entry { - u64 inum; - u64 offset; - } *entries; -}; - -static int path_down(struct pathbuf *p, u64 inum) -{ - if (p->nr == p->size) { - size_t new_size = max(256UL, p->size * 2); - void *n = krealloc(p->entries, - new_size * sizeof(p->entries[0]), - GFP_KERNEL); - if (!n) - return -ENOMEM; - - p->entries = n; - p->size = new_size; - }; - - p->entries[p->nr++] = (struct pathbuf_entry) { - .inum = inum, - .offset = 0, - }; - return 0; -} - -noinline_for_stack -static int check_directory_structure(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode) -{ - struct inode_bitmap dirs_done = { NULL, 0 }; - struct pathbuf path = { 0, 0, NULL }; - struct pathbuf_entry *e; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent dirent; - bool had_unreachable; - u64 d_inum; - int ret = 0; - - /* DFS: */ -restart_dfs: - ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO); - if (ret) - goto err; - - ret = path_down(&path, BCACHE_ROOT_INO); - if (ret) - return ret; - - while (path.nr) { -next: - e = &path.entries[path.nr - 1]; - - if (e->offset == U64_MAX) - goto up; - - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(e->inum, e->offset + 1), k) { - if (k.k->p.inode != e->inum) - break; - - e->offset = k.k->p.offset; - - if (k.k->type != BCH_DIRENT) - continue; - - 
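			/*
			 * Descending into a subdirectory (below) drops btree locks
			 * and restarts this loop at POS(e->inum, e->offset + 1), so
			 * saving k.k->p.offset above is what lets the scan of the
			 * parent directory resume where it left off.
			 */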
dirent = bkey_s_c_to_dirent(k); - - if (dirent.v->d_type != DT_DIR) - continue; - - d_inum = le64_to_cpu(dirent.v->d_inum); - - if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, - "directory with multiple hardlinks")) { - ret = remove_dirent(c, &iter, dirent); - if (ret) - goto err; - continue; - } - - ret = inode_bitmap_set(&dirs_done, d_inum); - if (ret) - goto err; - - ret = path_down(&path, d_inum); - if (ret) - goto err; - - bch_btree_iter_unlock(&iter); - goto next; - } - ret = bch_btree_iter_unlock(&iter); - if (ret) - goto err; -up: - path.nr--; - } - - had_unreachable = false; - - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { - if (k.k->type != BCH_INODE_FS || - !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode))) - continue; - - if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, - "unreachable directory found (inum %llu)", - k.k->p.inode)) { - bch_btree_iter_unlock(&iter); - - ret = reattach_inode(c, lostfound_inode, k.k->p.inode); - if (ret) - goto err; - - had_unreachable = true; - } - } - ret = bch_btree_iter_unlock(&iter); - if (ret) - goto err; - - if (had_unreachable) { - bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); - kfree(dirs_done.bits); - kfree(path.entries); - memset(&dirs_done, 0, sizeof(dirs_done)); - memset(&path, 0, sizeof(path)); - goto restart_dfs; - } - -out: - kfree(dirs_done.bits); - kfree(path.entries); - return ret; -err: -fsck_err: - ret = bch_btree_iter_unlock(&iter) ?: ret; - goto out; -} - -struct nlink { - u32 count; - u32 dir_count; -}; - -typedef GENRADIX(struct nlink) nlink_table; - -static void inc_link(struct bch_fs *c, nlink_table *links, - u64 range_start, u64 *range_end, - u64 inum, bool dir) -{ - struct nlink *link; - - if (inum < range_start || inum >= *range_end) - return; - - link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); - if (!link) { - bch_verbose(c, "allocation failed during fs gc - will need another pass"); - *range_end = inum; - return; - } - - if (dir) - link->dir_count++; - else - link->count++; -} - -noinline_for_stack -static int bch_gc_walk_dirents(struct bch_fs *c, nlink_table *links, - u64 range_start, u64 *range_end) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent d; - u64 d_inum; - int ret; - - inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false); - - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) { - switch (k.k->type) { - case BCH_DIRENT: - d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); - - if (d.v->d_type == DT_DIR) - inc_link(c, links, range_start, range_end, - d.k->p.inode, true); - - inc_link(c, links, range_start, range_end, - d_inum, false); - - break; - } - - bch_btree_iter_cond_resched(&iter); - } - ret = bch_btree_iter_unlock(&iter); - if (ret) - bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); - - return ret; -} - -s64 bch_count_inode_sectors(struct bch_fs *c, u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 sectors = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) { - if (k.k->p.inode != inum) - break; - - if (bkey_extent_is_allocation(k.k)) - sectors += k.k->size; - } - - return bch_btree_iter_unlock(&iter) ?: sectors; -} - -static int bch_gc_do_inode(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - struct btree_iter *iter, - struct bkey_s_c_inode inode, struct nlink link) -{ - struct bch_inode_unpacked u; - int ret = 0; - u32 i_nlink, real_i_nlink; - bool do_update = 
false; - - ret = bch_inode_unpack(inode, &u); - if (bch_fs_inconsistent_on(ret, c, - "error unpacking inode %llu in fs-gc", - inode.k->p.inode)) - return ret; - - i_nlink = u.i_nlink + nlink_bias(u.i_mode); - - fsck_err_on(i_nlink < link.count, c, - "inode %llu i_link too small (%u < %u, type %i)", - inode.k->p.inode, i_nlink, - link.count, mode_to_type(u.i_mode)); - - /* These should have been caught/fixed by earlier passes: */ - if (S_ISDIR(u.i_mode)) { - need_fsck_err_on(link.count > 1, c, - "directory %llu with multiple hardlinks: %u", - inode.k->p.inode, link.count); - - real_i_nlink = link.count * 2 + link.dir_count; - } else { - need_fsck_err_on(link.dir_count, c, - "found dirents for non directory %llu", - inode.k->p.inode); - - real_i_nlink = link.count + link.dir_count; - } - - if (!link.count) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but found orphaned inode %llu", - inode.k->p.inode); - - if (fsck_err_on(S_ISDIR(u.i_mode) && - bch_empty_dir(c, inode.k->p.inode), c, - "non empty directory with link count 0, " - "inode nlink %u, dir links found %u", - i_nlink, link.dir_count)) { - ret = reattach_inode(c, lostfound_inode, - inode.k->p.inode); - if (ret) - return ret; - } - - bch_verbose(c, "deleting inode %llu", inode.k->p.inode); - - ret = bch_inode_rm(c, inode.k->p.inode); - if (ret) - bch_err(c, "error in fs gc: error %i " - "while deleting inode", ret); - return ret; - } - - if (u.i_flags & BCH_INODE_I_SIZE_DIRTY) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_size dirty", - inode.k->p.inode); - - bch_verbose(c, "truncating inode %llu", inode.k->p.inode); - - /* - * XXX: need to truncate partial blocks too here - or ideally - * just switch units to bytes and that issue goes away - */ - - ret = bch_inode_truncate(c, inode.k->p.inode, - round_up(u.i_size, PAGE_SIZE) >> 9, - NULL, NULL); - if (ret) { - bch_err(c, "error in fs gc: error %i " - "truncating inode", ret); - return ret; - } - - /* - * We truncated without our normal sector accounting hook, just - * make sure we recalculate it: - */ - u.i_flags |= BCH_INODE_I_SECTORS_DIRTY; - - u.i_flags &= ~BCH_INODE_I_SIZE_DIRTY; - do_update = true; - } - - if (u.i_flags & BCH_INODE_I_SECTORS_DIRTY) { - s64 sectors; - - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_sectors dirty", - inode.k->p.inode); - - bch_verbose(c, "recounting sectors for inode %llu", - inode.k->p.inode); - - sectors = bch_count_inode_sectors(c, inode.k->p.inode); - if (sectors < 0) { - bch_err(c, "error in fs gc: error %i " - "recounting inode sectors", - (int) sectors); - return sectors; - } - - u.i_sectors = sectors; - u.i_flags &= ~BCH_INODE_I_SECTORS_DIRTY; - do_update = true; - } - - if (i_nlink != real_i_nlink) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has wrong i_nlink " - "(type %u i_nlink %u, should be %u)", - inode.k->p.inode, mode_to_type(u.i_mode), - i_nlink, real_i_nlink); - - bch_verbose(c, "setting inode %llu nlinks from %u to %u", - inode.k->p.inode, i_nlink, real_i_nlink); - u.i_nlink = real_i_nlink - nlink_bias(u.i_mode);; - do_update = true; - } - - if (do_update) { - struct bkey_inode_buf p; - - bch_inode_pack(&p, &u); - - ret = bch_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); - if (ret && ret != -EINTR) - bch_err(c, "error in fs gc: error %i " - "updating inode", ret); - } -fsck_err: - return ret; -} - -noinline_for_stack -static int 
bch_gc_walk_inodes(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - nlink_table *links, - u64 range_start, u64 range_end) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct nlink *link, zero_links = { 0, 0 }; - struct genradix_iter nlinks_iter; - int ret = 0, ret2 = 0; - u64 nlinks_pos; - - bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0)); - genradix_iter_init(&nlinks_iter); - - while ((k = bch_btree_iter_peek(&iter)).k && - !btree_iter_err(k)) { -peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); - - if (!link && (!k.k || iter.pos.inode >= range_end)) - break; - - nlinks_pos = range_start + nlinks_iter.pos; - if (iter.pos.inode > nlinks_pos) { - /* Should have been caught by dirents pass: */ - need_fsck_err_on(link && link->count, c, - "missing inode %llu (nlink %u)", - nlinks_pos, link->count); - genradix_iter_advance(&nlinks_iter, links); - goto peek_nlinks; - } - - if (iter.pos.inode < nlinks_pos || !link) - link = &zero_links; - - if (k.k && k.k->type == BCH_INODE_FS) { - /* - * Avoid potential deadlocks with iter for - * truncate/rm/etc.: - */ - bch_btree_iter_unlock(&iter); - - ret = bch_gc_do_inode(c, lostfound_inode, &iter, - bkey_s_c_to_inode(k), *link); - if (ret == -EINTR) - continue; - if (ret) - break; - - if (link->count) - atomic_long_inc(&c->nr_inodes); - } else { - /* Should have been caught by dirents pass: */ - need_fsck_err_on(link->count, c, - "missing inode %llu (nlink %u)", - nlinks_pos, link->count); - } - - if (nlinks_pos == iter.pos.inode) - genradix_iter_advance(&nlinks_iter, links); - - bch_btree_iter_advance_pos(&iter); - bch_btree_iter_cond_resched(&iter); - } -fsck_err: - ret2 = bch_btree_iter_unlock(&iter); - if (ret2) - bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2); - - return ret ?: ret2; -} - -noinline_for_stack -static int check_inode_nlinks(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode) -{ - nlink_table links; - u64 this_iter_range_start, next_iter_range_start = 0; - int ret = 0; - - genradix_init(&links); - - do { - this_iter_range_start = next_iter_range_start; - next_iter_range_start = U64_MAX; - - ret = bch_gc_walk_dirents(c, &links, - this_iter_range_start, - &next_iter_range_start); - if (ret) - break; - - ret = bch_gc_walk_inodes(c, lostfound_inode, &links, - this_iter_range_start, - next_iter_range_start); - if (ret) - break; - - genradix_free(&links); - } while (next_iter_range_start != U64_MAX); - - genradix_free(&links); - - return ret; -} - -/* - * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
- * Doesn't fix them yet, mainly because they haven't yet been observed: - */ -int bch_fsck(struct bch_fs *c, bool full_fsck) -{ - struct bch_inode_unpacked root_inode, lostfound_inode; - int ret; - - ret = check_root(c, &root_inode); - if (ret) - return ret; - - ret = check_lostfound(c, &root_inode, &lostfound_inode); - if (ret) - return ret; - - if (!full_fsck) - goto check_nlinks; - - ret = check_extents(c); - if (ret) - return ret; - - ret = check_dirents(c); - if (ret) - return ret; - - ret = check_xattrs(c); - if (ret) - return ret; - - ret = check_directory_structure(c, &lostfound_inode); - if (ret) - return ret; -check_nlinks: - ret = check_inode_nlinks(c, &lostfound_inode); - if (ret) - return ret; - - return 0; -} diff --git a/libbcache/fs-gc.h b/libbcache/fs-gc.h deleted file mode 100644 index ac86fd22..00000000 --- a/libbcache/fs-gc.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _BCACHE_FS_GC_H -#define _BCACHE_FS_GC_H - -s64 bch_count_inode_sectors(struct bch_fs *, u64); -int bch_fsck(struct bch_fs *, bool); - -#endif /* _BCACHE_FS_GC_H */ diff --git a/libbcache/fs-io.c b/libbcache/fs-io.c deleted file mode 100644 index afc8c208..00000000 --- a/libbcache/fs-io.c +++ /dev/null @@ -1,2496 +0,0 @@ - -#include "bcache.h" -#include "btree_update.h" -#include "buckets.h" -#include "clock.h" -#include "error.h" -#include "fs.h" -#include "fs-gc.h" -#include "fs-io.h" -#include "inode.h" -#include "journal.h" -#include "io.h" -#include "keylist.h" - -#include <linux/aio.h> -#include <linux/backing-dev.h> -#include <linux/falloc.h> -#include <linux/migrate.h> -#include <linux/mmu_context.h> -#include <linux/pagevec.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/uio.h> -#include <linux/writeback.h> -#include <trace/events/writeback.h> - -struct bio_set *bch_writepage_bioset; -struct bio_set *bch_dio_read_bioset; -struct bio_set *bch_dio_write_bioset; - -/* pagecache_block must be held */ -static int write_invalidate_inode_pages_range(struct address_space *mapping, - loff_t start, loff_t end) -{ - int ret; - - /* - * XXX: the way this is currently implemented, we can spin if a process - * is continually redirtying a specific page - */ - do { - if (!mapping->nrpages && - !mapping->nrexceptional) - return 0; - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - break; - - if (!mapping->nrpages) - return 0; - - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } while (ret == -EBUSY); - - return ret; -} - -/* i_size updates: */ - -static int inode_set_size(struct bch_inode_info *ei, - struct bch_inode_unpacked *bi, - void *p) -{ - loff_t *new_i_size = p; - - lockdep_assert_held(&ei->update_lock); - - bi->i_size = *new_i_size; - - if (atomic_long_read(&ei->i_size_dirty_count)) - bi->i_flags |= BCH_INODE_I_SIZE_DIRTY; - else - bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY; - - return 0; -} - -static int __must_check bch_write_inode_size(struct bch_fs *c, - struct bch_inode_info *ei, - loff_t new_size) -{ - return __bch_write_inode(c, ei, inode_set_size, &new_size); -} - -static inline void i_size_dirty_put(struct bch_inode_info *ei) -{ - atomic_long_dec_bug(&ei->i_size_dirty_count); -} - -static inline void i_size_dirty_get(struct bch_inode_info *ei) -{ - lockdep_assert_held(&ei->vfs_inode.i_rwsem); - - atomic_long_inc(&ei->i_size_dirty_count); -} - -/* i_sectors accounting: */ - -static enum extent_insert_hook_ret -i_sectors_hook_fn(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct 
bkey_s_c k, - const struct bkey_i *insert) -{ - struct i_sectors_hook *h = container_of(hook, - struct i_sectors_hook, hook); - s64 sectors = next_pos.offset - committed_pos.offset; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); - - EBUG_ON(!(h->ei->i_flags & BCH_INODE_I_SECTORS_DIRTY)); - EBUG_ON(!atomic_long_read(&h->ei->i_sectors_dirty_count)); - - h->sectors += sectors * sign; - - return BTREE_HOOK_DO_INSERT; -} - -static int inode_set_i_sectors_dirty(struct bch_inode_info *ei, - struct bch_inode_unpacked *bi, void *p) -{ - BUG_ON(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY); - - bi->i_flags |= BCH_INODE_I_SECTORS_DIRTY; - return 0; -} - -static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei, - struct bch_inode_unpacked *bi, - void *p) -{ - BUG_ON(!(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY)); - - bi->i_sectors = atomic64_read(&ei->i_sectors); - bi->i_flags &= ~BCH_INODE_I_SECTORS_DIRTY; - return 0; -} - -static void i_sectors_dirty_put(struct bch_inode_info *ei, - struct i_sectors_hook *h) -{ - struct inode *inode = &ei->vfs_inode; - - if (h->sectors) { - spin_lock(&inode->i_lock); - inode->i_blocks += h->sectors; - spin_unlock(&inode->i_lock); - - atomic64_add(h->sectors, &ei->i_sectors); - EBUG_ON(atomic64_read(&ei->i_sectors) < 0); - } - - EBUG_ON(atomic_long_read(&ei->i_sectors_dirty_count) <= 0); - - mutex_lock(&ei->update_lock); - - if (atomic_long_dec_and_test(&ei->i_sectors_dirty_count)) { - struct bch_fs *c = ei->vfs_inode.i_sb->s_fs_info; - int ret = __bch_write_inode(c, ei, inode_clear_i_sectors_dirty, NULL); - - ret = ret; - } - - mutex_unlock(&ei->update_lock); -} - -static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei, - struct i_sectors_hook *h) -{ - int ret = 0; - - h->hook.fn = i_sectors_hook_fn; - h->sectors = 0; -#ifdef CONFIG_BCACHE_DEBUG - h->ei = ei; -#endif - - if (atomic_long_inc_not_zero(&ei->i_sectors_dirty_count)) - return 0; - - mutex_lock(&ei->update_lock); - - if (!(ei->i_flags & BCH_INODE_I_SECTORS_DIRTY)) { - struct bch_fs *c = ei->vfs_inode.i_sb->s_fs_info; - - ret = __bch_write_inode(c, ei, inode_set_i_sectors_dirty, NULL); - } - - if (!ret) - atomic_long_inc(&ei->i_sectors_dirty_count); - - mutex_unlock(&ei->update_lock); - - return ret; -} - -struct bchfs_extent_trans_hook { - struct bchfs_write_op *op; - struct extent_insert_hook hook; - - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - - bool need_inode_update; -}; - -static enum extent_insert_hook_ret -bchfs_extent_update_hook(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *insert) -{ - struct bchfs_extent_trans_hook *h = container_of(hook, - struct bchfs_extent_trans_hook, hook); - struct bch_inode_info *ei = h->op->ei; - struct inode *inode = &ei->vfs_inode; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); - s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign; - u64 offset = min(next_pos.offset << 9, h->op->new_i_size); - bool do_pack = false; - - BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); - - /* XXX: ei->i_size locking */ - if (offset > ei->i_size) { - BUG_ON(ei->i_flags & BCH_INODE_I_SIZE_DIRTY); - - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_HOOK_RESTART_TRANS; - } - - h->inode_u.i_size = offset; - do_pack = true; - - ei->i_size = offset; - - if (h->op->is_dio) - i_size_write(inode, offset); - } - - if 
(sectors) { - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_HOOK_RESTART_TRANS; - } - - h->inode_u.i_sectors += sectors; - do_pack = true; - - atomic64_add(sectors, &ei->i_sectors); - - h->op->sectors_added += sectors; - - if (h->op->is_dio) { - spin_lock(&inode->i_lock); - inode->i_blocks += sectors; - spin_unlock(&inode->i_lock); - } - } - - if (do_pack) - bch_inode_pack(&h->inode_p, &h->inode_u); - - return BTREE_HOOK_DO_INSERT; -} - -static int bchfs_write_index_update(struct bch_write_op *wop) -{ - struct bchfs_write_op *op = container_of(wop, - struct bchfs_write_op, op); - struct keylist *keys = &op->op.insert_keys; - struct btree_iter extent_iter, inode_iter; - struct bchfs_extent_trans_hook hook; - struct bkey_i *k = bch_keylist_front(keys); - int ret; - - BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino); - - bch_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch_keylist_front(keys)->k)); - bch_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES, - POS(extent_iter.pos.inode, 0)); - - hook.op = op; - hook.hook.fn = bchfs_extent_update_hook; - hook.need_inode_update = false; - - do { - ret = bch_btree_iter_traverse(&extent_iter); - if (ret) - goto err; - - /* XXX: ei->i_size locking */ - k = bch_keylist_front(keys); - if (min(k->k.p.offset << 9, op->new_i_size) > op->ei->i_size) - hook.need_inode_update = true; - - if (hook.need_inode_update) { - struct bkey_s_c inode; - - if (!btree_iter_linked(&inode_iter)) - bch_btree_iter_link(&extent_iter, &inode_iter); - - inode = bch_btree_iter_peek_with_holes(&inode_iter); - if ((ret = btree_iter_err(inode))) - goto err; - - if (WARN_ONCE(inode.k->type != BCH_INODE_FS, - "inode %llu not found when updating", - extent_iter.pos.inode)) { - ret = -ENOENT; - break; - } - - if (WARN_ONCE(bkey_bytes(inode.k) > - sizeof(hook.inode_p), - "inode %llu too big (%zu bytes, buf %zu)", - extent_iter.pos.inode, - bkey_bytes(inode.k), - sizeof(hook.inode_p))) { - ret = -ENOENT; - break; - } - - bkey_reassemble(&hook.inode_p.inode.k_i, inode); - ret = bch_inode_unpack(bkey_s_c_to_inode(inode), - &hook.inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", - ret, extent_iter.pos.inode)) { - ret = -ENOENT; - break; - } - - ret = bch_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&extent_iter, k), - BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter, - &hook.inode_p.inode.k_i, 2)); - } else { - ret = bch_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&extent_iter, k)); - } -err: - if (ret == -EINTR) - continue; - if (ret) - break; - - bch_keylist_pop_front(keys); - } while (!bch_keylist_empty(keys)); - - bch_btree_iter_unlock(&extent_iter); - bch_btree_iter_unlock(&inode_iter); - - return ret; -} - -/* page state: */ - -/* stored in page->private: */ - -/* - * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could - * almost protected it with the page lock, except that bch_writepage_io_done has - * to update the sector counts (and from interrupt/bottom half context). 
- */ -struct bch_page_state { -union { struct { - /* - * page is _fully_ written on disk, and not compressed - which means to - * write this page we don't have to reserve space (the new write will - * never take up more space on disk than what it's overwriting) - */ - unsigned allocated:1; - - /* Owns PAGE_SECTORS sized reservation: */ - unsigned reserved:1; - unsigned nr_replicas:4; - - /* - * Number of sectors on disk - for i_blocks - * Uncompressed size, not compressed size: - */ - u8 sectors; - u8 dirty_sectors; -}; - /* for cmpxchg: */ - unsigned long v; -}; -}; - -#define page_state_cmpxchg(_ptr, _new, _expr) \ -({ \ - unsigned long _v = READ_ONCE((_ptr)->v); \ - struct bch_page_state _old; \ - \ - do { \ - _old.v = _new.v = _v; \ - _expr; \ - \ - EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\ - } while (_old.v != _new.v && \ - (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \ - \ - _old; \ -}) - -static inline struct bch_page_state *page_state(struct page *page) -{ - struct bch_page_state *s = (void *) &page->private; - - BUILD_BUG_ON(sizeof(*s) > sizeof(page->private)); - - if (!PagePrivate(page)) - SetPagePrivate(page); - - return s; -} - -static void bch_put_page_reservation(struct bch_fs *c, struct page *page) -{ - struct disk_reservation res = { .sectors = PAGE_SECTORS }; - struct bch_page_state s; - - s = page_state_cmpxchg(page_state(page), s, { - if (!s.reserved) - return; - s.reserved = 0; - }); - - bch_disk_reservation_put(c, &res); -} - -static int bch_get_page_reservation(struct bch_fs *c, struct page *page, - bool check_enospc) -{ - struct bch_page_state *s = page_state(page), new; - struct disk_reservation res; - int ret = 0; - - BUG_ON(s->allocated && s->sectors != PAGE_SECTORS); - - if (s->allocated || s->reserved) - return 0; - - ret = bch_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc - ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - return ret; - - page_state_cmpxchg(s, new, { - if (new.reserved) { - bch_disk_reservation_put(c, &res); - return 0; - } - new.reserved = 1; - new.nr_replicas = res.nr_replicas; - }); - - return 0; -} - -static void bch_clear_page_bits(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct disk_reservation res = { .sectors = PAGE_SECTORS }; - struct bch_page_state s; - - if (!PagePrivate(page)) - return; - - s = xchg(page_state(page), (struct bch_page_state) { .v = 0 }); - ClearPagePrivate(page); - - if (s.dirty_sectors) { - spin_lock(&inode->i_lock); - inode->i_blocks -= s.dirty_sectors; - spin_unlock(&inode->i_lock); - } - - if (s.reserved) - bch_disk_reservation_put(c, &res); -} - -int bch_set_page_dirty(struct page *page) -{ - struct bch_page_state old, new; - - old = page_state_cmpxchg(page_state(page), new, - new.dirty_sectors = PAGE_SECTORS - new.sectors; - ); - - if (old.dirty_sectors != new.dirty_sectors) { - struct inode *inode = page->mapping->host; - - spin_lock(&inode->i_lock); - inode->i_blocks += new.dirty_sectors - old.dirty_sectors; - spin_unlock(&inode->i_lock); - } - - return __set_page_dirty_nobuffers(page); -} - -/* readpages/writepages: */ - -static bool bio_can_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); - - return bio->bi_vcnt < bio->bi_max_vecs && - bio_end_sector(bio) == offset; -} - -static int bio_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); - - BUG_ON(!bio->bi_max_vecs); - - if (!bio->bi_vcnt) - bio->bi_iter.bi_sector = offset; - else if (!bio_can_add_page_contig(bio, page)) - return -1; - - bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { - .bv_page = page, - .bv_len = PAGE_SIZE, - .bv_offset = 0, - }; - - bio->bi_iter.bi_size += PAGE_SIZE; - - return 0; -} - -static void bch_readpages_end_io(struct bio *bio) -{ - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - - if (!bio->bi_error) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } - - bio_put(bio); -} - -static inline struct page *__readpage_next_page(struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - struct page *page; - int ret; - - while (*nr_pages) { - page = list_entry(pages->prev, struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - - ret = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS); - - /* if add_to_page_cache_lru() succeeded, page is locked: */ - put_page(page); - - if (!ret) - return page; - - (*nr_pages)--; - } - - return NULL; -} - -#define for_each_readpage_page(_mapping, _pages, _nr_pages, _page) \ - for (; \ - ((_page) = __readpage_next_page(_mapping, _pages, &(_nr_pages)));\ - (_nr_pages)--) - -static void bch_mark_pages_unalloc(struct bio *bio) -{ - struct bvec_iter iter; - struct bio_vec bv; - - bio_for_each_segment(bv, bio, iter) - page_state(bv.bv_page)->allocated = 0; -} - -static void bch_add_page_sectors(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct bio_vec bv; - - bio_for_each_segment(bv, bio, iter) { - struct bch_page_state *s = page_state(bv.bv_page); - - /* sectors in @k from the start of this page: */ - unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset); - - unsigned page_sectors = 
min(bv.bv_len >> 9, k_sectors); - - if (!s->sectors) - s->nr_replicas = bch_extent_nr_dirty_ptrs(k); - else - s->nr_replicas = min_t(unsigned, s->nr_replicas, - bch_extent_nr_dirty_ptrs(k)); - - BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); - s->sectors += page_sectors; - } -} - -static void bchfs_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) -{ - struct bio *bio = &rbio->bio; - struct btree_iter iter; - struct bkey_s_c k; - struct bio_vec *bv; - unsigned i; - int ret; - - bch_increment_clock(c, bio_sectors(bio), READ); - - /* - * Initialize page state: - * If a page is partly allocated and partly a hole, we want it to be - * marked BCH_PAGE_UNALLOCATED - so we initially mark all pages - * allocated and then mark them unallocated as we find holes: - * - * Note that the bio hasn't been split yet - it's the only bio that - * points to these pages. As we walk extents and split @bio, that - * necessarily be true, the splits won't necessarily be on page - * boundaries: - */ - bio_for_each_segment_all(bv, bio, i) { - struct bch_page_state *s = page_state(bv->bv_page); - - EBUG_ON(s->reserved); - - s->allocated = 1; - s->sectors = 0; - } - - for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bio->bi_iter.bi_sector), k) { - BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - unsigned bytes, sectors; - bool is_last; - - bkey_reassemble(&tmp.k, k); - bch_btree_iter_unlock(&iter); - k = bkey_i_to_s_c(&tmp.k); - - if (!bkey_extent_is_allocation(k.k) || - bkey_extent_is_compressed(k)) - bch_mark_pages_unalloc(bio); - - bch_extent_pick_ptr(c, k, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - bio_endio(bio); - return; - } - - sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) - - bio->bi_iter.bi_sector; - bytes = sectors << 9; - is_last = bytes == bio->bi_iter.bi_size; - swap(bio->bi_iter.bi_size, bytes); - - if (bkey_extent_is_allocation(k.k)) - bch_add_page_sectors(bio, k); - - if (pick.ca) { - PTR_BUCKET(pick.ca, &pick.ptr)->read_prio = - c->prio_clock[READ].hand; - - bch_read_extent(c, rbio, k, &pick, - BCH_READ_RETRY_IF_STALE| - BCH_READ_PROMOTE| - (is_last ? 
BCH_READ_IS_LAST : 0)); - } else { - zero_fill_bio_iter(bio, bio->bi_iter); - - if (is_last) - bio_endio(bio); - } - - if (is_last) - return; - - swap(bio->bi_iter.bi_size, bytes); - bio_advance(bio, bytes); - } - - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - ret = bch_btree_iter_unlock(&iter); - BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); - bio_endio(bio); -} - -int bch_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct bch_read_bio *rbio = NULL; - struct page *page; - - pr_debug("reading %u pages", nr_pages); - - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_get(&mapping->add_lock); - - for_each_readpage_page(mapping, pages, nr_pages, page) { -again: - if (!rbio) { - rbio = container_of(bio_alloc_bioset(GFP_NOFS, - min_t(unsigned, nr_pages, - BIO_MAX_PAGES), - &c->bio_read), - struct bch_read_bio, bio); - - rbio->bio.bi_end_io = bch_readpages_end_io; - } - - if (bio_add_page_contig(&rbio->bio, page)) { - bchfs_read(c, rbio, inode->i_ino); - rbio = NULL; - goto again; - } - } - - if (rbio) - bchfs_read(c, rbio, inode->i_ino); - - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_put(&mapping->add_lock); - - pr_debug("success"); - return 0; -} - -int bch_readpage(struct file *file, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct bch_read_bio *rbio; - - rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, - &c->bio_read), - struct bch_read_bio, bio); - bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); - rbio->bio.bi_end_io = bch_readpages_end_io; - - bio_add_page_contig(&rbio->bio, page); - bchfs_read(c, rbio, inode->i_ino); - - return 0; -} - -struct bch_writepage_state { - struct bch_writepage_io *io; -}; - -static void bch_writepage_io_free(struct closure *cl) -{ - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); - struct bio *bio = &io->bio.bio; - - bio_put(bio); -} - -static void bch_writepage_io_done(struct closure *cl) -{ - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); - struct bch_fs *c = io->op.op.c; - struct bio *bio = &io->bio.bio; - struct bio_vec *bvec; - unsigned i; - - atomic_sub(bio->bi_vcnt, &c->writeback_pages); - wake_up(&c->writeback_wait); - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (io->op.op.error) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - } - - if (io->op.op.written >= PAGE_SECTORS) { - struct bch_page_state old, new; - - old = page_state_cmpxchg(page_state(page), new, { - new.sectors = PAGE_SECTORS; - new.dirty_sectors = 0; - }); - - io->op.sectors_added -= old.dirty_sectors; - io->op.op.written -= PAGE_SECTORS; - } - } - - /* - * racing with fallocate can cause us to add fewer sectors than - * expected - but we shouldn't add more sectors than expected: - * - * (error (due to going RO) halfway through a page can screw that up - * slightly) - */ - BUG_ON(io->op.sectors_added >= (s64) PAGE_SECTORS); - - /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: - */ - if (io->op.sectors_added) { - struct inode *inode = &io->op.ei->vfs_inode; - - spin_lock(&inode->i_lock); - 
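bch_readpages() above grows a single read bio for as long as the pages it is handed are contiguous in the file: bio_can_add_page_contig() compares the page's starting sector, page->index << (PAGE_SHIFT - 9), with bio_end_sector(), and the accumulated bio is handed to bchfs_read() as soon as a page does not fit or the vec count runs out. A minimal userspace sketch of that batching decision follows; struct batch, batch_can_add() and batch_submit() are illustrative stand-ins, not the kernel structures.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SECTORS	(1U << (PAGE_SHIFT - 9))	/* 8 x 512-byte sectors */
#define MAX_BATCH	16				/* stand-in for bi_max_vecs */

struct batch {
	unsigned long long	start_sector;
	unsigned		nr_pages;
};

/* true if page @index's first sector immediately follows the current batch */
static int batch_can_add(const struct batch *b, unsigned long index)
{
	unsigned long long offset = (unsigned long long) index << (PAGE_SHIFT - 9);

	return b->nr_pages < MAX_BATCH &&
	       b->start_sector + b->nr_pages * PAGE_SECTORS == offset;
}

static void batch_submit(struct batch *b)
{
	if (b->nr_pages)
		printf("read %u pages at sector %llu\n",
		       b->nr_pages, b->start_sector);
	b->nr_pages = 0;
}

int main(void)
{
	unsigned long pages[] = { 3, 4, 5, 9, 10, 11, 12 };
	struct batch b = { 0, 0 };

	for (unsigned i = 0; i < sizeof(pages) / sizeof(pages[0]); i++) {
		if (!b.nr_pages || !batch_can_add(&b, pages[i])) {
			batch_submit(&b);
			b.start_sector = (unsigned long long) pages[i]
				<< (PAGE_SHIFT - 9);
		}
		b.nr_pages++;
	}
	batch_submit(&b);	/* prints two batches: 3 pages, then 4 pages */
	return 0;
}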
inode->i_blocks += io->op.sectors_added; - spin_unlock(&inode->i_lock); - } - - bio_for_each_segment_all(bvec, bio, i) - end_page_writeback(bvec->bv_page); - - closure_return_with_destructor(&io->cl, bch_writepage_io_free); -} - -static void bch_writepage_do_io(struct bch_writepage_state *w) -{ - struct bch_writepage_io *io = w->io; - - w->io = NULL; - atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages); - - io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector; - - closure_call(&io->op.op.cl, bch_write, NULL, &io->cl); - continue_at(&io->cl, bch_writepage_io_done, NULL); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch_writepage_io_alloc(struct bch_fs *c, - struct bch_writepage_state *w, - struct bch_inode_info *ei, - struct page *page) -{ - u64 inum = ei->vfs_inode.i_ino; - unsigned nr_replicas = page_state(page)->nr_replicas; - - EBUG_ON(!nr_replicas); - /* XXX: disk_reservation->gen isn't plumbed through */ - - if (!w->io) { -alloc_io: - w->io = container_of(bio_alloc_bioset(GFP_NOFS, - BIO_MAX_PAGES, - bch_writepage_bioset), - struct bch_writepage_io, bio.bio); - - closure_init(&w->io->cl, NULL); - w->io->op.ei = ei; - w->io->op.sectors_added = 0; - w->io->op.is_dio = false; - bch_write_op_init(&w->io->op.op, c, &w->io->bio, - (struct disk_reservation) { - .nr_replicas = c->opts.data_replicas, - }, - foreground_write_point(c, inum), - POS(inum, 0), - &ei->journal_seq, 0); - w->io->op.op.index_update_fn = bchfs_write_index_update; - } - - if (w->io->op.op.res.nr_replicas != nr_replicas || - bio_add_page_contig(&w->io->bio.bio, page)) { - bch_writepage_do_io(w); - goto alloc_io; - } - - /* - * We shouldn't ever be handed pages for multiple inodes in a single - * pass - right? - */ - BUG_ON(ei != w->io->op.ei); -} - -static int __bch_writepage(struct bch_fs *c, struct page *page, - struct writeback_control *wbc, - struct bch_writepage_state *w) -{ - struct inode *inode = page->mapping->host; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_page_state new, old; - unsigned offset; - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - - EBUG_ON(!PageUptodate(page)); - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - goto do_io; - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE - 1); - if (page->index > end_index || !offset) { - unlock_page(page); - return 0; - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." 
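The three checks above split writepage into the usual cases: a page wholly below i_size is written as is, a page wholly at or past i_size is skipped (a truncate is racing with writeback), and the single page that straddles i_size has its tail zeroed first because the page may be mmapped. A small sketch of that classification on plain integers, with illustrative names rather than struct page:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

enum page_action { WRITE_WHOLE, SKIP, ZERO_TAIL_THEN_WRITE };

/* if ZERO_TAIL_THEN_WRITE, *zero_from is the in-page offset to zero from */
static enum page_action classify(unsigned long index, unsigned long long i_size,
				 unsigned *zero_from)
{
	unsigned long end_index = i_size >> PAGE_SHIFT;
	unsigned offset = i_size & (PAGE_SIZE - 1);

	if (index < end_index)
		return WRITE_WHOLE;
	if (index > end_index || !offset)
		return SKIP;		/* fully past i_size */

	*zero_from = offset;		/* zero [offset, PAGE_SIZE) */
	return ZERO_TAIL_THEN_WRITE;
}

int main(void)
{
	unsigned zero_from;
	unsigned long long i_size = 3 * PAGE_SIZE + 100;

	printf("%d\n", classify(0, i_size, &zero_from)); /* WRITE_WHOLE */
	printf("%d\n", classify(3, i_size, &zero_from)); /* ZERO_TAIL_THEN_WRITE */
	printf("zero from %u\n", zero_from);		 /* 100 */
	printf("%d\n", classify(4, i_size, &zero_from)); /* SKIP */
	return 0;
}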
- */ - zero_user_segment(page, offset, PAGE_SIZE); -do_io: - bch_writepage_io_alloc(c, w, ei, page); - - /* while page is locked: */ - w->io->op.new_i_size = i_size; - - if (wbc->sync_mode == WB_SYNC_ALL) - w->io->bio.bio.bi_opf |= WRITE_SYNC; - - /* Before unlocking the page, transfer reservation to w->io: */ - old = page_state_cmpxchg(page_state(page), new, { - EBUG_ON(!new.reserved && - (new.sectors != PAGE_SECTORS || - !new.allocated)); - - if (new.allocated && - w->io->op.op.compression_type != BCH_COMPRESSION_NONE) - new.allocated = 0; - else if (!new.reserved) - goto out; - new.reserved = 0; - }); - - w->io->op.op.res.sectors += PAGE_SECTORS * - (old.reserved - new.reserved) * - old.nr_replicas; -out: - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - - return 0; -} - -int bch_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = { NULL }; - struct pagecache_iter iter; - struct page *page; - int ret = 0; - int done = 0; - pgoff_t uninitialized_var(writeback_index); - pgoff_t index; - pgoff_t end; /* Inclusive */ - pgoff_t done_index; - int cycled; - int range_whole = 0; - int tag; - - if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; - end = -1; - } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ - } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; -retry: - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - - done_index = index; -get_pages: - for_each_pagecache_tag(&iter, mapping, tag, index, end, page) { - done_index = page->index; - - if (w.io && - !bio_can_add_page_contig(&w.io->bio.bio, page)) - bch_writepage_do_io(&w); - - if (!w.io && - atomic_read(&c->writeback_pages) >= - c->writeback_pages_max) { - /* don't sleep with pages pinned: */ - pagecache_iter_release(&iter); - - __wait_event(c->writeback_wait, - atomic_read(&c->writeback_pages) < - c->writeback_pages_max); - goto get_pages; - } - - lock_page(page); - - /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data interity operation - * even if there is now a new, dirty page at the same - * pagecache address. - */ - if (unlikely(page->mapping != mapping)) { -continue_unlock: - unlock_page(page); - continue; - } - - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } - - if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; - } - - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; - - trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - ret = __bch_writepage(c, page, wbc, &w); - if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. 
it may - * not be suitable for data integrity - * writeout). - */ - done_index = page->index + 1; - done = 1; - break; - } - } - - /* - * We stop writing back only if we are not doing - * integrity sync. In case of integrity sync we have to - * keep going until we have written all the pages - * we tagged for writeback prior to entering this loop. - */ - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { - done = 1; - break; - } - } - pagecache_iter_release(&iter); - - if (w.io) - bch_writepage_do_io(&w); - - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = done_index; - - return ret; -} - -int bch_writepage(struct page *page, struct writeback_control *wbc) -{ - struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = { NULL }; - int ret; - - ret = __bch_writepage(c, page, wbc, &w); - if (w.io) - bch_writepage_do_io(&w); - - return ret; -} - -static void bch_read_single_page_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -static int bch_read_single_page(struct page *page, - struct address_space *mapping) -{ - struct inode *inode = mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct bch_read_bio *rbio; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - - rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, - &c->bio_read), - struct bch_read_bio, bio); - bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); - rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch_read_single_page_end_io; - bio_add_page_contig(&rbio->bio, page); - - bchfs_read(c, rbio, inode->i_ino); - wait_for_completion(&done); - - ret = rbio->bio.bi_error; - bio_put(&rbio->bio); - - if (ret < 0) - return ret; - - SetPageUptodate(page); - return 0; -} - -int bch_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct inode *inode = mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - pgoff_t index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); - struct page *page; - int ret = -ENOMEM; - - BUG_ON(inode_unhashed(mapping->host)); - - /* Not strictly necessary - same reason as mkwrite(): */ - pagecache_add_get(&mapping->add_lock); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - goto err_unlock; - - if (PageUptodate(page)) - goto out; - - /* If we're writing entire page, don't need to read it in first: */ - if (len == PAGE_SIZE) - goto out; - - if (!offset && pos + len >= inode->i_size) { - zero_user_segment(page, len, PAGE_SIZE); - flush_dcache_page(page); - goto out; - } - - if (index > inode->i_size >> PAGE_SHIFT) { - zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); - flush_dcache_page(page); - goto out; - } -readpage: - ret = bch_read_single_page(page, mapping); - if (ret) - goto err; -out: - ret = bch_get_page_reservation(c, page, true); - if (ret) { - if (!PageUptodate(page)) { - /* - * If the page hasn't been read in, we won't know if we - * actually need a reservation - we don't actually need - * to read here, we just need to check if the page is - * fully backed by uncompressed data: - */ - goto readpage; - } - - goto err; - } - - *pagep = page; - return 0; -err: - unlock_page(page); - put_page(page); - *pagep = 
NULL; -err_unlock: - pagecache_add_put(&mapping->add_lock); - return ret; -} - -int bch_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - - lockdep_assert_held(&inode->i_rwsem); - - if (unlikely(copied < len && !PageUptodate(page))) { - /* - * The page needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - */ - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); - copied = 0; - } - - if (pos + copied > inode->i_size) - i_size_write(inode, pos + copied); - - if (copied) { - if (!PageUptodate(page)) - SetPageUptodate(page); - if (!PageDirty(page)) - set_page_dirty(page); - } else { - bch_put_page_reservation(c, page); - } - - unlock_page(page); - put_page(page); - pagecache_add_put(&mapping->add_lock); - - return copied; -} - -/* O_DIRECT */ - -static void bch_dio_read_complete(struct closure *cl) -{ - struct dio_read *dio = container_of(cl, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret, 0); - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ -} - -static void bch_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - - if (bio->bi_error) - dio->ret = bio->bi_error; - - closure_put(&dio->cl); -} - -static void bch_direct_IO_read_split_endio(struct bio *bio) -{ - bch_direct_IO_read_endio(bio); - bio_check_pages_dirty(bio); /* transfers ownership */ -} - -static int bch_direct_IO_read(struct bch_fs *c, struct kiocb *req, - struct file *file, struct inode *inode, - struct iov_iter *iter, loff_t offset) -{ - struct dio_read *dio; - struct bio *bio; - bool sync = is_sync_kiocb(req); - ssize_t ret; - - if ((offset|iter->count) & (block_bytes(c) - 1)) - return -EINVAL; - - ret = min_t(loff_t, iter->count, - max_t(loff_t, 0, i_size_read(inode) - offset)); - iov_iter_truncate(iter, round_up(ret, block_bytes(c))); - - if (!ret) - return ret; - - bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), - bch_dio_read_bioset); - - bio->bi_end_io = bch_direct_IO_read_endio; - - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - } - - dio->req = req; - dio->ret = ret; - - goto start; - while (iter->count) { - bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), - &c->bio_read); - bio->bi_end_io = bch_direct_IO_read_split_endio; -start: - bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_private = dio; - - ret = bio_get_user_pages(bio, iter, 1); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_error = ret; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - bch_read(c, container_of(bio, - struct bch_read_bio, bio), - inode->i_ino); - } - - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ - 
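The O_DIRECT read path above splits one request into several bios, each capped at BIO_MAX_PAGES, and uses a single closure as a shared reference count: each additional bio takes a reference, each completion drops one, and the request-level completion runs only when the last bio finishes (the synchronous case simply waits for the count to drain). A minimal sketch of that pattern using C11 atomics instead of closures; the names and the printf stand in for the real completion work.

#include <stdatomic.h>
#include <stdio.h>

struct dio {
	atomic_int	remaining;	/* one reference per in-flight sub-I/O */
	long		ret;
	void		(*complete)(struct dio *);
};

static void dio_put(struct dio *dio)
{
	if (atomic_fetch_sub(&dio->remaining, 1) == 1)
		dio->complete(dio);	/* last sub-I/O just finished */
}

/* would be called from each sub-I/O's completion handler */
static void sub_io_endio(struct dio *dio, int error)
{
	if (error)
		dio->ret = error;
	dio_put(dio);
}

static void dio_complete(struct dio *dio)
{
	printf("request done, ret %ld\n", dio->ret);
}

int main(void)
{
	struct dio dio = { .ret = 4096, .complete = dio_complete };
	int nr_bios = 3;

	atomic_store(&dio.remaining, nr_bios);
	for (int i = 0; i < nr_bios; i++)
		sub_io_endio(&dio, 0);		/* pretend all succeed */
	return 0;
}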
return ret; - } else { - return -EIOCBQUEUED; - } -} - -static long __bch_dio_write_complete(struct dio_write *dio) -{ - struct file *file = dio->req->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_inode; - long ret = dio->error ?: dio->written; - - bch_disk_reservation_put(dio->c, &dio->res); - - __pagecache_block_put(&mapping->add_lock); - inode_dio_end(inode); - - if (dio->iovec && dio->iovec != dio->inline_vecs) - kfree(dio->iovec); - - bio_put(&dio->bio.bio); - return ret; -} - -static void bch_dio_write_complete(struct closure *cl) -{ - struct dio_write *dio = container_of(cl, struct dio_write, cl); - struct kiocb *req = dio->req; - - req->ki_complete(req, __bch_dio_write_complete(dio), 0); -} - -static void bch_dio_write_done(struct dio_write *dio) -{ - struct bio_vec *bv; - int i; - - dio->written += dio->iop.op.written << 9; - - if (dio->iop.op.error) - dio->error = dio->iop.op.error; - - bio_for_each_segment_all(bv, &dio->bio.bio, i) - put_page(bv->bv_page); - - if (dio->iter.count) - bio_reset(&dio->bio.bio); -} - -static void bch_do_direct_IO_write(struct dio_write *dio) -{ - struct file *file = dio->req->ki_filp; - struct inode *inode = file->f_inode; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bio *bio = &dio->bio.bio; - unsigned flags = 0; - int ret; - - if ((dio->req->ki_flags & IOCB_DSYNC) && - !dio->c->opts.journal_flush_disabled) - flags |= BCH_WRITE_FLUSH; - - bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9; - - ret = bio_get_user_pages(bio, &dio->iter, 0); - if (ret < 0) { - /* - * these didn't get initialized, but bch_dio_write_done() will - * look at them: - */ - dio->iop.op.error = 0; - dio->iop.op.written = 0; - dio->error = ret; - return; - } - - dio->iop.ei = ei; - dio->iop.sectors_added = 0; - dio->iop.is_dio = true; - dio->iop.new_i_size = U64_MAX; - bch_write_op_init(&dio->iop.op, dio->c, &dio->bio, - dio->res, - foreground_write_point(dio->c, inode->i_ino), - POS(inode->i_ino, bio->bi_iter.bi_sector), - &ei->journal_seq, flags); - dio->iop.op.index_update_fn = bchfs_write_index_update; - - dio->res.sectors -= bio_sectors(bio); - dio->iop.op.res.sectors = bio_sectors(bio); - - task_io_account_write(bio->bi_iter.bi_size); - - closure_call(&dio->iop.op.cl, bch_write, NULL, &dio->cl); -} - -static void bch_dio_write_loop_async(struct closure *cl) -{ - struct dio_write *dio = - container_of(cl, struct dio_write, cl); - struct address_space *mapping = dio->req->ki_filp->f_mapping; - - bch_dio_write_done(dio); - - if (dio->iter.count && !dio->error) { - use_mm(dio->mm); - pagecache_block_get(&mapping->add_lock); - - bch_do_direct_IO_write(dio); - - pagecache_block_put(&mapping->add_lock); - unuse_mm(dio->mm); - - continue_at(&dio->cl, bch_dio_write_loop_async, NULL); - } else { -#if 0 - closure_return_with_destructor(cl, bch_dio_write_complete); -#else - closure_debug_destroy(cl); - bch_dio_write_complete(cl); -#endif - } -} - -static int bch_direct_IO_write(struct bch_fs *c, struct kiocb *req, - struct file *file, struct inode *inode, - struct iov_iter *iter, loff_t offset) -{ - struct address_space *mapping = file->f_mapping; - struct dio_write *dio; - struct bio *bio; - ssize_t ret; - bool sync = is_sync_kiocb(req); - - lockdep_assert_held(&inode->i_rwsem); - - if (unlikely(!iter->count)) - return 0; - - if (unlikely((offset|iter->count) & (block_bytes(c) - 1))) - return -EINVAL; - - bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), - bch_dio_write_bioset); - dio = 
container_of(bio, struct dio_write, bio.bio); - dio->req = req; - dio->c = c; - dio->written = 0; - dio->error = 0; - dio->offset = offset; - dio->iovec = NULL; - dio->iter = *iter; - dio->mm = current->mm; - closure_init(&dio->cl, NULL); - - if (offset + iter->count > inode->i_size) - sync = true; - - /* - * XXX: we shouldn't return -ENOSPC if we're overwriting existing data - - * if getting a reservation fails we should check if we are doing an - * overwrite. - * - * Have to then guard against racing with truncate (deleting data that - * we would have been overwriting) - */ - ret = bch_disk_reservation_get(c, &dio->res, iter->count >> 9, 0); - if (unlikely(ret)) { - closure_debug_destroy(&dio->cl); - bio_put(bio); - return ret; - } - - inode_dio_begin(inode); - __pagecache_block_get(&mapping->add_lock); - - if (sync) { - do { - bch_do_direct_IO_write(dio); - - closure_sync(&dio->cl); - bch_dio_write_done(dio); - } while (dio->iter.count && !dio->error); - - closure_debug_destroy(&dio->cl); - return __bch_dio_write_complete(dio); - } else { - bch_do_direct_IO_write(dio); - - if (dio->iter.count && !dio->error) { - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - dio->iovec = kmalloc(dio->iter.nr_segs * - sizeof(struct iovec), - GFP_KERNEL); - if (!dio->iovec) - dio->error = -ENOMEM; - } else { - dio->iovec = dio->inline_vecs; - } - - memcpy(dio->iovec, - dio->iter.iov, - dio->iter.nr_segs * sizeof(struct iovec)); - dio->iter.iov = dio->iovec; - } - - continue_at_noreturn(&dio->cl, bch_dio_write_loop_async, NULL); - return -EIOCBQUEUED; - } -} - -ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct inode *inode = file->f_inode; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct blk_plug plug; - ssize_t ret; - - blk_start_plug(&plug); - ret = ((iov_iter_rw(iter) == WRITE) - ? bch_direct_IO_write - : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos); - blk_finish_plug(&plug); - - return ret; -} - -static ssize_t -bch_direct_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_inode; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct address_space *mapping = file->f_mapping; - loff_t pos = iocb->ki_pos; - ssize_t ret; - - pagecache_block_get(&mapping->add_lock); - - /* Write and invalidate pagecache range that we're writing to: */ - ret = write_invalidate_inode_pages_range(file->f_mapping, pos, - pos + iov_iter_count(iter) - 1); - if (unlikely(ret)) - goto err; - - ret = bch_direct_IO_write(c, iocb, file, inode, iter, pos); -err: - pagecache_block_put(&mapping->add_lock); - - return ret; -} - -static ssize_t __bch_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - ret = file_remove_privs(file); - if (ret) - goto out; - - ret = file_update_time(file); - if (ret) - goto out; - - ret = iocb->ki_flags & IOCB_DIRECT - ? 
bch_direct_write(iocb, from) - : generic_perform_write(file, from, iocb->ki_pos); - - if (likely(ret > 0)) - iocb->ki_pos += ret; -out: - current->backing_dev_info = NULL; - return ret; -} - -ssize_t bch_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - bool direct = iocb->ki_flags & IOCB_DIRECT; - ssize_t ret; - - inode_lock(inode); - ret = generic_write_checks(iocb, from); - if (ret > 0) - ret = __bch_write_iter(iocb, from); - inode_unlock(inode); - - if (ret > 0 && !direct) - ret = generic_write_sync(iocb, ret); - - return ret; -} - -int bch_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); - struct address_space *mapping = inode->i_mapping; - struct bch_fs *c = inode->i_sb->s_fs_info; - int ret = VM_FAULT_LOCKED; - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - - /* - * Not strictly necessary, but helps avoid dio writes livelocking in - * write_invalidate_inode_pages_range() - can drop this if/when we get - * a write_invalidate_inode_pages_range() that works without dropping - * page lock before invalidating page - */ - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_get(&mapping->add_lock); - - lock_page(page); - if (page->mapping != mapping || - page_offset(page) > i_size_read(inode)) { - unlock_page(page); - ret = VM_FAULT_NOPAGE; - goto out; - } - - if (bch_get_page_reservation(c, page, true)) { - unlock_page(page); - ret = VM_FAULT_SIGBUS; - goto out; - } - - if (!PageDirty(page)) - set_page_dirty(page); - wait_for_stable_page(page); -out: - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_put(&mapping->add_lock); - sb_end_pagefault(inode->i_sb); - return ret; -} - -void bch_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); - - if (offset || length < PAGE_SIZE) - return; - - bch_clear_page_bits(page); -} - -int bch_releasepage(struct page *page, gfp_t gfp_mask) -{ - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); - - if (PageDirty(page)) - return 0; - - bch_clear_page_bits(page); - return 1; -} - -#ifdef CONFIG_MIGRATION -int bch_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - int ret; - - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); - if (ret != MIGRATEPAGE_SUCCESS) - return ret; - - if (PagePrivate(page)) { - *page_state(newpage) = *page_state(page); - ClearPagePrivate(page); - } - - migrate_page_copy(newpage, page); - return MIGRATEPAGE_SUCCESS; -} -#endif - -int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = file->f_mapping->host; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_fs *c = inode->i_sb->s_fs_info; - int ret; - - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - - if (c->opts.journal_flush_disabled) - return 0; - - return bch_journal_flush_seq(&c->journal, ei->journal_seq); -} - -static int __bch_truncate_page(struct address_space *mapping, - pgoff_t index, loff_t start, loff_t end) -{ - struct inode *inode = mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - unsigned start_offset = start & (PAGE_SIZE - 1); - unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; - struct page *page; - int ret = 0; - - /* 
Page boundary? Nothing to do */ - if (!((index == start >> PAGE_SHIFT && start_offset) || - (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) - return 0; - - /* Above i_size? */ - if (index << PAGE_SHIFT >= inode->i_size) - return 0; - - page = find_lock_page(mapping, index); - if (!page) { - struct btree_iter iter; - struct bkey_s_c k = bkey_s_c_null; - - /* - * XXX: we're doing two index lookups when we end up reading the - * page - */ - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, - index << (PAGE_SHIFT - 9)), k) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(inode->i_ino, - (index + 1) << (PAGE_SHIFT - 9))) >= 0) - break; - - if (k.k->type != KEY_TYPE_DISCARD && - k.k->type != BCH_RESERVATION) { - bch_btree_iter_unlock(&iter); - goto create; - } - } - bch_btree_iter_unlock(&iter); - return 0; -create: - page = find_or_create_page(mapping, index, GFP_KERNEL); - if (unlikely(!page)) { - ret = -ENOMEM; - goto out; - } - } - - if (!PageUptodate(page)) { - ret = bch_read_single_page(page, mapping); - if (ret) - goto unlock; - } - - /* - * Bit of a hack - we don't want truncate to fail due to -ENOSPC. - * - * XXX: because we aren't currently tracking whether the page has actual - * data in it (vs. just 0s, or only partially written) this wrong. ick. - */ - ret = bch_get_page_reservation(c, page, false); - BUG_ON(ret); - - if (index == start >> PAGE_SHIFT && - index == end >> PAGE_SHIFT) - zero_user_segment(page, start_offset, end_offset); - else if (index == start >> PAGE_SHIFT) - zero_user_segment(page, start_offset, PAGE_SIZE); - else if (index == end >> PAGE_SHIFT) - zero_user_segment(page, 0, end_offset); - - if (!PageDirty(page)) - set_page_dirty(page); -unlock: - unlock_page(page); - put_page(page); -out: - return ret; -} - -static int bch_truncate_page(struct address_space *mapping, loff_t from) -{ - return __bch_truncate_page(mapping, from >> PAGE_SHIFT, - from, from + PAGE_SIZE); -} - -int bch_truncate(struct inode *inode, struct iattr *iattr) -{ - struct address_space *mapping = inode->i_mapping; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_fs *c = inode->i_sb->s_fs_info; - bool shrink = iattr->ia_size <= inode->i_size; - int ret = 0; - - inode_dio_wait(inode); - pagecache_block_get(&mapping->add_lock); - - truncate_setsize(inode, iattr->ia_size); - - /* sync appends.. */ - /* XXX what protects ei->i_size? 
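__bch_truncate_page() above only touches the one or two pages that contain the boundaries of the byte range; whole pages inside the range are left to the extent-level discard, and the boundary pages have just the affected bytes zeroed. A sketch of the offset arithmetic, assuming for brevity a plain function instead of the page cache lookup:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/*
 * Returns 1 and sets [*from, *to) if page @index needs partial zeroing for
 * the byte range [start, end), 0 if the page is untouched or fully covered.
 */
static int partial_zero_range(unsigned long index,
			      unsigned long long start, unsigned long long end,
			      unsigned *from, unsigned *to)
{
	unsigned start_offset = start & (PAGE_SIZE - 1);
	unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;

	if (!((index == start >> PAGE_SHIFT && start_offset) ||
	      (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
		return 0;

	*from = index == start >> PAGE_SHIFT ? start_offset : 0;
	*to   = index == end   >> PAGE_SHIFT ? end_offset   : PAGE_SIZE;
	return 1;
}

int main(void)
{
	unsigned from, to;

	/* punch [1000, 10000): page 0 zeroes [1000, 4096), page 2 zeroes [0, 1808) */
	if (partial_zero_range(0, 1000, 10000, &from, &to))
		printf("page 0: zero [%u, %u)\n", from, to);
	if (!partial_zero_range(1, 1000, 10000, &from, &to))
		printf("page 1: whole page, no partial zeroing\n");
	if (partial_zero_range(2, 1000, 10000, &from, &to))
		printf("page 2: zero [%u, %u)\n", from, to);
	return 0;
}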
*/ - if (iattr->ia_size > ei->i_size) - ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX); - if (ret) - goto err_put_pagecache; - - mutex_lock(&ei->update_lock); - i_size_dirty_get(ei); - ret = bch_write_inode_size(c, ei, inode->i_size); - mutex_unlock(&ei->update_lock); - - if (unlikely(ret)) - goto err; - - /* - * There might be persistent reservations (from fallocate()) - * above i_size, which bch_inode_truncate() will discard - we're - * only supposed to discard them if we're doing a real truncate - * here (new i_size < current i_size): - */ - if (shrink) { - struct i_sectors_hook i_sectors_hook; - int ret; - - ret = i_sectors_dirty_get(ei, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - ret = bch_truncate_page(inode->i_mapping, iattr->ia_size); - if (unlikely(ret)) { - i_sectors_dirty_put(ei, &i_sectors_hook); - goto err; - } - - ret = bch_inode_truncate(c, inode->i_ino, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &ei->journal_seq); - - i_sectors_dirty_put(ei, &i_sectors_hook); - - if (unlikely(ret)) - goto err; - } - - mutex_lock(&ei->update_lock); - setattr_copy(inode, iattr); - inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); - - /* clear I_SIZE_DIRTY: */ - i_size_dirty_put(ei); - ret = bch_write_inode_size(c, ei, inode->i_size); - mutex_unlock(&ei->update_lock); - - pagecache_block_put(&mapping->add_lock); - - return 0; -err: - i_size_dirty_put(ei); -err_put_pagecache: - pagecache_block_put(&mapping->add_lock); - return ret; -} - -static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len) -{ - struct address_space *mapping = inode->i_mapping; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_fs *c = inode->i_sb->s_fs_info; - u64 ino = inode->i_ino; - u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; - u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; - int ret = 0; - - inode_lock(inode); - inode_dio_wait(inode); - pagecache_block_get(&mapping->add_lock); - - ret = __bch_truncate_page(inode->i_mapping, - offset >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) - goto out; - - if (offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) { - ret = __bch_truncate_page(inode->i_mapping, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) - goto out; - } - - truncate_pagecache_range(inode, offset, offset + len - 1); - - if (discard_start < discard_end) { - struct disk_reservation disk_res; - struct i_sectors_hook i_sectors_hook; - int ret; - - BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0)); - - ret = i_sectors_dirty_get(ei, &i_sectors_hook); - if (unlikely(ret)) - goto out; - - ret = bch_discard(c, - POS(ino, discard_start), - POS(ino, discard_end), - ZERO_VERSION, - &disk_res, - &i_sectors_hook.hook, - &ei->journal_seq); - - i_sectors_dirty_put(ei, &i_sectors_hook); - bch_disk_reservation_put(c, &disk_res); - } -out: - pagecache_block_put(&mapping->add_lock); - inode_unlock(inode); - - return ret; -} - -static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len) -{ - struct address_space *mapping = inode->i_mapping; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_fs *c = inode->i_sb->s_fs_info; - struct btree_iter src; - struct btree_iter dst; - BKEY_PADDED(k) copy; - struct bkey_s_c k; - struct i_sectors_hook i_sectors_hook; - loff_t new_size; - int ret; - - if ((offset | len) & (PAGE_SIZE - 1)) - return -EINVAL; - - bch_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, offset >> 9)); - /* position 
will be set from dst iter's position: */ - bch_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN); - bch_btree_iter_link(&src, &dst); - - /* - * We need i_mutex to keep the page cache consistent with the extents - * btree, and the btree consistent with i_size - we don't need outside - * locking for the extents btree itself, because we're using linked - * iterators - */ - inode_lock(inode); - inode_dio_wait(inode); - pagecache_block_get(&mapping->add_lock); - - ret = -EINVAL; - if (offset + len >= inode->i_size) - goto err; - - if (inode->i_size < len) - goto err; - - new_size = inode->i_size - len; - - ret = write_invalidate_inode_pages_range(inode->i_mapping, - offset, LLONG_MAX); - if (ret) - goto err; - - ret = i_sectors_dirty_get(ei, &i_sectors_hook); - if (ret) - goto err; - - while (bkey_cmp(dst.pos, - POS(inode->i_ino, - round_up(new_size, PAGE_SIZE) >> 9)) < 0) { - struct disk_reservation disk_res; - - bch_btree_iter_set_pos(&src, - POS(dst.pos.inode, dst.pos.offset + (len >> 9))); - - ret = bch_btree_iter_traverse(&dst); - if (ret) - goto btree_iter_err; - - k = bch_btree_iter_peek_with_holes(&src); - if ((ret = btree_iter_err(k))) - goto btree_iter_err; - - bkey_reassemble(©.k, k); - - if (bkey_deleted(©.k.k)) - copy.k.k.type = KEY_TYPE_DISCARD; - - bch_cut_front(src.pos, ©.k); - copy.k.k.p.offset -= len >> 9; - - BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); - - ret = bch_disk_reservation_get(c, &disk_res, copy.k.k.size, - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &ei->journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&dst, ©.k)); - bch_disk_reservation_put(c, &disk_res); -btree_iter_err: - if (ret < 0 && ret != -EINTR) - goto err_unwind; - - bch_btree_iter_cond_resched(&src); - } - - bch_btree_iter_unlock(&src); - bch_btree_iter_unlock(&dst); - - ret = bch_inode_truncate(c, inode->i_ino, - round_up(new_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &ei->journal_seq); - if (ret) - goto err_unwind; - - i_sectors_dirty_put(ei, &i_sectors_hook); - - mutex_lock(&ei->update_lock); - i_size_write(inode, new_size); - ret = bch_write_inode_size(c, ei, inode->i_size); - mutex_unlock(&ei->update_lock); - - pagecache_block_put(&mapping->add_lock); - inode_unlock(inode); - - return ret; -err_unwind: - /* - * XXX: we've left data with multiple pointers... which isn't a _super_ - * serious problem... 
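bch_fcollapse() above removes the byte range [offset, offset + len) by copying every extent from offset + len onward to a position len bytes lower (via the two linked iterators), then truncating the tail and shrinking i_size by len. The same arithmetic on a flat array of byte extents, as an illustrative model that assumes no extent straddles the range boundaries:

#include <stdio.h>

struct extent {
	unsigned long long	start;	/* bytes */
	unsigned long long	len;	/* bytes */
};

/*
 * Collapse [offset, offset + len): extents past the hole are shifted down
 * by @len, extents entirely inside the hole disappear.
 */
static unsigned collapse(struct extent *e, unsigned nr,
			 unsigned long long offset, unsigned long long len)
{
	unsigned i, out = 0;

	for (i = 0; i < nr; i++) {
		if (e[i].start + e[i].len <= offset) {
			e[out++] = e[i];		/* before the hole */
		} else if (e[i].start >= offset + len) {
			e[i].start -= len;		/* after: shift down */
			e[out++] = e[i];
		}
		/* else: inside the collapsed range, dropped */
	}
	return out;
}

int main(void)
{
	struct extent e[] = {
		{ 0,	4096 },
		{ 4096,	4096 },		/* inside [4096, 8192), dropped */
		{ 8192,	8192 },		/* shifted down to 4096 */
	};
	unsigned nr = collapse(e, 3, 4096, 4096);

	for (unsigned i = 0; i < nr; i++)
		printf("extent at %llu, len %llu\n", e[i].start, e[i].len);
	return 0;
}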
- */ - i_sectors_dirty_put(ei, &i_sectors_hook); -err: - bch_btree_iter_unlock(&src); - bch_btree_iter_unlock(&dst); - pagecache_block_put(&mapping->add_lock); - inode_unlock(inode); - return ret; -} - -static long bch_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct address_space *mapping = inode->i_mapping; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_fs *c = inode->i_sb->s_fs_info; - struct i_sectors_hook i_sectors_hook; - struct btree_iter iter; - struct bpos end; - loff_t block_start, block_end; - loff_t new_size = offset + len; - unsigned sectors; - unsigned replicas = READ_ONCE(c->opts.data_replicas); - int ret; - - bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - - inode_lock(inode); - inode_dio_wait(inode); - pagecache_block_get(&mapping->add_lock); - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > inode->i_size) { - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto err; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch_truncate_page(inode->i_mapping, - offset >> PAGE_SHIFT, - offset, offset + len); - - if (!ret && - offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) - ret = __bch_truncate_page(inode->i_mapping, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); - - if (unlikely(ret)) - goto err; - - truncate_pagecache_range(inode, offset, offset + len - 1); - - block_start = round_up(offset, PAGE_SIZE); - block_end = round_down(offset + len, PAGE_SIZE); - } else { - block_start = round_down(offset, PAGE_SIZE); - block_end = round_up(offset + len, PAGE_SIZE); - } - - bch_btree_iter_set_pos(&iter, POS(inode->i_ino, block_start >> 9)); - end = POS(inode->i_ino, block_end >> 9); - - ret = i_sectors_dirty_get(ei, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - while (bkey_cmp(iter.pos, end) < 0) { - struct disk_reservation disk_res = { 0 }; - struct bkey_i_reservation reservation; - struct bkey_s_c k; - - k = bch_btree_iter_peek_with_holes(&iter); - if ((ret = btree_iter_err(k))) - goto btree_iter_err; - - /* already reserved */ - if (k.k->type == BCH_RESERVATION && - bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch_btree_iter_advance_pos(&iter); - continue; - } - - if (bkey_extent_is_data(k.k)) { - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - bch_btree_iter_advance_pos(&iter); - continue; - } - } - - bkey_reservation_init(&reservation.k_i); - reservation.k.type = BCH_RESERVATION; - reservation.k.p = k.k->p; - reservation.k.size = k.k->size; - - bch_cut_front(iter.pos, &reservation.k_i); - bch_cut_back(end, &reservation.k); - - sectors = reservation.k.size; - reservation.v.nr_replicas = bch_extent_nr_dirty_ptrs(k); - - if (reservation.v.nr_replicas < replicas || - bkey_extent_is_compressed(k)) { - ret = bch_disk_reservation_get(c, &disk_res, - sectors, 0); - if (ret) - goto err_put_sectors_dirty; - - reservation.v.nr_replicas = disk_res.nr_replicas; - } - - ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &ei->journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &reservation.k_i)); - bch_disk_reservation_put(c, &disk_res); -btree_iter_err: - if (ret < 0 && ret != -EINTR) - goto err_put_sectors_dirty; - - } - bch_btree_iter_unlock(&iter); - - i_sectors_dirty_put(ei, &i_sectors_hook); - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > inode->i_size) { - i_size_write(inode, new_size); - - mutex_lock(&ei->update_lock); - ret = bch_write_inode_size(c, ei, inode->i_size); - mutex_unlock(&ei->update_lock); - } - - /* blech */ - 
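The two branches near the top of bch_fallocate() above round the byte range differently: FALLOC_FL_ZERO_RANGE rounds inward, because the partial head and tail pages are zeroed through the page cache by __bch_truncate_page(), while a plain reservation rounds outward so that every page the range touches ends up reserved. A small sketch of that rounding with illustrative helpers:

#include <stdio.h>

#define PAGE_SIZE	4096ULL

static unsigned long long round_down_page(unsigned long long v)
{
	return v & ~(PAGE_SIZE - 1);
}

static unsigned long long round_up_page(unsigned long long v)
{
	return round_down_page(v + PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long long offset = 1000, len = 10000;
	unsigned long long start, end;
	int zero_range = 1;

	if (zero_range) {
		/* partial pages are handled by zeroing the page cache */
		start = round_up_page(offset);
		end   = round_down_page(offset + len);
	} else {
		/* reserve every page the range touches */
		start = round_down_page(offset);
		end   = round_up_page(offset + len);
	}

	printf("reserve sectors [%llu, %llu)\n", start >> 9, end >> 9);
	return 0;
}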
if ((mode & FALLOC_FL_KEEP_SIZE) && - (mode & FALLOC_FL_ZERO_RANGE) && - ei->i_size != inode->i_size) { - /* sync appends.. */ - ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX); - if (ret) - goto err; - - if (ei->i_size != inode->i_size) { - mutex_lock(&ei->update_lock); - ret = bch_write_inode_size(c, ei, inode->i_size); - mutex_unlock(&ei->update_lock); - } - } - - pagecache_block_put(&mapping->add_lock); - inode_unlock(inode); - - return 0; -err_put_sectors_dirty: - i_sectors_dirty_put(ei, &i_sectors_hook); -err: - bch_btree_iter_unlock(&iter); - pagecache_block_put(&mapping->add_lock); - inode_unlock(inode); - return ret; -} - -long bch_fallocate_dispatch(struct file *file, int mode, - loff_t offset, loff_t len) -{ - struct inode *inode = file_inode(file); - - if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - return bch_fallocate(inode, mode, offset, len); - - if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - return bch_fpunch(inode, offset, len); - - if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bch_fcollapse(inode, offset, len); - - return -EOPNOTSUPP; -} - -static bool page_is_data(struct page *page) -{ - /* XXX: should only have to check PageDirty */ - return PagePrivate(page) && - (page_state(page)->sectors || - page_state(page)->dirty_sectors); -} - -static loff_t bch_next_pagecache_data(struct inode *inode, - loff_t start_offset, - loff_t end_offset) -{ - struct address_space *mapping = inode->i_mapping; - struct page *page; - pgoff_t index; - - for (index = start_offset >> PAGE_SHIFT; - index < end_offset >> PAGE_SHIFT; - index++) { - if (find_get_pages(mapping, index, 1, &page)) { - lock_page(page); - index = page->index; - - if (page_is_data(page)) - end_offset = - min(end_offset, - max(start_offset, - ((loff_t) index) << PAGE_SHIFT)); - unlock_page(page); - put_page(page); - } else { - break; - } - } - - return end_offset; -} - -static loff_t bch_seek_data(struct file *file, u64 offset) -{ - struct inode *inode = file->f_mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct btree_iter iter; - struct bkey_s_c k; - u64 isize, next_data = MAX_LFS_FILESIZE; - int ret; - - isize = i_size_read(inode); - if (offset >= isize) - return -ENXIO; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, offset >> 9), k) { - if (k.k->p.inode != inode->i_ino) { - break; - } else if (bkey_extent_is_data(k.k)) { - next_data = max(offset, bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - } - - ret = bch_btree_iter_unlock(&iter); - if (ret) - return ret; - - if (next_data > offset) - next_data = bch_next_pagecache_data(inode, offset, next_data); - - if (next_data > isize) - return -ENXIO; - - return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -} - -static bool page_slot_is_data(struct address_space *mapping, pgoff_t index) -{ - struct page *page; - bool ret; - - page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) - return false; - - ret = page_is_data(page); - unlock_page(page); - - return ret; -} - -static loff_t bch_next_pagecache_hole(struct inode *inode, - loff_t start_offset, - loff_t end_offset) -{ - struct address_space *mapping = inode->i_mapping; - pgoff_t index; - - for (index = start_offset >> PAGE_SHIFT; - index < end_offset >> PAGE_SHIFT; - index++) - if (!page_slot_is_data(mapping, index)) - end_offset = max(start_offset, - ((loff_t) index) << PAGE_SHIFT); - - return end_offset; -} - -static loff_t bch_seek_hole(struct file *file, u64 
offset) -{ - struct inode *inode = file->f_mapping->host; - struct bch_fs *c = inode->i_sb->s_fs_info; - struct btree_iter iter; - struct bkey_s_c k; - u64 isize, next_hole = MAX_LFS_FILESIZE; - int ret; - - isize = i_size_read(inode); - if (offset >= isize) - return -ENXIO; - - for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, offset >> 9), k) { - if (k.k->p.inode != inode->i_ino) { - next_hole = bch_next_pagecache_hole(inode, - offset, MAX_LFS_FILESIZE); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch_next_pagecache_hole(inode, - max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9); - - if (next_hole < k.k->p.offset << 9) - break; - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - } - - ret = bch_btree_iter_unlock(&iter); - if (ret) - return ret; - - if (next_hole > isize) - next_hole = isize; - - return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -} - -loff_t bch_llseek(struct file *file, loff_t offset, int whence) -{ - switch (whence) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: - return generic_file_llseek(file, offset, whence); - case SEEK_DATA: - return bch_seek_data(file, offset); - case SEEK_HOLE: - return bch_seek_hole(file, offset); - } - - return -EINVAL; -} diff --git a/libbcache/fs-io.h b/libbcache/fs-io.h deleted file mode 100644 index 4c428978..00000000 --- a/libbcache/fs-io.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef _BCACHE_FS_IO_H -#define _BCACHE_FS_IO_H - -#include "buckets.h" -#include <linux/uio.h> - -int bch_set_page_dirty(struct page *); - -int bch_writepage(struct page *, struct writeback_control *); -int bch_readpage(struct file *, struct page *); - -int bch_writepages(struct address_space *, struct writeback_control *); -int bch_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); - -int bch_write_begin(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page **, void **); -int bch_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); - -ssize_t bch_direct_IO(struct kiocb *, struct iov_iter *); - -ssize_t bch_write_iter(struct kiocb *, struct iov_iter *); - -int bch_fsync(struct file *, loff_t, loff_t, int); - -int bch_truncate(struct inode *, struct iattr *); -long bch_fallocate_dispatch(struct file *, int, loff_t, loff_t); - -loff_t bch_llseek(struct file *, loff_t, int); - -int bch_page_mkwrite(struct vm_area_struct *, struct vm_fault *); -void bch_invalidatepage(struct page *, unsigned int, unsigned int); -int bch_releasepage(struct page *, gfp_t); -int bch_migrate_page(struct address_space *, struct page *, - struct page *, enum migrate_mode); - -struct i_sectors_hook { - struct extent_insert_hook hook; - s64 sectors; - struct bch_inode_info *ei; -}; - -struct bchfs_write_op { - struct bch_inode_info *ei; - s64 sectors_added; - bool is_dio; - u64 new_i_size; - struct bch_write_op op; -}; - -struct bch_writepage_io { - struct closure cl; - - struct bchfs_write_op op; - - /* must come last: */ - struct bch_write_bio bio; -}; - -extern struct bio_set *bch_writepage_bioset; - -struct dio_write { - struct closure cl; - struct kiocb *req; - struct bch_fs *c; - long written; - long error; - loff_t offset; - - struct disk_reservation res; - - struct iovec *iovec; - struct iovec inline_vecs[UIO_FASTIOV]; - struct iov_iter iter; - - struct mm_struct *mm; - - struct bchfs_write_op iop; - - /* must be last: */ - struct bch_write_bio bio; -}; - -extern struct bio_set 
*bch_dio_write_bioset; - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - struct bch_read_bio rbio; -}; - -extern struct bio_set *bch_dio_read_bioset; - -#endif /* _BCACHE_FS_IO_H */ diff --git a/libbcache/fs.c b/libbcache/fs.c deleted file mode 100644 index f1125a32..00000000 --- a/libbcache/fs.c +++ /dev/null @@ -1,1481 +0,0 @@ - -#include "bcache.h" -#include "acl.h" -#include "btree_update.h" -#include "buckets.h" -#include "chardev.h" -#include "dirent.h" -#include "extents.h" -#include "fs.h" -#include "fs-gc.h" -#include "fs-io.h" -#include "inode.h" -#include "journal.h" -#include "keylist.h" -#include "super.h" -#include "xattr.h" - -#include <linux/aio.h> -#include <linux/backing-dev.h> -#include <linux/compat.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/random.h> -#include <linux/statfs.h> -#include <linux/xattr.h> - -static struct kmem_cache *bch_inode_cache; - -static void bch_vfs_inode_init(struct bch_fs *, - struct bch_inode_info *, - struct bch_inode_unpacked *); - -/* - * I_SIZE_DIRTY requires special handling: - * - * To the recovery code, the flag means that there is stale data past i_size - * that needs to be deleted; it's used for implementing atomic appends and - * truncates. - * - * On append, we set I_SIZE_DIRTY before doing the write, then after the write - * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size - * that exposes the data we just wrote. - * - * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting - * i_size to the new smaller size, then we delete the data that we just made - * invisible, and then we clear I_SIZE_DIRTY. - * - * Because there can be multiple appends in flight at a time, we need a refcount - * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero - * refcount means I_SIZE_DIRTY is set, zero means it's cleared. - * - * Because write_inode() can be called at any time, i_size_dirty_count means - * something different to the runtime code - it means to write_inode() "don't - * update i_size yet". - * - * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when - * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must - * be set explicitly. - */ - -int __must_check __bch_write_inode(struct bch_fs *c, - struct bch_inode_info *ei, - inode_set_fn set, - void *p) -{ - struct btree_iter iter; - struct inode *inode = &ei->vfs_inode; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - u64 inum = inode->i_ino; - unsigned i_nlink = READ_ONCE(inode->i_nlink); - int ret; - - /* - * We can't write an inode with i_nlink == 0 because it's stored biased; - * however, we don't need to because if i_nlink is 0 the inode is - * getting deleted when it's evicted. 
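The I_SIZE_DIRTY scheme described in the long comment above can be modelled with nothing more than a counter: take a reference (which forces the on-disk flag) before exposing blocks past i_size, publish the larger i_size once the write is done, then drop the reference so the flag can be cleared. A minimal userspace model of the append ordering, with no journaling and illustrative names:

#include <stdatomic.h>
#include <stdio.h>

struct minode {
	atomic_int		i_size_dirty_count;	/* nonzero: I_SIZE_DIRTY set */
	unsigned long long	i_size;
};

static void append(struct minode *inode, unsigned long long nbytes)
{
	/* 1. mark i_size untrustworthy before exposing new blocks */
	atomic_fetch_add(&inode->i_size_dirty_count, 1);

	/* 2. ... write the data itself here ... */

	/* 3. publish the new size, then allow the flag to be cleared */
	inode->i_size += nbytes;
	atomic_fetch_sub(&inode->i_size_dirty_count, 1);
}

int main(void)
{
	struct minode inode = { .i_size = 4096 };

	append(&inode, 512);
	printf("i_size %llu, dirty count %d\n", inode.i_size,
	       atomic_load(&inode.i_size_dirty_count));
	return 0;
}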
- */ - if (!i_nlink) - return 0; - - lockdep_assert_held(&ei->update_lock); - - bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0)); - - do { - struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter); - - if ((ret = btree_iter_err(k))) - goto out; - - if (WARN_ONCE(k.k->type != BCH_INODE_FS, - "inode %llu not found when updating", inum)) { - bch_btree_iter_unlock(&iter); - return -ENOENT; - } - - ret = bch_inode_unpack(bkey_s_c_to_inode(k), &inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", ret, inum)) { - ret = -ENOENT; - break; - } - - if (set) { - ret = set(ei, &inode_u, p); - if (ret) - goto out; - } - - BUG_ON(i_nlink < nlink_bias(inode->i_mode)); - - inode_u.i_mode = inode->i_mode; - inode_u.i_uid = i_uid_read(inode); - inode_u.i_gid = i_gid_read(inode); - inode_u.i_nlink = i_nlink - nlink_bias(inode->i_mode); - inode_u.i_dev = inode->i_rdev; - inode_u.i_atime = timespec_to_bch_time(c, inode->i_atime); - inode_u.i_mtime = timespec_to_bch_time(c, inode->i_mtime); - inode_u.i_ctime = timespec_to_bch_time(c, inode->i_ctime); - - bch_inode_pack(&inode_p, &inode_u); - - ret = bch_btree_insert_at(c, NULL, NULL, &ei->journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); - } while (ret == -EINTR); - - if (!ret) { - ei->i_size = inode_u.i_size; - ei->i_flags = inode_u.i_flags; - } -out: - bch_btree_iter_unlock(&iter); - - return ret < 0 ? ret : 0; -} - -int __must_check bch_write_inode(struct bch_fs *c, - struct bch_inode_info *ei) -{ - return __bch_write_inode(c, ei, NULL, NULL); -} - -int bch_inc_nlink(struct bch_fs *c, struct bch_inode_info *ei) -{ - int ret; - - mutex_lock(&ei->update_lock); - inc_nlink(&ei->vfs_inode); - ret = bch_write_inode(c, ei); - mutex_unlock(&ei->update_lock); - - return ret; -} - -int bch_dec_nlink(struct bch_fs *c, struct bch_inode_info *ei) -{ - int ret = 0; - - mutex_lock(&ei->update_lock); - drop_nlink(&ei->vfs_inode); - ret = bch_write_inode(c, ei); - mutex_unlock(&ei->update_lock); - - return ret; -} - -static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum) -{ - struct bch_fs *c = sb->s_fs_info; - struct inode *inode; - struct bch_inode_unpacked inode_u; - struct bch_inode_info *ei; - int ret; - - pr_debug("inum %llu", inum); - - inode = iget_locked(sb, inum); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - ret = bch_inode_find_by_inum(c, inum, &inode_u); - if (ret) { - iget_failed(inode); - return ERR_PTR(ret); - } - - ei = to_bch_ei(inode); - bch_vfs_inode_init(c, ei, &inode_u); - - ei->journal_seq = bch_inode_journal_seq(&c->journal, inum); - - unlock_new_inode(inode); - - return inode; -} - -static struct inode *bch_vfs_inode_create(struct bch_fs *c, - struct inode *parent, - umode_t mode, dev_t rdev) -{ - struct inode *inode; - struct posix_acl *default_acl = NULL, *acl = NULL; - struct bch_inode_info *ei; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - int ret; - - inode = new_inode(parent->i_sb); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - - inode_init_owner(inode, parent, mode); - - ret = posix_acl_create(parent, &inode->i_mode, &default_acl, &acl); - if (ret) { - make_bad_inode(inode); - goto err; - } - - ei = to_bch_ei(inode); - - bch_inode_init(c, &inode_u, i_uid_read(inode), - i_gid_read(inode), inode->i_mode, rdev); - bch_inode_pack(&inode_p, &inode_u); - - ret = bch_inode_create(c, &inode_p.inode.k_i, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); 
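__bch_write_inode() above is an optimistic read-modify-write loop: re-read the packed inode, apply the caller's set() callback, attempt an atomic btree insert, and start over whenever the insert returns -EINTR because the inode changed underneath it. The same shape reduced to a 64-bit value and a compare-and-swap, purely as an illustration of the retry structure, not the btree API:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t packed_inode;	/* stand-in for the on-disk inode */

typedef uint64_t (*inode_set_fn)(uint64_t old, void *p);

static int write_inode(inode_set_fn set, void *p)
{
	uint64_t old, new;

	do {
		old = atomic_load(&packed_inode);	/* "peek" */
		new = set(old, p);			/* caller's update */
		/* the "insert" fails, like -EINTR, if someone raced us */
	} while (!atomic_compare_exchange_weak(&packed_inode, &old, new));

	return 0;
}

static uint64_t bump_size(uint64_t old, void *p)
{
	return old + *(uint64_t *) p;
}

int main(void)
{
	uint64_t grow = 512;

	atomic_store(&packed_inode, 4096);
	write_inode(bump_size, &grow);
	printf("inode now %llu\n",
	       (unsigned long long) atomic_load(&packed_inode));
	return 0;
}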
- if (unlikely(ret)) { - /* - * indicate to bch_evict_inode that the inode was never actually - * created: - */ - make_bad_inode(inode); - goto err; - } - - inode_u.inum = inode_p.inode.k.p.inode; - bch_vfs_inode_init(c, ei, &inode_u); - - if (default_acl) { - ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); - if (unlikely(ret)) - goto err; - } - - if (acl) { - ret = bch_set_acl(inode, acl, ACL_TYPE_ACCESS); - if (unlikely(ret)) - goto err; - } - - insert_inode_hash(inode); - atomic_long_inc(&c->nr_inodes); -out: - posix_acl_release(default_acl); - posix_acl_release(acl); - return inode; -err: - clear_nlink(inode); - iput(inode); - inode = ERR_PTR(ret); - goto out; -} - -static int bch_vfs_dirent_create(struct bch_fs *c, struct inode *dir, - u8 type, const struct qstr *name, - struct inode *dst) -{ - struct bch_inode_info *dir_ei = to_bch_ei(dir); - int ret; - - ret = bch_dirent_create(c, dir->i_ino, &dir_ei->str_hash, - type, name, dst->i_ino, - &dir_ei->journal_seq, - BCH_HASH_SET_MUST_CREATE); - if (unlikely(ret)) - return ret; - - dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); - mark_inode_dirty_sync(dir); - return 0; -} - -static int __bch_create(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_inode_info *dir_ei = to_bch_ei(dir); - struct bch_fs *c = dir->i_sb->s_fs_info; - struct inode *inode; - struct bch_inode_info *ei; - int ret; - - inode = bch_vfs_inode_create(c, dir, mode, rdev); - if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); - - ei = to_bch_ei(inode); - - ret = bch_vfs_dirent_create(c, dir, mode_to_type(mode), - &dentry->d_name, inode); - if (unlikely(ret)) { - clear_nlink(inode); - iput(inode); - return ret; - } - - if (dir_ei->journal_seq > ei->journal_seq) - ei->journal_seq = dir_ei->journal_seq; - - d_instantiate(dentry, inode); - return 0; -} - -/* methods */ - -static struct dentry *bch_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct bch_fs *c = dir->i_sb->s_fs_info; - struct bch_inode_info *dir_ei = to_bch_ei(dir); - struct inode *inode = NULL; - u64 inum; - - inum = bch_dirent_lookup(c, dir->i_ino, - &dir_ei->str_hash, - &dentry->d_name); - - if (inum) - inode = bch_vfs_inode_get(dir->i_sb, inum); - - return d_splice_alias(inode, dentry); -} - -static int bch_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) -{ - return __bch_create(dir, dentry, mode|S_IFREG, 0); -} - -static int bch_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct bch_fs *c = dir->i_sb->s_fs_info; - struct inode *inode = old_dentry->d_inode; - struct bch_inode_info *ei = to_bch_ei(inode); - int ret; - - lockdep_assert_held(&inode->i_rwsem); - - inode->i_ctime = current_fs_time(dir->i_sb); - - ret = bch_inc_nlink(c, ei); - if (ret) - return ret; - - ihold(inode); - - ret = bch_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode), - &dentry->d_name, inode); - if (unlikely(ret)) { - bch_dec_nlink(c, ei); - iput(inode); - return ret; - } - - d_instantiate(dentry, inode); - return 0; -} - -static int bch_unlink(struct inode *dir, struct dentry *dentry) -{ - struct bch_fs *c = dir->i_sb->s_fs_info; - struct bch_inode_info *dir_ei = to_bch_ei(dir); - struct inode *inode = dentry->d_inode; - struct bch_inode_info *ei = to_bch_ei(inode); - int ret; - - lockdep_assert_held(&inode->i_rwsem); - - ret = bch_dirent_delete(c, dir->i_ino, &dir_ei->str_hash, - &dentry->d_name, &dir_ei->journal_seq); - if (ret) - return ret; - - if (dir_ei->journal_seq > 
ei->journal_seq) - ei->journal_seq = dir_ei->journal_seq; - - inode->i_ctime = dir->i_ctime; - - if (S_ISDIR(inode->i_mode)) { - bch_dec_nlink(c, dir_ei); - drop_nlink(inode); - } - - bch_dec_nlink(c, ei); - - return 0; -} - -static int bch_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct bch_fs *c = dir->i_sb->s_fs_info; - struct inode *inode; - struct bch_inode_info *ei, *dir_ei = to_bch_ei(dir); - int ret; - - inode = bch_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0); - if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); - - ei = to_bch_ei(inode); - - inode_lock(inode); - ret = page_symlink(inode, symname, strlen(symname) + 1); - inode_unlock(inode); - - if (unlikely(ret)) - goto err; - - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (unlikely(ret)) - goto err; - - /* XXX: racy */ - if (dir_ei->journal_seq < ei->journal_seq) - dir_ei->journal_seq = ei->journal_seq; - - ret = bch_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode); - if (unlikely(ret)) - goto err; - - d_instantiate(dentry, inode); - return 0; -err: - clear_nlink(inode); - iput(inode); - return ret; -} - -static int bch_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct bch_fs *c = dir->i_sb->s_fs_info; - int ret; - - lockdep_assert_held(&dir->i_rwsem); - - ret = __bch_create(dir, dentry, mode|S_IFDIR, 0); - if (unlikely(ret)) - return ret; - - bch_inc_nlink(c, to_bch_ei(dir)); - - return 0; -} - -static int bch_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct bch_fs *c = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; - - if (bch_empty_dir(c, inode->i_ino)) - return -ENOTEMPTY; - - return bch_unlink(dir, dentry); -} - -static int bch_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - return __bch_create(dir, dentry, mode, rdev); -} - -static int bch_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct bch_fs *c = old_dir->i_sb->s_fs_info; - struct inode *old_inode = old_dentry->d_inode; - struct bch_inode_info *ei = to_bch_ei(old_inode); - struct inode *new_inode = new_dentry->d_inode; - struct timespec now = current_fs_time(old_dir->i_sb); - int ret; - - lockdep_assert_held(&old_dir->i_rwsem); - lockdep_assert_held(&new_dir->i_rwsem); - - if (new_inode) - filemap_write_and_wait_range(old_inode->i_mapping, - 0, LLONG_MAX); - - if (new_inode && S_ISDIR(old_inode->i_mode)) { - lockdep_assert_held(&new_inode->i_rwsem); - - if (!S_ISDIR(new_inode->i_mode)) - return -ENOTDIR; - - if (bch_empty_dir(c, new_inode->i_ino)) - return -ENOTEMPTY; - - ret = bch_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &ei->journal_seq, BCH_RENAME_OVERWRITE); - if (unlikely(ret)) - return ret; - - clear_nlink(new_inode); - bch_dec_nlink(c, to_bch_ei(old_dir)); - } else if (new_inode) { - lockdep_assert_held(&new_inode->i_rwsem); - - ret = bch_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &ei->journal_seq, BCH_RENAME_OVERWRITE); - if (unlikely(ret)) - return ret; - - new_inode->i_ctime = now; - bch_dec_nlink(c, to_bch_ei(new_inode)); - } else if (S_ISDIR(old_inode->i_mode)) { - ret = bch_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &ei->journal_seq, BCH_RENAME); - if (unlikely(ret)) - return ret; - - bch_inc_nlink(c, to_bch_ei(new_dir)); - bch_dec_nlink(c, to_bch_ei(old_dir)); - } else { - ret = bch_dirent_rename(c, - 
old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &ei->journal_seq, BCH_RENAME); - if (unlikely(ret)) - return ret; - } - - old_dir->i_ctime = old_dir->i_mtime = now; - new_dir->i_ctime = new_dir->i_mtime = now; - mark_inode_dirty_sync(old_dir); - mark_inode_dirty_sync(new_dir); - - old_inode->i_ctime = now; - mark_inode_dirty_sync(old_inode); - - return 0; -} - -static int bch_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct bch_fs *c = old_dir->i_sb->s_fs_info; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; - struct bch_inode_info *ei = to_bch_ei(old_inode); - struct timespec now = current_fs_time(old_dir->i_sb); - int ret; - - ret = bch_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &ei->journal_seq, BCH_RENAME_EXCHANGE); - if (unlikely(ret)) - return ret; - - if (S_ISDIR(old_inode->i_mode) != - S_ISDIR(new_inode->i_mode)) { - if (S_ISDIR(old_inode->i_mode)) { - bch_inc_nlink(c, to_bch_ei(new_dir)); - bch_dec_nlink(c, to_bch_ei(old_dir)); - } else { - bch_dec_nlink(c, to_bch_ei(new_dir)); - bch_inc_nlink(c, to_bch_ei(old_dir)); - } - } - - old_dir->i_ctime = old_dir->i_mtime = now; - new_dir->i_ctime = new_dir->i_mtime = now; - mark_inode_dirty_sync(old_dir); - mark_inode_dirty_sync(new_dir); - - old_inode->i_ctime = now; - new_inode->i_ctime = now; - mark_inode_dirty_sync(old_inode); - mark_inode_dirty_sync(new_inode); - - return 0; -} - -static int bch_rename2(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned flags) -{ - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) - return -EINVAL; - - if (flags & RENAME_EXCHANGE) - return bch_rename_exchange(old_dir, old_dentry, - new_dir, new_dentry); - - return bch_rename(old_dir, old_dentry, new_dir, new_dentry); -} - -static int bch_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_fs *c = inode->i_sb->s_fs_info; - int ret = 0; - - lockdep_assert_held(&inode->i_rwsem); - - pr_debug("i_size was %llu update has %llu", - inode->i_size, iattr->ia_size); - - ret = setattr_prepare(dentry, iattr); - if (ret) - return ret; - - if (iattr->ia_valid & ATTR_SIZE) { - ret = bch_truncate(inode, iattr); - } else { - mutex_lock(&ei->update_lock); - setattr_copy(inode, iattr); - ret = bch_write_inode(c, ei); - mutex_unlock(&ei->update_lock); - } - - if (unlikely(ret)) - return ret; - - if (iattr->ia_valid & ATTR_MODE) - ret = posix_acl_chmod(inode, inode->i_mode); - - return ret; -} - -static int bch_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct bch_fs *c = dir->i_sb->s_fs_info; - struct inode *inode; - - /* XXX: i_nlink should be 0? 
*/ - inode = bch_vfs_inode_create(c, dir, mode, 0); - if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); - - d_tmpfile(dentry, inode); - return 0; -} - -static int bch_fill_extent(struct fiemap_extent_info *info, - const struct bkey_i *k, unsigned flags) -{ - if (bkey_extent_is_data(&k->k)) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - int ret; - - extent_for_each_ptr_crc(e, ptr, crc) { - int flags2 = 0; - u64 offset = ptr->offset; - - if (crc_compression_type(crc)) - flags2 |= FIEMAP_EXTENT_ENCODED; - else - offset += crc_offset(crc); - - if ((offset & (PAGE_SECTORS - 1)) || - (e.k->size & (PAGE_SECTORS - 1))) - flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; - - ret = fiemap_fill_next_extent(info, - bkey_start_offset(e.k) << 9, - offset << 9, - e.k->size << 9, flags|flags2); - if (ret) - return ret; - } - - return 0; - } else if (k->k.type == BCH_RESERVATION) { - return fiemap_fill_next_extent(info, - bkey_start_offset(&k->k) << 9, - 0, k->k.size << 9, - flags| - FIEMAP_EXTENT_DELALLOC| - FIEMAP_EXTENT_UNWRITTEN); - } else { - BUG(); - } -} - -static int bch_fiemap(struct inode *inode, struct fiemap_extent_info *info, - u64 start, u64 len) -{ - struct bch_fs *c = inode->i_sb->s_fs_info; - struct btree_iter iter; - struct bkey_s_c k; - BKEY_PADDED(k) tmp; - bool have_extent = false; - int ret = 0; - - if (start + len < start) - return -EINVAL; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, start >> 9), k) - if (bkey_extent_is_data(k.k) || - k.k->type == BCH_RESERVATION) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(inode->i_ino, (start + len) >> 9)) >= 0) - break; - - if (have_extent) { - ret = bch_fill_extent(info, &tmp.k, 0); - if (ret) - goto out; - } - - bkey_reassemble(&tmp.k, k); - have_extent = true; - } - - if (have_extent) - ret = bch_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); -out: - bch_btree_iter_unlock(&iter); - return ret < 0 ? 
ret : 0; -} - -static const struct vm_operations_struct bch_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = bch_page_mkwrite, -}; - -static int bch_mmap(struct file *file, struct vm_area_struct *vma) -{ - file_accessed(file); - - vma->vm_ops = &bch_vm_ops; - return 0; -} - -/* Inode flags: */ - -static const unsigned bch_inode_flags_to_vfs_flags_map[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, -}; - -static const unsigned bch_inode_flags_to_user_flags_map[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -}; - -/* Set VFS inode flags from bcache inode: */ -static void bch_inode_flags_to_vfs(struct inode *inode) -{ - unsigned i, flags = to_bch_ei(inode)->i_flags; - - for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_vfs_flags_map); i++) - if (flags & (1 << i)) - inode->i_flags |= bch_inode_flags_to_vfs_flags_map[i]; - else - inode->i_flags &= ~bch_inode_flags_to_vfs_flags_map[i]; -} - -/* Get FS_IOC_GETFLAGS flags from bcache inode: */ -static unsigned bch_inode_flags_to_user_flags(unsigned flags) -{ - unsigned i, ret = 0; - - for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++) - if (flags & (1 << i)) - ret |= bch_inode_flags_to_user_flags_map[i]; - - return ret; -} - -static int bch_inode_user_flags_set(struct bch_inode_info *ei, - struct bch_inode_unpacked *bi, - void *p) -{ - /* - * We're relying on btree locking here for exclusion with other ioctl - * calls - use the flags in the btree (@bi), not ei->i_flags: - */ - unsigned bch_flags = bi->i_flags; - unsigned oldflags = bch_inode_flags_to_user_flags(bch_flags); - unsigned newflags = *((unsigned *) p); - unsigned i; - - if (((newflags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) && - !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - - for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++) { - if (newflags & bch_inode_flags_to_user_flags_map[i]) - bch_flags |= (1 << i); - else - bch_flags &= ~(1 << i); - - newflags &= ~bch_inode_flags_to_user_flags_map[i]; - oldflags &= ~bch_inode_flags_to_user_flags_map[i]; - } - - if (oldflags != newflags) - return -EOPNOTSUPP; - - bi->i_flags = bch_flags; - ei->vfs_inode.i_ctime = current_fs_time(ei->vfs_inode.i_sb); - - return 0; -} - -#define FS_IOC_GOINGDOWN _IOR ('X', 125, __u32) - -static long bch_fs_file_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - struct bch_fs *c = sb->s_fs_info; - struct bch_inode_info *ei = to_bch_ei(inode); - unsigned flags; - int ret; - - switch (cmd) { - case FS_IOC_GETFLAGS: - return put_user(bch_inode_flags_to_user_flags(ei->i_flags), - (int __user *) arg); - - case FS_IOC_SETFLAGS: { - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto setflags_out; - } - - if (get_user(flags, (int __user *) arg)) { - ret = -EFAULT; - goto setflags_out; - } - - if (!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode) && - (flags & (FS_NODUMP_FL|FS_NOATIME_FL)) != flags) { - ret = -EINVAL; - goto setflags_out; - } - - inode_lock(inode); - - mutex_lock(&ei->update_lock); - ret = __bch_write_inode(c, ei, bch_inode_user_flags_set, &flags); - mutex_unlock(&ei->update_lock); - - if (!ret) - 
bch_inode_flags_to_vfs(inode); - - inode_unlock(inode); -setflags_out: - mnt_drop_write_file(filp); - return ret; - } - - case FS_IOC_GETVERSION: - return -ENOTTY; - case FS_IOC_SETVERSION: - return -ENOTTY; - - case FS_IOC_GOINGDOWN: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - down_write(&sb->s_umount); - sb->s_flags |= MS_RDONLY; - bch_fs_emergency_read_only(c); - up_write(&sb->s_umount); - return 0; - - default: - return bch_fs_ioctl(c, cmd, (void __user *) arg); - } -} - -#ifdef CONFIG_COMPAT -static long bch_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - /* These are just misnamed, they actually get/put from/to user an int */ - switch (cmd) { - case FS_IOC_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; - default: - return -ENOIOCTLCMD; - } - return bch_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif - -/* Directories: */ - -static loff_t bch_dir_llseek(struct file *file, loff_t offset, int whence) -{ - return generic_file_llseek_size(file, offset, whence, - S64_MAX, S64_MAX); -} - -static int bch_vfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct inode *inode = file_inode(file); - struct bch_fs *c = inode->i_sb->s_fs_info; - - return bch_readdir(c, file, ctx); -} - -static const struct file_operations bch_file_operations = { - .llseek = bch_llseek, - .read_iter = generic_file_read_iter, - .write_iter = bch_write_iter, - .mmap = bch_mmap, - .open = generic_file_open, - .fsync = bch_fsync, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = bch_fallocate_dispatch, - .unlocked_ioctl = bch_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch_compat_fs_ioctl, -#endif -}; - -static const struct inode_operations bch_file_inode_operations = { - .setattr = bch_setattr, - .fiemap = bch_fiemap, - .listxattr = bch_xattr_list, - .get_acl = bch_get_acl, - .set_acl = bch_set_acl, -}; - -static const struct inode_operations bch_dir_inode_operations = { - .lookup = bch_lookup, - .create = bch_create, - .link = bch_link, - .unlink = bch_unlink, - .symlink = bch_symlink, - .mkdir = bch_mkdir, - .rmdir = bch_rmdir, - .mknod = bch_mknod, - .rename = bch_rename2, - .setattr = bch_setattr, - .tmpfile = bch_tmpfile, - .listxattr = bch_xattr_list, - .get_acl = bch_get_acl, - .set_acl = bch_set_acl, -}; - -static const struct file_operations bch_dir_file_operations = { - .llseek = bch_dir_llseek, - .read = generic_read_dir, - .iterate = bch_vfs_readdir, - .fsync = bch_fsync, - .unlocked_ioctl = bch_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch_compat_fs_ioctl, -#endif -}; - -static const struct inode_operations bch_symlink_inode_operations = { - .readlink = generic_readlink, - .get_link = page_get_link, - .setattr = bch_setattr, - .listxattr = bch_xattr_list, - .get_acl = bch_get_acl, - .set_acl = bch_set_acl, -}; - -static const struct inode_operations bch_special_inode_operations = { - .setattr = bch_setattr, - .listxattr = bch_xattr_list, - .get_acl = bch_get_acl, - .set_acl = bch_set_acl, -}; - -static const struct address_space_operations bch_address_space_operations = { - .writepage = bch_writepage, - .readpage = bch_readpage, - .writepages = bch_writepages, - .readpages = bch_readpages, - .set_page_dirty = bch_set_page_dirty, - .write_begin = bch_write_begin, - .write_end = bch_write_end, - .invalidatepage = bch_invalidatepage, - .releasepage = bch_releasepage, - .direct_IO = bch_direct_IO, -#ifdef 
CONFIG_MIGRATION - .migratepage = bch_migrate_page, -#endif - .error_remove_page = generic_error_remove_page, -}; - -static void bch_vfs_inode_init(struct bch_fs *c, - struct bch_inode_info *ei, - struct bch_inode_unpacked *bi) -{ - struct inode *inode = &ei->vfs_inode; - - pr_debug("init inode %llu with mode %o", - bi->inum, bi->i_mode); - - ei->i_flags = bi->i_flags; - ei->i_size = bi->i_size; - - inode->i_mode = bi->i_mode; - i_uid_write(inode, bi->i_uid); - i_gid_write(inode, bi->i_gid); - - atomic64_set(&ei->i_sectors, bi->i_sectors); - inode->i_blocks = bi->i_sectors; - - inode->i_ino = bi->inum; - set_nlink(inode, bi->i_nlink + nlink_bias(inode->i_mode)); - inode->i_rdev = bi->i_dev; - inode->i_generation = bi->i_generation; - inode->i_size = bi->i_size; - inode->i_atime = bch_time_to_timespec(c, bi->i_atime); - inode->i_mtime = bch_time_to_timespec(c, bi->i_mtime); - inode->i_ctime = bch_time_to_timespec(c, bi->i_ctime); - bch_inode_flags_to_vfs(inode); - - ei->str_hash = bch_hash_info_init(bi); - - inode->i_mapping->a_ops = &bch_address_space_operations; - - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &bch_file_inode_operations; - inode->i_fop = &bch_file_operations; - break; - case S_IFDIR: - inode->i_op = &bch_dir_inode_operations; - inode->i_fop = &bch_dir_file_operations; - break; - case S_IFLNK: - inode_nohighmem(inode); - inode->i_op = &bch_symlink_inode_operations; - break; - default: - init_special_inode(inode, inode->i_mode, inode->i_rdev); - inode->i_op = &bch_special_inode_operations; - break; - } -} - -static struct inode *bch_alloc_inode(struct super_block *sb) -{ - struct bch_inode_info *ei; - - ei = kmem_cache_alloc(bch_inode_cache, GFP_NOFS); - if (!ei) - return NULL; - - pr_debug("allocated %p", &ei->vfs_inode); - - inode_init_once(&ei->vfs_inode); - mutex_init(&ei->update_lock); - ei->journal_seq = 0; - atomic_long_set(&ei->i_size_dirty_count, 0); - atomic_long_set(&ei->i_sectors_dirty_count, 0); - - return &ei->vfs_inode; -} - -static void bch_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - - kmem_cache_free(bch_inode_cache, to_bch_ei(inode)); -} - -static void bch_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, bch_i_callback); -} - -static int bch_vfs_write_inode(struct inode *inode, - struct writeback_control *wbc) -{ - struct bch_fs *c = inode->i_sb->s_fs_info; - struct bch_inode_info *ei = to_bch_ei(inode); - int ret; - - mutex_lock(&ei->update_lock); - ret = bch_write_inode(c, ei); - mutex_unlock(&ei->update_lock); - - if (c->opts.journal_flush_disabled) - return ret; - - if (!ret && wbc->sync_mode == WB_SYNC_ALL) - ret = bch_journal_flush_seq(&c->journal, ei->journal_seq); - - return ret; -} - -static void bch_evict_inode(struct inode *inode) -{ - struct bch_fs *c = inode->i_sb->s_fs_info; - - truncate_inode_pages_final(&inode->i_data); - - if (!bch_journal_error(&c->journal) && !is_bad_inode(inode)) { - struct bch_inode_info *ei = to_bch_ei(inode); - - /* XXX - we want to check this stuff iff there weren't IO errors: */ - BUG_ON(atomic_long_read(&ei->i_sectors_dirty_count)); - BUG_ON(atomic64_read(&ei->i_sectors) != inode->i_blocks); - } - - clear_inode(inode); - - if (!inode->i_nlink && !is_bad_inode(inode)) { - bch_inode_rm(c, inode->i_ino); - atomic_long_dec(&c->nr_inodes); - } -} - -static int bch_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct bch_fs *c = sb->s_fs_info; - u64 fsid; - - buf->f_type = 
BCACHE_STATFS_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; - buf->f_bfree = (c->capacity - bch_fs_sectors_used(c)) >> PAGE_SECTOR_SHIFT; - buf->f_bavail = buf->f_bfree; - buf->f_files = atomic_long_read(&c->nr_inodes); - buf->f_ffree = U64_MAX; - - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; - buf->f_namelen = NAME_MAX; - - return 0; -} - -static int bch_sync_fs(struct super_block *sb, int wait) -{ - struct bch_fs *c = sb->s_fs_info; - - if (!wait) { - bch_journal_flush_async(&c->journal, NULL); - return 0; - } - - return bch_journal_flush(&c->journal); -} - -static struct bch_fs *bch_open_as_blockdevs(const char *_dev_name, - struct bch_opts opts) -{ - size_t nr_devs = 0, i = 0; - char *dev_name, *s, **devs; - struct bch_fs *c = NULL; - const char *err = "cannot allocate memory"; - - dev_name = kstrdup(_dev_name, GFP_KERNEL); - if (!dev_name) - return NULL; - - for (s = dev_name; s; s = strchr(s + 1, ':')) - nr_devs++; - - devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); - if (!devs) - goto err; - - for (i = 0, s = dev_name; - s; - (s = strchr(s, ':')) && (*s++ = '\0')) - devs[i++] = s; - - err = bch_fs_open(devs, nr_devs, opts, &c); - if (err) { - /* - * Already open? - * Look up each block device, make sure they all belong to a - * filesystem and they all belong to the _same_ filesystem - */ - - for (i = 0; i < nr_devs; i++) { - struct block_device *bdev = lookup_bdev(devs[i]); - struct bch_fs *c2; - - if (IS_ERR(bdev)) - goto err; - - c2 = bch_bdev_to_fs(bdev); - bdput(bdev); - - if (!c) - c = c2; - else if (c2) - closure_put(&c2->cl); - - if (!c) - goto err; - if (c != c2) { - closure_put(&c->cl); - goto err; - } - } - - mutex_lock(&c->state_lock); - - if (!bch_fs_running(c)) { - mutex_unlock(&c->state_lock); - closure_put(&c->cl); - err = "incomplete filesystem"; - c = NULL; - goto err; - } - - mutex_unlock(&c->state_lock); - } - - set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); -err: - kfree(devs); - kfree(dev_name); - - if (!c) - pr_err("bch_fs_open err %s", err); - return c; -} - -static int bch_remount(struct super_block *sb, int *flags, char *data) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_opts opts = bch_opts_empty(); - int ret; - - opts.read_only = (*flags & MS_RDONLY) != 0; - - ret = bch_parse_mount_opts(&opts, data); - if (ret) - return ret; - - if (opts.read_only >= 0 && - opts.read_only != c->opts.read_only) { - const char *err = NULL; - - if (opts.read_only) { - bch_fs_read_only(c); - - sb->s_flags |= MS_RDONLY; - } else { - err = bch_fs_read_write(c); - if (err) { - bch_err(c, "error going rw: %s", err); - return -EINVAL; - } - - sb->s_flags &= ~MS_RDONLY; - } - - c->opts.read_only = opts.read_only; - } - - if (opts.errors >= 0) - c->opts.errors = opts.errors; - - return ret; -} - -static const struct super_operations bch_super_operations = { - .alloc_inode = bch_alloc_inode, - .destroy_inode = bch_destroy_inode, - .write_inode = bch_vfs_write_inode, - .evict_inode = bch_evict_inode, - .sync_fs = bch_sync_fs, - .statfs = bch_statfs, - .show_options = generic_show_options, - .remount_fs = bch_remount, -#if 0 - .put_super = bch_put_super, - .freeze_fs = bch_freeze, - .unfreeze_fs = bch_unfreeze, -#endif -}; - -static int bch_test_super(struct super_block *s, void *data) -{ - return s->s_fs_info == data; -} - -static int bch_set_super(struct super_block *s, 
void *data) -{ - s->s_fs_info = data; - return 0; -} - -static struct dentry *bch_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - struct bch_fs *c; - struct bch_dev *ca; - struct super_block *sb; - struct inode *inode; - struct bch_opts opts = bch_opts_empty(); - unsigned i; - int ret; - - opts.read_only = (flags & MS_RDONLY) != 0; - - ret = bch_parse_mount_opts(&opts, data); - if (ret) - return ERR_PTR(ret); - - c = bch_open_as_blockdevs(dev_name, opts); - if (!c) - return ERR_PTR(-ENOENT); - - sb = sget(fs_type, bch_test_super, bch_set_super, flags|MS_NOSEC, c); - if (IS_ERR(sb)) { - closure_put(&c->cl); - return ERR_CAST(sb); - } - - BUG_ON(sb->s_fs_info != c); - - if (sb->s_root) { - closure_put(&c->cl); - - if ((flags ^ sb->s_flags) & MS_RDONLY) { - ret = -EBUSY; - goto err_put_super; - } - goto out; - } - - /* XXX: blocksize */ - sb->s_blocksize = PAGE_SIZE; - sb->s_blocksize_bits = PAGE_SHIFT; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_op = &bch_super_operations; - sb->s_xattr = bch_xattr_handlers; - sb->s_magic = BCACHE_STATFS_MAGIC; - sb->s_time_gran = c->sb.time_precision; - c->vfs_sb = sb; - sb->s_bdi = &c->bdi; - strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); - - for_each_online_member(ca, c, i) { - struct block_device *bdev = ca->disk_sb.bdev; - - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref); - break; - } - - if (opts.posix_acl < 0) - sb->s_flags |= MS_POSIXACL; - else - sb->s_flags |= opts.posix_acl ? MS_POSIXACL : 0; - - inode = bch_vfs_inode_get(sb, BCACHE_ROOT_INO); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto err_put_super; - } - - sb->s_root = d_make_root(inode); - if (!sb->s_root) { - ret = -ENOMEM; - goto err_put_super; - } - - sb->s_flags |= MS_ACTIVE; -out: - return dget(sb->s_root); - -err_put_super: - deactivate_locked_super(sb); - return ERR_PTR(ret); -} - -static void bch_kill_sb(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - generic_shutdown_super(sb); - - if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) - bch_fs_stop(c); - else - closure_put(&c->cl); -} - -static struct file_system_type bcache_fs_type = { - .owner = THIS_MODULE, - .name = "bcache", - .mount = bch_mount, - .kill_sb = bch_kill_sb, - .fs_flags = FS_REQUIRES_DEV, -}; - -MODULE_ALIAS_FS("bcache"); - -void bch_vfs_exit(void) -{ - unregister_filesystem(&bcache_fs_type); - if (bch_dio_write_bioset) - bioset_free(bch_dio_write_bioset); - if (bch_dio_read_bioset) - bioset_free(bch_dio_read_bioset); - if (bch_writepage_bioset) - bioset_free(bch_writepage_bioset); - if (bch_inode_cache) - kmem_cache_destroy(bch_inode_cache); -} - -int __init bch_vfs_init(void) -{ - int ret = -ENOMEM; - - bch_inode_cache = KMEM_CACHE(bch_inode_info, 0); - if (!bch_inode_cache) - goto err; - - bch_writepage_bioset = - bioset_create(4, offsetof(struct bch_writepage_io, bio.bio)); - if (!bch_writepage_bioset) - goto err; - - bch_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio)); - if (!bch_dio_read_bioset) - goto err; - - bch_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio)); - if (!bch_dio_write_bioset) - goto err; - - ret = register_filesystem(&bcache_fs_type); - if (ret) - goto err; - - return 0; -err: - bch_vfs_exit(); - return ret; -} diff --git a/libbcache/fs.h b/libbcache/fs.h deleted file mode 100644 index 1c0a2b15..00000000 --- a/libbcache/fs.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _BCACHE_FS_H -#define 
_BCACHE_FS_H - -#include "str_hash.h" - -#include <linux/seqlock.h> - -struct bch_inode_info { - struct inode vfs_inode; - - struct mutex update_lock; - u64 journal_seq; - - atomic_long_t i_size_dirty_count; - - /* - * these are updated whenever we update the inode in the btree - for - * e.g. fsync - */ - u64 i_size; - u32 i_flags; - - atomic_long_t i_sectors_dirty_count; - atomic64_t i_sectors; - - struct bch_hash_info str_hash; -}; - -#define to_bch_ei(_inode) \ - container_of(_inode, struct bch_inode_info, vfs_inode) - -static inline u8 mode_to_type(umode_t mode) -{ - return (mode >> 12) & 15; -} - -static inline unsigned nlink_bias(umode_t mode) -{ - return S_ISDIR(mode) ? 2 : 1; -} - -struct bch_inode_unpacked; - -#ifndef NO_BCACHE_FS - -/* returns 0 if we want to do the update, or error is passed up */ -typedef int (*inode_set_fn)(struct bch_inode_info *, - struct bch_inode_unpacked *, void *); - -int __must_check __bch_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *); -int __must_check bch_write_inode(struct bch_fs *, - struct bch_inode_info *); - -void bch_vfs_exit(void); -int bch_vfs_init(void); - -#else - -static inline void bch_vfs_exit(void) {} -static inline int bch_vfs_init(void) { return 0; } - -#endif - -#endif /* _BCACHE_FS_H */ diff --git a/libbcache/inode.c b/libbcache/inode.c deleted file mode 100644 index 2e15497f..00000000 --- a/libbcache/inode.c +++ /dev/null @@ -1,451 +0,0 @@ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "inode.h" -#include "io.h" -#include "keylist.h" - -#include <linux/random.h> - -#include <asm/unaligned.h> - -#define FIELD_BYTES() \ - -static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -static const u8 bits_table[8] = { - 1 * 8 - 1, - 2 * 8 - 2, - 3 * 8 - 3, - 4 * 8 - 4, - 6 * 8 - 5, - 8 * 8 - 6, - 10 * 8 - 7, - 13 * 8 - 8, -}; - -static int inode_encode_field(u8 *out, u8 *end, const u64 in[2]) -{ - unsigned bytes, bits, shift; - - if (likely(!in[1])) - bits = fls64(in[0]); - else - bits = fls64(in[1]) + 64; - - for (shift = 1; shift <= 8; shift++) - if (bits < bits_table[shift - 1]) - goto got_shift; - - BUG(); -got_shift: - bytes = byte_table[shift - 1]; - - BUG_ON(out + bytes > end); - - if (likely(bytes <= 8)) { - u64 b = cpu_to_be64(in[0]); - - memcpy(out, (void *) &b + 8 - bytes, bytes); - } else { - u64 b = cpu_to_be64(in[1]); - - memcpy(out, (void *) &b + 16 - bytes, bytes); - put_unaligned_be64(in[0], out + bytes - 8); - } - - *out |= (1 << 8) >> shift; - - return bytes; -} - -static int inode_decode_field(const u8 *in, const u8 *end, - u64 out[2], unsigned *out_bits) -{ - unsigned bytes, bits, shift; - - if (in >= end) - return -1; - - if (!*in) - return -1; - - /* - * position of highest set bit indicates number of bytes: - * shift = number of bits to remove in high byte: - */ - shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ - bytes = byte_table[shift - 1]; - bits = bytes * 8 - shift; - - if (in + bytes > end) - return -1; - - /* - * we're assuming it's safe to deref up to 7 bytes < in; this will work - * because keys always start quite a bit more than 7 bytes after the - * start of the btree node header: - */ - if (likely(bytes <= 8)) { - out[0] = get_unaligned_be64(in + bytes - 8); - out[0] <<= 64 - bits; - out[0] >>= 64 - bits; - out[1] = 0; - } else { - out[0] = get_unaligned_be64(in + bytes - 8); - out[1] = get_unaligned_be64(in + bytes - 16); - out[1] <<= 128 - bits; - out[1] >>= 128 - bits; - } - - *out_bits = out[1] ? 
64 + fls64(out[1]) : fls64(out[0]); - return bytes; -} - -void bch_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - u8 *out = packed->inode.v.fields; - u8 *end = (void *) &packed[1]; - u8 *last_nonzero_field = out; - u64 field[2]; - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.inode = inode->inum; - packed->inode.v.i_hash_seed = inode->i_hash_seed; - packed->inode.v.i_flags = cpu_to_le32(inode->i_flags); - packed->inode.v.i_mode = cpu_to_le16(inode->i_mode); - -#define BCH_INODE_FIELD(_name, _bits) \ - field[0] = inode->_name; \ - field[1] = 0; \ - out += inode_encode_field(out, end, field); \ - nr_fields++; \ - \ - if (field[0] | field[1]) { \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } - - BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD - - out = last_nonzero_field; - nr_fields = last_nonzero_fieldnr; - - set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v); - memset(out, 0, - (u8 *) &packed->inode.v + - bkey_val_bytes(&packed->inode.k) - out); - - SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); - - if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { - struct bch_inode_unpacked unpacked; - - int ret = bch_inode_unpack(inode_i_to_s_c(&packed->inode), - &unpacked); - BUG_ON(ret); - BUG_ON(unpacked.inum != inode->inum); - BUG_ON(unpacked.i_hash_seed != inode->i_hash_seed); - BUG_ON(unpacked.i_mode != inode->i_mode); - -#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name); - BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD - } -} - -int bch_inode_unpack(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) -{ - const u8 *in = inode.v->fields; - const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); - u64 field[2]; - unsigned fieldnr = 0, field_bits; - int ret; - - unpacked->inum = inode.k->p.inode; - unpacked->i_hash_seed = inode.v->i_hash_seed; - unpacked->i_flags = le32_to_cpu(inode.v->i_flags); - unpacked->i_mode = le16_to_cpu(inode.v->i_mode); - -#define BCH_INODE_FIELD(_name, _bits) \ - if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ - memset(&unpacked->_name, 0, \ - sizeof(*unpacked) - \ - offsetof(struct bch_inode_unpacked, _name)); \ - return 0; \ - } \ - \ - ret = inode_decode_field(in, end, field, &field_bits); \ - if (ret < 0) \ - return ret; \ - \ - if (field_bits > sizeof(unpacked->_name) * 8) \ - return -1; \ - \ - unpacked->_name = field[0]; \ - in += ret; - - BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD - - /* XXX: signal if there were more fields than expected? 
*/ - - return 0; -} - -static const char *bch_inode_invalid(const struct bch_fs *c, - struct bkey_s_c k) -{ - if (k.k->p.offset) - return "nonzero offset"; - - switch (k.k->type) { - case BCH_INODE_FS: { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; - - if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) - return "incorrect value size"; - - if (k.k->p.inode < BLOCKDEV_INODE_MAX) - return "fs inode in blockdev range"; - - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) - return "invalid str hash type"; - - if (bch_inode_unpack(inode, &unpacked)) - return "invalid variable length fields"; - - return NULL; - } - case BCH_INODE_BLOCKDEV: - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev)) - return "incorrect value size"; - - if (k.k->p.inode >= BLOCKDEV_INODE_MAX) - return "blockdev inode in fs range"; - - return NULL; - default: - return "invalid type"; - } -} - -static void bch_inode_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - struct bkey_s_c_inode inode; - struct bch_inode_unpacked unpacked; - - switch (k.k->type) { - case BCH_INODE_FS: - inode = bkey_s_c_to_inode(k); - if (bch_inode_unpack(inode, &unpacked)) { - scnprintf(buf, size, "(unpack error)"); - break; - } - - scnprintf(buf, size, "i_size %llu", unpacked.i_size); - break; - } -} - -const struct bkey_ops bch_bkey_inode_ops = { - .key_invalid = bch_inode_invalid, - .val_to_text = bch_inode_to_text, -}; - -void bch_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev) -{ - s64 now = timespec_to_bch_time(c, CURRENT_TIME); - - memset(inode_u, 0, sizeof(*inode_u)); - - /* ick */ - inode_u->i_flags |= c->sb.str_hash_type << INODE_STR_HASH_OFFSET; - get_random_bytes(&inode_u->i_hash_seed, sizeof(inode_u->i_hash_seed)); - - inode_u->i_mode = mode; - inode_u->i_uid = uid; - inode_u->i_gid = gid; - inode_u->i_dev = rdev; - inode_u->i_atime = now; - inode_u->i_mtime = now; - inode_u->i_ctime = now; - inode_u->i_otime = now; -} - -int bch_inode_create(struct bch_fs *c, struct bkey_i *inode, - u64 min, u64 max, u64 *hint) -{ - struct btree_iter iter; - bool searched_from_start = false; - int ret; - - if (!max) - max = ULLONG_MAX; - - if (c->opts.inodes_32bit) - max = min_t(u64, max, U32_MAX); - - if (*hint >= max || *hint < min) - *hint = min; - - if (*hint == min) - searched_from_start = true; -again: - bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0)); - - while (1) { - struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter); - - ret = btree_iter_err(k); - if (ret) { - bch_btree_iter_unlock(&iter); - return ret; - } - - if (k.k->type < BCH_INODE_FS) { - inode->k.p = k.k->p; - - pr_debug("inserting inode %llu (size %u)", - inode->k.p.inode, inode->k.u64s); - - ret = bch_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, inode)); - - if (ret == -EINTR) - continue; - - bch_btree_iter_unlock(&iter); - if (!ret) - *hint = k.k->p.inode + 1; - - return ret; - } else { - if (iter.pos.inode == max) - break; - /* slot used */ - bch_btree_iter_advance_pos(&iter); - } - } - bch_btree_iter_unlock(&iter); - - if (!searched_from_start) { - /* Retry from start */ - *hint = min; - searched_from_start = true; - goto again; - } - - return -ENOSPC; -} - -int bch_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, - struct extent_insert_hook *hook, u64 *journal_seq) -{ - return bch_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0), - 
ZERO_VERSION, NULL, hook, journal_seq); -} - -int bch_inode_rm(struct bch_fs *c, u64 inode_nr) -{ - struct bkey_i delete; - int ret; - - ret = bch_inode_truncate(c, inode_nr, 0, NULL, NULL); - if (ret < 0) - return ret; - - ret = bch_btree_delete_range(c, BTREE_ID_XATTRS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); - if (ret < 0) - return ret; - - /* - * If this was a directory, there shouldn't be any real dirents left - - * but there could be whiteouts (from hash collisions) that we should - * delete: - * - * XXX: the dirent could ideally would delete whitouts when they're no - * longer needed - */ - ret = bch_btree_delete_range(c, BTREE_ID_DIRENTS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); - if (ret < 0) - return ret; - - bkey_init(&delete.k); - delete.k.p.inode = inode_nr; - - return bch_btree_insert(c, BTREE_ID_INODES, &delete, NULL, - NULL, NULL, BTREE_INSERT_NOFAIL); -} - -int bch_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = -ENOENT; - - for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES, - POS(inode_nr, 0), k) { - switch (k.k->type) { - case BCH_INODE_FS: - ret = bch_inode_unpack(bkey_s_c_to_inode(k), inode); - break; - default: - /* hole, not found */ - break; - } - - break; - - } - - return bch_btree_iter_unlock(&iter) ?: ret; -} - -int bch_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid, - struct bkey_i_inode_blockdev *ret) -{ - struct btree_iter iter; - struct bkey_s_c k; - - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), k) { - if (k.k->p.inode >= BLOCKDEV_INODE_MAX) - break; - - if (k.k->type == BCH_INODE_BLOCKDEV) { - struct bkey_s_c_inode_blockdev inode = - bkey_s_c_to_inode_blockdev(k); - - pr_debug("found inode %llu: %pU (u64s %u)", - inode.k->p.inode, inode.v->i_uuid.b, - inode.k->u64s); - - if (CACHED_DEV(inode.v) && - !memcmp(uuid, &inode.v->i_uuid, 16)) { - bkey_reassemble(&ret->k_i, k); - bch_btree_iter_unlock(&iter); - return 0; - } - } - - bch_btree_iter_cond_resched(&iter); - } - bch_btree_iter_unlock(&iter); - return -ENOENT; -} diff --git a/libbcache/inode.h b/libbcache/inode.h deleted file mode 100644 index 41e344d5..00000000 --- a/libbcache/inode.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef _BCACHE_INODE_H -#define _BCACHE_INODE_H - -#include <linux/math64.h> - -extern const struct bkey_ops bch_bkey_inode_ops; - -struct bch_inode_unpacked { - u64 inum; - __le64 i_hash_seed; - u32 i_flags; - u16 i_mode; - -#define BCH_INODE_FIELD(_name, _bits) u##_bits _name; - BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD -}; - -struct bkey_inode_buf { - struct bkey_i_inode inode; - -#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8 - u8 _pad[0 + BCH_INODE_FIELDS()]; -#undef BCH_INODE_FIELD -} __packed; - -void bch_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -int bch_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); - -void bch_inode_init(struct bch_fs *, struct bch_inode_unpacked *, - uid_t, gid_t, umode_t, dev_t); -int bch_inode_create(struct bch_fs *, struct bkey_i *, u64, u64, u64 *); -int bch_inode_truncate(struct bch_fs *, u64, u64, - struct extent_insert_hook *, u64 *); -int bch_inode_rm(struct bch_fs *, u64); - -int bch_inode_find_by_inum(struct bch_fs *, u64, - struct bch_inode_unpacked *); -int bch_cached_dev_inode_find_by_uuid(struct bch_fs *, uuid_le *, - struct bkey_i_inode_blockdev *); - -static inline struct timespec 
bch_time_to_timespec(struct bch_fs *c, u64 time) -{ - return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo); -} - -static inline u64 timespec_to_bch_time(struct bch_fs *c, struct timespec ts) -{ - s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo; - - if (c->sb.time_precision == 1) - return ns; - - return div_s64(ns, c->sb.time_precision); -} - -#endif diff --git a/libbcache/io.c b/libbcache/io.c deleted file mode 100644 index 753c8a3d..00000000 --- a/libbcache/io.c +++ /dev/null @@ -1,1435 +0,0 @@ -/* - * Some low level IO code, and hacks for various block layer limitations - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include "bcache.h" -#include "alloc.h" -#include "bset.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "compress.h" -#include "clock.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "notify.h" -#include "stats.h" -#include "super-io.h" - -#include <linux/blkdev.h> -#include <linux/random.h> - -#include <trace/events/bcache.h> - -static inline void __bio_inc_remaining(struct bio *bio) -{ - bio_set_flag(bio, BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - -void bch_generic_make_request(struct bio *bio, struct bch_fs *c) -{ - if (current->bio_list) { - spin_lock(&c->bio_submit_lock); - bio_list_add(&c->bio_submit_list, bio); - spin_unlock(&c->bio_submit_lock); - queue_work(bcache_io_wq, &c->bio_submit_work); - } else { - generic_make_request(bio); - } -} - -void bch_bio_submit_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - bio_submit_work); - struct bio_list bl; - struct bio *bio; - - spin_lock(&c->bio_submit_lock); - bl = c->bio_submit_list; - bio_list_init(&c->bio_submit_list); - spin_unlock(&c->bio_submit_lock); - - while ((bio = bio_list_pop(&bl))) - generic_make_request(bio); -} - -/* Allocate, free from mempool: */ - -void bch_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - - bio_for_each_segment_all(bv, bio, i) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); - bio->bi_vcnt = 0; -} - -static void bch_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, - bool *using_mempool) -{ - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++]; - - if (likely(!*using_mempool)) { - bv->bv_page = alloc_page(GFP_NOIO); - if (unlikely(!bv->bv_page)) { - mutex_lock(&c->bio_bounce_pages_lock); - *using_mempool = true; - goto pool_alloc; - - } - } else { -pool_alloc: - bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); - } - - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; -} - -void bch_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) -{ - bool using_mempool = false; - - bio->bi_iter.bi_size = bytes; - - while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) - bch_bio_alloc_page_pool(c, bio, &using_mempool); - - if (using_mempool) - mutex_unlock(&c->bio_bounce_pages_lock); -} - -/* Bios with headers */ - -static void bch_submit_wbio(struct bch_fs *c, struct bch_write_bio *wbio, - struct bch_dev *ca, const struct bch_extent_ptr *ptr, - bool punt) -{ - wbio->ca = ca; - wbio->submit_time_us = local_clock_us(); - wbio->bio.bi_iter.bi_sector = ptr->offset; - wbio->bio.bi_bdev = ca ? 
ca->disk_sb.bdev : NULL; - - if (!ca) - bcache_io_error(c, &wbio->bio, "device has been removed"); - else if (punt) - bch_generic_make_request(&wbio->bio, c); - else - generic_make_request(&wbio->bio); -} - -void bch_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, - const struct bkey_i *k, bool punt) -{ - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_write_bio *n; - struct bch_dev *ca; - - BUG_ON(c->opts.nochanges); - - wbio->split = false; - wbio->c = c; - - extent_for_each_ptr(e, ptr) { - ca = c->devs[ptr->dev]; - if (!percpu_ref_tryget(&ca->io_ref)) { - bch_submit_wbio(c, wbio, NULL, ptr, punt); - break; - } - - if (ptr + 1 < &extent_entry_last(e)->ptr) { - n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, - &ca->replica_set)); - - n->bio.bi_end_io = wbio->bio.bi_end_io; - n->bio.bi_private = wbio->bio.bi_private; - n->c = c; - n->orig = &wbio->bio; - n->bounce = false; - n->split = true; - n->put_bio = true; - n->bio.bi_opf = wbio->bio.bi_opf; - __bio_inc_remaining(n->orig); - } else { - n = wbio; - } - - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - - bch_submit_wbio(c, n, ca, ptr, punt); - } -} - -/* IO errors */ - -/* Writes */ - -static struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->alloc_reserve == RESERVE_MOVINGGC - ? op->c->copygc_wq - : op->c->wq; -} - -static void __bch_write(struct closure *); - -static void bch_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - - BUG_ON(!(op->flags & BCH_WRITE_DONE)); - - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch_journal_error(&op->c->journal); - - bch_disk_reservation_put(op->c, &op->res); - percpu_ref_put(&op->c->writes); - bch_keylist_free(&op->insert_keys, op->inline_keys); - closure_return(cl); -} - -static u64 keylist_sectors(struct keylist *keys) -{ - struct bkey_i *k; - u64 ret = 0; - - for_each_keylist_key(keys, k) - ret += k->k.size; - - return ret; -} - -static int bch_write_index_default(struct bch_write_op *op) -{ - struct keylist *keys = &op->insert_keys; - struct btree_iter iter; - int ret; - - bch_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch_keylist_front(keys)->k)); - - ret = bch_btree_insert_list_at(&iter, keys, &op->res, - NULL, op_journal_seq(op), - BTREE_INSERT_NOFAIL); - bch_btree_iter_unlock(&iter); - - return ret; -} - -/** - * bch_write_index - after a write, update index to point to new data - */ -static void bch_write_index(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - unsigned i; - - op->flags |= BCH_WRITE_LOOPED; - - if (!bch_keylist_empty(keys)) { - u64 sectors_start = keylist_sectors(keys); - int ret = op->index_update_fn(op); - - BUG_ON(keylist_sectors(keys) && !ret); - - op->written += sectors_start - keylist_sectors(keys); - - if (ret) { - __bcache_io_error(c, "btree IO error %i", ret); - op->error = ret; - } - } - - for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++) - if (op->open_buckets[i]) { - bch_open_bucket_put(c, - c->open_buckets + - op->open_buckets[i]); - op->open_buckets[i] = 0; - } - - if (!(op->flags & BCH_WRITE_DONE)) - continue_at(cl, __bch_write, op->io_wq); - - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { - bch_journal_flush_seq_async(&c->journal, - *op_journal_seq(op), - cl); - continue_at(cl, bch_write_done, index_update_wq(op)); - 
} else { - continue_at_nobarrier(cl, bch_write_done, NULL); - } -} - -/** - * bch_write_discard - discard range of keys - * - * Used to implement discard, and to handle when writethrough write hits - * a write error on the cache device. - */ -static void bch_write_discard(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->bio->bio; - struct bpos end = op->pos; - - end.offset += bio_sectors(bio); - - op->error = bch_discard(op->c, op->pos, end, op->version, - &op->res, NULL, NULL); -} - -/* - * Convert extents to be inserted to discards after an error: - */ -static void bch_write_io_error(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - - if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { - struct bkey_i *src = bch_keylist_front(&op->insert_keys); - struct bkey_i *dst = bch_keylist_front(&op->insert_keys); - - /* - * Our data write just errored, which means we've got a bunch - * of keys to insert that point to data that wasn't - * successfully written. - * - * We don't have to insert those keys but we still have to - * invalidate that region of the cache - so, if we just strip - * off all the pointers from the keys we'll accomplish just - * that. - */ - - while (src != op->insert_keys.top) { - struct bkey_i *n = bkey_next(src); - - set_bkey_val_u64s(&src->k, 0); - src->k.type = KEY_TYPE_DISCARD; - bkey_copy(dst, src); - - dst = bkey_next(dst); - src = n; - } - - op->insert_keys.top = dst; - op->flags |= BCH_WRITE_DISCARD; - } else { - /* TODO: We could try to recover from this. */ - while (!bch_keylist_empty(&op->insert_keys)) - bch_keylist_pop_front(&op->insert_keys); - - op->error = -EIO; - op->flags |= BCH_WRITE_DONE; - } - - bch_write_index(cl); -} - -static void bch_write_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_fs *c = wbio->c; - struct bio *orig = wbio->orig; - struct bch_dev *ca = wbio->ca; - - if (bch_dev_nonfatal_io_err_on(bio->bi_error, ca, - "data write")) { - set_closure_fn(cl, bch_write_io_error, index_update_wq(op)); - } - - bch_account_io_completion_time(ca, wbio->submit_time_us, - REQ_OP_WRITE); - if (ca) - percpu_ref_put(&ca->io_ref); - - if (bio->bi_error && orig) - orig->bi_error = bio->bi_error; - - if (wbio->bounce) - bch_bio_free_pages_pool(c, bio); - - if (wbio->put_bio) - bio_put(bio); - - if (orig) - bio_endio(orig); - else - closure_put(cl); -} - -static struct nonce extent_nonce(struct bversion version, - unsigned nonce, - unsigned uncompressed_size, - unsigned compression_type) -{ - return (struct nonce) {{ - [0] = cpu_to_le32((nonce << 12) | - (uncompressed_size << 22)), - [1] = cpu_to_le32(version.lo), - [2] = cpu_to_le32(version.lo >> 32), - [3] = cpu_to_le32(version.hi| - (compression_type << 24))^BCH_NONCE_EXTENT, - }}; -} - -static void init_append_extent(struct bch_write_op *op, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type, - struct open_bucket *ob) -{ - struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); - - op->pos.offset += uncompressed_size; - e->k.p = op->pos; - e->k.size = uncompressed_size; - e->k.version = op->version; - bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); - - bch_extent_crc_append(e, compressed_size, - uncompressed_size, - compression_type, - nonce, 
csum, csum_type); - - bch_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas, - ob, compressed_size); - - bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED)); - bch_keylist_push(&op->insert_keys); -} - -static int bch_write_extent(struct bch_write_op *op, - struct open_bucket *ob, - struct bio *orig) -{ - struct bch_fs *c = op->c; - struct bio *bio; - struct bch_write_bio *wbio; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - struct bkey_i *key_to_write; - unsigned csum_type = op->csum_type; - unsigned compression_type = op->compression_type; - int ret; - - /* don't refetch csum type/compression type */ - barrier(); - - /* Need to decompress data? */ - if ((op->flags & BCH_WRITE_DATA_COMPRESSED) && - (crc_uncompressed_size(NULL, &op->crc) != op->size || - crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) { - int ret; - - ret = bch_bio_uncompress_inplace(c, orig, op->size, op->crc); - if (ret) - return ret; - - op->flags &= ~BCH_WRITE_DATA_COMPRESSED; - } - - if (op->flags & BCH_WRITE_DATA_COMPRESSED) { - init_append_extent(op, - crc_compressed_size(NULL, &op->crc), - crc_uncompressed_size(NULL, &op->crc), - op->crc.compression_type, - op->crc.nonce, - op->crc.csum, - op->crc.csum_type, - ob); - - bio = orig; - wbio = to_wbio(bio); - wbio->orig = NULL; - wbio->bounce = false; - wbio->put_bio = false; - ret = 0; - } else if (csum_type != BCH_CSUM_NONE || - compression_type != BCH_COMPRESSION_NONE) { - /* all units here in bytes */ - unsigned total_output = 0, output_available = - min(ob->sectors_free << 9, orig->bi_iter.bi_size); - unsigned crc_nonce = bch_csum_type_is_encryption(csum_type) - ? op->nonce : 0; - struct bch_csum csum; - struct nonce nonce; - - bio = bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(output_available, PAGE_SIZE), - &c->bio_write); - /* - * XXX: can't use mempool for more than - * BCH_COMPRESSED_EXTENT_MAX worth of pages - */ - bch_bio_alloc_pages_pool(c, bio, output_available); - - /* copy WRITE_SYNC flag */ - bio->bi_opf = orig->bi_opf; - wbio = to_wbio(bio); - wbio->orig = NULL; - wbio->bounce = true; - wbio->put_bio = true; - - do { - unsigned fragment_compression_type = compression_type; - size_t dst_len, src_len; - - bch_bio_compress(c, bio, &dst_len, - orig, &src_len, - &fragment_compression_type); - - BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size); - BUG_ON(!src_len || src_len > orig->bi_iter.bi_size); - BUG_ON(dst_len & (block_bytes(c) - 1)); - BUG_ON(src_len & (block_bytes(c) - 1)); - - swap(bio->bi_iter.bi_size, dst_len); - nonce = extent_nonce(op->version, - crc_nonce, - src_len >> 9, - compression_type), - - bch_encrypt_bio(c, csum_type, nonce, bio); - - csum = bch_checksum_bio(c, csum_type, nonce, bio); - swap(bio->bi_iter.bi_size, dst_len); - - init_append_extent(op, - dst_len >> 9, src_len >> 9, - fragment_compression_type, - crc_nonce, csum, csum_type, ob); - - total_output += dst_len; - bio_advance(bio, dst_len); - bio_advance(orig, src_len); - } while (bio->bi_iter.bi_size && - orig->bi_iter.bi_size && - !bch_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); - - BUG_ON(total_output > output_available); - - memset(&bio->bi_iter, 0, sizeof(bio->bi_iter)); - bio->bi_iter.bi_size = total_output; - - /* - * Free unneeded pages after compressing: - */ - while (bio->bi_vcnt * PAGE_SIZE > - round_up(bio->bi_iter.bi_size, PAGE_SIZE)) - mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page, - &c->bio_bounce_pages); - - ret = orig->bi_iter.bi_size 
!= 0; - } else { - bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO, - &c->bio_write); - - wbio = to_wbio(bio); - wbio->orig = NULL; - wbio->bounce = false; - wbio->put_bio = bio != orig; - - init_append_extent(op, bio_sectors(bio), bio_sectors(bio), - compression_type, 0, - (struct bch_csum) { 0 }, csum_type, ob); - - ret = bio != orig; - } - - bio->bi_end_io = bch_write_endio; - bio->bi_private = &op->cl; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - closure_get(bio->bi_private); - - /* might have done a realloc... */ - - key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - - bch_check_mark_super(c, key_to_write, false); - -#ifndef CONFIG_BCACHE_NO_IO - bch_submit_wbio_replicas(to_wbio(bio), c, key_to_write, false); -#else - to_wbio(bio)->ca = NULL; - bio_endio(bio); -#endif - return ret; -} - -static void __bch_write(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - struct bio *bio = &op->bio->bio; - unsigned open_bucket_nr = 0; - struct open_bucket *b; - int ret; - - memset(op->open_buckets, 0, sizeof(op->open_buckets)); - - if (op->flags & BCH_WRITE_DISCARD) { - op->flags |= BCH_WRITE_DONE; - bch_write_discard(cl); - bio_put(bio); - continue_at(cl, bch_write_done, index_update_wq(op)); - } - - /* - * Journal writes are marked REQ_PREFLUSH; if the original write was a - * flush, it'll wait on the journal write. - */ - bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); - - do { - EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset); - EBUG_ON(!bio_sectors(bio)); - - if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) - continue_at(cl, bch_write_index, index_update_wq(op)); - - /* for the device pointers and 1 for the chksum */ - if (bch_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)) - continue_at(cl, bch_write_index, index_update_wq(op)); - - b = bch_alloc_sectors_start(c, op->wp, - op->nr_replicas, - c->opts.data_replicas_required, - op->alloc_reserve, - (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); - EBUG_ON(!b); - - if (unlikely(IS_ERR(b))) { - if (unlikely(PTR_ERR(b) != -EAGAIN)) { - ret = PTR_ERR(b); - goto err; - } - - /* - * If we already have some keys, must insert them first - * before allocating another open bucket. We only hit - * this case if open_bucket_nr > 1. - */ - if (!bch_keylist_empty(&op->insert_keys)) - continue_at(cl, bch_write_index, - index_update_wq(op)); - - /* - * If we've looped, we're running out of a workqueue - - * not the bch_write() caller's context - and we don't - * want to block the workqueue: - */ - if (op->flags & BCH_WRITE_LOOPED) - continue_at(cl, __bch_write, op->io_wq); - - /* - * Otherwise, we do want to block the caller on alloc - * failure instead of letting it queue up more and more - * writes: - * XXX: this technically needs a try_to_freeze() - - * except that that's not safe because caller may have - * issued other IO... hmm.. 
- */ - closure_sync(cl); - continue; - } - - BUG_ON(b - c->open_buckets == 0 || - b - c->open_buckets > U8_MAX); - op->open_buckets[open_bucket_nr++] = b - c->open_buckets; - - ret = bch_write_extent(op, b, bio); - - bch_alloc_sectors_done(c, op->wp, b); - - if (ret < 0) - goto err; - } while (ret); - - op->flags |= BCH_WRITE_DONE; - continue_at(cl, bch_write_index, index_update_wq(op)); -err: - if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { - /* - * If we were writing cached data, not doing the write is fine - * so long as we discard whatever would have been overwritten - - * then it's equivalent to doing the write and immediately - * reclaiming it. - */ - - bch_write_discard(cl); - } else { - /* - * Right now we can only error here if we went RO - the - * allocation failed, but we already checked for -ENOSPC when we - * got our reservation. - * - * XXX capacity might have changed, but we don't check for that - * yet: - */ - op->error = ret; - } - - op->flags |= BCH_WRITE_DONE; - - /* - * No reason not to insert keys for whatever data was successfully - * written (especially for a cmpxchg operation that's moving data - * around) - */ - continue_at(cl, !bch_keylist_empty(&op->insert_keys) - ? bch_write_index - : bch_write_done, index_update_wq(op)); -} - -void bch_wake_delayed_writes(unsigned long data) -{ - struct bch_fs *c = (void *) data; - struct bch_write_op *op; - unsigned long flags; - - spin_lock_irqsave(&c->foreground_write_pd_lock, flags); - - while ((op = c->write_wait_head)) { - if (time_after(op->expires, jiffies)) { - mod_timer(&c->foreground_write_wakeup, op->expires); - break; - } - - c->write_wait_head = op->next; - if (!c->write_wait_head) - c->write_wait_tail = NULL; - - closure_put(&op->cl); - } - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags); -} - -/** - * bch_write - handle a write to a cache device or flash only volume - * - * This is the starting point for any data to end up in a cache device; it could - * be from a normal write, or a writeback write, or a write to a flash only - * volume - it's also used by the moving garbage collector to compact data in - * mostly empty buckets. - * - * It first writes the data to the cache, creating a list of keys to be inserted - * (if the data won't fit in a single open bucket, there will be multiple keys); - * after the data is written it calls bch_journal, and after the keys have been - * added to the next journal write they're inserted into the btree. - * - * It inserts the data in op->bio; bi_sector is used for the key offset, and - * op->inode is used for the key inode. - * - * If op->discard is true, instead of inserting the data it invalidates the - * region of the cache represented by op->bio and op->inode. 
- */ -void bch_write(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->bio->bio; - struct bch_fs *c = op->c; - u64 inode = op->pos.inode; - - trace_bcache_write(c, inode, bio, - !(op->flags & BCH_WRITE_CACHED), - op->flags & BCH_WRITE_DISCARD); - - if (c->opts.nochanges || - !percpu_ref_tryget(&c->writes)) { - __bcache_io_error(c, "read only"); - op->error = -EROFS; - bch_disk_reservation_put(c, &op->res); - closure_return(cl); - } - - if (bversion_zero(op->version) && - bch_csum_type_is_encryption(op->csum_type)) - op->version.lo = - atomic64_inc_return(&c->key_version) + 1; - - if (!(op->flags & BCH_WRITE_DISCARD)) - bch_increment_clock(c, bio_sectors(bio), WRITE); - - if (!(op->flags & BCH_WRITE_DISCARD)) - bch_mark_foreground_write(c, bio_sectors(bio)); - else - bch_mark_discard(c, bio_sectors(bio)); - - /* Don't call bch_next_delay() if rate is >= 1 GB/sec */ - - if (c->foreground_write_ratelimit_enabled && - c->foreground_write_pd.rate.rate < (1 << 30) && - !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) { - unsigned long flags; - u64 delay; - - spin_lock_irqsave(&c->foreground_write_pd_lock, flags); - bch_ratelimit_increment(&c->foreground_write_pd.rate, - bio->bi_iter.bi_size); - - delay = bch_ratelimit_delay(&c->foreground_write_pd.rate); - - if (delay >= HZ / 100) { - trace_bcache_write_throttle(c, inode, bio, delay); - - closure_get(&op->cl); /* list takes a ref */ - - op->expires = jiffies + delay; - op->next = NULL; - - if (c->write_wait_tail) - c->write_wait_tail->next = op; - else - c->write_wait_head = op; - c->write_wait_tail = op; - - if (!timer_pending(&c->foreground_write_wakeup)) - mod_timer(&c->foreground_write_wakeup, - op->expires); - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, - flags); - continue_at(cl, __bch_write, index_update_wq(op)); - } - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags); - } - - continue_at_nobarrier(cl, __bch_write, NULL); -} - -void bch_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_write_bio *bio, struct disk_reservation res, - struct write_point *wp, struct bpos pos, - u64 *journal_seq, unsigned flags) -{ - EBUG_ON(res.sectors && !res.nr_replicas); - - op->c = c; - op->io_wq = index_update_wq(op); - op->bio = bio; - op->written = 0; - op->error = 0; - op->flags = flags; - op->csum_type = bch_data_checksum_type(c); - op->compression_type = c->opts.compression; - op->nr_replicas = res.nr_replicas; - op->alloc_reserve = RESERVE_NONE; - op->nonce = 0; - op->pos = pos; - op->version = ZERO_VERSION; - op->res = res; - op->wp = wp; - - if (journal_seq) { - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; - } else { - op->journal_seq = 0; - } - - op->index_update_fn = bch_write_index_default; - - bch_keylist_init(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys)); - - if (version_stress_test(c)) - get_random_bytes(&op->version, sizeof(op->version)); -} - -/* Discard */ - -/* bch_discard - discard a range of keys from start_key to end_key. 
- * @c filesystem - * @start_key pointer to start location - * NOTE: discard starts at bkey_start_offset(start_key) - * @end_key pointer to end location - * NOTE: discard ends at KEY_OFFSET(end_key) - * @version version of discard (0ULL if none) - * - * Returns: - * 0 on success - * <0 on error - * - * XXX: this needs to be refactored with inode_truncate, or more - * appropriately inode_truncate should call this - */ -int bch_discard(struct bch_fs *c, struct bpos start, - struct bpos end, struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) -{ - return bch_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version, - disk_res, hook, journal_seq); -} - -/* Cache promotion on read */ - -struct cache_promote_op { - struct closure cl; - struct migrate_write write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ -}; - -/* Read */ - -static int bio_checksum_uncompress(struct bch_fs *c, - struct bch_read_bio *rbio) -{ - struct bio *src = &rbio->bio; - struct bio *dst = &bch_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->parent_iter; - struct nonce nonce = extent_nonce(rbio->version, - rbio->crc.nonce, - crc_uncompressed_size(NULL, &rbio->crc), - rbio->crc.compression_type); - struct bch_csum csum; - int ret = 0; - - /* - * reset iterator for checksumming and copying bounced data: here we've - * set rbio->compressed_size to the amount of data we actually read, - * which was not necessarily the full extent if we were only bouncing - * in order to promote - */ - if (rbio->bounce) { - src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; - } else { - src->bi_iter = rbio->parent_iter; - } - - csum = bch_checksum_bio(c, rbio->crc.csum_type, nonce, src); - if (bch_dev_nonfatal_io_err_on(bch_crc_cmp(rbio->crc.csum, csum), rbio->ca, - "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", - rbio->inode, (u64) rbio->parent_iter.bi_sector << 9, - rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo, - rbio->crc.csum_type)) - ret = -EIO; - - /* - * If there was a checksum error, still copy the data back - unless it - * was compressed, we don't want to decompress bad data: - */ - if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) { - if (!ret) { - bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src); - ret = bch_bio_uncompress(c, src, dst, - dst_iter, rbio->crc); - if (ret) - __bcache_io_error(c, "decompression error"); - } - } else if (rbio->bounce) { - bio_advance(src, rbio->crc.offset << 9); - - /* don't need to decrypt the entire bio: */ - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - nonce = nonce_add(nonce, rbio->crc.offset << 9); - - bch_encrypt_bio(c, rbio->crc.csum_type, - nonce, src); - - bio_copy_data_iter(dst, dst_iter, - src, src->bi_iter); - } else { - bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src); - } - - return ret; -} - -static void bch_rbio_free(struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - struct bio *bio = &rbio->bio; - - BUG_ON(rbio->ca); - BUG_ON(!rbio->split); - - if (rbio->promote) - kfree(rbio->promote); - if (rbio->bounce) - bch_bio_free_pages_pool(c, bio); - - bio_put(bio); -} - -static void bch_rbio_done(struct bch_read_bio *rbio) -{ - struct bio *orig = &bch_rbio_parent(rbio)->bio; - - percpu_ref_put(&rbio->ca->io_ref); - rbio->ca = NULL; - - if (rbio->split) { - if (rbio->bio.bi_error) - orig->bi_error = 
rbio->bio.bi_error; - - bio_endio(orig); - bch_rbio_free(rbio); - } else { - if (rbio->promote) - kfree(rbio->promote); - - orig->bi_end_io = rbio->orig_bi_end_io; - bio_endio_nodec(orig); - } -} - -static void bch_rbio_error(struct bch_read_bio *rbio, int error) -{ - bch_rbio_parent(rbio)->bio.bi_error = error; - bch_rbio_done(rbio); -} - -static void bch_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio) -{ - unsigned long flags; - - percpu_ref_put(&rbio->ca->io_ref); - rbio->ca = NULL; - - spin_lock_irqsave(&c->read_retry_lock, flags); - bio_list_add(&c->read_retry_list, &rbio->bio); - spin_unlock_irqrestore(&c->read_retry_lock, flags); - queue_work(c->wq, &c->read_retry_work); -} - -static void cache_promote_done(struct closure *cl) -{ - struct cache_promote_op *op = - container_of(cl, struct cache_promote_op, cl); - - bch_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio); - kfree(op); -} - -/* Inner part that may run in process context */ -static void __bch_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - int ret; - - ret = bio_checksum_uncompress(c, rbio); - if (ret) { - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_FORCE_BOUNCE; - bch_rbio_retry(c, rbio); - } else { - bch_rbio_error(rbio, -EIO); - } - return; - } - - if (rbio->promote) { - struct cache_promote_op *promote = rbio->promote; - struct closure *cl = &promote->cl; - - BUG_ON(!rbio->split || !rbio->bounce); - - /* we now own pages: */ - swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt); - rbio->promote = NULL; - - bch_rbio_done(rbio); - - closure_init(cl, &c->cl); - closure_call(&promote->write.op.cl, bch_write, c->wq, cl); - closure_return_with_destructor(cl, cache_promote_done); - } else { - bch_rbio_done(rbio); - } -} - -static void bch_read_endio(struct bio *bio) -{ - struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - - if (rbio->flags & BCH_READ_ACCOUNT_TIMES) - bch_account_io_completion_time(rbio->ca, rbio->submit_time_us, - REQ_OP_READ); - - if (bch_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) { - /* XXX: retry IO errors when we have another replica */ - bch_rbio_error(rbio, bio->bi_error); - return; - } - - if (rbio->ptr.cached && - (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->ca, &rbio->ptr))) { - atomic_long_inc(&c->cache_read_races); - - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch_rbio_retry(c, rbio); - else - bch_rbio_error(rbio, -EINTR); - return; - } - - if (rbio->crc.compression_type || - bch_csum_type_is_encryption(rbio->crc.csum_type)) - queue_work(system_unbound_wq, &rbio->work); - else if (rbio->crc.csum_type) - queue_work(system_highpri_wq, &rbio->work); - else - __bch_read_endio(&rbio->work); -} - -static bool should_promote(struct bch_fs *c, - struct extent_pick_ptr *pick, unsigned flags) -{ - if (!(flags & BCH_READ_PROMOTE)) - return false; - - if (percpu_ref_is_dying(&c->writes)) - return false; - - return c->fastest_tier && - c->fastest_tier < c->tiers + pick->ca->mi.tier; -} - -void bch_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, - struct extent_pick_ptr *pick, unsigned 
flags) -{ - struct bch_read_bio *rbio; - struct cache_promote_op *promote_op = NULL; - unsigned skip = iter.bi_sector - bkey_start_offset(k.k); - bool bounce = false, split, read_full = false; - - EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || - k.k->p.offset < bvec_iter_end_sector(iter)); - - /* only promote if we're not reading from the fastest tier: */ - - /* - * XXX: multiple promotes can race with each other, wastefully. Keep a - * list of outstanding promotes? - */ - if (should_promote(c, pick, flags)) { - /* - * biovec needs to be big enough to hold decompressed data, if - * the bch_write_extent() has to decompress/recompress it: - */ - unsigned sectors = - max_t(unsigned, k.k->size, - crc_uncompressed_size(NULL, &pick->crc)); - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - - promote_op = kmalloc(sizeof(*promote_op) + - sizeof(struct bio_vec) * pages, GFP_NOIO); - if (promote_op) { - struct bio *promote_bio = &promote_op->write.wbio.bio; - - bio_init(promote_bio); - promote_bio->bi_max_vecs = pages; - promote_bio->bi_io_vec = promote_bio->bi_inline_vecs; - bounce = true; - /* could also set read_full */ - } - } - - /* - * note: if compression_type and crc_type both == none, then - * compressed/uncompressed size is zero - */ - if (pick->crc.compression_type != BCH_COMPRESSION_NONE || - (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) || - (bch_csum_type_is_encryption(pick->crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_FORCE_BOUNCE)))) { - read_full = true; - bounce = true; - } - - if (bounce) { - unsigned sectors = read_full - ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size) - : bvec_iter_sectors(iter); - - rbio = container_of(bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split), - struct bch_read_bio, bio); - - bch_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - split = true; - } else if (!(flags & BCH_READ_MAY_REUSE_BIO) || - !(flags & BCH_READ_IS_LAST)) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = container_of(bio_clone_fast(&orig->bio, - GFP_NOIO, &c->bio_read_split), - struct bch_read_bio, bio); - rbio->bio.bi_iter = iter; - split = true; - } else { - rbio = orig; - rbio->bio.bi_iter = iter; - split = false; - BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - if (!(flags & BCH_READ_IS_LAST)) - __bio_inc_remaining(&orig->bio); - - if (split) - rbio->parent = orig; - else - rbio->orig_bi_end_io = orig->bio.bi_end_io; - rbio->parent_iter = iter; - - rbio->flags = flags; - rbio->bounce = bounce; - rbio->split = split; - rbio->c = c; - rbio->ca = pick->ca; - rbio->ptr = pick->ptr; - rbio->crc = pick->crc; - /* - * crc.compressed_size will be 0 if there wasn't any checksum - * information, also we need to stash the original size of the bio if we - * bounced (which isn't necessarily the original key size, if we bounced - * only for promoting) - */ - rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1; - rbio->version = k.k->version; - rbio->promote = promote_op; - rbio->inode = k.k->p.inode; - INIT_WORK(&rbio->work, __bch_read_endio); - - rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; - rbio->bio.bi_opf = orig->bio.bi_opf; - 
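A minimal sketch (not from the deleted file; the helper name and simplified parameters are illustrative) restating the bounce/read-full decision taken a few lines above, for readers following the read path:

/*
 * Illustrative only: mirrors the conditions bch_read_extent_iter() uses
 * above to decide whether a read must be bounced into a private buffer
 * and performed on the full extent rather than straight into the
 * caller's bio.
 */
static bool read_needs_bounce_full(const struct extent_pick_ptr *pick,
				   struct bvec_iter iter, unsigned flags)
{
	/* compressed extents are always read whole, then decompressed */
	if (pick->crc.compression_type != BCH_COMPRESSION_NONE)
		return true;

	/* unchecksummed, uncompressed data can go directly to the caller */
	if (pick->crc.csum_type == BCH_CSUM_NONE)
		return false;

	/* a partial read of a checksummed extent must checksum all of it */
	if (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc))
		return true;

	/* never decrypt directly into user-mapped pages */
	if (bch_csum_type_is_encryption(pick->crc.csum_type) &&
	    (flags & BCH_READ_USER_MAPPED))
		return true;

	return (flags & BCH_READ_FORCE_BOUNCE) != 0;
}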
rbio->bio.bi_iter.bi_sector = pick->ptr.offset; - rbio->bio.bi_end_io = bch_read_endio; - - if (promote_op) { - struct bio *promote_bio = &promote_op->write.wbio.bio; - - promote_bio->bi_iter = rbio->bio.bi_iter; - memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - - bch_migrate_write_init(c, &promote_op->write, - &c->promote_write_point, - k, NULL, - BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_CACHED); - promote_op->write.promote = true; - - if (rbio->crc.compression_type) { - promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED; - promote_op->write.op.crc = rbio->crc; - promote_op->write.op.size = k.k->size; - } else if (read_full) { - /* - * Adjust bio to correspond to _live_ portion of @k - - * which might be less than what we're actually reading: - */ - bio_advance(promote_bio, rbio->crc.offset << 9); - BUG_ON(bio_sectors(promote_bio) < k.k->size); - promote_bio->bi_iter.bi_size = k.k->size << 9; - } else { - /* - * Set insert pos to correspond to what we're actually - * reading: - */ - promote_op->write.op.pos.offset = iter.bi_sector; - } - - promote_bio->bi_iter.bi_sector = - promote_op->write.op.pos.offset; - } - - /* _after_ promete stuff has looked at rbio->crc.offset */ - if (read_full) - rbio->crc.offset += skip; - else - rbio->bio.bi_iter.bi_sector += skip; - - rbio->submit_time_us = local_clock_us(); - -#ifndef CONFIG_BCACHE_NO_IO - generic_make_request(&rbio->bio); -#else - bio_endio(&rbio->bio); -#endif -} - -static void bch_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - unsigned flags) -{ - struct bio *bio = &rbio->bio; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), k) { - BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - unsigned bytes, sectors; - bool is_last; - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - bch_btree_iter_unlock(&iter); - - bch_extent_pick_ptr(c, k, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - bio_endio(bio); - return; - } - - sectors = min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector; - bytes = sectors << 9; - is_last = bytes == bvec_iter.bi_size; - swap(bvec_iter.bi_size, bytes); - - if (is_last) - flags |= BCH_READ_IS_LAST; - - if (pick.ca) { - PTR_BUCKET(pick.ca, &pick.ptr)->read_prio = - c->prio_clock[READ].hand; - - bch_read_extent_iter(c, rbio, bvec_iter, - k, &pick, flags); - - flags &= ~BCH_READ_MAY_REUSE_BIO; - } else { - zero_fill_bio_iter(bio, bvec_iter); - - if (is_last) - bio_endio(bio); - } - - if (is_last) - return; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(bio, &bvec_iter, bytes); - } - - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - ret = bch_btree_iter_unlock(&iter); - BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); - bio_endio(bio); -} - -void bch_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode) -{ - bch_increment_clock(c, bio_sectors(&bio->bio), READ); - - bch_read_iter(c, bio, bio->bio.bi_iter, inode, - BCH_READ_RETRY_IF_STALE| - BCH_READ_PROMOTE| - BCH_READ_MAY_REUSE_BIO| - BCH_READ_USER_MAPPED); -} - -/** - * bch_read_retry - re-submit a bio originally from bch_read() - */ -static void bch_read_retry(struct bch_fs *c, struct 
bch_read_bio *rbio) -{ - struct bch_read_bio *parent = bch_rbio_parent(rbio); - struct bvec_iter iter = rbio->parent_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->inode; - - trace_bcache_read_retry(&rbio->bio); - - if (rbio->split) - bch_rbio_free(rbio); - else - rbio->bio.bi_end_io = rbio->orig_bi_end_io; - - bch_read_iter(c, parent, iter, inode, flags); -} - -void bch_read_retry_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - read_retry_work); - struct bch_read_bio *rbio; - struct bio *bio; - unsigned long flags; - - while (1) { - spin_lock_irqsave(&c->read_retry_lock, flags); - bio = bio_list_pop(&c->read_retry_list); - spin_unlock_irqrestore(&c->read_retry_lock, flags); - - if (!bio) - break; - - rbio = container_of(bio, struct bch_read_bio, bio); - bch_read_retry(c, rbio); - } -} diff --git a/libbcache/io.h b/libbcache/io.h deleted file mode 100644 index 9239ca4a..00000000 --- a/libbcache/io.h +++ /dev/null @@ -1,90 +0,0 @@ -#ifndef _BCACHE_IO_H -#define _BCACHE_IO_H - -#include "io_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -#define to_rbio(_bio) \ - container_of((_bio), struct bch_read_bio, bio) - -void bch_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -enum bch_write_flags { - BCH_WRITE_ALLOC_NOWAIT = (1 << 0), - BCH_WRITE_DISCARD = (1 << 1), - BCH_WRITE_CACHED = (1 << 2), - BCH_WRITE_FLUSH = (1 << 3), - BCH_WRITE_DISCARD_ON_ERROR = (1 << 4), - BCH_WRITE_DATA_COMPRESSED = (1 << 5), - - /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6), - BCH_WRITE_DONE = (1 << 7), - BCH_WRITE_LOOPED = (1 << 8), -}; - -static inline u64 *op_journal_seq(struct bch_write_op *op) -{ - return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) - ? 
op->journal_seq_p : &op->journal_seq; -} - -static inline struct write_point *foreground_write_point(struct bch_fs *c, - unsigned long v) -{ - return c->write_points + - hash_long(v, ilog2(ARRAY_SIZE(c->write_points))); -} - -void bch_write_op_init(struct bch_write_op *, struct bch_fs *, - struct bch_write_bio *, - struct disk_reservation, struct write_point *, - struct bpos, u64 *, unsigned); -void bch_write(struct closure *); - -struct cache_promote_op; - -struct extent_pick_ptr; - -void bch_read_extent_iter(struct bch_fs *, struct bch_read_bio *, - struct bvec_iter, struct bkey_s_c k, - struct extent_pick_ptr *, unsigned); - -static inline void bch_read_extent(struct bch_fs *c, - struct bch_read_bio *orig, - struct bkey_s_c k, - struct extent_pick_ptr *pick, - unsigned flags) -{ - bch_read_extent_iter(c, orig, orig->bio.bi_iter, - k, pick, flags); -} - -enum bch_read_flags { - BCH_READ_FORCE_BOUNCE = 1 << 0, - BCH_READ_RETRY_IF_STALE = 1 << 1, - BCH_READ_PROMOTE = 1 << 2, - BCH_READ_IS_LAST = 1 << 3, - BCH_READ_MAY_REUSE_BIO = 1 << 4, - BCH_READ_ACCOUNT_TIMES = 1 << 5, - BCH_READ_USER_MAPPED = 1 << 6, -}; - -void bch_read(struct bch_fs *, struct bch_read_bio *, u64); - -void bch_generic_make_request(struct bio *, struct bch_fs *); -void bch_bio_submit_work(struct work_struct *); -void bch_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - const struct bkey_i *, bool); - -int bch_discard(struct bch_fs *, struct bpos, struct bpos, - struct bversion, struct disk_reservation *, - struct extent_insert_hook *, u64 *); - -void bch_read_retry_work(struct work_struct *); -void bch_wake_delayed_writes(unsigned long data); - -#endif /* _BCACHE_IO_H */ diff --git a/libbcache/io_types.h b/libbcache/io_types.h deleted file mode 100644 index ca1b0192..00000000 --- a/libbcache/io_types.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef _BCACHE_IO_TYPES_H -#define _BCACHE_IO_TYPES_H - -#include "btree_types.h" -#include "buckets_types.h" -#include "keylist_types.h" - -#include <linux/llist.h> -#include <linux/workqueue.h> - -struct bch_read_bio { - /* - * Reads will often have to be split, and if the extent being read from - * was checksummed or compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. - * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *orig_bi_end_io; - }; - - /* - * Saved copy of parent->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter parent_iter; - - unsigned submit_time_us; - u16 flags; - u8 bounce:1, - split:1; - - struct bch_fs *c; - struct bch_dev *ca; - struct bch_extent_ptr ptr; - struct bch_extent_crc128 crc; - struct bversion version; - - struct cache_promote_op *promote; - - /* - * If we have to retry the read (IO error, checksum failure, read stale - * data (raced with allocator), we retry the portion of the parent bio - * that failed (i.e. this bio's portion, parent_iter). - * - * But we need to stash the inode somewhere: - */ - u64 inode; - - struct work_struct work; - - struct bio bio; -}; - -static inline struct bch_read_bio * -bch_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? 
rbio->parent : rbio; -} - -struct bch_write_bio { - struct bch_fs *c; - struct bch_dev *ca; - union { - struct bio *orig; - struct closure *cl; - }; - - unsigned submit_time_us; - unsigned split:1, - bounce:1, - put_bio:1; - - /* Only for btree writes: */ - unsigned used_mempool:1; - u8 order; - - struct bio bio; -}; - -struct bch_replace_info { - struct extent_insert_hook hook; - /* How many insertions succeeded */ - unsigned successes; - /* How many insertions failed */ - unsigned failures; - BKEY_PADDED(key); -}; - -struct bch_write_op { - struct closure cl; - struct bch_fs *c; - struct workqueue_struct *io_wq; - struct bch_write_bio *bio; - - unsigned written; /* sectors */ - - short error; - - u16 flags; - unsigned csum_type:4; - unsigned compression_type:4; - unsigned nr_replicas:4; - unsigned alloc_reserve:4; - unsigned nonce:14; - - struct bpos pos; - struct bversion version; - - /* For BCH_WRITE_DATA_COMPRESSED: */ - struct bch_extent_crc128 crc; - unsigned size; - - struct disk_reservation res; - - struct write_point *wp; - - union { - u8 open_buckets[16]; - struct { - struct bch_write_op *next; - unsigned long expires; - }; - }; - - /* - * If caller wants to flush but hasn't passed us a journal_seq ptr, we - * still need to stash the journal_seq somewhere: - */ - union { - u64 *journal_seq_p; - u64 journal_seq; - }; - - int (*index_update_fn)(struct bch_write_op *); - - struct keylist insert_keys; - u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; -}; - -#endif /* _BCACHE_IO_TYPES_H */ diff --git a/libbcache/journal.c b/libbcache/journal.c deleted file mode 100644 index 585d1205..00000000 --- a/libbcache/journal.c +++ /dev/null @@ -1,2835 +0,0 @@ -/* - * bcache journalling code, for btree insertions - * - * Copyright 2012 Google, Inc. - */ - -#include "bcache.h" -#include "alloc.h" -#include "bkey_methods.h" -#include "buckets.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_io.h" -#include "checksum.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" -#include "journal.h" -#include "super-io.h" -#include "vstructs.h" - -#include <trace/events/bcache.h> - -static void journal_write(struct closure *); -static void journal_reclaim_fast(struct journal *); -static void journal_pin_add_entry(struct journal *, - struct journal_entry_pin_list *, - struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - -static inline struct journal_buf *journal_prev_buf(struct journal *j) -{ - return j->buf + !j->reservations.idx; -} - -/* Sequence number of oldest dirty journal entry */ - -static inline u64 last_seq(struct journal *j) -{ - return atomic64_read(&j->seq) - fifo_used(&j->pin) + 1; -} - -static inline u64 journal_pin_seq(struct journal *j, - struct journal_entry_pin_list *pin_list) -{ - return last_seq(j) + fifo_entry_idx(&j->pin, pin_list); -} - -static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - struct jset_entry *entry, unsigned type) -{ - while (entry < vstruct_last(jset)) { - if (JOURNAL_ENTRY_TYPE(entry) == type) - return entry; - - entry = vstruct_next(entry); - } - - return NULL; -} - -#define for_each_jset_entry_type(entry, jset, type) \ - for (entry = (jset)->start; \ - (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = vstruct_next(entry)) - -#define for_each_jset_key(k, _n, entry, jset) \ - for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \ - 
vstruct_for_each_safe(entry, k, _n) - -static inline void bch_journal_add_entry(struct journal_buf *buf, - const void *data, size_t u64s, - unsigned type, enum btree_id id, - unsigned level) -{ - struct jset *jset = buf->data; - - bch_journal_add_entry_at(buf, data, u64s, type, id, level, - le32_to_cpu(jset->u64s)); - le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -} - -static struct jset_entry *bch_journal_find_entry(struct jset *j, unsigned type, - enum btree_id id) -{ - struct jset_entry *entry; - - for_each_jset_entry_type(entry, j, type) - if (entry->btree_id == id) - return entry; - - return NULL; -} - -struct bkey_i *bch_journal_find_btree_root(struct bch_fs *c, struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry = - bch_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id); - - if (!entry) - return NULL; - - k = entry->start; - *level = entry->level; - *level = entry->level; - return k; -} - -static void bch_journal_add_btree_root(struct journal_buf *buf, - enum btree_id id, struct bkey_i *k, - unsigned level) -{ - bch_journal_add_entry(buf, k, k->k.u64s, - JOURNAL_ENTRY_BTREE_ROOT, id, level); -} - -static inline void bch_journal_add_prios(struct journal *j, - struct journal_buf *buf) -{ - /* - * no prio bucket ptrs yet... XXX should change the allocator so this - * can't happen: - */ - if (!buf->nr_prio_buckets) - return; - - bch_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets, - JOURNAL_ENTRY_PRIO_PTRS, 0, 0); -} - -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin) -{ - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; - - closure_init_stack(&cl); - - for (i = 0;; i++) { - struct btree_iter iter; - struct btree *b; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); - - bch_btree_iter_init(&iter, c, n.btree_id, n.pos); - iter.is_extents = false; -redo_peek: - b = bch_btree_iter_peek_node(&iter); - - /* The node might have already been rewritten: */ - - if (b->data->keys.seq == n.seq && - !bkey_cmp(b->key.k.p, n.pos)) { - ret = bch_btree_node_rewrite(&iter, b, &cl); - if (ret) { - bch_btree_iter_unlock(&iter); - closure_sync(&cl); - - if (ret == -EAGAIN || - ret == -EINTR) - goto redo_peek; - - /* -EROFS or perhaps -ENOSPC - bail out: */ - /* XXX warn here */ - return; - } - } - - bch_btree_iter_unlock(&iter); - } - - closure_sync(&cl); - - for (i = 0;; i++) { - struct btree_interior_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? 
If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); - } - - mutex_lock(&j->blacklist_lock); - - bch_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); - - mutex_unlock(&j->blacklist_lock); -} - -static struct journal_seq_blacklist * -journal_seq_blacklist_find(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq == bl->seq) - return bl; - - return NULL; -} - -static struct journal_seq_blacklist * -bch_journal_seq_blacklisted_new(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; - - bl->seq = seq; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; -} - -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be ignored: - */ -int bch_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) -{ - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq, i; - int ret = 0; - - if (!seq) - return 0; - - journal_seq = atomic64_read(&j->seq); - - /* Interier updates aren't journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); - - if (seq <= journal_seq) { - if (list_empty_careful(&j->seq_blacklist)) - return 0; - - mutex_lock(&j->blacklist_lock); - ret = journal_seq_blacklist_find(j, seq) != NULL; - mutex_unlock(&j->blacklist_lock); - return ret; - } - - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - bch_fs_inconsistent_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); - - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - /* - * When we start the journal, bch_journal_start() will skip over @seq: - */ - - mutex_lock(&j->blacklist_lock); - - for (i = journal_seq + 1; i <= seq; i++) { - bl = journal_seq_blacklist_find(j, i) ?: - bch_journal_seq_blacklisted_new(j, i); - - if (!bl) { - ret = -ENOMEM; - goto out; - } - } - - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max(bl->nr_entries * 2, 8UL) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } - - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: - mutex_unlock(&j->blacklist_lock); - return ret; -} - -/* - * Journal replay/recovery: - * - * This code is all driven from 
bch_fs_start(); we first read the journal - * entries, do some other stuff, then we mark all the keys in the journal - * entries (same as garbage collection would), then we replay them - reinserting - * them into the cache in precisely the same order as they appear in the - * journal. - * - * We only journal keys that go in leaf nodes, which simplifies things quite a - * bit. - */ - -struct journal_list { - struct closure cl; - struct mutex lock; - struct list_head *head; - int ret; -}; - -#define JOURNAL_ENTRY_ADD_OK 0 -#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 - -/* - * Given a journal entry we just read, add it to the list of journal entries to - * be replayed: - */ -static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist, - struct jset *j) -{ - struct journal_replay *i, *pos; - struct list_head *where; - size_t bytes = vstruct_bytes(j); - __le64 last_seq; - int ret; - - mutex_lock(&jlist->lock); - - last_seq = !list_empty(jlist->head) - ? list_last_entry(jlist->head, struct journal_replay, - list)->j.last_seq - : 0; - - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; - } - - /* Drop entries we don't need anymore */ - list_for_each_entry_safe(i, pos, jlist->head, list) { - if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) - break; - list_del(&i->list); - kfree(i); - } - - list_for_each_entry_reverse(i, jlist->head, list) { - /* Duplicate? */ - if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - fsck_err_on(bytes != vstruct_bytes(&i->j) || - memcmp(j, &i->j, bytes), c, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - - ret = JOURNAL_ENTRY_ADD_OK; - goto out; - } - - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } - } - - where = jlist->head; -add: - i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) { - ret = -ENOMEM; - goto out; - } - - memcpy(&i->j, j, bytes); - list_add(&i->list, where); - ret = JOURNAL_ENTRY_ADD_OK; -out: -fsck_err: - mutex_unlock(&jlist->lock); - return ret; -} - -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - -static void journal_entry_null_range(void *start, void *end) -{ - struct jset_entry *entry; - - for (entry = start; entry != end; entry = vstruct_next(entry)) { - entry->u64s = 0; - entry->btree_id = 0; - entry->level = 0; - entry->flags = 0; - SET_JOURNAL_ENTRY_TYPE(entry, 0); - } -} - -static int journal_validate_key(struct bch_fs *c, struct jset *j, - struct jset_entry *entry, - struct bkey_i *k, enum bkey_type key_type, - const char *type) -{ - void *next = vstruct_next(entry); - const char *invalid; - char buf[160]; - int ret = 0; - - if (mustfix_fsck_err_on(!k->k.u64s, c, - "invalid %s in journal: k->u64s 0", type)) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (mustfix_fsck_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), c, - "invalid %s in journal: extends past end of journal entry", - type)) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (mustfix_fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in journal: bad format %u", - type, k->k.format)) { - 
le16_add_cpu(&entry->u64s, -k->k.u64s); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN) - bch_bkey_swab(key_type, NULL, bkey_to_packed(k)); - - invalid = bkey_invalid(c, key_type, bkey_i_to_s_c(k)); - if (invalid) { - bch_bkey_val_to_text(c, key_type, buf, sizeof(buf), - bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf); - - le16_add_cpu(&entry->u64s, -k->k.u64s); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } -fsck_err: - return ret; -} - -#define JOURNAL_ENTRY_REREAD 5 -#define JOURNAL_ENTRY_NONE 6 -#define JOURNAL_ENTRY_BAD 7 - -static int journal_entry_validate(struct bch_fs *c, - struct jset *j, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read) -{ - struct jset_entry *entry; - size_t bytes = vstruct_bytes(j); - struct bch_csum csum; - int ret = 0; - - if (le64_to_cpu(j->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) { - bch_err(c, "unknown journal entry version %u", - le32_to_cpu(j->version)); - return BCH_FSCK_UNKNOWN_VERSION; - } - - if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c, - "journal entry too big (%zu bytes), sector %lluu", - bytes, sector)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } - - if (bytes > sectors_read << 9) - return JOURNAL_ENTRY_REREAD; - - if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c, - "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(j), sector)) - return JOURNAL_ENTRY_BAD; - - csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); - if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c, - "journal checksum bad, sector %llu", sector)) { - /* XXX: retry IO, when we start retrying checksum errors */ - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } - - bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, - vstruct_end(j) - (void *) j->encrypted_start); - - if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c, - "invalid journal entry: last_seq > seq")) - j->last_seq = j->seq; - - vstruct_for_each(j, entry) { - struct bkey_i *k; - - if (mustfix_fsck_err_on(vstruct_next(entry) > - vstruct_last(j), c, - "journal entry extents past end of jset")) { - j->u64s = cpu_to_le64((u64 *) entry - j->_data); - break; - } - - switch (JOURNAL_ENTRY_TYPE(entry)) { - case JOURNAL_ENTRY_BTREE_KEYS: - vstruct_for_each(entry, k) { - ret = journal_validate_key(c, j, entry, k, - bkey_type(entry->level, - entry->btree_id), - "key"); - if (ret) - goto fsck_err; - } - break; - - case JOURNAL_ENTRY_BTREE_ROOT: - k = entry->start; - - if (mustfix_fsck_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, c, - "invalid btree root journal entry: wrong number of keys")) { - journal_entry_null_range(entry, - vstruct_next(entry)); - continue; - } - - ret = journal_validate_key(c, j, entry, k, - BKEY_TYPE_BTREE, "btree root"); - if (ret) - goto fsck_err; - break; - - case JOURNAL_ENTRY_PRIO_PTRS: - break; - - case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED: - if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, - vstruct_next(entry)); - } - - break; - default: - mustfix_fsck_err(c, "invalid 
journal entry type %llu", - JOURNAL_ENTRY_TYPE(entry)); - journal_entry_null_range(entry, vstruct_next(entry)); - break; - } - } - -fsck_err: - return ret; -} - -struct journal_read_buf { - void *data; - size_t size; -}; - -static int journal_read_buf_realloc(struct journal_read_buf *b, - size_t new_size) -{ - void *n; - - new_size = roundup_pow_of_two(new_size); - n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size)); - if (!n) - return -ENOMEM; - - free_pages((unsigned long) b->data, get_order(b->size)); - b->data = n; - b->size = new_size; - return 0; -} - -static int journal_read_bucket(struct bch_dev *ca, - struct journal_read_buf *buf, - struct journal_list *jlist, - unsigned bucket, u64 *seq, bool *entries_found) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio; - struct jset *j = NULL; - unsigned sectors, sectors_read = 0; - u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), - end = offset + ca->mi.bucket_size; - bool saw_bad = false; - int ret = 0; - - pr_debug("reading %u", bucket); - - while (offset < end) { - if (!sectors_read) { -reread: sectors_read = min_t(unsigned, - end - offset, buf->size >> 9); - - bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; - bio->bi_iter.bi_sector = offset; - bio->bi_iter.bi_size = sectors_read << 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, buf->data); - - ret = submit_bio_wait(bio); - - if (bch_dev_fatal_io_err_on(ret, ca, - "journal read from sector %llu", - offset) || - bch_meta_read_fault("journal")) - return -EIO; - - j = buf->data; - } - - ret = journal_entry_validate(c, j, offset, - end - offset, sectors_read); - switch (ret) { - case BCH_FSCK_OK: - break; - case JOURNAL_ENTRY_REREAD: - if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(buf, - vstruct_bytes(j)); - if (ret) - return ret; - } - goto reread; - case JOURNAL_ENTRY_NONE: - if (!saw_bad) - return 0; - sectors = c->sb.block_size; - goto next_block; - case JOURNAL_ENTRY_BAD: - saw_bad = true; - sectors = c->sb.block_size; - goto next_block; - default: - return ret; - } - - /* - * This happens sometimes if we don't have discards on - - * when we've partially overwritten a bucket with new - * journal entries. 
We don't need the rest of the - * bucket: - */ - if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; - - ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - - ret = journal_entry_add(c, jlist, j); - switch (ret) { - case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; - break; - case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: - break; - default: - return ret; - } - - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); - - sectors = vstruct_sectors(j, c->block_bits); -next_block: - pr_debug("next"); - offset += sectors; - sectors_read -= sectors; - j = ((void *) j) + (sectors << 9); - } - - return 0; -} - -static void bch_journal_read_device(struct closure *cl) -{ -#define read_bucket(b) \ - ({ \ - bool entries_found = false; \ - ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ - &entries_found); \ - if (ret) \ - goto err; \ - __set_bit(b, bitmap); \ - entries_found; \ - }) - - struct journal_device *ja = - container_of(cl, struct journal_device, read); - struct bch_dev *ca = container_of(ja, struct bch_dev, journal); - struct journal_list *jlist = - container_of(cl->parent, struct journal_list, cl); - struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); - struct journal_read_buf buf = { NULL, 0 }; - - DECLARE_BITMAP(bitmap, ja->nr); - unsigned i, l, r; - u64 seq = 0; - int ret; - - if (!ja->nr) - goto out; - - bitmap_zero(bitmap, ja->nr); - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); - if (ret) - goto err; - - pr_debug("%u journal buckets", ja->nr); - - /* - * If the device supports discard but not secure discard, we can't do - * the fancy fibonacci hash/binary search because the live journal - * entries might not form a contiguous range: - */ - for (i = 0; i < ja->nr; i++) - read_bucket(i); - goto search_done; - - if (!blk_queue_nonrot(q)) - goto linear_scan; - - /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries - */ - for (i = 0; i < ja->nr; i++) { - l = (i * 2654435769U) % ja->nr; - - if (test_bit(l, bitmap)) - break; - - if (read_bucket(l)) - goto bsearch; - } - - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); -linear_scan: - for (l = find_first_zero_bit(bitmap, ja->nr); - l < ja->nr; - l = find_next_zero_bit(bitmap, ja->nr, l + 1)) - if (read_bucket(l)) - goto bsearch; - - /* no journal entries on this device? */ - if (l == ja->nr) - goto out; -bsearch: - /* Binary search */ - r = find_next_bit(bitmap, ja->nr, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); - - while (l + 1 < r) { - unsigned m = (l + r) >> 1; - u64 cur_seq = seq; - - read_bucket(m); - - if (cur_seq != seq) - l = m; - else - r = m; - } - -search_done: - /* - * Find the journal bucket with the highest sequence number: - * - * If there's duplicate journal entries in multiple buckets (which - * definitely isn't supposed to happen, but...) 
- make sure to start - * cur_idx at the last of those buckets, so we don't deadlock trying to - * allocate - */ - seq = 0; - - for (i = 0; i < ja->nr; i++) - if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { - /* - * When journal_next_bucket() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - seq = ja->bucket_seq[i]; - } - - /* - * Set last_idx to indicate the entire journal is full and needs to be - * reclaimed - journal reclaim will immediately reclaim whatever isn't - * pinned when it first runs: - */ - ja->last_idx = (ja->cur_idx + 1) % ja->nr; - - /* - * Read buckets in reverse order until we stop finding more journal - * entries: - */ - for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; - i != ja->cur_idx; - i = (i + ja->nr - 1) % ja->nr) - if (!test_bit(i, bitmap) && - !read_bucket(i)) - break; -out: - free_pages((unsigned long) buf.data, get_order(buf.size)); - percpu_ref_put(&ca->io_ref); - closure_return(cl); -err: - mutex_lock(&jlist->lock); - jlist->ret = ret; - mutex_unlock(&jlist->lock); - goto out; -#undef read_bucket -} - -void bch_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvfree(i); - } -} - -static int journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - struct journal_entry_pin_list *p) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *entry; - struct journal_seq_blacklist *bl; - u64 seq; - - for_each_jset_entry_type(entry, &i->j, - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) { - seq = le64_to_cpu(entry->_data[0]); - - bch_verbose(c, "blacklisting existing journal seq %llu", seq); - - bl = bch_journal_seq_blacklisted_new(j, seq); - if (!bl) - return -ENOMEM; - - journal_pin_add_entry(j, p, &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } - - return 0; -} - -static inline bool journal_has_keys(struct list_head *list) -{ - struct journal_replay *i; - struct jset_entry *entry; - struct bkey_i *k, *_n; - - list_for_each_entry(i, list, list) - for_each_jset_key(k, _n, entry, &i->j) - return true; - - return false; -} - -int bch_journal_read(struct bch_fs *c, struct list_head *list) -{ - struct jset_entry *prio_ptrs; - struct journal_list jlist; - struct journal_replay *i; - struct jset *j; - struct journal_entry_pin_list *p; - struct bch_dev *ca; - u64 cur_seq, end_seq; - unsigned iter; - int ret = 0; - - closure_init_stack(&jlist.cl); - mutex_init(&jlist.lock); - jlist.head = list; - jlist.ret = 0; - - for_each_readable_member(ca, c, iter) { - percpu_ref_get(&ca->io_ref); - closure_call(&ca->journal.read, - bch_journal_read_device, - system_unbound_wq, - &jlist.cl); - } - - closure_sync(&jlist.cl); - - if (jlist.ret) - return jlist.ret; - - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - - fsck_err_on(c->sb.clean && journal_has_keys(list), c, - "filesystem marked clean but journal has keys to replay"); - - j = &list_entry(list->prev, struct journal_replay, list)->j; - - unfixable_fsck_err_on(le64_to_cpu(j->seq) - - le64_to_cpu(j->last_seq) + 1 > - c->journal.pin.size, c, - "too many journal entries open for refcount fifo"); - - c->journal.pin.back = le64_to_cpu(j->seq) - - le64_to_cpu(j->last_seq) + 1; - - atomic64_set(&c->journal.seq, le64_to_cpu(j->seq)); - c->journal.last_seq_ondisk = 
le64_to_cpu(j->last_seq); - - BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq)); - - i = list_first_entry(list, struct journal_replay, list); - - mutex_lock(&c->journal.blacklist_lock); - - fifo_for_each_entry_ptr(p, &c->journal.pin, iter) { - u64 seq = journal_pin_seq(&c->journal, p); - - INIT_LIST_HEAD(&p->list); - - if (i && le64_to_cpu(i->j.seq) == seq) { - atomic_set(&p->count, 1); - - if (journal_seq_blacklist_read(&c->journal, i, p)) { - mutex_unlock(&c->journal.blacklist_lock); - return -ENOMEM; - } - - i = list_is_last(&i->list, list) - ? NULL - : list_next_entry(i, list); - } else { - atomic_set(&p->count, 0); - } - } - - mutex_unlock(&c->journal.blacklist_lock); - - cur_seq = last_seq(&c->journal); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - bool blacklisted; - - mutex_lock(&c->journal.blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - journal_seq_blacklist_find(&c->journal, cur_seq)) - cur_seq++; - - blacklisted = journal_seq_blacklist_find(&c->journal, - le64_to_cpu(i->j.seq)); - mutex_unlock(&c->journal.blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! (replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - last_seq(&c->journal), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; - } - - prio_ptrs = bch_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0); - if (prio_ptrs) { - memcpy_u64s(c->journal.prio_buckets, - prio_ptrs->_data, - le16_to_cpu(prio_ptrs->u64s)); - c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s); - } -fsck_err: - return ret; -} - -void bch_journal_mark(struct bch_fs *c, struct list_head *list) -{ - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; - - list_for_each_entry(r, list, list) - for_each_jset_key(k, n, j, &r->j) { - enum bkey_type type = bkey_type(j->level, j->btree_id); - struct bkey_s_c k_s_c = bkey_i_to_s_c(k); - - if (btree_type_has_ptrs(type)) - bch_btree_mark_key_initial(c, type, k_s_c); - } -} - -static bool journal_entry_is_open(struct journal *j) -{ - return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -void bch_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - if (!need_write_just_set && - test_bit(JOURNAL_NEED_WRITE, &j->flags)) - __bch_time_stats_update(j->delay_time, - j->need_write_time); -#if 0 - closure_call(&j->io, journal_write, NULL, &c->cl); -#else - /* Shut sparse up: */ - closure_init(&j->io, &c->cl); - set_closure_fn(&j->io, journal_write, NULL); - journal_write(&j->io); -#endif -} - -static void __bch_journal_next_entry(struct journal *j) -{ - struct journal_entry_pin_list pin_list, *p; - struct journal_buf *buf; - - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - BUG_ON(!fifo_push(&j->pin, pin_list)); - p = &fifo_peek_back(&j->pin); - - INIT_LIST_HEAD(&p->list); - atomic_set(&p->count, 1); - - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) { - smp_wmb(); - j->cur_pin_list = p; - } - - buf = journal_cur_buf(j); - memset(buf->has_inode, 0, sizeof(buf->has_inode)); - - memset(buf->data, 0, sizeof(*buf->data)); - buf->data->seq = cpu_to_le64(atomic64_read(&j->seq)); - buf->data->u64s = 0; - - 
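Not part of the deleted file: a short worked example of the sequence-number arithmetic behind the BUG_ON() that follows, using the last_seq() and journal_pin_seq() definitions from earlier in journal.c (and assuming fifo_entry_idx() counts from the front of the fifo, which is what that BUG_ON relies on):

/*
 * Illustrative only, derived from the definitions above:
 *
 *   last_seq(j)           = j->seq - fifo_used(&j->pin) + 1
 *   journal_pin_seq(j, p) = last_seq(j) + fifo_entry_idx(&j->pin, p)
 *
 * For the pin list just pushed, fifo_entry_idx() == fifo_used() - 1, so
 *
 *   journal_pin_seq(j, back) = j->seq - fifo_used() + 1 + fifo_used() - 1
 *                            = j->seq
 *
 * e.g. with j->seq incremented to 10 and 3 pinned entries:
 * last_seq() = 10 - 3 + 1 = 8, and the newly pushed entry corresponds
 * to sequence 8 + 2 = 10 == j->seq.  Pushing one pin entry and
 * incrementing j->seq together therefore leaves last_seq() unchanged.
 */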
BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq)); -} - -static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf) -{ - unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); - - if (buf->nr_prio_buckets) - ret += JSET_KEYS_U64s + buf->nr_prio_buckets; - - return ret; -} - -static enum { - JOURNAL_ENTRY_ERROR, - JOURNAL_ENTRY_INUSE, - JOURNAL_ENTRY_CLOSED, - JOURNAL_UNLOCKED, -} journal_buf_switch(struct journal *j, bool need_write_just_set) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return JOURNAL_ENTRY_CLOSED; - - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return JOURNAL_ENTRY_ERROR; - - if (new.prev_buf_unwritten) - return JOURNAL_ENTRY_INUSE; - - /* - * avoid race between setting buf->data->u64s and - * journal_res_put starting write: - */ - journal_state_inc(&new); - - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; - new.idx++; - new.prev_buf_unwritten = 1; - - BUG_ON(journal_state_count(new, new.idx)); - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - - journal_reclaim_fast(j); - - clear_bit(JOURNAL_NEED_WRITE, &j->flags); - - buf = &j->buf[old.idx]; - buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - buf->data->last_seq = cpu_to_le64(last_seq(j)); - - j->prev_buf_sectors = - vstruct_blocks_plus(buf->data, c->block_bits, - journal_entry_u64s_reserve(buf)) * - c->sb.block_size; - - BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); - - atomic_dec_bug(&fifo_peek_back(&j->pin).count); - __bch_journal_next_entry(j); - - cancel_delayed_work(&j->write_work); - spin_unlock(&j->lock); - - if (c->bucket_journal_seq > 1 << 14) { - c->bucket_journal_seq = 0; - bch_bucket_seq_cleanup(c); - } - - /* ugh - might be called from __journal_res_get() under wait_event() */ - __set_current_state(TASK_RUNNING); - bch_journal_buf_put(j, old.idx, need_write_just_set); - - return JOURNAL_UNLOCKED; -} - -void bch_journal_halt(struct journal *j) -{ - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return; - - new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - - wake_up(&j->wait); - closure_wake_up(&journal_cur_buf(j)->wait); - closure_wake_up(&journal_prev_buf(j)->wait); -} - -static unsigned journal_dev_buckets_available(struct journal *j, - struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - - /* - * Hack to avoid a deadlock during journal replay: - * journal replay might require setting a new btree - * root, which requires writing another journal entry - - * thus, if the journal is full (and this happens when - * replaying the first journal bucket's entries) we're - * screwed. 
- * - * So don't let the journal fill up unless we're in - * replay: - */ - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - available = max((int) available - 2, 0); - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (ja->bucket_seq[ja->last_idx] >= last_seq(j)) - available = max((int) available - 1, 0); - - return available; -} - -/* returns number of sectors available for next journal entry: */ -static int journal_entry_sectors(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - unsigned sectors_available = j->entry_size_max >> 9; - unsigned i, nr_online = 0, nr_devs = 0; - - lockdep_assert_held(&j->lock); - - spin_lock(&j->devs.lock); - group_for_each_dev(ca, &j->devs, i) { - unsigned buckets_required = 0; - - sectors_available = min_t(unsigned, sectors_available, - ca->mi.bucket_size); - - /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, if we haven't started the write - * for the previous entry we have to make sure we have space for - * it too: - */ - if (bch_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ca->journal.sectors_free) - buckets_required++; - - if (j->prev_buf_sectors + sectors_available > - ca->journal.sectors_free) - buckets_required++; - } else { - if (j->prev_buf_sectors + sectors_available > - ca->mi.bucket_size) - buckets_required++; - - buckets_required++; - } - - if (journal_dev_buckets_available(j, ca) >= buckets_required) - nr_devs++; - nr_online++; - } - spin_unlock(&j->devs.lock); - - if (nr_online < c->opts.metadata_replicas_required) - return -EROFS; - - if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) - return 0; - - return sectors_available; -} - -/* - * should _only_ called from journal_res_get() - when we actually want a - * journal reservation - journal entry is open means journal is dirty: - */ -static int journal_entry_open(struct journal *j) -{ - struct journal_buf *buf = journal_cur_buf(j); - ssize_t u64s; - int ret = 0, sectors; - - lockdep_assert_held(&j->lock); - BUG_ON(journal_entry_is_open(j)); - - if (!fifo_free(&j->pin)) - return 0; - - sectors = journal_entry_sectors(j); - if (sectors <= 0) - return sectors; - - j->cur_buf_sectors = sectors; - buf->nr_prio_buckets = j->nr_prio_buckets; - - u64s = (sectors << 9) / sizeof(u64); - - /* Subtract the journal header */ - u64s -= sizeof(struct jset) / sizeof(u64); - /* - * Btree roots, prio pointers don't get added until right before we do - * the write: - */ - u64s -= journal_entry_u64s_reserve(buf); - u64s = max_t(ssize_t, 0L, u64s); - - BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); - - if (u64s > le32_to_cpu(buf->data->u64s)) { - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - /* - * Must be set before marking the journal entry as open: - */ - j->cur_entry_u64s = u64s; - - do { - old.v = new.v = v; - - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return false; - - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - ret = 1; - - wake_up(&j->wait); - - if (j->res_get_blocked_start) { - __bch_time_stats_update(j->blocked_time, - j->res_get_blocked_start); - j->res_get_blocked_start = 0; - } - - mod_delayed_work(system_freezable_wq, - &j->write_work, - 
msecs_to_jiffies(j->write_delay_ms)); - } - - return ret; -} - -void bch_journal_start(struct bch_fs *c) -{ - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl; - u64 new_seq = 0; - - list_for_each_entry(bl, &j->seq_blacklist, list) - new_seq = max(new_seq, bl->seq); - - spin_lock(&j->lock); - - set_bit(JOURNAL_STARTED, &j->flags); - - while (atomic64_read(&j->seq) < new_seq) { - struct journal_entry_pin_list pin_list, *p; - - BUG_ON(!fifo_push(&j->pin, pin_list)); - p = &fifo_peek_back(&j->pin); - - INIT_LIST_HEAD(&p->list); - atomic_set(&p->count, 0); - atomic64_inc(&j->seq); - } - - /* - * journal_buf_switch() only inits the next journal entry when it - * closes an open journal entry - the very first journal entry gets - * initialized here: - */ - __bch_journal_next_entry(j); - - /* - * Adding entries to the next journal entry before allocating space on - * disk for the next journal entry - this is ok, because these entries - * only have to go down with the next journal entry we write: - */ - list_for_each_entry(bl, &j->seq_blacklist, list) - if (!bl->written) { - bch_journal_add_entry(journal_cur_buf(j), &bl->seq, 1, - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED, - 0, 0); - - journal_pin_add_entry(j, - &fifo_peek_back(&j->pin), - &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } - - spin_unlock(&j->lock); - - queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); -} - -int bch_journal_replay(struct bch_fs *c, struct list_head *list) -{ - int ret = 0, keys = 0, entries = 0; - struct journal *j = &c->journal; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - - list_for_each_entry_safe(i, n, list, list) { - j->cur_pin_list = - &j->pin.data[((j->pin.back - 1 - - (atomic64_read(&j->seq) - - le64_to_cpu(i->j.seq))) & - j->pin.mask)]; - - for_each_jset_key(k, _n, entry, &i->j) { - struct disk_reservation disk_res; - - /* - * We might cause compressed extents to be split, so we - * need to pass in a disk_reservation: - */ - BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0)); - - trace_bcache_journal_replay_key(&k->k); - - ret = bch_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); - bch_disk_reservation_put(c, &disk_res); - - if (ret) - goto err; - - cond_resched(); - keys++; - } - - if (atomic_dec_and_test(&j->cur_pin_list->count)) - wake_up(&j->wait); - - entries++; - } - - if (keys) { - bch_btree_flush(c); - - /* - * Write a new journal entry _before_ we start journalling new data - - * otherwise, we could end up with btree node bsets with journal seqs - * arbitrarily far in the future vs. 
the most recently written journal - * entry on disk, if we crash before writing the next journal entry: - */ - ret = bch_journal_meta(&c->journal); - if (ret) - goto err; - } - - bch_info(c, "journal replay done, %i keys in %i entries, seq %llu", - keys, entries, (u64) atomic64_read(&j->seq)); - - bch_journal_set_replay_done(&c->journal); -err: - if (ret) - bch_err(c, "journal replay error: %d", ret); - - bch_journal_entries_free(list); - - return ret; -} - -#if 0 -/* - * Allocate more journal space at runtime - not currently making use if it, but - * the code works: - */ -static int bch_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) -{ - struct journal *j = &c->journal; - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; - struct disk_reservation disk_res = { 0, 0 }; - struct closure cl; - u64 *new_bucket_seq = NULL, *new_buckets = NULL; - int ret = 0; - - closure_init_stack(&cl); - - /* don't handle reducing nr of buckets yet: */ - if (nr <= ja->nr) - return 0; - - /* - * note: journal buckets aren't really counted as _sectors_ used yet, so - * we don't need the disk reservation to avoid the BUG_ON() in buckets.c - * when space used goes up without a reservation - but we do need the - * reservation to ensure we'll actually be able to allocate: - */ - - if (bch_disk_reservation_get(c, &disk_res, - (nr - ja->nr) << ca->bucket_bits, 0)) - return -ENOSPC; - - mutex_lock(&c->sb_lock); - - ret = -ENOMEM; - new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); - new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) - goto err; - - journal_buckets = bch_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) - goto err; - - spin_lock(&j->lock); - memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); - memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); - - while (ja->nr < nr) { - /* must happen under journal lock, to avoid racing with gc: */ - u64 b = bch_bucket_alloc(ca, RESERVE_NONE); - if (!b) { - if (!closure_wait(&c->freelist_wait, &cl)) { - spin_unlock(&j->lock); - closure_sync(&cl); - spin_lock(&j->lock); - } - continue; - } - - bch_mark_metadata_bucket(ca, &ca->buckets[b], - BUCKET_JOURNAL, false); - bch_mark_alloc_bucket(ca, &ca->buckets[b], false); - - memmove(ja->buckets + ja->last_idx + 1, - ja->buckets + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); - memmove(ja->bucket_seq + ja->last_idx + 1, - ja->bucket_seq + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); - memmove(journal_buckets->buckets + ja->last_idx + 1, - journal_buckets->buckets + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); - - ja->buckets[ja->last_idx] = b; - journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); - - if (ja->last_idx < ja->nr) { - if (ja->cur_idx >= ja->last_idx) - ja->cur_idx++; - ja->last_idx++; - } - ja->nr++; - - } - spin_unlock(&j->lock); - - BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi)); - - bch_write_super(c); - - ret = 0; -err: - mutex_unlock(&c->sb_lock); - - kfree(new_bucket_seq); - kfree(new_buckets); - bch_disk_reservation_put(c, &disk_res); - - return ret; -} -#endif - -int bch_dev_journal_alloc(struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; - unsigned i, nr; - u64 b, *p; - - if (dynamic_fault("bcache:add:journal_alloc")) - return -ENOMEM; - 
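/*
 * Illustrative sketch, not part of the original file: the sizing rule
 * applied just below, pulled out into a hypothetical helper.  The journal
 * gets roughly 1/256th of the device's buckets, never fewer than a minimum,
 * and never more than 1024 buckets or 512MB worth of sectors, whichever cap
 * is smaller.  SKETCH_JOURNAL_BUCKETS_MIN is an assumed stand-in for
 * BCH_JOURNAL_BUCKETS_MIN.
 */
#include <stdint.h>

#define SKETCH_JOURNAL_BUCKETS_MIN 8    /* assumed minimum */

static unsigned sketch_nr_journal_buckets(uint64_t nbuckets,
                                          unsigned bucket_size_sectors)
{
        unsigned want = nbuckets >> 8;                    /* ~1/256 of the device   */
        unsigned max  = 1 << 10;                          /* at most 1024 buckets   */
        unsigned cap  = (1 << 20) / bucket_size_sectors;  /* 512MB, as bucket count */

        if (cap < max)
                max = cap;
        if (want < SKETCH_JOURNAL_BUCKETS_MIN)
                want = SKETCH_JOURNAL_BUCKETS_MIN;
        if (want > max)
                want = max;
        return want;
}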
- /* - * clamp journal size to 1024 buckets or 512MB (in sectors), whichever - * is smaller: - */ - nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, - BCH_JOURNAL_BUCKETS_MIN, - min(1 << 10, - (1 << 20) / ca->mi.bucket_size)); - - p = krealloc(ja->bucket_seq, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; - - ja->bucket_seq = p; - - p = krealloc(ja->buckets, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; - - ja->buckets = p; - - journal_buckets = bch_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) - return -ENOMEM; - - for (i = 0, b = ca->mi.first_bucket; - i < nr && b < ca->mi.nbuckets; b++) { - if (!is_available_bucket(ca->buckets[b].mark)) - continue; - - bch_mark_metadata_bucket(ca, &ca->buckets[b], - BUCKET_JOURNAL, true); - ja->buckets[i] = b; - journal_buckets->buckets[i] = cpu_to_le64(b); - i++; - } - - if (i < nr) - return -ENOSPC; - - BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi)); - - ja->nr = nr; - - return 0; -} - -/* Journalling */ - -/** - * journal_reclaim_fast - do the fast part of journal reclaim - * - * Called from IO submission context, does not block. Cleans up after btree - * write completions by advancing the journal pin and each cache's last_idx, - * kicking off discards and background reclaim as necessary. - */ -static void journal_reclaim_fast(struct journal *j) -{ - struct journal_entry_pin_list temp; - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!fifo_pop(&j->pin, temp)); - popped = true; - } - - if (popped) - wake_up(&j->wait); -} - -/* - * Journal entry pinning - machinery for holding a reference on a given journal - * entry, marking it as dirty: - */ - -static inline void __journal_pin_add(struct journal *j, - struct journal_entry_pin_list *pin_list, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - BUG_ON(journal_pin_active(pin)); - - atomic_inc(&pin_list->count); - pin->pin_list = pin_list; - pin->flush = flush_fn; - - if (flush_fn) - list_add(&pin->list, &pin_list->list); - else - INIT_LIST_HEAD(&pin->list); -} - -static void journal_pin_add_entry(struct journal *j, - struct journal_entry_pin_list *pin_list, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock_irq(&j->pin_lock); - __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock_irq(&j->pin_lock); -} - -void bch_journal_pin_add(struct journal *j, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock_irq(&j->pin_lock); - __journal_pin_add(j, j->cur_pin_list, pin, flush_fn); - spin_unlock_irq(&j->pin_lock); -} - -static inline bool __journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - struct journal_entry_pin_list *pin_list = pin->pin_list; - - pin->pin_list = NULL; - - /* journal_reclaim_work() might have already taken us off the list */ - if (!list_empty_careful(&pin->list)) - list_del_init(&pin->list); - - return atomic_dec_and_test(&pin_list->count); -} - -void bch_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - unsigned long flags; - bool wakeup; - - if (!journal_pin_active(pin)) - return; - - spin_lock_irqsave(&j->pin_lock, flags); - wakeup = __journal_pin_drop(j, pin); - spin_unlock_irqrestore(&j->pin_lock, 
flags); - - /* - * Unpinning a journal entry make make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: - * - * Nested irqsave is expensive, don't do the wakeup with lock held: - */ - if (wakeup) - wake_up(&j->wait); -} - -void bch_journal_pin_add_if_older(struct journal *j, - struct journal_entry_pin *src_pin, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock_irq(&j->pin_lock); - - if (journal_pin_active(src_pin) && - (!journal_pin_active(pin) || - fifo_entry_idx(&j->pin, src_pin->pin_list) < - fifo_entry_idx(&j->pin, pin->pin_list))) { - if (journal_pin_active(pin)) - __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); - } - - spin_unlock_irq(&j->pin_lock); -} - -static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret = NULL; - unsigned iter; - - /* so we don't iterate over empty fifo entries below: */ - if (!atomic_read(&fifo_peek_front(&j->pin).count)) { - spin_lock(&j->lock); - journal_reclaim_fast(j); - spin_unlock(&j->lock); - } - - spin_lock_irq(&j->pin_lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { - if (journal_pin_seq(j, pin_list) > seq_to_flush) - break; - - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) { - /* must be list_del_init(), see bch_journal_pin_drop() */ - list_del_init(&ret->list); - break; - } - } - spin_unlock_irq(&j->pin_lock); - - return ret; -} - -static bool journal_has_pins(struct journal *j) -{ - bool ret; - - spin_lock(&j->lock); - journal_reclaim_fast(j); - ret = fifo_used(&j->pin) > 1 || - atomic_read(&fifo_peek_front(&j->pin).count) > 1; - spin_unlock(&j->lock); - - return ret; -} - -void bch_journal_flush_pins(struct journal *j) -{ - struct journal_entry_pin *pin; - - while ((pin = journal_get_next_pin(j, U64_MAX))) - pin->flush(j, pin); - - wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j)); -} - -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - bool ret; - - spin_lock(&j->lock); - ret = ja->nr && - (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); - spin_unlock(&j->lock); - - return ret; -} - -/** - * journal_reclaim_work - free up journal buckets - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. 
- */ -static void journal_reclaim_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, journal.reclaim_work); - struct journal *j = &c->journal; - struct bch_dev *ca; - struct journal_entry_pin *pin; - u64 seq_to_flush = 0; - unsigned iter, bucket_to_flush; - unsigned long next_flush; - bool reclaim_lock_held = false, need_flush; - - /* - * Advance last_idx to point to the oldest journal entry containing - * btree node updates that have not yet been written out - */ - for_each_rw_member(ca, c, iter) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - while (should_discard_bucket(j, ja)) { - if (!reclaim_lock_held) { - /* - * ugh: - * might be called from __journal_res_get() - * under wait_event() - have to go back to - * TASK_RUNNING before doing something that - * would block, but only if we're doing work: - */ - __set_current_state(TASK_RUNNING); - - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - /* recheck under reclaim_lock: */ - continue; - } - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->last_idx]), - ca->mi.bucket_size, GFP_NOIO, 0); - - spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; - spin_unlock(&j->lock); - - wake_up(&j->wait); - } - - /* - * Write out enough btree nodes to free up 50% journal - * buckets - */ - spin_lock(&j->lock); - bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; - seq_to_flush = max_t(u64, seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - spin_unlock(&j->lock); - } - - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); - - /* Also flush if the pin fifo is more than half full */ - seq_to_flush = max_t(s64, seq_to_flush, - (s64) atomic64_read(&j->seq) - - (j->pin.size >> 1)); - - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - need_flush = time_after(jiffies, next_flush); - - while ((pin = journal_get_next_pin(j, need_flush - ? U64_MAX - : seq_to_flush))) { - __set_current_state(TASK_RUNNING); - pin->flush(j, pin); - need_flush = false; - - j->last_flushed = jiffies; - } - - if (!test_bit(BCH_FS_RO, &c->flags)) - queue_delayed_work(system_freezable_wq, &j->reclaim_work, - msecs_to_jiffies(j->reclaim_delay_ms)); -} - -/** - * journal_next_bucket - move on to the next journal bucket if possible - */ -static int journal_write_alloc(struct journal *j, unsigned sectors) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - struct bch_extent_ptr *ptr; - struct journal_device *ja; - struct bch_dev *ca; - bool swapped; - unsigned i, replicas, replicas_want = - READ_ONCE(c->opts.metadata_replicas); - - spin_lock(&j->lock); - - /* - * Drop any pointers to devices that have been removed, are no longer - * empty, or filled up their current journal bucket: - * - * Note that a device may have had a small amount of free space (perhaps - * one sector) that wasn't enough for the smallest possible journal - * entry - that's why we drop pointers to devices <= current free space, - * i.e. whichever device was limiting the current journal entry size. 
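/*
 * Illustrative sketch, not part of the original file: a condensed model of
 * how the reclaim worker above picks the sequence number to flush up to -
 * enough to free roughly half of each device's journal buckets, and at
 * least enough to keep the pin FIFO no more than half full.  All names are
 * hypothetical and per-device state is reduced to three fields.
 */
#include <stdint.h>

struct sketch_jdev {
        unsigned        nr;          /* number of journal buckets      */
        unsigned        cur_idx;     /* bucket currently written to    */
        const uint64_t  *bucket_seq; /* max seq stored in each bucket  */
};

static uint64_t sketch_seq_to_flush(const struct sketch_jdev *devs,
                                    unsigned nr_devs,
                                    uint64_t cur_seq, unsigned pin_fifo_size)
{
        uint64_t seq_to_flush = 0;
        unsigned i;

        for (i = 0; i < nr_devs; i++) {
                const struct sketch_jdev *ja = &devs[i];
                unsigned bucket_to_flush;

                if (!ja->nr)
                        continue;

                /* flushing up to this bucket's seq frees ~half the ring: */
                bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;

                if (ja->bucket_seq[bucket_to_flush] > seq_to_flush)
                        seq_to_flush = ja->bucket_seq[bucket_to_flush];
        }

        /* also flush if the FIFO of dirty entries is over half full: */
        if (cur_seq > pin_fifo_size / 2 &&
            cur_seq - pin_fifo_size / 2 > seq_to_flush)
                seq_to_flush = cur_seq - pin_fifo_size / 2;

        return seq_to_flush;
}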
- */ - extent_for_each_ptr_backwards(e, ptr) { - ca = c->devs[ptr->dev]; - - if (ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors) - __bch_extent_drop_ptr(e, ptr); - else - ca->journal.sectors_free -= sectors; - } - - replicas = bch_extent_nr_ptrs(e.c); - - spin_lock(&j->devs.lock); - - /* Sort by tier: */ - do { - swapped = false; - - for (i = 0; i + 1 < j->devs.nr; i++) - if (j->devs.d[i + 0].dev->mi.tier > - j->devs.d[i + 1].dev->mi.tier) { - swap(j->devs.d[i], j->devs.d[i + 1]); - swapped = true; - } - } while (swapped); - - /* - * Pick devices for next journal write: - * XXX: sort devices by free journal space? - */ - group_for_each_dev(ca, &j->devs, i) { - ja = &ca->journal; - - if (replicas >= replicas_want) - break; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (bch_extent_has_device(e.c, ca->dev_idx) || - !journal_dev_buckets_available(j, ca) || - sectors > ca->mi.bucket_size) - continue; - - ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq); - - extent_ptr_append(bkey_i_to_extent(&j->key), - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]), - .dev = ca->dev_idx, - }); - replicas++; - - trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); - } - spin_unlock(&j->devs.lock); - - j->prev_buf_sectors = 0; - spin_unlock(&j->lock); - - if (replicas < c->opts.metadata_replicas_required) - return -EROFS; - - BUG_ON(!replicas); - - return 0; -} - -static void journal_write_compact(struct jset *jset) -{ - struct jset_entry *i, *next, *prev = NULL; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. - * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each_safe(jset, i, next) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) && - JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); - } - - prev = prev ? 
vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - -static void journal_write_endio(struct bio *bio) -{ - struct bch_dev *ca = bio->bi_private; - struct journal *j = &ca->fs->journal; - - if (bch_dev_fatal_io_err_on(bio->bi_error, ca, "journal write") || - bch_meta_write_fault("journal")) - bch_journal_halt(j); - - closure_put(&j->io); - percpu_ref_put(&ca->io_ref); -} - -static void journal_write_done(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct journal_buf *w = journal_prev_buf(j); - - j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); - - __bch_time_stats_update(j->write_time, j->write_start_time); - - BUG_ON(!j->reservations.prev_buf_unwritten); - atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, - &j->reservations.counter); - - /* - * XXX: this is racy, we could technically end up doing the wake up - * after the journal_buf struct has been reused for the next write - * (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that - * are waiting on the _next_ write, not this one. - * - * The wake up can't come before, because journal_flush_seq_async() is - * looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal - * write that was already in flight. - * - * The right fix is to use a lock here, but using j.lock here means it - * has to be a spin_lock_irqsave() lock which then requires propagating - * the irq()ness to other locks and it's all kinds of nastiness. - */ - - closure_wake_up(&w->wait); - wake_up(&j->wait); - - /* - * Updating last_seq_ondisk may let journal_reclaim_work() discard more - * buckets: - */ - mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); -} - -static void journal_write(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_prev_buf(j); - struct jset *jset = w->data; - struct bio *bio; - struct bch_extent_ptr *ptr; - unsigned i, sectors, bytes; - - j->write_start_time = local_clock(); - - bch_journal_add_prios(j, w); - - mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; - - if (r->alive) - bch_journal_add_btree_root(w, i, &r->key, r->level); - } - mutex_unlock(&c->btree_root_lock); - - journal_write_compact(jset); - - jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand); - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(BCACHE_JSET_VERSION); - - SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); - SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c)); - - bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - - jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), - journal_nonce(jset), jset); - - sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > j->prev_buf_sectors); - - bytes = vstruct_bytes(w->data); - memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); - - if (journal_write_alloc(j, sectors)) { - bch_journal_halt(j); - bch_err(c, "Unable to allocate journal write"); - bch_fatal_error(c); - closure_return_with_destructor(cl, journal_write_done); - } - - bch_check_mark_super(c, &j->key, true); - - /* - * XXX: we really should just disable the entire journal in nochanges - * mode - */ - if (c->opts.nochanges) - goto no_io; - - 
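/*
 * Illustrative sketch, not part of the original file: a self-contained model
 * of the merge step in journal_write_compact() above.  When two key-list
 * entries for the same btree and level sit next to each other in the buffer,
 * the second entry's payload is slid down over its header and the sizes are
 * summed, so only one header is written out.  The entry layout here is a
 * hypothetical simplification of struct jset_entry, the type check is
 * omitted, and @i is assumed to immediately follow @prev in memory.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct sketch_entry {
        uint16_t u64s;          /* payload size, in u64s          */
        uint8_t  btree_id;
        uint8_t  level;
        uint64_t data[];        /* followed by u64s 64-bit words  */
};

/* returns true (and grows *prev) if *i could be folded into *prev: */
static bool sketch_try_merge(struct sketch_entry *prev, struct sketch_entry *i)
{
        if (!prev ||
            i->btree_id != prev->btree_id ||
            i->level != prev->level ||
            (unsigned) prev->u64s + i->u64s > UINT16_MAX)
                return false;

        /* move i's payload down over i's header; regions may overlap */
        memmove(&prev->data[prev->u64s], i->data,
                (size_t) i->u64s * sizeof(uint64_t));
        prev->u64s += i->u64s;
        return true;
}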
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) { - ca = c->devs[ptr->dev]; - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } - - atomic64_add(sectors, &ca->meta_sectors_written); - - bio = ca->journal.bio; - bio_reset(bio); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_bdev = ca->disk_sb.bdev; - bio->bi_iter.bi_size = sectors << 9; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); - bch_bio_map(bio, jset); - - trace_bcache_journal_write(bio); - closure_bio_submit_punt(bio, cl, c); - - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); - } - - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - closure_bio_submit_punt(bio, cl, c); - } - -no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) - ptr->offset += sectors; - - closure_return_with_destructor(cl, journal_write_done); -} - -static void journal_write_work(struct work_struct *work) -{ - struct journal *j = container_of(to_delayed_work(work), - struct journal, write_work); - spin_lock(&j->lock); - set_bit(JOURNAL_NEED_WRITE, &j->flags); - - if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED) - spin_unlock(&j->lock); -} - -/* - * Given an inode number, if that inode number has data in the journal that - * hasn't yet been flushed, return the journal sequence number that needs to be - * flushed: - */ -u64 bch_inode_journal_seq(struct journal *j, u64 inode) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - u64 seq = 0; - - if (!test_bit(h, j->buf[0].has_inode) && - !test_bit(h, j->buf[1].has_inode)) - return 0; - - spin_lock(&j->lock); - if (test_bit(h, journal_cur_buf(j)->has_inode)) - seq = atomic64_read(&j->seq); - else if (test_bit(h, journal_prev_buf(j)->has_inode)) - seq = atomic64_read(&j->seq) - 1; - spin_unlock(&j->lock); - - return seq; -} - -static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int ret; -retry: - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) - return ret; - - spin_lock(&j->lock); - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call journal_entry_close() - * unnecessarily - */ - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) { - spin_unlock(&j->lock); - return 1; - } - - /* - * Ok, no more room in the current journal entry - try to start a new - * one: - */ - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EIO; - case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ - spin_unlock(&j->lock); - trace_bcache_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; - } - - /* We now have a new, closed journal buf - see if we can open it: */ - ret = journal_entry_open(j); - spin_unlock(&j->lock); - - if (ret < 0) - return ret; - if (ret) - goto retry; - - /* Journal's full, we have to wait */ - - /* - * 
Direct reclaim - can't rely on reclaim from work item - * due to freezing.. - */ - journal_reclaim_work(&j->reclaim_work.work); - - trace_bcache_journal_full(c); -blocked: - if (!j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - return 0; -} - -/* - * Essentially the entry function to the journaling code. When bcache is doing - * a btree insert, it calls this function to get the current journal write. - * Journal write is the structure used set up journal writes. The calling - * function will then add its keys to the structure, queuing them for the - * next write. - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. - */ -int bch_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) -{ - int ret; - - wait_event(j->wait, - (ret = __journal_res_get(j, res, u64s_min, - u64s_max))); - return ret < 0 ? ret : 0; -} - -void bch_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) -{ - spin_lock(&j->lock); - - BUG_ON(seq > atomic64_read(&j->seq)); - - if (bch_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == atomic64_read(&j->seq)) { - if (!closure_wait(&journal_cur_buf(j)->wait, parent)) - BUG(); - } else if (seq + 1 == atomic64_read(&j->seq) && - j->reservations.prev_buf_unwritten) { - if (!closure_wait(&journal_prev_buf(j)->wait, parent)) - BUG(); - - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch_journal_error(j)) - closure_wake_up(&journal_prev_buf(j)->wait); - } - - spin_unlock(&j->lock); -} - -void bch_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) -{ - spin_lock(&j->lock); - - BUG_ON(seq > atomic64_read(&j->seq)); - - if (bch_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == atomic64_read(&j->seq)) { - bool set_need_write = false; - - if (parent && - !closure_wait(&journal_cur_buf(j)->wait, parent)) - BUG(); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - if (parent) - closure_wake_up(&journal_cur_buf(j)->wait); - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something (seq == j->seq): - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return; - } - } else if (parent && - seq + 1 == atomic64_read(&j->seq) && - j->reservations.prev_buf_unwritten) { - if (!closure_wait(&journal_prev_buf(j)->wait, parent)) - BUG(); - - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch_journal_error(j)) - closure_wake_up(&journal_prev_buf(j)->wait); - } - - spin_unlock(&j->lock); -} - -int bch_journal_flush_seq(struct journal *j, u64 seq) -{ - struct closure cl; - u64 start_time = local_clock(); - - closure_init_stack(&cl); - bch_journal_flush_seq_async(j, seq, &cl); - closure_sync(&cl); - - bch_time_stats_update(j->flush_seq_time, start_time); - - return bch_journal_error(j); -} - -void bch_journal_meta_async(struct journal *j, struct closure *parent) -{ - struct journal_res res; - unsigned u64s = jset_u64s(0); - - memset(&res, 0, sizeof(res)); - - bch_journal_res_get(j, &res, u64s, u64s); - bch_journal_res_put(j, &res); - - bch_journal_flush_seq_async(j, res.seq, parent); -} - -int 
bch_journal_meta(struct journal *j) -{ - struct journal_res res; - unsigned u64s = jset_u64s(0); - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch_journal_res_get(j, &res, u64s, u64s); - if (ret) - return ret; - - bch_journal_res_put(j, &res); - - return bch_journal_flush_seq(j, res.seq); -} - -void bch_journal_flush_async(struct journal *j, struct closure *parent) -{ - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = atomic64_read(&j->seq); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return; - } - spin_unlock(&j->lock); - - bch_journal_flush_seq_async(j, seq, parent); -} - -int bch_journal_flush(struct journal *j) -{ - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = atomic64_read(&j->seq); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return 0; - } - spin_unlock(&j->lock); - - return bch_journal_flush_seq(j, seq); -} - -ssize_t bch_journal_print_debug(struct journal *j, char *buf) -{ - union journal_res_state *s = &j->reservations; - struct bch_dev *ca; - unsigned iter; - ssize_t ret = 0; - - rcu_read_lock(); - spin_lock(&j->lock); - - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "active journal entries:\t%zu\n" - "seq:\t\t\t%llu\n" - "last_seq:\t\t%llu\n" - "last_seq_ondisk:\t%llu\n" - "reservation count:\t%u\n" - "reservation offset:\t%u\n" - "current entry u64s:\t%u\n" - "io in flight:\t\t%i\n" - "need write:\t\t%i\n" - "dirty:\t\t\t%i\n" - "replay done:\t\t%i\n", - fifo_used(&j->pin), - (u64) atomic64_read(&j->seq), - last_seq(j), - j->last_seq_ondisk, - journal_state_count(*s, s->idx), - s->cur_entry_offset, - j->cur_entry_u64s, - s->prev_buf_unwritten, - test_bit(JOURNAL_NEED_WRITE, &j->flags), - journal_entry_is_open(j), - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - - spin_lock(&j->devs.lock); - group_for_each_dev(ca, &j->devs, iter) { - struct journal_device *ja = &ca->journal; - - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "dev %u:\n" - "\tnr\t\t%u\n" - "\tcur_idx\t\t%u (seq %llu)\n" - "\tlast_idx\t%u (seq %llu)\n", - iter, ja->nr, - ja->cur_idx, ja->bucket_seq[ja->cur_idx], - ja->last_idx, ja->bucket_seq[ja->last_idx]); - } - spin_unlock(&j->devs.lock); - - spin_unlock(&j->lock); - rcu_read_unlock(); - - return ret; -} - -static bool bch_journal_writing_to_device(struct bch_dev *ca) -{ - struct journal *j = &ca->fs->journal; - bool ret; - - spin_lock(&j->lock); - ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), - ca->dev_idx); - spin_unlock(&j->lock); - - return ret; -} - -/* - * This asumes that ca has already been marked read-only so that - * journal_next_bucket won't pick buckets out of ca any more. - * Hence, if the journal is not currently pointing to ca, there - * will be no new writes to journal entries in ca after all the - * pending ones have been flushed to disk. - * - * If the journal is being written to ca, write a new record, and - * journal_next_bucket will notice that the device is no longer - * writeable and pick a new set of devices to write to. - */ - -int bch_journal_move(struct bch_dev *ca) -{ - u64 last_flushed_seq; - struct journal_device *ja = &ca->journal; - struct bch_fs *c = ca->fs; - struct journal *j = &c->journal; - unsigned i; - int ret = 0; /* Success */ - - if (bch_journal_writing_to_device(ca)) { - /* - * bch_journal_meta will write a record and we'll wait - * for the write to complete. 
- * Actually writing the journal (journal_write_locked) - * will call journal_next_bucket which notices that the - * device is no longer writeable, and picks a new one. - */ - bch_journal_meta(j); - BUG_ON(bch_journal_writing_to_device(ca)); - } - - /* - * Flush all btree updates to backing store so that any - * journal entries written to ca become stale and are no - * longer needed. - */ - - /* - * XXX: switch to normal journal reclaim machinery - */ - bch_btree_flush(c); - - /* - * Force a meta-data journal entry to be written so that - * we have newer journal entries in devices other than ca, - * and wait for the meta data write to complete. - */ - bch_journal_meta(j); - - /* - * Verify that we no longer need any of the journal entries in - * the device - */ - spin_lock(&j->lock); - last_flushed_seq = last_seq(j); - spin_unlock(&j->lock); - - for (i = 0; i < ja->nr; i += 1) - BUG_ON(ja->bucket_seq[i] > last_flushed_seq); - - return ret; -} - -void bch_fs_journal_stop(struct journal *j) -{ - if (!test_bit(JOURNAL_STARTED, &j->flags)) - return; - - /* - * Empty out the journal by first flushing everything pinning existing - * journal entries, then force a brand new empty journal entry to be - * written: - */ - bch_journal_flush_pins(j); - bch_journal_flush_async(j, NULL); - bch_journal_meta(j); - - cancel_delayed_work_sync(&j->write_work); - cancel_delayed_work_sync(&j->reclaim_work); -} - -void bch_dev_journal_exit(struct bch_dev *ca) -{ - kfree(ca->journal.bio); - kfree(ca->journal.buckets); - kfree(ca->journal.bucket_seq); - - ca->journal.bio = NULL; - ca->journal.buckets = NULL; - ca->journal.bucket_seq = NULL; -} - -int bch_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -{ - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(sb); - unsigned i, journal_entry_pages; - - journal_entry_pages = - DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), - PAGE_SECTORS); - - ja->nr = bch_nr_journal_buckets(journal_buckets); - - ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->bucket_seq) - return -ENOMEM; - - ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages); - if (!ca->journal.bio) - return -ENOMEM; - - ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->buckets) - return -ENOMEM; - - for (i = 0; i < ja->nr; i++) - ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); - - return 0; -} - -void bch_fs_journal_exit(struct journal *j) -{ - unsigned order = get_order(j->entry_size_max); - - free_pages((unsigned long) j->buf[1].data, order); - free_pages((unsigned long) j->buf[0].data, order); - free_fifo(&j->pin); -} - -int bch_fs_journal_init(struct journal *j, unsigned entry_size_max) -{ - static struct lock_class_key res_key; - unsigned order = get_order(entry_size_max); - - spin_lock_init(&j->lock); - spin_lock_init(&j->pin_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); - spin_lock_init(&j->devs.lock); - mutex_init(&j->reclaim_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - j->entry_size_max = entry_size_max; - j->write_delay_ms = 100; - j->reclaim_delay_ms = 100; - - bkey_extent_init(&j->key); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - - if (!(init_fifo(&j->pin, JOURNAL_PIN, 
GFP_KERNEL)) || - !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || - !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) - return -ENOMEM; - - return 0; -} diff --git a/libbcache/journal.h b/libbcache/journal.h deleted file mode 100644 index c83f8104..00000000 --- a/libbcache/journal.h +++ /dev/null @@ -1,373 +0,0 @@ -#ifndef _BCACHE_JOURNAL_H -#define _BCACHE_JOURNAL_H - -/* - * THE JOURNAL: - * - * The primary purpose of the journal is to log updates (insertions) to the - * b-tree, to avoid having to do synchronous updates to the b-tree on disk. - * - * Without the journal, the b-tree is always internally consistent on - * disk - and in fact, in the earliest incarnations bcache didn't have a journal - * but did handle unclean shutdowns by doing all index updates synchronously - * (with coalescing). - * - * Updates to interior nodes still happen synchronously and without the journal - * (for simplicity) - this may change eventually but updates to interior nodes - * are rare enough it's not a huge priority. - * - * This means the journal is relatively separate from the b-tree; it consists of - * just a list of keys and journal replay consists of just redoing those - * insertions in same order that they appear in the journal. - * - * PERSISTENCE: - * - * For synchronous updates (where we're waiting on the index update to hit - * disk), the journal entry will be written out immediately (or as soon as - * possible, if the write for the previous journal entry was still in flight). - * - * Synchronous updates are specified by passing a closure (@flush_cl) to - * bch_btree_insert() or bch_btree_insert_node(), which then pass that parameter - * down to the journalling code. That closure will will wait on the journal - * write to complete (via closure_wait()). - * - * If the index update wasn't synchronous, the journal entry will be - * written out after 10 ms have elapsed, by default (the delay_ms field - * in struct journal). - * - * JOURNAL ENTRIES: - * - * A journal entry is variable size (struct jset), it's got a fixed length - * header and then a variable number of struct jset_entry entries. - * - * Journal entries are identified by monotonically increasing 64 bit sequence - * numbers - jset->seq; other places in the code refer to this sequence number. - * - * A jset_entry entry contains one or more bkeys (which is what gets inserted - * into the b-tree). We need a container to indicate which b-tree the key is - * for; also, the roots of the various b-trees are stored in jset_entry entries - * (one for each b-tree) - this lets us add new b-tree types without changing - * the on disk format. - * - * We also keep some things in the journal header that are logically part of the - * superblock - all the things that are frequently updated. This is for future - * bcache on raw flash support; the superblock (which will become another - * journal) can't be moved or wear leveled, so it contains just enough - * information to find the main journal, and the superblock only has to be - * rewritten when we want to move/wear level the main journal. - * - * JOURNAL LAYOUT ON DISK: - * - * The journal is written to a ringbuffer of buckets (which is kept in the - * superblock); the individual buckets are not necessarily contiguous on disk - * which means that journal entries are not allowed to span buckets, but also - * that we can resize the journal at runtime if desired (unimplemented). 
- * - * The journal buckets exist in the same pool as all the other buckets that are - * managed by the allocator and garbage collection - garbage collection marks - * the journal buckets as metadata buckets. - * - * OPEN/DIRTY JOURNAL ENTRIES: - * - * Open/dirty journal entries are journal entries that contain b-tree updates - * that have not yet been written out to the b-tree on disk. We have to track - * which journal entries are dirty, and we also have to avoid wrapping around - * the journal and overwriting old but still dirty journal entries with new - * journal entries. - * - * On disk, this is represented with the "last_seq" field of struct jset; - * last_seq is the first sequence number that journal replay has to replay. - * - * To avoid overwriting dirty journal entries on disk, we keep a mapping (in - * journal_device->seq) of for each journal bucket, the highest sequence number - * any journal entry it contains. Then, by comparing that against last_seq we - * can determine whether that journal bucket contains dirty journal entries or - * not. - * - * To track which journal entries are dirty, we maintain a fifo of refcounts - * (where each entry corresponds to a specific sequence number) - when a ref - * goes to 0, that journal entry is no longer dirty. - * - * Journalling of index updates is done at the same time as the b-tree itself is - * being modified (see btree_insert_key()); when we add the key to the journal - * the pending b-tree write takes a ref on the journal entry the key was added - * to. If a pending b-tree write would need to take refs on multiple dirty - * journal entries, it only keeps the ref on the oldest one (since a newer - * journal entry will still be replayed if an older entry was dirty). - * - * JOURNAL FILLING UP: - * - * There are two ways the journal could fill up; either we could run out of - * space to write to, or we could have too many open journal entries and run out - * of room in the fifo of refcounts. Since those refcounts are decremented - * without any locking we can't safely resize that fifo, so we handle it the - * same way. - * - * If the journal fills up, we start flushing dirty btree nodes until we can - * allocate space for a journal write again - preferentially flushing btree - * nodes that are pinning the oldest journal entries first. - */ - -#include <linux/hash.h> - -#include "journal_types.h" - -/* - * Only used for holding the journal entries we read in btree_journal_read() - * during cache_registration - */ -struct journal_replay { - struct list_head list; - struct jset j; -}; - -#define JOURNAL_PIN ((32 * 1024) - 1) - -static inline bool journal_pin_active(struct journal_entry_pin *pin) -{ - return pin->pin_list != NULL; -} - -void bch_journal_pin_add(struct journal *, struct journal_entry_pin *, - journal_pin_flush_fn); -void bch_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void bch_journal_pin_add_if_older(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); -void bch_journal_flush_pins(struct journal *); - -struct closure; -struct bch_fs; -struct keylist; - -struct bkey_i *bch_journal_find_btree_root(struct bch_fs *, struct jset *, - enum btree_id, unsigned *); - -int bch_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); - -u64 bch_inode_journal_seq(struct journal *, u64); - -static inline int journal_state_count(union journal_res_state s, int idx) -{ - return idx == 0 ? 
s.buf0_count : s.buf1_count; -} - -static inline void journal_state_inc(union journal_res_state *s) -{ - s->buf0_count += s->idx == 0; - s->buf1_count += s->idx == 1; -} - -static inline void bch_journal_set_has_inode(struct journal_buf *buf, u64 inum) -{ - set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode); -} - -/* - * Amount of space that will be taken up by some keys in the journal (i.e. - * including the jset header) - */ -static inline unsigned jset_u64s(unsigned u64s) -{ - return u64s + sizeof(struct jset_entry) / sizeof(u64); -} - -static inline void bch_journal_add_entry_at(struct journal_buf *buf, - const void *data, size_t u64s, - unsigned type, enum btree_id id, - unsigned level, unsigned offset) -{ - struct jset_entry *entry = vstruct_idx(buf->data, offset); - - entry->u64s = cpu_to_le16(u64s); - entry->btree_id = id; - entry->level = level; - entry->flags = 0; - SET_JOURNAL_ENTRY_TYPE(entry, type); - - memcpy_u64s(entry->_data, data, u64s); -} - -static inline void bch_journal_add_keys(struct journal *j, struct journal_res *res, - enum btree_id id, const struct bkey_i *k) -{ - struct journal_buf *buf = &j->buf[res->idx]; - unsigned actual = jset_u64s(k->k.u64s); - - EBUG_ON(!res->ref); - BUG_ON(actual > res->u64s); - - bch_journal_set_has_inode(buf, k->k.p.inode); - - bch_journal_add_entry_at(buf, k, k->k.u64s, - JOURNAL_ENTRY_BTREE_KEYS, id, - 0, res->offset); - - res->offset += actual; - res->u64s -= actual; -} - -void bch_journal_buf_put_slowpath(struct journal *, bool); - -static inline void bch_journal_buf_put(struct journal *j, unsigned idx, - bool need_write_just_set) -{ - union journal_res_state s; - - s.v = atomic64_sub_return(((union journal_res_state) { - .buf0_count = idx == 0, - .buf1_count = idx == 1, - }).v, &j->reservations.counter); - - EBUG_ON(s.idx != idx && !s.prev_buf_unwritten); - - /* - * Do not initiate a journal write if the journal is in an error state - * (previous journal entry write may have failed) - */ - if (s.idx != idx && - !journal_state_count(s, idx) && - s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL) - bch_journal_buf_put_slowpath(j, need_write_just_set); -} - -/* - * This function releases the journal write structure so other threads can - * then proceed to add their keys as well. 
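/*
 * Illustrative sketch, not part of the original file: the lock-free
 * reservation fast path modelled with a bare atomic offset.  In the real
 * code (journal_res_get_fast() below) the offset lives in the packed
 * union journal_res_state together with the buffer index and refcounts,
 * so a single compare-exchange both claims space and takes a reference;
 * here only the space claim is shown and all names are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct sketch_res {
        unsigned offset;        /* where our keys go, in u64s */
        unsigned u64s;          /* how much space we got      */
};

static bool sketch_res_get(_Atomic unsigned *cur_offset, unsigned entry_u64s,
                           unsigned u64s_min, unsigned u64s_max,
                           struct sketch_res *res)
{
        unsigned old = atomic_load(cur_offset), new;

        do {
                /* not enough room left in the open journal entry: */
                if (old + u64s_min > entry_u64s)
                        return false;

                res->offset = old;
                res->u64s   = entry_u64s - old < u64s_max
                            ? entry_u64s - old : u64s_max;
                new         = old + res->u64s;
        } while (!atomic_compare_exchange_weak(cur_offset, &old, new));

        return true;
}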
- */ -static inline void bch_journal_res_put(struct journal *j, - struct journal_res *res) -{ - if (!res->ref) - return; - - lock_release(&j->res_map, 0, _RET_IP_); - - while (res->u64s) { - bch_journal_add_entry_at(&j->buf[res->idx], NULL, 0, - JOURNAL_ENTRY_BTREE_KEYS, - 0, 0, res->offset); - res->offset += jset_u64s(0); - res->u64s -= jset_u64s(0); - } - - bch_journal_buf_put(j, res->idx, false); - - res->ref = 0; -} - -int bch_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned, unsigned); - -static inline int journal_res_get_fast(struct journal *j, - struct journal_res *res, - unsigned u64s_min, - unsigned u64s_max) -{ - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - do { - old.v = new.v = v; - - /* - * Check if there is still room in the current journal - * entry: - */ - if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s) - return 0; - - res->offset = old.cur_entry_offset; - res->u64s = min(u64s_max, j->cur_entry_u64s - - old.cur_entry_offset); - - journal_state_inc(&new); - new.cur_entry_offset += res->u64s; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - - res->ref = true; - res->idx = new.idx; - res->seq = le64_to_cpu(j->buf[res->idx].data->seq); - return 1; -} - -static inline int bch_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) -{ - int ret; - - EBUG_ON(res->ref); - EBUG_ON(u64s_max < u64s_min); - - if (journal_res_get_fast(j, res, u64s_min, u64s_max)) - goto out; - - ret = bch_journal_res_get_slowpath(j, res, u64s_min, u64s_max); - if (ret) - return ret; -out: - lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); - EBUG_ON(!res->ref); - return 0; -} - -void bch_journal_wait_on_seq(struct journal *, u64, struct closure *); -void bch_journal_flush_seq_async(struct journal *, u64, struct closure *); -void bch_journal_flush_async(struct journal *, struct closure *); -void bch_journal_meta_async(struct journal *, struct closure *); - -int bch_journal_flush_seq(struct journal *, u64); -int bch_journal_flush(struct journal *); -int bch_journal_meta(struct journal *); - -void bch_journal_halt(struct journal *); - -static inline int bch_journal_error(struct journal *j) -{ - return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? -EIO : 0; -} - -static inline bool journal_flushes_device(struct bch_dev *ca) -{ - return true; -} - -void bch_journal_start(struct bch_fs *); -void bch_journal_mark(struct bch_fs *, struct list_head *); -void bch_journal_entries_free(struct list_head *); -int bch_journal_read(struct bch_fs *, struct list_head *); -int bch_journal_replay(struct bch_fs *, struct list_head *); - -static inline void bch_journal_set_replay_done(struct journal *j) -{ - spin_lock(&j->lock); - BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - - set_bit(JOURNAL_REPLAY_DONE, &j->flags); - j->cur_pin_list = &fifo_peek_back(&j->pin); - spin_unlock(&j->lock); -} - -ssize_t bch_journal_print_debug(struct journal *, char *); - -int bch_dev_journal_alloc(struct bch_dev *); - -static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? 
(__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - -int bch_journal_move(struct bch_dev *); - -void bch_fs_journal_stop(struct journal *); -void bch_dev_journal_exit(struct bch_dev *); -int bch_dev_journal_init(struct bch_dev *, struct bch_sb *); -void bch_fs_journal_exit(struct journal *); -int bch_fs_journal_init(struct journal *, unsigned); - -#endif /* _BCACHE_JOURNAL_H */ diff --git a/libbcache/journal_types.h b/libbcache/journal_types.h deleted file mode 100644 index ebc340ad..00000000 --- a/libbcache/journal_types.h +++ /dev/null @@ -1,242 +0,0 @@ -#ifndef _BCACHE_JOURNAL_TYPES_H -#define _BCACHE_JOURNAL_TYPES_H - -#include <linux/cache.h> -#include <linux/workqueue.h> - -#include "alloc_types.h" -#include "fifo.h" - -struct journal_res; - -/* - * We put two of these in struct journal; we used them for writes to the - * journal that are being staged or in flight. - */ -struct journal_buf { - struct jset *data; - struct closure_waitlist wait; - - /* - * ugh, prio_buckets are stupid - need to convert them to new - * transaction machinery when it arrives - */ - unsigned nr_prio_buckets; - - /* bloom filter: */ - unsigned long has_inode[1024 / sizeof(unsigned long)]; -}; - -/* - * Something that makes a journal entry dirty - i.e. a btree node that has to be - * flushed: - */ - -struct journal_entry_pin_list { - struct list_head list; - atomic_t count; -}; - -struct journal; -struct journal_entry_pin; -typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *); - -struct journal_entry_pin { - struct list_head list; - journal_pin_flush_fn flush; - struct journal_entry_pin_list *pin_list; -}; - -/* corresponds to a btree node with a blacklisted bset: */ -struct blacklisted_node { - __le64 seq; - enum btree_id btree_id; - struct bpos pos; -}; - -struct journal_seq_blacklist { - struct list_head list; - u64 seq; - bool written; - struct journal_entry_pin pin; - - struct blacklisted_node *entries; - size_t nr_entries; -}; - -struct journal_res { - bool ref; - u8 idx; - u16 u64s; - u32 offset; - u64 seq; -}; - -union journal_res_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 cur_entry_offset:20, - idx:1, - prev_buf_unwritten:1, - buf0_count:21, - buf1_count:21; - }; -}; - -/* 4 mb, in bytes: */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) - -/* - * We stash some journal state as sentinal values in cur_entry_offset: - */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) - -#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) - -/* - * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, - * either because something's waiting on the write to complete or because it's - * been dirty too long and the timer's expired. - */ - -enum { - JOURNAL_REPLAY_DONE, - JOURNAL_STARTED, - JOURNAL_NEED_WRITE, -}; - -/* Embedded in struct bch_fs */ -struct journal { - /* Fastpath stuff up front: */ - - unsigned long flags; - - union journal_res_state reservations; - unsigned cur_entry_u64s; - unsigned prev_buf_sectors; - unsigned cur_buf_sectors; - unsigned entry_size_max; /* bytes */ - - /* - * Two journal entries -- one is currently open for new entries, the - * other is possibly being written out. 
- */ - struct journal_buf buf[2]; - - spinlock_t lock; - - /* Used when waiting because the journal was full */ - wait_queue_head_t wait; - - struct closure io; - struct delayed_work write_work; - - /* Sequence number of most recent journal entry (last entry in @pin) */ - atomic64_t seq; - - /* last_seq from the most recent journal entry written */ - u64 last_seq_ondisk; - - /* - * FIFO of journal entries whose btree updates have not yet been - * written out. - * - * Each entry is a reference count. The position in the FIFO is the - * entry's sequence number relative to @seq. - * - * The journal entry itself holds a reference count, put when the - * journal entry is written out. Each btree node modified by the journal - * entry also holds a reference count, put when the btree node is - * written. - * - * When a reference count reaches zero, the journal entry is no longer - * needed. When all journal entries in the oldest journal bucket are no - * longer needed, the bucket can be discarded and reused. - */ - DECLARE_FIFO(struct journal_entry_pin_list, pin); - struct journal_entry_pin_list *cur_pin_list; - - /* - * Protects the pin lists - the fifo itself is still protected by - * j->lock though: - */ - spinlock_t pin_lock; - - struct mutex blacklist_lock; - struct list_head seq_blacklist; - - BKEY_PADDED(key); - struct dev_group devs; - - struct delayed_work reclaim_work; - unsigned long last_flushed; - - /* protects advancing ja->last_idx: */ - struct mutex reclaim_lock; - - /* - * ugh: need to get prio_buckets converted over to the eventual new - * transaction machinery - */ - __le64 prio_buckets[BCH_SB_MEMBERS_MAX]; - unsigned nr_prio_buckets; - - unsigned write_delay_ms; - unsigned reclaim_delay_ms; - - u64 res_get_blocked_start; - u64 need_write_time; - u64 write_start_time; - - struct time_stats *write_time; - struct time_stats *delay_time; - struct time_stats *blocked_time; - struct time_stats *flush_seq_time; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map res_map; -#endif -}; - -/* - * Embedded in struct bch_dev. First three fields refer to the array of journal - * buckets, in bch_sb. - */ -struct journal_device { - /* - * For each journal bucket, contains the max sequence number of the - * journal writes it contains - so we know when a bucket can be reused. 
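/*
 * Illustrative sketch, not part of the original file: the reuse test that
 * bucket_seq[] enables.  A journal bucket may be discarded and rewritten
 * once every sequence number it contains is older than the last_seq that
 * has safely reached disk (compare should_discard_bucket() earlier in
 * journal.c).  Names are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

static bool sketch_bucket_reusable(const uint64_t *bucket_seq,
                                   unsigned last_idx, unsigned cur_idx,
                                   uint64_t last_seq_ondisk)
{
        return last_idx != cur_idx &&
               bucket_seq[last_idx] < last_seq_ondisk;
}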
- */ - u64 *bucket_seq; - - unsigned sectors_free; - - /* Journal bucket we're currently writing to */ - unsigned cur_idx; - - /* Last journal bucket that still contains an open journal entry */ - - /* - * j->lock and j->reclaim_lock must both be held to modify, j->lock - * sufficient to read: - */ - unsigned last_idx; - unsigned nr; - u64 *buckets; - - /* Bio for journal reads/writes to this device */ - struct bio *bio; - - /* for bch_journal_read_device */ - struct closure read; -}; - -#endif /* _BCACHE_JOURNAL_TYPES_H */ diff --git a/libbcache/keybuf.c b/libbcache/keybuf.c deleted file mode 100644 index 961fc79a..00000000 --- a/libbcache/keybuf.c +++ /dev/null @@ -1,195 +0,0 @@ - -#include "bcache.h" -#include "btree_gc.h" -#include "btree_iter.h" -#include "keybuf.h" - -#include <trace/events/bcache.h> - -/* - * For buffered iteration over the btree, with predicates and ratelimiting and - * whatnot - */ - -static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) -{ - /* Overlapping keys compare equal */ - if (bkey_cmp(l->key.k.p, bkey_start_pos(&r->key.k)) <= 0) - return -1; - if (bkey_cmp(bkey_start_pos(&l->key.k), r->key.k.p) >= 0) - return 1; - return 0; -} - -static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, - struct keybuf_key *r) -{ - return clamp_t(s64, bkey_cmp(l->key.k.p, r->key.k.p), -1, 1); -} - -void bch_refill_keybuf(struct bch_fs *c, struct keybuf *buf, - struct bpos end, keybuf_pred_fn *pred) -{ - struct bpos start = buf->last_scanned; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_found = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, buf->last_scanned, k) { - if (bkey_cmp(k.k->p, end) >= 0) { - buf->last_scanned = k.k->p; - goto done; - } - - if (pred(buf, k)) { - struct keybuf_key *w; - - spin_lock(&buf->lock); - - w = array_alloc(&buf->freelist); - if (!w) { - spin_unlock(&buf->lock); - goto done; - } - - bkey_reassemble(&w->key, k); - atomic_set(&w->ref, -1); /* -1 means hasn't started */ - - if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) - array_free(&buf->freelist, w); - else - nr_found++; - - spin_unlock(&buf->lock); - } - - buf->last_scanned = k.k->p; - bch_btree_iter_cond_resched(&iter); - } - - /* If we end up here, it means: - * - the map_fn didn't fill up the keybuf - * - the map_fn didn't see the end key - * - there were no more keys to map over - * Therefore, we are at the end of the key space */ - buf->last_scanned = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_keyscan(nr_found, - start.inode, start.offset, - buf->last_scanned.inode, - buf->last_scanned.offset); - - spin_lock(&buf->lock); - - if (!RB_EMPTY_ROOT(&buf->keys)) { - struct keybuf_key *w; - - w = RB_FIRST(&buf->keys, struct keybuf_key, node); - buf->start = bkey_start_pos(&w->key.k); - - w = RB_LAST(&buf->keys, struct keybuf_key, node); - buf->end = w->key.k.p; - } else { - buf->start = POS_MAX; - buf->end = POS_MAX; - } - - spin_unlock(&buf->lock); -} - -static void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) -{ - rb_erase(&w->node, &buf->keys); - array_free(&buf->freelist, w); -} - -void bch_keybuf_put(struct keybuf *buf, struct keybuf_key *w) -{ - BUG_ON(atomic_read(&w->ref) <= 0); - - if (atomic_dec_and_test(&w->ref)) { - up(&buf->in_flight); - - spin_lock(&buf->lock); - bch_keybuf_del(buf, w); - spin_unlock(&buf->lock); - } -} - -void bch_keybuf_recalc_oldest_gens(struct bch_fs *c, struct keybuf *buf) -{ - struct keybuf_key *w, *n; - - spin_lock(&buf->lock); - rbtree_postorder_for_each_entry_safe(w, n, - 
&buf->keys, node) - bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(&w->key)); - spin_unlock(&buf->lock); -} - -bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bpos start, - struct bpos end) -{ - bool ret = false; - struct keybuf_key *w, *next, s = { .key.k.p = start }; - - if (bkey_cmp(end, buf->start) <= 0 || - bkey_cmp(start, buf->end) >= 0) - return false; - - spin_lock(&buf->lock); - - for (w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); - w && bkey_cmp(bkey_start_pos(&w->key.k), end) < 0; - w = next) { - next = RB_NEXT(w, node); - - if (atomic_read(&w->ref) == -1) - bch_keybuf_del(buf, w); - else - ret = true; - } - - spin_unlock(&buf->lock); - return ret; -} - -struct keybuf_key *bch_keybuf_next(struct keybuf *buf) -{ - struct keybuf_key *w; - - spin_lock(&buf->lock); - - w = RB_FIRST(&buf->keys, struct keybuf_key, node); - - while (w && atomic_read(&w->ref) != -1) - w = RB_NEXT(w, node); - - if (!w) { - spin_unlock(&buf->lock); - return NULL; - } - - atomic_set(&w->ref, 1); - spin_unlock(&buf->lock); - - down(&buf->in_flight); - - return w; -} - -void bch_keybuf_init(struct keybuf *buf) -{ - sema_init(&buf->in_flight, KEYBUF_REFILL_BATCH / 2); - - buf->last_scanned = POS_MAX; - buf->start = POS_MIN; - buf->end = POS_MIN; - - buf->keys = RB_ROOT; - - spin_lock_init(&buf->lock); - array_allocator_init(&buf->freelist); -} diff --git a/libbcache/keybuf.h b/libbcache/keybuf.h deleted file mode 100644 index dd1402d3..00000000 --- a/libbcache/keybuf.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _BCACHE_KEYBUF_H -#define _BCACHE_KEYBUF_H - -#include "keybuf_types.h" - -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey_s_c); - -void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct bch_fs *, struct keybuf *, - struct bpos, keybuf_pred_fn *); -void bch_keybuf_recalc_oldest_gens(struct bch_fs *, struct keybuf *); -bool bch_keybuf_check_overlapping(struct keybuf *, struct bpos, struct bpos); -void bch_keybuf_put(struct keybuf *, struct keybuf_key *); -struct keybuf_key *bch_keybuf_next(struct keybuf *); - -#endif /* _BCACHE_KEYBUF_H */ diff --git a/libbcache/keybuf_types.h b/libbcache/keybuf_types.h deleted file mode 100644 index 3facc4a0..00000000 --- a/libbcache/keybuf_types.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _BCACHE_KEYBUF_TYPES_H -#define _BCACHE_KEYBUF_TYPES_H - -struct keybuf_key { - struct rb_node node; - BKEY_PADDED(key); - atomic_t ref; -}; - -#define KEYBUF_REFILL_BATCH 500 - -struct keybuf { - struct bpos last_scanned; - spinlock_t lock; - - /* - * Beginning and end of range in rb tree - so that we can skip taking - * lock and checking the rb tree when we need to check for overlapping - * keys. - */ - struct bpos start; - struct bpos end; - - struct rb_root keys; - - unsigned max_in_flight; - struct semaphore in_flight; - - DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, - KEYBUF_REFILL_BATCH); -}; - -#endif /* _BCACHE_KEYBUF_TYPES_H */ diff --git a/libbcache/keylist.c b/libbcache/keylist.c deleted file mode 100644 index adf5eeba..00000000 --- a/libbcache/keylist.c +++ /dev/null @@ -1,55 +0,0 @@ - -#include "bcache.h" -#include "keylist.h" - -int bch_keylist_realloc(struct keylist *l, u64 *inline_u64s, - size_t nr_inline_u64s, size_t new_u64s) -{ - size_t oldsize = bch_keylist_u64s(l); - size_t newsize = oldsize + new_u64s; - u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; - u64 *new_keys; - - newsize = roundup_pow_of_two(newsize); - - if (newsize <= nr_inline_u64s || - (old_buf && roundup_pow_of_two(oldsize) == newsize)) - return 0; - - new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); - if (!new_keys) - return -ENOMEM; - - if (!old_buf) - memcpy_u64s(new_keys, inline_u64s, oldsize); - - l->keys_p = new_keys; - l->top_p = new_keys + oldsize; - - return 0; -} - -void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) -{ - struct bkey_i *where; - - for_each_keylist_key(l, where) - if (bkey_cmp(insert->k.p, where->k.p) < 0) - break; - - memmove_u64s_up((u64 *) where + insert->k.u64s, - where, - ((u64 *) l->top) - ((u64 *) where)); - - l->top_p += insert->k.u64s; - bkey_copy(where, insert); -} - -void bch_keylist_pop_front(struct keylist *l) -{ - l->top_p -= bch_keylist_front(l)->k.u64s; - - memmove_u64s_down(l->keys, - bkey_next(l->keys), - bch_keylist_u64s(l)); -} diff --git a/libbcache/keylist.h b/libbcache/keylist.h deleted file mode 100644 index 1166f941..00000000 --- a/libbcache/keylist.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef _BCACHE_KEYLIST_H -#define _BCACHE_KEYLIST_H - -#include "keylist_types.h" - -int bch_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -void bch_keylist_add_in_order(struct keylist *, struct bkey_i *); -void bch_keylist_pop_front(struct keylist *); - -static inline void bch_keylist_init(struct keylist *l, u64 *inline_keys, - size_t nr_inline_u64s) -{ - l->top_p = l->keys_p = inline_keys; -} - -static inline void bch_keylist_free(struct keylist *l, u64 *inline_keys) -{ - if (l->keys_p != inline_keys) - kfree(l->keys_p); - memset(l, 0, sizeof(*l)); -} - -static inline void bch_keylist_push(struct keylist *l) -{ - l->top = bkey_next(l->top); -} - -static inline void bch_keylist_add(struct keylist *l, const struct bkey_i *k) -{ - bkey_copy(l->top, k); - bch_keylist_push(l); -} - -static inline bool bch_keylist_empty(struct keylist *l) -{ - return l->top == l->keys; -} - -static inline size_t bch_keylist_u64s(struct keylist *l) -{ - return l->top_p - l->keys_p; -} - -static inline size_t bch_keylist_bytes(struct keylist *l) -{ - return bch_keylist_u64s(l) * sizeof(u64); -} - -static inline struct bkey_i *bch_keylist_front(struct keylist *l) -{ - return l->keys; -} - -#define for_each_keylist_key(_keylist, _k) \ - for (_k = (_keylist)->keys; \ - _k != (_keylist)->top; \ - _k = bkey_next(_k)) - -#define keylist_single(k) \ - ((struct keylist) { .keys = k, .top = bkey_next(k) }) - -#endif /* _BCACHE_KEYLIST_H */ diff --git a/libbcache/keylist_types.h b/libbcache/keylist_types.h deleted file mode 100644 index 195785bf..00000000 --- a/libbcache/keylist_types.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _BCACHE_KEYLIST_TYPES_H -#define _BCACHE_KEYLIST_TYPES_H - -struct keylist { - union { - struct bkey_i *keys; - u64 *keys_p; - }; - union { - struct bkey_i *top; - u64 *top_p; - }; -}; - -#endif /* _BCACHE_KEYLIST_TYPES_H */ diff --git a/libbcache/migrate.c b/libbcache/migrate.c deleted file mode 100644 index 9ef9685e..00000000 --- a/libbcache/migrate.c +++ /dev/null @@ -1,395 +0,0 @@ -/* - * Code for moving data off a device. 
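The keybuf interface above (keybuf.c and keybuf.h) is easiest to see from the consumer side. The following is a minimal sketch, not code from the tree: pick_large_extents() and example_walk_extents() are made-up names and the predicate is arbitrary. It shows the lifecycle the comments describe -- bch_refill_keybuf() scans the extents btree from buf->last_scanned applying the predicate, bch_keybuf_next() hands out one buffered key at a time (a key's ref stays at -1 until it is claimed), and bch_keybuf_put() releases it once the caller is done. This is the same interface the writeback path further down checks with bch_keybuf_check_overlapping().

/* Example predicate: buffer only extents of at least 8 sectors. */
static bool pick_large_extents(struct keybuf *buf, struct bkey_s_c k)
{
	return k.k->size >= 8;
}

static void example_walk_extents(struct bch_fs *c, struct keybuf *buf)
{
	struct keybuf_key *w;

	bch_keybuf_init(buf);
	buf->last_scanned = POS_MIN;	/* init() leaves this at POS_MAX */

	while (bkey_cmp(buf->last_scanned, POS_MAX) < 0) {
		/* Fill the buffer with matching extents, up to the end of the keyspace: */
		bch_refill_keybuf(c, buf, POS_MAX, pick_large_extents);

		while ((w = bch_keybuf_next(buf))) {
			/* ... issue IO against w->key here ... */
			bch_keybuf_put(buf, w);	/* drops the ref, frees the entry */
		}
	}
}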
- */ - -#include "bcache.h" -#include "btree_update.h" -#include "buckets.h" -#include "extents.h" -#include "io.h" -#include "journal.h" -#include "keylist.h" -#include "migrate.h" -#include "move.h" -#include "super-io.h" - -static int issue_migration_move(struct bch_dev *ca, - struct moving_context *ctxt, - struct bkey_s_c k) -{ - struct bch_fs *c = ca->fs; - struct disk_reservation res; - const struct bch_extent_ptr *ptr; - int ret; - - if (bch_disk_reservation_get(c, &res, k.k->size, 0)) - return -ENOSPC; - - extent_for_each_ptr(bkey_s_c_to_extent(k), ptr) - if (ptr->dev == ca->dev_idx) - goto found; - - BUG(); -found: - /* XXX: we need to be doing something with the disk reservation */ - - ret = bch_data_move(c, ctxt, &c->migration_write_point, k, ptr); - if (ret) - bch_disk_reservation_put(c, &res); - return ret; -} - -#define MAX_DATA_OFF_ITER 10 - -/* - * This moves only the data off, leaving the meta-data (if any) in place. - * It walks the key space, and for any key with a valid pointer to the - * relevant device, it copies it elsewhere, updating the key to point to - * the copy. - * The meta-data is moved off by bch_move_meta_data_off_device. - * - * Note: If the number of data replicas desired is > 1, ideally, any - * new copies would not be made in the same device that already have a - * copy (if there are enough devices). - * This is _not_ currently implemented. The multiple replicas can - * land in the same device even if there are others available. - */ - -int bch_move_data_off_device(struct bch_dev *ca) -{ - struct moving_context ctxt; - struct bch_fs *c = ca->fs; - struct bch_sb_field_members *mi; - unsigned pass = 0; - u64 seen_key_count; - int ret = 0; - - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - - if (!ca->mi.has_data) - return 0; - - bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); - ctxt.avoid = ca; - - /* - * In theory, only one pass should be necessary as we've - * quiesced all writes before calling this. - * - * However, in practice, more than one pass may be necessary: - * - Some move fails due to an error. We can can find this out - * from the moving_context. - * - Some key swap failed because some of the pointers in the - * key in the tree changed due to caching behavior, btree gc - * pruning stale pointers, or tiering (if the device being - * removed is in tier 0). A smarter bkey_cmpxchg would - * handle these cases. - * - * Thus this scans the tree one more time than strictly necessary, - * but that can be viewed as a verification pass. 
- */ - - do { - struct btree_iter iter; - struct bkey_s_c k; - - seen_key_count = 0; - atomic_set(&ctxt.error_count, 0); - atomic_set(&ctxt.error_flags, 0); - - bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - - while (!bch_move_ctxt_wait(&ctxt) && - (k = bch_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { - if (!bkey_extent_is_data(k.k) || - !bch_extent_has_device(bkey_s_c_to_extent(k), - ca->dev_idx)) - goto next; - - ret = issue_migration_move(ca, &ctxt, k); - if (ret == -ENOMEM) { - bch_btree_iter_unlock(&iter); - - /* - * memory allocation failure, wait for some IO - * to finish - */ - bch_move_ctxt_wait_for_io(&ctxt); - continue; - } - if (ret == -ENOSPC) - break; - BUG_ON(ret); - - seen_key_count++; -next: - bch_btree_iter_advance_pos(&iter); - bch_btree_iter_cond_resched(&iter); - - } - bch_btree_iter_unlock(&iter); - bch_move_ctxt_exit(&ctxt); - - if (ret) - return ret; - } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER); - - if (seen_key_count) { - pr_err("Unable to migrate all data in %d iterations.", - MAX_DATA_OFF_ITER); - return -1; - } - - mutex_lock(&c->sb_lock); - mi = bch_sb_get_members(c->disk_sb); - SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false); - - bch_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -/* - * This walks the btree, and for any node on the relevant device it moves the - * node elsewhere. - */ -static int bch_move_btree_off(struct bch_dev *ca, enum btree_id id) -{ - struct bch_fs *c = ca->fs; - struct btree_iter iter; - struct closure cl; - struct btree *b; - int ret; - - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - - closure_init_stack(&cl); - - for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); -retry: - if (!bch_extent_has_device(e, ca->dev_idx)) - continue; - - ret = bch_btree_node_rewrite(&iter, b, &cl); - if (ret == -EINTR || ret == -ENOSPC) { - /* - * Drop locks to upgrade locks or wait on - * reserve: after retaking, recheck in case we - * raced. - */ - bch_btree_iter_unlock(&iter); - closure_sync(&cl); - b = bch_btree_iter_peek_node(&iter); - goto retry; - } - if (ret) { - bch_btree_iter_unlock(&iter); - return ret; - } - - bch_btree_iter_set_locks_want(&iter, 0); - } - ret = bch_btree_iter_unlock(&iter); - if (ret) - return ret; /* btree IO error */ - - if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { - for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - - BUG_ON(bch_extent_has_device(e, ca->dev_idx)); - } - bch_btree_iter_unlock(&iter); - } - - return 0; -} - -/* - * This moves only the meta-data off, leaving the data (if any) in place. - * The data is moved off by bch_move_data_off_device, if desired, and - * called first. - * - * Before calling this, allocation of buckets to the device must have - * been disabled, as else we'll continue to write meta-data to the device - * when new buckets are picked for meta-data writes. - * In addition, the copying gc and allocator threads for the device - * must have been stopped. The allocator thread is the only thread - * that writes prio/gen information. - * - * Meta-data consists of: - * - Btree nodes - * - Prio/gen information - * - Journal entries - * - Superblock - * - * This has to move the btree nodes and the journal only: - * - prio/gen information is not written once the allocator thread is stopped. - * also, as the prio/gen information is per-device it is not moved. 
- * - the superblock will be written by the caller once after everything - * is stopped. - * - * Note that currently there is no way to stop btree node and journal - * meta-data writes to a device without moving the meta-data because - * once a bucket is open for a btree node, unless a replacement btree - * node is allocated (and the tree updated), the bucket will continue - * to be written with updates. Similarly for the journal (it gets - * written until filled). - * - * This routine leaves the data (if any) in place. Whether the data - * should be moved off is a decision independent of whether the meta - * data should be moved off and stopped: - * - * - For device removal, both data and meta-data are moved off, in - * that order. - * - * - However, for turning a device read-only without removing it, only - * meta-data is moved off since that's the only way to prevent it - * from being written. Data is left in the device, but no new data - * is written. - */ - -int bch_move_metadata_off_device(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_sb_field_members *mi; - unsigned i; - int ret; - - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - - if (!ca->mi.has_metadata) - return 0; - - /* 1st, Move the btree nodes off the device */ - - for (i = 0; i < BTREE_ID_NR; i++) { - ret = bch_move_btree_off(ca, i); - if (ret) - return ret; - } - - /* There are no prios/gens to move -- they are already in the device. */ - - /* 2nd. Move the journal off the device */ - - ret = bch_journal_move(ca); - if (ret) - return ret; - - mutex_lock(&c->sb_lock); - mi = bch_sb_get_members(c->disk_sb); - SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false); - - bch_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -/* - * Flagging data bad when forcibly removing a device after failing to - * migrate the data off the device. - */ - -static int bch_flag_key_bad(struct btree_iter *iter, - struct bch_dev *ca, - struct bkey_s_c_extent orig) -{ - BKEY_PADDED(key) tmp; - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - struct bch_fs *c = ca->fs; - - bkey_reassemble(&tmp.key, orig.s_c); - e = bkey_i_to_s_extent(&tmp.key); - - extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == ca->dev_idx) - bch_extent_drop_ptr(e, ptr); - - /* - * If the new extent no longer has any pointers, bch_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_ERROR key, or just a discard if it was a cached extent) - */ - bch_extent_normalize(c, e.s); - - return bch_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(iter, &tmp.key)); -} - -/* - * This doesn't actually move any data -- it marks the keys as bad - * if they contain a pointer to a device that is forcibly removed - * and don't have other valid pointers. If there are valid pointers, - * the necessary pointers to the removed device are replaced with - * bad pointers instead. - * - * This is only called if bch_move_data_off_device above failed, meaning - * that we've already tried to move the data MAX_DATA_OFF_ITER times and - * are not likely to succeed if we try again. 
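The two comments above fix the order of operations: data is drained first, then metadata, with bch_flag_data_bad() below as the fallback for forcible removal when the data migration cannot finish. A minimal sketch of a caller following that order -- example_evacuate_device() and its force flag are illustrative only, not the tree's actual removal path, which also has to stop the allocator and copygc threads for the device first:

static int example_evacuate_device(struct bch_dev *ca, bool force)
{
	int ret;

	ret = bch_move_data_off_device(ca);	/* may take several passes */
	if (ret && force)
		ret = bch_flag_data_bad(ca);	/* drop the remaining pointers to this device */
	if (ret)
		return ret;

	/* Only once the data is gone: btree nodes, then the journal. */
	return bch_move_metadata_off_device(ca);
}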
- */ -int bch_flag_data_bad(struct bch_dev *ca) -{ - int ret = 0; - struct bkey_s_c k; - struct bkey_s_c_extent e; - struct btree_iter iter; - - bch_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS, POS_MIN); - - while ((k = bch_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { - if (!bkey_extent_is_data(k.k)) - goto advance; - - e = bkey_s_c_to_extent(k); - if (!bch_extent_has_device(e, ca->dev_idx)) - goto advance; - - ret = bch_flag_key_bad(&iter, ca, e); - - /* - * don't want to leave ret == -EINTR, since if we raced and - * something else overwrote the key we could spuriously return - * -EINTR below: - */ - if (ret == -EINTR) - ret = 0; - if (ret) - break; - - /* - * If the replica we're dropping was dirty and there is an - * additional cached replica, the cached replica will now be - * considered dirty - upon inserting the new version of the key, - * the bucket accounting will be updated to reflect the fact - * that the cached data is now dirty and everything works out as - * if by magic without us having to do anything. - * - * The one thing we need to be concerned with here is there's a - * race between when we drop any stale pointers from the key - * we're about to insert, and when the key actually gets - * inserted and the cached data is marked as dirty - we could - * end up trying to insert a key with a pointer that should be - * dirty, but points to stale data. - * - * If that happens the insert code just bails out and doesn't do - * the insert - however, it doesn't return an error. Hence we - * need to always recheck the current key before advancing to - * the next: - */ - continue; -advance: - bch_btree_iter_advance_pos(&iter); - } - - bch_btree_iter_unlock(&iter); - - return ret; -} diff --git a/libbcache/migrate.h b/libbcache/migrate.h deleted file mode 100644 index c6a056cb..00000000 --- a/libbcache/migrate.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _BCACHE_MIGRATE_H -#define _BCACHE_MIGRATE_H - -int bch_move_data_off_device(struct bch_dev *); -int bch_move_metadata_off_device(struct bch_dev *); -int bch_flag_data_bad(struct bch_dev *); - -#endif /* _BCACHE_MIGRATE_H */ diff --git a/libbcache/move.c b/libbcache/move.c deleted file mode 100644 index edee726c..00000000 --- a/libbcache/move.c +++ /dev/null @@ -1,392 +0,0 @@ - -#include "bcache.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "buckets.h" -#include "io.h" -#include "move.h" -#include "super-io.h" -#include "keylist.h" - -#include <linux/ioprio.h> - -#include <trace/events/bcache.h> - -static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c, - struct bkey_s_extent e, - struct bch_extent_ptr ptr) -{ - struct bch_extent_ptr *ptr2; - unsigned bucket_bits = c->devs[ptr.dev]->bucket_bits; - - extent_for_each_ptr(e, ptr2) - if (ptr2->dev == ptr.dev && - ptr2->gen == ptr.gen && - (ptr2->offset >> bucket_bits) == - (ptr.offset >> bucket_bits)) - return ptr2; - - return NULL; -} - -static struct bch_extent_ptr *bch_migrate_matching_ptr(struct migrate_write *m, - struct bkey_s_extent e) -{ - const struct bch_extent_ptr *ptr; - struct bch_extent_ptr *ret; - - if (m->move) - ret = bkey_find_ptr(m->op.c, e, m->move_ptr); - else - extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr) - if ((ret = bkey_find_ptr(m->op.c, e, *ptr))) - break; - - return ret; -} - -static int bch_migrate_index_update(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct migrate_write *m = - container_of(op, struct migrate_write, op); - struct keylist *keys = &op->insert_keys; - struct btree_iter iter; - int 
ret = 0; - - bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch_keylist_front(keys)->k)); - - while (1) { - struct bkey_s_extent insert = - bkey_i_to_s_extent(bch_keylist_front(keys)); - struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter); - struct bch_extent_ptr *ptr; - struct bkey_s_extent e; - BKEY_PADDED(k) new; - - if (!k.k) { - ret = bch_btree_iter_unlock(&iter); - break; - } - - if (!bkey_extent_is_data(k.k)) - goto nomatch; - - bkey_reassemble(&new.k, k); - bch_cut_front(iter.pos, &new.k); - bch_cut_back(insert.k->p, &new.k.k); - e = bkey_i_to_s_extent(&new.k); - - /* hack - promotes can race: */ - if (m->promote) - extent_for_each_ptr(insert, ptr) - if (bch_extent_has_device(e.c, ptr->dev)) - goto nomatch; - - ptr = bch_migrate_matching_ptr(m, e); - if (ptr) { - int nr_new_dirty = bch_extent_nr_dirty_ptrs(insert.s_c); - unsigned insert_flags = - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL; - - /* copygc uses btree node reserve: */ - if (m->move) - insert_flags |= BTREE_INSERT_USE_RESERVE; - - if (m->move) { - nr_new_dirty -= !ptr->cached; - __bch_extent_drop_ptr(e, ptr); - } - - BUG_ON(nr_new_dirty < 0); - - memcpy_u64s(extent_entry_last(e), - insert.v, - bkey_val_u64s(insert.k)); - e.k->u64s += bkey_val_u64s(insert.k); - - bch_extent_narrow_crcs(e); - bch_extent_drop_redundant_crcs(e); - bch_extent_normalize(c, e.s); - bch_extent_mark_replicas_cached(c, e, nr_new_dirty); - - ret = bch_btree_insert_at(c, &op->res, - NULL, op_journal_seq(op), - insert_flags, - BTREE_INSERT_ENTRY(&iter, &new.k)); - if (ret && ret != -EINTR) - break; - } else { -nomatch: - bch_btree_iter_advance_pos(&iter); - } - - while (bkey_cmp(iter.pos, bch_keylist_front(keys)->k.p) >= 0) { - bch_keylist_pop_front(keys); - if (bch_keylist_empty(keys)) - goto out; - } - - bch_cut_front(iter.pos, bch_keylist_front(keys)); - } -out: - bch_btree_iter_unlock(&iter); - return ret; -} - -void bch_migrate_write_init(struct bch_fs *c, - struct migrate_write *m, - struct write_point *wp, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr, - unsigned flags) -{ - bkey_reassemble(&m->key, k); - - m->promote = false; - m->move = move_ptr != NULL; - if (move_ptr) - m->move_ptr = *move_ptr; - - if (bkey_extent_is_cached(k.k) || - (move_ptr && move_ptr->cached)) - flags |= BCH_WRITE_CACHED; - - bch_write_op_init(&m->op, c, &m->wbio, - (struct disk_reservation) { 0 }, - wp, - bkey_start_pos(k.k), - NULL, flags); - - if (m->move) - m->op.alloc_reserve = RESERVE_MOVINGGC; - - m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k)); - m->op.nr_replicas = 1; - m->op.index_update_fn = bch_migrate_index_update; -} - -static void migrate_bio_init(struct moving_io *io, struct bio *bio, - unsigned sectors) -{ - bio_init(bio); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - bio->bi_iter.bi_size = sectors << 9; - bio->bi_max_vecs = DIV_ROUND_UP(sectors, PAGE_SECTORS); - bio->bi_private = &io->cl; - bio->bi_io_vec = io->bi_inline_vecs; - bch_bio_map(bio, NULL); -} - -static void moving_io_destructor(struct closure *cl) -{ - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->ctxt; - struct bio_vec *bv; - int i; - - //if (io->replace.failures) - // trace_bcache_copy_collision(q, &io->key.k); - - atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); - wake_up(&ctxt->wait); - - bio_for_each_segment_all(bv, &io->write.wbio.bio, i) - if (bv->bv_page) - __free_page(bv->bv_page); - - kfree(io); -} - -static void moving_error(struct 
moving_context *ctxt, unsigned flag) -{ - atomic_inc(&ctxt->error_count); - //atomic_or(flag, &ctxt->error_flags); -} - -static void moving_io_after_write(struct closure *cl) -{ - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->ctxt; - - if (io->write.op.error) - moving_error(ctxt, MOVING_FLAG_WRITE); - - moving_io_destructor(cl); -} - -static void write_moving(struct moving_io *io) -{ - struct bch_write_op *op = &io->write.op; - - if (op->error) { - closure_return_with_destructor(&io->cl, moving_io_destructor); - } else { - closure_call(&op->cl, bch_write, NULL, &io->cl); - closure_return_with_destructor(&io->cl, moving_io_after_write); - } -} - -static inline struct moving_io *next_pending_write(struct moving_context *ctxt) -{ - struct moving_io *io = - list_first_entry_or_null(&ctxt->reads, struct moving_io, list); - - return io && io->read_completed ? io : NULL; -} - -static void read_moving_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->ctxt; - - trace_bcache_move_read_done(&io->write.key.k); - - if (bio->bi_error) { - io->write.op.error = bio->bi_error; - moving_error(io->ctxt, MOVING_FLAG_READ); - } - - io->read_completed = true; - if (next_pending_write(ctxt)) - wake_up(&ctxt->wait); - - closure_put(&ctxt->cl); -} - -static void __bch_data_move(struct closure *cl) -{ - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct bch_fs *c = io->write.op.c; - struct extent_pick_ptr pick; - - bch_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key), - io->ctxt->avoid, &pick); - if (IS_ERR_OR_NULL(pick.ca)) - closure_return_with_destructor(cl, moving_io_destructor); - - bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k); - io->rbio.bio.bi_end_io = read_moving_endio; - - /* - * dropped by read_moving_endio() - guards against use after free of - * ctxt when doing wakeup - */ - closure_get(&io->ctxt->cl); - - bch_read_extent(c, &io->rbio, - bkey_i_to_s_c(&io->write.key), - &pick, BCH_READ_IS_LAST); -} - -int bch_data_move(struct bch_fs *c, - struct moving_context *ctxt, - struct write_point *wp, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr) -{ - struct moving_io *io; - - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * - DIV_ROUND_UP(k.k->size, PAGE_SECTORS), - GFP_KERNEL); - if (!io) - return -ENOMEM; - - io->ctxt = ctxt; - - migrate_bio_init(io, &io->rbio.bio, k.k->size); - - if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { - kfree(io); - return -ENOMEM; - } - - migrate_bio_init(io, &io->write.wbio.bio, k.k->size); - bio_get(&io->write.wbio.bio); - io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - - bch_migrate_write_init(c, &io->write, wp, k, move_ptr, 0); - - trace_bcache_move_read(&io->write.key.k); - - ctxt->keys_moved++; - ctxt->sectors_moved += k.k->size; - if (ctxt->rate) - bch_ratelimit_increment(ctxt->rate, k.k->size); - - atomic_add(k.k->size, &ctxt->sectors_in_flight); - list_add_tail(&io->list, &ctxt->reads); - - closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); - return 0; -} - -static void do_pending_writes(struct moving_context *ctxt) -{ - struct moving_io *io; - - while ((io = next_pending_write(ctxt))) { - list_del(&io->list); - trace_bcache_move_write(&io->write.key.k); - write_moving(io); - } -} - -#define move_ctxt_wait_event(_ctxt, _cond) \ -do { \ - 
do_pending_writes(_ctxt); \ - \ - if (_cond) \ - break; \ - __wait_event((_ctxt)->wait, \ - next_pending_write(_ctxt) || (_cond)); \ -} while (1) - -int bch_move_ctxt_wait(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->sectors_in_flight) < - ctxt->max_sectors_in_flight); - - return ctxt->rate - ? bch_ratelimit_wait_freezable_stoppable(ctxt->rate) - : 0; -} - -void bch_move_ctxt_wait_for_io(struct moving_context *ctxt) -{ - unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight); - - move_ctxt_wait_event(ctxt, - !atomic_read(&ctxt->sectors_in_flight) || - atomic_read(&ctxt->sectors_in_flight) != sectors_pending); -} - -void bch_move_ctxt_exit(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight)); - closure_sync(&ctxt->cl); - - EBUG_ON(!list_empty(&ctxt->reads)); - EBUG_ON(atomic_read(&ctxt->sectors_in_flight)); -} - -void bch_move_ctxt_init(struct moving_context *ctxt, - struct bch_ratelimit *rate, - unsigned max_sectors_in_flight) -{ - memset(ctxt, 0, sizeof(*ctxt)); - closure_init_stack(&ctxt->cl); - - ctxt->rate = rate; - ctxt->max_sectors_in_flight = max_sectors_in_flight; - - INIT_LIST_HEAD(&ctxt->reads); - init_waitqueue_head(&ctxt->wait); -} diff --git a/libbcache/move.h b/libbcache/move.h deleted file mode 100644 index 317431d6..00000000 --- a/libbcache/move.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef _BCACHE_MOVE_H -#define _BCACHE_MOVE_H - -#include "buckets.h" -#include "io_types.h" -#include "move_types.h" - -enum moving_flag_bitnos { - MOVING_FLAG_BITNO_READ = 0, - MOVING_FLAG_BITNO_WRITE, -}; - -#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ) -#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE) - -struct migrate_write { - BKEY_PADDED(key); - bool promote; - bool move; - struct bch_extent_ptr move_ptr; - struct bch_write_op op; - struct bch_write_bio wbio; -}; - -void bch_migrate_write_init(struct bch_fs *, - struct migrate_write *, - struct write_point *, - struct bkey_s_c, - const struct bch_extent_ptr *, - unsigned); - -#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 - -struct moving_context { - /* Closure for waiting on all reads and writes to complete */ - struct closure cl; - - /* Number and types of errors reported */ - atomic_t error_count; - atomic_t error_flags; - - /* Key and sector moves issued, updated from submission context */ - u64 keys_moved; - u64 sectors_moved; - - /* Rate-limiter counting submitted reads */ - struct bch_ratelimit *rate; - - /* Try to avoid reading the following device */ - struct bch_dev *avoid; - - struct list_head reads; - - /* Configuration */ - unsigned max_sectors_in_flight; - atomic_t sectors_in_flight; - - wait_queue_head_t wait; -}; - -struct moving_io { - struct list_head list; - struct rb_node node; - struct closure cl; - struct moving_context *ctxt; - struct migrate_write write; - bool read_completed; - - struct bch_read_bio rbio; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; -}; - -int bch_data_move(struct bch_fs *, - struct moving_context *, - struct write_point *, - struct bkey_s_c, - const struct bch_extent_ptr *); - -int bch_move_ctxt_wait(struct moving_context *); -void bch_move_ctxt_wait_for_io(struct moving_context *); - -void bch_move_ctxt_exit(struct moving_context *); -void bch_move_ctxt_init(struct moving_context *, struct bch_ratelimit *, - unsigned); - -#endif /* _BCACHE_MOVE_H */ diff --git a/libbcache/move_types.h b/libbcache/move_types.h deleted file mode 100644 index 
0e2275e2..00000000 --- a/libbcache/move_types.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _BCACHE_MOVE_TYPES_H -#define _BCACHE_MOVE_TYPES_H - -#endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c deleted file mode 100644 index 9bb2b7a4..00000000 --- a/libbcache/movinggc.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Moving/copying garbage collector - * - * Copyright 2012 Google, Inc. - */ - -#include "bcache.h" -#include "btree_iter.h" -#include "buckets.h" -#include "clock.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" -#include "move.h" -#include "movinggc.h" - -#include <trace/events/bcache.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/wait.h> - -/* Moving GC - IO loop */ - -static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca, - struct bkey_s_c k) -{ - const struct bch_extent_ptr *ptr; - - if (bkey_extent_is_data(k.k) && - (ptr = bch_extent_has_device(bkey_s_c_to_extent(k), - ca->dev_idx)) && - PTR_BUCKET(ca, ptr)->mark.copygc) - return ptr; - - return NULL; -} - -static int issue_moving_gc_move(struct bch_dev *ca, - struct moving_context *ctxt, - struct bkey_s_c k) -{ - struct bch_fs *c = ca->fs; - const struct bch_extent_ptr *ptr; - int ret; - - ptr = moving_pred(ca, k); - if (!ptr) /* We raced - bucket's been reused */ - return 0; - - ret = bch_data_move(c, ctxt, &ca->copygc_write_point, k, ptr); - if (!ret) - trace_bcache_gc_copy(k.k); - else - trace_bcache_moving_gc_alloc_fail(c, k.k->size); - return ret; -} - -static void read_moving(struct bch_dev *ca, size_t buckets_to_move, - u64 sectors_to_move) -{ - struct bch_fs *c = ca->fs; - struct bucket *g; - struct moving_context ctxt; - struct btree_iter iter; - struct bkey_s_c k; - u64 sectors_not_moved = 0; - size_t buckets_not_moved = 0; - - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - bch_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate, - SECTORS_IN_FLIGHT_PER_DEVICE); - bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - - while (1) { - if (kthread_should_stop()) - goto out; - if (bch_move_ctxt_wait(&ctxt)) - goto out; - k = bch_btree_iter_peek(&iter); - if (!k.k) - break; - if (btree_iter_err(k)) - goto out; - - if (!moving_pred(ca, k)) - goto next; - - if (issue_moving_gc_move(ca, &ctxt, k)) { - bch_btree_iter_unlock(&iter); - - /* memory allocation failure, wait for some IO to finish */ - bch_move_ctxt_wait_for_io(&ctxt); - continue; - } -next: - bch_btree_iter_advance_pos(&iter); - //bch_btree_iter_cond_resched(&iter); - - /* unlock before calling moving_context_wait() */ - bch_btree_iter_unlock(&iter); - cond_resched(); - } - - bch_btree_iter_unlock(&iter); - bch_move_ctxt_exit(&ctxt); - trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); - - /* don't check this if we bailed out early: */ - for_each_bucket(g, ca) - if (g->mark.copygc && bucket_sectors_used(g)) { - sectors_not_moved += bucket_sectors_used(g); - buckets_not_moved++; - } - - if (sectors_not_moved) - bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved", - sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move); - return; -out: - bch_btree_iter_unlock(&iter); - bch_move_ctxt_exit(&ctxt); - trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); -} - -static bool have_copygc_reserve(struct bch_dev *ca) -{ - bool ret; - - spin_lock(&ca->freelist_lock); - ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >= - COPYGC_BUCKETS_PER_ITER(ca); - 
spin_unlock(&ca->freelist_lock); - - return ret; -} - -static void bch_moving_gc(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bucket *g; - struct bucket_mark new; - u64 sectors_to_move; - size_t buckets_to_move, buckets_unused = 0; - struct bucket_heap_entry e; - unsigned sectors_used, i; - int reserve_sectors; - - if (!have_copygc_reserve(ca)) { - struct closure cl; - - closure_init_stack(&cl); - while (1) { - closure_wait(&c->freelist_wait, &cl); - if (have_copygc_reserve(ca)) - break; - closure_sync(&cl); - } - closure_wake_up(&c->freelist_wait); - } - - reserve_sectors = COPYGC_SECTORS_PER_ITER(ca); - - trace_bcache_moving_gc_start(ca); - - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. - */ - - /* - * We need bucket marks to be up to date, so gc can't be recalculating - * them, and we don't want the allocator invalidating a bucket after - * we've decided to evacuate it but before we set copygc: - */ - down_read(&c->gc_lock); - mutex_lock(&ca->heap_lock); - mutex_lock(&ca->fs->bucket_lock); - - ca->heap.used = 0; - for_each_bucket(g, ca) { - bucket_cmpxchg(g, new, new.copygc = 0); - - if (bucket_unused(g)) { - buckets_unused++; - continue; - } - - if (g->mark.owned_by_allocator || - g->mark.data_type != BUCKET_DATA) - continue; - - sectors_used = bucket_sectors_used(g); - - if (sectors_used >= ca->mi.bucket_size) - continue; - - bucket_heap_push(ca, g, sectors_used); - } - - sectors_to_move = 0; - for (i = 0; i < ca->heap.used; i++) - sectors_to_move += ca->heap.data[i].val; - - while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp)); - sectors_to_move -= e.val; - } - - for (i = 0; i < ca->heap.used; i++) - bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1); - - buckets_to_move = ca->heap.used; - - mutex_unlock(&ca->fs->bucket_lock); - mutex_unlock(&ca->heap_lock); - up_read(&c->gc_lock); - - read_moving(ca, buckets_to_move, sectors_to_move); -} - -static int bch_moving_gc_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last; - u64 available, want, next; - - set_freezable(); - - while (!kthread_should_stop()) { - if (kthread_wait_freezable(c->copy_gc_enabled)) - break; - - last = atomic_long_read(&clock->now); - /* - * don't start copygc until less than half the gc reserve is - * available: - */ - available = dev_buckets_available(ca); - want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * - c->opts.gc_reserve_percent, 200); - if (available > want) { - next = last + (available - want) * - ca->mi.bucket_size; - bch_kthread_io_clock_wait(clock, next); - continue; - } - - bch_moving_gc(ca); - } - - return 0; -} - -void bch_moving_gc_stop(struct bch_dev *ca) -{ - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; -} - -int bch_moving_gc_start(struct bch_dev *ca) -{ - struct task_struct *t; - - BUG_ON(ca->moving_gc_read); - - if (ca->fs->opts.nochanges) - return 0; - - if (bch_fs_init_fault("moving_gc_start")) - return -ENOMEM; - - t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read"); - if (IS_ERR(t)) - return PTR_ERR(t); - - ca->moving_gc_read = t; - wake_up_process(ca->moving_gc_read); - - return 0; -} - -void 
bch_dev_moving_gc_init(struct bch_dev *ca) -{ - bch_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; -} diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h deleted file mode 100644 index 5afbf34f..00000000 --- a/libbcache/movinggc.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _BCACHE_MOVINGGC_H -#define _BCACHE_MOVINGGC_H - -/* - * We can't use the entire copygc reserve in one iteration of copygc: we may - * need the buckets we're freeing up to go back into the copygc reserve to make - * forward progress, but if the copygc reserve is full they'll be available for - * any allocation - and it's possible that in a given iteration, we free up most - * of the buckets we're going to free before we allocate most of the buckets - * we're going to allocate. - * - * If we only use half of the reserve per iteration, then in steady state we'll - * always have room in the reserve for the buckets we're going to need in the - * next iteration: - */ -#define COPYGC_BUCKETS_PER_ITER(ca) \ - ((ca)->free[RESERVE_MOVINGGC].size / 2) - -/* - * Max sectors to move per iteration: Have to take into account internal - * fragmentation from the multiple write points for each generation: - */ -#define COPYGC_SECTORS_PER_ITER(ca) \ - ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) - -void bch_moving_gc_stop(struct bch_dev *); -int bch_moving_gc_start(struct bch_dev *); -void bch_dev_moving_gc_init(struct bch_dev *); - -#endif diff --git a/libbcache/notify.c b/libbcache/notify.c deleted file mode 100644 index b06a8749..00000000 --- a/libbcache/notify.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Code for sending uevent notifications to user-space. - * - * Copyright 2015 Datera, Inc. - */ - -#include "bcache.h" -#include "notify.h" - -#include <linux/kobject.h> - -#define notify_var(c, format, ...) 
\ -({ \ - int ret; \ - lockdep_assert_held(&(c)->uevent_lock); \ - ret = add_uevent_var(&(c)->uevent_env, format, ##__VA_ARGS__); \ - WARN_ON_ONCE(ret); \ -}) - -static void notify_get(struct bch_fs *c) -{ - struct kobj_uevent_env *env = &c->uevent_env; - - mutex_lock(&c->uevent_lock); - env->envp_idx = 0; - env->buflen = 0; - - notify_var(c, "SET_UUID=%pU", c->sb.user_uuid.b); -} - -static void notify_get_cache(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - notify_get(c); - notify_var(c, "UUID=%pU", ca->uuid.b); - notify_var(c, "BLOCKDEV=%s", ca->name); -} - -static void notify_put(struct bch_fs *c) -{ - struct kobj_uevent_env *env = &c->uevent_env; - - env->envp[env->envp_idx] = NULL; - kobject_uevent_env(&c->kobj, KOBJ_CHANGE, env->envp); - mutex_unlock(&c->uevent_lock); -} - -void bch_notify_fs_read_write(struct bch_fs *c) -{ - notify_get(c); - notify_var(c, "STATE=active"); - notify_put(c); -} - -void bch_notify_fs_read_only(struct bch_fs *c) -{ - notify_get(c); - notify_var(c, "STATE=readonly"); - notify_put(c); -} - -void bch_notify_fs_stopped(struct bch_fs *c) -{ - notify_get(c); - notify_var(c, "STATE=stopped"); - notify_put(c); -} - -void bch_notify_dev_read_write(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - notify_get_cache(ca); - notify_var(c, "STATE=active"); - notify_put(c); -} - -void bch_notify_dev_read_only(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - notify_get_cache(ca); - notify_var(c, "STATE=readonly"); - notify_put(c); -} - -void bch_notify_dev_added(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - notify_get_cache(ca); - notify_var(c, "STATE=removing"); - notify_put(c); -} - -void bch_notify_dev_error(struct bch_dev *ca, bool fatal) -{ - struct bch_fs *c = ca->fs; - - notify_get_cache(ca); - notify_var(c, "STATE=error"); - notify_var(c, "FATAL=%d", fatal); - notify_put(c); -} diff --git a/libbcache/notify.h b/libbcache/notify.h deleted file mode 100644 index 2c1e3679..00000000 --- a/libbcache/notify.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Code for sending uevent notifications to user-space. - * - * Copyright 2015 Datera, Inc. 
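The notifiers above all follow the same shape: notify_get() or notify_get_cache() takes c->uevent_lock and seeds the environment, notify_var() appends KEY=VALUE pairs, and notify_put() emits the uevent on the filesystem kobject and drops the lock. As a sketch, a new device event would look like the following -- bch_notify_dev_removed() and the "removed" state string are hypothetical, and the function would have to live in notify.c to reach the static helpers:

void bch_notify_dev_removed(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;

	notify_get_cache(ca);		/* adds SET_UUID, UUID and BLOCKDEV */
	notify_var(c, "STATE=removed");
	notify_put(c);
}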
- */ - -#ifndef _NOTIFY_H -#define _NOTIFY_H - -#ifndef NO_BCACHE_NOTIFY - -void bch_notify_fs_read_write(struct bch_fs *); -void bch_notify_fs_read_only(struct bch_fs *); -void bch_notify_fs_stopped(struct bch_fs *); - -void bch_notify_dev_read_write(struct bch_dev *); -void bch_notify_dev_read_only(struct bch_dev *); -void bch_notify_dev_added(struct bch_dev *); -void bch_notify_dev_error(struct bch_dev *, bool); - -#else - -static inline void bch_notify_fs_read_write(struct bch_fs *c) {} -static inline void bch_notify_fs_read_only(struct bch_fs *c) {} -static inline void bch_notify_fs_stopped(struct bch_fs *c) {} - -static inline void bch_notify_dev_read_write(struct bch_dev *ca) {} -static inline void bch_notify_dev_read_only(struct bch_dev *ca) {} -static inline void bch_notify_dev_added(struct bch_dev *ca) {} -static inline void bch_notify_dev_error(struct bch_dev *ca, bool b) {} - -#endif - -#endif /* _NOTIFY_H */ diff --git a/libbcache/opts.c b/libbcache/opts.c deleted file mode 100644 index 41780d59..00000000 --- a/libbcache/opts.c +++ /dev/null @@ -1,241 +0,0 @@ - -#include <linux/kernel.h> - -#include "opts.h" -#include "util.h" - -const char * const bch_error_actions[] = { - "continue", - "remount-ro", - "panic", - NULL -}; - -const char * const bch_csum_types[] = { - "none", - "crc32c", - "crc64", - NULL -}; - -const char * const bch_compression_types[] = { - "none", - "lz4", - "gzip", - NULL -}; - -const char * const bch_str_hash_types[] = { - "crc32c", - "crc64", - "siphash", - NULL -}; - -const char * const bch_cache_replacement_policies[] = { - "lru", - "fifo", - "random", - NULL -}; - -/* Default is -1; we skip past it for struct cached_dev's cache mode */ -const char * const bch_cache_modes[] = { - "default", - "writethrough", - "writeback", - "writearound", - "none", - NULL -}; - -const char * const bch_dev_state[] = { - "readwrite", - "readonly", - "failed", - "spare", - NULL -}; - -const struct bch_option bch_opt_table[] = { -#define OPT_BOOL() .type = BCH_OPT_BOOL -#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices - -#define BCH_OPT(_name, _mode, _sb_opt, _bits, _type) \ - [Opt_##_name] = { \ - .name = #_name, \ - .set_sb = SET_##_sb_opt, \ - _type \ - }, - BCH_VISIBLE_OPTS() -#undef BCH_OPT -}; - -static enum bch_opt_id bch_opt_lookup(const char *name) -{ - const struct bch_option *i; - - for (i = bch_opt_table; - i < bch_opt_table + ARRAY_SIZE(bch_opt_table); - i++) - if (!strcmp(name, i->name)) - return i - bch_opt_table; - - return -1; -} - -static u64 bch_opt_get(struct bch_opts *opts, enum bch_opt_id id) -{ - switch (id) { -#define BCH_OPT(_name, ...) \ - case Opt_##_name: \ - return opts->_name; \ - - BCH_VISIBLE_OPTS() -#undef BCH_OPT - - default: - BUG(); - } -} - -void bch_opt_set(struct bch_opts *opts, enum bch_opt_id id, u64 v) -{ - switch (id) { -#define BCH_OPT(_name, ...) \ - case Opt_##_name: \ - opts->_name = v; \ - break; - - BCH_VISIBLE_OPTS() -#undef BCH_OPT - - default: - BUG(); - } -} - -/* - * Initial options from superblock - here we don't want any options undefined, - * any options the superblock doesn't specify are set to 0: - */ -struct bch_opts bch_sb_opts(struct bch_sb *sb) -{ - struct bch_opts opts = bch_opts_empty(); - -#define BCH_OPT(_name, _mode, _sb_opt, ...) 
\ - if (_sb_opt != NO_SB_OPT) \ - opts._name = _sb_opt(sb); - - BCH_OPTS() -#undef BCH_OPT - - return opts; -} - -int parse_one_opt(enum bch_opt_id id, const char *val, u64 *res) -{ - const struct bch_option *opt = &bch_opt_table[id]; - ssize_t ret; - - switch (opt->type) { - case BCH_OPT_BOOL: - ret = kstrtou64(val, 10, res); - if (ret < 0) - return ret; - - if (*res > 1) - return -ERANGE; - break; - case BCH_OPT_UINT: - ret = kstrtou64(val, 10, res); - if (ret < 0) - return ret; - - if (*res < opt->min || *res >= opt->max) - return -ERANGE; - break; - case BCH_OPT_STR: - ret = bch_read_string_list(val, opt->choices); - if (ret < 0) - return ret; - - *res = ret; - break; - } - - return 0; -} - -int bch_parse_mount_opts(struct bch_opts *opts, char *options) -{ - char *opt, *name, *val; - int ret, id; - u64 v; - - while ((opt = strsep(&options, ",")) != NULL) { - name = strsep(&opt, "="); - val = opt; - - if (val) { - id = bch_opt_lookup(name); - if (id < 0) - return -EINVAL; - - ret = parse_one_opt(id, val, &v); - if (ret < 0) - return ret; - } else { - id = bch_opt_lookup(name); - v = 1; - - if (id < 0 && - !strncmp("no", name, 2)) { - id = bch_opt_lookup(name + 2); - v = 0; - } - - if (bch_opt_table[id].type != BCH_OPT_BOOL) - return -EINVAL; - } - - bch_opt_set(opts, id, v); - } - - return 0; -} - -enum bch_opt_id bch_parse_sysfs_opt(const char *name, const char *val, - u64 *res) -{ - enum bch_opt_id id = bch_opt_lookup(name); - int ret; - - if (id < 0) - return -EINVAL; - - ret = parse_one_opt(id, val, res); - if (ret < 0) - return ret; - - return id; -} - -ssize_t bch_opt_show(struct bch_opts *opts, const char *name, - char *buf, size_t size) -{ - enum bch_opt_id id = bch_opt_lookup(name); - const struct bch_option *opt; - u64 v; - - if (id < 0) - return -EINVAL; - - v = bch_opt_get(opts, id); - opt = &bch_opt_table[id]; - - return opt->type == BCH_OPT_STR - ? bch_snprint_string_list(buf, size, opt->choices, v) - : snprintf(buf, size, "%lli\n", v); -} diff --git a/libbcache/opts.h b/libbcache/opts.h deleted file mode 100644 index 253b7399..00000000 --- a/libbcache/opts.h +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef _BCACHE_OPTS_H -#define _BCACHE_OPTS_H - -#include <linux/bcache.h> -#include <linux/bug.h> -#include <linux/log2.h> -#include <linux/string.h> - -extern const char * const bch_error_actions[]; -extern const char * const bch_csum_types[]; -extern const char * const bch_compression_types[]; -extern const char * const bch_str_hash_types[]; -extern const char * const bch_cache_replacement_policies[]; -extern const char * const bch_cache_modes[]; -extern const char * const bch_dev_state[]; - -/* - * Mount options; we also store defaults in the superblock. - * - * Also exposed via sysfs: if an option is writeable, and it's also stored in - * the superblock, changing it via sysfs (currently? might change this) also - * updates the superblock. - * - * We store options as signed integers, where -1 means undefined. This means we - * can pass the mount options to bch_fs_alloc() as a whole struct, and then only - * apply the options from that struct that are defined. - */ - -/* dummy option, for options that aren't stored in the superblock */ -LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); - -/** - * BCH_OPT(name, mode, sb_opt, type, ...) 
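The option helpers above are easiest to see end to end: defaults come out of the superblock via bch_sb_opts(), the mount string is tokenized in place by bch_parse_mount_opts() (so it must be writable), and only fields the user actually set -- anything still at -1 is undefined -- are overlaid using bch_opts_apply() and opt_defined(), defined just below. A minimal sketch, where example_parse_opts() and the sample option string are illustrative only:

static int example_parse_opts(struct bch_sb *sb, char *mount_opts,
			      struct bch_opts *out)
{
	struct bch_opts cmdline = bch_opts_empty();	/* every field starts at -1 */
	int ret;

	/* e.g. mount_opts = "data_replicas=2,noposix_acl,compression=lz4" */
	ret = bch_parse_mount_opts(&cmdline, mount_opts);
	if (ret)
		return ret;

	*out = bch_sb_opts(sb);		/* defaults stored in the superblock */
	bch_opts_apply(out, cmdline);	/* only defined (>= 0) fields override */

	if (opt_defined(out->posix_acl) && !out->posix_acl)
		pr_debug("POSIX ACLs disabled by mount option");

	return 0;
}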
- * - * @name - name of mount option, sysfs attribute, and struct bch_opts - * member - * - * @mode - sysfs attr permissions - * - * @sb_option - name of corresponding superblock option - * - * @type - one of OPT_BOOL, OPT_UINT, OPT_STR - */ - -enum opt_type { - BCH_OPT_BOOL, - BCH_OPT_UINT, - BCH_OPT_STR, -}; - -#define BCH_VISIBLE_OPTS() \ - BCH_OPT(errors, 0644, BCH_SB_ERROR_ACTION, \ - s8, OPT_STR(bch_error_actions)) \ - BCH_OPT(metadata_replicas, 0444, BCH_SB_META_REPLICAS_WANT,\ - s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ - BCH_OPT(data_replicas, 0444, BCH_SB_DATA_REPLICAS_WANT,\ - s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ - BCH_OPT(metadata_replicas_required, 0444, BCH_SB_META_REPLICAS_REQ,\ - s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ - BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\ - s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ - BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE, \ - s8, OPT_STR(bch_csum_types)) \ - BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE, \ - s8, OPT_STR(bch_csum_types)) \ - BCH_OPT(compression, 0644, BCH_SB_COMPRESSION_TYPE,\ - s8, OPT_STR(bch_compression_types)) \ - BCH_OPT(str_hash, 0644, BCH_SB_STR_HASH_TYPE, \ - s8, OPT_STR(bch_str_hash_types)) \ - BCH_OPT(inodes_32bit, 0644, BCH_SB_INODE_32BIT, \ - s8, OPT_BOOL()) \ - BCH_OPT(gc_reserve_percent, 0444, BCH_SB_GC_RESERVE, \ - s8, OPT_UINT(5, 21)) \ - BCH_OPT(root_reserve_percent, 0444, BCH_SB_ROOT_RESERVE, \ - s8, OPT_UINT(0, 100)) \ - BCH_OPT(wide_macs, 0644, BCH_SB_128_BIT_MACS, \ - s8, OPT_BOOL()) \ - BCH_OPT(verbose_recovery, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(posix_acl, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(journal_flush_disabled, 0644, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(nofsck, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(fix_errors, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(nochanges, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(noreplay, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(norecovery, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(noexcl, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(sb, 0444, NO_SB_OPT, \ - s64, OPT_UINT(0, S64_MAX)) \ - -#define BCH_OPTS() \ - BCH_OPT(read_only, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_OPT(nostart, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) \ - BCH_VISIBLE_OPTS() - -struct bch_opts { -#define BCH_OPT(_name, _mode, _sb_opt, _bits, ...) \ - _bits _name; - - BCH_OPTS() -#undef BCH_OPT -}; - -enum bch_opt_id { -#define BCH_OPT(_name, ...) \ - Opt_##_name, - - BCH_VISIBLE_OPTS() -#undef BCH_OPT -}; - -struct bch_option { - const char *name; - void (*set_sb)(struct bch_sb *, u64); - enum opt_type type; - - union { - struct { - u64 min, max; - }; - struct { - const char * const *choices; - }; - }; - -}; - -extern const struct bch_option bch_opt_table[]; - -static inline struct bch_opts bch_opts_empty(void) -{ - struct bch_opts ret; - - memset(&ret, 255, sizeof(ret)); - return ret; -} - -static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src) -{ -#define BCH_OPT(_name, ...) 
\ - if (src._name >= 0) \ - dst->_name = src._name; - - BCH_OPTS() -#undef BCH_OPT -} - -#define opt_defined(_opt) ((_opt) >= 0) - -void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64); -struct bch_opts bch_sb_opts(struct bch_sb *); - -int bch_parse_mount_opts(struct bch_opts *, char *); -enum bch_opt_id bch_parse_sysfs_opt(const char *, const char *, u64 *); - -ssize_t bch_opt_show(struct bch_opts *, const char *, char *, size_t); - -#endif /* _BCACHE_OPTS_H */ diff --git a/libbcache/request.c b/libbcache/request.c deleted file mode 100644 index b24770bc..00000000 --- a/libbcache/request.c +++ /dev/null @@ -1,809 +0,0 @@ -/* - * Handle a read or a write request and decide what to do with it. - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - * - * Main pieces here: - * - * 1) Data insert path, via bch_data_insert() -- writes data to cache and - * updates extents btree - * 2) Read path, via bch_read() -- for now only used by bcachefs and ioctl - * interface - * 3) Read path, via cache_lookup() and struct search -- used by block device - * make_request functions - * 4) Cache promotion -- used by bch_read() and cache_lookup() to copy data to - * the cache, either from a backing device or a cache device in a higher tier - * - * One tricky thing that comes up is a race condition where a bucket may be - * re-used while reads from it are still in flight. To guard against this, we - * save the ptr that is being read and check if it is stale once the read - * completes. If the ptr is stale, the read is retried. - * - * #2 and #3 will be unified further in the future. - */ - -#include "bcache.h" -#include "blockdev.h" -#include "btree_update.h" -#include "btree_iter.h" -#include "clock.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "journal.h" -#include "keybuf.h" -#include "request.h" -#include "writeback.h" -#include "stats.h" - -#include <linux/module.h> -#include <linux/hash.h> -#include <linux/random.h> -#include <linux/backing-dev.h> - -#include <trace/events/bcache.h> - -#define CUTOFF_CACHE_ADD 10 -#define CUTOFF_CACHE_READA 15 - -/* Congested? */ - -unsigned bch_get_congested(struct bch_fs *c) -{ - int i; - long rand; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); - - return i > 0 ? 
i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - t->sequential_io_avg = ewma_add(t->sequential_io_avg, - t->sequential_io, 3); - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static bool check_should_bypass(struct cached_dev *dc, struct bio *bio, int rw) -{ - struct bch_fs *c = dc->disk.c; - unsigned mode = BDEV_CACHE_MODE(dc->disk_sb.sb); - unsigned sectors, congested = bch_get_congested(c); - struct task_struct *task = current; - struct io *i; - - if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - sectors_available(c) * 100 < c->capacity * CUTOFF_CACHE_ADD || - (bio_op(bio) == REQ_OP_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - op_is_write(bio_op(bio)))) - goto skip; - - if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (bypass_torture_test(dc)) { - if ((get_random_int() & 3) == 3) - goto skip; - else - goto rescale; - } - - if (!congested && !dc->sequential_cutoff) - goto rescale; - - if (!congested && - mode == CACHE_MODE_WRITEBACK && - op_is_write(bio_op(bio)) && - (bio->bi_opf & REQ_SYNC)) - goto rescale; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) - if (i->last == bio->bi_iter.bi_sector && - time_before(jiffies, i->last_io)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_iter.bi_size > i->sequential) - i->sequential += bio->bi_iter.bi_size; - - i->last = bio_end_sector(bio); - i->last_io = jiffies + msecs_to_jiffies(5000); - task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - - sectors = max(task->sequential_io, - task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(bio); - goto skip; - } - -rescale: - return false; -skip: - bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); - return true; -} - -/* Common code for the make_request functions */ - -/** - * request_endio - endio function for backing device bios - */ -static void request_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - - if (bio->bi_error) { - struct search *s = container_of(cl, struct search, cl); - s->iop.error = bio->bi_error; - /* Only cache read errors are recoverable */ - s->recoverable = false; - } - - bio_put(bio); - closure_put(cl); -} - -static void bio_complete(struct search *s) -{ - if (s->orig_bio) { - generic_end_io_acct(bio_data_dir(s->orig_bio), - &s->d->disk->part0, s->start_time); - - trace_bcache_request_end(s->d, s->orig_bio); - s->orig_bio->bi_error = s->iop.error; - bio_endio(s->orig_bio); - s->orig_bio = NULL; - } -} - -static void do_bio_hook(struct search *s, struct bio *orig_bio) -{ - int rw = bio_data_dir(orig_bio); - struct bio *bio = rw ? 
&s->wbio.bio : &s->rbio.bio; - - bio_init(bio); - __bio_clone_fast(bio, orig_bio); - bio->bi_end_io = request_endio; - bio->bi_private = &s->cl; - - bio_cnt_set(bio, 3); -} - -static void search_free(struct closure *cl) -{ - struct search *s = container_of(cl, struct search, cl); - - bio_complete(s); - - if (s->iop.bio) - bio_put(&s->iop.bio->bio); - - closure_debug_destroy(cl); - mempool_free(s, &s->d->c->search); -} - -static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d) -{ - struct search *s; - - s = mempool_alloc(&d->c->search, GFP_NOIO); - - closure_init(&s->cl, NULL); - do_bio_hook(s, bio); - - s->orig_bio = bio; - s->d = d; - s->recoverable = 1; - s->bypass = 0; - s->write = op_is_write(bio_op(bio)); - s->read_dirty_data = 0; - s->cache_miss = 0; - s->start_time = jiffies; - s->inode = bcache_dev_inum(d); - - s->iop.c = d->c; - s->iop.bio = NULL; - s->iop.error = 0; - - return s; -} - -/* Cached devices */ - -static void cached_dev_bio_complete(struct closure *cl) -{ - struct search *s = container_of(cl, struct search, cl); - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - - search_free(cl); - cached_dev_put(dc); -} - -/* Process reads */ - -static void cached_dev_read_error(struct closure *cl) -{ - struct search *s = container_of(cl, struct search, cl); - struct bio *bio = &s->rbio.bio; - - if (s->recoverable) { - /* Read bucket invalidate races are handled here, also plain - * old IO errors from the cache that can be retried from the - * backing device (reads of clean data) */ - trace_bcache_read_retry(s->orig_bio); - - s->iop.error = 0; - do_bio_hook(s, s->orig_bio); - - /* XXX: invalidate cache, don't count twice */ - - closure_bio_submit(bio, cl); - } - - continue_at(cl, cached_dev_bio_complete, NULL); -} - -static void cached_dev_read_done(struct closure *cl) -{ - struct search *s = container_of(cl, struct search, cl); - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - - if (dc->verify && s->recoverable && !s->read_dirty_data) - bch_data_verify(dc, s->orig_bio); - - continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); -} - -static void cached_dev_read_done_bh(struct closure *cl) -{ - struct search *s = container_of(cl, struct search, cl); - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - - bch_mark_cache_accounting(s->iop.c, dc, !s->cache_miss, s->bypass); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass); - - if (s->iop.error) - continue_at_nobarrier(cl, cached_dev_read_error, s->iop.c->wq); - else if (dc->verify) - continue_at_nobarrier(cl, cached_dev_read_done, s->iop.c->wq); - else - continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); -} - -/** - * __cache_promote -- insert result of read bio into cache - * - * Used for backing devices and flash-only volumes. - * - * @orig_bio must actually be a bbio with a valid key. - */ -void __cache_promote(struct bch_fs *c, struct bch_read_bio *orig_bio, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned write_flags) -{ -#if 0 - struct cache_promote_op *op; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE); - - /* XXX: readahead? 
*/ - - op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); - if (!op) - goto out_submit; - - /* clone the bbio */ - memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio)); - - bio = &op->bio.bio.bio; - bio_init(bio); - bio_get(bio); - bio->bi_bdev = orig_bio->bio.bi_bdev; - bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector; - bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size; - bio->bi_end_io = cache_promote_endio; - bio->bi_private = &op->cl; - bio->bi_io_vec = bio->bi_inline_vecs; - bch_bio_map(bio, NULL); - - if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO)) - goto out_free; - - orig_bio->ca = NULL; - - closure_init(&op->cl, &c->cl); - op->orig_bio = &orig_bio->bio; - op->stale = 0; - - bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point, - new, old, - BCH_WRITE_ALLOC_NOWAIT|write_flags); - op->iop.nr_replicas = 1; - - //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key); - //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k); - - trace_bcache_promote(&orig_bio->bio); - - op->bio.bio.submit_time_us = local_clock_us(); - closure_bio_submit(bio, &op->cl); - - continue_at(&op->cl, cache_promote_write, c->wq); -out_free: - kfree(op); -out_submit: - generic_make_request(&orig_bio->bio); -#endif -} - -/** - * cached_dev_cache_miss - populate cache with data from backing device - * - * We don't write to the cache if s->bypass is set. - */ -static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s, - struct bio *bio, unsigned sectors) -{ - int ret; - unsigned reada = 0; - struct bio *miss; - BKEY_PADDED(key) replace; - - s->cache_miss = 1; - - if (s->bypass) - goto nopromote; -#if 0 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - - /* XXX: broken */ - if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & REQ_META) && - ((u64) sectors_available(dc->disk.c) * 100 < - (u64) iter->c->capacity * CUTOFF_CACHE_READA)) - reada = min_t(sector_t, dc->readahead >> 9, - bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); -#endif - sectors = min(sectors, bio_sectors(bio) + reada); - - replace.key.k = KEY(s->inode, - bio->bi_iter.bi_sector + sectors, - sectors); - - ret = bch_btree_insert_check_key(iter, &replace.key); - if (ret == -EINTR) - return ret; - - miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); - - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; - - //to_bbio(miss)->key.k = KEY(s->inode, - // bio_end_sector(miss), - // bio_sectors(miss)); - to_rbio(miss)->ca = NULL; - - closure_get(&s->cl); - __cache_promote(s->iop.c, to_rbio(miss), - bkey_i_to_s_c(&replace.key), - bkey_to_s_c(&KEY(replace.key.k.p.inode, - replace.key.k.p.offset, - replace.key.k.size)), - BCH_WRITE_CACHED); - - return 0; -nopromote: - miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); - - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; - closure_bio_submit(miss, &s->cl); - - return 0; -} - -static void cached_dev_read(struct cached_dev *dc, struct search *s) -{ - struct bch_fs *c = s->iop.c; - struct closure *cl = &s->cl; - struct bio *bio = &s->rbio.bio; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch_increment_clock(c, bio_sectors(bio), READ); - - for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, - POS(s->inode, bio->bi_iter.bi_sector), k) { - BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - unsigned sectors, bytes; - bool is_last; -retry: - bkey_reassemble(&tmp.k, k); - bch_btree_iter_unlock(&iter); - k = bkey_i_to_s_c(&tmp.k); - - 
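/*
 * One pass of the loop per extent (or hole) overlapping the bio: pick a
 * pointer to read from, clamp to the sectors of the bio that this key
 * covers, then either read the extent - promoting it into the cache unless
 * this request is bypassing - or fall through to cached_dev_cache_miss()
 * for holes and stale cached data.
 */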
bch_extent_pick_ptr(c, k, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - goto out; - } - - sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) - - bio->bi_iter.bi_sector; - bytes = sectors << 9; - is_last = bytes == bio->bi_iter.bi_size; - swap(bio->bi_iter.bi_size, bytes); - - if (pick.ca) { - PTR_BUCKET(pick.ca, &pick.ptr)->read_prio = - c->prio_clock[READ].hand; - - if (!bkey_extent_is_cached(k.k)) - s->read_dirty_data = true; - - bch_read_extent(c, &s->rbio, k, &pick, - BCH_READ_ACCOUNT_TIMES| - BCH_READ_RETRY_IF_STALE| - (!s->bypass ? BCH_READ_PROMOTE : 0)| - (is_last ? BCH_READ_IS_LAST : 0)); - } else { - /* not present (hole), or stale cached data */ - if (cached_dev_cache_miss(&iter, s, bio, sectors)) { - k = bch_btree_iter_peek_with_holes(&iter); - if (btree_iter_err(k)) - break; - goto retry; - } - } - - swap(bio->bi_iter.bi_size, bytes); - bio_advance(bio, bytes); - - if (is_last) { - bch_btree_iter_unlock(&iter); - goto out; - } - } - - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - ret = bch_btree_iter_unlock(&iter); - BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); -out: - continue_at(cl, cached_dev_read_done_bh, NULL); -} - -/* Process writes */ - -static void cached_dev_write_complete(struct closure *cl) -{ - struct search *s = container_of(cl, struct search, cl); - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - - up_read_non_owner(&dc->writeback_lock); - cached_dev_bio_complete(cl); -} - -static void cached_dev_write(struct cached_dev *dc, struct search *s) -{ - struct closure *cl = &s->cl; - struct bio *bio = &s->wbio.bio; - bool writeback = false; - bool bypass = s->bypass; - struct bkey insert_key = KEY(s->inode, - bio_end_sector(bio), - bio_sectors(bio)); - unsigned flags = BCH_WRITE_DISCARD_ON_ERROR; - - down_read_non_owner(&dc->writeback_lock); - if (bch_keybuf_check_overlapping(&dc->writeback_keys, - bkey_start_pos(&insert_key), - insert_key.p)) { - /* - * We overlap with some dirty data undergoing background - * writeback, force this write to writeback - */ - bypass = false; - writeback = true; - } - - /* - * Discards aren't _required_ to do anything, so skipping if - * check_overlapping returned true is ok - * - * But check_overlapping drops dirty keys for which io hasn't started, - * so we still want to call it. - */ - if (bio_op(bio) == REQ_OP_DISCARD) - bypass = true; - - if (should_writeback(dc, bio, BDEV_CACHE_MODE(dc->disk_sb.sb), - bypass)) { - bypass = false; - writeback = true; - } - - if (bypass) { - /* - * If this is a bypass-write (as opposed to a discard), send - * it down to the backing device. If this is a discard, only - * send it to the backing device if the backing device - * supports discards. Otherwise, we simply discard the key - * range from the cache and don't touch the backing device. 
- */ - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev))) - closure_bio_submit(s->orig_bio, cl); - } else if (writeback) { - bch_writeback_add(dc); - - if (bio->bi_opf & REQ_PREFLUSH) { - /* Also need to send a flush to the backing device */ - struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, - &dc->disk.bio_split); - - flush->bi_bdev = bio->bi_bdev; - flush->bi_end_io = request_endio; - flush->bi_private = cl; - bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH); - - closure_bio_submit(flush, cl); - } - } else { - struct bio *writethrough = - bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split); - - closure_bio_submit(writethrough, cl); - - flags |= BCH_WRITE_CACHED; - flags |= BCH_WRITE_ALLOC_NOWAIT; - } - - if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) - flags |= BCH_WRITE_FLUSH; - if (bypass) - flags |= BCH_WRITE_DISCARD; - - bch_write_op_init(&s->iop, dc->disk.c, &s->wbio, - (struct disk_reservation) { 0 }, - foreground_write_point(dc->disk.c, - (unsigned long) current), - bkey_start_pos(&insert_key), - NULL, flags); - - closure_call(&s->iop.cl, bch_write, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); -} - -/* Cached devices - read & write stuff */ - -static void __cached_dev_make_request(struct request_queue *q, struct bio *bio) -{ - struct search *s; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); - - bio->bi_bdev = dc->disk_sb.bdev; - bio->bi_iter.bi_sector += le64_to_cpu(dc->disk_sb.sb->data_offset); - - if (cached_dev_get(dc)) { - struct bio *clone; - - s = search_alloc(bio, d); - trace_bcache_request_start(s->d, bio); - - clone = rw ? 
&s->wbio.bio : &s->rbio.bio; - - if (!bio->bi_iter.bi_size) { - if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) - bch_journal_flush_async(&s->iop.c->journal, - &s->cl); - - /* - * If it's a flush, we send the flush to the backing - * device too - */ - closure_bio_submit(clone, &s->cl); - - continue_at(&s->cl, cached_dev_bio_complete, NULL); - } else { - s->bypass = check_should_bypass(dc, bio, rw); - - if (rw) - cached_dev_write(dc, s); - else - cached_dev_read(dc, s); - } - } else { - if ((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev))) - bio_endio(bio); - else - generic_make_request(bio); - } -} - -static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct bio *bio) -{ - __cached_dev_make_request(q, bio); - return BLK_QC_T_NONE; -} - -static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - return __blkdev_driver_ioctl(dc->disk_sb.bdev, mode, cmd, arg); -} - -static int cached_dev_congested(void *data, int bits) -{ - struct bcache_device *d = data; - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev); - int ret = 0; - - if (bdi_congested(&q->backing_dev_info, bits)) - return 1; - - if (cached_dev_get(dc)) { - ret |= bch_congested(d->c, bits); - cached_dev_put(dc); - } - - return ret; -} - -void bch_cached_dev_request_init(struct cached_dev *dc) -{ - struct gendisk *g = dc->disk.disk; - - g->queue->make_request_fn = cached_dev_make_request; - g->queue->backing_dev_info.congested_fn = cached_dev_congested; - dc->disk.ioctl = cached_dev_ioctl; -} - -/* Blockdev volumes */ - -static void __blockdev_volume_make_request(struct request_queue *q, - struct bio *bio) -{ - struct search *s; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; - int rw = bio_data_dir(bio); - - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); - - trace_bcache_request_start(d, bio); - - s = search_alloc(bio, d); - - if (!bio->bi_iter.bi_size) { - if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) - bch_journal_flush_async(&s->iop.c->journal, - &s->cl); - - continue_at(&s->cl, search_free, NULL); - } else if (rw) { - struct disk_reservation res = { 0 }; - unsigned flags = 0; - - if (bio_op(bio) != REQ_OP_DISCARD && - bch_disk_reservation_get(d->c, &res, bio_sectors(bio), 0)) { - s->iop.error = -ENOSPC; - continue_at(&s->cl, search_free, NULL); - return; - } - - if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) - flags |= BCH_WRITE_FLUSH; - if (bio_op(bio) == REQ_OP_DISCARD) - flags |= BCH_WRITE_DISCARD; - - bch_write_op_init(&s->iop, d->c, &s->wbio, res, - foreground_write_point(d->c, - (unsigned long) current), - POS(s->inode, bio->bi_iter.bi_sector), - NULL, flags); - - closure_call(&s->iop.cl, bch_write, NULL, &s->cl); - } else { - closure_get(&s->cl); - bch_read(d->c, &s->rbio, bcache_dev_inum(d)); - } - continue_at(&s->cl, search_free, NULL); -} - -static blk_qc_t blockdev_volume_make_request(struct request_queue *q, - struct bio *bio) -{ - __blockdev_volume_make_request(q, bio); - return BLK_QC_T_NONE; -} - -static int blockdev_volume_ioctl(struct bcache_device *d, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - return -ENOTTY; -} - -static int blockdev_volume_congested(void *data, int bits) -{ - struct bcache_device *d = data; - - return bch_congested(d->c, bits); -} - -void bch_blockdev_volume_request_init(struct bcache_device *d) -{ 
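/*
 * Flash-only volumes have no backing device: bios submitted to them go
 * straight to bch_read()/bch_write() on the bch_fs, via
 * __blockdev_volume_make_request() above; this just installs the hooks.
 */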
- struct gendisk *g = d->disk; - - g->queue->make_request_fn = blockdev_volume_make_request; - g->queue->backing_dev_info.congested_fn = blockdev_volume_congested; - d->ioctl = blockdev_volume_ioctl; -} diff --git a/libbcache/request.h b/libbcache/request.h deleted file mode 100644 index 1ee3d16f..00000000 --- a/libbcache/request.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _BCACHE_REQUEST_H_ -#define _BCACHE_REQUEST_H_ - -#include "stats.h" - -struct bch_fs; -struct cached_dev; -struct bcache_device; -struct kmem_cache; - -unsigned bch_get_congested(struct bch_fs *); - -void bch_cached_dev_request_init(struct cached_dev *dc); -void bch_blockdev_volume_request_init(struct bcache_device *d); - -#endif /* _BCACHE_REQUEST_H_ */ diff --git a/libbcache/siphash.c b/libbcache/siphash.c deleted file mode 100644 index 3a6c9c82..00000000 --- a/libbcache/siphash.c +++ /dev/null @@ -1,172 +0,0 @@ -/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ - -/*- - * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d - * are the number of compression rounds and the number of finalization rounds. - * A compression round is identical to a finalization round and this round - * function is called SipRound. Given a 128-bit key k and a (possibly empty) - * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). - * - * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, - * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, - * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa - * https://131002.net/siphash/siphash.pdf - * https://131002.net/siphash/ - */ - -#include <asm/byteorder.h> -#include <asm/unaligned.h> -#include <linux/bitops.h> -#include <linux/string.h> - -#include "siphash.h" - -static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -{ - while (rounds--) { - ctx->v[0] += ctx->v[1]; - ctx->v[2] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 13); - ctx->v[3] = rol64(ctx->v[3], 16); - - ctx->v[1] ^= ctx->v[0]; - ctx->v[3] ^= ctx->v[2]; - ctx->v[0] = rol64(ctx->v[0], 32); - - ctx->v[2] += ctx->v[1]; - ctx->v[0] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 17); - ctx->v[3] = rol64(ctx->v[3], 21); - - ctx->v[1] ^= ctx->v[2]; - ctx->v[3] ^= ctx->v[0]; - ctx->v[2] = rol64(ctx->v[2], 32); - } -} - -static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -{ - u64 m = get_unaligned_le64(ptr); - - ctx->v[3] ^= m; - SipHash_Rounds(ctx, rounds); - ctx->v[0] ^= m; -} - -void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -{ - u64 k0, k1; - - k0 = le64_to_cpu(key->k0); - k1 = le64_to_cpu(key->k1); - - ctx->v[0] = 0x736f6d6570736575ULL ^ k0; - ctx->v[1] = 0x646f72616e646f6dULL ^ k1; - ctx->v[2] = 0x6c7967656e657261ULL ^ k0; - ctx->v[3] = 0x7465646279746573ULL ^ k1; - - memset(ctx->buf, 0, sizeof(ctx->buf)); - ctx->bytes = 0; -} - -void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, - const void *src, size_t len) -{ - const u8 *ptr = src; - size_t left, used; - - if (len == 0) - return; - - used = ctx->bytes % sizeof(ctx->buf); - ctx->bytes += len; - - if (used > 0) { - left = sizeof(ctx->buf) - used; - - if (len >= left) { - memcpy(&ctx->buf[used], ptr, left); - SipHash_CRounds(ctx, ctx->buf, rc); - len -= left; - ptr += left; - } else { - memcpy(&ctx->buf[used], ptr, len); - return; - } - } - - while (len >= sizeof(ctx->buf)) { - SipHash_CRounds(ctx, ptr, rc); - len -= sizeof(ctx->buf); - ptr += sizeof(ctx->buf); - } - - if (len > 0) - memcpy(&ctx->buf[used], ptr, len); -} - -void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - - r = SipHash_End(ctx, rc, rf); - - *((__le64 *) dst) = cpu_to_le64(r); -} - -u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - size_t left, used; - - used = ctx->bytes % sizeof(ctx->buf); - left = sizeof(ctx->buf) - used; - memset(&ctx->buf[used], 0, left - 1); - ctx->buf[7] = ctx->bytes; - - SipHash_CRounds(ctx, ctx->buf, rc); - ctx->v[2] ^= 0xff; - SipHash_Rounds(ctx, rf); - - r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); - memset(ctx, 0, sizeof(*ctx)); - return (r); -} - -u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -{ - SIPHASH_CTX ctx; - - SipHash_Init(&ctx, key); - SipHash_Update(&ctx, rc, rf, src, len); - return SipHash_End(&ctx, rc, rf); -} diff --git a/libbcache/siphash.h b/libbcache/siphash.h deleted file mode 100644 index 7a4b2241..00000000 --- a/libbcache/siphash.h +++ /dev/null @@ -1,86 +0,0 @@ -/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -/*- - * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/* - * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) - * optimized for speed on short messages returning a 64bit hash/digest value. - * - * The number of rounds is defined during the initialization: - * SipHash24_Init() for the fast and resonable strong version - * SipHash48_Init() for the strong version (half as fast) - * - * struct SIPHASH_CTX ctx; - * SipHash24_Init(&ctx); - * SipHash_SetKey(&ctx, "16bytes long key"); - * SipHash_Update(&ctx, pointer_to_string, length_of_string); - * SipHash_Final(output, &ctx); - */ - -#ifndef _SIPHASH_H_ -#define _SIPHASH_H_ - -#include <linux/types.h> - -#define SIPHASH_BLOCK_LENGTH 8 -#define SIPHASH_KEY_LENGTH 16 -#define SIPHASH_DIGEST_LENGTH 8 - -typedef struct _SIPHASH_CTX { - u64 v[4]; - u8 buf[SIPHASH_BLOCK_LENGTH]; - u32 bytes; -} SIPHASH_CTX; - -typedef struct { - __le64 k0; - __le64 k1; -} SIPHASH_KEY; - -void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -u64 SipHash_End(SIPHASH_CTX *, int, int); -void SipHash_Final(void *, SIPHASH_CTX *, int, int); -u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); - -#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) - -#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) - -#endif /* _SIPHASH_H_ */ diff --git a/libbcache/six.c b/libbcache/six.c deleted file mode 100644 index 1bb8bfcc..00000000 --- a/libbcache/six.c +++ /dev/null @@ -1,396 +0,0 @@ - -#include <linux/sched.h> -#include <linux/sched/rt.h> - -#include "six.h" - -#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -#define six_release(l) lock_release(l, 0, _RET_IP_) - -#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) - -struct 
six_lock_vals { - /* Value we add to the lock in order to take the lock: */ - u64 lock_val; - - /* If the lock has this value (used as a mask), taking the lock fails: */ - u64 lock_fail; - - /* Value we add to the lock in order to release the lock: */ - u64 unlock_val; - - /* Mask that indicates lock is held for this type: */ - u64 held_mask; - - /* Waitlist we wakeup when releasing the lock: */ - enum six_lock_type unlock_wakeup; -}; - -#define LOCK_VALS { \ - [SIX_LOCK_read] = { \ - .lock_val = __SIX_VAL(read_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_write, \ - .unlock_val = -__SIX_VAL(read_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_read, \ - .unlock_wakeup = SIX_LOCK_write, \ - }, \ - [SIX_LOCK_intent] = { \ - .lock_val = __SIX_VAL(intent_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_intent, \ - .unlock_val = -__SIX_VAL(intent_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_intent, \ - .unlock_wakeup = SIX_LOCK_intent, \ - }, \ - [SIX_LOCK_write] = { \ - .lock_val = __SIX_VAL(seq, 1), \ - .lock_fail = __SIX_LOCK_HELD_read, \ - .unlock_val = __SIX_VAL(seq, 1), \ - .held_mask = __SIX_LOCK_HELD_write, \ - .unlock_wakeup = SIX_LOCK_read, \ - }, \ -} - -static void six_set_owner(struct six_lock *lock, enum six_lock_type type) -{ - if (type == SIX_LOCK_intent) - lock->owner = current; -} - -static void six_clear_owner(struct six_lock *lock, enum six_lock_type type) -{ - if (type == SIX_LOCK_intent) - lock->owner = NULL; -} - -static inline bool __six_trylock_type(struct six_lock *lock, - enum six_lock_type type) -{ - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old; - u64 v = READ_ONCE(lock->state.v); - - do { - old.v = v; - - EBUG_ON(type == SIX_LOCK_write && - ((old.v & __SIX_LOCK_HELD_write) || - !(old.v & __SIX_LOCK_HELD_intent))); - - if (old.v & l[type].lock_fail) - return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - old.v + l[type].lock_val)) != old.v); - return true; -} - -bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -{ - bool ret = __six_trylock_type(lock, type); - - if (ret) { - six_acquire(&lock->dep_map, 1); - six_set_owner(lock, type); - } - - return ret; -} - -bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) -{ - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old; - u64 v = READ_ONCE(lock->state.v); - - do { - old.v = v; - - if (old.seq != seq || old.v & l[type].lock_fail) - return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - old.v + l[type].lock_val)) != old.v); - - six_acquire(&lock->dep_map, 1); - six_set_owner(lock, type); - return true; -} - -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; -}; - -/* This is probably up there with the more evil things I've done */ -#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) - -static inline int six_can_spin_on_owner(struct six_lock *lock) -{ - struct task_struct *owner; - int retval = 1; - - if (need_resched()) - return 0; - - rcu_read_lock(); - owner = READ_ONCE(lock->owner); - if (owner) - retval = owner->on_cpu; - rcu_read_unlock(); - /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. 
- */ - return retval; -} - -static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) -{ - bool ret = true; - - rcu_read_lock(); - while (lock->owner == owner) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - if (!owner->on_cpu || need_resched()) { - ret = false; - break; - } - - cpu_relax_lowlatency(); - } - rcu_read_unlock(); - - return ret; -} - -static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; - - if (type == SIX_LOCK_write) - return false; - - preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; - - if (!osq_lock(&lock->osq)) - goto fail; - - while (1) { - struct task_struct *owner; - - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner)) - break; - - if (__six_trylock_type(lock, type)) { - osq_unlock(&lock->osq); - preempt_enable(); - return true; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - cpu_relax_lowlatency(); - } - - osq_unlock(&lock->osq); -fail: - preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. This avoids getting - * scheduled out right after we obtained the lock. 
- */ - if (need_resched()) - schedule(); - - return false; -} - -void six_lock_type(struct six_lock *lock, enum six_lock_type type) -{ - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old, new; - struct six_lock_waiter wait; - u64 v; - - six_acquire(&lock->dep_map, 0); - - if (__six_trylock_type(lock, type)) - goto done; - - if (six_optimistic_spin(lock, type)) - goto done; - - lock_contended(&lock->dep_map, _RET_IP_); - - INIT_LIST_HEAD(&wait.list); - wait.task = current; - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_add_tail(&wait.list, &lock->wait_list[type]); - raw_spin_unlock(&lock->wait_lock); - } - - v = READ_ONCE(lock->state.v); - do { - new.v = old.v = v; - - if (!(old.v & l[type].lock_fail)) - new.v += l[type].lock_val; - else if (!(new.waiters & (1 << type))) - new.waiters |= 1 << type; - else - break; /* waiting bit already set */ - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); - - if (!(old.v & l[type].lock_fail)) - break; - - schedule(); - } - - __set_current_state(TASK_RUNNING); - - if (!list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_del_init(&wait.list); - raw_spin_unlock(&lock->wait_lock); - } -done: - lock_acquired(&lock->dep_map, _RET_IP_); - six_set_owner(lock, type); -} - -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - unsigned waitlist_id) -{ - struct list_head *wait_list = &lock->wait_list[waitlist_id]; - struct six_lock_waiter *w, *next; - - if (waitlist_id == SIX_LOCK_write && state.read_lock) - return; - - if (!(state.waiters & (1 << waitlist_id))) - return; - - clear_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, wait_list, list) { - list_del_init(&w->list); - - if (wake_up_process(w->task) && - waitlist_id != SIX_LOCK_read) { - if (!list_empty(wait_list)) - set_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - break; - } - } - - raw_spin_unlock(&lock->wait_lock); -} - -void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state state; - - six_clear_owner(lock, type); - - EBUG_ON(!(lock->state.v & l[type].held_mask)); - EBUG_ON(type == SIX_LOCK_write && - !(lock->state.v & __SIX_LOCK_HELD_intent)); - - state.v = atomic64_add_return_release(l[type].unlock_val, - &lock->state.counter); - six_release(&lock->dep_map); - six_lock_wakeup(lock, state, l[type].unlock_wakeup); -} - -bool six_trylock_convert(struct six_lock *lock, - enum six_lock_type from, - enum six_lock_type to) -{ - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old, new; - u64 v = READ_ONCE(lock->state.v); - - do { - new.v = old.v = v; - new.v += l[from].unlock_val; - - if (new.v & l[to].lock_fail) - return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - new.v + l[to].lock_val)) != old.v); - - six_clear_owner(lock, from); - six_set_owner(lock, to); - - six_lock_wakeup(lock, new, l[from].unlock_wakeup); - - return true; -} - -/* - * Increment read/intent lock count, assuming we already have it read or intent - * locked: - */ -void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -{ - const struct six_lock_vals l[] = LOCK_VALS; - - EBUG_ON(type == SIX_LOCK_write); - six_acquire(&lock->dep_map, 0); - - /* XXX: 
assert already locked, and that we don't overflow: */ - - atomic64_add(l[type].lock_val, &lock->state.counter); -} - -/* Convert from intent to read: */ -void six_lock_downgrade(struct six_lock *lock) -{ - six_lock_increment(lock, SIX_LOCK_read); - six_unlock_intent(lock); -} diff --git a/libbcache/six.h b/libbcache/six.h deleted file mode 100644 index 01ed3385..00000000 --- a/libbcache/six.h +++ /dev/null @@ -1,136 +0,0 @@ - -#ifndef _BCACHE_SIX_H -#define _BCACHE_SIX_H - -#include <linux/lockdep.h> -#include <linux/osq_lock.h> -#include <linux/sched.h> -#include <linux/types.h> - -#include "util.h" - -/* - * LOCK STATES: - * - * read, intent, write (i.e. shared/intent/exclusive, hence the name) - * - * read and write work as with normal read/write locks - a lock can have - * multiple readers, but write excludes reads and other write locks. - * - * Intent does not block read, but it does block other intent locks. The idea is - * by taking an intent lock, you can then later upgrade to a write lock without - * dropping your read lock and without deadlocking - because no other thread has - * the intent lock and thus no other thread could be trying to take the write - * lock. - */ - -union six_lock_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - /* for waitlist_bitnr() */ - unsigned long l; - }; - - struct { - unsigned read_lock:26; - unsigned intent_lock:3; - unsigned waiters:3; - /* - * seq works much like in seqlocks: it's incremented every time - * we lock and unlock for write. - * - * If it's odd write lock is held, even unlocked. - * - * Thus readers can unlock, and then lock again later iff it - * hasn't been modified in the meantime. - */ - u32 seq; - }; -}; - -#define SIX_LOCK_MAX_RECURSE ((1 << 3) - 1) - -enum six_lock_type { - SIX_LOCK_read, - SIX_LOCK_intent, - SIX_LOCK_write, -}; - -struct six_lock { - union six_lock_state state; - struct task_struct *owner; - struct optimistic_spin_queue osq; - - raw_spinlock_t wait_lock; - struct list_head wait_list[3]; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -static __always_inline void __six_lock_init(struct six_lock *lock, - const char *name, - struct lock_class_key *key) -{ - atomic64_set(&lock->state.counter, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_write]); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - debug_check_no_locks_freed((void *) lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif -} - -#define six_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __six_lock_init((lock), #lock, &__key); \ -} while (0) - -bool six_trylock_type(struct six_lock *, enum six_lock_type); -bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned); -void six_lock_type(struct six_lock *, enum six_lock_type); -void six_unlock_type(struct six_lock *, enum six_lock_type); -bool six_trylock_convert(struct six_lock *, enum six_lock_type, - enum six_lock_type); -void six_lock_increment(struct six_lock *, enum six_lock_type); -void six_lock_downgrade(struct six_lock *); - -#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) - -#define __SIX_LOCK(type) \ -static __always_inline bool six_trylock_##type(struct six_lock *lock) \ -{ \ - return six_trylock_type(lock, SIX_LOCK_##type); \ -} \ - \ -static __always_inline bool six_relock_##type(struct six_lock *lock, u32 seq)\ -{ 
\ - return six_relock_type(lock, SIX_LOCK_##type, seq); \ -} \ - \ -static __always_inline void six_lock_##type(struct six_lock *lock) \ -{ \ - six_lock_type(lock, SIX_LOCK_##type); \ -} \ - \ -static __always_inline void six_unlock_##type(struct six_lock *lock) \ -{ \ - six_unlock_type(lock, SIX_LOCK_##type); \ -} - -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) - -#endif /* _BCACHE_SIX_H */ diff --git a/libbcache/stats.c b/libbcache/stats.c deleted file mode 100644 index a8a4eb36..00000000 --- a/libbcache/stats.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * bcache stats code - * - * Copyright 2012 Google, Inc. - */ - -#include "bcache.h" -#include "stats.h" -#include "sysfs.h" - -/* - * We keep absolute totals of various statistics, and addionally a set of three - * rolling averages. - * - * Every so often, a timer goes off and rescales the rolling averages. - * accounting_rescale[] is how many times the timer has to go off before we - * rescale each set of numbers; that gets us half lives of 5 minutes, one hour, - * and one day. - * - * accounting_delay is how often the timer goes off - 22 times in 5 minutes, - * and accounting_weight is what we use to rescale: - * - * pow(31 / 32, 22) ~= 1/2 - * - * So that we don't have to increment each set of numbers every time we (say) - * get a cache hit, we increment a single atomic_t in acc->collector, and when - * the rescale function runs it resets the atomic counter to 0 and adds its - * old value to each of the exported numbers. - * - * To reduce rounding error, the numbers in struct cache_stats are all - * stored left shifted by 16, and scaled back in the sysfs show() function. - */ - -static const unsigned DAY_RESCALE = 288; -static const unsigned HOUR_RESCALE = 12; -static const unsigned FIVE_MINUTE_RESCALE = 1; -static const unsigned accounting_delay = (HZ * 300) / 22; -static const unsigned accounting_weight = 5; - -/* sysfs reading/writing */ - -read_attribute(cache_hits); -read_attribute(cache_misses); -read_attribute(cache_bypass_hits); -read_attribute(cache_bypass_misses); -read_attribute(cache_hit_ratio); -read_attribute(cache_readaheads); -read_attribute(cache_miss_collisions); -read_attribute(bypassed); -read_attribute(foreground_write_ratio); -read_attribute(foreground_writes); -read_attribute(gc_writes); -read_attribute(discards); - -SHOW(bch_stats) -{ - struct cache_stats *s = - container_of(kobj, struct cache_stats, kobj); -#define var(stat) (s->stat >> 16) - var_print(cache_hits); - var_print(cache_misses); - var_print(cache_bypass_hits); - var_print(cache_bypass_misses); - - sysfs_print(cache_hit_ratio, - DIV_SAFE(var(cache_hits) * 100, - var(cache_hits) + var(cache_misses))); - - var_print(cache_readaheads); - var_print(cache_miss_collisions); - - sysfs_hprint(bypassed, var(sectors_bypassed) << 9); - sysfs_hprint(foreground_writes, var(foreground_write_sectors) << 9); - sysfs_hprint(gc_writes, var(gc_write_sectors) << 9); - sysfs_hprint(discards, var(discard_sectors) << 9); - - sysfs_print(foreground_write_ratio, - DIV_SAFE(var(foreground_write_sectors) * 100, - var(foreground_write_sectors) + - var(gc_write_sectors))); -#undef var - return 0; -} - -STORE(bch_stats) -{ - return size; -} - -static void bch_stats_release(struct kobject *k) -{ -} - -static struct attribute *bch_stats_files[] = { - &sysfs_cache_hits, - &sysfs_cache_misses, - &sysfs_cache_bypass_hits, - &sysfs_cache_bypass_misses, - &sysfs_cache_hit_ratio, - &sysfs_cache_readaheads, - &sysfs_cache_miss_collisions, - &sysfs_bypassed, - 
&sysfs_foreground_write_ratio, - &sysfs_foreground_writes, - &sysfs_gc_writes, - &sysfs_discards, - NULL -}; -static KTYPE(bch_stats); - -int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, - struct kobject *parent) -{ - int ret = kobject_add(&acc->total.kobj, parent, - "stats_total"); - ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, - "stats_five_minute"); - ret = ret ?: kobject_add(&acc->hour.kobj, parent, - "stats_hour"); - ret = ret ?: kobject_add(&acc->day.kobj, parent, - "stats_day"); - return ret; -} - -void bch_cache_accounting_clear(struct cache_accounting *acc) -{ - memset(&acc->total.cache_hits, - 0, - sizeof(unsigned long) * 9); -} - -void bch_cache_accounting_destroy(struct cache_accounting *acc) -{ - kobject_put(&acc->total.kobj); - kobject_put(&acc->five_minute.kobj); - kobject_put(&acc->hour.kobj); - kobject_put(&acc->day.kobj); - - atomic_set(&acc->closing, 1); - if (del_timer_sync(&acc->timer)) - closure_return(&acc->cl); -} - -/* EWMA scaling */ - -static void scale_stat(unsigned long *stat) -{ - *stat = ewma_add(*stat, 0, accounting_weight); -} - -static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) -{ - if (++stats->rescale == rescale_at) { - stats->rescale = 0; - scale_stat(&stats->cache_hits); - scale_stat(&stats->cache_misses); - scale_stat(&stats->cache_bypass_hits); - scale_stat(&stats->cache_bypass_misses); - scale_stat(&stats->cache_readaheads); - scale_stat(&stats->cache_miss_collisions); - scale_stat(&stats->sectors_bypassed); - scale_stat(&stats->foreground_write_sectors); - scale_stat(&stats->gc_write_sectors); - scale_stat(&stats->discard_sectors); - } -} - -static void scale_accounting(unsigned long data) -{ - struct cache_accounting *acc = (struct cache_accounting *) data; - -#define move_stat(name) do { \ - unsigned t = atomic_xchg(&acc->collector.name, 0); \ - t <<= 16; \ - acc->five_minute.name += t; \ - acc->hour.name += t; \ - acc->day.name += t; \ - acc->total.name += t; \ -} while (0) - - move_stat(cache_hits); - move_stat(cache_misses); - move_stat(cache_bypass_hits); - move_stat(cache_bypass_misses); - move_stat(cache_readaheads); - move_stat(cache_miss_collisions); - move_stat(sectors_bypassed); - move_stat(foreground_write_sectors); - move_stat(gc_write_sectors); - move_stat(discard_sectors); - - scale_stats(&acc->total, 0); - scale_stats(&acc->day, DAY_RESCALE); - scale_stats(&acc->hour, HOUR_RESCALE); - scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE); - - acc->timer.expires += accounting_delay; - - if (!atomic_read(&acc->closing)) - add_timer(&acc->timer); - else - closure_return(&acc->cl); -} - -void bch_cache_accounting_init(struct cache_accounting *acc, - struct closure *parent) -{ - kobject_init(&acc->total.kobj, &bch_stats_ktype); - kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); - kobject_init(&acc->hour.kobj, &bch_stats_ktype); - kobject_init(&acc->day.kobj, &bch_stats_ktype); - - closure_init(&acc->cl, parent); - init_timer(&acc->timer); - acc->timer.expires = jiffies + accounting_delay; - acc->timer.data = (unsigned long) acc; - acc->timer.function = scale_accounting; - add_timer(&acc->timer); -} diff --git a/libbcache/stats.h b/libbcache/stats.h deleted file mode 100644 index a3c7bd26..00000000 --- a/libbcache/stats.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef _BCACHE_STATS_H_ -#define _BCACHE_STATS_H_ - -#include "stats_types.h" - -struct bch_fs; -struct cached_dev; -struct bcache_device; - -#ifndef NO_BCACHE_ACCOUNTING - -void bch_cache_accounting_init(struct cache_accounting 
*, struct closure *); -int bch_cache_accounting_add_kobjs(struct cache_accounting *, struct kobject *); -void bch_cache_accounting_clear(struct cache_accounting *); -void bch_cache_accounting_destroy(struct cache_accounting *); - -#else - -static inline void bch_cache_accounting_init(struct cache_accounting *acc, - struct closure *cl) {} -static inline int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, - struct kobject *cl) -{ - return 0; -} -static inline void bch_cache_accounting_clear(struct cache_accounting *acc) {} -static inline void bch_cache_accounting_destroy(struct cache_accounting *acc) {} - -#endif - -static inline void mark_cache_stats(struct cache_stat_collector *stats, - bool hit, bool bypass) -{ - atomic_inc(&stats->cache_hit_array[!bypass][!hit]); -} - -static inline void bch_mark_cache_accounting(struct bch_fs *c, - struct cached_dev *dc, - bool hit, bool bypass) -{ - mark_cache_stats(&dc->accounting.collector, hit, bypass); - mark_cache_stats(&c->accounting.collector, hit, bypass); -} - -static inline void bch_mark_sectors_bypassed(struct bch_fs *c, - struct cached_dev *dc, - unsigned sectors) -{ - atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); - atomic_add(sectors, &c->accounting.collector.sectors_bypassed); -} - -static inline void bch_mark_gc_write(struct bch_fs *c, int sectors) -{ - atomic_add(sectors, &c->accounting.collector.gc_write_sectors); -} - -static inline void bch_mark_foreground_write(struct bch_fs *c, int sectors) -{ - atomic_add(sectors, &c->accounting.collector.foreground_write_sectors); -} - -static inline void bch_mark_discard(struct bch_fs *c, int sectors) -{ - atomic_add(sectors, &c->accounting.collector.discard_sectors); -} - -#endif /* _BCACHE_STATS_H_ */ diff --git a/libbcache/stats_types.h b/libbcache/stats_types.h deleted file mode 100644 index 28e4c69e..00000000 --- a/libbcache/stats_types.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef _BCACHE_STATS_TYPES_H_ -#define _BCACHE_STATS_TYPES_H_ - -struct cache_stat_collector { - union { - struct { - atomic_t cache_hits; - atomic_t cache_misses; - atomic_t cache_bypass_hits; - atomic_t cache_bypass_misses; - }; - - /* cache_hit_array[!bypass][!hit]: */ - atomic_t cache_hit_array[2][2]; - }; - - - atomic_t cache_readaheads; - atomic_t cache_miss_collisions; - atomic_t sectors_bypassed; - atomic_t foreground_write_sectors; - atomic_t gc_write_sectors; - atomic_t discard_sectors; -}; - -struct cache_stats { - struct kobject kobj; - - unsigned long cache_hits; - unsigned long cache_misses; - unsigned long cache_bypass_hits; - unsigned long cache_bypass_misses; - unsigned long cache_readaheads; - unsigned long cache_miss_collisions; - unsigned long sectors_bypassed; - unsigned long foreground_write_sectors; - unsigned long gc_write_sectors; - unsigned long discard_sectors; - - unsigned rescale; -}; - -struct cache_accounting { - struct closure cl; - struct timer_list timer; - atomic_t closing; - - struct cache_stat_collector collector; - - struct cache_stats total; - struct cache_stats five_minute; - struct cache_stats hour; - struct cache_stats day; -}; - -#endif /* _BCACHE_STATS_TYPES_H_ */ diff --git a/libbcache/str_hash.h b/libbcache/str_hash.h deleted file mode 100644 index 1173dfe8..00000000 --- a/libbcache/str_hash.h +++ /dev/null @@ -1,384 +0,0 @@ -#ifndef _BCACHE_STR_HASH_H -#define _BCACHE_STR_HASH_H - -#include "btree_iter.h" -#include "checksum.h" -#include "inode.h" -#include "siphash.h" -#include "super.h" - -#include <linux/crc32c.h> -#include <crypto/hash.h> - 
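/*
 * Generic btree-backed string hash tables (dirents and xattrs are built on
 * this): a bch_hash_desc describes one table - which btree it lives in,
 * which key and whiteout types it uses, and how to hash and compare its
 * keys - and the helpers below implement lookup, insert and delete with
 * linear probing (collisions and whiteouts are handled by scanning forward
 * from the hashed position, staying within the same inode).
 *
 * Rough usage sketch (illustrative only - "desc" stands for some table's
 * bch_hash_desc, "name" for its lookup key, "inode_u" for the unpacked
 * inode the table belongs to):
 *
 *	struct bch_hash_info info = bch_hash_info_init(&inode_u);
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *
 *	k = bch_hash_lookup(desc, &info, c, inum, &iter, name);
 *	if (!btree_iter_err(k))
 *		...use k, the matching key...
 *	bch_btree_iter_unlock(&iter);
 */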
-struct bch_hash_info { - u8 type; - union { - __le64 crc_key; - SIPHASH_KEY siphash_key; - }; -}; - -static inline struct bch_hash_info -bch_hash_info_init(const struct bch_inode_unpacked *bi) -{ - /* XXX ick */ - struct bch_hash_info info = { - .type = (bi->i_flags >> INODE_STR_HASH_OFFSET) & - ~(~0 << INODE_STR_HASH_BITS) - }; - - switch (info.type) { - case BCH_STR_HASH_CRC32C: - case BCH_STR_HASH_CRC64: - info.crc_key = bi->i_hash_seed; - break; - case BCH_STR_HASH_SIPHASH: { - SHASH_DESC_ON_STACK(desc, bch_sha256); - u8 digest[crypto_shash_digestsize(bch_sha256)]; - - desc->tfm = bch_sha256; - desc->flags = 0; - - crypto_shash_digest(desc, (void *) &bi->i_hash_seed, - sizeof(bi->i_hash_seed), digest); - memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); - break; - } - default: - BUG(); - } - - return info; -} - -struct bch_str_hash_ctx { - union { - u32 crc32c; - u64 crc64; - SIPHASH_CTX siphash; - }; -}; - -static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_CRC32C: - ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); - break; - case BCH_STR_HASH_CRC64: - ctx->crc64 = bch_crc64_update(~0, &info->crc_key, sizeof(info->crc_key)); - break; - case BCH_STR_HASH_SIPHASH: - SipHash24_Init(&ctx->siphash, &info->siphash_key); - break; - default: - BUG(); - } -} - -static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info, - const void *data, size_t len) -{ - switch (info->type) { - case BCH_STR_HASH_CRC32C: - ctx->crc32c = crc32c(ctx->crc32c, data, len); - break; - case BCH_STR_HASH_CRC64: - ctx->crc64 = bch_crc64_update(ctx->crc64, data, len); - break; - case BCH_STR_HASH_SIPHASH: - SipHash24_Update(&ctx->siphash, data, len); - break; - default: - BUG(); - } -} - -static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_CRC32C: - return ctx->crc32c; - case BCH_STR_HASH_CRC64: - return ctx->crc64 >> 1; - case BCH_STR_HASH_SIPHASH: - return SipHash24_End(&ctx->siphash) >> 1; - default: - BUG(); - } -} - -struct bch_hash_desc { - enum btree_id btree_id; - u8 key_type; - u8 whiteout_type; - - u64 (*hash_key)(const struct bch_hash_info *, const void *); - u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); - bool (*cmp_key)(struct bkey_s_c, const void *); - bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); -}; - -static inline struct bkey_s_c -bch_hash_lookup_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, const void *search) -{ - u64 inode = iter->pos.inode; - - do { - struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); - - if (btree_iter_err(k)) - return k; - - if (k.k->type == desc.key_type) { - if (!desc.cmp_key(k, search)) - return k; - } else if (k.k->type == desc.whiteout_type) { - ; - } else { - /* hole, not found */ - break; - } - - bch_btree_iter_advance_pos(iter); - } while (iter->pos.inode == inode); - - return bkey_s_c_err(-ENOENT); -} - -static inline struct bkey_s_c -bch_hash_lookup_bkey_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, struct bkey_s_c search) -{ - u64 inode = iter->pos.inode; - - do { - struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); - - if (btree_iter_err(k)) - return k; - - if (k.k->type == desc.key_type) { - if (!desc.cmp_bkey(k, search)) - return k; - } else if (k.k->type == 
desc.whiteout_type) { - ; - } else { - /* hole, not found */ - break; - } - - bch_btree_iter_advance_pos(iter); - } while (iter->pos.inode == inode); - - return bkey_s_c_err(-ENOENT); -} - -static inline struct bkey_s_c -bch_hash_lookup(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, const void *key) -{ - bch_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); - - return bch_hash_lookup_at(desc, info, iter, key); -} - -static inline struct bkey_s_c -bch_hash_lookup_intent(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, const void *key) -{ - bch_btree_iter_init_intent(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); - - return bch_hash_lookup_at(desc, info, iter, key); -} - -static inline struct bkey_s_c -bch_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter) -{ - while (1) { - struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); - - if (btree_iter_err(k)) - return k; - - if (k.k->type != desc.key_type) - return k; - - /* hash collision, keep going */ - bch_btree_iter_advance_pos(iter); - if (iter->pos.inode != k.k->p.inode) - return bkey_s_c_err(-ENOENT); - } -} - -static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, - const void *key) -{ - bch_btree_iter_init_intent(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); - - return bch_hash_hole_at(desc, iter); -} - -static inline int bch_hash_needs_whiteout(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, - struct btree_iter *start) -{ - bch_btree_iter_set_pos(iter, - btree_type_successor(start->btree_id, start->pos)); - - while (1) { - struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter); - int ret = btree_iter_err(k); - - if (ret) - return ret; - - if (k.k->type != desc.key_type && - k.k->type != desc.whiteout_type) - return false; - - if (k.k->type == desc.key_type && - desc.hash_bkey(info, k) <= start->pos.offset) - return true; - - bch_btree_iter_advance_pos(iter); - } -} - -#define BCH_HASH_SET_MUST_CREATE 1 -#define BCH_HASH_SET_MUST_REPLACE 2 - -static inline int bch_hash_set(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - u64 *journal_seq, - struct bkey_i *insert, int flags) -{ - struct btree_iter iter, hashed_slot; - struct bkey_s_c k; - int ret; - - bch_btree_iter_init_intent(&hashed_slot, c, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)))); - bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos); - bch_btree_iter_link(&hashed_slot, &iter); -retry: - /* - * On hash collision, we have to keep the slot we hashed to locked while - * we do the insert - to avoid racing with another thread deleting - * whatever's in the slot we hashed to: - */ - ret = bch_btree_iter_traverse(&hashed_slot); - if (ret) - goto err; - - /* - * On -EINTR/retry, we dropped locks - always restart from the slot we - * hashed to: - */ - bch_btree_iter_copy(&iter, &hashed_slot); - - k = bch_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert)); - - ret = btree_iter_err(k); - if (ret == -ENOENT) { - if (flags & BCH_HASH_SET_MUST_REPLACE) { - ret = -ENOENT; - goto err; - } - - /* - * Not found, so we're now looking for any open - * slot - we might have skipped 
over a whiteout - * that we could have used, so restart from the - * slot we hashed to: - */ - bch_btree_iter_copy(&iter, &hashed_slot); - k = bch_hash_hole_at(desc, &iter); - if ((ret = btree_iter_err(k))) - goto err; - } else if (!ret) { - if (flags & BCH_HASH_SET_MUST_CREATE) { - ret = -EEXIST; - goto err; - } - } else { - goto err; - } - - insert->k.p = iter.pos; - ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, insert)); -err: - if (ret == -EINTR) - goto retry; - - /* - * On successful insert, we don't want to clobber ret with error from - * iter: - */ - bch_btree_iter_unlock(&iter); - bch_btree_iter_unlock(&hashed_slot); - return ret; -} - -static inline int bch_hash_delete(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - u64 *journal_seq, const void *key) -{ - struct btree_iter iter, whiteout_iter; - struct bkey_s_c k; - struct bkey_i delete; - int ret = -ENOENT; - - bch_btree_iter_init_intent(&iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); - bch_btree_iter_init(&whiteout_iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); - bch_btree_iter_link(&iter, &whiteout_iter); -retry: - k = bch_hash_lookup_at(desc, info, &iter, key); - if ((ret = btree_iter_err(k))) - goto err; - - ret = bch_hash_needs_whiteout(desc, info, &whiteout_iter, &iter); - if (ret < 0) - goto err; - - bkey_init(&delete.k); - delete.k.p = k.k->p; - delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED; - - ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, &delete)); -err: - if (ret == -EINTR) - goto retry; - - bch_btree_iter_unlock(&whiteout_iter); - bch_btree_iter_unlock(&iter); - return ret; -} - -#endif /* _BCACHE_STR_HASH_H */ diff --git a/libbcache/super-io.c b/libbcache/super-io.c deleted file mode 100644 index 67c03e19..00000000 --- a/libbcache/super-io.c +++ /dev/null @@ -1,820 +0,0 @@ - -#include "bcache.h" -#include "blockdev.h" -#include "checksum.h" -#include "error.h" -#include "io.h" -#include "journal.h" -#include "super-io.h" -#include "super.h" -#include "vstructs.h" - -#include <linux/backing-dev.h> -#include <linux/sort.h> - -static inline void __bch_sb_layout_size_assert(void) -{ - BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -} - -struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb, - enum bch_sb_field_type type) -{ - struct bch_sb_field *f; - - /* XXX: need locking around superblock to access optional fields */ - - vstruct_for_each(sb, f) - if (le32_to_cpu(f->type) == type) - return f; - return NULL; -} - -void bch_free_super(struct bcache_superblock *sb) -{ - if (sb->bio) - bio_put(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, sb->mode); - - free_pages((unsigned long) sb->sb, sb->page_order); - memset(sb, 0, sizeof(*sb)); -} - -static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order) -{ - struct bch_sb *new_sb; - struct bio *bio; - - if (sb->page_order >= order && sb->sb) - return 0; - - if (dynamic_fault("bcache:add:super_realloc")) - return -ENOMEM; - - bio = bio_kmalloc(GFP_KERNEL, 1 << order); - if (!bio) - return -ENOMEM; - - if (sb->bio) - bio_put(sb->bio); - sb->bio = bio; - - new_sb = (void *) __get_free_pages(GFP_KERNEL, order); - if (!new_sb) - return -ENOMEM; - - if (sb->sb) - memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); - - free_pages((unsigned long) sb->sb, sb->page_order); - sb->sb = new_sb; - - 
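/*
 * The superblock buffer grows in whole page orders, and the bio above is
 * reallocated to match, so the entire buffer can always be read or written
 * with a single bio.
 */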
sb->page_order = order; - - return 0; -} - -static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s) -{ - u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); - u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; - - if (new_bytes > max_bytes) { - char buf[BDEVNAME_SIZE]; - - pr_err("%s: superblock too big: want %llu but have %llu", - bdevname(sb->bdev, buf), new_bytes, max_bytes); - return -ENOSPC; - } - - return __bch_super_realloc(sb, get_order(new_bytes)); -} - -static int bch_fs_sb_realloc(struct bch_fs *c, unsigned u64s) -{ - u64 bytes = __vstruct_bytes(struct bch_sb, u64s); - struct bch_sb *sb; - unsigned order = get_order(bytes); - - if (c->disk_sb && order <= c->disk_sb_order) - return 0; - - sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (!sb) - return -ENOMEM; - - if (c->disk_sb) - memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order); - - free_pages((unsigned long) c->disk_sb, c->disk_sb_order); - - c->disk_sb = sb; - c->disk_sb_order = order; - return 0; -} - -static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb, - struct bch_sb_field *f, - unsigned u64s) -{ - unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; - - if (!f) { - f = vstruct_last(sb); - memset(f, 0, sizeof(u64) * u64s); - f->u64s = cpu_to_le32(u64s); - f->type = 0; - } else { - void *src, *dst; - - src = vstruct_end(f); - f->u64s = cpu_to_le32(u64s); - dst = vstruct_end(f); - - memmove(dst, src, vstruct_end(sb) - src); - - if (dst > src) - memset(src, 0, dst - src); - } - - le32_add_cpu(&sb->u64s, u64s - old_u64s); - - return f; -} - -struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch_sb_field_get(sb->sb, type); - ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - - if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) - return NULL; - - f = __bch_sb_field_resize(sb->sb, f, u64s); - f->type = type; - return f; -} - -struct bch_sb_field *bch_fs_sb_field_resize(struct bch_fs *c, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type); - ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - struct bch_dev *ca; - unsigned i; - - lockdep_assert_held(&c->sb_lock); - - if (bch_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d)) - return NULL; - - /* XXX: we're not checking that offline device have enough space */ - - for_each_online_member(ca, c, i) { - struct bcache_superblock *sb = &ca->disk_sb; - - if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { - percpu_ref_put(&ca->ref); - return NULL; - } - } - - f = __bch_sb_field_resize(c->disk_sb, f, u64s); - f->type = type; - return f; -} - -static const char *validate_sb_layout(struct bch_sb_layout *layout) -{ - u64 offset, prev_offset, max_sectors; - unsigned i; - - if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) - return "Not a bcache superblock layout"; - - if (layout->layout_type != 0) - return "Invalid superblock layout type"; - - if (!layout->nr_superblocks) - return "Invalid superblock layout: no superblocks"; - - if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) - return "Invalid superblock layout: too many superblocks"; - - max_sectors = 1 << layout->sb_max_size_bits; - - prev_offset = le64_to_cpu(layout->sb_offset[0]); - - for (i = 1; i < layout->nr_superblocks; i++) { - offset = le64_to_cpu(layout->sb_offset[i]); - - if (offset < prev_offset + max_sectors) - return "Invalid superblock layout: superblocks overlap"; - prev_offset = offset; - } - - return NULL; -} - -static int u64_cmp(const void *_l, const void *_r) -{ - u64 l = *((const u64 *) _l), r = *((const u64 *) _r); - - return l < r ? -1 : l > r ? 1 : 0; -} - -const char *bch_validate_journal_layout(struct bch_sb *sb, - struct bch_member_cpu mi) -{ - struct bch_sb_field_journal *journal; - const char *err; - unsigned nr; - unsigned i; - u64 *b; - - journal = bch_sb_get_journal(sb); - if (!journal) - return NULL; - - nr = bch_nr_journal_buckets(journal); - if (!nr) - return NULL; - - b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); - if (!b) - return "cannot allocate memory"; - - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); - - sort(b, nr, sizeof(u64), u64_cmp, NULL); - - err = "journal bucket at sector 0"; - if (!b[0]) - goto err; - - err = "journal bucket before first bucket"; - if (b[0] < mi.first_bucket) - goto err; - - err = "journal bucket past end of device"; - if (b[nr - 1] >= mi.nbuckets) - goto err; - - err = "duplicate journal buckets"; - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) - goto err; - - err = NULL; -err: - kfree(b); - return err; -} - -static const char *bch_sb_validate_members(struct bch_sb *sb) -{ - struct bch_sb_field_members *mi; - unsigned i; - - mi = bch_sb_get_members(sb); - if (!mi) - return "Invalid superblock: member info area missing"; - - if ((void *) (mi->members + sb->nr_devices) > - vstruct_end(&mi->field)) - return "Invalid superblock: bad member info"; - - for (i = 0; i < sb->nr_devices; i++) { - if (bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) - continue; - - if (le16_to_cpu(mi->members[i].bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) - return "bucket size smaller than btree node size"; - } - - return NULL; -} - -const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) -{ - struct bch_sb *sb = disk_sb->sb; - struct bch_sb_field *f; - struct bch_sb_field_members *sb_mi; - struct bch_member_cpu mi; - const char *err; - u16 block_size; - - switch (le64_to_cpu(sb->version)) { - case BCACHE_SB_VERSION_CDEV_V4: - break; - default: - return"Unsupported superblock version"; - } - - if (BCH_SB_INITIALIZED(sb) 
&& - le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4) - return "Unsupported superblock version"; - - block_size = le16_to_cpu(sb->block_size); - - if (!is_power_of_2(block_size) || - block_size > PAGE_SECTORS) - return "Bad block size"; - - if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le))) - return "Bad user UUID"; - - if (bch_is_zero(sb->uuid.b, sizeof(uuid_le))) - return "Bad internal UUID"; - - if (!sb->nr_devices || - sb->nr_devices <= sb->dev_idx || - sb->nr_devices > BCH_SB_MEMBERS_MAX) - return "Bad cache device number in set"; - - if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_META_REPLICAS_HAVE(sb) || - BCH_SB_META_REPLICAS_HAVE(sb) > - BCH_SB_META_REPLICAS_WANT(sb)) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; - - if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_DATA_REPLICAS_HAVE(sb) || - BCH_SB_DATA_REPLICAS_HAVE(sb) > - BCH_SB_DATA_REPLICAS_WANT(sb)) - return "Invalid number of data replicas"; - - if (!BCH_SB_BTREE_NODE_SIZE(sb)) - return "Btree node size not set"; - - if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) - return "Btree node size not a power of two"; - - if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX) - return "Btree node size too large"; - - if (BCH_SB_GC_RESERVE(sb) < 5) - return "gc reserve percentage too small"; - - if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size) - return "max journal entry size too small"; - - /* 4 mb max: */ - if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX) - return "max journal entry size too big"; - - if (!sb->time_precision || - le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) - return "invalid time precision"; - - /* validate layout */ - err = validate_sb_layout(&sb->layout); - if (err) - return err; - - vstruct_for_each(sb, f) { - if (!f->u64s) - return "Invalid superblock: invalid optional field"; - - if (vstruct_next(f) > vstruct_last(sb)) - return "Invalid superblock: invalid optional field"; - - if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR) - return "Invalid superblock: unknown optional field type"; - } - - err = bch_sb_validate_members(sb); - if (err) - return err; - - sb_mi = bch_sb_get_members(sb); - mi = bch_mi_to_cpu(sb_mi->members + sb->dev_idx); - - if (mi.nbuckets > LONG_MAX) - return "Too many buckets"; - - if (mi.nbuckets - mi.first_bucket < 1 << 10) - return "Not enough buckets"; - - if (!is_power_of_2(mi.bucket_size) || - mi.bucket_size < PAGE_SECTORS || - mi.bucket_size < block_size) - return "Bad bucket size"; - - if (get_capacity(disk_sb->bdev->bd_disk) < - mi.bucket_size * mi.nbuckets) - return "Invalid superblock: device too small"; - - err = bch_validate_journal_layout(sb, mi); - if (err) - return err; - - return NULL; -} - -/* device open: */ - -static const char *bch_blkdev_open(const char *path, fmode_t mode, - void *holder, struct block_device **ret) -{ - struct block_device *bdev; - - *ret = NULL; - bdev = blkdev_get_by_path(path, mode, holder); - if (bdev == ERR_PTR(-EBUSY)) - return "device busy"; - - if (IS_ERR(bdev)) - return "failed to open device"; - - if (mode & FMODE_WRITE) - 
bdev_get_queue(bdev)->backing_dev_info.capabilities - |= BDI_CAP_STABLE_WRITES; - - *ret = bdev; - return NULL; -} - -static void bch_sb_update(struct bch_fs *c) -{ - struct bch_sb *src = c->disk_sb; - struct bch_sb_field_members *mi = bch_sb_get_members(src); - struct bch_dev *ca; - unsigned i; - - lockdep_assert_held(&c->sb_lock); - - c->sb.uuid = src->uuid; - c->sb.user_uuid = src->user_uuid; - c->sb.block_size = le16_to_cpu(src->block_size); - c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src); - c->sb.nr_devices = src->nr_devices; - c->sb.clean = BCH_SB_CLEAN(src); - c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src); - c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src); - c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src); - c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); - c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); - c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); - c->sb.time_precision = le32_to_cpu(src->time_precision); - - for_each_member_device(ca, c, i) - ca->mi = bch_mi_to_cpu(mi->members + i); -} - -/* doesn't copy member info */ -static void __copy_super(struct bch_sb *dst, struct bch_sb *src) -{ - struct bch_sb_field *src_f, *dst_f; - - dst->version = src->version; - dst->seq = src->seq; - dst->uuid = src->uuid; - dst->user_uuid = src->user_uuid; - memcpy(dst->label, src->label, sizeof(dst->label)); - - dst->block_size = src->block_size; - dst->nr_devices = src->nr_devices; - - dst->time_base_lo = src->time_base_lo; - dst->time_base_hi = src->time_base_hi; - dst->time_precision = src->time_precision; - - memcpy(dst->flags, src->flags, sizeof(dst->flags)); - memcpy(dst->features, src->features, sizeof(dst->features)); - memcpy(dst->compat, src->compat, sizeof(dst->compat)); - - vstruct_for_each(src, src_f) { - if (src_f->type == BCH_SB_FIELD_journal) - continue; - - dst_f = bch_sb_field_get(dst, src_f->type); - dst_f = __bch_sb_field_resize(dst, dst_f, - le32_to_cpu(src_f->u64s)); - - memcpy(dst_f, src_f, vstruct_bytes(src_f)); - } -} - -int bch_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -{ - struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(src); - unsigned journal_u64s = journal_buckets - ? le32_to_cpu(journal_buckets->field.u64s) - : 0; - - lockdep_assert_held(&c->sb_lock); - - if (bch_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s)) - return -ENOMEM; - - __copy_super(c->disk_sb, src); - bch_sb_update(c); - - return 0; -} - -int bch_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -{ - struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb; - struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(dst); - unsigned journal_u64s = journal_buckets - ? 
le32_to_cpu(journal_buckets->field.u64s) - : 0; - unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; - int ret; - - ret = bch_sb_realloc(&ca->disk_sb, u64s); - if (ret) - return ret; - - __copy_super(dst, src); - - return 0; -} - -/* read superblock: */ - -static const char *read_one_super(struct bcache_superblock *sb, u64 offset) -{ - struct bch_csum csum; - size_t bytes; - unsigned order; -reread: - bio_reset(sb->bio); - sb->bio->bi_bdev = sb->bdev; - sb->bio->bi_iter.bi_sector = offset; - sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; - bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); - bch_bio_map(sb->bio, sb->sb); - - if (submit_bio_wait(sb->bio)) - return "IO error"; - - if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) - return "Not a bcache superblock"; - - if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4) - return "Unsupported superblock version"; - - bytes = vstruct_bytes(sb->sb); - - if (bytes > 512 << sb->sb->layout.sb_max_size_bits) - return "Bad superblock: too big"; - - order = get_order(bytes); - if (order > sb->page_order) { - if (__bch_super_realloc(sb, order)) - return "cannot allocate memory"; - goto reread; - } - - if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) - return "unknown csum type"; - - /* XXX: verify MACs */ - csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), - (struct nonce) { 0 }, sb->sb); - - if (bch_crc_cmp(csum, sb->sb->csum)) - return "bad checksum reading superblock"; - - return NULL; -} - -const char *bch_read_super(struct bcache_superblock *sb, - struct bch_opts opts, - const char *path) -{ - u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR; - struct bch_sb_layout layout; - const char *err; - unsigned i; - - memset(sb, 0, sizeof(*sb)); - sb->mode = FMODE_READ; - - if (!(opt_defined(opts.noexcl) && opts.noexcl)) - sb->mode |= FMODE_EXCL; - - if (!(opt_defined(opts.nochanges) && opts.nochanges)) - sb->mode |= FMODE_WRITE; - - err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev); - if (err) - return err; - - err = "cannot allocate memory"; - if (__bch_super_realloc(sb, 0)) - goto err; - - err = "dynamic fault"; - if (bch_fs_init_fault("read_super")) - goto err; - - err = read_one_super(sb, offset); - if (!err) - goto got_super; - - if (offset != BCH_SB_SECTOR) { - pr_err("error reading superblock: %s", err); - goto err; - } - - pr_err("error reading default superblock: %s", err); - - /* - * Error reading primary superblock - read location of backup - * superblocks: - */ - bio_reset(sb->bio); - sb->bio->bi_bdev = sb->bdev; - sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); - bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); - /* - * use sb buffer to read layout, since sb buffer is page aligned but - * layout won't be: - */ - bch_bio_map(sb->bio, sb->sb); - - err = "IO error"; - if (submit_bio_wait(sb->bio)) - goto err; - - memcpy(&layout, sb->sb, sizeof(layout)); - err = validate_sb_layout(&layout); - if (err) - goto err; - - for (i = 0; i < layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout.sb_offset[i]); - - if (offset == BCH_SB_SECTOR) - continue; - - err = read_one_super(sb, offset); - if (!err) - goto got_super; - } - goto err; -got_super: - pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", - le64_to_cpu(sb->sb->version), - le64_to_cpu(sb->sb->flags), - le64_to_cpu(sb->sb->seq), - le16_to_cpu(sb->sb->u64s)); - - err = "Superblock block size smaller than device block size"; - if 
(le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev)) - goto err; - - return NULL; -err: - bch_free_super(sb); - return err; -} - -/* write superblock: */ - -static void write_super_endio(struct bio *bio) -{ - struct bch_dev *ca = bio->bi_private; - - /* XXX: return errors directly */ - - bch_dev_fatal_io_err_on(bio->bi_error, ca, "superblock write"); - - bch_account_io_completion(ca); - - closure_put(&ca->fs->sb_write); - percpu_ref_put(&ca->io_ref); -} - -static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -{ - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - if (idx >= sb->layout.nr_superblocks) - return false; - - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - sb->offset = sb->layout.sb_offset[idx]; - - SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); - sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), - (struct nonce) { 0 }, sb); - - bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; - bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); - bio->bi_iter.bi_size = - roundup(vstruct_bytes(sb), - bdev_logical_block_size(ca->disk_sb.bdev)); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); - bch_bio_map(bio, sb); - - closure_bio_submit_punt(bio, &c->sb_write, c); - return true; -} - -void bch_write_super(struct bch_fs *c) -{ - struct closure *cl = &c->sb_write; - struct bch_dev *ca; - unsigned i, super_idx = 0; - bool wrote; - - lockdep_assert_held(&c->sb_lock); - - closure_init_stack(cl); - - le64_add_cpu(&c->disk_sb->seq, 1); - - for_each_online_member(ca, c, i) - bch_sb_from_fs(c, ca); - - if (c->opts.nochanges) - goto out; - - do { - wrote = false; - for_each_online_member(ca, c, i) - if (write_one_super(c, ca, super_idx)) - wrote = true; - - closure_sync(cl); - super_idx++; - } while (wrote); -out: - /* Make new options visible after they're persistent: */ - bch_sb_update(c); -} - -void bch_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k, - bool meta) -{ - struct bch_member *mi; - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - unsigned nr_replicas = 0; - - mutex_lock(&c->sb_lock); - - /* recheck, might have raced */ - if (bch_check_super_marked(c, k, meta)) { - mutex_unlock(&c->sb_lock); - return; - } - - mi = bch_sb_get_members(c->disk_sb)->members; - - extent_for_each_ptr(e, ptr) - if (!ptr->cached) { - (meta - ? SET_BCH_MEMBER_HAS_METADATA - : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true); - nr_replicas++; - } - - nr_replicas = min_t(unsigned, nr_replicas, - (meta - ? BCH_SB_META_REPLICAS_HAVE - : BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb)); - (meta - ? 
SET_BCH_SB_META_REPLICAS_HAVE - : SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas); - - bch_write_super(c); - mutex_unlock(&c->sb_lock); -} diff --git a/libbcache/super-io.h b/libbcache/super-io.h deleted file mode 100644 index 1a9bd309..00000000 --- a/libbcache/super-io.h +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef _BCACHE_SUPER_IO_H -#define _BCACHE_SUPER_IO_H - -#include "extents.h" -#include "super_types.h" - -#include <asm/byteorder.h> - -struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *, - enum bch_sb_field_type, unsigned); -struct bch_sb_field *bch_fs_sb_field_resize(struct bch_fs *, - enum bch_sb_field_type, unsigned); - -#define field_to_type(_f, _name) \ - container_of_or_null(_f, struct bch_sb_field_##_name, field) - -#define BCH_SB_FIELD_TYPE(_name) \ -static inline struct bch_sb_field_##_name * \ -bch_sb_get_##_name(struct bch_sb *sb) \ -{ \ - return field_to_type(bch_sb_field_get(sb, \ - BCH_SB_FIELD_##_name), _name); \ -} \ - \ -static inline struct bch_sb_field_##_name * \ -bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \ -{ \ - return field_to_type(bch_sb_field_resize(sb, \ - BCH_SB_FIELD_##_name, u64s), _name); \ -} \ - \ -static inline struct bch_sb_field_##_name * \ -bch_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \ -{ \ - return field_to_type(bch_fs_sb_field_resize(c, \ - BCH_SB_FIELD_##_name, u64s), _name); \ -} - -BCH_SB_FIELD_TYPE(journal); -BCH_SB_FIELD_TYPE(members); -BCH_SB_FIELD_TYPE(crypt); - -static inline bool bch_sb_test_feature(struct bch_sb *sb, - enum bch_sb_features f) -{ - unsigned w = f / 64; - unsigned b = f % 64; - - return le64_to_cpu(sb->features[w]) & (1ULL << b); -} - -static inline void bch_sb_set_feature(struct bch_sb *sb, - enum bch_sb_features f) -{ - if (!bch_sb_test_feature(sb, f)) { - unsigned w = f / 64; - unsigned b = f % 64; - - le64_add_cpu(&sb->features[w], 1ULL << b); - } -} - -static inline __le64 bch_sb_magic(struct bch_fs *c) -{ - __le64 ret; - memcpy(&ret, &c->sb.uuid, sizeof(ret)); - return ret; -} - -static inline __u64 jset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch_sb_magic(c) ^ JSET_MAGIC); -} - -static inline __u64 pset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch_sb_magic(c) ^ PSET_MAGIC); -} - -static inline __u64 bset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch_sb_magic(c) ^ BSET_MAGIC); -} - -static inline struct bch_member_cpu bch_mi_to_cpu(struct bch_member *mi) -{ - return (struct bch_member_cpu) { - .nbuckets = le64_to_cpu(mi->nbuckets), - .first_bucket = le16_to_cpu(mi->first_bucket), - .bucket_size = le16_to_cpu(mi->bucket_size), - .state = BCH_MEMBER_STATE(mi), - .tier = BCH_MEMBER_TIER(mi), - .has_metadata = BCH_MEMBER_HAS_METADATA(mi), - .has_data = BCH_MEMBER_HAS_DATA(mi), - .replacement = BCH_MEMBER_REPLACEMENT(mi), - .discard = BCH_MEMBER_DISCARD(mi), - .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)), - }; -} - -int bch_sb_to_fs(struct bch_fs *, struct bch_sb *); -int bch_sb_from_fs(struct bch_fs *, struct bch_dev *); - -void bch_free_super(struct bcache_superblock *); -int bch_super_realloc(struct bcache_superblock *, unsigned); - -const char *bch_validate_journal_layout(struct bch_sb *, - struct bch_member_cpu); -const char *bch_validate_cache_super(struct bcache_superblock *); - -const char *bch_read_super(struct bcache_superblock *, - struct bch_opts, const char *); -void bch_write_super(struct bch_fs *); - -void 
bch_check_mark_super_slowpath(struct bch_fs *, - const struct bkey_i *, bool); - -static inline bool bch_check_super_marked(struct bch_fs *c, - const struct bkey_i *k, bool meta) -{ - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - unsigned nr_replicas = 0; - bool ret = true; - - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; - - if (ptr->cached) - continue; - - if (!(meta - ? ca->mi.has_metadata - : ca->mi.has_data)) { - ret = false; - break; - } - - nr_replicas++; - } - - if (nr_replicas < - (meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have)) - ret = false; - - return ret; -} - -static inline void bch_check_mark_super(struct bch_fs *c, - const struct bkey_i *k, bool meta) -{ - if (bch_check_super_marked(c, k, meta)) - return; - - bch_check_mark_super_slowpath(c, k, meta); -} - -#endif /* _BCACHE_SUPER_IO_H */ diff --git a/libbcache/super.c b/libbcache/super.c deleted file mode 100644 index f5f74936..00000000 --- a/libbcache/super.c +++ /dev/null @@ -1,2047 +0,0 @@ -/* - * bcache setup/teardown code, and some metadata io - read a superblock and - * figure out what to do with it. - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include "bcache.h" -#include "blockdev.h" -#include "alloc.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_io.h" -#include "chardev.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "debug.h" -#include "error.h" -#include "fs.h" -#include "fs-gc.h" -#include "inode.h" -#include "io.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "migrate.h" -#include "movinggc.h" -#include "notify.h" -#include "stats.h" -#include "super.h" -#include "super-io.h" -#include "tier.h" -#include "writeback.h" - -#include <linux/backing-dev.h> -#include <linux/blkdev.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/genhd.h> -#include <linux/idr.h> -#include <linux/kthread.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/random.h> -#include <linux/reboot.h> -#include <linux/sysfs.h> -#include <crypto/hash.h> - -#include <trace/events/bcache.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); - -static const uuid_le invalid_uuid = { - .b = { - 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78, - 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99 - } -}; - -static struct kset *bcache_kset; -static LIST_HEAD(bch_fs_list); -static DEFINE_MUTEX(bch_fs_list_lock); - -static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); -struct workqueue_struct *bcache_io_wq; -struct crypto_shash *bch_sha256; - -static void bch_dev_free(struct bch_dev *); -static int bch_dev_alloc(struct bch_fs *, unsigned); -static int bch_dev_sysfs_online(struct bch_dev *); -static void __bch_dev_read_only(struct bch_fs *, struct bch_dev *); - -struct bch_fs *bch_bdev_to_fs(struct block_device *bdev) -{ - struct bch_fs *c; - struct bch_dev *ca; - unsigned i; - - mutex_lock(&bch_fs_list_lock); - rcu_read_lock(); - - list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(ca, c, i) - if (ca->disk_sb.bdev == bdev) { - closure_get(&c->cl); - goto found; - } - c = NULL; -found: - rcu_read_unlock(); - mutex_unlock(&bch_fs_list_lock); - - return c; -} - -static struct bch_fs *__bch_uuid_to_fs(uuid_le uuid) -{ - struct bch_fs *c; - - lockdep_assert_held(&bch_fs_list_lock); - - 
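bch_check_mark_super() just above is a fast-path/slow-path pair: the unlocked bch_check_super_marked() test runs on every insert, and sb_lock is only taken, with the test repeated under it, when the superblock actually looks like it needs updating. A minimal userspace sketch of that check-then-recheck-under-lock shape, with invented demo_* names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t demo_sb_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool demo_marked;

static void demo_mark_slowpath(void)
{
        pthread_mutex_lock(&demo_sb_lock);

        /* recheck: another thread may have raced us and done the update */
        if (!atomic_load(&demo_marked)) {
                /* ... update and persist the superblock here ... */
                atomic_store(&demo_marked, true);
        }

        pthread_mutex_unlock(&demo_sb_lock);
}

static void demo_check_mark(void)
{
        if (atomic_load(&demo_marked))  /* fast path: no lock taken */
                return;

        demo_mark_slowpath();
}

int main(void)
{
        demo_check_mark();              /* slow path: marks under the lock */
        demo_check_mark();              /* fast path: already marked */
        return 0;
}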
list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) - return c; - - return NULL; -} - -struct bch_fs *bch_uuid_to_fs(uuid_le uuid) -{ - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - c = __bch_uuid_to_fs(uuid); - if (c) - closure_get(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return c; -} - -int bch_congested(struct bch_fs *c, int bdi_bits) -{ - struct backing_dev_info *bdi; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - if (bdi_bits & (1 << WB_sync_congested)) { - /* Reads - check all devices: */ - for_each_readable_member(ca, c, i) { - bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } else { - /* Writes prefer fastest tier: */ - struct bch_tier *tier = READ_ONCE(c->fastest_tier); - struct dev_group *grp = tier ? &tier->devs : &c->all_devs; - - rcu_read_lock(); - group_for_each_dev(ca, grp, i) { - bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - } - - return ret; -} - -static int bch_congested_fn(void *data, int bdi_bits) -{ - struct bch_fs *c = data; - - return bch_congested(c, bdi_bits); -} - -/* Filesystem RO/RW: */ - -/* - * For startup/shutdown of RW stuff, the dependencies are: - * - * - foreground writes depend on copygc and tiering (to free up space) - * - * - copygc and tiering depend on mark and sweep gc (they actually probably - * don't because they either reserve ahead of time or don't block if - * allocations fail, but allocations can require mark and sweep gc to run - * because of generation number wraparound) - * - * - all of the above depends on the allocator threads - * - * - allocator depends on the journal (when it rewrites prios and gens) - */ - -static void __bch_fs_read_only(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - - bch_tiering_stop(c); - - for_each_member_device(ca, c, i) - bch_moving_gc_stop(ca); - - bch_gc_thread_stop(c); - - bch_btree_flush(c); - - for_each_member_device(ca, c, i) - bch_dev_allocator_stop(ca); - - bch_fs_journal_stop(&c->journal); -} - -static void bch_writes_disabled(struct percpu_ref *writes) -{ - struct bch_fs *c = container_of(writes, struct bch_fs, writes); - - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); -} - -void bch_fs_read_only(struct bch_fs *c) -{ - mutex_lock(&c->state_lock); - if (c->state != BCH_FS_STARTING && - c->state != BCH_FS_RW) - goto out; - - if (test_bit(BCH_FS_ERROR, &c->flags)) - goto out; - - trace_fs_read_only(c); - - /* - * Block new foreground-end write operations from starting - any new - * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch_dev_allocator_stop()). 
- */ - percpu_ref_kill(&c->writes); - - del_timer(&c->foreground_write_wakeup); - cancel_delayed_work(&c->pd_controllers_update); - - c->foreground_write_pd.rate.rate = UINT_MAX; - bch_wake_delayed_writes((unsigned long) c); - - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious errors due - * to shutting down the allocator: - * - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || - test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); - - __bch_fs_read_only(c); - - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); - - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - - if (!bch_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb, true); - bch_write_super(c); - mutex_unlock(&c->sb_lock); - } - - c->state = BCH_FS_RO; - bch_notify_fs_read_only(c); - trace_fs_read_only_done(c); -out: - mutex_unlock(&c->state_lock); -} - -static void bch_fs_read_only_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, read_only_work); - - bch_fs_read_only(c); -} - -static void bch_fs_read_only_async(struct bch_fs *c) -{ - queue_work(system_long_wq, &c->read_only_work); -} - -bool bch_fs_emergency_read_only(struct bch_fs *c) -{ - bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - - bch_fs_read_only_async(c); - bch_journal_halt(&c->journal); - - wake_up(&bch_read_only_wait); - return ret; -} - -const char *bch_fs_read_write(struct bch_fs *c) -{ - struct bch_dev *ca; - const char *err = NULL; - unsigned i; - - mutex_lock(&c->state_lock); - if (c->state != BCH_FS_STARTING && - c->state != BCH_FS_RO) - goto out; - - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - err = "error starting btree GC thread"; - if (bch_gc_thread_start(c)) - goto err; - - err = "error starting moving GC thread"; - for_each_rw_member(ca, c, i) - if (bch_moving_gc_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - err = "error starting tiering thread"; - if (bch_tiering_start(c)) - goto err; - - schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); - - if (c->state != BCH_FS_STARTING) - percpu_ref_reinit(&c->writes); - - c->state = BCH_FS_RW; - err = NULL; -out: - mutex_unlock(&c->state_lock); - return err; -err: - __bch_fs_read_only(c); - goto out; -} - -/* Filesystem startup/shutdown: */ - -static void bch_fs_free(struct bch_fs *c) -{ - bch_fs_encryption_exit(c); - bch_fs_btree_exit(c); - bch_fs_journal_exit(&c->journal); - bch_io_clock_exit(&c->io_clock[WRITE]); - bch_io_clock_exit(&c->io_clock[READ]); - bch_fs_compress_exit(c); - bch_fs_blockdev_exit(c); - bdi_destroy(&c->bdi); - lg_lock_free(&c->usage_lock); - free_percpu(c->usage_percpu); - mempool_exit(&c->btree_bounce_pool); - mempool_exit(&c->bio_bounce_pages); - bioset_exit(&c->bio_write); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); - bioset_exit(&c->btree_read_bio); - mempool_exit(&c->btree_interior_update_pool); - mempool_exit(&c->btree_reserve_pool); - 
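The read-only transition above hinges on the c->writes reference: percpu_ref_kill() makes new foreground writes fail with -EROFS, and teardown then waits for in-flight writes to drain before the allocator is stopped. The sketch below is only a userspace analogy of that gate, built from a plain mutex and condition variable rather than the kernel's percpu_ref machinery, with invented names:

#include <pthread.h>
#include <stdbool.h>

struct write_gate {
        pthread_mutex_t lock;
        pthread_cond_t  drained;
        bool            open;
        unsigned        in_flight;
};

static struct write_gate demo_gate = {
        .lock    = PTHREAD_MUTEX_INITIALIZER,
        .drained = PTHREAD_COND_INITIALIZER,
        .open    = true,
};

static bool write_gate_enter(struct write_gate *g)
{
        bool ok;

        pthread_mutex_lock(&g->lock);
        ok = g->open;
        if (ok)
                g->in_flight++;
        pthread_mutex_unlock(&g->lock);
        return ok;                      /* false => caller returns -EROFS */
}

static void write_gate_exit(struct write_gate *g)
{
        pthread_mutex_lock(&g->lock);
        if (!--g->in_flight)
                pthread_cond_broadcast(&g->drained);
        pthread_mutex_unlock(&g->lock);
}

static void write_gate_close_and_drain(struct write_gate *g)
{
        pthread_mutex_lock(&g->lock);
        g->open = false;                /* block new writes */
        while (g->in_flight)            /* wait for outstanding ones */
                pthread_cond_wait(&g->drained, &g->lock);
        pthread_mutex_unlock(&g->lock);
}

int main(void)
{
        if (write_gate_enter(&demo_gate))
                write_gate_exit(&demo_gate);     /* a write completed */

        write_gate_close_and_drain(&demo_gate);  /* "go read-only" */
        return 0;
}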
mempool_exit(&c->fill_iter); - percpu_ref_exit(&c->writes); - - if (c->copygc_wq) - destroy_workqueue(c->copygc_wq); - if (c->wq) - destroy_workqueue(c->wq); - - free_pages((unsigned long) c->disk_sb, c->disk_sb_order); - kfree(c); - module_put(THIS_MODULE); -} - -static void bch_fs_exit(struct bch_fs *c) -{ - unsigned i; - - del_timer_sync(&c->foreground_write_wakeup); - cancel_delayed_work_sync(&c->pd_controllers_update); - cancel_work_sync(&c->read_only_work); - cancel_work_sync(&c->bio_submit_work); - cancel_work_sync(&c->read_retry_work); - - for (i = 0; i < c->sb.nr_devices; i++) - if (c->devs[i]) - bch_dev_free(c->devs[i]); - - closure_debug_destroy(&c->cl); - kobject_put(&c->kobj); -} - -static void bch_fs_offline(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); - - for_each_member_device(ca, c, i) - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcache"); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch_fs_debug_exit(c); - bch_fs_chardev_exit(c); - - bch_cache_accounting_destroy(&c->accounting); - - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - __bch_fs_read_only(c); -} - -/* - * should be __bch_fs_stop4 - block devices are closed, now we can finally - * free it - */ -void bch_fs_release(struct kobject *kobj) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - bch_notify_fs_stopped(c); - bch_fs_free(c); -} - -/* - * All activity on the filesystem should have stopped now - close devices: - */ -static void __bch_fs_stop3(struct closure *cl) -{ - struct bch_fs *c = container_of(cl, struct bch_fs, cl); - - bch_fs_exit(c); -} - -/* - * Openers (i.e. 
block devices) should have exited, shutdown all userspace - * interfaces and wait for &c->cl to hit 0 - */ -static void __bch_fs_stop2(struct closure *cl) -{ - struct bch_fs *c = container_of(cl, struct bch_fs, caching); - - bch_fs_offline(c); - - closure_return(cl); -} - -/* - * First phase of the shutdown process that's kicked off by bch_fs_stop_async(); - * we haven't waited for anything to stop yet, we're just punting to process - * context to shut down block devices: - */ -static void __bch_fs_stop1(struct closure *cl) -{ - struct bch_fs *c = container_of(cl, struct bch_fs, caching); - - bch_blockdevs_stop(c); - - continue_at(cl, __bch_fs_stop2, system_wq); -} - -void bch_fs_stop_async(struct bch_fs *c) -{ - mutex_lock(&c->state_lock); - if (c->state != BCH_FS_STOPPING) { - c->state = BCH_FS_STOPPING; - closure_queue(&c->caching); - } - mutex_unlock(&c->state_lock); -} - -void bch_fs_stop(struct bch_fs *c) -{ - mutex_lock(&c->state_lock); - BUG_ON(c->state == BCH_FS_STOPPING); - c->state = BCH_FS_STOPPING; - mutex_unlock(&c->state_lock); - - bch_blockdevs_stop(c); - - closure_sync(&c->caching); - closure_debug_destroy(&c->caching); - - bch_fs_offline(c); - - closure_put(&c->cl); - closure_sync(&c->cl); - - bch_fs_exit(c); -} - -/* Stop, detaching from backing devices: */ -void bch_fs_detach(struct bch_fs *c) -{ - if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags)) - bch_fs_stop_async(c); -} - -#define alloc_bucket_pages(gfp, ca) \ - ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca)))) - -static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) -{ - struct bch_sb_field_members *mi; - struct bch_fs *c; - unsigned i, iter_size, journal_entry_bytes; - - c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL); - if (!c) - return NULL; - - __module_get(THIS_MODULE); - - c->minor = -1; - - mutex_init(&c->state_lock); - mutex_init(&c->sb_lock); - INIT_RADIX_TREE(&c->devices, GFP_KERNEL); - mutex_init(&c->btree_cache_lock); - mutex_init(&c->bucket_lock); - mutex_init(&c->btree_root_lock); - INIT_WORK(&c->read_only_work, bch_fs_read_only_work); - - init_rwsem(&c->gc_lock); - -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - spin_lock_init(&c->name##_time.lock); - BCH_TIME_STATS() -#undef BCH_TIME_STAT - - bch_fs_allocator_init(c); - bch_fs_tiering_init(c); - - INIT_LIST_HEAD(&c->list); - INIT_LIST_HEAD(&c->cached_devs); - INIT_LIST_HEAD(&c->btree_cache); - INIT_LIST_HEAD(&c->btree_cache_freeable); - INIT_LIST_HEAD(&c->btree_cache_freed); - - INIT_LIST_HEAD(&c->btree_interior_update_list); - mutex_init(&c->btree_reserve_cache_lock); - mutex_init(&c->btree_interior_update_lock); - - mutex_init(&c->bio_bounce_pages_lock); - INIT_WORK(&c->bio_submit_work, bch_bio_submit_work); - spin_lock_init(&c->bio_submit_lock); - bio_list_init(&c->read_retry_list); - spin_lock_init(&c->read_retry_lock); - INIT_WORK(&c->read_retry_work, bch_read_retry_work); - mutex_init(&c->zlib_workspace_lock); - - seqcount_init(&c->gc_pos_lock); - - c->prio_clock[READ].hand = 1; - c->prio_clock[READ].min_prio = 0; - c->prio_clock[WRITE].hand = 1; - c->prio_clock[WRITE].min_prio = 0; - - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; - c->error_limit = 16 << IO_ERROR_SHIFT; - init_waitqueue_head(&c->writeback_wait); - - c->writeback_pages_max = (256 << 10) / PAGE_SIZE; - - c->copy_gc_enabled = 1; - c->tiering_enabled = 1; - c->tiering_percent = 10; - - c->foreground_target_percent = 20; - - c->journal.write_time = &c->journal_write_time; - 
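bch_fs_alloc() is built around one long chain of allocations and a single err: label, which works because bch_fs_free() is safe to call on a partially constructed object. A minimal sketch of that construction idiom, with invented demo_* types standing in for the real filesystem object:

#include <stdlib.h>

struct demo_fs {
        void *a, *b, *c;
};

static void demo_fs_free(struct demo_fs *fs)
{
        /* free(NULL) is a no-op, so any partially built object is fine */
        free(fs->c);
        free(fs->b);
        free(fs->a);
        free(fs);
}

static struct demo_fs *demo_fs_alloc(void)
{
        struct demo_fs *fs = calloc(1, sizeof(*fs));

        if (!fs)
                return NULL;

        if (!(fs->a = malloc(64)) ||
            !(fs->b = malloc(128)) ||
            !(fs->c = malloc(256)))
                goto err;

        return fs;
err:
        demo_fs_free(fs);
        return NULL;
}

int main(void)
{
        struct demo_fs *fs = demo_fs_alloc();

        if (!fs)
                return 1;
        demo_fs_free(fs);
        return 0;
}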
c->journal.delay_time = &c->journal_delay_time; - c->journal.blocked_time = &c->journal_blocked_time; - c->journal.flush_seq_time = &c->journal_flush_seq_time; - - mutex_init(&c->uevent_lock); - - mutex_lock(&c->sb_lock); - - if (bch_sb_to_fs(c, sb)) { - mutex_unlock(&c->sb_lock); - goto err; - } - - mutex_unlock(&c->sb_lock); - - scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); - - bch_opts_apply(&c->opts, bch_sb_opts(sb)); - bch_opts_apply(&c->opts, opts); - - c->opts.nochanges |= c->opts.noreplay; - c->opts.read_only |= c->opts.nochanges; - - c->block_bits = ilog2(c->sb.block_size); - - if (bch_fs_init_fault("fs_alloc")) - goto err; - - iter_size = (btree_blocks(c) + 1) * 2 * - sizeof(struct btree_node_iter_set); - - journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb); - - if (!(c->wq = alloc_workqueue("bcache", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || - !(c->copygc_wq = alloc_workqueue("bcache_copygc", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || - percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) || - mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, - sizeof(struct btree_reserve)) || - mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_interior_update)) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_read_bio, 1, 0) || - bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) || - bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) || - bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) || - mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->sb.btree_node_size, - BCH_ENCODED_EXTENT_MAX) / - PAGE_SECTORS, 0) || - !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || - lg_lock_init(&c->usage_lock) || - mempool_init_page_pool(&c->btree_bounce_pool, 1, - ilog2(btree_pages(c))) || - bdi_setup_and_register(&c->bdi, "bcache") || - bch_fs_blockdev_init(c) || - bch_io_clock_init(&c->io_clock[READ]) || - bch_io_clock_init(&c->io_clock[WRITE]) || - bch_fs_journal_init(&c->journal, journal_entry_bytes) || - bch_fs_btree_init(c) || - bch_fs_encryption_init(c) || - bch_fs_compress_init(c) || - bch_check_set_has_compressed_data(c, c->opts.compression)) - goto err; - - c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - c->bdi.congested_fn = bch_congested_fn; - c->bdi.congested_data = c; - - mi = bch_sb_get_members(c->disk_sb); - for (i = 0; i < c->sb.nr_devices; i++) - if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) && - bch_dev_alloc(c, i)) - goto err; - - /* - * Now that all allocations have succeeded, init various refcounty - * things that let us shutdown: - */ - closure_init(&c->cl, NULL); - - c->kobj.kset = bcache_kset; - kobject_init(&c->kobj, &bch_fs_ktype); - kobject_init(&c->internal, &bch_fs_internal_ktype); - kobject_init(&c->opts_dir, &bch_fs_opts_dir_ktype); - kobject_init(&c->time_stats, &bch_fs_time_stats_ktype); - - bch_cache_accounting_init(&c->accounting, &c->cl); - - closure_init(&c->caching, &c->cl); - set_closure_fn(&c->caching, __bch_fs_stop1, system_wq); - - closure_get(&c->cl); - continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq); - return c; -err: - bch_fs_free(c); - return NULL; -} - -static const char *__bch_fs_online(struct bch_fs *c) -{ - struct bch_dev *ca; - const char *err = NULL; - unsigned i; - int ret; - - lockdep_assert_held(&bch_fs_list_lock); - - if (!list_empty(&c->list)) - return NULL; - - if (__bch_uuid_to_fs(c->sb.uuid)) - return "filesystem UUID already 
open"; - - ret = bch_fs_chardev_init(c); - if (ret) - return "error creating character device"; - - bch_fs_debug_init(c); - - if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || - kobject_add(&c->internal, &c->kobj, "internal") || - kobject_add(&c->opts_dir, &c->kobj, "options") || - kobject_add(&c->time_stats, &c->kobj, "time_stats") || - bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) - return "error creating sysfs objects"; - - mutex_lock(&c->state_lock); - - err = "error creating sysfs objects"; - __for_each_member_device(ca, c, i) - if (bch_dev_sysfs_online(ca)) - goto err; - - err = "can't bring up blockdev volumes"; - if (bch_blockdev_volumes_start(c)) - goto err; - - bch_attach_backing_devs(c); - - list_add(&c->list, &bch_fs_list); - err = NULL; -err: - mutex_unlock(&c->state_lock); - return err; -} - -static const char *bch_fs_online(struct bch_fs *c) -{ - const char *err; - - mutex_lock(&bch_fs_list_lock); - err = __bch_fs_online(c); - mutex_unlock(&bch_fs_list_lock); - - return err; -} - -static const char *__bch_fs_start(struct bch_fs *c) -{ - const char *err = "cannot allocate memory"; - struct bch_sb_field_members *mi; - struct bch_dev *ca; - unsigned i, id; - time64_t now; - LIST_HEAD(journal); - struct jset *j; - int ret = -EINVAL; - - BUG_ON(c->state != BCH_FS_STARTING); - - mutex_lock(&c->sb_lock); - for_each_online_member(ca, c, i) - bch_sb_from_fs(c, ca); - mutex_unlock(&c->sb_lock); - - if (BCH_SB_INITIALIZED(c->disk_sb)) { - ret = bch_journal_read(c, &journal); - if (ret) - goto err; - - j = &list_entry(journal.prev, struct journal_replay, list)->j; - - c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); - c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); - - err = "error reading priorities"; - for_each_readable_member(ca, c, i) { - ret = bch_prio_read(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - for (id = 0; id < BTREE_ID_NR; id++) { - unsigned level; - struct bkey_i *k; - - err = "bad btree root"; - k = bch_journal_find_btree_root(c, j, id, &level); - if (!k && id == BTREE_ID_EXTENTS) - goto err; - if (!k) { - pr_debug("missing btree root: %d", id); - continue; - } - - err = "error reading btree root"; - if (bch_btree_root_read(c, id, k, level)) - goto err; - } - - bch_verbose(c, "starting mark and sweep:"); - - err = "error in recovery"; - if (bch_initial_gc(c, &journal)) - goto err; - - if (c->opts.noreplay) - goto recovery_done; - - bch_verbose(c, "mark and sweep done"); - - /* - * bch_journal_start() can't happen sooner, or btree_gc_finish() - * will give spurious errors about oldest_gen > bucket_gen - - * this is a hack but oh well. 
- */ - bch_journal_start(c); - - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - bch_verbose(c, "starting journal replay:"); - - err = "journal replay failed"; - ret = bch_journal_replay(c, &journal); - if (ret) - goto err; - - bch_verbose(c, "journal replay done"); - - if (c->opts.norecovery) - goto recovery_done; - - bch_verbose(c, "starting fsck:"); - err = "error in fsck"; - ret = bch_fsck(c, !c->opts.nofsck); - if (ret) - goto err; - - bch_verbose(c, "fsck done"); - } else { - struct bch_inode_unpacked inode; - struct bkey_inode_buf packed_inode; - struct closure cl; - - closure_init_stack(&cl); - - bch_notice(c, "initializing new filesystem"); - - bch_initial_gc(c, NULL); - - err = "unable to allocate journal buckets"; - for_each_rw_member(ca, c, i) - if (bch_dev_journal_alloc(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - bch_journal_start(c); - bch_journal_set_replay_done(&c->journal); - - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - err = "cannot allocate new btree root"; - for (id = 0; id < BTREE_ID_NR; id++) - if (bch_btree_root_alloc(c, id, &cl)) { - closure_sync(&cl); - goto err; - } - - /* Wait for new btree roots to be written: */ - closure_sync(&cl); - - bch_inode_init(c, &inode, 0, 0, - S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - inode.inum = BCACHE_ROOT_INO; - - bch_inode_pack(&packed_inode, &inode); - - err = "error creating root directory"; - if (bch_btree_insert(c, BTREE_ID_INODES, - &packed_inode.inode.k_i, - NULL, NULL, NULL, 0)) - goto err; - - err = "error writing first journal entry"; - if (bch_journal_meta(&c->journal)) - goto err; - } -recovery_done: - err = "dynamic fault"; - if (bch_fs_init_fault("fs_start")) - goto err; - - if (c->opts.read_only) { - bch_fs_read_only(c); - } else { - err = bch_fs_read_write(c); - if (err) - goto err; - } - - mutex_lock(&c->sb_lock); - mi = bch_sb_get_members(c->disk_sb); - now = ktime_get_seconds(); - - for_each_member_device(ca, c, i) - mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); - - SET_BCH_SB_INITIALIZED(c->disk_sb, true); - SET_BCH_SB_CLEAN(c->disk_sb, false); - c->disk_sb->version = BCACHE_SB_VERSION_CDEV; - - bch_write_super(c); - mutex_unlock(&c->sb_lock); - - err = NULL; -out: - bch_journal_entries_free(&journal); - return err; -err: - switch (ret) { - case BCH_FSCK_ERRORS_NOT_FIXED: - bch_err(c, "filesystem contains errors: please report this to the developers"); - pr_cont("mount with -o fix_errors to repair"); - err = "fsck error"; - break; - case BCH_FSCK_REPAIR_UNIMPLEMENTED: - bch_err(c, "filesystem contains errors: please report this to the developers"); - pr_cont("repair unimplemented: inform the developers so that it can be added"); - err = "fsck error"; - break; - case BCH_FSCK_REPAIR_IMPOSSIBLE: - bch_err(c, "filesystem contains errors, but repair impossible"); - err = "fsck error"; - break; - case BCH_FSCK_UNKNOWN_VERSION: - err = "unknown metadata version";; - break; - case -ENOMEM: - err = "cannot allocate memory"; - break; - case -EIO: - err = "IO error"; - break; - } - - BUG_ON(!err); - set_bit(BCH_FS_ERROR, &c->flags); - goto out; -} - -const char *bch_fs_start(struct bch_fs *c) -{ - return __bch_fs_start(c) ?: bch_fs_online(c); -} - 
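Error reporting in this file is by const char * message, with NULL meaning success, and bch_fs_start() above chains the start and online steps with the GNU ?: extension so the first failing step's message is what gets returned. A small self-contained illustration of the convention; the demo_* steps are stand-ins, and building it needs GCC or Clang for the ?: extension:

#include <stdio.h>

static const char *demo_step_start(void)  { return NULL; }
static const char *demo_step_online(void) { return NULL; }

static const char *demo_fs_start(void)
{
        /* `a ?: b` evaluates to a when a is non-NULL, otherwise to b */
        return demo_step_start() ?: demo_step_online();
}

int main(void)
{
        const char *err = demo_fs_start();

        if (err)
                fprintf(stderr, "start failed: %s\n", err);
        return err ? 1 : 0;
}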
-static const char *bch_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -{ - struct bch_sb_field_members *sb_mi; - - sb_mi = bch_sb_get_members(sb); - if (!sb_mi) - return "Invalid superblock: member info area missing"; - - if (le16_to_cpu(sb->block_size) != c->sb.block_size) - return "mismatched block size"; - - if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < - BCH_SB_BTREE_NODE_SIZE(c->disk_sb)) - return "new cache bucket size is too small"; - - return NULL; -} - -static const char *bch_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) -{ - struct bch_sb *newest = - le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; - struct bch_sb_field_members *mi = bch_sb_get_members(newest); - - if (uuid_le_cmp(fs->uuid, sb->uuid)) - return "device not a member of filesystem"; - - if (sb->dev_idx >= newest->nr_devices) - return "device has invalid dev_idx"; - - if (bch_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le))) - return "device has been removed"; - - if (fs->block_size != sb->block_size) - return "mismatched block size"; - - return NULL; -} - -/* Device startup/shutdown: */ - -void bch_dev_release(struct kobject *kobj) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - - kfree(ca); -} - -static void bch_dev_free(struct bch_dev *ca) -{ - unsigned i; - - cancel_work_sync(&ca->io_error_work); - - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcache"); - - if (ca->kobj.state_in_sysfs) - kobject_del(&ca->kobj); - - bch_free_super(&ca->disk_sb); - bch_dev_journal_exit(ca); - - free_percpu(ca->sectors_written); - bioset_exit(&ca->replica_set); - free_percpu(ca->usage_percpu); - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); - kfree(ca->prio_buckets); - kfree(ca->bio_prio); - vfree(ca->buckets); - vfree(ca->oldest_gens); - free_heap(&ca->heap); - free_fifo(&ca->free_inc); - - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&ca->free[i]); - - percpu_ref_exit(&ca->io_ref); - percpu_ref_exit(&ca->ref); - kobject_put(&ca->kobj); -} - -static void bch_dev_io_ref_release(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); - - complete(&ca->offline_complete); -} - -static void __bch_dev_offline(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - lockdep_assert_held(&c->state_lock); - - __bch_dev_read_only(ca->fs, ca); - - reinit_completion(&ca->offline_complete); - percpu_ref_kill(&ca->io_ref); - wait_for_completion(&ca->offline_complete); - - if (ca->kobj.state_in_sysfs) { - struct kobject *block = - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; - - sysfs_remove_link(block, "bcache"); - sysfs_remove_link(&ca->kobj, "block"); - } - - bch_free_super(&ca->disk_sb); - bch_dev_journal_exit(ca); -} - -static void bch_dev_ref_release(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, ref); - - complete(&ca->stop_complete); -} - -static void bch_dev_stop(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - lockdep_assert_held(&c->state_lock); - - BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca); - rcu_assign_pointer(c->devs[ca->dev_idx], NULL); - - synchronize_rcu(); - - reinit_completion(&ca->stop_complete); - percpu_ref_kill(&ca->ref); - wait_for_completion(&ca->stop_complete); -} - -static int bch_dev_sysfs_online(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - int ret; - - if (!c->kobj.state_in_sysfs) - return 0; - - if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, 
&ca->fs->kobj, - "dev-%u", ca->dev_idx); - if (ret) - return ret; - } - - if (ca->disk_sb.bdev) { - struct kobject *block = - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; - - ret = sysfs_create_link(block, &ca->kobj, "bcache"); - if (ret) - return ret; - ret = sysfs_create_link(&ca->kobj, block, "block"); - if (ret) - return ret; - } - - return 0; -} - -static int bch_dev_alloc(struct bch_fs *c, unsigned dev_idx) -{ - struct bch_member *member; - size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve; - size_t heap_size; - unsigned i; - struct bch_dev *ca; - - if (bch_fs_init_fault("dev_alloc")) - return -ENOMEM; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - return -ENOMEM; - - kobject_init(&ca->kobj, &bch_dev_ktype); - init_completion(&ca->stop_complete); - init_completion(&ca->offline_complete); - - spin_lock_init(&ca->self.lock); - ca->self.nr = 1; - rcu_assign_pointer(ca->self.d[0].dev, ca); - ca->dev_idx = dev_idx; - - spin_lock_init(&ca->freelist_lock); - spin_lock_init(&ca->prio_buckets_lock); - mutex_init(&ca->heap_lock); - bch_dev_moving_gc_init(ca); - - INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); - - if (bch_fs_init_fault("dev_alloc")) - goto err; - - member = bch_sb_get_members(c->disk_sb)->members + dev_idx; - - ca->mi = bch_mi_to_cpu(member); - ca->uuid = member->uuid; - ca->bucket_bits = ilog2(ca->mi.bucket_size); - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); - - /* XXX: tune these */ - movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); - reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); - /* - * free_inc must be smaller than the copygc reserve: if it was bigger, - * one copygc iteration might not make enough buckets available to fill - * up free_inc and allow the allocator to make forward progress - */ - free_inc_reserve = movinggc_reserve / 2; - heap_size = movinggc_reserve * 8; - - if (percpu_ref_init(&ca->ref, bch_dev_ref_release, - 0, GFP_KERNEL) || - percpu_ref_init(&ca->io_ref, bch_dev_io_ref_release, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_MOVINGGC], - movinggc_reserve, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) || - !init_heap(&ca->heap, heap_size, GFP_KERNEL) || - !(ca->oldest_gens = vzalloc(sizeof(u8) * - ca->mi.nbuckets)) || - !(ca->buckets = vzalloc(sizeof(struct bucket) * - ca->mi.nbuckets)) || - !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) * - 2, GFP_KERNEL)) || - !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || - !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || - !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) || - bioset_init(&ca->replica_set, 4, - offsetof(struct bch_write_bio, bio)) || - !(ca->sectors_written = alloc_percpu(*ca->sectors_written))) - goto err; - - ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); - - total_reserve = ca->free_inc.size; - for (i = 0; i < RESERVE_NR; i++) - total_reserve += ca->free[i].size; - - ca->copygc_write_point.group = &ca->self; - ca->tiering_write_point.group = &ca->self; - - ca->fs = c; - rcu_assign_pointer(c->devs[ca->dev_idx], ca); - - if (bch_dev_sysfs_online(ca)) - pr_warn("error creating sysfs objects"); - - return 0; -err: - bch_dev_free(ca); - return -ENOMEM; -} - -static int __bch_dev_online(struct bch_fs *c, struct 
bcache_superblock *sb) -{ - struct bch_dev *ca; - int ret; - - lockdep_assert_held(&c->sb_lock); - - if (le64_to_cpu(sb->sb->seq) > - le64_to_cpu(c->disk_sb->seq)) - bch_sb_to_fs(c, sb->sb); - - BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || - !c->devs[sb->sb->dev_idx]); - - ca = c->devs[sb->sb->dev_idx]; - if (ca->disk_sb.bdev) { - bch_err(c, "already have device online in slot %u", - sb->sb->dev_idx); - return -EINVAL; - } - - ret = bch_dev_journal_init(ca, sb->sb); - if (ret) - return ret; - - /* - * Increase journal write timeout if flushes to this device are - * expensive: - */ - if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) && - journal_flushes_device(ca)) - c->journal.write_delay_ms = - max(c->journal.write_delay_ms, 1000U); - - /* Commit: */ - ca->disk_sb = *sb; - if (sb->mode & FMODE_EXCL) - ca->disk_sb.bdev->bd_holder = ca; - memset(sb, 0, sizeof(*sb)); - - if (c->sb.nr_devices == 1) - bdevname(ca->disk_sb.bdev, c->name); - bdevname(ca->disk_sb.bdev, ca->name); - - if (bch_dev_sysfs_online(ca)) - pr_warn("error creating sysfs objects"); - - lg_local_lock(&c->usage_lock); - if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA))) - bch_mark_dev_metadata(ca->fs, ca); - lg_local_unlock(&c->usage_lock); - - percpu_ref_reinit(&ca->io_ref); - return 0; -} - -/* Device management: */ - -bool bch_fs_may_start(struct bch_fs *c, int flags) -{ - struct bch_sb_field_members *mi; - unsigned meta_missing = 0; - unsigned data_missing = 0; - bool degraded = false; - unsigned i; - - mutex_lock(&c->sb_lock); - mi = bch_sb_get_members(c->disk_sb); - - for (i = 0; i < c->disk_sb->nr_devices; i++) - if (!c->devs[i] && - !bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) { - degraded = true; - if (BCH_MEMBER_HAS_METADATA(&mi->members[i])) - meta_missing++; - if (BCH_MEMBER_HAS_DATA(&mi->members[i])) - data_missing++; - } - mutex_unlock(&c->sb_lock); - - if (degraded && - !(flags & BCH_FORCE_IF_DEGRADED)) - return false; - - if (meta_missing && - !(flags & BCH_FORCE_IF_METADATA_DEGRADED)) - return false; - - if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) && - !(flags & BCH_FORCE_IF_METADATA_LOST)) - return false; - - if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) - return false; - - if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) && - !(flags & BCH_FORCE_IF_DATA_LOST)) - return false; - - return true; -} - -bool bch_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - lockdep_assert_held(&c->state_lock); - - if (new_state == BCH_MEMBER_STATE_RW) - return true; - - if (ca->mi.has_data && - !(flags & BCH_FORCE_IF_DATA_DEGRADED)) - return false; - - if (ca->mi.has_data && - c->sb.data_replicas_have <= 1 && - !(flags & BCH_FORCE_IF_DATA_LOST)) - return false; - - if (ca->mi.has_metadata && - !(flags & BCH_FORCE_IF_METADATA_DEGRADED)) - return false; - - if (ca->mi.has_metadata && - c->sb.meta_replicas_have <= 1 && - !(flags & BCH_FORCE_IF_METADATA_LOST)) - return false; - - return true; -} - -static void __bch_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -{ - bch_moving_gc_stop(ca); - - /* - * This stops new data writes (e.g. to existing open data - * buckets) and then waits for all existing writes to - * complete. 
- */ - bch_dev_allocator_stop(ca); - - bch_dev_group_remove(&c->journal.devs, ca); -} - -static const char *__bch_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); - - trace_bcache_cache_read_write(ca); - - if (bch_dev_allocator_start(ca)) - return "error starting allocator thread"; - - if (bch_moving_gc_start(ca)) - return "error starting moving GC thread"; - - if (bch_tiering_start(c)) - return "error starting tiering thread"; - - bch_notify_dev_read_write(ca); - trace_bcache_cache_read_write_done(ca); - - return NULL; -} - -int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - struct bch_sb_field_members *mi; - - if (ca->mi.state == new_state) - return 0; - - if (!bch_dev_state_allowed(c, ca, new_state, flags)) - return -EINVAL; - - if (new_state == BCH_MEMBER_STATE_RW) { - if (__bch_dev_read_write(c, ca)) - return -ENOMEM; - } else { - __bch_dev_read_only(c, ca); - } - - bch_notice(ca, "%s", bch_dev_state[new_state]); - - mutex_lock(&c->sb_lock); - mi = bch_sb_get_members(c->disk_sb); - SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); - bch_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -int bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - int ret; - - mutex_lock(&c->state_lock); - ret = __bch_dev_set_state(c, ca, new_state, flags); - mutex_unlock(&c->state_lock); - - return ret; -} - -/* Device add/removal: */ - -int bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - struct bch_sb_field_members *mi; - unsigned dev_idx = ca->dev_idx; - int ret = -EINVAL; - - mutex_lock(&c->state_lock); - - percpu_ref_put(&ca->ref); /* XXX */ - - if (ca->mi.state == BCH_MEMBER_STATE_RW) { - bch_err(ca, "Cannot remove RW device"); - goto err; - } - - if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { - bch_err(ca, "Cannot remove without losing data"); - goto err; - } - - /* - * XXX: verify that dev_idx is really not in use anymore, anywhere - * - * flag_data_bad() does not check btree pointers - */ - ret = bch_flag_data_bad(ca); - if (ret) { - bch_err(ca, "Remove failed"); - goto err; - } - - if (ca->mi.has_data || ca->mi.has_metadata) { - bch_err(ca, "Remove failed, still has data"); - goto err; - } - - /* - * Ok, really doing the remove: - * Drop device's prio pointer before removing it from superblock: - */ - spin_lock(&c->journal.lock); - c->journal.prio_buckets[dev_idx] = 0; - spin_unlock(&c->journal.lock); - - bch_journal_meta(&c->journal); - - __bch_dev_offline(ca); - bch_dev_stop(ca); - bch_dev_free(ca); - - /* - * Free this device's slot in the bch_member array - all pointers to - * this device must be gone: - */ - mutex_lock(&c->sb_lock); - mi = bch_sb_get_members(c->disk_sb); - memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); - - bch_write_super(c); - - mutex_unlock(&c->sb_lock); - mutex_unlock(&c->state_lock); - return 0; -err: - mutex_unlock(&c->state_lock); - return ret; -} - -int bch_dev_add(struct bch_fs *c, const char *path) -{ - struct bcache_superblock sb; - const char *err; - struct bch_dev *ca = NULL; - struct bch_sb_field_members *mi, *dev_mi; - struct bch_member saved_mi; - unsigned dev_idx, nr_devices, u64s; - int ret = -EINVAL; - - err = bch_read_super(&sb, bch_opts_empty(), path); - if (err) - return -EINVAL; - - err = bch_validate_cache_super(&sb); - if (err) - return -EINVAL; - - 
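bch_dev_state_allowed() above only lets a device leave the RW state when the caller's force flags acknowledge the consequences: one flag for merely degrading redundancy, a stronger one when the device holds the last remaining replica. The sketch below compresses that policy, for the data case only, into a single function; the DEMO_* flags and the per-device replica count are simplifications invented for the example:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_FORCE_IF_DEGRADED  (1 << 0)
#define DEMO_FORCE_IF_LOST      (1 << 1)

struct demo_dev {
        bool     has_data;
        unsigned data_replicas;  /* replica count for this device's data */
};

static bool demo_may_go_offline(const struct demo_dev *dev, int flags)
{
        if (!dev->has_data)
                return true;

        if (!(flags & DEMO_FORCE_IF_DEGRADED))
                return false;           /* would degrade redundancy */

        if (dev->data_replicas <= 1 &&
            !(flags & DEMO_FORCE_IF_LOST))
                return false;           /* would lose the only copy */

        return true;
}

int main(void)
{
        struct demo_dev dev = { .has_data = true, .data_replicas = 1 };

        printf("offline allowed: %d\n",
               demo_may_go_offline(&dev, DEMO_FORCE_IF_DEGRADED));
        return 0;
}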
err = bch_dev_may_add(sb.sb, c); - if (err) - return -EINVAL; - - mutex_lock(&c->state_lock); - mutex_lock(&c->sb_lock); - - /* - * Preserve the old cache member information (esp. tier) - * before we start bashing the disk stuff. - */ - dev_mi = bch_sb_get_members(sb.sb); - saved_mi = dev_mi->members[sb.sb->dev_idx]; - saved_mi.last_mount = cpu_to_le64(ktime_get_seconds()); - - if (dynamic_fault("bcache:add:no_slot")) - goto no_slot; - - mi = bch_sb_get_members(c->disk_sb); - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) - if (dev_idx >= c->sb.nr_devices || - bch_is_zero(mi->members[dev_idx].uuid.b, - sizeof(uuid_le))) - goto have_slot; -no_slot: - err = "no slots available in superblock"; - ret = -ENOSPC; - goto err_unlock; - -have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - u64s = (sizeof(struct bch_sb_field_members) + - sizeof(struct bch_member) * nr_devices) / sizeof(u64); - err = "no space in superblock for member info"; - - mi = bch_fs_sb_resize_members(c, u64s); - if (!mi) - goto err_unlock; - - dev_mi = bch_sb_resize_members(&sb, u64s); - if (!dev_mi) - goto err_unlock; - - memcpy(dev_mi, mi, u64s * sizeof(u64)); - dev_mi->members[dev_idx] = saved_mi; - - sb.sb->uuid = c->disk_sb->uuid; - sb.sb->dev_idx = dev_idx; - sb.sb->nr_devices = nr_devices; - - /* commit new member info */ - memcpy(mi, dev_mi, u64s * sizeof(u64)); - c->disk_sb->nr_devices = nr_devices; - c->sb.nr_devices = nr_devices; - - if (bch_dev_alloc(c, dev_idx)) { - err = "cannot allocate memory"; - ret = -ENOMEM; - goto err_unlock; - } - - if (__bch_dev_online(c, &sb)) { - err = "bch_dev_online() error"; - ret = -ENOMEM; - goto err_unlock; - } - - bch_write_super(c); - mutex_unlock(&c->sb_lock); - - ca = c->devs[dev_idx]; - if (ca->mi.state == BCH_MEMBER_STATE_RW) { - err = "journal alloc failed"; - if (bch_dev_journal_alloc(ca)) - goto err; - - err = __bch_dev_read_write(c, ca); - if (err) - goto err; - } - - bch_notify_dev_added(ca); - mutex_unlock(&c->state_lock); - return 0; -err_unlock: - mutex_unlock(&c->sb_lock); -err: - mutex_unlock(&c->state_lock); - bch_free_super(&sb); - - bch_err(c, "Unable to add device: %s", err); - return ret ?: -EINVAL; -} - -int bch_dev_online(struct bch_fs *c, const char *path) -{ - struct bcache_superblock sb = { 0 }; - struct bch_dev *ca; - unsigned dev_idx; - const char *err; - - mutex_lock(&c->state_lock); - - err = bch_read_super(&sb, bch_opts_empty(), path); - if (err) - goto err; - - dev_idx = sb.sb->dev_idx; - - err = bch_dev_in_fs(c->disk_sb, sb.sb); - if (err) - goto err; - - mutex_lock(&c->sb_lock); - if (__bch_dev_online(c, &sb)) { - err = "__bch_dev_online() error"; - mutex_unlock(&c->sb_lock); - goto err; - } - mutex_unlock(&c->sb_lock); - - ca = c->devs[dev_idx]; - if (ca->mi.state == BCH_MEMBER_STATE_RW) { - err = __bch_dev_read_write(c, ca); - if (err) - goto err; - } - - mutex_unlock(&c->state_lock); - return 0; -err: - mutex_unlock(&c->state_lock); - bch_free_super(&sb); - bch_err(c, "error bringing %s online: %s", path, err); - return -EINVAL; -} - -int bch_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - mutex_lock(&c->state_lock); - - if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { - bch_err(ca, "Cannot offline required disk"); - mutex_unlock(&c->state_lock); - return -EINVAL; - } - - __bch_dev_read_only(c, ca); - __bch_dev_offline(ca); - - mutex_unlock(&c->state_lock); - return 0; -} - -int bch_dev_evacuate(struct bch_fs *c, struct bch_dev *ca) -{ - int ret; - - 
mutex_lock(&c->state_lock); - - if (ca->mi.state == BCH_MEMBER_STATE_RW) { - bch_err(ca, "Cannot migrate data off RW device"); - mutex_unlock(&c->state_lock); - return -EINVAL; - } - - mutex_unlock(&c->state_lock); - - ret = bch_move_data_off_device(ca); - if (ret) { - bch_err(ca, "Error migrating data: %i", ret); - return ret; - } - - ret = bch_move_metadata_off_device(ca); - if (ret) { - bch_err(ca, "Error migrating metadata: %i", ret); - return ret; - } - - if (ca->mi.has_data || ca->mi.has_metadata) { - bch_err(ca, "Migrate error: data still present"); - return -EINVAL; - } - - return 0; -} - -/* Filesystem open: */ - -const char *bch_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts, struct bch_fs **ret) -{ - const char *err; - struct bch_fs *c = NULL; - struct bcache_superblock *sb; - unsigned i, best_sb = 0; - - if (!nr_devices) - return "need at least one device"; - - if (!try_module_get(THIS_MODULE)) - return "module unloading"; - - err = "cannot allocate memory"; - sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); - if (!sb) - goto err; - - for (i = 0; i < nr_devices; i++) { - err = bch_read_super(&sb[i], opts, devices[i]); - if (err) - goto err; - - err = "attempting to register backing device"; - if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version))) - goto err; - - err = bch_validate_cache_super(&sb[i]); - if (err) - goto err; - } - - for (i = 1; i < nr_devices; i++) - if (le64_to_cpu(sb[i].sb->seq) > - le64_to_cpu(sb[best_sb].sb->seq)) - best_sb = i; - - for (i = 0; i < nr_devices; i++) { - err = bch_dev_in_fs(sb[best_sb].sb, sb[i].sb); - if (err) - goto err; - } - - err = "cannot allocate memory"; - c = bch_fs_alloc(sb[best_sb].sb, opts); - if (!c) - goto err; - - err = "bch_dev_online() error"; - mutex_lock(&c->sb_lock); - for (i = 0; i < nr_devices; i++) - if (__bch_dev_online(c, &sb[i])) { - mutex_unlock(&c->sb_lock); - goto err; - } - mutex_unlock(&c->sb_lock); - - err = "insufficient devices"; - if (!bch_fs_may_start(c, 0)) - goto err; - - if (!c->opts.nostart) { - err = __bch_fs_start(c); - if (err) - goto err; - } - - err = bch_fs_online(c); - if (err) - goto err; - - if (ret) - *ret = c; - else - closure_put(&c->cl); - - err = NULL; -out: - kfree(sb); - module_put(THIS_MODULE); - if (err) - c = NULL; - return err; -err: - if (c) - bch_fs_stop(c); - - for (i = 0; i < nr_devices; i++) - bch_free_super(&sb[i]); - goto out; -} - -static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, - struct bch_opts opts) -{ - const char *err; - struct bch_fs *c; - bool allocated_fs = false; - - err = bch_validate_cache_super(sb); - if (err) - return err; - - mutex_lock(&bch_fs_list_lock); - c = __bch_uuid_to_fs(sb->sb->uuid); - if (c) { - closure_get(&c->cl); - - err = bch_dev_in_fs(c->disk_sb, sb->sb); - if (err) - goto err; - } else { - c = bch_fs_alloc(sb->sb, opts); - err = "cannot allocate memory"; - if (!c) - goto err; - - allocated_fs = true; - } - - err = "bch_dev_online() error"; - - mutex_lock(&c->sb_lock); - if (__bch_dev_online(c, sb)) { - mutex_unlock(&c->sb_lock); - goto err; - } - mutex_unlock(&c->sb_lock); - - if (!c->opts.nostart && bch_fs_may_start(c, 0)) { - err = __bch_fs_start(c); - if (err) - goto err; - } - - err = __bch_fs_online(c); - if (err) - goto err; - - closure_put(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return NULL; -err: - mutex_unlock(&bch_fs_list_lock); - - if (allocated_fs) - bch_fs_stop(c); - else if (c) - closure_put(&c->cl); - - return err; -} - -const char *bch_fs_open_incremental(const char 
*path) -{ - struct bcache_superblock sb; - struct bch_opts opts = bch_opts_empty(); - const char *err; - - err = bch_read_super(&sb, opts, path); - if (err) - return err; - - if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) { - mutex_lock(&bch_fs_list_lock); - err = bch_backing_dev_register(&sb); - mutex_unlock(&bch_fs_list_lock); - } else { - err = __bch_fs_open_incremental(&sb, opts); - } - - bch_free_super(&sb); - - return err; -} - -/* Global interfaces/init */ - -#define kobj_attribute_write(n, fn) \ - static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) - -#define kobj_attribute_rw(n, show, store) \ - static struct kobj_attribute ksysfs_##n = \ - __ATTR(n, S_IWUSR|S_IRUSR, show, store) - -static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, - const char *, size_t); - -kobj_attribute_write(register, register_bcache); -kobj_attribute_write(register_quiet, register_bcache); - -static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - const char *buffer, size_t size) -{ - ssize_t ret = -EINVAL; - const char *err = "cannot allocate memory"; - char *path = NULL; - - if (!try_module_get(THIS_MODULE)) - return -EBUSY; - - if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL))) - goto err; - - err = bch_fs_open_incremental(strim(path)); - if (err) - goto err; - - ret = size; -out: - kfree(path); - module_put(THIS_MODULE); - return ret; -err: - pr_err("error opening %s: %s", path, err); - goto out; -} - -static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) -{ - if (code == SYS_DOWN || - code == SYS_HALT || - code == SYS_POWER_OFF) { - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - - if (!list_empty(&bch_fs_list)) - pr_info("Setting all devices read only:"); - - list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only_async(c); - - list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only(c); - - mutex_unlock(&bch_fs_list_lock); - } - - return NOTIFY_DONE; -} - -static struct notifier_block reboot = { - .notifier_call = bcache_reboot, - .priority = INT_MAX, /* before any real devices */ -}; - -static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr, - const char *buffer, size_t size) -{ - bcache_reboot(NULL, SYS_DOWN, NULL); - return size; -} - -kobj_attribute_write(reboot, reboot_test); - -static void bcache_exit(void) -{ - bch_debug_exit(); - bch_vfs_exit(); - bch_blockdev_exit(); - bch_chardev_exit(); - if (bcache_kset) - kset_unregister(bcache_kset); - if (bcache_io_wq) - destroy_workqueue(bcache_io_wq); - if (!IS_ERR_OR_NULL(bch_sha256)) - crypto_free_shash(bch_sha256); - unregister_reboot_notifier(&reboot); -} - -static int __init bcache_init(void) -{ - static const struct attribute *files[] = { - &ksysfs_register.attr, - &ksysfs_register_quiet.attr, - &ksysfs_reboot.attr, - NULL - }; - - register_reboot_notifier(&reboot); - closure_debug_init(); - bkey_pack_test(); - - bch_sha256 = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(bch_sha256)) - goto err; - - if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) || - !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) || - sysfs_create_files(&bcache_kset->kobj, files) || - bch_chardev_init() || - bch_blockdev_init() || - bch_vfs_init() || - bch_debug_init()) - goto err; - - return 0; -err: - bcache_exit(); - return -ENOMEM; -} - -#define BCH_DEBUG_PARAM(name, description) \ - bool bch_##name; \ - module_param_named(name, bch_##name, bool, 0644); \ - MODULE_PARM_DESC(name, description); 
-BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -module_exit(bcache_exit); -module_init(bcache_init); diff --git a/libbcache/super.h b/libbcache/super.h deleted file mode 100644 index 66c34308..00000000 --- a/libbcache/super.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef _BCACHE_SUPER_H -#define _BCACHE_SUPER_H - -#include "extents.h" - -#include <linux/bcache-ioctl.h> - -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -{ - return s >> ca->bucket_bits; -} - -static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -{ - return ((sector_t) b) << ca->bucket_bits; -} - -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - return s & (ca->mi.bucket_size - 1); -} - -static inline struct bch_dev *__bch_next_dev(struct bch_fs *c, unsigned *iter) -{ - struct bch_dev *ca = NULL; - - while (*iter < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], - lockdep_is_held(&c->state_lock)))) - (*iter)++; - - return ca; -} - -#define __for_each_member_device(ca, c, iter) \ - for ((iter) = 0; ((ca) = __bch_next_dev((c), &(iter))); (iter)++) - -#define for_each_member_device_rcu(ca, c, iter) \ - __for_each_member_device(ca, c, iter) - -static inline struct bch_dev *bch_get_next_dev(struct bch_fs *c, unsigned *iter) -{ - struct bch_dev *ca; - - rcu_read_lock(); - if ((ca = __bch_next_dev(c, iter))) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - - return ca; -} - -/* - * If you break early, you must drop your ref on the current device - */ -#define for_each_member_device(ca, c, iter) \ - for ((iter) = 0; \ - (ca = bch_get_next_dev(c, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) - -static inline struct bch_dev *bch_get_next_online_dev(struct bch_fs *c, - unsigned *iter, - int state_mask) -{ - struct bch_dev *ca; - - rcu_read_lock(); - while ((ca = __bch_next_dev(c, iter)) && - (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref))) - (*iter)++; - rcu_read_unlock(); - - return ca; -} - -#define __for_each_online_member(ca, c, iter, state_mask) \ - for ((iter) = 0; \ - (ca = bch_get_next_online_dev(c, &(iter), state_mask)); \ - percpu_ref_put(&ca->io_ref), (iter)++) - -#define for_each_online_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, ~0) - -#define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) - -#define for_each_readable_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) - -struct bch_fs *bch_bdev_to_fs(struct block_device *); -struct bch_fs *bch_uuid_to_fs(uuid_le); -int bch_congested(struct bch_fs *, int); - -void bch_dev_release(struct kobject *); - -bool bch_dev_state_allowed(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); -int __bch_dev_set_state(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); -int bch_dev_set_state(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); - -int bch_dev_fail(struct bch_dev *, int); -int bch_dev_remove(struct bch_fs *, struct bch_dev *, int); -int bch_dev_add(struct bch_fs *, const char *); -int bch_dev_online(struct bch_fs *, const char *); -int bch_dev_offline(struct bch_fs *, struct bch_dev *, int); -int bch_dev_evacuate(struct bch_fs *, struct bch_dev *); - -void bch_fs_detach(struct bch_fs *); - -bool bch_fs_emergency_read_only(struct bch_fs *); -void bch_fs_read_only(struct bch_fs *); -const char *bch_fs_read_write(struct bch_fs *); - -void bch_fs_release(struct kobject 
*); -void bch_fs_stop_async(struct bch_fs *); -void bch_fs_stop(struct bch_fs *); - -const char *bch_fs_start(struct bch_fs *); -const char *bch_fs_open(char * const *, unsigned, struct bch_opts, - struct bch_fs **); -const char *bch_fs_open_incremental(const char *path); - -extern struct workqueue_struct *bcache_io_wq; -extern struct crypto_shash *bch_sha256; - -extern struct kobj_type bch_fs_ktype; -extern struct kobj_type bch_fs_internal_ktype; -extern struct kobj_type bch_fs_time_stats_ktype; -extern struct kobj_type bch_fs_opts_dir_ktype; -extern struct kobj_type bch_dev_ktype; - -#endif /* _BCACHE_SUPER_H */ diff --git a/libbcache/super_types.h b/libbcache/super_types.h deleted file mode 100644 index 69c747de..00000000 --- a/libbcache/super_types.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _BCACHE_SUPER_TYPES_H -#define _BCACHE_SUPER_TYPES_H - -struct bcache_superblock { - struct bch_sb *sb; - struct block_device *bdev; - struct bio *bio; - unsigned page_order; - fmode_t mode; -}; - -#endif /* _BCACHE_SUPER_TYPES_H */ diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c deleted file mode 100644 index 3536ec0c..00000000 --- a/libbcache/sysfs.c +++ /dev/null @@ -1,1336 +0,0 @@ -/* - * bcache sysfs interfaces - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include "bcache.h" -#include "alloc.h" -#include "blockdev.h" -#include "compress.h" -#include "sysfs.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_gc.h" -#include "buckets.h" -#include "inode.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "opts.h" -#include "request.h" -#include "super-io.h" -#include "tier.h" -#include "writeback.h" - -#include <linux/blkdev.h> -#include <linux/sort.h> - -write_attribute(attach); -write_attribute(detach); -write_attribute(unregister); -write_attribute(stop); -write_attribute(clear_stats); -write_attribute(trigger_btree_coalesce); -write_attribute(trigger_gc); -write_attribute(prune_cache); -write_attribute(blockdev_volume_create); - -read_attribute(uuid); -read_attribute(minor); -read_attribute(bucket_size); -read_attribute(bucket_size_bytes); -read_attribute(block_size); -read_attribute(block_size_bytes); -read_attribute(btree_node_size); -read_attribute(btree_node_size_bytes); -read_attribute(first_bucket); -read_attribute(nbuckets); -read_attribute(tree_depth); -read_attribute(root_usage_percent); -read_attribute(read_priority_stats); -read_attribute(write_priority_stats); -read_attribute(fragmentation_stats); -read_attribute(oldest_gen_stats); -read_attribute(reserve_stats); -read_attribute(btree_cache_size); -read_attribute(cache_available_percent); -read_attribute(compression_stats); -read_attribute(written); -read_attribute(btree_written); -read_attribute(metadata_written); -read_attribute(journal_debug); -write_attribute(journal_flush); -read_attribute(internal_uuid); - -read_attribute(btree_gc_running); - -read_attribute(btree_nodes); -read_attribute(btree_used_percent); -read_attribute(average_key_size); -read_attribute(available_buckets); -read_attribute(free_buckets); -read_attribute(dirty_data); -read_attribute(dirty_bytes); -read_attribute(dirty_buckets); -read_attribute(cached_data); -read_attribute(cached_bytes); -read_attribute(cached_buckets); -read_attribute(meta_buckets); -read_attribute(alloc_buckets); -read_attribute(has_data); -read_attribute(has_metadata); -read_attribute(bset_tree_stats); -read_attribute(alloc_debug); - 
-read_attribute(state); -read_attribute(cache_read_races); -read_attribute(writeback_keys_done); -read_attribute(writeback_keys_failed); -read_attribute(io_errors); -rw_attribute(io_error_limit); -rw_attribute(io_error_halflife); -read_attribute(congested); -rw_attribute(congested_read_threshold_us); -rw_attribute(congested_write_threshold_us); - -rw_attribute(sequential_cutoff); -rw_attribute(cache_mode); -rw_attribute(writeback_metadata); -rw_attribute(writeback_running); -rw_attribute(writeback_percent); -sysfs_pd_controller_attribute(writeback); - -read_attribute(stripe_size); -read_attribute(partial_stripes_expensive); - -rw_attribute(journal_write_delay_ms); -rw_attribute(journal_reclaim_delay_ms); -read_attribute(journal_entry_size_max); - -rw_attribute(discard); -rw_attribute(running); -rw_attribute(label); -rw_attribute(readahead); -rw_attribute(verify); -rw_attribute(bypass_torture_test); -rw_attribute(cache_replacement_policy); - -rw_attribute(foreground_write_ratelimit_enabled); -rw_attribute(copy_gc_enabled); -sysfs_pd_controller_attribute(copy_gc); - -rw_attribute(tier); -rw_attribute(tiering_enabled); -rw_attribute(tiering_percent); -sysfs_pd_controller_attribute(tiering); - -sysfs_pd_controller_attribute(foreground_write); - -rw_attribute(pd_controllers_update_seconds); - -rw_attribute(foreground_target_percent); - -rw_attribute(size); -read_attribute(meta_replicas_have); -read_attribute(data_replicas_have); - -#define BCH_DEBUG_PARAM(name, description) \ - rw_attribute(name); - - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#define BCH_OPT(_name, _mode, ...) \ - static struct attribute sysfs_opt_##_name = { \ - .name = #_name, .mode = _mode, \ - }; - - BCH_VISIBLE_OPTS() -#undef BCH_OPT - -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_time_stats_attribute(name, frequency_units, duration_units); - BCH_TIME_STATS() -#undef BCH_TIME_STAT - -static struct attribute sysfs_state_rw = { - .name = "state", - .mode = S_IRUGO -}; - -SHOW(bch_cached_dev) -{ - struct cached_dev *dc = container_of(kobj, struct cached_dev, - disk.kobj); - const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; - -#define var(stat) (dc->stat) - - if (attr == &sysfs_cache_mode) - return bch_snprint_string_list(buf, PAGE_SIZE, - bch_cache_modes + 1, - BDEV_CACHE_MODE(dc->disk_sb.sb)); - - var_printf(verify, "%i"); - var_printf(bypass_torture_test, "%i"); - var_printf(writeback_metadata, "%i"); - var_printf(writeback_running, "%i"); - var_print(writeback_percent); - sysfs_pd_controller_show(writeback, &dc->writeback_pd); - - sysfs_hprint(dirty_data, - bcache_dev_sectors_dirty(&dc->disk) << 9); - sysfs_print(dirty_bytes, - bcache_dev_sectors_dirty(&dc->disk) << 9); - - sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); - var_printf(partial_stripes_expensive, "%u"); - - var_hprint(sequential_cutoff); - var_hprint(readahead); - - sysfs_print(running, atomic_read(&dc->running)); - sysfs_print(state, states[BDEV_STATE(dc->disk_sb.sb)]); - - if (attr == &sysfs_label) { - memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); - buf[BCH_SB_LABEL_SIZE + 1] = '\0'; - strcat(buf, "\n"); - return strlen(buf); - } - -#undef var - return 0; -} - -STORE(bch_cached_dev) -{ - struct cached_dev *dc = container_of(kobj, struct cached_dev, - disk.kobj); - struct kobj_uevent_env *env; - -#define d_strtoul(var) sysfs_strtoul(var, dc->var) -#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) -#define d_strtoi_h(var) sysfs_hatoi(var, dc->var) - - 
d_strtoul(verify); - d_strtoul(bypass_torture_test); - d_strtoul(writeback_metadata); - d_strtoul(writeback_running); - sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - sysfs_pd_controller_store(writeback, &dc->writeback_pd); - - d_strtoi_h(sequential_cutoff); - d_strtoi_h(readahead); - - if (attr == &sysfs_writeback_running) - bch_writeback_queue(dc); - - if (attr == &sysfs_writeback_percent) - schedule_delayed_work(&dc->writeback_pd_update, - dc->writeback_pd_update_seconds * HZ); - - if (attr == &sysfs_clear_stats) - bch_cache_accounting_clear(&dc->accounting); - - if (attr == &sysfs_running && - strtoul_or_return(buf)) - bch_cached_dev_run(dc); - - if (attr == &sysfs_cache_mode) { - ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); - - if (v < 0) - return v; - - if ((unsigned) v != BDEV_CACHE_MODE(dc->disk_sb.sb)) { - SET_BDEV_CACHE_MODE(dc->disk_sb.sb, v); - bch_write_bdev_super(dc, NULL); - } - } - - if (attr == &sysfs_label) { - u64 journal_seq = 0; - int ret = 0; - - if (size > BCH_SB_LABEL_SIZE) - return -EINVAL; - - mutex_lock(&dc->disk.inode_lock); - - memcpy(dc->disk_sb.sb->label, buf, size); - if (size < BCH_SB_LABEL_SIZE) - dc->disk_sb.sb->label[size] = '\0'; - if (size && dc->disk_sb.sb->label[size - 1] == '\n') - dc->disk_sb.sb->label[size - 1] = '\0'; - - memcpy(dc->disk.inode.v.i_label, - dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); - - bch_write_bdev_super(dc, NULL); - - if (dc->disk.c) - ret = bch_btree_update(dc->disk.c, BTREE_ID_INODES, - &dc->disk.inode.k_i, - &journal_seq); - - mutex_unlock(&dc->disk.inode_lock); - - if (ret) - return ret; - - if (dc->disk.c) - ret = bch_journal_flush_seq(&dc->disk.c->journal, - journal_seq); - if (ret) - return ret; - - env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - add_uevent_var(env, "DRIVER=bcache"); - add_uevent_var(env, "CACHED_UUID=%pU", dc->disk_sb.sb->disk_uuid.b), - add_uevent_var(env, "CACHED_LABEL=%s", buf); - kobject_uevent_env( - &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); - kfree(env); - } - - if (attr == &sysfs_attach) { - struct bch_fs *c; - uuid_le uuid; - int ret; - - if (uuid_parse(buf, &uuid)) - return -EINVAL; - - c = bch_uuid_to_fs(uuid); - if (!c) { - pr_err("Can't attach %s: cache set not found", buf); - return -ENOENT; - } - - dc->disk_sb.sb->set_uuid = uuid; - - ret = bch_cached_dev_attach(dc, c); - closure_put(&c->cl); - if (ret) - return ret; - } - - if (attr == &sysfs_detach && dc->disk.c) - bch_cached_dev_detach(dc); - - if (attr == &sysfs_stop) - bch_blockdev_stop(&dc->disk); - - return size; -} - -static struct attribute *bch_cached_dev_files[] = { - &sysfs_attach, - &sysfs_detach, - &sysfs_stop, - &sysfs_cache_mode, - &sysfs_writeback_metadata, - &sysfs_writeback_running, - &sysfs_writeback_percent, - sysfs_pd_controller_files(writeback), - &sysfs_dirty_data, - &sysfs_dirty_bytes, - &sysfs_stripe_size, - &sysfs_partial_stripes_expensive, - &sysfs_sequential_cutoff, - &sysfs_clear_stats, - &sysfs_running, - &sysfs_state, - &sysfs_label, - &sysfs_readahead, -#ifdef CONFIG_BCACHE_DEBUG - &sysfs_verify, - &sysfs_bypass_torture_test, -#endif - NULL -}; -KTYPE(bch_cached_dev); - -SHOW(bch_blockdev_volume) -{ - struct bcache_device *d = container_of(kobj, struct bcache_device, - kobj); - - sysfs_hprint(size, le64_to_cpu(d->inode.v.i_size)); - - if (attr == &sysfs_label) { - memcpy(buf, d->inode.v.i_label, BCH_SB_LABEL_SIZE); - buf[BCH_SB_LABEL_SIZE + 1] = '\0'; - strcat(buf, "\n"); - return strlen(buf); - } - - 
return 0; -} - -STORE(bch_blockdev_volume) -{ - struct bcache_device *d = container_of(kobj, struct bcache_device, - kobj); - - if (attr == &sysfs_size) { - u64 journal_seq = 0; - u64 v = strtoi_h_or_return(buf); - int ret; - - mutex_lock(&d->inode_lock); - - if (v < le64_to_cpu(d->inode.v.i_size) ){ - ret = bch_inode_truncate(d->c, d->inode.k.p.inode, - v >> 9, NULL, NULL); - if (ret) { - mutex_unlock(&d->inode_lock); - return ret; - } - } - d->inode.v.i_size = cpu_to_le64(v); - ret = bch_btree_update(d->c, BTREE_ID_INODES, - &d->inode.k_i, &journal_seq); - - mutex_unlock(&d->inode_lock); - - if (ret) - return ret; - - ret = bch_journal_flush_seq(&d->c->journal, journal_seq); - if (ret) - return ret; - - set_capacity(d->disk, v >> 9); - } - - if (attr == &sysfs_label) { - u64 journal_seq = 0; - int ret; - - mutex_lock(&d->inode_lock); - - memcpy(d->inode.v.i_label, buf, BCH_SB_LABEL_SIZE); - ret = bch_btree_update(d->c, BTREE_ID_INODES, - &d->inode.k_i, &journal_seq); - - mutex_unlock(&d->inode_lock); - - return ret ?: bch_journal_flush_seq(&d->c->journal, journal_seq); - } - - if (attr == &sysfs_unregister) { - set_bit(BCACHE_DEV_DETACHING, &d->flags); - bch_blockdev_stop(d); - } - - return size; -} - -static struct attribute *bch_blockdev_volume_files[] = { - &sysfs_unregister, - &sysfs_label, - &sysfs_size, - NULL -}; -KTYPE(bch_blockdev_volume); - -static int bch_bset_print_stats(struct bch_fs *c, char *buf) -{ - struct bset_stats stats; - size_t nodes = 0; - struct btree *b; - struct bucket_table *tbl; - struct rhash_head *pos; - unsigned iter; - - memset(&stats, 0, sizeof(stats)); - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, iter, pos) { - bch_btree_keys_stats(b, &stats); - nodes++; - } - rcu_read_unlock(); - - return snprintf(buf, PAGE_SIZE, - "btree nodes: %zu\n" - "written sets: %zu\n" - "written key bytes: %zu\n" - "unwritten sets: %zu\n" - "unwritten key bytes: %zu\n" - "no table sets: %zu\n" - "no table key bytes: %zu\n" - "floats: %zu\n" - "failed unpacked: %zu\n" - "failed prev: %zu\n" - "failed overflow: %zu\n", - nodes, - stats.sets[BSET_RO_AUX_TREE].nr, - stats.sets[BSET_RO_AUX_TREE].bytes, - stats.sets[BSET_RW_AUX_TREE].nr, - stats.sets[BSET_RW_AUX_TREE].bytes, - stats.sets[BSET_NO_AUX_TREE].nr, - stats.sets[BSET_NO_AUX_TREE].bytes, - stats.floats, - stats.failed_unpacked, - stats.failed_prev, - stats.failed_overflow); -} - -static unsigned bch_root_usage(struct bch_fs *c) -{ - unsigned bytes = 0; - struct bkey_packed *k; - struct btree *b; - struct btree_node_iter iter; - - goto lock_root; - - do { - six_unlock_read(&b->lock); -lock_root: - b = c->btree_roots[BTREE_ID_EXTENTS].b; - six_lock_read(&b->lock); - } while (b != c->btree_roots[BTREE_ID_EXTENTS].b); - - for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b)) - bytes += bkey_bytes(k); - - six_unlock_read(&b->lock); - - return (bytes * 100) / btree_bytes(c); -} - -static size_t bch_btree_cache_size(struct bch_fs *c) -{ - size_t ret = 0; - struct btree *b; - - mutex_lock(&c->btree_cache_lock); - list_for_each_entry(b, &c->btree_cache, list) - ret += btree_bytes(c); - - mutex_unlock(&c->btree_cache_lock); - return ret; -} - -static unsigned bch_fs_available_percent(struct bch_fs *c) -{ - return div64_u64((u64) sectors_available(c) * 100, - c->capacity ?: 1); -} - -#if 0 -static unsigned bch_btree_used(struct bch_fs *c) -{ - return div64_u64(c->gc_stats.key_bytes * 100, - (c->gc_stats.nodes ?: 1) * btree_bytes(c)); -} - -static unsigned bch_average_key_size(struct bch_fs *c) -{ - return 
c->gc_stats.nkeys - ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) - : 0; -} -#endif - -static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) -{ - struct bch_fs_usage stats = bch_fs_usage_read(c); - - return scnprintf(buf, PAGE_SIZE, - "capacity:\t\t%llu\n" - "compressed:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\tcached:\t\t%llu\n" - "uncompressed:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\tcached:\t\t%llu\n" - "persistent reserved sectors:\t%llu\n" - "online reserved sectors:\t%llu\n", - c->capacity, - stats.s[S_COMPRESSED][S_META], - stats.s[S_COMPRESSED][S_DIRTY], - stats.s[S_COMPRESSED][S_CACHED], - stats.s[S_UNCOMPRESSED][S_META], - stats.s[S_UNCOMPRESSED][S_DIRTY], - stats.s[S_UNCOMPRESSED][S_CACHED], - stats.persistent_reserved, - stats.online_reserved); -} - -static ssize_t bch_compression_stats(struct bch_fs *c, char *buf) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, - nr_compressed_extents = 0, - compressed_sectors_compressed = 0, - compressed_sectors_uncompressed = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) - if (k.k->type == BCH_EXTENT) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - - extent_for_each_ptr_crc(e, ptr, crc) { - if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) { - nr_uncompressed_extents++; - uncompressed_sectors += e.k->size; - } else { - nr_compressed_extents++; - compressed_sectors_compressed += - crc_compressed_size(e.k, crc); - compressed_sectors_uncompressed += - crc_uncompressed_size(e.k, crc); - } - - /* only looking at the first ptr */ - break; - } - } - bch_btree_iter_unlock(&iter); - - return snprintf(buf, PAGE_SIZE, - "uncompressed data:\n" - " nr extents: %llu\n" - " size (bytes): %llu\n" - "compressed data:\n" - " nr extents: %llu\n" - " compressed size (bytes): %llu\n" - " uncompressed size (bytes): %llu\n", - nr_uncompressed_extents, - uncompressed_sectors << 9, - nr_compressed_extents, - compressed_sectors_compressed << 9, - compressed_sectors_uncompressed << 9); -} - -SHOW(bch_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - sysfs_print(minor, c->minor); - - sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); - sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - sysfs_hprint(journal_entry_size_max, c->journal.entry_size_max); - - sysfs_hprint(block_size, block_bytes(c)); - sysfs_print(block_size_bytes, block_bytes(c)); - sysfs_hprint(btree_node_size, c->sb.btree_node_size << 9); - sysfs_print(btree_node_size_bytes, c->sb.btree_node_size << 9); - - sysfs_hprint(btree_cache_size, bch_btree_cache_size(c)); - sysfs_print(cache_available_percent, bch_fs_available_percent(c)); - - sysfs_print(btree_gc_running, c->gc_pos.phase != GC_PHASE_DONE); - -#if 0 - /* XXX: reimplement */ - sysfs_print(btree_used_percent, bch_btree_used(c)); - sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(average_key_size, bch_average_key_size(c)); -#endif - - sysfs_print(cache_read_races, - atomic_long_read(&c->cache_read_races)); - - sysfs_print(writeback_keys_done, - atomic_long_read(&c->writeback_keys_done)); - sysfs_print(writeback_keys_failed, - atomic_long_read(&c->writeback_keys_failed)); - - /* See count_io_errors for why 88 */ - sysfs_print(io_error_halflife, c->error_decay * 88); - sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); - - sysfs_hprint(congested, - ((uint64_t) 
bch_get_congested(c)) << 9); - sysfs_print(congested_read_threshold_us, - c->congested_read_threshold_us); - sysfs_print(congested_write_threshold_us, - c->congested_write_threshold_us); - - sysfs_printf(foreground_write_ratelimit_enabled, "%i", - c->foreground_write_ratelimit_enabled); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd); - - sysfs_print(pd_controllers_update_seconds, - c->pd_controllers_update_seconds); - sysfs_print(foreground_target_percent, c->foreground_target_percent); - - sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); - sysfs_print(tiering_percent, c->tiering_percent); - - sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */ - - sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have); - sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have); - - /* Debugging: */ - - if (attr == &sysfs_journal_debug) - return bch_journal_print_debug(&c->journal, buf); - -#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - - if (!bch_fs_running(c)) - return -EPERM; - - if (attr == &sysfs_bset_tree_stats) - return bch_bset_print_stats(c, buf); - if (attr == &sysfs_alloc_debug) - return show_fs_alloc_debug(c, buf); - - sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level); - sysfs_print(root_usage_percent, bch_root_usage(c)); - - if (attr == &sysfs_compression_stats) - return bch_compression_stats(c, buf); - - sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); - - return 0; -} - -STORE(__bch_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - if (attr == &sysfs_unregister) { - bch_fs_detach(c); - return size; - } - - if (attr == &sysfs_stop) { - bch_fs_stop_async(c); - return size; - } - - if (attr == &sysfs_clear_stats) { - atomic_long_set(&c->writeback_keys_done, 0); - atomic_long_set(&c->writeback_keys_failed, 0); - bch_cache_accounting_clear(&c->accounting); - - return size; - } - - sysfs_strtoul(congested_read_threshold_us, - c->congested_read_threshold_us); - sysfs_strtoul(congested_write_threshold_us, - c->congested_write_threshold_us); - - if (attr == &sysfs_io_error_limit) { - c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; - return size; - } - - /* See count_io_errors() for why 88 */ - if (attr == &sysfs_io_error_halflife) { - c->error_decay = strtoul_or_return(buf) / 88; - return size; - } - - sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); - sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - - sysfs_strtoul(foreground_write_ratelimit_enabled, - c->foreground_write_ratelimit_enabled); - - if (attr == &sysfs_copy_gc_enabled) { - struct bch_dev *ca; - unsigned i; - ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) - ?: (ssize_t) size; - - for_each_member_device(ca, c, i) - if (ca->moving_gc_read) - wake_up_process(ca->moving_gc_read); - return ret; - } - - if (attr == &sysfs_tiering_enabled) { - ssize_t ret = strtoul_safe(buf, c->tiering_enabled) - ?: (ssize_t) size; - - bch_tiering_start(c); /* issue wakeups */ - return ret; - } - - sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); - - sysfs_strtoul(pd_controllers_update_seconds, - c->pd_controllers_update_seconds); - sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); - - sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */ - - /* Debugging: */ - -#define 
BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - - if (!bch_fs_running(c)) - return -EPERM; - - if (attr == &sysfs_journal_flush) { - bch_journal_meta_async(&c->journal, NULL); - - return size; - } - - if (attr == &sysfs_blockdev_volume_create) { - u64 v = strtoi_h_or_return(buf); - int r = bch_blockdev_volume_create(c, v); - - if (r) - return r; - } - - if (attr == &sysfs_trigger_btree_coalesce) - bch_coalesce(c); - - /* Debugging: */ - - if (attr == &sysfs_trigger_gc) - bch_gc(c); - - if (attr == &sysfs_prune_cache) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc); - } - - return size; -} - -STORE(bch_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - mutex_lock(&c->state_lock); - size = __bch_fs_store(kobj, attr, buf, size); - mutex_unlock(&c->state_lock); - - return size; -} - -static struct attribute *bch_fs_files[] = { - &sysfs_unregister, - &sysfs_stop, - &sysfs_journal_write_delay_ms, - &sysfs_journal_reclaim_delay_ms, - &sysfs_journal_entry_size_max, - &sysfs_blockdev_volume_create, - - &sysfs_block_size, - &sysfs_block_size_bytes, - &sysfs_btree_node_size, - &sysfs_btree_node_size_bytes, - &sysfs_tree_depth, - &sysfs_root_usage_percent, - &sysfs_btree_cache_size, - &sysfs_cache_available_percent, - &sysfs_compression_stats, - - &sysfs_average_key_size, - - &sysfs_io_error_limit, - &sysfs_io_error_halflife, - &sysfs_congested, - &sysfs_congested_read_threshold_us, - &sysfs_congested_write_threshold_us, - &sysfs_clear_stats, - - &sysfs_meta_replicas_have, - &sysfs_data_replicas_have, - - &sysfs_foreground_target_percent, - &sysfs_tiering_percent, - - &sysfs_journal_flush, - NULL -}; -KTYPE(bch_fs); - -/* internal dir - just a wrapper */ - -SHOW(bch_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - return bch_fs_show(&c->kobj, attr, buf); -} - -STORE(bch_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - return bch_fs_store(&c->kobj, attr, buf, size); -} - -static void bch_fs_internal_release(struct kobject *k) -{ -} - -static struct attribute *bch_fs_internal_files[] = { - &sysfs_journal_debug, - - &sysfs_alloc_debug, - - &sysfs_btree_gc_running, - - &sysfs_btree_nodes, - &sysfs_btree_used_percent, - - &sysfs_bset_tree_stats, - &sysfs_cache_read_races, - &sysfs_writeback_keys_done, - &sysfs_writeback_keys_failed, - - &sysfs_trigger_btree_coalesce, - &sysfs_trigger_gc, - &sysfs_prune_cache, - &sysfs_foreground_write_ratelimit_enabled, - &sysfs_copy_gc_enabled, - &sysfs_tiering_enabled, - sysfs_pd_controller_files(tiering), - sysfs_pd_controller_files(foreground_write), - &sysfs_internal_uuid, - -#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - - NULL -}; -KTYPE(bch_fs_internal); - -/* options */ - -SHOW(bch_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - - return bch_opt_show(&c->opts, attr->name, buf, PAGE_SIZE); -} - -STORE(bch_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt; - enum bch_opt_id id; - u64 v; - - id = bch_parse_sysfs_opt(attr->name, buf, &v); - if (id < 0) - return id; - - opt = &bch_opt_table[id]; - - mutex_lock(&c->sb_lock); - - if (id == Opt_compression) { - int ret = bch_check_set_has_compressed_data(c, v); - if (ret) { - 
mutex_unlock(&c->sb_lock); - return ret; - } - } - - if (opt->set_sb != SET_NO_SB_OPT) { - opt->set_sb(c->disk_sb, v); - bch_write_super(c); - } - - bch_opt_set(&c->opts, id, v); - - mutex_unlock(&c->sb_lock); - - return size; -} - -static void bch_fs_opts_dir_release(struct kobject *k) -{ -} - -static struct attribute *bch_fs_opts_dir_files[] = { -#define BCH_OPT(_name, ...) \ - &sysfs_opt_##_name, - - BCH_VISIBLE_OPTS() -#undef BCH_OPT - - NULL -}; -KTYPE(bch_fs_opts_dir); - -/* time stats */ - -SHOW(bch_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_print_time_stats(&c->name##_time, name, \ - frequency_units, duration_units); - BCH_TIME_STATS() -#undef BCH_TIME_STAT - - return 0; -} - -STORE(bch_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_clear_time_stats(&c->name##_time, name); - BCH_TIME_STATS() -#undef BCH_TIME_STAT - - return size; -} - -static void bch_fs_time_stats_release(struct kobject *k) -{ -} - -static struct attribute *bch_fs_time_stats_files[] = { -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_time_stats_attribute_list(name, frequency_units, duration_units) - BCH_TIME_STATS() -#undef BCH_TIME_STAT - - NULL -}; -KTYPE(bch_fs_time_stats); - -typedef unsigned (bucket_map_fn)(struct bch_dev *, struct bucket *, void *); - -static unsigned bucket_priority_fn(struct bch_dev *ca, struct bucket *g, - void *private) -{ - int rw = (private ? 1 : 0); - - return ca->fs->prio_clock[rw].hand - g->prio[rw]; -} - -static unsigned bucket_sectors_used_fn(struct bch_dev *ca, struct bucket *g, - void *private) -{ - return bucket_sectors_used(g); -} - -static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, struct bucket *g, - void *private) -{ - return bucket_gc_gen(ca, g); -} - -static ssize_t show_quantiles(struct bch_dev *ca, char *buf, - bucket_map_fn *fn, void *private) -{ - int cmp(const void *l, const void *r) - { return *((unsigned *) r) - *((unsigned *) l); } - - size_t n = ca->mi.nbuckets, i; - /* Compute 31 quantiles */ - unsigned q[31], *p; - ssize_t ret = 0; - - p = vzalloc(ca->mi.nbuckets * sizeof(unsigned)); - if (!p) - return -ENOMEM; - - for (i = ca->mi.first_bucket; i < n; i++) - p[i] = fn(ca, &ca->buckets[i], private); - - sort(p, n, sizeof(unsigned), cmp, NULL); - - while (n && - !p[n - 1]) - --n; - - for (i = 0; i < ARRAY_SIZE(q); i++) - q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; - - vfree(p); - - for (i = 0; i < ARRAY_SIZE(q); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%u ", q[i]); - buf[ret - 1] = '\n'; - - return ret; - -} - -static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) -{ - enum alloc_reserve i; - ssize_t ret; - - spin_lock(&ca->freelist_lock); - - ret = scnprintf(buf, PAGE_SIZE, - "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - for (i = 0; i < RESERVE_NR; i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); - - spin_unlock(&ca->freelist_lock); - - return ret; -} - -static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch_dev_usage_read(ca); - - return scnprintf(buf, PAGE_SIZE, - "free_inc: %zu/%zu\n" - "free[RESERVE_PRIO]: %zu/%zu\n" - "free[RESERVE_BTREE]: %zu/%zu\n" - "free[RESERVE_MOVINGGC]: 
%zu/%zu\n" - "free[RESERVE_NONE]: %zu/%zu\n" - "alloc: %llu/%llu\n" - "meta: %llu/%llu\n" - "dirty: %llu/%llu\n" - "available: %llu/%llu\n" - "freelist_wait: %s\n" - "open buckets: %u/%u (reserved %u)\n" - "open_buckets_wait: %s\n", - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size, - fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, - stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets_meta, ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets_dirty, ca->mi.nbuckets - ca->mi.first_bucket, - __dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket, - c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, - c->open_buckets_wait.list.first ? "waiting" : "empty"); -} - -static u64 sectors_written(struct bch_dev *ca) -{ - u64 ret = 0; - int cpu; - - for_each_possible_cpu(cpu) - ret += *per_cpu_ptr(ca->sectors_written, cpu); - - return ret; -} - -SHOW(bch_dev) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch_dev_usage_read(ca); - - sysfs_printf(uuid, "%pU\n", ca->uuid.b); - - sysfs_hprint(bucket_size, bucket_bytes(ca)); - sysfs_print(bucket_size_bytes, bucket_bytes(ca)); - sysfs_hprint(block_size, block_bytes(c)); - sysfs_print(block_size_bytes, block_bytes(c)); - sysfs_print(first_bucket, ca->mi.first_bucket); - sysfs_print(nbuckets, ca->mi.nbuckets); - sysfs_print(discard, ca->mi.discard); - sysfs_hprint(written, sectors_written(ca) << 9); - sysfs_hprint(btree_written, - atomic64_read(&ca->btree_sectors_written) << 9); - sysfs_hprint(metadata_written, - (atomic64_read(&ca->meta_sectors_written) + - atomic64_read(&ca->btree_sectors_written)) << 9); - - sysfs_print(io_errors, - atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); - - sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9); - sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9); - sysfs_print(dirty_buckets, stats.buckets_dirty); - sysfs_hprint(cached_data, stats.sectors[S_CACHED] << 9); - sysfs_print(cached_bytes, stats.sectors[S_CACHED] << 9); - sysfs_print(cached_buckets, stats.buckets_cached); - sysfs_print(meta_buckets, stats.buckets_meta); - sysfs_print(alloc_buckets, stats.buckets_alloc); - sysfs_print(available_buckets, dev_buckets_available(ca)); - sysfs_print(free_buckets, dev_buckets_free(ca)); - sysfs_print(has_data, ca->mi.has_data); - sysfs_print(has_metadata, ca->mi.has_metadata); - - sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); - - if (attr == &sysfs_cache_replacement_policy) - return bch_snprint_string_list(buf, PAGE_SIZE, - bch_cache_replacement_policies, - ca->mi.replacement); - - sysfs_print(tier, ca->mi.tier); - - if (attr == &sysfs_state_rw) - return bch_snprint_string_list(buf, PAGE_SIZE, - bch_dev_state, - ca->mi.state); - - if (attr == &sysfs_read_priority_stats) - return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0); - if (attr == &sysfs_write_priority_stats) - return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1); - if (attr == &sysfs_fragmentation_stats) - return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL); - if (attr == &sysfs_oldest_gen_stats) - return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL); - if (attr == &sysfs_reserve_stats) - return 
show_reserve_stats(ca, buf); - if (attr == &sysfs_alloc_debug) - return show_dev_alloc_debug(ca, buf); - - return 0; -} - -STORE(bch_dev) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - struct bch_member *mi; - - sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); - - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - mutex_lock(&c->sb_lock); - mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; - - if (v != BCH_MEMBER_DISCARD(mi)) { - SET_BCH_MEMBER_DISCARD(mi, v); - bch_write_super(c); - } - mutex_unlock(&c->sb_lock); - } - - if (attr == &sysfs_cache_replacement_policy) { - ssize_t v = bch_read_string_list(buf, bch_cache_replacement_policies); - - if (v < 0) - return v; - - mutex_lock(&c->sb_lock); - mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; - - if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { - SET_BCH_MEMBER_REPLACEMENT(mi, v); - bch_write_super(c); - } - mutex_unlock(&c->sb_lock); - } - - if (attr == &sysfs_tier) { - unsigned prev_tier; - unsigned v = strtoul_restrict_or_return(buf, - 0, BCH_TIER_MAX - 1); - - mutex_lock(&c->sb_lock); - prev_tier = ca->mi.tier; - - if (v == ca->mi.tier) { - mutex_unlock(&c->sb_lock); - return size; - } - - mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; - SET_BCH_MEMBER_TIER(mi, v); - bch_write_super(c); - - bch_dev_group_remove(&c->tiers[prev_tier].devs, ca); - bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); - mutex_unlock(&c->sb_lock); - - bch_recalc_capacity(c); - bch_tiering_start(c); - } - - if (attr == &sysfs_clear_stats) { - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(ca->sectors_written, cpu) = 0; - - atomic64_set(&ca->btree_sectors_written, 0); - atomic64_set(&ca->meta_sectors_written, 0); - atomic_set(&ca->io_count, 0); - atomic_set(&ca->io_errors, 0); - } - - return size; -} - -static struct attribute *bch_dev_files[] = { - &sysfs_uuid, - &sysfs_bucket_size, - &sysfs_bucket_size_bytes, - &sysfs_block_size, - &sysfs_block_size_bytes, - &sysfs_first_bucket, - &sysfs_nbuckets, - &sysfs_read_priority_stats, - &sysfs_write_priority_stats, - &sysfs_fragmentation_stats, - &sysfs_oldest_gen_stats, - &sysfs_reserve_stats, - &sysfs_available_buckets, - &sysfs_free_buckets, - &sysfs_dirty_data, - &sysfs_dirty_bytes, - &sysfs_dirty_buckets, - &sysfs_cached_data, - &sysfs_cached_bytes, - &sysfs_cached_buckets, - &sysfs_meta_buckets, - &sysfs_alloc_buckets, - &sysfs_has_data, - &sysfs_has_metadata, - &sysfs_discard, - &sysfs_written, - &sysfs_btree_written, - &sysfs_metadata_written, - &sysfs_io_errors, - &sysfs_clear_stats, - &sysfs_cache_replacement_policy, - &sysfs_tier, - &sysfs_state_rw, - &sysfs_alloc_debug, - - sysfs_pd_controller_files(copy_gc), - NULL -}; -KTYPE(bch_dev); diff --git a/libbcache/sysfs.h b/libbcache/sysfs.h deleted file mode 100644 index 02700246..00000000 --- a/libbcache/sysfs.h +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef _BCACHE_SYSFS_H_ -#define _BCACHE_SYSFS_H_ - -#include "util.h" - -#define KTYPE(type) \ -struct kobj_type type ## _ktype = { \ - .release = type ## _release, \ - .sysfs_ops = &((const struct sysfs_ops) { \ - .show = type ## _show, \ - .store = type ## _store \ - }), \ - .default_attrs = type ## _files \ -} - -#define SHOW(fn) \ -static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ - char *buf) \ - -#define STORE(fn) \ -static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ - const char *buf, size_t size) \ - -#define __sysfs_attribute(_name, _mode) \ 
- static struct attribute sysfs_##_name = \ - { .name = #_name, .mode = _mode } - -#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) - -#define sysfs_printf(file, fmt, ...) \ -do { \ - if (attr == &sysfs_ ## file) \ - return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \ -} while (0) - -#define sysfs_print(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return snprint(buf, PAGE_SIZE, var); \ -} while (0) - -#define sysfs_hprint(file, val) \ -do { \ - if (attr == &sysfs_ ## file) { \ - ssize_t ret = bch_hprint(buf, val); \ - strcat(buf, "\n"); \ - return ret + 1; \ - } \ -} while (0) - -#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -#define var_print(_var) sysfs_print(_var, var(_var)) -#define var_hprint(_var) sysfs_hprint(_var, var(_var)) - -#define sysfs_strtoul(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe(buf, var) ?: (ssize_t) size; \ -} while (0) - -#define sysfs_strtoul_clamp(file, var, min, max) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe_clamp(buf, var, min, max) \ - ?: (ssize_t) size; \ -} while (0) - -#define strtoul_or_return(cp) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -#define strtoul_restrict_or_return(cp, min, max) \ -({ \ - unsigned long __v = 0; \ - int _r = strtoul_safe_restrict(cp, __v, min, max); \ - if (_r) \ - return _r; \ - __v; \ -}) - -#define strtoi_h_or_return(cp) \ -({ \ - u64 _v; \ - int _r = strtoi_h(cp, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -#define sysfs_hatoi(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoi_h(buf, &var) ?: (ssize_t) size; \ -} while (0) - -#endif /* _BCACHE_SYSFS_H_ */ diff --git a/libbcache/tier.c b/libbcache/tier.c deleted file mode 100644 index 8627ac3e..00000000 --- a/libbcache/tier.c +++ /dev/null @@ -1,282 +0,0 @@ - -#include "bcache.h" -#include "alloc.h" -#include "btree_iter.h" -#include "buckets.h" -#include "clock.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" -#include "move.h" -#include "super-io.h" -#include "tier.h" - -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <trace/events/bcache.h> - -struct tiering_state { - struct bch_tier *tier; - unsigned sectors; - unsigned stripe_size; - unsigned dev_idx; - struct bch_dev *ca; -}; - -static bool tiering_pred(struct bch_fs *c, - struct tiering_state *s, - struct bkey_s_c k) -{ - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - unsigned replicas = 0; - - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return false; - - extent_for_each_ptr(e, ptr) - if (c->devs[ptr->dev]->mi.tier >= s->tier->idx) - replicas++; - - return replicas < c->opts.data_replicas; - } - - return false; -} - -static void tier_put_device(struct tiering_state *s) -{ - if (s->ca) - percpu_ref_put(&s->ca->io_ref); - s->ca = NULL; -} - -/** - * refill_next - move on to refilling the next cache's tiering keylist - */ -static void tier_next_device(struct bch_fs *c, struct tiering_state *s) -{ - if (!s->ca || s->sectors > s->stripe_size) { - tier_put_device(s); - s->sectors = 0; - s->dev_idx++; - - spin_lock(&s->tier->devs.lock); - if (s->dev_idx >= s->tier->devs.nr) - s->dev_idx = 0; - - if (s->tier->devs.nr) { - s->ca = 
s->tier->devs.d[s->dev_idx].dev; - percpu_ref_get(&s->ca->io_ref); - } - spin_unlock(&s->tier->devs.lock); - } -} - -static int issue_tiering_move(struct bch_fs *c, - struct tiering_state *s, - struct moving_context *ctxt, - struct bkey_s_c k) -{ - int ret; - - ret = bch_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL); - if (!ret) { - trace_bcache_tiering_copy(k.k); - s->sectors += k.k->size; - } else { - trace_bcache_tiering_alloc_fail(c, k.k->size); - } - - return ret; -} - -/** - * tiering_next_cache - issue a move to write an extent to the next cache - * device in round robin order - */ -static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) -{ - struct moving_context ctxt; - struct tiering_state s; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_devices = READ_ONCE(tier->devs.nr); - int ret; - - if (!nr_devices) - return 0; - - trace_bcache_tiering_start(c); - - memset(&s, 0, sizeof(s)); - s.tier = tier; - s.stripe_size = 2048; /* 1 mb for now */ - - bch_move_ctxt_init(&ctxt, &tier->pd.rate, - nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); - bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - - while (!kthread_should_stop() && - !bch_move_ctxt_wait(&ctxt) && - (k = bch_btree_iter_peek(&iter)).k && - !btree_iter_err(k)) { - if (!tiering_pred(c, &s, k)) - goto next; - - tier_next_device(c, &s); - if (!s.ca) - break; - - ret = issue_tiering_move(c, &s, &ctxt, k); - if (ret) { - bch_btree_iter_unlock(&iter); - - /* memory allocation failure, wait for some IO to finish */ - bch_move_ctxt_wait_for_io(&ctxt); - continue; - } -next: - bch_btree_iter_advance_pos(&iter); - //bch_btree_iter_cond_resched(&iter); - - /* unlock before calling moving_context_wait() */ - bch_btree_iter_unlock(&iter); - cond_resched(); - } - - bch_btree_iter_unlock(&iter); - tier_put_device(&s); - bch_move_ctxt_exit(&ctxt); - trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); - - return ctxt.sectors_moved; -} - -static int bch_tiering_thread(void *arg) -{ - struct bch_tier *tier = arg; - struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); - struct io_clock *clock = &c->io_clock[WRITE]; - struct bch_dev *ca; - u64 tier_capacity, available_sectors; - unsigned long last; - unsigned i; - - set_freezable(); - - while (!kthread_should_stop()) { - if (kthread_wait_freezable(c->tiering_enabled && - tier->devs.nr)) - break; - - while (1) { - struct bch_tier *faster_tier; - - last = atomic_long_read(&clock->now); - - tier_capacity = available_sectors = 0; - for (faster_tier = c->tiers; - faster_tier != tier; - faster_tier++) { - spin_lock(&faster_tier->devs.lock); - group_for_each_dev(ca, &faster_tier->devs, i) { - tier_capacity += - (ca->mi.nbuckets - - ca->mi.first_bucket) << ca->bucket_bits; - available_sectors += - dev_buckets_available(ca) << ca->bucket_bits; - } - spin_unlock(&faster_tier->devs.lock); - } - - if (available_sectors < (tier_capacity >> 1)) - break; - - bch_kthread_io_clock_wait(clock, - last + - available_sectors - - (tier_capacity >> 1)); - if (kthread_should_stop()) - return 0; - } - - read_tiering(c, tier); - } - - return 0; -} - -static void __bch_tiering_stop(struct bch_tier *tier) -{ - tier->pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&tier->pd.rate); - - if (tier->migrate) - kthread_stop(tier->migrate); - - tier->migrate = NULL; -} - -void bch_tiering_stop(struct bch_fs *c) -{ - struct bch_tier *tier; - - for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) - __bch_tiering_stop(tier); -} - -static int 
__bch_tiering_start(struct bch_tier *tier) -{ - if (!tier->migrate) { - struct task_struct *p = - kthread_create(bch_tiering_thread, tier, - "bch_tier[%u]", tier->idx); - if (IS_ERR(p)) - return PTR_ERR(p); - - tier->migrate = p; - } - - wake_up_process(tier->migrate); - return 0; -} - -int bch_tiering_start(struct bch_fs *c) -{ - struct bch_tier *tier; - bool have_faster_tier = false; - - if (c->opts.nochanges) - return 0; - - for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { - if (!tier->devs.nr) - continue; - - if (have_faster_tier) { - int ret = __bch_tiering_start(tier); - if (ret) - return ret; - } else { - __bch_tiering_stop(tier); - } - - have_faster_tier = true; - } - - return 0; -} - -void bch_fs_tiering_init(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { - c->tiers[i].idx = i; - bch_pd_controller_init(&c->tiers[i].pd); - } -} diff --git a/libbcache/tier.h b/libbcache/tier.h deleted file mode 100644 index b6f8d4a2..00000000 --- a/libbcache/tier.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _BCACHE_TIER_H -#define _BCACHE_TIER_H - -void bch_tiering_stop(struct bch_fs *); -int bch_tiering_start(struct bch_fs *); -void bch_fs_tiering_init(struct bch_fs *); - -#endif diff --git a/libbcache/trace.c b/libbcache/trace.c deleted file mode 100644 index def525d1..00000000 --- a/libbcache/trace.c +++ /dev/null @@ -1,11 +0,0 @@ -#include "bcache.h" -#include "alloc_types.h" -#include "blockdev_types.h" -#include "buckets.h" -#include "btree_types.h" -#include "keylist.h" - -#include <linux/blktrace_api.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/bcache.h> diff --git a/libbcache/util.c b/libbcache/util.c deleted file mode 100644 index 5f816593..00000000 --- a/libbcache/util.c +++ /dev/null @@ -1,418 +0,0 @@ -/* - * random utiility code, for bcache but in theory not specific to bcache - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. 
- */ - -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/ctype.h> -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/seq_file.h> -#include <linux/types.h> - -#include <linux/freezer.h> -#include <linux/kthread.h> - -#include "util.h" - -#define simple_strtoint(c, end, base) simple_strtol(c, end, base) -#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) - -#define STRTO_H(name, type) \ -int bch_ ## name ## _h(const char *cp, type *res) \ -{ \ - int u = 0; \ - char *e; \ - type i = simple_ ## name(cp, &e, 10); \ - \ - switch (tolower(*e)) { \ - default: \ - return -EINVAL; \ - case 'y': \ - case 'z': \ - u++; \ - case 'e': \ - u++; \ - case 'p': \ - u++; \ - case 't': \ - u++; \ - case 'g': \ - u++; \ - case 'm': \ - u++; \ - case 'k': \ - u++; \ - if (e++ == cp) \ - return -EINVAL; \ - case '\n': \ - case '\0': \ - if (*e == '\n') \ - e++; \ - } \ - \ - if (*e) \ - return -EINVAL; \ - \ - while (u--) { \ - if ((type) ~0 > 0 && \ - (type) ~0 / 1024 <= i) \ - return -EINVAL; \ - if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ - (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ - return -EINVAL; \ - i *= 1024; \ - } \ - \ - *res = i; \ - return 0; \ -} \ - -STRTO_H(strtoint, int) -STRTO_H(strtouint, unsigned int) -STRTO_H(strtoll, long long) -STRTO_H(strtoull, unsigned long long) - -ssize_t bch_hprint(char *buf, s64 v) -{ - static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; - int u, t = 0; - - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0 << 10); - v >>= 10; - } - - if (!u) - return sprintf(buf, "%lli", v); - - /* - * 103 is magic: t is in the range [-1023, 1023] and we want - * to turn it into [-9, 9] - */ - if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 103); - - return sprintf(buf, "%lli%s%c", v, dec, units[u]); -} - -ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected) -{ - char *out = buf; - size_t i; - - for (i = 0; list[i]; i++) - out += snprintf(out, buf + size - out, - i == selected ? "[%s] " : "%s ", list[i]); - - out[-1] = '\n'; - return out - buf; -} - -ssize_t bch_read_string_list(const char *buf, const char * const list[]) -{ - size_t i; - char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); - if (!d) - return -ENOMEM; - - s = strim(d); - - for (i = 0; list[i]; i++) - if (!strcmp(list[i], s)) - break; - - kfree(d); - - if (!list[i]) - return -EINVAL; - - return i; -} - -bool bch_is_zero(const void *_p, size_t n) -{ - const char *p = _p; - size_t i; - - for (i = 0; i < n; i++) - if (p[i]) - return false; - return true; -} - -void bch_time_stats_clear(struct time_stats *stats) -{ - spin_lock(&stats->lock); - - stats->count = 0; - stats->last_duration = 0; - stats->max_duration = 0; - stats->average_duration = 0; - stats->average_frequency = 0; - stats->last = 0; - - spin_unlock(&stats->lock); -} - -void __bch_time_stats_update(struct time_stats *stats, u64 start_time) -{ - u64 now, duration, last; - - stats->count++; - - now = local_clock(); - duration = time_after64(now, start_time) - ? now - start_time : 0; - last = time_after64(now, stats->last) - ? 
now - stats->last : 0; - - stats->last_duration = duration; - stats->max_duration = max(stats->max_duration, duration); - - if (stats->last) { - stats->average_duration = ewma_add(stats->average_duration, - duration << 8, 3); - - if (stats->average_frequency) - stats->average_frequency = - ewma_add(stats->average_frequency, - last << 8, 3); - else - stats->average_frequency = last << 8; - } else { - stats->average_duration = duration << 8; - } - - stats->last = now ?: 1; -} - -void bch_time_stats_update(struct time_stats *stats, u64 start_time) -{ - spin_lock(&stats->lock); - __bch_time_stats_update(stats, start_time); - spin_unlock(&stats->lock); -} - -/** - * bch_ratelimit_delay() - return how long to delay until the next time to do - * some work - * - * @d - the struct bch_ratelimit to update - * - * Returns the amount of time to delay by, in jiffies - */ -u64 bch_ratelimit_delay(struct bch_ratelimit *d) -{ - u64 now = local_clock(); - - return time_after64(d->next, now) - ? nsecs_to_jiffies(d->next - now) - : 0; -} - -/** - * bch_ratelimit_increment() - increment @d by the amount of work done - * - * @d - the struct bch_ratelimit to update - * @done - the amount of work done, in arbitrary units - */ -void bch_ratelimit_increment(struct bch_ratelimit *d, u64 done) -{ - u64 now = local_clock(); - - d->next += div_u64(done * NSEC_PER_SEC, d->rate); - - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; - - if (time_after64(now - NSEC_PER_SEC * 2, d->next)) - d->next = now - NSEC_PER_SEC * 2; -} - -int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) -{ - while (1) { - u64 delay = bch_ratelimit_delay(d); - - if (delay) - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) - return 1; - - if (!delay) - return 0; - - schedule_timeout(delay); - try_to_freeze(); - } -} - -/* - * Updates pd_controller. Attempts to scale inputed values to units per second. - * @target: desired value - * @actual: current value - * - * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing - * it makes actual go down. 
- */ -void bch_pd_controller_update(struct bch_pd_controller *pd, - s64 target, s64 actual, int sign) -{ - s64 proportional, derivative, change; - - unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; - - if (seconds_since_update == 0) - return; - - pd->last_update = jiffies; - - proportional = actual - target; - proportional *= seconds_since_update; - proportional = div_s64(proportional, pd->p_term_inverse); - - derivative = actual - pd->last_actual; - derivative = div_s64(derivative, seconds_since_update); - derivative = ewma_add(pd->smoothed_derivative, derivative, - (pd->d_term / seconds_since_update) ?: 1); - derivative = derivative * pd->d_term; - derivative = div_s64(derivative, pd->p_term_inverse); - - change = proportional + derivative; - - /* Don't increase rate if not keeping up */ - if (change > 0 && - pd->backpressure && - time_after64(local_clock(), - pd->rate.next + NSEC_PER_MSEC)) - change = 0; - - change *= (sign * -1); - - pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, - 1, UINT_MAX); - - pd->last_actual = actual; - pd->last_derivative = derivative; - pd->last_proportional = proportional; - pd->last_change = change; - pd->last_target = target; -} - -void bch_pd_controller_init(struct bch_pd_controller *pd) -{ - pd->rate.rate = 1024; - pd->last_update = jiffies; - pd->p_term_inverse = 6000; - pd->d_term = 30; - pd->d_smooth = pd->d_term; - pd->backpressure = 1; -} - -size_t bch_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) -{ - /* 2^64 - 1 is 20 digits, plus null byte */ - char rate[21]; - char actual[21]; - char target[21]; - char proportional[21]; - char derivative[21]; - char change[21]; - s64 next_io; - - bch_hprint(rate, pd->rate.rate); - bch_hprint(actual, pd->last_actual); - bch_hprint(target, pd->last_target); - bch_hprint(proportional, pd->last_proportional); - bch_hprint(derivative, pd->last_derivative); - bch_hprint(change, pd->last_change); - - next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); - - return sprintf(buf, - "rate:\t\t%s/sec\n" - "target:\t\t%s\n" - "actual:\t\t%s\n" - "proportional:\t%s\n" - "derivative:\t%s\n" - "change:\t\t%s/sec\n" - "next io:\t%llims\n", - rate, target, actual, proportional, - derivative, change, next_io); -} - -void bch_bio_map(struct bio *bio, void *base) -{ - size_t size = bio->bi_iter.bi_size; - struct bio_vec *bv = bio->bi_io_vec; - - BUG_ON(!bio->bi_iter.bi_size); - BUG_ON(bio->bi_vcnt); - - bv->bv_offset = base ? offset_in_page(base) : 0; - goto start; - - for (; size; bio->bi_vcnt++, bv++) { - bv->bv_offset = 0; -start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, - size); - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); - if (base) { - bv->bv_page = is_vmalloc_addr(base) - ? 
vmalloc_to_page(base) - : virt_to_page(base); - - base += bv->bv_len; - } - - size -= bv->bv_len; - } -} - -size_t bch_rand_range(size_t max) -{ - size_t rand; - - do { - get_random_bytes(&rand, sizeof(rand)); - rand &= roundup_pow_of_two(max) - 1; - } while (rand >= max); - - return rand; -} - -void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, dst, iter, dst_iter) { - void *dstp = kmap_atomic(bv.bv_page); - memcpy(dstp + bv.bv_offset, src, bv.bv_len); - kunmap_atomic(dstp); - - src += bv.bv_len; - } -} - -void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, src, iter, src_iter) { - void *srcp = kmap_atomic(bv.bv_page); - memcpy(dst, srcp + bv.bv_offset, bv.bv_len); - kunmap_atomic(srcp); - - dst += bv.bv_len; - } -} diff --git a/libbcache/util.h b/libbcache/util.h deleted file mode 100644 index 88cbe301..00000000 --- a/libbcache/util.h +++ /dev/null @@ -1,755 +0,0 @@ -#ifndef _BCACHE_UTIL_H -#define _BCACHE_UTIL_H - -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/errno.h> -#include <linux/blkdev.h> -#include <linux/freezer.h> -#include <linux/kernel.h> -#include <linux/llist.h> -#include <linux/ratelimit.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/workqueue.h> - -#include "closure.h" - -#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) -#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) - -struct closure; - -#ifdef CONFIG_BCACHE_DEBUG - -#define EBUG_ON(cond) BUG_ON(cond) -#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) -#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) -#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) -#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) -#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) -#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) -#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) -#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) -#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) - -#define memcpy(_dst, _src, _len) \ -do { \ - BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ - (void *) (_dst) + (_len) <= (void *) (_src))); \ - memcpy(_dst, _src, _len); \ -} while (0) - -#else /* DEBUG */ - -#define EBUG_ON(cond) -#define atomic_dec_bug(v) atomic_dec(v) -#define atomic_inc_bug(v, i) atomic_inc(v) -#define atomic_sub_bug(i, v) atomic_sub(i, v) -#define atomic_add_bug(i, v) atomic_add(i, v) -#define atomic_long_dec_bug(v) atomic_long_dec(v) -#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) -#define atomic64_dec_bug(v) atomic64_dec(v) -#define atomic64_inc_bug(v, i) atomic64_inc(v) -#define atomic64_sub_bug(i, v) atomic64_sub(i, v) -#define atomic64_add_bug(i, v) atomic64_add(i, v) - -#endif - -#ifndef __CHECKER__ -#define __flatten __attribute__((flatten)) -#else -/* sparse doesn't know about attribute((flatten)) */ -#define __flatten -#endif - -#ifdef __LITTLE_ENDIAN -#define CPU_BIG_ENDIAN 0 -#else -#define CPU_BIG_ENDIAN 1 -#endif - -/* type hackery */ - -#define type_is_exact(_val, _type) \ - __builtin_types_compatible_p(typeof(_val), _type) - -#define type_is(_val, _type) \ - (__builtin_types_compatible_p(typeof(_val), _type) || \ - __builtin_types_compatible_p(typeof(_val), 
const _type)) - -static inline void *kvmalloc(size_t bytes, gfp_t gfp) -{ - if (bytes <= PAGE_SIZE || - !(gfp & GFP_KERNEL)) - return kmalloc(bytes, gfp); - - return ((bytes <= KMALLOC_MAX_SIZE) - ? kmalloc(bytes, gfp|__GFP_NOWARN) - : NULL) ?: - vmalloc(bytes); -} - -#define DECLARE_HEAP(type, name) \ - struct { \ - size_t size, used; \ - type *data; \ - } name - -#define init_heap(heap, _size, gfp) \ -({ \ - size_t _bytes; \ - (heap)->used = 0; \ - (heap)->size = (_size); \ - _bytes = (heap)->size * sizeof(*(heap)->data); \ - (heap)->data = kvmalloc(_bytes, (gfp)); \ - (heap)->data; \ -}) - -#define free_heap(heap) \ -do { \ - kvfree((heap)->data); \ - (heap)->data = NULL; \ -} while (0) - -#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) - -#define heap_sift(h, i, cmp) \ -do { \ - size_t _r, _j = i; \ - \ - for (; _j * 2 + 1 < (h)->used; _j = _r) { \ - _r = _j * 2 + 1; \ - if (_r + 1 < (h)->used && \ - cmp((h)->data[_r], (h)->data[_r + 1])) \ - _r++; \ - \ - if (cmp((h)->data[_r], (h)->data[_j])) \ - break; \ - heap_swap(h, _r, _j); \ - } \ -} while (0) - -#define heap_sift_down(h, i, cmp) \ -do { \ - while (i) { \ - size_t p = (i - 1) / 2; \ - if (cmp((h)->data[i], (h)->data[p])) \ - break; \ - heap_swap(h, i, p); \ - i = p; \ - } \ -} while (0) - -#define heap_add(h, d, cmp) \ -({ \ - bool _r = !heap_full(h); \ - if (_r) { \ - size_t _i = (h)->used++; \ - (h)->data[_i] = d; \ - \ - heap_sift_down(h, _i, cmp); \ - heap_sift(h, _i, cmp); \ - } \ - _r; \ -}) - -#define heap_del(h, i, cmp) \ -do { \ - size_t _i = (i); \ - \ - BUG_ON(_i >= (h)->used); \ - (h)->used--; \ - heap_swap(h, _i, (h)->used); \ - heap_sift_down(h, _i, cmp); \ - heap_sift(h, _i, cmp); \ -} while (0) - -#define heap_pop(h, d, cmp) \ -({ \ - bool _r = (h)->used; \ - if (_r) { \ - (d) = (h)->data[0]; \ - heap_del(h, 0, cmp); \ - } \ - _r; \ -}) - -#define heap_peek(h) \ -({ \ - EBUG_ON(!(h)->used); \ - (h)->data[0]; \ -}) - -#define heap_full(h) ((h)->used == (h)->size) - -#define heap_resort(heap, cmp) \ -do { \ - ssize_t _i; \ - for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ - heap_sift(heap, _i, cmp); \ -} while (0) - -/* - * Simple array based allocator - preallocates a number of elements and you can - * never allocate more than that, also has no locking. - * - * Handy because if you know you only need a fixed number of elements you don't - * have to worry about memory allocation failure, and sometimes a mempool isn't - * what you want. - * - * We treat the free elements as entries in a singly linked list, and the - * freelist as a stack - allocating and freeing push and pop off the freelist. 
- */ - -#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ - struct { \ - type *freelist; \ - type data[size]; \ - } name - -#define array_alloc(array) \ -({ \ - typeof((array)->freelist) _ret = (array)->freelist; \ - \ - if (_ret) \ - (array)->freelist = *((typeof((array)->freelist) *) _ret);\ - \ - _ret; \ -}) - -#define array_free(array, ptr) \ -do { \ - typeof((array)->freelist) _ptr = ptr; \ - \ - *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ - (array)->freelist = _ptr; \ -} while (0) - -#define array_allocator_init(array) \ -do { \ - typeof((array)->freelist) _i; \ - \ - BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ - (array)->freelist = NULL; \ - \ - for (_i = (array)->data; \ - _i < (array)->data + ARRAY_SIZE((array)->data); \ - _i++) \ - array_free(array, _i); \ -} while (0) - -#define array_freelist_empty(array) ((array)->freelist == NULL) - -#define ANYSINT_MAX(t) \ - ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) - -int bch_strtoint_h(const char *, int *); -int bch_strtouint_h(const char *, unsigned int *); -int bch_strtoll_h(const char *, long long *); -int bch_strtoull_h(const char *, unsigned long long *); - -static inline int bch_strtol_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch_strtoint_h(cp, (int *) res); -#else - return bch_strtoll_h(cp, (long long *) res); -#endif -} - -static inline int bch_strtoul_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch_strtouint_h(cp, (unsigned int *) res); -#else - return bch_strtoull_h(cp, (unsigned long long *) res); -#endif -} - -#define strtoi_h(cp, res) \ - ( type_is(*res, int) ? bch_strtoint_h(cp, (void *) res)\ - : type_is(*res, long) ? bch_strtol_h(cp, (void *) res)\ - : type_is(*res, long long) ? bch_strtoll_h(cp, (void *) res)\ - : type_is(*res, unsigned) ? bch_strtouint_h(cp, (void *) res)\ - : type_is(*res, unsigned long) ? bch_strtoul_h(cp, (void *) res)\ - : type_is(*res, unsigned long long) ? bch_strtoull_h(cp, (void *) res)\ - : -EINVAL) - -#define strtoul_safe(cp, var) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = _v; \ - _r; \ -}) - -#define strtoul_safe_clamp(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = clamp_t(typeof(var), _v, min, max); \ - _r; \ -}) - -#define strtoul_safe_restrict(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r && _v >= min && _v <= max) \ - var = _v; \ - else \ - _r = -EINVAL; \ - _r; \ -}) - -#define snprint(buf, size, var) \ - snprintf(buf, size, \ - type_is(var, int) ? "%i\n" \ - : type_is(var, unsigned) ? "%u\n" \ - : type_is(var, long) ? "%li\n" \ - : type_is(var, unsigned long) ? "%lu\n" \ - : type_is(var, s64) ? "%lli\n" \ - : type_is(var, u64) ? "%llu\n" \ - : type_is(var, char *) ? 
"%s\n" \ - : "%i\n", var) - -ssize_t bch_hprint(char *buf, s64 v); - -bool bch_is_zero(const void *, size_t); - -ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected); - -ssize_t bch_read_string_list(const char *buf, const char * const list[]); - -struct time_stats { - spinlock_t lock; - u64 count; - /* - * all fields are in nanoseconds, averages are ewmas stored left shifted - * by 8 - */ - u64 last_duration; - u64 max_duration; - u64 average_duration; - u64 average_frequency; - u64 last; -}; - -void bch_time_stats_clear(struct time_stats *stats); -void __bch_time_stats_update(struct time_stats *stats, u64 time); -void bch_time_stats_update(struct time_stats *stats, u64 time); - -static inline unsigned local_clock_us(void) -{ - return local_clock() >> 10; -} - -#define NSEC_PER_ns 1L -#define NSEC_PER_us NSEC_PER_USEC -#define NSEC_PER_ms NSEC_PER_MSEC -#define NSEC_PER_sec NSEC_PER_SEC - -#define __print_time_stat(stats, name, stat, units) \ - sysfs_print(name ## _ ## stat ## _ ## units, \ - div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) - -#define sysfs_print_time_stats(stats, name, \ - frequency_units, \ - duration_units) \ -do { \ - __print_time_stat(stats, name, \ - average_frequency, frequency_units); \ - __print_time_stat(stats, name, \ - average_duration, duration_units); \ - sysfs_print(name ## _ ##count, (stats)->count); \ - sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \ - div_u64((stats)->last_duration, \ - NSEC_PER_ ## duration_units)); \ - sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ - div_u64((stats)->max_duration, \ - NSEC_PER_ ## duration_units)); \ - \ - sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ - ? div_s64(local_clock() - (stats)->last, \ - NSEC_PER_ ## frequency_units) \ - : -1LL); \ -} while (0) - -#define sysfs_clear_time_stats(stats, name) \ -do { \ - if (attr == &sysfs_ ## name ## _clear) \ - bch_time_stats_clear(stats); \ -} while (0) - -#define sysfs_time_stats_attribute(name, \ - frequency_units, \ - duration_units) \ -write_attribute(name ## _clear); \ -read_attribute(name ## _count); \ -read_attribute(name ## _average_frequency_ ## frequency_units); \ -read_attribute(name ## _average_duration_ ## duration_units); \ -read_attribute(name ## _last_duration_ ## duration_units); \ -read_attribute(name ## _max_duration_ ## duration_units); \ -read_attribute(name ## _last_ ## frequency_units) - -#define sysfs_time_stats_attribute_list(name, \ - frequency_units, \ - duration_units) \ -&sysfs_ ## name ## _clear, \ -&sysfs_ ## name ## _count, \ -&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ -&sysfs_ ## name ## _average_duration_ ## duration_units, \ -&sysfs_ ## name ## _last_duration_ ## duration_units, \ -&sysfs_ ## name ## _max_duration_ ## duration_units, \ -&sysfs_ ## name ## _last_ ## frequency_units, - -#define ewma_add(ewma, val, weight) \ -({ \ - typeof(ewma) _ewma = (ewma); \ - typeof(weight) _weight = (weight); \ - \ - (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -}) - -struct bch_ratelimit { - /* Next time we want to do some work, in nanoseconds */ - u64 next; - - /* - * Rate at which we want to do work, in units per nanosecond - * The units here correspond to the units passed to - * bch_ratelimit_increment() - */ - unsigned rate; -}; - -static inline void bch_ratelimit_reset(struct bch_ratelimit *d) -{ - d->next = local_clock(); -} - -u64 bch_ratelimit_delay(struct bch_ratelimit *); -void bch_ratelimit_increment(struct bch_ratelimit 
*, u64); -int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *); - -struct bch_pd_controller { - struct bch_ratelimit rate; - unsigned long last_update; - - s64 last_actual; - s64 smoothed_derivative; - - unsigned p_term_inverse; - unsigned d_smooth; - unsigned d_term; - - /* for exporting to sysfs (no effect on behavior) */ - s64 last_derivative; - s64 last_proportional; - s64 last_change; - s64 last_target; - - /* If true, the rate will not increase if bch_ratelimit_delay() - * is not being called often enough. */ - bool backpressure; -}; - -void bch_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -void bch_pd_controller_init(struct bch_pd_controller *); -size_t bch_pd_controller_print_debug(struct bch_pd_controller *, char *); - -#define sysfs_pd_controller_attribute(name) \ - rw_attribute(name##_rate); \ - rw_attribute(name##_rate_bytes); \ - rw_attribute(name##_rate_d_term); \ - rw_attribute(name##_rate_p_term_inverse); \ - read_attribute(name##_rate_debug) - -#define sysfs_pd_controller_files(name) \ - &sysfs_##name##_rate, \ - &sysfs_##name##_rate_bytes, \ - &sysfs_##name##_rate_d_term, \ - &sysfs_##name##_rate_p_term_inverse, \ - &sysfs_##name##_rate_debug - -#define sysfs_pd_controller_show(name, var) \ -do { \ - sysfs_hprint(name##_rate, (var)->rate.rate); \ - sysfs_print(name##_rate_bytes, (var)->rate.rate); \ - sysfs_print(name##_rate_d_term, (var)->d_term); \ - sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ - \ - if (attr == &sysfs_##name##_rate_debug) \ - return bch_pd_controller_print_debug(var, buf); \ -} while (0) - -#define sysfs_pd_controller_store(name, var) \ -do { \ - sysfs_strtoul_clamp(name##_rate, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul_clamp(name##_rate_bytes, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ - sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ - (var)->p_term_inverse, 1, INT_MAX); \ -} while (0) - -#define __DIV_SAFE(n, d, zero) \ -({ \ - typeof(n) _n = (n); \ - typeof(d) _d = (d); \ - _d ? _n / _d : zero; \ -}) - -#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) - -#define container_of_or_null(ptr, type, member) \ -({ \ - typeof(ptr) _ptr = ptr; \ - _ptr ? container_of(_ptr, type, member) : NULL; \ -}) - -#define RB_INSERT(root, new, member, cmp) \ -({ \ - __label__ dup; \ - struct rb_node **n = &(root)->rb_node, *parent = NULL; \ - typeof(new) this; \ - int res, ret = -1; \ - \ - while (*n) { \ - parent = *n; \ - this = container_of(*n, typeof(*(new)), member); \ - res = cmp(new, this); \ - if (!res) \ - goto dup; \ - n = res < 0 \ - ? &(*n)->rb_left \ - : &(*n)->rb_right; \ - } \ - \ - rb_link_node(&(new)->member, parent, n); \ - rb_insert_color(&(new)->member, root); \ - ret = 0; \ -dup: \ - ret; \ -}) - -#define RB_SEARCH(root, search, member, cmp) \ -({ \ - struct rb_node *n = (root)->rb_node; \ - typeof(&(search)) this, ret = NULL; \ - int res; \ - \ - while (n) { \ - this = container_of(n, typeof(search), member); \ - res = cmp(&(search), this); \ - if (!res) { \ - ret = this; \ - break; \ - } \ - n = res < 0 \ - ? 
n->rb_left \ - : n->rb_right; \ - } \ - ret; \ -}) - -#define RB_GREATER(root, search, member, cmp) \ -({ \ - struct rb_node *n = (root)->rb_node; \ - typeof(&(search)) this, ret = NULL; \ - int res; \ - \ - while (n) { \ - this = container_of(n, typeof(search), member); \ - res = cmp(&(search), this); \ - if (res < 0) { \ - ret = this; \ - n = n->rb_left; \ - } else \ - n = n->rb_right; \ - } \ - ret; \ -}) - -#define RB_FIRST(root, type, member) \ - container_of_or_null(rb_first(root), type, member) - -#define RB_LAST(root, type, member) \ - container_of_or_null(rb_last(root), type, member) - -#define RB_NEXT(ptr, member) \ - container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) - -#define RB_PREV(ptr, member) \ - container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) - -/* Does linear interpolation between powers of two */ -static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -{ - unsigned fract = x & ~(~0 << fract_bits); - - x >>= fract_bits; - x = 1 << x; - x += (x * fract) >> fract_bits; - - return x; -} - -void bch_bio_map(struct bio *bio, void *base); - -static inline sector_t bdev_sectors(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> 9; -} - -#define closure_bio_submit(bio, cl) \ -do { \ - closure_get(cl); \ - generic_make_request(bio); \ -} while (0) - -#define closure_bio_submit_punt(bio, cl, c) \ -do { \ - closure_get(cl); \ - bch_generic_make_request(bio, c); \ -} while (0) - -#define kthread_wait_freezable(cond) \ -({ \ - int _ret = 0; \ - while (1) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (kthread_should_stop()) { \ - _ret = -1; \ - break; \ - } \ - \ - if (cond) \ - break; \ - \ - schedule(); \ - try_to_freeze(); \ - } \ - set_current_state(TASK_RUNNING); \ - _ret; \ -}) - -size_t bch_rand_range(size_t); - -void memcpy_to_bio(struct bio *, struct bvec_iter, void *); -void memcpy_from_bio(void *, struct bio *, struct bvec_iter); - -static inline void __memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ -#ifdef CONFIG_X86_64 - long d0, d1, d2; - asm volatile("rep ; movsq" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - u64 *d = dst; - const u64 *s = src; - - while (u64s--) - *d++ = *s++; -#endif -} - -static inline void memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(!(dst >= src + u64s * sizeof(u64) || - dst + u64s * sizeof(u64) <= src)); - - __memcpy_u64s(dst, src, u64s); -} - -static inline void __memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - __memcpy_u64s(dst, src, u64s); -} - -static inline void memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst > src); - - __memmove_u64s_down(dst, src, u64s); -} - -static inline void __memmove_u64s_up(void *_dst, const void *_src, - unsigned u64s) -{ - u64 *dst = (u64 *) _dst + u64s - 1; - u64 *src = (u64 *) _src + u64s - 1; - -#ifdef CONFIG_X86_64 - long d0, d1, d2; - asm volatile("std ;\n" - "rep ; movsq\n" - "cld ;\n" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - while (u64s--) - *dst-- = *src--; -#endif -} - -static inline void memmove_u64s_up(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst < src); - - __memmove_u64s_up(dst, src, u64s); -} - -static inline void memmove_u64s(void *dst, const void *src, - unsigned u64s) -{ - if (dst < src) - __memmove_u64s_down(dst, src, u64s); - else - __memmove_u64s_up(dst, src, u64s); -} - -static inline struct bio_vec 
next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - -#endif /* _BCACHE_UTIL_H */ diff --git a/libbcache/vstructs.h b/libbcache/vstructs.h deleted file mode 100644 index ce2cece0..00000000 --- a/libbcache/vstructs.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef _VSTRUCTS_H -#define _VSTRUCTS_H - -#include "util.h" - -/* - * NOTE: we can't differentiate between __le64 and u64 with type_is - this - * assumes u64 is little endian: - */ -#define __vstruct_u64s(_s) \ -({ \ - ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \ - : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \ - : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \ - : ((_s)->u64s)); \ -}) - -#define __vstruct_bytes(_type, _u64s) \ -({ \ - BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ - \ - (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -}) - -#define vstruct_bytes(_s) \ - __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) - -#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ - (round_up(__vstruct_bytes(_type, _u64s), \ - 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) - -#define vstruct_blocks(_s, _sector_block_bits) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) - -#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ - __vstruct_u64s(_s) + (_u64s)) - -#define vstruct_sectors(_s, _sector_block_bits) \ - (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) - -#define vstruct_next(_s) \ - ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) -#define vstruct_last(_s) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) -#define vstruct_end(_s) \ - ((void *) ((_s)->_data + __vstruct_u64s(_s))) - -#define vstruct_for_each(_s, _i) \ - for (_i = (_s)->start; \ - _i < vstruct_last(_s); \ - _i = vstruct_next(_i)) - -#define vstruct_for_each_safe(_s, _i, _t) \ - for (_i = (_s)->start; \ - _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ - _i = _t) - -#define vstruct_idx(_s, _idx) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) - -#endif /* _VSTRUCTS_H */ diff --git a/libbcache/writeback.c b/libbcache/writeback.c deleted file mode 100644 index 279cfe67..00000000 --- a/libbcache/writeback.c +++ /dev/null @@ -1,657 +0,0 @@ -/* - * background writeback - scan btree for dirty data and write it to the backing - * device - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. 
- */ - -#include "bcache.h" -#include "btree_update.h" -#include "clock.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "keybuf.h" -#include "keylist.h" -#include "writeback.h" - -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <trace/events/bcache.h> - -/* Rate limiting */ - -static void __update_writeback_rate(struct cached_dev *dc) -{ - struct bch_fs *c = dc->disk.c; - u64 cache_dirty_target = - div_u64(c->capacity * dc->writeback_percent, 100); - s64 target = div64_u64(cache_dirty_target * - bdev_sectors(dc->disk_sb.bdev), - c->cached_dev_sectors); - s64 dirty = bcache_dev_sectors_dirty(&dc->disk); - - bch_pd_controller_update(&dc->writeback_pd, target << 9, - dirty << 9, -1); -} - -static void update_writeback_rate(struct work_struct *work) -{ - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_pd_update); - - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && - dc->writeback_percent && - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - __update_writeback_rate(dc); - else - dc->writeback_pd.rate.rate = UINT_MAX; - - up_read(&dc->writeback_lock); - - schedule_delayed_work(&dc->writeback_pd_update, - dc->writeback_pd_update_seconds * HZ); -} - -struct dirty_io { - struct closure cl; - struct bch_replace_info replace; - struct cached_dev *dc; - struct bch_dev *ca; - struct keybuf_key *w; - struct bch_extent_ptr ptr; - int error; - bool from_mempool; - /* Must be last */ - struct bio bio; -}; - -#define DIRTY_IO_MEMPOOL_BVECS 64 -#define DIRTY_IO_MEMPOOL_SECTORS (DIRTY_IO_MEMPOOL_BVECS * PAGE_SECTORS) - -static void dirty_init(struct dirty_io *io) -{ - struct bio *bio = &io->bio; - - bio_init(bio); - if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - bio->bi_iter.bi_size = io->replace.key.k.size << 9; - bio->bi_max_vecs = - DIV_ROUND_UP(io->replace.key.k.size, PAGE_SECTORS); - bio->bi_io_vec = bio->bi_inline_vecs; - bch_bio_map(bio, NULL); -} - -static void dirty_io_destructor(struct closure *cl) -{ - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - - if (io->from_mempool) - mempool_free(io, &io->dc->writeback_io_pool); - else - kfree(io); -} - -static void write_dirty_finish(struct closure *cl) -{ - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - struct cached_dev *dc = io->dc; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, &io->bio, i) - mempool_free(bv->bv_page, &dc->writeback_page_pool); - - if (!io->error) { - BKEY_PADDED(k) tmp; - int ret; - - bkey_copy(&tmp.k, &io->replace.key); - io->replace.hook.fn = bch_extent_cmpxchg; - bkey_extent_set_cached(&tmp.k.k, true); - - ret = bch_btree_insert(dc->disk.c, BTREE_ID_EXTENTS, &tmp.k, - NULL, &io->replace.hook, NULL, 0); - if (io->replace.successes == 0) - trace_bcache_writeback_collision(&io->replace.key.k); - - atomic_long_inc(ret - ? 
&dc->disk.c->writeback_keys_failed - : &dc->disk.c->writeback_keys_done); - } - - bch_keybuf_put(&dc->writeback_keys, io->w); - - closure_return_with_destructor(cl, dirty_io_destructor); -} - -static void dirty_endio(struct bio *bio) -{ - struct dirty_io *io = container_of(bio, struct dirty_io, bio); - - if (bio->bi_error) { - trace_bcache_writeback_error(&io->replace.key.k, - op_is_write(bio_op(&io->bio)), - bio->bi_error); - io->error = bio->bi_error; - } - - closure_put(&io->cl); -} - -static void write_dirty(struct closure *cl) -{ - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - - if (!io->error) { - dirty_init(io); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); - io->bio.bi_iter.bi_sector = - bkey_start_offset(&io->replace.key.k); - io->bio.bi_bdev = io->dc->disk_sb.bdev; - io->bio.bi_end_io = dirty_endio; - - closure_bio_submit(&io->bio, cl); - } - - continue_at(cl, write_dirty_finish, io->dc->disk.c->wq); -} - -static void read_dirty_endio(struct bio *bio) -{ - struct dirty_io *io = container_of(bio, struct dirty_io, bio); - - bch_dev_nonfatal_io_err_on(bio->bi_error, io->ca, "writeback read"); - - bch_account_io_completion(io->ca); - - if (ptr_stale(io->ca, &io->ptr)) - bio->bi_error = -EINTR; - - dirty_endio(bio); -} - -static void read_dirty_submit(struct closure *cl) -{ - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - - closure_bio_submit(&io->bio, cl); - - continue_at(cl, write_dirty, system_freezable_wq); -} - -static u64 read_dirty(struct cached_dev *dc) -{ - struct keybuf_key *w; - struct dirty_io *io; - struct closure cl; - unsigned i; - struct bio_vec *bv; - u64 sectors_written = 0; - BKEY_PADDED(k) tmp; - - closure_init_stack(&cl); - - while (!bch_ratelimit_wait_freezable_stoppable(&dc->writeback_pd.rate)) { - w = bch_keybuf_next(&dc->writeback_keys); - if (!w) - break; - - sectors_written += w->key.k.size; - bkey_copy(&tmp.k, &w->key); - - while (tmp.k.k.size) { - struct extent_pick_ptr pick; - - bch_extent_pick_ptr(dc->disk.c, - bkey_i_to_s_c(&tmp.k), - &pick); - if (IS_ERR_OR_NULL(pick.ca)) - break; - - io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * - DIV_ROUND_UP(tmp.k.k.size, - PAGE_SECTORS), - GFP_KERNEL); - if (!io) { - trace_bcache_writeback_alloc_fail(pick.ca->fs, - tmp.k.k.size); - io = mempool_alloc(&dc->writeback_io_pool, - GFP_KERNEL); - memset(io, 0, sizeof(*io) + - sizeof(struct bio_vec) * - DIRTY_IO_MEMPOOL_BVECS); - io->from_mempool = true; - - bkey_copy(&io->replace.key, &tmp.k); - - if (DIRTY_IO_MEMPOOL_SECTORS < - io->replace.key.k.size) - bch_key_resize(&io->replace.key.k, - DIRTY_IO_MEMPOOL_SECTORS); - } else { - bkey_copy(&io->replace.key, &tmp.k); - } - - io->dc = dc; - io->ca = pick.ca; - io->w = w; - io->ptr = pick.ptr; - atomic_inc(&w->ref); - - dirty_init(io); - bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); - io->bio.bi_iter.bi_sector = pick.ptr.offset; - io->bio.bi_bdev = pick.ca->disk_sb.bdev; - io->bio.bi_end_io = read_dirty_endio; - - bio_for_each_segment_all(bv, &io->bio, i) { - bv->bv_page = - mempool_alloc(&dc->writeback_page_pool, - i ? 
GFP_NOWAIT - : GFP_KERNEL); - if (!bv->bv_page) { - BUG_ON(!i); - io->bio.bi_vcnt = i; - - io->bio.bi_iter.bi_size = - io->bio.bi_vcnt * PAGE_SIZE; - - bch_key_resize(&io->replace.key.k, - bio_sectors(&io->bio)); - break; - } - } - - bch_cut_front(io->replace.key.k.p, &tmp.k); - trace_bcache_writeback(&io->replace.key.k); - - bch_ratelimit_increment(&dc->writeback_pd.rate, - io->replace.key.k.size << 9); - - closure_call(&io->cl, read_dirty_submit, NULL, &cl); - } - - bch_keybuf_put(&dc->writeback_keys, w); - } - - /* - * Wait for outstanding writeback IOs to finish (and keybuf slots to be - * freed) before refilling again - */ - closure_sync(&cl); - - return sectors_written; -} - -/* Scan for dirty data */ - -static void __bcache_dev_sectors_dirty_add(struct bcache_device *d, - u64 offset, int nr_sectors) -{ - unsigned stripe_offset, stripe, sectors_dirty; - - if (!d) - return; - - if (!d->stripe_sectors_dirty) - return; - - stripe = offset_to_stripe(d, offset); - stripe_offset = offset & (d->stripe_size - 1); - - while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), - d->stripe_size - stripe_offset); - - if (nr_sectors < 0) - s = -s; - - if (stripe >= d->nr_stripes) - return; - - sectors_dirty = atomic_add_return(s, - d->stripe_sectors_dirty + stripe); - if (sectors_dirty == d->stripe_size) - set_bit(stripe, d->full_dirty_stripes); - else - clear_bit(stripe, d->full_dirty_stripes); - - nr_sectors -= s; - stripe_offset = 0; - stripe++; - } -} - -void bcache_dev_sectors_dirty_add(struct bch_fs *c, unsigned inode, - u64 offset, int nr_sectors) -{ - struct bcache_device *d; - - rcu_read_lock(); - d = bch_dev_find(c, inode); - if (d) - __bcache_dev_sectors_dirty_add(d, offset, nr_sectors); - rcu_read_unlock(); -} - -static bool dirty_pred(struct keybuf *buf, struct bkey_s_c k) -{ - struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); - - BUG_ON(k.k->p.inode != bcache_dev_inum(&dc->disk)); - - return bkey_extent_is_data(k.k) && - !bkey_extent_is_cached(k.k); -} - -static void refill_full_stripes(struct cached_dev *dc) -{ - struct keybuf *buf = &dc->writeback_keys; - unsigned inode = bcache_dev_inum(&dc->disk); - unsigned start_stripe, stripe, next_stripe; - bool wrapped = false; - - stripe = offset_to_stripe(&dc->disk, buf->last_scanned.offset); - - if (stripe >= dc->disk.nr_stripes) - stripe = 0; - - start_stripe = stripe; - - while (1) { - stripe = find_next_bit(dc->disk.full_dirty_stripes, - dc->disk.nr_stripes, stripe); - - if (stripe == dc->disk.nr_stripes) - goto next; - - next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, - dc->disk.nr_stripes, stripe); - - buf->last_scanned = POS(inode, - stripe * dc->disk.stripe_size); - - bch_refill_keybuf(dc->disk.c, buf, - POS(inode, - next_stripe * dc->disk.stripe_size), - dirty_pred); - - if (array_freelist_empty(&buf->freelist)) - return; - - stripe = next_stripe; -next: - if (wrapped && stripe > start_stripe) - return; - - if (stripe == dc->disk.nr_stripes) { - stripe = 0; - wrapped = true; - } - } -} - -static u64 bch_writeback(struct cached_dev *dc) -{ - struct keybuf *buf = &dc->writeback_keys; - unsigned inode = bcache_dev_inum(&dc->disk); - struct bpos start = POS(inode, 0); - struct bpos end = POS(inode, KEY_OFFSET_MAX); - struct bpos start_pos; - u64 sectors_written = 0; - - buf->last_scanned = POS(inode, 0); - - while (bkey_cmp(buf->last_scanned, end) < 0 && - !kthread_should_stop()) { - down_write(&dc->writeback_lock); - - if (!atomic_read(&dc->has_dirty)) { - up_write(&dc->writeback_lock); 
- set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) - return sectors_written; - - schedule(); - try_to_freeze(); - return sectors_written; - } - - if (bkey_cmp(buf->last_scanned, end) >= 0) - buf->last_scanned = POS(inode, 0); - - if (dc->partial_stripes_expensive) { - refill_full_stripes(dc); - if (array_freelist_empty(&buf->freelist)) - goto refill_done; - } - - start_pos = buf->last_scanned; - bch_refill_keybuf(dc->disk.c, buf, end, dirty_pred); - - if (bkey_cmp(buf->last_scanned, end) >= 0) { - /* - * If we get to the end start scanning again from the - * beginning, and only scan up to where we initially - * started scanning from: - */ - buf->last_scanned = start; - bch_refill_keybuf(dc->disk.c, buf, start_pos, - dirty_pred); - } - - if (RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { - atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); - SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - } - -refill_done: - up_write(&dc->writeback_lock); - - bch_ratelimit_reset(&dc->writeback_pd.rate); - sectors_written += read_dirty(dc); - } - - return sectors_written; -} - -static int bch_writeback_thread(void *arg) -{ - struct cached_dev *dc = arg; - struct bch_fs *c = dc->disk.c; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last; - u64 sectors_written; - - set_freezable(); - - while (!kthread_should_stop()) { - if (kthread_wait_freezable(dc->writeback_running || - test_bit(BCACHE_DEV_DETACHING, - &dc->disk.flags))) - break; - - last = atomic_long_read(&clock->now); - - sectors_written = bch_writeback(dc); - - if (sectors_written < c->capacity >> 4) - bch_kthread_io_clock_wait(clock, - last + (c->capacity >> 5)); - } - - return 0; -} - -/** - * bch_keylist_recalc_oldest_gens - update oldest_gen pointers from writeback keys - * - * This prevents us from wrapping around gens for a bucket only referenced from - * writeback keybufs. We don't actually care that the data in those buckets is - * marked live, only that we don't wrap the gens. 
- */ -void bch_writeback_recalc_oldest_gens(struct bch_fs *c) -{ - struct radix_tree_iter iter; - void **slot; - - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &c->devices, &iter, 0) { - struct bcache_device *d; - struct cached_dev *dc; - - d = radix_tree_deref_slot(slot); - - if (!CACHED_DEV(&d->inode.v)) - continue; - dc = container_of(d, struct cached_dev, disk); - - bch_keybuf_recalc_oldest_gens(c, &dc->writeback_keys); - } - - rcu_read_unlock(); -} - -/* Init */ - -void bch_sectors_dirty_init(struct cached_dev *dc, struct bch_fs *c) -{ - struct bcache_device *d = &dc->disk; - struct btree_iter iter; - struct bkey_s_c k; - - /* - * We have to do this before the disk is added to the radix tree or we - * race with moving GC - */ - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(bcache_dev_inum(d), 0), k) { - if (k.k->p.inode > bcache_dev_inum(d)) - break; - - if (bkey_extent_is_data(k.k) && - !bkey_extent_is_cached(k.k)) - __bcache_dev_sectors_dirty_add(d, - bkey_start_offset(k.k), - k.k->size); - - bch_btree_iter_cond_resched(&iter); - } - bch_btree_iter_unlock(&iter); - - dc->writeback_pd.last_actual = bcache_dev_sectors_dirty(d); -} - -void bch_cached_dev_writeback_stop(struct cached_dev *dc) -{ - cancel_delayed_work_sync(&dc->writeback_pd_update); - if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); - dc->writeback_thread = NULL; - } -} - -void bch_cached_dev_writeback_free(struct cached_dev *dc) -{ - struct bcache_device *d = &dc->disk; - - mempool_exit(&dc->writeback_page_pool); - mempool_exit(&dc->writeback_io_pool); - kvfree(d->full_dirty_stripes); - kvfree(d->stripe_sectors_dirty); -} - -int bch_cached_dev_writeback_init(struct cached_dev *dc) -{ - struct bcache_device *d = &dc->disk; - sector_t sectors; - size_t n; - - sectors = get_capacity(dc->disk.disk); - - if (!d->stripe_size) { -#ifdef CONFIG_BCACHE_DEBUG - d->stripe_size = 1 << 0; -#else - d->stripe_size = 1 << 31; -#endif - } - - pr_debug("stripe size: %d sectors", d->stripe_size); - d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - - if (!d->nr_stripes || - d->nr_stripes > INT_MAX || - d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { - pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)", - (unsigned)d->nr_stripes); - return -ENOMEM; - } - - n = d->nr_stripes * sizeof(atomic_t); - d->stripe_sectors_dirty = n < PAGE_SIZE << 6 - ? kzalloc(n, GFP_KERNEL) - : vzalloc(n); - if (!d->stripe_sectors_dirty) { - pr_err("cannot allocate stripe_sectors_dirty"); - return -ENOMEM; - } - - n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); - d->full_dirty_stripes = n < PAGE_SIZE << 6 - ? 
kzalloc(n, GFP_KERNEL) - : vzalloc(n); - if (!d->full_dirty_stripes) { - pr_err("cannot allocate full_dirty_stripes"); - return -ENOMEM; - } - - if (mempool_init_kmalloc_pool(&dc->writeback_io_pool, 4, - sizeof(struct dirty_io) + - sizeof(struct bio_vec) * - DIRTY_IO_MEMPOOL_BVECS) || - mempool_init_page_pool(&dc->writeback_page_pool, - (64 << 10) / PAGE_SIZE, 0)) - return -ENOMEM; - - init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys); - - dc->writeback_metadata = true; - dc->writeback_running = true; - dc->writeback_percent = 10; - dc->writeback_pd_update_seconds = 5; - - bch_pd_controller_init(&dc->writeback_pd); - INIT_DELAYED_WORK(&dc->writeback_pd_update, update_writeback_rate); - - return 0; -} - -int bch_cached_dev_writeback_start(struct cached_dev *dc) -{ - dc->writeback_thread = kthread_create(bch_writeback_thread, dc, - "bcache_writeback"); - if (IS_ERR(dc->writeback_thread)) - return PTR_ERR(dc->writeback_thread); - - schedule_delayed_work(&dc->writeback_pd_update, - dc->writeback_pd_update_seconds * HZ); - - bch_writeback_queue(dc); - - return 0; -} diff --git a/libbcache/writeback.h b/libbcache/writeback.h deleted file mode 100644 index 82ce306e..00000000 --- a/libbcache/writeback.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _BCACHE_WRITEBACK_H -#define _BCACHE_WRITEBACK_H - -#include "blockdev.h" -#include "buckets.h" - -#define CUTOFF_WRITEBACK 60 -#define CUTOFF_WRITEBACK_SYNC 30 - -static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) -{ - uint64_t i, ret = 0; - - for (i = 0; i < d->nr_stripes; i++) - ret += atomic_read(d->stripe_sectors_dirty + i); - - return ret; -} - -static inline unsigned offset_to_stripe(struct bcache_device *d, - uint64_t offset) -{ - do_div(offset, d->stripe_size); - return offset; -} - -static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, - uint64_t offset, - unsigned nr_sectors) -{ - unsigned stripe = offset_to_stripe(&dc->disk, offset); - - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) - return true; - - if (nr_sectors <= dc->disk.stripe_size) - return false; - - nr_sectors -= dc->disk.stripe_size; - stripe++; - } -} - -static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, - unsigned cache_mode, bool would_skip) -{ - struct bch_fs *c = dc->disk.c; - u64 available = sectors_available(c); - - if (cache_mode != CACHE_MODE_WRITEBACK || - test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - available * 100 < c->capacity * CUTOFF_WRITEBACK_SYNC) - return false; - - if (dc->partial_stripes_expensive && - bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, - bio_sectors(bio))) - return true; - - if (would_skip) - return false; - - return bio->bi_opf & REQ_SYNC || - available * 100 < c->capacity * CUTOFF_WRITEBACK; -} - -static inline void bch_writeback_queue(struct cached_dev *dc) -{ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - wake_up_process(dc->writeback_thread); -} - -static inline void bch_writeback_add(struct cached_dev *dc) -{ - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); - - if (BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); - } - - bch_writeback_queue(dc); - } -} - -#ifndef NO_BCACHE_WRITEBACK - -void bcache_dev_sectors_dirty_add(struct bch_fs *, unsigned, u64, int); - -void bch_writeback_recalc_oldest_gens(struct bch_fs *); -void bch_sectors_dirty_init(struct 
cached_dev *, struct bch_fs *c); - -void bch_cached_dev_writeback_stop(struct cached_dev *); -void bch_cached_dev_writeback_free(struct cached_dev *); -int bch_cached_dev_writeback_init(struct cached_dev *); -int bch_cached_dev_writeback_start(struct cached_dev *); - -#else - -static inline void bcache_dev_sectors_dirty_add(struct bch_fs *c, - unsigned i, u64 o, int n) {} -static inline void bch_writeback_recalc_oldest_gens(struct bch_fs *c) {} -static inline void bch_sectors_dirty_init(struct cached_dev *dc, - struct bch_fs *c) {} -static inline void bch_cached_dev_writeback_stop(struct cached_dev *dc) {} -static inline void bch_cached_dev_writeback_free(struct cached_dev *dc) {} -static inline int bch_cached_dev_writeback_init(struct cached_dev *dc) -{ - return 0; -} -static inline int bch_cached_dev_writeback_start(struct cached_dev *dc) -{ - return 0; -} - -#endif - -#endif diff --git a/libbcache/xattr.c b/libbcache/xattr.c deleted file mode 100644 index a5c66fa1..00000000 --- a/libbcache/xattr.c +++ /dev/null @@ -1,365 +0,0 @@ - -#include "bcache.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "fs.h" -#include "str_hash.h" -#include "xattr.h" - -#include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> - -struct xattr_search_key { - u8 type; - struct qstr name; -}; - -#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ - { .type = _type, .name = QSTR_INIT(_name, _len) }) - -static u64 bch_xattr_hash(const struct bch_hash_info *info, - const struct xattr_search_key *key) -{ - struct bch_str_hash_ctx ctx; - - bch_str_hash_init(&ctx, info); - bch_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); - bch_str_hash_update(&ctx, info, key->name.name, key->name.len); - - return bch_str_hash_end(&ctx, info); -} - -#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len) - -static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch_xattr_hash(info, key); -} - -static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); - - return bch_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); -} - -static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - const struct xattr_search_key *r = _r; - - return l.v->x_type != r->type || - l.v->x_name_len != r->name.len || - memcmp(l.v->x_name, r->name.name, r->name.len); -} - -static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); - - return l.v->x_type != r.v->x_type || - l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); -} - -static const struct bch_hash_desc xattr_hash_desc = { - .btree_id = BTREE_ID_XATTRS, - .key_type = BCH_XATTR, - .whiteout_type = BCH_XATTR_WHITEOUT, - .hash_key = xattr_hash_key, - .hash_bkey = xattr_hash_bkey, - .cmp_key = xattr_cmp_key, - .cmp_bkey = xattr_cmp_bkey, -}; - -static const char *bch_xattr_invalid(const struct bch_fs *c, - struct bkey_s_c k) -{ - switch (k.k->type) { - case BCH_XATTR: - return bkey_val_bytes(k.k) < sizeof(struct bch_xattr) - ? "value too small" - : NULL; - - case BCH_XATTR_WHITEOUT: - return bkey_val_bytes(k.k) != 0 - ? 
"value size should be zero" - : NULL; - - default: - return "invalid type"; - } -} - -static void bch_xattr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - struct bkey_s_c_xattr xattr; - int n; - - switch (k.k->type) { - case BCH_XATTR: - xattr = bkey_s_c_to_xattr(k); - - if (size) { - n = min_t(unsigned, size, xattr.v->x_name_len); - memcpy(buf, xattr.v->x_name, n); - buf[size - 1] = '\0'; - buf += n; - size -= n; - } - - n = scnprintf(buf, size, " -> "); - buf += n; - size -= n; - - if (size) { - n = min_t(unsigned, size, - le16_to_cpu(xattr.v->x_val_len)); - memcpy(buf, xattr_val(xattr.v), n); - buf[size - 1] = '\0'; - buf += n; - size -= n; - } - - break; - case BCH_XATTR_WHITEOUT: - scnprintf(buf, size, "whiteout"); - break; - } -} - -const struct bkey_ops bch_bkey_xattr_ops = { - .key_invalid = bch_xattr_invalid, - .val_to_text = bch_xattr_to_text, -}; - -int bch_xattr_get(struct bch_fs *c, struct inode *inode, - const char *name, void *buffer, size_t size, int type) -{ - struct bch_inode_info *ei = to_bch_ei(inode); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_xattr xattr; - int ret; - - k = bch_hash_lookup(xattr_hash_desc, &ei->str_hash, c, - ei->vfs_inode.i_ino, &iter, - &X_SEARCH(type, name, strlen(name))); - if (IS_ERR(k.k)) - return bch_btree_iter_unlock(&iter) ?: -ENODATA; - - xattr = bkey_s_c_to_xattr(k); - ret = le16_to_cpu(xattr.v->x_val_len); - if (buffer) { - if (ret > size) - ret = -ERANGE; - else - memcpy(buffer, xattr_val(xattr.v), ret); - } - - bch_btree_iter_unlock(&iter); - return ret; -} - -int __bch_xattr_set(struct bch_fs *c, u64 inum, - const struct bch_hash_info *hash_info, - const char *name, const void *value, size_t size, - int flags, int type, u64 *journal_seq) -{ - struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); - int ret; - - if (!value) { - ret = bch_hash_delete(xattr_hash_desc, hash_info, - c, inum, - journal_seq, &search); - } else { - struct bkey_i_xattr *xattr; - unsigned u64s = BKEY_U64s + - DIV_ROUND_UP(sizeof(struct bch_xattr) + - search.name.len + size, - sizeof(u64)); - - if (u64s > U8_MAX) - return -ERANGE; - - xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (!xattr) - return -ENOMEM; - - bkey_xattr_init(&xattr->k_i); - xattr->k.u64s = u64s; - xattr->v.x_type = type; - xattr->v.x_name_len = search.name.len; - xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name, search.name.name, search.name.len); - memcpy(xattr_val(&xattr->v), value, size); - - ret = bch_hash_set(xattr_hash_desc, hash_info, c, - inum, journal_seq, - &xattr->k_i, - (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| - (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); - kfree(xattr); - } - - if (ret == -ENOENT) - ret = flags & XATTR_REPLACE ? 
-ENODATA : 0; - - return ret; -} - -int bch_xattr_set(struct bch_fs *c, struct inode *inode, - const char *name, const void *value, size_t size, - int flags, int type) -{ - struct bch_inode_info *ei = to_bch_ei(inode); - - return __bch_xattr_set(c, inode->i_ino, &ei->str_hash, - name, value, size, flags, type, - &ei->journal_seq); -} - -static const struct xattr_handler *bch_xattr_type_to_handler(unsigned); - -static size_t bch_xattr_emit(struct dentry *dentry, - const struct bch_xattr *xattr, - char *buffer, size_t buffer_size) -{ - const struct xattr_handler *handler = - bch_xattr_type_to_handler(xattr->x_type); - - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; - const size_t prefix_len = strlen(prefix); - const size_t total_len = prefix_len + xattr->x_name_len + 1; - - if (buffer && total_len <= buffer_size) { - memcpy(buffer, prefix, prefix_len); - memcpy(buffer + prefix_len, - xattr->x_name, xattr->x_name_len); - buffer[prefix_len + xattr->x_name_len] = '\0'; - } - - return total_len; - } else { - return 0; - } -} - -ssize_t bch_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct bch_fs *c = dentry->d_sb->s_fs_info; - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_xattr *xattr; - u64 inum = dentry->d_inode->i_ino; - ssize_t ret = 0; - size_t len; - - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) { - BUG_ON(k.k->p.inode < inum); - - if (k.k->p.inode > inum) - break; - - if (k.k->type != BCH_XATTR) - continue; - - xattr = bkey_s_c_to_xattr(k).v; - - len = bch_xattr_emit(dentry, xattr, buffer, buffer_size); - if (buffer) { - if (len > buffer_size) { - bch_btree_iter_unlock(&iter); - return -ERANGE; - } - - buffer += len; - buffer_size -= len; - } - - ret += len; - - } - bch_btree_iter_unlock(&iter); - - return ret; -} - -static int bch_xattr_get_handler(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - struct bch_fs *c = inode->i_sb->s_fs_info; - - return bch_xattr_get(c, inode, name, buffer, size, handler->flags); -} - -static int bch_xattr_set_handler(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct bch_fs *c = inode->i_sb->s_fs_info; - - return bch_xattr_set(c, inode, name, value, size, flags, - handler->flags); -} - -static const struct xattr_handler bch_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .get = bch_xattr_get_handler, - .set = bch_xattr_set_handler, - .flags = BCH_XATTR_INDEX_USER, -}; - -static bool bch_xattr_trusted_list(struct dentry *dentry) -{ - return capable(CAP_SYS_ADMIN); -} - -static const struct xattr_handler bch_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .list = bch_xattr_trusted_list, - .get = bch_xattr_get_handler, - .set = bch_xattr_set_handler, - .flags = BCH_XATTR_INDEX_TRUSTED, -}; - -static const struct xattr_handler bch_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = bch_xattr_get_handler, - .set = bch_xattr_set_handler, - .flags = BCH_XATTR_INDEX_SECURITY, -}; - -static const struct xattr_handler *bch_xattr_handler_map[] = { - [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, - [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = - &posix_acl_access_xattr_handler, - [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, - [BCH_XATTR_INDEX_TRUSTED] = 
&bch_xattr_trusted_handler, - [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -}; - -const struct xattr_handler *bch_xattr_handlers[] = { - &bch_xattr_user_handler, - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - &bch_xattr_trusted_handler, - &bch_xattr_security_handler, - NULL -}; - -static const struct xattr_handler *bch_xattr_type_to_handler(unsigned type) -{ - return type < ARRAY_SIZE(bch_xattr_handler_map) - ? bch_xattr_handler_map[type] - : NULL; -} diff --git a/libbcache/xattr.h b/libbcache/xattr.h deleted file mode 100644 index c48c7acf..00000000 --- a/libbcache/xattr.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _BCACHE_XATTR_H -#define _BCACHE_XATTR_H - -extern const struct bkey_ops bch_bkey_xattr_ops; - -struct dentry; -struct xattr_handler; -struct bch_hash_info; - -int bch_xattr_get(struct bch_fs *, struct inode *, - const char *, void *, size_t, int); -int __bch_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *, - const char *, const void *, size_t, int, int, u64 *); -int bch_xattr_set(struct bch_fs *, struct inode *, - const char *, const void *, size_t, int, int); -ssize_t bch_xattr_list(struct dentry *, char *, size_t); - -extern const struct xattr_handler *bch_xattr_handlers[]; - -#endif /* _BCACHE_XATTR_H */
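
The deleted xattr listing path above (bch_xattr_emit() and bch_xattr_list() in libbcache/xattr.c) follows the usual listxattr convention: each attribute is reported as "<handler prefix><name>" plus a terminating NUL, a NULL output buffer only sizes the result, and an undersized buffer yields -ERANGE. Below is a minimal userspace C sketch of that convention only, not the deleted kernel code: the function names emit_xattr()/list_xattrs() are made up here, the "user."/"trusted." prefixes are the standard xattr namespaces, and the attribute names are invented for illustration.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Stand-ins for the prefix handlers; only the prefix matters for listing. */
struct xattr_handler {
	const char *prefix;
};

static const struct xattr_handler user_handler    = { .prefix = "user." };
static const struct xattr_handler trusted_handler = { .prefix = "trusted." };

/* A stored attribute: which namespace handler it belongs to, plus its name. */
struct xattr {
	const struct xattr_handler *handler;
	const char *name;
};

/*
 * Emit one "<prefix><name>\0" entry, in the spirit of bch_xattr_emit():
 * always return the space the entry needs, copy it out only if it fits.
 */
static size_t emit_xattr(const struct xattr *x, char *buf, size_t buf_size)
{
	size_t prefix_len = strlen(x->handler->prefix);
	size_t name_len   = strlen(x->name);
	size_t total_len  = prefix_len + name_len + 1;

	if (buf && total_len <= buf_size) {
		memcpy(buf, x->handler->prefix, prefix_len);
		memcpy(buf + prefix_len, x->name, name_len);
		buf[prefix_len + name_len] = '\0';
	}
	return total_len;
}

/*
 * listxattr-style walk: with a NULL buffer the return value is the size
 * required; with a short buffer the result is -ERANGE; otherwise the buffer
 * holds NUL-separated "prefix.name" entries.
 */
static ssize_t list_xattrs(const struct xattr *xs, size_t nr,
			   char *buf, size_t buf_size)
{
	ssize_t ret = 0;
	size_t i;

	for (i = 0; i < nr; i++) {
		size_t len = emit_xattr(&xs[i], buf, buf_size);

		if (buf) {
			if (len > buf_size)
				return -ERANGE;
			buf += len;
			buf_size -= len;
		}
		ret += len;
	}
	return ret;
}

int main(void)
{
	/* Made-up attributes purely for the demonstration. */
	const struct xattr xs[] = {
		{ &user_handler,    "comment" },
		{ &trusted_handler, "overlay" },
	};
	char buf[64];
	ssize_t need = list_xattrs(xs, 2, NULL, 0);
	ssize_t got  = list_xattrs(xs, 2, buf, sizeof(buf));

	printf("required %zd bytes, wrote %zd, first entry \"%s\"\n",
	       need, got, buf);
	return 0;
}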